import libraries and load data, then show it so I know the columns and what kind of data they contain.

In [1]:
import pandas as pd
import os

path=os.getcwd()+"\\data\\fire_archive_M-C61_626683.csv.xz"
data=pd.read_csv(path)
data

Unnamed: 0,latitude,longitude,brightness,scan,track,acq_date,acq_time,satellite,instrument,confidence,version,bright_t31,frp,daynight,type
0,38.5422,-78.3047,304.8,2.8,1.6,2000-11-01,250,Terra,MODIS,23,6.03,280.9,40.3,N,0
1,38.5451,-78.3107,309.9,2.8,1.6,2000-11-01,250,Terra,MODIS,79,6.03,280.7,58.8,N,0
2,38.5563,-78.3084,309.4,2.8,1.6,2000-11-01,250,Terra,MODIS,70,6.03,280.4,54.5,N,0
3,38.5586,-78.3170,302.3,2.8,1.6,2000-11-01,250,Terra,MODIS,45,6.03,279.8,36.0,N,0
4,31.3393,-89.9124,304.9,1.0,1.0,2000-11-01,427,Terra,MODIS,62,6.03,287.5,8.5,N,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2960417,41.6966,-99.1437,319.8,1.1,1.0,2025-01-31,2028,Aqua,MODIS,80,61.03,284.7,21.2,D,0
2960418,42.4419,-94.3783,300.6,1.2,1.1,2025-01-31,2028,Aqua,MODIS,40,61.03,284.3,6.3,D,0
2960419,41.4014,-97.9485,319.9,1.0,1.0,2025-01-31,2028,Aqua,MODIS,80,61.03,284.9,19.0,D,0
2960420,41.4032,-97.9369,322.9,1.0,1.0,2025-01-31,2028,Aqua,MODIS,82,61.03,285.0,22.1,D,0


combine acq_date and acq_time into acq_datetime for a single datetime column. I also remove redundant columns.

In [2]:
from datetime import timedelta

data['acq_date'] = pd.to_datetime(data['acq_date'])
data['acq_datetime'] = data['acq_date'] + pd.Series(
    [timedelta(minutes=i % 100, hours=i // 100) for i in data['acq_time']])
data.drop(['acq_time','acq_date','instrument'], axis=1, inplace=True)
data

Unnamed: 0,latitude,longitude,brightness,scan,track,satellite,confidence,version,bright_t31,frp,daynight,type,acq_datetime
0,38.5422,-78.3047,304.8,2.8,1.6,Terra,23,6.03,280.9,40.3,N,0,2000-11-01 02:50:00
1,38.5451,-78.3107,309.9,2.8,1.6,Terra,79,6.03,280.7,58.8,N,0,2000-11-01 02:50:00
2,38.5563,-78.3084,309.4,2.8,1.6,Terra,70,6.03,280.4,54.5,N,0,2000-11-01 02:50:00
3,38.5586,-78.3170,302.3,2.8,1.6,Terra,45,6.03,279.8,36.0,N,0,2000-11-01 02:50:00
4,31.3393,-89.9124,304.9,1.0,1.0,Terra,62,6.03,287.5,8.5,N,0,2000-11-01 04:27:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2960417,41.6966,-99.1437,319.8,1.1,1.0,Aqua,80,61.03,284.7,21.2,D,0,2025-01-31 20:28:00
2960418,42.4419,-94.3783,300.6,1.2,1.1,Aqua,40,61.03,284.3,6.3,D,0,2025-01-31 20:28:00
2960419,41.4014,-97.9485,319.9,1.0,1.0,Aqua,80,61.03,284.9,19.0,D,0,2025-01-31 20:28:00
2960420,41.4032,-97.9369,322.9,1.0,1.0,Aqua,82,61.03,285.0,22.1,D,0,2025-01-31 20:28:00


get month from acq_datetime and add it as a column for a discrete column, rather than acq_datetime which is continuous.

In [3]:
data['month']=data['acq_datetime'].dt.month

show the number of data entries for different types

In [4]:
data['type'].value_counts()

type
0    2758619
2     126370
1      71191
3       4242
Name: count, dtype: int64

import libraries for the model and analysis, to predict the type.

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.utils import resample

#group columns so I can change categorical to numerical data.
categorical_cols = ['daynight','satellite','version']
numeric_cols = ['latitude', 'longitude', 'brightness', 'frp', 'scan', 'track',
                'bright_t31', 'confidence','month']

preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ('num', 'passthrough', numeric_cols)
])

model = Pipeline([
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('clf', LogisticRegression(max_iter=100000, solver='saga'))
])

df = data.drop(columns=['acq_datetime'])

dfs = [df[df['type'] == c] for c in df['type'].unique()]
min_count = min(len(d) for d in dfs)
df_balanced = pd.concat([
    resample(d, replace=False, n_samples=min_count, random_state=42)
    for d in dfs
])

X = df.drop('type', axis=1)
y = df['type']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=42, stratify=y)

model.fit(X_train, y_train)

y_pred = model.predict(data.drop('type', axis=1))
accuracy = accuracy_score(data['type'], y_pred)
print("Accuracy:", accuracy)
print(classification_report(data['type'], y_pred))

Accuracy: 0.9342208644578375


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0       0.93      1.00      0.97   2758619
           1       0.85      0.10      0.18     71191
           2       0.47      0.00      0.00    126370
           3       0.00      0.00      0.00      4242

    accuracy                           0.93   2960422
   macro avg       0.56      0.28      0.29   2960422
weighted avg       0.91      0.93      0.90   2960422



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


I was able to get good prediction on type 0 and ok prediction on 1 but the rest were pretty bad, I might use the model to check if something is a type 0 or not but using it for other types won't work well.