First import libraries and load data, then transform data datetime to a proper datetime.

In [15]:
import pandas as pd
import os
import numpy as np

path = os.getcwd() + "\\data\\fire_archive_M-C61_626683.csv.xz"
data = pd.read_csv(path)

from datetime import timedelta

data['acq_date'] = pd.to_datetime(data['acq_date'])
data['acq_datetime'] = data['acq_date'] + pd.Series(
    [timedelta(minutes=i % 100, hours=i // 100) for i in data['acq_time']])

#drop redundant columns
data.drop(['acq_time','acq_date','instrument'], axis=1, inplace=True)

data #show data for verification

Unnamed: 0,latitude,longitude,brightness,scan,track,satellite,confidence,version,bright_t31,frp,daynight,type,acq_datetime
0,38.5422,-78.3047,304.8,2.8,1.6,Terra,23,6.03,280.9,40.3,N,0,2000-11-01 02:50:00
1,38.5451,-78.3107,309.9,2.8,1.6,Terra,79,6.03,280.7,58.8,N,0,2000-11-01 02:50:00
2,38.5563,-78.3084,309.4,2.8,1.6,Terra,70,6.03,280.4,54.5,N,0,2000-11-01 02:50:00
3,38.5586,-78.3170,302.3,2.8,1.6,Terra,45,6.03,279.8,36.0,N,0,2000-11-01 02:50:00
4,31.3393,-89.9124,304.9,1.0,1.0,Terra,62,6.03,287.5,8.5,N,0,2000-11-01 04:27:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2960417,41.6966,-99.1437,319.8,1.1,1.0,Aqua,80,61.03,284.7,21.2,D,0,2025-01-31 20:28:00
2960418,42.4419,-94.3783,300.6,1.2,1.1,Aqua,40,61.03,284.3,6.3,D,0,2025-01-31 20:28:00
2960419,41.4014,-97.9485,319.9,1.0,1.0,Aqua,80,61.03,284.9,19.0,D,0,2025-01-31 20:28:00
2960420,41.4032,-97.9369,322.9,1.0,1.0,Aqua,82,61.03,285.0,22.1,D,0,2025-01-31 20:28:00


Next I am going to add year, month, and day. I am also adding the sin/cos variants of month to represent relative distance between times e.g. month 12 to month 1 is the same distance as month 7 to month 8.

In [16]:
data['year'] = data['acq_datetime'].dt.year
data['month'] = data['acq_datetime'].dt.month
data['day'] = data['acq_datetime'].dt.day

data['month_sin'] = np.sin(2 * np.pi * data['month'] / 12)
data['month_cos'] = np.cos(2 * np.pi * data['month'] / 12)

Next I am binning the confidence levels for categorical analysis which I will do later.

In [17]:
data['confidence_binned'] = pd.cut(data['confidence'], bins=[-1, 30, 80, 101], labels=['l', 'n', 'h'])

I import libraries for data processing and model analysis.

In [18]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score

I do some preprocessing for the data I will use for training my models.

In [19]:
training_data=data.copy()
training_data=training_data[training_data['type']!=1]
categorical = ['satellite', 'daynight', 'version', 'type']
for cat in categorical:
    training_data[cat] = training_data[cat].map({name:i for i, name in enumerate(training_data[cat].unique())})
training_data.drop(['acq_datetime'], axis=1, inplace=True)

I setup a linear regression model and show the r2 score and mean square error.

In [20]:
from sklearn.linear_model import LinearRegression

X = training_data.drop(['confidence', 'confidence_binned'], axis=1)
y = training_data['confidence']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

regressor = LinearRegression()
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print(r2)
print((mse ** 0.5) / 100) #get root of mean square error for just mean error and multiply by 100 for percentile

0.43786607938135247
0.17187393709919913


Next I set up a polynomial version to check for non-linear relationships.

In [21]:
print("degree 2")

X = training_data.drop(['confidence', 'confidence_binned'], axis=1)
y = training_data['confidence']

poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.2, random_state=42)

regressor = LinearRegression()

regressor.fit(X_train, y_train)

y_pred = regressor.predict(X_test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print(r2)
print((mse ** 0.5) / 100) #root and get percentile

degree 2
0.5747493096096292
0.14949017243126503


The results for both the normal and polynomial regression models are similar with the polynomial being a decent bit better.

Next I try a decision tree regressor and get feature importances.

In [None]:
from sklearn.ensemble import RandomForestRegressor

X = training_data.drop(['confidence', 'confidence_binned'], axis=1)
y = training_data['confidence']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

fr = RandomForestRegressor(random_state=42)
fr.fit(X_train, y_train)

#get predictions
y_pred = fr.predict(X_test)

print("R^2 score:", r2_score(y_test, y_pred))
print("RMSE:", (mean_squared_error(y_test, y_pred) ** 0.5) / 100) # root and percentile
{c: f for c, f in zip(X.columns, fr.feature_importances_)}

Next I import LogisticRegression from sklearn and run the model with polynomial data and normal data. I use saga as the solver because the amount of data is very large.

In [13]:
from sklearn.linear_model import LogisticRegression

X = training_data.drop(['confidence', 'confidence_binned'], axis=1)
y = training_data['confidence_binned']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

regressor=LogisticRegression(solver='saga')
regressor.fit(X_train,y_train)

y_pred = regressor.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


print()

# go into polynomial model
print("polynomial model")

poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X)
X_poly_train, X_poly_test, _, _ = train_test_split(X_poly, y, test_size=0.2, random_state=42)

poly_regressor=LogisticRegression(solver='saga')
poly_regressor.fit(X_poly_train,y_train)

y_pred = poly_regressor.predict(X_poly_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))



Accuracy: 0.8243618120367502
              precision    recall  f1-score   support

           h       0.86      0.76      0.81    185330
           l       0.00      0.00      0.00     36574
           n       0.81      0.94      0.87    355943

    accuracy                           0.82    577847
   macro avg       0.56      0.57      0.56    577847
weighted avg       0.77      0.82      0.80    577847


polynomial model




Accuracy: 0.6159813929984926


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           h       0.00      0.00      0.00    185330
           l       0.00      0.00      0.00     36574
           n       0.62      1.00      0.76    355943

    accuracy                           0.62    577847
   macro avg       0.21      0.33      0.25    577847
weighted avg       0.38      0.62      0.47    577847



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


The accuracy for the normal model is slightly above 82%, but I rounded it to full percents for simplicity. The non-polynomial model is significantly better, with the polynomial model being worse by about 24%, and having zero accuracy in high, and low confidence prediction.

check coefficients for h/n/l confidence

In [58]:
pd.DataFrame(
    regressor.coef_,
    columns=X.columns,
    index=regressor.classes_
)

Unnamed: 0,latitude,longitude,brightness,scan,track,satellite,version,bright_t31,frp,daynight,type,year,month,day,month_sin,month_cos
h,-0.025698,-0.004793,0.137587,-0.005289,-0.001724,-0.012369,0.002599,-0.064113,-0.001751,-0.052818,-0.000256,-0.012302,0.012183,-0.001582,-0.009803,-0.004907
l,0.01589,-0.004912,-0.084673,0.000135,7.7e-05,-0.000319,-0.001069,0.017219,0.001451,-0.005704,0.004751,0.009758,-0.001228,0.001446,-0.008342,-0.017885
n,0.009808,0.009705,-0.052914,0.005154,0.001648,0.012688,-0.001529,0.046894,0.000301,0.058521,-0.004495,0.002545,-0.010955,0.000136,0.018145,0.022792


next I set up a decision tree and get the accuracy and report as well as the feature importances.

In [12]:
from sklearn.ensemble import RandomForestClassifier
X = training_data.drop(['confidence', 'confidence_binned'], axis=1)
y = training_data['confidence_binned']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

fc = RandomForestClassifier()
fc.fit(X_train, y_train)

y_pred = fc.predict(X_test)
accuracy=accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred))

{c: f for c, f in zip(X.columns, fc.feature_importances_)}

Accuracy: 0.8870496861626002
              precision    recall  f1-score   support

           h       0.87      0.94      0.91    184769
           l       0.75      0.20      0.32     37073
           n       0.90      0.93      0.91    356005

    accuracy                           0.89    577847
   macro avg       0.84      0.69      0.71    577847
weighted avg       0.88      0.89      0.87    577847



{'latitude': 0.06121906998618347,
 'longitude': 0.08672830010421334,
 'brightness': 0.37452014780825577,
 'scan': 0.036910263033661475,
 'track': 0.025507379869393547,
 'satellite': 0.006613429113048619,
 'version': 0.0020618835023108613,
 'bright_t31': 0.07252574024590575,
 'frp': 0.19449798127786816,
 'daynight': 0.03851906401809685,
 'type': 0.0027348555022536736,
 'year': 0.02769654557011573,
 'month': 0.01282501706473381,
 'day': 0.03410549397773451,
 'month_sin': 0.008979043896184319,
 'month_cos': 0.014555785030040149}

We got good results; with over a 85% accuracy score. The feature importances for month_cos, daynight, frp, bright_t31, and brightness have the highest values, with brightness being the highest at over 0.78 this means that brightness is likely the most significant variable for the target variable.