First import libraries and load data, then transform data datetime to a proper datetime.

In [3]:
import pandas as pd
import os
import numpy as np

path = os.getcwd() + "\\data\\fire_archive_M-C61_626683.csv.xz"
data = pd.read_csv(path)

from datetime import timedelta

data['acq_date'] = pd.to_datetime(data['acq_date'])
data['acq_datetime'] = data['acq_date'] + pd.Series(
    [timedelta(minutes=i % 100, hours=i // 100) for i in data['acq_time']])

#drop redundant columns
data.drop(['acq_time','acq_date','instrument'], axis=1, inplace=True)

data #show data for verification

Unnamed: 0,latitude,longitude,brightness,scan,track,satellite,confidence,version,bright_t31,frp,daynight,type,acq_datetime
0,38.5422,-78.3047,304.8,2.8,1.6,Terra,23,6.03,280.9,40.3,N,0,2000-11-01 02:50:00
1,38.5451,-78.3107,309.9,2.8,1.6,Terra,79,6.03,280.7,58.8,N,0,2000-11-01 02:50:00
2,38.5563,-78.3084,309.4,2.8,1.6,Terra,70,6.03,280.4,54.5,N,0,2000-11-01 02:50:00
3,38.5586,-78.3170,302.3,2.8,1.6,Terra,45,6.03,279.8,36.0,N,0,2000-11-01 02:50:00
4,31.3393,-89.9124,304.9,1.0,1.0,Terra,62,6.03,287.5,8.5,N,0,2000-11-01 04:27:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2960417,41.6966,-99.1437,319.8,1.1,1.0,Aqua,80,61.03,284.7,21.2,D,0,2025-01-31 20:28:00
2960418,42.4419,-94.3783,300.6,1.2,1.1,Aqua,40,61.03,284.3,6.3,D,0,2025-01-31 20:28:00
2960419,41.4014,-97.9485,319.9,1.0,1.0,Aqua,80,61.03,284.9,19.0,D,0,2025-01-31 20:28:00
2960420,41.4032,-97.9369,322.9,1.0,1.0,Aqua,82,61.03,285.0,22.1,D,0,2025-01-31 20:28:00


next I am going to add year, month, day, hour, and minute as well as their sin/cos variants to represent relative distance between times e.g. month 12 to month 1 is the same distance as month 7 to month 8.

In [4]:
data['year'] = data['acq_datetime'].dt.year
data['month'] = data['acq_datetime'].dt.month
data['day'] = data['acq_datetime'].dt.day
data['hour'] = data['acq_datetime'].dt.hour
data['minute'] = data['acq_datetime'].dt.minute

data['month_sin'] = np.sin(2 * np.pi * data['month'] / 12)
data['month_cos'] = np.cos(2 * np.pi * data['month'] / 12)
data['day_sin'] = np.sin(2 * np.pi * data['day'] / 31)
data['day_cos'] = np.cos(2 * np.pi * data['day'] / 31)
data['hour_sin'] = np.sin(2 * np.pi * data['hour'] / 24)
data['hour_cos'] = np.cos(2 * np.pi * data['hour'] / 24)
data['minute_sin'] = np.sin(2 * np.pi * data['minute'] / 60)
data['minute_cos'] = np.cos(2 * np.pi * data['minute'] / 60)

I import libraries for a linear regression model

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

I setup a linear regression model and show the r2 score and mean square error

In [6]:
X = data[data['type'] != 1].drop(['confidence','acq_datetime'], axis=1) # exclude type one because it is volcano and not fire

categorical = ['satellite', 'daynight', 'version', 'type']
for cat in categorical:
    X[cat] = X[cat].map({name:i for i, name in enumerate(X[cat].unique())})

y = data[data['type'] != 1]['confidence']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
regressor = LinearRegression()
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print(r2)
print((mse ** 0.5) / 100) #get root of mean square error for just mean error and multiply by 100 for percentile

0.43960070331133294
0.17122846151085538


next I set up a polynomial version to check for non-linear relationships

In [7]:
print("degree 2")

X = data[data['type'] != 1].drop(['confidence','acq_datetime'], axis=1) # exclude type one because it is volcano and not fire

categorical = ['satellite', 'daynight', 'version', 'type']
for cat in categorical:
    X[cat] = X[cat].map({name:i for i, name in enumerate(X[cat].unique())})

y = data[data['type'] != 1]['confidence']

poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.2)

regressor = LinearRegression()

regressor.fit(X_train, y_train)

y_pred = regressor.predict(X_test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print(r2)
print((mse ** 0.5) / 100) #root and get percentile

degree 2
0.5786394504612165
0.14840878410035083


The results for both the normal and polynomial regression models are similar with the polynomial being a decent bit better.

than I try a decision tree regressor and get feature importances

In [4]:
from sklearn.tree import DecisionTreeRegressor
X = data[data['type'] != 1].drop(['confidence','acq_datetime'], axis=1) # exclude type one because it is volcano and not fire

categorical = ['satellite', 'daynight', 'version', 'type']
for cat in categorical:
    X[cat] = X[cat].map({name:i for i, name in enumerate(X[cat].unique())})

y = data[data['type'] != 1]['confidence']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train, y_train)

#get predictions
y_pred = dt.predict(X_test)

print("R^2 score:", r2_score(y_test, y_pred))
print("RMSE:", (mean_squared_error(y_test, y_pred) ** 0.5) / 100) # root and percentile
{c: f for c, f in zip(X.columns, dt.feature_importances_)}

R^2 score: 0.36997494122251096
RMSE: 0.18156364760362023


{'latitude': 0.050921977935218805,
 'longitude': 0.052611972090290914,
 'brightness': 0.5329594115132186,
 'scan': 0.021623718839759543,
 'track': 0.006167222511537681,
 'satellite': 0.0016376423267885753,
 'version': 0.0007868826308649792,
 'bright_t31': 0.050853499262769025,
 'frp': 0.08440784911151404,
 'daynight': 0.022356137691964883,
 'type': 0.004158119043926909,
 'year': 0.017475320909761304,
 'month': 0.00805941517861143,
 'day': 0.014432634248683086,
 'hour': 0.007034552275200663,
 'minute': 0.01507507178250059,
 'month_sin': 0.005379242000437235,
 'month_cos': 0.020308761212867742,
 'day_sin': 0.016438332647391186,
 'day_cos': 0.013387045471630516,
 'hour_sin': 0.016943600779302917,
 'hour_cos': 0.004231291632170769,
 'minute_sin': 0.016903530006176084,
 'minute_cos': 0.015846768897412555}

Next I bin the confidence for categorical analysis

In [6]:
data['confidence_binned'] = pd.cut(data['confidence'], bins=[-1, 30, 80, 101], labels=['l', 'n', 'h'])

next I set up the data for the models by importing libraries, and setting up dependent/independent variables.

In [7]:
from sklearn.metrics import accuracy_score, classification_report

X = data[data['type'] != 1].drop(['confidence', 'acq_datetime', 'confidence_binned'], axis=1)

categorical = ['satellite', 'daynight', 'version', 'type']
for cat in categorical:
    X[cat] = X[cat].map({name:i for i, name in enumerate(X[cat].unique())})

y = data[data['type'] != 1]['confidence_binned']

poly = PolynomialFeatures(degree=2)

X_poly = poly.fit_transform(X)

next I import LogisticRegression from sklearn and run the model with polynomial data and normal data. I use saga as the solver because the amount of data is very large.

In [9]:
from sklearn.linear_model import LogisticRegression

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

regressor=LogisticRegression(solver='saga')
regressor.fit(X_train,y_train)

y_pred = regressor.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


print()

# go into polynomial model
print("polynomial model")

X_poly_train, X_poly_test, _, _ = train_test_split(X_poly, y, test_size=0.2, random_state=42, stratify=y)

poly_regressor=LogisticRegression(solver='saga')
poly_regressor.fit(X_poly_train,y_train)

y_pred = poly_regressor.predict(X_poly_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))



Accuracy: 0.8385939530706225
              precision    recall  f1-score   support

           h       0.87      0.80      0.83    185524
           l       0.00      0.00      0.00     36840
           n       0.83      0.94      0.88    355483

    accuracy                           0.84    577847
   macro avg       0.56      0.58      0.57    577847
weighted avg       0.79      0.84      0.81    577847


polynomial model




Accuracy: 0.7995161348938387
              precision    recall  f1-score   support

           h       0.86      0.67      0.75    185524
           l       0.00      0.00      0.00     36840
           n       0.78      0.95      0.86    355483

    accuracy                           0.80    577847
   macro avg       0.55      0.54      0.54    577847
weighted avg       0.76      0.80      0.77    577847



The accuracy is above 80% and the models are very similar with the polynomial model being worse by a small amount.

check coefficients for h/n/l confidence

In [12]:
pd.DataFrame(
    regressor.coef_,
    columns=X.columns,
    index=regressor.classes_
)

Unnamed: 0,latitude,longitude,brightness,scan,track,satellite,version,bright_t31,frp,daynight,...,hour,minute,month_sin,month_cos,day_sin,day_cos,hour_sin,hour_cos,minute_sin,minute_cos
h,-0.012601,0.00359,0.155735,-0.000345,-0.000133,0.009162,0.002798,-0.045107,-0.002019,-0.014941,...,-0.118713,-0.002211,0.001765,0.01424,0.000317,0.001187,0.021453,-0.004033,6.3e-05,0.000656
l,0.012657,-0.007269,-0.089193,0.000975,0.000339,0.002892,-0.000933,0.011852,0.001488,0.001055,...,0.030462,0.000867,-0.005432,-0.012536,-0.000108,-5.7e-05,-0.002638,-0.001453,-0.000146,-0.000385
n,-5.6e-05,0.00368,-0.066541,-0.00063,-0.000206,-0.012054,-0.001866,0.033254,0.000531,0.013887,...,0.088251,0.001344,0.003667,-0.001704,-0.000209,-0.00113,-0.018815,0.005486,8.3e-05,-0.000271


next I set up a decision tree and get the accuracy and report as well as the feature importances.

In [10]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

y_pred = dt.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

{c: f for c, f in zip(X.columns, dt.feature_importances_)}

Accuracy: 0.8257029975062603
              precision    recall  f1-score   support

           h       0.86      0.85      0.86    185524
           l       0.29      0.32      0.31     36840
           n       0.87      0.86      0.87    355483

    accuracy                           0.83    577847
   macro avg       0.67      0.68      0.68    577847
weighted avg       0.83      0.83      0.83    577847



{'latitude': 0.052281312028193645,
 'longitude': 0.05593738358169366,
 'brightness': 0.5272333682265847,
 'scan': 0.015926427590389425,
 'track': 0.006064053902853737,
 'satellite': 0.0017520196062239934,
 'version': 0.0008114591579965623,
 'bright_t31': 0.07515121541465639,
 'frp': 0.05345416461999269,
 'daynight': 0.048934790395187326,
 'type': 0.0020572543809489413,
 'year': 0.01841402775216647,
 'month': 0.005575558088728949,
 'day': 0.014872170088512384,
 'hour': 0.011135657427692978,
 'minute': 0.01580315066056507,
 'month_sin': 0.00549363409584673,
 'month_cos': 0.015518911704271329,
 'day_sin': 0.01748549677121142,
 'day_cos': 0.013989463472254205,
 'hour_sin': 0.003826983851549169,
 'hour_cos': 0.0038937127439916163,
 'minute_sin': 0.01777621272691782,
 'minute_cos': 0.016611571711570763}

Next I import some libraries to tune hyperparameters and iterate through all options of a param_grid(parameter name than a array of different options) and print results.

The code below is taking too long for me to include results as of now

In [None]:
from sklearn.model_selection import GridSearchCV
import multiprocessing

#setup param_grid
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [10, 15, 20],
    'min_samples_split': [5, 10, 20],
    'min_samples_leaf': [2, 5, 10]
}

dt = DecisionTreeClassifier(random_state=42)

#get the number of cores the computer has minus 1 to avoid lack of resources error
n_jobs = max(1, multiprocessing.cpu_count() - 1)

grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=3, scoring='accuracy')

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print(best_params)
print(best_model)

Next I use the best parameters for the model.

In [21]:
dt = DecisionTreeClassifier(random_state=42, criterion='gini', max_depth=10)
dt.fit(X_train, y_train)

y_pred = dt.predict(X_test)


print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


{c: f for c, f in zip(X.columns, dt.feature_importances_)}

Accuracy: 0.8719713003615144
              precision    recall  f1-score   support

           h       0.84      0.93      0.88    185618
           l       0.83      0.13      0.22     37044
           n       0.89      0.92      0.90    355185

    accuracy                           0.87    577847
   macro avg       0.85      0.66      0.67    577847
weighted avg       0.87      0.87      0.85    577847



{'latitude': 0.007053468550189175,
 'longitude': 0.014652101151220207,
 'brightness': 0.7877282445789257,
 'scan': 0.0021189314201473563,
 'track': 0.0003722899444767902,
 'satellite': 7.641903008653659e-05,
 'version': 4.4983656120045555e-06,
 'bright_t31': 0.05206133130007941,
 'frp': 0.020983539174514114,
 'daynight': 0.07929169720409712,
 'type': 0.0021671317867487626,
 'year': 5.7515955468228334e-05,
 'month': 0.0017135967608949373,
 'day': 4.2738759504822664e-05,
 'hour': 0.011057979676305358,
 'minute': 4.0620226353487834e-05,
 'month_sin': 0.0018008012516576735,
 'month_cos': 0.01845980313830326,
 'day_sin': 7.598213724924507e-05,
 'day_cos': 4.17711141230943e-05,
 'hour_sin': 4.986674373196467e-05,
 'hour_cos': 8.185308909855604e-05,
 'minute_sin': 3.676779533979882e-05,
 'minute_cos': 3.10508458724331e-05}

We got good results; with over a 85% accuracy score. The feature importances for month_cos, daynight, frp, bright_t31, and brightness have the highest values, with brightness being the highest at over 0.78 this means that brightness is likely the most significant variable for the target variable.