First import libraries and load data, then transform data datetime to a proper datetime.

In [1]:
import pandas as pd #pandas for data
import os           #os for data path
import numpy as np  #np for math on data

path = os.getcwd() + "\\data\\fire_archive_M-C61_626683.csv.xz" #set data path
data = pd.read_csv(path) #import from data path

from datetime import timedelta #import timedelta from datetime for datetime addition

data['acq_date'] = pd.to_datetime(data['acq_date']) #convert acq_date to datetime
data['acq_datetime'] = data['acq_date'] + pd.Series(
    [timedelta(minutes=i % 100, hours=i // 100) for i in data['acq_time']]) #add acq_time by getting hours as any numbers above or equal to 100 and minutes as anything under 100
data.drop('acq_time', axis=1, inplace=True)  #drop acq_time
data.drop('acq_date', axis=1, inplace=True)  #drop acq_time
data.drop('instrument', axis=1, inplace=True)#drop instrument because it is a constant and is not usefull
data #show data

Unnamed: 0,latitude,longitude,brightness,scan,track,satellite,confidence,version,bright_t31,frp,daynight,type,acq_datetime
0,38.5422,-78.3047,304.8,2.8,1.6,Terra,23,6.03,280.9,40.3,N,0,2000-11-01 02:50:00
1,38.5451,-78.3107,309.9,2.8,1.6,Terra,79,6.03,280.7,58.8,N,0,2000-11-01 02:50:00
2,38.5563,-78.3084,309.4,2.8,1.6,Terra,70,6.03,280.4,54.5,N,0,2000-11-01 02:50:00
3,38.5586,-78.3170,302.3,2.8,1.6,Terra,45,6.03,279.8,36.0,N,0,2000-11-01 02:50:00
4,31.3393,-89.9124,304.9,1.0,1.0,Terra,62,6.03,287.5,8.5,N,0,2000-11-01 04:27:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2960417,41.6966,-99.1437,319.8,1.1,1.0,Aqua,80,61.03,284.7,21.2,D,0,2025-01-31 20:28:00
2960418,42.4419,-94.3783,300.6,1.2,1.1,Aqua,40,61.03,284.3,6.3,D,0,2025-01-31 20:28:00
2960419,41.4014,-97.9485,319.9,1.0,1.0,Aqua,80,61.03,284.9,19.0,D,0,2025-01-31 20:28:00
2960420,41.4032,-97.9369,322.9,1.0,1.0,Aqua,82,61.03,285.0,22.1,D,0,2025-01-31 20:28:00


next I am going to add year, month, day, hour, and minute as well as their sin/cos variants to represent relative distance between times e.g. month 12 to month 1 is the same distance as month 7 to month 8.

In [2]:
#setup year, month, day, hour, and minute from the datetime setup in previous code
data['year'] = data['acq_datetime'].dt.year
data['month'] = data['acq_datetime'].dt.month
data['day'] = data['acq_datetime'].dt.day
data['hour'] = data['acq_datetime'].dt.hour
data['minute'] = data['acq_datetime'].dt.minute

#get sin/cos of month, day, hour, and minute using np
data['month_sin'] = np.sin(2 * np.pi * data['month'] / 12)
data['month_cos'] = np.cos(2 * np.pi * data['month'] / 12)
data['day_sin'] = np.sin(2 * np.pi * data['day'] / 31)
data['day_cos'] = np.cos(2 * np.pi * data['day'] / 31)
data['hour_sin'] = np.sin(2 * np.pi * data['hour'] / 24)
data['hour_cos'] = np.cos(2 * np.pi * data['hour'] / 24)
data['minute_sin'] = np.sin(2 * np.pi * data['minute'] / 60)
data['minute_cos'] = np.cos(2 * np.pi * data['minute'] / 60)

next I get the confidence in a categorical form by binning it.

In [3]:
data['confidence_binned'] = pd.cut(data['confidence'], bins=[-1, 60, 80, 101], labels=['l', 'n', 'h']) #binning data into l(low), n(nominal), and h(high) confidence levels

next I set up the data for the models by importing libraries, and setting up dependent/independent variables. P.S. I am excluding type 1 from the data because it represents a volcano and not a fire.

In [4]:
from sklearn.model_selection import train_test_split #import train_test_split to get training and testing sets of data
from sklearn.preprocessing import PolynomialFeatures #import polynomial features for non-linear relationships between dependent and independent variables
from sklearn.metrics import accuracy_score, classification_report #import accuracy_score and classification_report for model analysis

X = data[data['type'] != 1].drop(['confidence', 'acq_datetime', 'confidence_binned'], axis=1) #set up independent variables

categorical = ['satellite', 'daynight', 'version', 'type'] #array of categorical variables for transformation into numerical
for cat in categorical: #iterate through categorical array
    X[cat] = X[cat].map({name:i for i, name in enumerate(X[cat].unique())}) #map variables by unique entries to a number

y = data[data['type'] != 1]['confidence_binned'] #set confidence_binned as target/dependent variable

poly = PolynomialFeatures(degree=2) #setup polynomial features on the second degree

X_poly = poly.fit_transform(X) #transform independent variables to polynomials for non-linear relationships

next I set up the train/test sets of data

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_poly_train, X_poly_test, _, _ = train_test_split(X, y, test_size=0.2, random_state=42) #y_train and y_test is same as previous so ignore them

next I import LogisticRegression from sklearn and run the model with polynomial data and normal data. I use saga as the solver because the amount of data is very large.

In [9]:
from sklearn.linear_model import LogisticRegression

#setup model and fit data
regressor=LogisticRegression(solver='saga')
regressor.fit(X_train,y_train)

#get predictions
y_pred = regressor.predict(X_test)

#analyze model accuracy and get details
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


print()


print("polynomial model")

#setup polynomial model and fit data
regressor=LogisticRegression(solver='saga')
regressor.fit(X_poly_train,y_train)

#get predictions
y_pred = regressor.predict(X_poly_test)

#analyze model accuracy and get details
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))



Accuracy: 0.7005643362343319
              precision    recall  f1-score   support

           h       0.85      0.84      0.84    185618
           l       0.65      0.74      0.70    203117
           n       0.60      0.52      0.56    189112

    accuracy                           0.70    577847
   macro avg       0.70      0.70      0.70    577847
weighted avg       0.70      0.70      0.70    577847


polynomial model




Accuracy: 0.7005626056724358
              precision    recall  f1-score   support

           h       0.85      0.84      0.84    185618
           l       0.65      0.74      0.70    203117
           n       0.60      0.52      0.56    189112

    accuracy                           0.70    577847
   macro avg       0.70      0.70      0.70    577847
weighted avg       0.70      0.70      0.70    577847



The accuracy is above 70% and the models are almost identical with the polynomial model being better by a negligible amount.

next I set up a decision tree and get the accuracy and report as well as the feature importances.

In [7]:
from sklearn.tree import DecisionTreeClassifier #import DecisionTreeClassifier

#setup and fit decision tree model
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

#get predictions
y_pred = dt.predict(X_test)

#get accuracy, report, and feature importances
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

{c: f for c, f in zip(X.columns, dt.feature_importances_)}

Accuracy: 0.746941664489043
              precision    recall  f1-score   support

           h       0.86      0.85      0.86    185618
           l       0.73      0.73      0.73    203117
           n       0.66      0.66      0.66    189112

    accuracy                           0.75    577847
   macro avg       0.75      0.75      0.75    577847
weighted avg       0.75      0.75      0.75    577847



{'latitude': 0.059573445324816725,
 'longitude': 0.06210134221683668,
 'brightness': 0.44361623953719487,
 'scan': 0.026976670466270832,
 'track': 0.006982541029488504,
 'satellite': 0.0021309147209667826,
 'version': 0.0009724089636609864,
 'bright_t31': 0.07263093289657646,
 'frp': 0.09371486075801229,
 'daynight': 0.03226591536150576,
 'type': 0.003956404835810346,
 'year': 0.022750704108070226,
 'month': 0.006330561823209275,
 'day': 0.01787585440315591,
 'hour': 0.008993941849912115,
 'minute': 0.01882762088410438,
 'month_sin': 0.010442302056256866,
 'month_cos': 0.022840755883060084,
 'day_sin': 0.02102306981925181,
 'day_cos': 0.01682603031496115,
 'hour_sin': 0.00421539463342919,
 'hour_cos': 0.0044385855259681944,
 'minute_sin': 0.020610465216377424,
 'minute_cos': 0.01990303737110314}

Next I import some libraries to tune hyperparameters and iterate through all options of a param_grid(parameter name than a array of different options) and print results.

The code below is taking too long for me to include results as of now

In [None]:
from sklearn.model_selection import GridSearchCV #import GridSearchCV which will iterate through different combinations of the param_grid
import multiprocessing #import multiprocessing to prevent use of all computer cores and cause program to fail due to lack of computer resources

#setup param_grid
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [10, 15, 20],
    'min_samples_split': [5, 10, 20],
    'min_samples_leaf': [2, 5, 10]
}

#create decision tree
dt = DecisionTreeClassifier(random_state=42)

#get the number of cores the computer has minus 1
n_jobs = max(1, multiprocessing.cpu_count() - 1)
#setup GridSearchCV
grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=3, scoring='accuracy')

#fit the GridSearchCV to the data
grid_search.fit(X_train, y_train)

#get the best parameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

#print best parameters and model
print(best_params)
print(best_model)

Next I use the best parameters for the model.

In [8]:
dt = DecisionTreeClassifier(random_state=42, criterion='gini', max_depth=10) #setup decision tree with best parameters
dt.fit(X_train, y_train) #fit decision tree model

y_pred = dt.predict(X_test) #predict results

#print accuracy and details
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

#show feature importances
{c: f for c, f in zip(X.columns, dt.feature_importances_)}

Accuracy: 0.7768440434924816
              precision    recall  f1-score   support

           h       0.82      0.96      0.88    185618
           l       0.80      0.69      0.74    203117
           n       0.70      0.70      0.70    189112

    accuracy                           0.78    577847
   macro avg       0.78      0.78      0.77    577847
weighted avg       0.78      0.78      0.77    577847



{'latitude': 0.007319868276604551,
 'longitude': 0.017444345866559064,
 'brightness': 0.7272516706553218,
 'scan': 0.014093512133096415,
 'track': 0.0015666467240539738,
 'satellite': 0.0,
 'version': 0.0,
 'bright_t31': 0.03937144105382471,
 'frp': 0.07988395201036479,
 'daynight': 0.05875329465925075,
 'type': 0.0033883661578079714,
 'year': 8.651828418442294e-05,
 'month': 0.0026325353978673637,
 'day': 4.6406616925535576e-05,
 'hour': 0.00812145167854428,
 'minute': 4.9653364955112836e-05,
 'month_sin': 0.008034702177709581,
 'month_cos': 0.031641069354403084,
 'day_sin': 4.7713016094113525e-05,
 'day_cos': 7.502984140865522e-06,
 'hour_sin': 4.5485659698537946e-05,
 'hour_cos': 0.00013949520992502913,
 'minute_sin': 2.1140302451466874e-05,
 'minute_cos': 5.322841621663485e-05}

We got good results; with over a 77% accuracy score. The feature importances for month_cos, daynight, frp, bright_t31, and brightness have the highest values, with brightness being the highest at over 0.72 this means that brightness is likely the most significant variable for the target variable.