First I import the libraries and load the dataset.

In [13]:
import pandas as pd
import os

# Build the path from separate components so the directory separator is
# correct on every operating system (the hard-coded "\\" separators only
# resolved on Windows).
path = os.path.join(os.getcwd(), "data", "fire_archive_M-C61_626683.csv.xz")
data = pd.read_csv(path)

# Show the column names to get a general idea of the data.
data.columns

Index(['latitude', 'longitude', 'brightness', 'scan', 'track', 'acq_date',
       'acq_time', 'satellite', 'instrument', 'confidence', 'version',
       'bright_t31', 'frp', 'daynight', 'type'],
      dtype='object')

Next, I combine acq_date (acquisition date) and acq_time (acquisition time) into a single acq_datetime column, which makes time analysis easier. Note that the cell then normalizes the result to midnight, so only the date part is retained.

In [14]:
from datetime import timedelta

# Parse the acquisition date strings into pandas datetimes.
data['acq_date'] = pd.to_datetime(data['acq_date'])

# acq_time is treated as an HHMM-encoded integer: the hundreds are hours and
# the remainder is minutes. Building the offset with vectorised to_timedelta
# calls on the column itself avoids creating one Python timedelta per row and
# keeps the offset aligned with data's own index.
acq_time = data['acq_time']
time_offset = (
    pd.to_timedelta(acq_time // 100, unit='h')
    + pd.to_timedelta(acq_time % 100, unit='min')
)
data['acq_datetime'] = data['acq_date'] + time_offset

# NOTE(review): normalize() resets every timestamp to midnight, which discards
# the time of day that was just added. It is kept so the date-only values used
# by the following cells stay the same -- confirm whether the time component
# should be preserved instead.
data['acq_datetime'] = data['acq_datetime'].dt.normalize()

# Remove redundant columns.
data = data.drop(columns=['acq_time', 'acq_date', 'instrument'])
# Show data after adding datetime and removing redundant columns for verification.
data

Unnamed: 0,latitude,longitude,brightness,scan,track,satellite,confidence,version,bright_t31,frp,daynight,type,acq_datetime
0,38.5422,-78.3047,304.8,2.8,1.6,Terra,23,6.03,280.9,40.3,N,0,2000-11-01
1,38.5451,-78.3107,309.9,2.8,1.6,Terra,79,6.03,280.7,58.8,N,0,2000-11-01
2,38.5563,-78.3084,309.4,2.8,1.6,Terra,70,6.03,280.4,54.5,N,0,2000-11-01
3,38.5586,-78.3170,302.3,2.8,1.6,Terra,45,6.03,279.8,36.0,N,0,2000-11-01
4,31.3393,-89.9124,304.9,1.0,1.0,Terra,62,6.03,287.5,8.5,N,0,2000-11-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2960417,41.6966,-99.1437,319.8,1.1,1.0,Aqua,80,61.03,284.7,21.2,D,0,2025-01-31
2960418,42.4419,-94.3783,300.6,1.2,1.1,Aqua,40,61.03,284.3,6.3,D,0,2025-01-31
2960419,41.4014,-97.9485,319.9,1.0,1.0,Aqua,80,61.03,284.9,19.0,D,0,2025-01-31
2960420,41.4032,-97.9369,322.9,1.0,1.0,Aqua,82,61.03,285.0,22.1,D,0,2025-01-31


Next, I bin the confidence into low (l: 0–30), nominal (n: 31–80), and high (h: 81–100). This makes the variable categorical, which can be easier to analyze.

In [15]:
# Bin edges and labels for pd.cut: values in (-1, 30] become 'l',
# (30, 80] become 'n' and (80, 101] become 'h'.
confidence_edges = [-1, 30, 80, 101]
confidence_labels = ['l', 'n', 'h']
data['confidence_binned'] = pd.cut(
    data['confidence'],
    bins=confidence_edges,
    labels=confidence_labels,
)

Then I show the data to verify the new confidence_binned column.

In [16]:
# Display the frame so the newly added confidence_binned column can be checked.
data

Unnamed: 0,latitude,longitude,brightness,scan,track,satellite,confidence,version,bright_t31,frp,daynight,type,acq_datetime,confidence_binned
0,38.5422,-78.3047,304.8,2.8,1.6,Terra,23,6.03,280.9,40.3,N,0,2000-11-01,l
1,38.5451,-78.3107,309.9,2.8,1.6,Terra,79,6.03,280.7,58.8,N,0,2000-11-01,n
2,38.5563,-78.3084,309.4,2.8,1.6,Terra,70,6.03,280.4,54.5,N,0,2000-11-01,n
3,38.5586,-78.3170,302.3,2.8,1.6,Terra,45,6.03,279.8,36.0,N,0,2000-11-01,n
4,31.3393,-89.9124,304.9,1.0,1.0,Terra,62,6.03,287.5,8.5,N,0,2000-11-01,n
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2960417,41.6966,-99.1437,319.8,1.1,1.0,Aqua,80,61.03,284.7,21.2,D,0,2025-01-31,n
2960418,42.4419,-94.3783,300.6,1.2,1.1,Aqua,40,61.03,284.3,6.3,D,0,2025-01-31,n
2960419,41.4014,-97.9485,319.9,1.0,1.0,Aqua,80,61.03,284.9,19.0,D,0,2025-01-31,n
2960420,41.4032,-97.9369,322.9,1.0,1.0,Aqua,82,61.03,285.0,22.1,D,0,2025-01-31,h


Next, I want to see if there are any significant correlations between the variables.

In [17]:
# Work on a copy so the integer encoding below never touches `data`.
temp_data = data.copy()

# Label-like columns: each distinct value is replaced by an integer code
# assigned in order of first appearance, so the column can enter corr().
categorical = ['satellite', 'version', 'daynight', 'confidence_binned']
for column in categorical:
    levels = temp_data[column].unique()
    codes = dict(zip(levels, range(len(levels))))
    temp_data[column] = temp_data[column].map(codes)

# Pairwise correlation matrix of all columns.
temp_data.corr()

Unnamed: 0,latitude,longitude,brightness,scan,track,satellite,confidence,version,bright_t31,frp,daynight,type,acq_datetime,confidence_binned
latitude,1.0,-0.630771,0.106153,0.007537,0.005959,-0.056559,0.028873,-0.043253,-0.083351,0.046068,-0.138474,-0.168761,-0.08201,0.054206
longitude,-0.630771,1.0,-0.255469,-0.055141,-0.055634,0.071663,-0.165178,0.068939,0.008175,-0.140732,0.358537,-0.006491,0.046174,-0.188572
brightness,0.106153,-0.255469,1.0,-0.039608,-0.038723,0.057953,0.606423,-0.026611,0.588255,0.647182,0.051976,-0.051158,0.023178,0.595169
scan,0.007537,-0.055141,-0.039608,1.0,0.983202,0.002463,-0.031408,-0.012391,-0.078385,0.206488,0.007138,-0.052716,0.001144,-0.030899
track,0.005959,-0.055634,-0.038723,0.983202,1.0,0.003405,-0.030218,-0.011942,-0.079343,0.202717,0.007882,-0.055961,0.002126,-0.030066
satellite,-0.056559,0.071663,0.057953,0.002463,0.003405,1.0,0.014991,0.02146,0.107998,0.0209,0.1808,-0.048801,0.051065,0.007064
confidence,0.028873,-0.165178,0.606423,-0.031408,-0.030218,0.014991,1.0,-0.013252,0.270839,0.270278,-0.100826,-0.08148,0.019041,0.873073
version,-0.043253,0.068939,-0.026611,-0.012391,-0.011942,0.02146,-0.013252,1.0,-0.009753,-0.011755,0.016193,-0.005785,0.452202,-0.014975
bright_t31,-0.083351,0.008175,0.588255,-0.078385,-0.079343,0.107998,0.270839,-0.009753,1.0,0.333082,0.303581,-0.010224,0.007856,0.248586
frp,0.046068,-0.140732,0.647182,0.206488,0.202717,0.0209,0.270278,-0.011755,0.333082,1.0,-0.011848,-0.011488,0.026992,0.265238


There are a number of notable correlations, specifically between confidence and brightness (0.61), bright_t31 (0.27), and frp (0.27); the correlation between confidence and brightness is by far the strongest of these.

Next, I compute the VIF (Variance Inflation Factor) of each feature to check for multicollinearity.

In [18]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# Add the constant column, then exclude rows whose type is 1. The explicit
# .copy() makes data_vif an independent frame, so the column assignments
# below do not write into a filtered slice (SettingWithCopyWarning).
data_vif = add_constant(data)
data_vif = data_vif[data_vif['type'] != 1].copy()

# Replace each label-like column with integer codes assigned in order of
# first appearance so every column is numeric.
categorical = ['satellite', 'daynight', 'version', 'type', 'confidence_binned']
for cat in categorical:
    data_vif[cat] = data_vif[cat].map({name: i for i, name in enumerate(data_vif[cat].unique())})

# Express the timestamp as a proleptic Gregorian ordinal (a plain integer).
data_vif['acq_datetime'] = data_vif['acq_datetime'].apply(lambda x: x.toordinal())

# Materialise the design matrix once instead of once per column.
exog = data_vif.values
vif_data = pd.DataFrame({
    "Feature": data_vif.columns,
    "VIF": [variance_inflation_factor(exog, i) for i in range(exog.shape[1])],
})

print(vif_data)

              Feature            VIF
0               const  116035.861885
1            latitude       3.240463
2           longitude       3.613247
3          brightness       4.107575
4                scan      30.003901
5               track      29.910299
6           satellite       1.045990
7          confidence       4.420108
8             version       1.267944
9          bright_t31       1.817791
10                frp       2.049211
11           daynight       1.360565
12               type       1.043211
13       acq_datetime       1.288088
14  confidence_binned       4.317374


I notice that scan and track both have a VIF of about 30, so they seem to have a strong linear relationship with each other.

Next, I fit an OLS (Ordinary Least Squares) regression to check the relationship between the independent variables and the target variable.

In [23]:
import statsmodels.api as sm

# Rows to keep: everything whose type is not 1. The same mask is used for the
# predictors and the target so the two stay aligned.
keep_rows = data['type'] != 1

# Predictors: drop both forms of the target. The explicit .copy() makes
# temp_data an independent frame, so the column assignments below do not
# write into a filtered slice (SettingWithCopyWarning).
temp_data = (
    data.loc[keep_rows]
    .drop(columns=['confidence_binned', 'confidence'])
    .copy()
)

# Replace each label-like column with integer codes assigned in order of
# first appearance.
categorical = ['satellite', 'daynight', 'version', 'type']
for cat in categorical:
    temp_data[cat] = temp_data[cat].map({name: i for i, name in enumerate(temp_data[cat].unique())})

# Split the timestamp into numeric year / month / day predictors.
temp_data['year'] = temp_data['acq_datetime'].dt.year
temp_data['month'] = temp_data['acq_datetime'].dt.month
temp_data['day'] = temp_data['acq_datetime'].dt.day
temp_data = temp_data.drop(columns=['acq_datetime'])

# Fit confidence on the predictors plus an intercept column.
X = sm.add_constant(temp_data)
y = data.loc[keep_rows, 'confidence']
model = sm.OLS(y, X).fit()

model.summary()

0,1,2,3
Dep. Variable:,confidence,R-squared:,0.432
Model:,OLS,Adj. R-squared:,0.432
Method:,Least Squares,F-statistic:,157000.0
Date:,"Mon, 21 Jul 2025",Prob (F-statistic):,0.0
Time:,15:57:14,Log-Likelihood:,-12327000.0
No. Observations:,2889231,AIC:,24650000.0
Df Residuals:,2889216,BIC:,24660000.0
Df Model:,14,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-63.9364,3.460,-18.478,0.000,-70.718,-57.155
latitude,-0.1321,0.002,-73.862,0.000,-0.136,-0.129
longitude,0.0419,0.001,44.634,0.000,0.040,0.044
brightness,0.8246,0.001,1178.326,0.000,0.823,0.826
scan,1.3613,0.068,19.966,0.000,1.228,1.495
track,-0.3480,0.223,-1.559,0.119,-0.785,0.089
satellite,-0.1085,0.021,-5.224,0.000,-0.149,-0.068
version,0.5833,0.043,13.550,0.000,0.499,0.668
bright_t31,-0.2574,0.001,-182.003,0.000,-0.260,-0.255

0,1,2,3
Omnibus:,803668.298,Durbin-Watson:,1.655
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2752516.82
Skew:,-1.394,Prob(JB):,0.0
Kurtosis:,6.884,Cond. No.,704000.0


I notice that every independent variable shown is statistically significant (P-value under 0.05) except track, whose P-value is 0.119.

Next I use OLS with binned confidence

In [25]:
# Integer codes for the binned target labels.
bin_codes = {'l': 0, 'n': 1, 'h': 2}
not_type_one = data['type'] != 1
y = data.loc[not_type_one, 'confidence_binned'].map(bin_codes)

# Refit against the design matrix X built in the previous OLS cell.
model = sm.OLS(y, X).fit()

model.summary()

0,1,2,3
Dep. Variable:,confidence_binned,R-squared:,0.419
Model:,OLS,Adj. R-squared:,0.419
Method:,Least Squares,F-statistic:,148600.0
Date:,"Mon, 21 Jul 2025",Prob (F-statistic):,0.0
Time:,15:58:39,Log-Likelihood:,-1663800.0
No. Observations:,2889231,AIC:,3328000.0
Df Residuals:,2889216,BIC:,3328000.0
Df Model:,14,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-2.3535,0.086,-27.260,0.000,-2.523,-2.184
latitude,-0.0023,4.46e-05,-52.223,0.000,-0.002,-0.002
longitude,0.0006,2.34e-05,27.179,0.000,0.001,0.001
brightness,0.0202,1.75e-05,1155.901,0.000,0.020,0.020
scan,0.0378,0.002,22.246,0.000,0.035,0.041
track,-0.0263,0.006,-4.726,0.000,-0.037,-0.015
satellite,-0.0057,0.001,-11.084,0.000,-0.007,-0.005
version,0.0101,0.001,9.389,0.000,0.008,0.012
bright_t31,-0.0074,3.53e-05,-210.651,0.000,-0.008,-0.007

0,1,2,3
Omnibus:,324005.492,Durbin-Watson:,1.73
Prob(Omnibus):,0.0,Jarque-Bera (JB):,839585.768
Skew:,-0.644,Prob(JB):,0.0
Kurtosis:,5.306,Cond. No.,704000.0


I see very similar results with this model: all of the independent variables shown are statistically significant (P-values under 0.05), this time including track.