In [93]:
import numpy as np, pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

path = 'labelled_historical_modus.csv'

df = pd.read_csv(path, index_col=0)
# Display the first 5 rows

df.head()

Unnamed: 0,latitude,longitude,brightness,scan,track,acq_date,acq_time,satellite,instrument,confidence,version,bright_t31,frp,daynight,type,LAT_ROUND,LON_ROUND,FIRE_YEAR,STAT_CAUSE_DESCR,DISCOVERY_DOY
0,19.353,-155.0576,400.1,1.0,1.0,2001-01-01,849,Terra,MODIS,100,6.2,311.8,255.8,N,2,19.35,-155.06,2001,,
1,19.3378,-155.036,311.6,1.0,1.0,2001-01-01,849,Terra,MODIS,83,6.2,296.9,9.8,N,2,19.34,-155.04,2001,,
2,19.3544,-155.0482,322.2,1.0,1.0,2001-01-01,849,Terra,MODIS,100,6.2,293.6,19.6,N,2,19.35,-155.05,2001,,
3,19.3468,-155.0375,362.5,1.0,1.0,2001-01-01,849,Terra,MODIS,100,6.2,303.1,92.9,N,2,19.35,-155.04,2001,,
4,19.3516,-155.067,313.6,1.0,1.0,2001-01-01,849,Terra,MODIS,87,6.2,291.4,11.8,N,2,19.35,-155.07,2001,,


In [94]:
df.shape

(1787135, 20)

In [95]:
# Label fires as rows without NANs in the STAT_CAUSE_DESCR column
df.loc[df['STAT_CAUSE_DESCR'].isna() != True, 'isFire'] = 1
df.loc[df['STAT_CAUSE_DESCR'].isna() == True, 'isFire'] = 0

df['acq_date'] = pd.to_datetime(df['acq_date'])
# df['acq_date'].apply(pd.Timestamp.to_julian_date())

# drop excess columns
df.drop(columns = ['LAT_ROUND', 'LON_ROUND','FIRE_YEAR', 'STAT_CAUSE_DESCR', 'DISCOVERY_DOY', 'instrument'], inplace=True)



In [96]:
df.head()

Unnamed: 0,latitude,longitude,brightness,scan,track,acq_date,acq_time,satellite,confidence,version,bright_t31,frp,daynight,type,isFire
0,19.353,-155.0576,400.1,1.0,1.0,2001-01-01,849,Terra,100,6.2,311.8,255.8,N,2,0.0
1,19.3378,-155.036,311.6,1.0,1.0,2001-01-01,849,Terra,83,6.2,296.9,9.8,N,2,0.0
2,19.3544,-155.0482,322.2,1.0,1.0,2001-01-01,849,Terra,100,6.2,293.6,19.6,N,2,0.0
3,19.3468,-155.0375,362.5,1.0,1.0,2001-01-01,849,Terra,100,6.2,303.1,92.9,N,2,0.0
4,19.3516,-155.067,313.6,1.0,1.0,2001-01-01,849,Terra,87,6.2,291.4,11.8,N,2,0.0


In [97]:
from sklearn.model_selection import TimeSeriesSplit

random_state = 314

X = df.drop(columns = 'isFire')
y = df[['acq_date', 'isFire']]


X_train = X[X['acq_date'].dt.year <= 2014]
y_train = y[y['acq_date'].dt.year <= 2014].drop(columns = 'acq_date')

X_test = X[X['acq_date'].dt.year > 2014]
y_test = y[y['acq_date'].dt.year > 2014].drop(columns = 'acq_date')

In [98]:
# Now I'll set up pipelines

# scikit-learn pipelines
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer

# feature processing
import category_encoders as ce
from sklearn.preprocessing import StandardScaler

# pre-processing pipeline
column_trans = ColumnTransformer(
    [('onehot', ce.OneHotEncoder(), ['satellite', 'daynight', 'type', 'version']),
    ('scale', StandardScaler(), ['brightness', 'track', 'scan', 'confidence', 'bright_t31', 'frp'])],
    n_jobs=-1, remainder='passthrough', verbose=True)


In [99]:
# we create classification pipelines
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier


pipelines ={ 
    'rfc' : make_pipeline(column_trans, RandomForestClassifier(random_state = random_state )),
    'gbc' : make_pipeline(column_trans, GradientBoostingClassifier(random_state = random_state))
}

for key, value in pipelines.items():
  value.fit(X_train, y_train)
  print(key + ' score: ', value.score(X_test, y_test))

TypeError: float() argument must be a string or a number, not 'Timestamp'

In [None]:
# suspiciously high accuracy, let's check ROC and f1
from sklearn.metrics import f1_score, roc_auc_score

for key, value in pipelines.items():
    y_preds = value.predict(X_test)
    print(key + ' f1 score: ', f1_score(y_test, y_preds))
    print(key + ' ROC score: ', roc_auc_score(y_test, y_preds))


In [89]:
# This shows that the model is not picking up the 'signal' from the fire data, it's just predicting the majority class
y_train['isFire'].value_counts()

0.0    1595084
1.0      44686
Name: isFire, dtype: int64

In [90]:
X_train

Unnamed: 0,latitude,longitude,brightness,scan,track,acq_date,acq_time,satellite,instrument,confidence,version,bright_t31,frp,daynight,type
0,19.3530,-155.0576,400.1,1.0,1.0,2001-01-01,849,Terra,MODIS,100,6.2,311.8,255.8,N,2
1,19.3378,-155.0360,311.6,1.0,1.0,2001-01-01,849,Terra,MODIS,83,6.2,296.9,9.8,N,2
2,19.3544,-155.0482,322.2,1.0,1.0,2001-01-01,849,Terra,MODIS,100,6.2,293.6,19.6,N,2
3,19.3468,-155.0375,362.5,1.0,1.0,2001-01-01,849,Terra,MODIS,100,6.2,303.1,92.9,N,2
4,19.3516,-155.0670,313.6,1.0,1.0,2001-01-01,849,Terra,MODIS,87,6.2,291.4,11.8,N,2
5,19.3620,-155.0590,371.1,1.0,1.0,2001-01-01,849,Terra,MODIS,100,6.2,307.4,121.8,N,2
6,19.3440,-155.0563,403.3,1.0,1.0,2001-01-01,849,Terra,MODIS,100,6.2,315.8,275.1,N,2
7,19.3351,-155.0549,315.4,1.0,1.0,2001-01-01,849,Terra,MODIS,91,6.2,292.2,13.0,N,2
8,19.3365,-155.0455,307.1,1.0,1.0,2001-01-01,849,Terra,MODIS,70,6.2,295.8,6.4,N,2
9,19.3454,-155.0469,368.2,1.0,1.0,2001-01-01,849,Terra,MODIS,100,6.2,309.3,110.3,N,2
