dt vs rf vs gb: https://www.datasciencecentral.com/profiles/blogs/decision-tree-vs-random-forest-vs-boosted-trees-explained
     rf hyper tune: https://medium.com/all-things-ai/in-depth-parameter-tuning-for-random-forest-d67bb7e920d
partial dependency: https://scikit-learn.org/stable/auto_examples/inspection/plot_partial_dependence.html#sphx-glr-auto-examples-inspection-plot-partial-dependence-py


# Load data from csv files

In [22]:
import pandas as pd

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

from sklearn.model_selection import GridSearchCV

import numpy as np

from tpot import TPOTClassifier

from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
from sklearn.utils.multiclass import unique_labels

from sklearn.inspection import partial_dependence
from sklearn.inspection import plot_partial_dependence

In [4]:
# Read data from csv files
label = pd.read_csv('BurnedArea-100k.csv')
elevation = pd.read_csv('Elevation-100k.csv')
forest = pd.read_csv('Forest-100k.csv')
humanMod = pd.read_csv('HumanModification-100k.csv')
leaf = pd.read_csv('LeafArea-100k.csv')
rain = pd.read_csv('Precipitation-100k.csv')
radiation = pd.read_csv('Radiation-100k.csv')
soil = pd.read_csv('SoilType-100k.csv')
temp = pd.read_csv('Temperature-100k.csv')

In [5]:
# Merge all dataframes via inner join
samples = pd.merge(left=elevation, right=forest, left_on='system:index', right_on='system:index')
samples = pd.merge(left=samples, right=humanMod, left_on='system:index', right_on='system:index')
samples = pd.merge(left=samples, right=leaf, left_on='system:index', right_on='system:index')
samples = pd.merge(left=samples, right=rain, left_on='system:index', right_on='system:index')
samples = pd.merge(left=samples, right=radiation, left_on='system:index', right_on='system:index')
samples = pd.merge(left=samples, right=soil, left_on='system:index', right_on='system:index')
samples = pd.merge(left=samples, right=temp, left_on='system:index', right_on='system:index')

fireIdx = label.copy()
del fireIdx['BurnDate']
del fireIdx['FirstDay']
del fireIdx['LastDay']
del fireIdx['QA']
del fireIdx['BurnMonth']

In [27]:
# Label samples
samplesCopy = samples.copy()
samplesCopy = pd.merge(left=fireIdx, right=samplesCopy, how='right', left_on='system:index', right_on='system:index')

sampleLabel = samplesCopy['Uncertainty'].fillna(100).tolist()
firstNoFireIdx = sampleLabel.index(100)

# Drop label column
del samplesCopy['Uncertainty']
# Drop system index
del samplesCopy['system:index']
samplesList = samplesCopy.values.tolist()
# Filter out fire samples
fireSamples = samplesList[:firstNoFireIdx]
fireUncertain = sampleLabel[:firstNoFireIdx]
# Filter out no fire samples
noFireSamples = samplesList[firstNoFireIdx:]
noFireLabel = sampleLabel[firstNoFireIdx:]

# Split the training datasets
X_train_fire, X_test_fire, y_train_fire, y_test_fire = train_test_split(fireSamples, fireUncertain, test_size=0.2, random_state=0)
X_train_nofire, X_test_nofire, y_train_nofire, y_test_nofire = train_test_split(noFireSamples, noFireLabel, test_size=0.9, random_state=42)

X_train = X_train_fire + X_train_nofire
y_train = y_train_fire + y_train_nofire
X_test = X_test_fire + X_test_nofire
y_test = y_test_fire + y_test_nofire

X_train, y_train = shuffle(X_train, y_train)

## One-level system

In [8]:
# 'scoring':'average_precision', 
tpot_param_dist={'generations':5, 'population_size':100, 'warm-start':True, 'memory':auto, 'verbosity':2, 'random_state':20, 'n_jobs':-1}
tpot = TPOTClassifier(**tpot_param_dist)
tpot.fit(np.asarray(X_train), np.asarray(y_train))

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=300.0, style=ProgressStyle(de…

Generation 1 - Current best internal CV score: 0.9679281782427575
Generation 2 - Current best internal CV score: 0.9679281782427575
Generation 3 - Current best internal CV score: 0.9679281782427575
Generation 4 - Current best internal CV score: 0.9711127754483524
Generation 5 - Current best internal CV score: 0.971340306733904

Best pipeline: XGBClassifier(input_matrix, learning_rate=0.5, max_depth=9, min_child_weight=2, n_estimators=100, nthread=1, subsample=0.8)


TPOTClassifier(config_dict=None, crossover_rate=0.1, cv=5,
               disable_update_check=False, early_stop=None, generations=5,
               max_eval_time_mins=5, max_time_mins=None, memory=None,
               mutation_rate=0.9, n_jobs=1, offspring_size=None,
               periodic_checkpoint_folder=None, population_size=50,
               random_state=20, scoring=None, subsample=1.0, template=None,
               use_dask=False, verbosity=2, warm_start=False)

In [12]:
print(tpot.score(np.asarray(X_test_fire), np.asarray(y_test_fire_bool)))

0.7962085308056872


## XGBClassifier

In [15]:
import xgboost.sklearn as xgb

In [20]:
param_dist={'learning_rate':0.5, 'max_depth':9, 'min_child_weight':2, 'n_estimators':100, 'nthread':1, 'subsample':0.8}
xgb_fnf = xgb.XGBClassifier(**param_dist)
xgb_fnf = xgb_fnf.fit(np.asarray(X_train), np.asarray(y_train))
y_pred_xgb_fnf = xgb_fnf.predict(X_test)
print(classification_report(y_test, y_pred_xgb_fnf))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00     71584
           1       0.21      0.82      0.33       211

    accuracy                           0.99     71795
   macro avg       0.60      0.90      0.66     71795
weighted avg       1.00      0.99      0.99     71795

