- Decision tree vs. random forest vs. gradient boosting: https://www.datasciencecentral.com/profiles/blogs/decision-tree-vs-random-forest-vs-boosted-trees-explained
- Random forest hyperparameter tuning: https://medium.com/all-things-ai/in-depth-parameter-tuning-for-random-forest-d67bb7e920d
- Partial dependence plots: https://scikit-learn.org/stable/auto_examples/inspection/plot_partial_dependence.html#sphx-glr-auto-examples-inspection-plot-partial-dependence-py


# Load data from csv files

In [1]:
import pandas as pd

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

from sklearn.model_selection import GridSearchCV

import numpy as np

from tpot import TPOTClassifier

import xgboost.sklearn as xgb

from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
from sklearn.utils.multiclass import unique_labels

from sklearn.inspection import partial_dependence
from sklearn.inspection import plot_partial_dependence

In [2]:
# Load the burned-area table and the eight predictor tables, one CSV per table.
# Paths are relative to the notebook's working directory; files are read in the
# order listed and bound to the names on the left in that same order.
(label, elevation, forest, humanMod, leaf,
 rain, radiation, soil, temp) = map(pd.read_csv, (
    'BurnedArea-100k.csv',
    'Elevation-100k.csv',
    'Forest-100k.csv',
    'HumanModification-100k.csv',
    'LeafArea-100k.csv',
    'Precipitation-100k.csv',
    'Radiation-100k.csv',
    'SoilType-100k.csv',
    'Temperature-100k.csv',
))

In [3]:
# Inner-join every predictor table onto the elevation table, keyed on
# 'system:index' (pd.merge defaults to how='inner').
samples = elevation
for layer_df in (forest, humanMod, leaf, rain, radiation, soil, temp):
    samples = pd.merge(left=samples, right=layer_df, left_on='system:index', right_on='system:index')

# Build the fire lookup table from the burned-area data without the columns
# that are not used further on. drop() returns a new frame, so `label` itself
# is left untouched.
fireIdx = label.drop(columns=['BurnDate', 'FirstDay', 'LastDay', 'QA', 'BurnMonth'])

In [12]:
# Keep only burn records whose uncertainty is below 11
# (i.e. discard anything with an uncertainty larger than 10).
lowUncertainty = fireIdx['Uncertainty'].lt(11)
fireIdx = fireIdx.loc[lowUncertainty]
print(fireIdx['Uncertainty'].value_counts())

2     311
1     277
3     158
4     105
5      73
6      34
7      31
8      24
10     11
9       9
Name: Uncertainty, dtype: int64


In [13]:
# Label samples.
# Right-join the burn uncertainties onto every sample: samples that have a burn
# record keep their uncertainty value (the fire classes), all others get NaN.
samplesCopy = samples.copy()
samplesCopy = pd.merge(left=fireIdx, right=samplesCopy, how='right', left_on='system:index', right_on='system:index')

# Put the fire rows first explicitly. The slicing further down requires every
# fire row to precede every no-fire row; a right merge is documented to preserve
# the key order of the *right* frame, so that ordering must not be taken for
# granted. Relative order inside each group is kept.
isFire = samplesCopy['Uncertainty'].notna()
samplesCopy = pd.concat([samplesCopy[isFire], samplesCopy[~isFire]], ignore_index=True)

# 100 is the sentinel class for "no fire" (real uncertainties were filtered to
# values below 11 before this cell).
sampleLabel = samplesCopy['Uncertainty'].fillna(100).tolist()
# Number of fire rows == position of the first no-fire row after the reordering
firstNoFireIdx = int(isFire.sum())
if firstNoFireIdx == 0 or firstNoFireIdx == len(sampleLabel):
    raise ValueError('Need at least one fire and one no-fire sample to build the datasets')

# Drop label column
del samplesCopy['Uncertainty']
# Drop system index
del samplesCopy['system:index']
samplesList = samplesCopy.values.tolist()
# Filter out fire samples
fireSamples = samplesList[:firstNoFireIdx]
fireUncertain = sampleLabel[:firstNoFireIdx]
# Filter out no fire samples
noFireSamples = samplesList[firstNoFireIdx:]
noFireLabel = sampleLabel[firstNoFireIdx:]

# Split the training datasets.
# Fire samples: 80 % train / 20 % test.
# No-fire samples: only 2 % go to training (test_size=0.98), which keeps the
# majority class from swamping the fire classes in the training set.
X_train_fire, X_test_fire, y_train_fire, y_test_fire = train_test_split(fireSamples, fireUncertain, test_size=0.2, random_state=0)
X_train_nofire, X_test_nofire, y_train_nofire, y_test_nofire = train_test_split(noFireSamples, noFireLabel, test_size=0.98, random_state=42)

X_train = X_train_fire + X_train_nofire
y_train = y_train_fire + y_train_nofire
X_test = X_test_fire + X_test_nofire
y_test = y_test_fire + y_test_nofire

# Seeded so the training order (and therefore any order-dependent CV folds) is
# identical on every Restart & Run All.
X_train, y_train = shuffle(X_train, y_train, random_state=0)

In [14]:
# Repeat the labelling join on a fresh copy of the samples to inspect the class
# balance; rows without a burn record are counted under the sentinel 100.
samplesCopy2 = fireIdx.merge(samples.copy(), how='right', left_on='system:index', right_on='system:index')

sampleLabel2 = samplesCopy2['Uncertainty'].fillna(100)
print(sampleLabel2.value_counts())

100.0    79567
2.0        307
1.0        274
3.0        156
4.0        102
5.0         73
6.0         34
7.0         31
8.0         24
10.0        11
9.0          9
Name: Uncertainty, dtype: int64


In [33]:
# Training-set sizes, one per line: fire samples first, then no-fire samples
print(len(y_train_fire), len(y_train_nofire), sep='\n')

840
1590


## One-level system

In [15]:
# Let TPOT search for a single pipeline that predicts the uncertainty class
# (including the no-fire sentinel) directly from the features.
# A 'scoring': 'average_precision' entry was considered but is left disabled.
tpot_param_dist = dict(
    generations=5,
    population_size=100,
    warm_start=True,
    memory='auto',
    verbosity=2,
    random_state=20,
    n_jobs=-1,
)
tpot = TPOTClassifier(**tpot_param_dist)
tpot.fit(np.asarray(X_train), np.asarray(y_train))

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=600.0, style=ProgressStyle(de…

Generation 1 - Current best internal CV score: 0.7756708103067117
Generation 2 - Current best internal CV score: 0.7756708103067117
Generation 3 - Current best internal CV score: 0.7756708103067117
Generation 4 - Current best internal CV score: 0.7756708103067117
Generation 5 - Current best internal CV score: 0.7756708103067117

Best pipeline: XGBClassifier(input_matrix, learning_rate=0.5, max_depth=10, min_child_weight=4, n_estimators=100, nthread=1, subsample=0.55)


TPOTClassifier(config_dict=None, crossover_rate=0.1, cv=5,
               disable_update_check=False, early_stop=None, generations=5,
               max_eval_time_mins=5, max_time_mins=None, memory='auto',
               mutation_rate=0.9, n_jobs=-1, offspring_size=None,
               periodic_checkpoint_folder=None, population_size=100,
               random_state=20, scoring=None, subsample=1.0, template=None,
               use_dask=False, verbosity=2, warm_start=True)

In [16]:
# Score the fitted TPOT pipeline on the held-out fire samples only
fireTestFeatures = np.asarray(X_test_fire)
fireTestLabels = np.asarray(y_test_fire)
print(tpot.score(fireTestFeatures, fireTestLabels))

0.3804878048780488


In [38]:
# Show the pipeline object that TPOT fitted as its final model
bestPipeline = tpot.fitted_pipeline_
print(bestPipeline)

Pipeline(memory=Memory(location=C:\Users\Yi\AppData\Local\Temp\tmp6nzwpf8p\joblib),
         steps=[('xgbclassifier',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, gamma=0, learning_rate=0.1,
                               max_delta_step=0, max_depth=10,
                               min_child_weight=4, missing=None,
                               n_estimators=100, n_jobs=1, nthread=1,
                               objective='multi:softprob', random_state=20,
                               reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
                               seed=None, silent=None,
                               subsample=0.6500000000000001, verbosity=1))],
         verbose=False)


## XGBClassifier

In [20]:
# Train a stand-alone XGBoost classifier on the same training data.
# NOTE(review): the TPOT log above reports subsample=0.55 for its best
# pipeline; 0.8 is used here -- confirm that the difference is intentional.
param_dist = dict(
    learning_rate=0.5,
    max_depth=10,
    min_child_weight=4,
    n_estimators=100,
    nthread=1,
    subsample=0.8,
)
# fit() returns the estimator itself, so the fitted model is bound directly
xgb_uncert = xgb.XGBClassifier(**param_dist).fit(np.asarray(X_train), np.asarray(y_train))

              precision    recall  f1-score   support

         1.0       0.02      0.41      0.04        54
         2.0       0.05      0.55      0.09        62
         3.0       0.04      0.31      0.06        29
         4.0       0.04      0.35      0.08        23
         5.0       0.05      0.24      0.08        17
         6.0       0.00      0.00      0.00         8
         7.0       0.05      0.25      0.08         8
         8.0       0.00      0.00      0.00         2
         9.0       0.00      0.00      0.00         2
        10.0       0.00      0.00      0.00         0
       100.0       1.00      0.97      0.99     77976

    accuracy                           0.97     78181
   macro avg       0.11      0.28      0.13     78181
weighted avg       1.00      0.97      0.98     78181



In [21]:
# Evaluate the XGBoost model on the held-out fire samples only.
# X_test_fire is a plain list of rows; convert it to an ndarray, as done for
# every other fit/score call in this notebook, because the model was trained
# on an ndarray and list input is not accepted by every xgboost release.
y_pred_xgb_uncert = xgb_uncert.predict(np.asarray(X_test_fire))
print(classification_report(y_test_fire, y_pred_xgb_uncert))

              precision    recall  f1-score   support

         1.0       0.52      0.41      0.46        54
         2.0       0.59      0.55      0.57        62
         3.0       0.45      0.31      0.37        29
         4.0       0.40      0.35      0.37        23
         5.0       0.31      0.24      0.27        17
         6.0       0.00      0.00      0.00         8
         7.0       0.50      0.25      0.33         8
         8.0       0.00      0.00      0.00         2
         9.0       0.00      0.00      0.00         2
        10.0       0.00      0.00      0.00         0
       100.0       0.00      0.00      0.00         0

    accuracy                           0.39       205
   macro avg       0.25      0.19      0.21       205
weighted avg       0.47      0.39      0.42       205

