Decision tree vs. random forest vs. gradient boosting: https://www.datasciencecentral.com/profiles/blogs/decision-tree-vs-random-forest-vs-boosted-trees-explained
Random forest hyperparameter tuning: https://medium.com/all-things-ai/in-depth-parameter-tuning-for-random-forest-d67bb7e920d
Partial dependence plots: https://scikit-learn.org/stable/auto_examples/inspection/plot_partial_dependence.html#sphx-glr-auto-examples-inspection-plot-partial-dependence-py


# Load data from CSV files

In [1]:
import pandas as pd

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

from sklearn.model_selection import GridSearchCV

import numpy as np

from tpot import TPOTClassifier

from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
from sklearn.utils.multiclass import unique_labels

from sklearn.inspection import partial_dependence
from sklearn.inspection import plot_partial_dependence

In [2]:
# Load the burned-area label table and the eight feature tables.
# Each CSV is read into its own DataFrame, in the order listed here.
csvPaths = (
    'BurnedArea-100k.csv',
    'Elevation-100k.csv',
    'Forest-100k.csv',
    'HumanModification-100k.csv',
    'LeafArea-100k.csv',
    'Precipitation-100k.csv',
    'Radiation-100k.csv',
    'SoilType-100k.csv',
    'Temperature-100k.csv',
)
(label, elevation, forest, humanMod, leaf,
 rain, radiation, soil, temp) = map(pd.read_csv, csvPaths)

In [3]:
# Inner-join every feature table onto the elevation table, one after another,
# using the shared 'system:index' column as the join key.
samples = elevation
for featureTable in (forest, humanMod, leaf, rain, radiation, soil, temp):
    samples = samples.merge(featureTable, on='system:index')

# Work on a copy of the label table with the listed burn-record columns removed.
fireIdx = label.drop(columns=['BurnDate', 'FirstDay', 'LastDay', 'QA', 'BurnMonth'])

In [4]:
# Keep only the burn records whose Uncertainty value is below 11,
# then print how many records remain for each Uncertainty value.
lowUncertainty = fireIdx['Uncertainty'].lt(11)
fireIdx = fireIdx.loc[lowUncertainty]
print(fireIdx['Uncertainty'].value_counts())

2     311
1     277
3     158
4     105
5      73
6      34
7      31
8      24
10     11
9       9
Name: Uncertainty, dtype: int64


In [5]:
# Label samples
# Sentinel class given to samples that have no matching burn record.
NO_FIRE_LABEL = 100

# Right join: every sample row is kept; rows without a burn record get NaN in 'Uncertainty'.
samplesCopy = pd.merge(left=fireIdx, right=samples.copy(), how='right', left_on='system:index', right_on='system:index')

# Put the labelled (fire) rows first and the unlabelled (no-fire) rows after them.
# The positional slicing below needs the two groups to be contiguous, and the row
# order produced by a right merge is not guaranteed to put matched rows first, so
# the order is established explicitly here. Boolean selection keeps the relative
# order of the rows inside each group.
isNoFire = samplesCopy['Uncertainty'].isna()
samplesCopy = pd.concat([samplesCopy[~isNoFire], samplesCopy[isNoFire]], ignore_index=True)

sampleLabel = samplesCopy['Uncertainty'].fillna(NO_FIRE_LABEL).tolist()
# Number of fire rows == position of the first no-fire row after the reordering.
firstNoFireIdx = int((~isNoFire).sum())

# Drop the label column and the join key so that only feature columns remain
samplesCopy = samplesCopy.drop(columns=['Uncertainty', 'system:index'])
samplesList = samplesCopy.values.tolist()

# Fire samples and their Uncertainty labels
fireSamples = samplesList[:firstNoFireIdx]
fireUncertain = sampleLabel[:firstNoFireIdx]
# No-fire samples and their sentinel labels
noFireSamples = samplesList[firstNoFireIdx:]
noFireLabel = sampleLabel[firstNoFireIdx:]

# Split each group separately: 80 % of the fire samples go to training,
# but only 10 % of the no-fire samples do (test_size=0.9 holds out the rest).
X_train_fire, X_test_fire, y_train_fire, y_test_fire = train_test_split(fireSamples, fireUncertain, test_size=0.2, random_state=0)
X_train_nofire, X_test_nofire, y_train_nofire, y_test_nofire = train_test_split(noFireSamples, noFireLabel, test_size=0.9, random_state=42)

# Plain list concatenation: fire rows first, no-fire rows after
X_train = X_train_fire + X_train_nofire
y_train = y_train_fire + y_train_nofire
X_test = X_test_fire + X_test_nofire
y_test = y_test_fire + y_test_nofire

# Mix the two groups in the training set; seeded so a re-run gives the same order.
X_train, y_train = shuffle(X_train, y_train, random_state=0)

In [6]:
# Rebuild the labelled sample table and print the class distribution,
# with samples lacking a burn record counted under the placeholder label 100.
samplesCopy2 = fireIdx.merge(samples.copy(), how='right', on='system:index')

sampleLabel2 = samplesCopy2['Uncertainty'].fillna(100)
labelCounts = sampleLabel2.value_counts()
print(labelCounts)

100.0    79567
2.0        307
1.0        274
3.0        156
4.0        102
5.0         73
6.0         34
7.0         31
8.0         24
10.0        11
9.0          9
Name: Uncertainty, dtype: int64


In [33]:
# Number of training labels in the fire and no-fire partitions, one per line
for trainLabels in (y_train_fire, y_train_nofire):
    print(len(trainLabels))

840
1590


## One-level system

In [7]:
# Settings for the TPOT pipeline search.
# 'scoring':'average_precision',
tpot_param_dist = dict(
    generations=5,
    population_size=100,
    warm_start=True,
    memory='auto',
    verbosity=2,
    random_state=20,
    n_jobs=-1,
)
tpot = TPOTClassifier(**tpot_param_dist)

# TPOT is given NumPy arrays rather than the Python lists built earlier.
trainFeatures = np.asarray(X_train)
trainLabels = np.asarray(y_train)
tpot.fit(trainFeatures, trainLabels)

HBox(children=(IntProgress(value=0, description='Optimization Progress', max=600, style=ProgressStyle(descript…

Generation 1 - Current best internal CV score: 0.9321764052488642
Generation 2 - Current best internal CV score: 0.932750361349091
Generation 3 - Current best internal CV score: 0.932750361349091
Generation 4 - Current best internal CV score: 0.932750361349091
Generation 5 - Current best internal CV score: 0.932750361349091

Best pipeline: XGBClassifier(BernoulliNB(input_matrix, alpha=10.0, fit_prior=False), learning_rate=1.0, max_depth=9, min_child_weight=7, n_estimators=100, nthread=1, subsample=1.0)


TPOTClassifier(config_dict=None, crossover_rate=0.1, cv=5,
               disable_update_check=False, early_stop=None, generations=5,
               max_eval_time_mins=5, max_time_mins=None, memory='auto',
               mutation_rate=0.9, n_jobs=-1, offspring_size=None,
               periodic_checkpoint_folder=None, population_size=100,
               random_state=20, scoring=None, subsample=1.0, template=None,
               use_dask=False, verbosity=2, warm_start=True)

In [47]:
# Score the fitted TPOT model on the held-out fire samples only
fireTestFeatures = np.asarray(X_test_fire)
fireTestLabels = np.asarray(y_test_fire)
print(tpot.score(fireTestFeatures, fireTestLabels))

0.35071090047393366


In [8]:
# Show the pipeline that the TPOT search ended up fitting
bestPipeline = tpot.fitted_pipeline_
print(bestPipeline)

Pipeline(memory=Memory(location=C:\Users\Lenovo\AppData\Local\Temp\tmpp07_4aq1\joblib),
         steps=[('stackingestimator',
                 StackingEstimator(estimator=BernoulliNB(alpha=10.0,
                                                         binarize=0.0,
                                                         class_prior=None,
                                                         fit_prior=False))),
                ('xgbclassifier',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, gamma=0, learning_rate=1.0,
                               max_delta_step=0, max_depth=9,
                               min_child_weight=7, missing=None,
                               n_estimators=100, n_jobs=1, nthread=1,
                               objective='multi:softprob', random_state=20,
                               reg_alpha=0, reg_lambda=1, s

In [9]:
# Export the fitted pipeline to a Python script
exportPath = 'tpot_constrained_fire_uncertain_pipeline.py'
tpot.export(exportPath)

## Stacking Classifiers

In [10]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
from xgboost import XGBClassifier
from tpot.export_utils import set_param_recursive

In [11]:
# Average CV score on the training set was: 0.932750361349091
# Two-step pipeline: a BernoulliNB wrapped in a StackingEstimator, followed by an XGBoost classifier.
stackedNaiveBayes = StackingEstimator(estimator=BernoulliNB(alpha=10.0, fit_prior=False))
boostedTrees = XGBClassifier(
    learning_rate=1.0,
    max_depth=9,
    min_child_weight=7,
    n_estimators=100,
    nthread=1,
    subsample=1.0,
)
exported_pipeline = make_pipeline(stackedNaiveBayes, boostedTrees)

# Use the same random_state (20) for every step of the pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 20)

# Train on the full training set, then predict for the fire test samples only
exported_pipeline.fit(X_train, y_train)
results = exported_pipeline.predict(X_test_fire)

In [12]:
# Per-class precision / recall / F1 of the predictions on the fire test samples
fireReport = classification_report(y_test_fire, results)
print(fireReport)

  'recall', 'true', average, warn_for)


              precision    recall  f1-score   support

         1.0       0.53      0.33      0.41        54
         2.0       0.71      0.56      0.63        62
         3.0       0.50      0.34      0.41        29
         4.0       0.31      0.22      0.26        23
         5.0       0.33      0.18      0.23        17
         6.0       0.14      0.12      0.13         8
         7.0       0.33      0.25      0.29         8
         8.0       0.00      0.00      0.00         2
         9.0       0.00      0.00      0.00         2
        10.0       0.00      0.00      0.00         0
       100.0       0.00      0.00      0.00         0

    accuracy                           0.36       205
   macro avg       0.26      0.18      0.21       205
weighted avg       0.51      0.36      0.42       205

