# Gradient Boost Classifier (GBC) as model
*by Max*

In this notebook we will fit our data via GBC models. Here we use both, the raw band information and the calculated spectral indices. We start by loading the needed modules and the data.

In [14]:
# set the directory of the data
# depending on colab or vscode environment
on_colabs = True
import sys
if on_colabs:
  from google.colab import drive
  drive.mount('/content/drive')
  OUTPUT_DIR = "/content/drive/MyDrive/Radiant_Earth_Spot_Crop/data"
  sys.path.append("/content/drive/MyDrive/Radiant_Earth_Spot_Crop/src")
else:
  OUTPUT_DIR = "../data"
  sys.path.append("../src/")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [16]:
# import the needed modules
import numpy as np
import pandas as pd

# import the machine learning modules
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier, DMatrix, cv
from sklearn.model_selection import GridSearchCV 
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
import hyperopt.pyll.stochastic
from sklearn.metrics import accuracy_score, f1_score, log_loss

# import plotting modules and style
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline
sns.set_theme(context="notebook", style="darkgrid", palette="crest", font="helvetica")
sns.set(rc={"figure.dpi": 300})
sns.set(rc={"figure.figsize": (6, 3)})

# import own modules from the scr folder
from train_test_function import train_test_split_fields

# set a random seed
RSEED = 42
np.random.seed(RSEED)

In [17]:
# load the base data from the CSV files
df = pd.read_csv(f"{OUTPUT_DIR}/Train.csv")
df

Unnamed: 0,field_id,B02_04,B02_05,B02_06,B02_07,B02_08,B02_09,B02_10,B02_11,B03_04,...,WET_11,PVR_04,PVR_05,PVR_06,PVR_07,PVR_08,PVR_09,PVR_10,PVR_11,label
0,1,16.721329,24.704984,143.294212,120.819938,44.800108,16.204582,84.610932,71.071544,24.630225,...,1.962072,-0.129347,-0.118203,-0.027443,0.040792,0.171888,0.038998,-0.064742,-0.069020,4
1,2,32.724639,55.923914,125.333332,69.246375,40.289855,40.369565,56.710145,121.275364,42.333333,...,7.068886,-0.146733,-0.087567,-0.032327,0.022328,0.115969,-0.060363,-0.128947,-0.093419,7
2,4,32.280353,34.968543,154.134656,98.110374,37.219647,16.107616,37.224062,49.027595,43.079471,...,-27.280756,-0.123696,-0.118763,-0.029688,0.046448,0.043605,0.055476,-0.027601,-0.118679,8
3,6,18.155303,31.757102,145.524621,108.803978,60.709280,33.967330,53.688447,106.650568,28.818182,...,11.352537,-0.118640,-0.073378,-0.001560,0.080290,0.042376,0.018127,0.021250,0.000732,4
4,8,24.826734,79.171811,105.283333,74.694014,68.673711,13.678272,50.742448,111.811466,39.079529,...,5.653202,-0.209057,-0.126597,-0.055015,-0.009886,0.056202,-0.122661,-0.154477,-0.103681,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152374,121040,20.403039,59.414845,80.847889,27.824667,59.708337,81.547986,46.761128,67.944430,33.696896,...,-41.660880,-0.183287,-0.130227,-0.083794,-0.111953,-0.087094,-0.080153,-0.126261,-0.119986,9
152375,81419,43.263652,64.798399,70.318978,31.452927,53.690466,67.588925,76.104462,72.158007,57.171448,...,-43.322944,-0.194625,-0.165170,-0.121343,-0.184359,-0.155505,-0.130677,-0.168950,-0.146476,9
152376,120632,24.294387,80.966079,97.427150,82.883655,68.383974,62.480107,85.343281,75.238125,40.287612,...,-37.588233,-0.209406,-0.139067,-0.101158,-0.086092,-0.092242,-0.099815,-0.094287,-0.123467,9
152377,55097,23.256538,46.628218,94.869888,35.849067,70.241120,41.519023,44.394830,68.003466,39.282254,...,-49.131428,-0.155267,-0.089067,-0.099821,-0.116138,-0.090815,-0.125115,-0.128937,-0.115460,9


In [18]:
df_wo_Unkown = pd.read_csv(f"{OUTPUT_DIR}/Train_wo_Unknown.csv")
df_wo_Unkown

Unnamed: 0,field_id,B02,B03,B04,B08,B11,B12,NDVI,WET,PVR,month,days_from_april,label
0,1,14.844051,23.114147,30.607718,58.736336,73.435690,48.863342,0.314835,-37.725527,-0.139488,4,10,4
1,1,13.385852,21.596462,29.223473,57.065918,73.668810,49.313503,0.322664,-39.639456,-0.150079,4,20,4
2,1,15.408361,22.471062,29.371382,56.434082,71.057880,46.557877,0.315396,-36.211639,-0.133102,5,30,4
3,1,12.768489,20.252410,27.516077,50.303860,73.053055,50.136658,0.292827,-42.799231,-0.152060,5,50,4
4,1,15.813505,23.000000,30.237942,48.397106,79.366560,58.991962,0.230930,-50.093389,-0.135955,5,60,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4385515,27601,11.598385,20.338670,20.335830,66.359340,50.410893,33.210295,0.530801,-16.002849,-0.000296,7,121,9
4385516,80941,26.502663,40.840188,56.615357,92.743572,110.065737,92.243787,0.241507,-58.242955,-0.162045,10,199,9
4385517,37598,10.142290,22.513122,31.430678,75.312455,70.353291,49.911049,0.411627,-30.924741,-0.165983,8,141,9
4385518,50911,22.084471,44.297454,60.332364,108.992686,111.351467,94.316708,0.287394,-53.336437,-0.153179,7,118,9


---
## Train-test split, validation split & resampling
Next we do the train-test split and also a validation split.

In [19]:
# do the train-test-split
df_train, df_test = train_test_split_fields(
    df, train_size=0.7, random_state=RSEED
)

# do the validation split
df_train_val, df_test_val = train_test_split_fields(
    df_train, train_size=0.7, random_state=RSEED
)

We setup the the features (X) and the target (y) for the validation train and test data.

In [20]:
# get X for the train and validation data
X_train = df_train_val.drop(columns=["label", "field_id"])
X_val = df_test_val.drop(columns=["label", "field_id"])

# get y for the train and validation data
y_train = df_train_val["label"]
y_train = y_train.astype(int)
y_val = df_test_val["label"]
y_val = y_val.astype(int)

# set the classes from 0 to 8 
y_train = y_train-1
y_val = y_val-1

---
## Modelling with XGBoost
We will start this process by using the XGBClassifier with the default hyperparameters (only objective, evaluation metric and GPU support are given).


In [21]:
# initialize a default GradientBoostingClassifier
xgb = XGBClassifier(
    objective='multi:softprob', 
    num_class='9',
    eval_metric='mlogloss', 
    disable_default_eval_metric=1, 
    random_state=RSEED,
    n_jobs = -1, 
    tree_method='gpu_hist', 
    gpu_id=0
    )
xgb.fit(X_train, y_train)

XGBClassifier(disable_default_eval_metric=1, eval_metric='mlogloss', gpu_id=0,
              n_jobs=-1, num_class='9', objective='multi:softprob',
              random_state=42, tree_method='gpu_hist')

In [22]:
# predict the absolute classes and probabilities
y_pred_train = xgb.predict(X_train)
y_pred_val = xgb.predict(X_val)

# predict the probabilities for each  class
y_proba_train = xgb.predict_proba(X_train)
y_proba_val = xgb.predict_proba(X_val)

In [23]:
print("---" * 12)
print(f"Accuracy on train data: {round(accuracy_score(y_train, y_pred_train), 3)}")
print(f"Accuracy on test data: {round(accuracy_score(y_val, y_pred_val), 3)}")
print("---" * 12)
print(f'F1-score on train data: {round(f1_score(y_train, y_pred_train, average="macro"), 3)}')
print(f'F1-score on test data: {round(f1_score(y_val, y_pred_val, average="macro"), 3)}')
print("---" * 12)
print(f"Cross-entropy on train data: {round(log_loss(y_train, y_proba_train), 3)}")
print(f"Cross-entropy on test data: {round(log_loss(y_val, y_proba_val), 3)}")
print("---" * 12)

------------------------------------
Accuracy on train data: 0.613
Accuracy on test data: 0.596
------------------------------------
F1-score on train data: 0.603
F1-score on test data: 0.588
------------------------------------
Cross-entropy on train data: 1.121
Cross-entropy on test data: 1.159
------------------------------------


In [24]:
xgb.get_params()

{'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bynode': 1,
 'colsample_bytree': 1,
 'disable_default_eval_metric': 1,
 'eval_metric': 'mlogloss',
 'gamma': 0,
 'gpu_id': 0,
 'learning_rate': 0.1,
 'max_delta_step': 0,
 'max_depth': 3,
 'min_child_weight': 1,
 'missing': None,
 'n_estimators': 100,
 'n_jobs': -1,
 'nthread': None,
 'num_class': '9',
 'objective': 'multi:softprob',
 'random_state': 42,
 'reg_alpha': 0,
 'reg_lambda': 1,
 'scale_pos_weight': 1,
 'seed': None,
 'silent': None,
 'subsample': 1,
 'tree_method': 'gpu_hist',
 'verbosity': 1}

---
## Hyperparameter tuning via Bayesian optimization
The next step is the tuning of the hyperparameters. We will try a new method, the Bayesian optimization via the hyperopt module.

In [29]:
# hp.uniform for float number
# hp.quniform for int that are a multiple from the last number, 
# for example 3, 15, 1 means any whole number between 3 and 15
space={
    'objective': hp.choice('objective', ('multi:softmax', 'multi:softprob')),
    'n_estimators': 150,
    'random_state': RSEED,
    'disable_default_eval_metric': 1,
    'gpu_id':0,
    'tree_method': 'gpu_hist',
    'max_depth': hp.quniform('max_depth', 3, 10, 1),
    'min_child_weight' : hp.quniform('min_child_weight', 0, 3, 1),
    'gamma': hp.uniform ('gamma', 0, 0.3),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 0.9),
    'learning_rate': hp.uniform('learning_rate', 0, 1)
    }
print(hyperopt.pyll.stochastic.sample(space))

{'colsample_bytree': 0.6146523152776479, 'disable_default_eval_metric': 1, 'gamma': 0.20188293465987836, 'gpu_id': 0, 'learning_rate': 0.7510771701553898, 'max_depth': 3.0, 'min_child_weight': 1.0, 'n_estimators': 150, 'objective': 'multi:softprob', 'random_state': 42, 'tree_method': 'gpu_hist'}


In [30]:
def objective(space):
    clf = XGBClassifier(
        objective=space['objective'],
        n_estimators=space['n_estimators'],
        random_state=space['random_state'],
        disable_default_eval_metric=space['disable_default_eval_metric'],
        gpu_id=space['gpu_id'],
        tree_method=space['tree_method'],
        max_depth=int(space['max_depth']),
        min_child_weight=int(space['min_child_weight']),
        gamma=space['gamma'],
        colsample_bytree=int(space['colsample_bytree']),
        learning_rate=space['learning_rate']
    )
    
    evaluation = [
                  ( X_train, y_train), 
                  ( X_val, y_val)
    ]
                
    clf.fit(
        X_train, y_train,
        eval_set=evaluation, 
        eval_metric="mlogloss",
        early_stopping_rounds=10,
        verbose=False
    )
    
    y_pred_val = clf.predict(X_val)
    f1 = f1_score(y_val, y_pred_val, average="macro")
    print ("SCORE:", f1)
    return {'loss': -f1, 'status': STATUS_OK }

In [31]:
trials = Trials()

best_hyperparams = fmin(fn = objective,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 50,
                        trials = trials)

SCORE:
0.49948252071221977
SCORE:
0.48098618429794154
SCORE:
0.5153234664601806
SCORE:
0.5376558118482885
SCORE:
0.5067110518518223
SCORE:
0.505644068538592
SCORE:
0.51526928743917
SCORE:
0.506779991833771
SCORE:
0.5491963557035907
SCORE:
0.5052805546093982
SCORE:
0.49512681675240294
SCORE:
0.4918279815369167
SCORE:
0.5052916385081834
SCORE:
0.5216546015983199
SCORE:
0.49976342062188944
SCORE:
0.510484814895445
SCORE:
0.4995891064055366
SCORE:
0.5111907078523588
SCORE:
0.5145235770934832
SCORE:
0.5184152102551304
SCORE:
0.550914941788334
SCORE:
0.554673740795626
SCORE:
0.4342165221283378
SCORE:
0.561290965864778
SCORE:
0.552573142986163
SCORE:
0.5477066487182183
SCORE:
0.5417318738627347
SCORE:
0.5516708000271806
SCORE:
0.5421389838124979
SCORE:
0.5415125681574847
SCORE:
0.5333200149667426
SCORE:
0.532038466981527
SCORE:
0.35555155895329105
SCORE:
0.5438214516090146
SCORE:
0.5564496663474049
SCORE:
0.5309575767776421
SCORE:
0.5358335243605974
SCORE:
0.4867163940576496
SCORE:
0.48109215

In [None]:
print("The best hyperparameters are : ","\n")
print(best_hyperparams)

The best hyperparameters are :  

{'colsample_bytree': 0.655349597562582, 'gamma': 0.09019240188259454, 'max_depth': 9.0, 'min_child_weight': 2.0, 'objective': 0}


---
## Second round of improvement via Bayesian optimization

In [33]:
# hp.uniform for float number
# hp.quniform for int that are a multiple from the last number, 
# for example 3, 15, 1 means any whole number between 3 and 15
space={
    'objective': 'multi:softmax',
    'n_estimators': hp.quniform('n_estimators', 50, 500, 10),
    'random_state': RSEED,
    'disable_default_eval_metric': 1,
    'gpu_id':0,
    'tree_method': 'gpu_hist',
    'max_depth': hp.quniform('max_depth', 3, 15, 1),
    'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
    'gamma': hp.uniform ('gamma', 0, 5),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 0.9),
    'learning_rate': hp.uniform('learning_rate', 0, 5)
    }
print(hyperopt.pyll.stochastic.sample(space))

{'colsample_bytree': 0.6696240766878037, 'disable_default_eval_metric': 1, 'gamma': 4.683339319269378, 'gpu_id': 0, 'learning_rate': 4.096092220744751, 'max_depth': 3.0, 'min_child_weight': 0.0, 'n_estimators': 420.0, 'objective': 'multi:softmax', 'random_state': 42, 'tree_method': 'gpu_hist'}


In [40]:
def objective(space):
    clf = XGBClassifier(
        objective=space['objective'],
        n_estimators=int(space['n_estimators']),
        random_state=space['random_state'],
        disable_default_eval_metric=space['disable_default_eval_metric'],
        gpu_id=space['gpu_id'],
        tree_method=space['tree_method'],
        max_depth=int(space['max_depth']),
        min_child_weight=int(space['min_child_weight']),
        gamma=space['gamma'],
        colsample_bytree=space['colsample_bytree'],
        learning_rate=space['learning_rate']
    )
    
    evaluation = [( X_train, y_train), ( X_val, y_val)]
    
    clf.fit(
        X_train, y_train,
        eval_set=evaluation, 
        eval_metric="mlogloss",
        early_stopping_rounds=10,
        verbose=False
    )
    
    y_pred_val = clf.predict(X_val)
    f1 = f1_score(y_val, y_pred_val, average="macro")
    print ("SCORE:", f1)
    return {'loss': -f1, 'status': STATUS_OK }

In [41]:
trials = Trials()

best_hyperparams = fmin(fn = objective,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 100,
                        trials = trials)

SCORE:
0.7445535271669521
SCORE:
0.5353837257813621
SCORE:
0.5493731856361427
SCORE:
0.5555502127106511
SCORE:
0.7136435703453707
SCORE:
0.5704124087344719
SCORE:
0.5723199849271998
SCORE:
0.5682165169483482
SCORE:
0.7662627922775729
SCORE:
0.7581258894680842
SCORE:
0.7793541831939065
SCORE:
0.400153297534228
SCORE:
0.5675888419782156
SCORE:
0.7277564171258172
SCORE:
0.8096542447159898
SCORE:
0.5815518341359591
SCORE:
0.7816636071626948
SCORE:
0.7750514368603478
SCORE:
0.5844744694479505
SCORE:
0.47250400054115654
SCORE:
0.8052404105461439
SCORE:
0.7731492372989301
SCORE:
0.7646954847877768
SCORE:
0.7984153562320055
SCORE:
0.7226661548118507
SCORE:
0.5637934749704635
SCORE:
0.7422758261567854
SCORE:
0.796097429533845
SCORE:
0.7288049695299406
SCORE:
0.6712989351252342
SCORE:
0.5267234088909876
SCORE:
0.7896457007831427
SCORE:
0.5777586817300217
SCORE:
0.7637189180081683
SCORE:
0.7967547957083002
SCORE:
0.5714001190992924
SCORE:
0.7842978697171711
SCORE:
0.7574600727294425
SCORE:
0.4845

In [42]:
print("The best hyperparameters are : ","\n")
print(best_hyperparams)

The best hyperparameters are :  

{'colsample_bytree': 0.5777187270022676, 'gamma': 1.377784932836379, 'learning_rate': 0.10161907358428368, 'max_depth': 14.0, 'min_child_weight': 6.0, 'n_estimators': 480.0}


The best hyperparameters are for a F1-score of 0.81 are:  

{'colsample_bytree': 0.5777187270022676, 'gamma': 1.377784932836379, 'learning_rate': 0.10161907358428368, 'max_depth': 14.0, 'min_child_weight': 6.0, 'n_estimators': 480.0}