In [160]:
import pandas as pd
import numpy as np
import time
import sklearn
from sklearn.ensemble import GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, accuracy_score, f1_score

In [161]:
import warnings
warnings.filterwarnings("ignore")

In [162]:
# hyperparameters
# learning_rate = [x/10 for x in range(10, 0, -1)] # [1.0, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1]
learning_rate = [0.4, 0.2, 0.1, 0.08, 0.06, 0.04, 0.02]
max_depth = [1, 3, 5, 7]

In [163]:
train_df = pd.read_csv("../datasets/merged/training_dataset.csv")
train_df.drop('Unnamed: 0',axis=1,inplace=True)
train_df

Unnamed: 0,Date_ID,Month,NumberOfDaysInMonth,Quarter,Year,LeapYear,Minimum Temperature,Maximum Temperature,Barley,Canola,...,Quebec,Saskatchewan,Territories,Yukon,Atlantic_Region,British Columbia_Region,Canada_Region,Prairies_Region,Territories_Region,Total_Value
0,88,5,31,2,2008,1,3,29,False,True,...,True,False,False,False,False,False,True,False,False,1
1,32,9,30,3,2003,0,5,30,False,False,...,False,False,False,False,False,False,False,True,False,1
2,35,12,31,4,2003,0,5,34,False,False,...,False,False,False,False,False,True,False,False,False,1
3,94,11,30,4,2008,1,3,29,False,True,...,False,True,False,False,False,False,False,True,False,2
4,62,3,31,1,2006,0,5,34,False,False,...,False,True,False,False,False,False,False,True,False,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5573,99,4,30,2,2009,0,3,29,False,True,...,True,False,False,False,False,False,True,False,False,1
5574,209,6,30,2,2018,0,5,28,False,False,...,False,True,False,False,False,False,False,True,False,1
5575,22,11,30,4,2002,0,5,30,False,False,...,False,False,False,False,False,False,False,True,False,1
5576,42,7,31,3,2004,1,5,28,True,False,...,False,False,False,False,False,False,False,True,False,1


In [165]:
train_df_X = train_df.drop(columns=["Total_Value"])
train_df_Y = train_df["Total_Value"]

In [166]:
eval_df = pd.read_csv("../datasets/merged/validate_dataset.csv")
eval_df.drop('Unnamed: 0',axis=1,inplace=True)
eval_df

Unnamed: 0,Date_ID,Month,NumberOfDaysInMonth,Quarter,Year,LeapYear,Minimum Temperature,Maximum Temperature,Barley,Canola,...,Quebec,Saskatchewan,Territories,Yukon,Atlantic_Region,British Columbia_Region,Canada_Region,Prairies_Region,Territories_Region,Total_Value
0,105,10,31,4,2009,0,5,28,True,False,...,False,False,False,False,False,False,True,False,False,1
1,85,2,29,1,2008,1,5,28,True,False,...,False,False,False,False,False,False,False,True,False,1
2,241,2,28,1,2021,0,5,28,False,False,...,False,False,False,False,False,False,True,False,False,1
3,186,7,31,3,2016,1,5,30,False,False,...,False,False,False,False,False,False,False,True,False,1
4,215,12,31,4,2018,0,5,34,False,False,...,False,False,False,False,False,False,False,True,False,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1190,145,2,28,1,2013,0,5,29,False,False,...,False,False,False,False,False,False,False,True,False,1
1191,17,6,30,2,2002,0,5,28,True,False,...,False,False,False,False,False,False,True,False,False,1
1192,43,8,31,3,2004,1,5,28,False,False,...,True,False,False,False,False,False,True,False,False,1
1193,217,2,28,1,2019,0,5,30,False,False,...,False,False,False,False,False,False,False,True,False,1


In [167]:
eval_df_X = eval_df.drop(columns=["Total_Value"])
eval_df_Y = eval_df["Total_Value"]

In [168]:
def compute_metrics(labels,predictions):

  accuracy = accuracy_score(labels, predictions)
  precision = precision_score(labels, predictions, average='weighted')
  recall = recall_score(labels, predictions, average='weighted')
  f1 = f1_score(labels, predictions, average='weighted')

  # print("...ACCURACY: " + str(accuracy))
  # print("...PRECISION: " + str(precision))
  # print("...RECALL: " + str(recall))

  return accuracy, precision, recall, f1

In [171]:
def train_model(learning_rate, max_depth):

    train_time = [0,0]
    eval_time = [0,0]

    #constructing model
    classifier = GradientBoostingClassifier(n_estimators=100, learning_rate=learning_rate, max_depth=max_depth, random_state=42)

    #training model
    train_time[0] = time.time()
    classifier = classifier.fit(train_df_X, train_df_Y)
    train_time[1] = time.time()

    #scoring model
    eval_time[0] = time.time()
    predicted = classifier.predict(eval_df_X)
    accuracy, precision, recall, f1 = compute_metrics(eval_df_Y, predicted)
    eval_time[1] = time.time()
    
    return [learning_rate, max_depth, train_time[1] - train_time[0], eval_time[1] - eval_time[0], accuracy, precision, recall, f1]

In [173]:
results = pd.DataFrame(columns=['Trial', 'Learning Rate', 'Max Depth', 'Training time', 'Validation time', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
trial_id = 0

for lr in learning_rate:
    for md in max_depth:
        print("running...", trial_id)
        results.loc[trial_id] = [trial_id]+train_model(lr, md)
        trial_id = trial_id + 1

display(results)

running... 0
running... 1
running... 2
running... 3
running... 4
running... 5
running... 6
running... 7
running... 8
running... 9
running... 10
running... 11
running... 12
running... 13
running... 14
running... 15
running... 16
running... 17
running... 18
running... 19
running... 20
running... 21
running... 22
running... 23
running... 24
running... 25
running... 26
running... 27


Unnamed: 0,Trial,Learning Rate,Max Depth,Training time,Validation time,Accuracy,Precision,Recall,F1 Score
0,0.0,0.4,1.0,5.469659,0.005999,0.912134,0.872907,0.912134,0.89081
1,1.0,0.4,3.0,14.510558,0.014,0.093724,0.822063,0.093724,0.155456
2,2.0,0.4,5.0,19.372949,0.016002,0.066109,0.777105,0.066109,0.111321
3,3.0,0.4,7.0,14.867104,0.017999,0.92636,0.928908,0.92636,0.927445
4,4.0,0.2,1.0,5.421843,0.006,0.928033,0.917797,0.928033,0.92086
5,5.0,0.2,3.0,12.760542,0.010998,0.906276,0.923423,0.906276,0.912785
6,6.0,0.2,5.0,17.689109,0.019998,0.933891,0.931935,0.933891,0.932754
7,7.0,0.2,7.0,19.932674,0.025,0.929707,0.929689,0.929707,0.929579
8,8.0,0.1,1.0,5.38803,0.007001,0.914644,0.901343,0.914644,0.89461
9,9.0,0.1,3.0,13.966454,0.014002,0.923013,0.919098,0.923013,0.920324


In [174]:
results["Max Depth"] = results["Max Depth"].astype(int)
results.dtypes

Trial              float64
Learning Rate      float64
Max Depth            int32
Training time      float64
Validation time    float64
Accuracy           float64
Precision          float64
Recall             float64
F1 Score           float64
dtype: object

In [175]:
results.to_csv("temp_results/gradient_boosting.csv", index=None)

In [176]:
best = results[results['F1 Score'] == max([i for i in results["F1 Score"].values])]
best

Unnamed: 0,Trial,Learning Rate,Max Depth,Training time,Validation time,Accuracy,Precision,Recall,F1 Score
15,15.0,0.08,7,30.682939,0.037,0.937238,0.936553,0.937238,0.936601


In [177]:
test_df = pd.read_csv("../datasets/merged/test_dataset.csv")
test_df.drop('Unnamed: 0',axis=1,inplace=True)
test_df

Unnamed: 0,Date_ID,Month,NumberOfDaysInMonth,Quarter,Year,LeapYear,Minimum Temperature,Maximum Temperature,Barley,Canola,...,Quebec,Saskatchewan,Territories,Yukon,Atlantic_Region,British Columbia_Region,Canada_Region,Prairies_Region,Territories_Region,Total_Value
0,209,6,30,2,2018,0,5,28,True,False,...,False,False,False,False,False,False,True,False,False,1
1,17,6,30,2,2002,0,3,29,False,True,...,False,True,False,False,False,False,False,True,False,1
2,246,7,31,3,2021,0,5,29,False,False,...,False,False,False,False,False,False,False,True,False,2
3,107,12,31,4,2009,0,3,29,False,True,...,False,False,False,False,False,False,False,True,False,2
4,19,8,31,3,2002,0,5,34,False,False,...,False,False,False,False,False,False,False,True,False,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1191,218,3,31,1,2019,0,5,29,False,False,...,False,False,False,False,False,False,False,True,False,1
1192,37,2,29,1,2004,1,5,28,False,False,...,False,True,False,False,False,False,False,True,False,1
1193,141,10,31,4,2012,1,5,30,False,False,...,False,False,False,False,False,False,False,True,False,1
1194,242,3,31,1,2021,0,3,29,False,True,...,False,True,False,False,False,False,False,True,False,7


In [178]:
test_df_X = eval_df.drop(columns=["Total_Value"])
test_df_Y = eval_df["Total_Value"]

In [179]:
# using best fine-tuned classifier
model = GradientBoostingClassifier(n_estimators=100, learning_rate=best['Learning Rate'].iloc[0], max_depth=best['Max Depth'].iloc[0], random_state=42)
train_time_0 = time.time()
model = model.fit(train_df_X, train_df_Y)
train_time_1 = time.time()

start_time = time.time()
predicted = model.predict(test_df_X)
end_time = time.time()

precision, accuracy, recall, f1 = compute_metrics(test_df_Y, predicted)

print('Training time:', end_time - start_time)
print('Inference time:', end_time - start_time)
print("...ACCURACY: " + str(accuracy))
print("...PRECISION: " + str(precision))
print("...RECALL: " + str(recall))


Training time: 0.03500032424926758
Inference time: 0.03500032424926758
...ACCURACY: 0.9365531523964247
...PRECISION: 0.9372384937238494
...RECALL: 0.9372384937238494


In [183]:
imp_features = model.feature_importances_

df_imp_features = pd.DataFrame({"features":train_df_X.columns}).join(pd.DataFrame({"weights":imp_features}))
weigthed = df_imp_features.sort_values(by=['weights'], ascending=False)
display(weigthed)

Unnamed: 0,features,weights
0,Date_ID,0.165754
73,Prairies_Region,0.157358
20,Triticum,0.082404
49,Land_Binned_Encoded,0.049204
14,Wheat,0.044460
...,...,...
60,Northwest Territories,0.000000
59,Newfoundland and Labrador,0.000000
58,New Brunswick,0.000000
56,Canada,0.000000
