In [87]:
import pandas as pd
import numpy as np
import time
import sklearn
from sklearn.ensemble import GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, accuracy_score, f1_score

In [68]:
import warnings
warnings.filterwarnings("ignore")

In [97]:
# hyperparameters
# learning_rate = [x/10 for x in range(10, 0, -1)] # [1.0, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1]
learning_rate = [0.4, 0.2, 0.1, 0.08, 0.06, 0.04, 0.02]
max_depth = [1, 3, 5, 7]

In [70]:
train_df = pd.read_csv("../datasets/merged/training_dataset.csv")
train_df.drop('Unnamed: 0',axis=1,inplace=True)
train_df

Unnamed: 0,Date_ID,Month,NumberOfDaysInMonth,Quarter,Year,LeapYear,Minimum Temperature,Maximum Temperature,Barley,Canola,...,Quebec,Saskatchewan,Territories,Yukon,Atlantic_Region,British Columbia_Region,Canada_Region,Prairies_Region,Territories_Region,Total_Value
0,88,5,31,2,2008,1,3,29,False,True,...,True,False,False,False,False,False,True,False,False,1
1,32,9,30,3,2003,0,5,30,False,False,...,False,False,False,False,False,False,False,True,False,1
2,35,12,31,4,2003,0,5,34,False,False,...,False,False,False,False,False,True,False,False,False,1
3,94,11,30,4,2008,1,3,29,False,True,...,False,True,False,False,False,False,False,True,False,2
4,62,3,31,1,2006,0,5,34,False,False,...,False,True,False,False,False,False,False,True,False,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5573,99,4,30,2,2009,0,3,29,False,True,...,True,False,False,False,False,False,True,False,False,1
5574,209,6,30,2,2018,0,5,28,False,False,...,False,True,False,False,False,False,False,True,False,1
5575,22,11,30,4,2002,0,5,30,False,False,...,False,False,False,False,False,False,False,True,False,1
5576,42,7,31,3,2004,1,5,28,True,False,...,False,False,False,False,False,False,False,True,False,1


In [71]:
train_df_X = train_df.drop(columns=["Total_Value"])
train_df_Y = train_df["Total_Value"]

In [72]:
eval_df = pd.read_csv("../datasets/merged/validate_dataset.csv")
eval_df.drop('Unnamed: 0',axis=1,inplace=True)
eval_df

Unnamed: 0,Date_ID,Month,NumberOfDaysInMonth,Quarter,Year,LeapYear,Minimum Temperature,Maximum Temperature,Barley,Canola,...,Quebec,Saskatchewan,Territories,Yukon,Atlantic_Region,British Columbia_Region,Canada_Region,Prairies_Region,Territories_Region,Total_Value
0,105,10,31,4,2009,0,5,28,True,False,...,False,False,False,False,False,False,True,False,False,1
1,85,2,29,1,2008,1,5,28,True,False,...,False,False,False,False,False,False,False,True,False,1
2,241,2,28,1,2021,0,5,28,False,False,...,False,False,False,False,False,False,True,False,False,1
3,186,7,31,3,2016,1,5,30,False,False,...,False,False,False,False,False,False,False,True,False,1
4,215,12,31,4,2018,0,5,34,False,False,...,False,False,False,False,False,False,False,True,False,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1190,145,2,28,1,2013,0,5,29,False,False,...,False,False,False,False,False,False,False,True,False,1
1191,17,6,30,2,2002,0,5,28,True,False,...,False,False,False,False,False,False,True,False,False,1
1192,43,8,31,3,2004,1,5,28,False,False,...,True,False,False,False,False,False,True,False,False,1
1193,217,2,28,1,2019,0,5,30,False,False,...,False,False,False,False,False,False,False,True,False,1


In [73]:
eval_df_X = eval_df.drop(columns=["Total_Value"])
eval_df_Y = eval_df["Total_Value"]

In [88]:
def compute_metrics(labels,predictions):

  accuracy = accuracy_score(labels, predictions)
  precision = precision_score(labels, predictions, average='weighted')
  recall = recall_score(labels, predictions, average='weighted')
  f1 = f1_score(labels, predictions, average='weighted')

  # print("...ACCURACY: " + str(accuracy))
  # print("...PRECISION: " + str(precision))
  # print("...RECALL: " + str(recall))

  return accuracy, precision, recall, f1

In [89]:
def train_model(learning_rate, max_depth):

    train_time = [0,0]
    eval_time = [0,0]

    #constructing model
    classifier = HistGradientBoostingClassifier(learning_rate=learning_rate, max_depth=max_depth, random_state=42, validation_fraction = None)
    # classifier = GradientBoostingClassifier(n_estimators=n_estimators[0], learning_rate=learning_rate[0], max_depth=max_depth[0], random_state=42)
    # ValueError: Input X contains NaN.
    # GradientBoostingClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and ...

    #training model
    train_time[0] = time.time()
    classifier = classifier.fit(train_df_X, train_df_Y)
    train_time[1] = time.time()

    #scoring model
    eval_time[0] = time.time()
    # eval_loss = classifier.score(eval_df_X, eval_df_Y)
    predicted = classifier.predict(eval_df_X)
    accuracy, precision, recall, f1 = compute_metrics(eval_df_Y, predicted)
    eval_time[1] = time.time()

    # print("Total training time:", train_time[1] - train_time[0])
    # print("Total eval time:", eval_time[1] - eval_time[0])
    # print("-> Accuracy:", accuracy, "Precision:", precision, 'Recall', recall)
    
    return [learning_rate, max_depth, train_time[1] - train_time[0], eval_time[1] - eval_time[0], accuracy, precision, recall, f1]

In [98]:
results = pd.DataFrame(columns=['Trial', 'Learning Rate', 'Max Depth', 'Training time', 'Validation time', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
trial_id = 0

for lr in learning_rate:
    for md in max_depth:
        results.loc[trial_id] = [trial_id]+train_model(lr, md)
        trial_id = trial_id + 1

results

Unnamed: 0,Trial,Learning Rate,Max Depth,Training time,Validation time,Accuracy,Precision,Recall,F1 Score
0,0.0,0.4,1.0,0.283617,0.014999,0.725523,0.805167,0.725523,0.76256
1,1.0,0.4,3.0,0.383646,0.018999,0.89205,0.797167,0.89205,0.841944
2,2.0,0.4,5.0,0.888244,0.017001,0.813389,0.882597,0.813389,0.842237
3,3.0,0.4,7.0,0.891624,0.015,0.877824,0.810658,0.877824,0.839087
4,4.0,0.2,1.0,0.705594,0.015,0.933891,0.92518,0.933891,0.92848
5,5.0,0.2,3.0,0.974614,0.015999,0.820084,0.835643,0.820084,0.827649
6,6.0,0.2,5.0,1.231217,0.016,0.876987,0.85611,0.876987,0.865775
7,7.0,0.2,7.0,1.465001,0.017999,0.863598,0.842333,0.863598,0.852711
8,8.0,0.1,1.0,0.705228,0.016,0.931381,0.921939,0.931381,0.925515
9,9.0,0.1,3.0,1.800588,0.016998,0.894561,0.903453,0.894561,0.898888


In [107]:
results["Max Depth"] = results["Max Depth"].astype(int)
results.dtypes

Trial              float64
Learning Rate      float64
Max Depth            int32
Training time      float64
Validation time    float64
Accuracy           float64
Precision          float64
Recall             float64
F1 Score           float64
dtype: object

In [108]:
results.to_csv("temp_results/gradient_boosting.csv", index=None)

In [109]:
best = results[results['F1 Score'] == max([i for i in results["F1 Score"].values])]
best

Unnamed: 0,Trial,Learning Rate,Max Depth,Training time,Validation time,Accuracy,Precision,Recall,F1 Score
21,21.0,0.04,3,2.828823,0.057999,0.939749,0.931199,0.939749,0.934771


In [110]:
test_df = pd.read_csv("../datasets/merged/test_dataset.csv")
test_df.drop('Unnamed: 0',axis=1,inplace=True)
test_df

Unnamed: 0,Date_ID,Month,NumberOfDaysInMonth,Quarter,Year,LeapYear,Minimum Temperature,Maximum Temperature,Barley,Canola,...,Quebec,Saskatchewan,Territories,Yukon,Atlantic_Region,British Columbia_Region,Canada_Region,Prairies_Region,Territories_Region,Total_Value
0,209,6,30,2,2018,0,5,28,True,False,...,False,False,False,False,False,False,True,False,False,1
1,17,6,30,2,2002,0,3,29,False,True,...,False,True,False,False,False,False,False,True,False,1
2,246,7,31,3,2021,0,5,29,False,False,...,False,False,False,False,False,False,False,True,False,2
3,107,12,31,4,2009,0,3,29,False,True,...,False,False,False,False,False,False,False,True,False,2
4,19,8,31,3,2002,0,5,34,False,False,...,False,False,False,False,False,False,False,True,False,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1191,218,3,31,1,2019,0,5,29,False,False,...,False,False,False,False,False,False,False,True,False,1
1192,37,2,29,1,2004,1,5,28,False,False,...,False,True,False,False,False,False,False,True,False,1
1193,141,10,31,4,2012,1,5,30,False,False,...,False,False,False,False,False,False,False,True,False,1
1194,242,3,31,1,2021,0,3,29,False,True,...,False,True,False,False,False,False,False,True,False,7


In [111]:
test_df_X = eval_df.drop(columns=["Total_Value"])
test_df_Y = eval_df["Total_Value"]

In [113]:
# using best fine-tuned classifier
model = HistGradientBoostingClassifier(learning_rate=best['Learning Rate'].iloc[0], max_depth=best['Max Depth'].iloc[0], random_state=42, validation_fraction = None)
train_time_0 = time.time()
model = model.fit(train_df_X, train_df_Y)
train_time_1 = time.time()

start_time = time.time()
predicted = model.predict(test_df_X)
end_time = time.time()

precision, accuracy, recall, f1 = compute_metrics(test_df_Y, predicted)

print('Training time:', end_time - start_time)
print('Inference time:', end_time - start_time)
print("...ACCURACY: " + str(accuracy))
print("...PRECISION: " + str(precision))
print("...RECALL: " + str(recall))


Training time: 0.018000125885009766
Inference time: 0.018000125885009766
...ACCURACY: 0.9311988408608521
...PRECISION: 0.9397489539748954
...RECALL: 0.9397489539748954
