## Gradient Boosting Model on Weather Station Data + Previous-Year Ergot Engineered Features

- Gradient boosting builds an additive model in a forward stage-wise fashion, which allows for the optimization of arbitrary differentiable loss functions.

In [2]:
# import dependencies
import pandas as pd
import sqlalchemy as sq
import sys, os
import pickle
from imblearn.combine import SMOTEENN
from xgboost import XGBClassifier
from sklearn.ensemble import (  # type: ignore
    GradientBoostingClassifier,
)
from imblearn.ensemble import (  # type: ignore
    RUSBoostClassifier,
)

from sklearn.metrics import (  # type: ignore
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    classification_report,
)

# Make the directory two levels up (presumably the project root) importable and
# switch the working directory to it -- exactly once per kernel.
# A bare os.chdir("../../") climbs two more levels every time the cell is
# re-run, and a relative sys.path entry changes meaning after the chdir, so the
# resolved absolute path is remembered in a guard variable instead.
if "_PROJECT_ROOT" not in globals():
    _PROJECT_ROOT = os.path.abspath("../../")
    if _PROJECT_ROOT not in sys.path:
        sys.path.append(_PROJECT_ROOT)
    os.chdir(_PROJECT_ROOT)
from ModelBuilderMethods import getConn, extractYears

In [3]:
# Display settings: never truncate cell contents, and show up to 500 rows.
for optionName, optionValue in (
    ("display.max_colwidth", None),
    ("display.max_rows", 500),
):
    pd.set_option(optionName, optionValue)

### <u>**Step 1**</u>: Data Selection

In this step, we choose the particular data/tables to use and pick attributes from the existing tables. Further aggregation/feature engineering can also be done here to support the point of the research.

In particular, for this notebook, we grab the following data and merge them (on year, district) into a single table:
- Weather station data
- ergot data (downgrade)
- engineered features from previous years' ergot data

In [4]:
# Every column of the dataset_cross_monthly_station table.
weatherStationQuery = sq.text(
    """
    SELECT * from dataset_cross_monthly_station
"""
)

# Per (year, district) ergot aggregates for the three previous years:
# the present_prev1..3 flags and the percnt_true_prev1..3 values.
ergotPrevYearsAggQuery = sq.text(
    """
    SELECT year, district, 
    present_prev1, present_prev2, present_prev3,
    percnt_true_prev1, percnt_true_prev2, percnt_true_prev3 
    from agg_ergot_sample_v2
"""
)

# The prediction target (downgrade) together with its year/district join keys.
ergotTargetQuery = sq.text(
    """
    SELECT year, district, downgrade from ergot_sample_feat_eng
"""
)

In [5]:
# Open a database connection configured by ./.env and load the three query
# results into DataFrames. The try/finally guarantees the connection is closed
# even when one of the reads raises, so a failed query does not leak it.
conn = getConn("./.env")
try:
    stationDf = pd.read_sql(weatherStationQuery, conn)
    ergotPrevDf = pd.read_sql(ergotPrevYearsAggQuery, conn)
    ergotTargetDf = pd.read_sql(ergotTargetQuery, conn)
finally:
    conn.close()
    del conn

In [6]:
# Both joins use the same (year, district) key pair.
mergeKeys = ["year", "district"]

# Left-join the previous-years ergot aggregates onto the weather-station rows.
stationWithPrevDf = stationDf.merge(ergotPrevDf, on=mergeKeys, how="left")
del stationDf, ergotPrevDf

# Left-join those combined features onto the target rows, so every target row
# is kept whether or not it found a match.
datasetDf = ergotTargetDf.merge(stationWithPrevDf, on=mergeKeys, how="left")
del ergotTargetDf, stationWithPrevDf

In [7]:
# One-hot encode district (first level dropped) and swap the original column
# for the resulting dummy columns.
datasetDf["district"] = datasetDf["district"].astype("category")
districtDummies = pd.get_dummies(
    datasetDf["district"], prefix="district", drop_first=True
)
datasetDf = pd.concat([datasetDf, districtDummies], axis=1).drop(columns=["district"])
del districtDummies

# Cast the previous-year presence flags to plain booleans.
# NOTE(review): astype("bool") converts NaN to True, so rows left unmatched by
# the earlier left joins would become "present" here -- confirm that the later
# dropna() on the remaining columns removes every such row.
for flagColumn in ("present_prev1", "present_prev2", "present_prev3"):
    datasetDf[flagColumn] = datasetDf[flagColumn].astype("bool")

In [8]:
# Temporal split: the earlier year range trains the models, the later one tests them.
trainYears = (1995, 2015)
testYears = (2016, 2020)

trainDf = extractYears(datasetDf, *trainYears)
testDf = extractYears(datasetDf, *testYears)
del datasetDf

In [9]:
# The year column was only needed for the split; remove it from both sets.
trainDf, testDf = (frame.drop(columns=["year"]) for frame in (trainDf, testDf))

In [10]:
# Class distribution of the target before any resampling: train first, then test.
for splitDf in (trainDf, testDf):
    print(splitDf["downgrade"].value_counts())

downgrade
False    122202
True       2082
Name: count, dtype: int64
downgrade
False    26307
True      1016
Name: count, dtype: int64


In [11]:
# Report how many values are missing in each training column.
missingPerColumn = trainDf.isna().sum()
print(missingPerColumn)

# Discard every training row that has at least one missing value
# (zero-filling was the alternative considered and not used).
trainDf = trainDf.dropna(axis=0, how="any")

downgrade                     0
1:min_temp_x               1246
1:max_temp_x               1246
1:mean_temp_x              1246
1:min_dew_point_temp       1246
1:max_dew_point_temp       1246
1:mean_dew_point_temp      1246
1:min_humidex              1246
1:max_humidex              1246
1:mean_humidex             1246
1:min_precip               1246
1:max_precip               1246
1:mean_precip              1246
1:min_rel_humid            1246
1:max_rel_humid            1246
1:mean_rel_humid           1246
1:min_stn_press            1246
1:max_stn_press            1246
1:mean_stn_press           1246
1:min_visibility           1246
1:max_visibility           1246
1:mean_visibility          1246
1:max_temp_y               1246
1:min_temp_y               1246
1:mean_temp_y              1246
1:min_total_rain           1246
1:max_total_rain           1246
1:mean_total_rain          1246
1:min_total_snow           1246
1:max_total_snow           1246
1:mean_total_snow          1246
1:min_to

In [12]:
# Resample the training data with SMOTEENN (seeded for reproducibility),
# separating the feature columns from the downgrade target first.
trainFeatures = trainDf.drop(columns="downgrade")
trainTarget = trainDf["downgrade"]

balancer = SMOTEENN(sampling_strategy=1, random_state=42)
balancedTrainDfX, balancedTrainDfY = balancer.fit_resample(trainFeatures, trainTarget)

In [13]:
# Class distribution of the target after SMOTEENN resampling.
balancedClassCounts = balancedTrainDfY.value_counts()
print(balancedClassCounts)

downgrade
False    76741
True     15427
Name: count, dtype: int64


In [14]:
def printMetrics(model_name, y_true, y_pred):
    """Print standard binary-classification metrics for one model.

    Args:
        model_name: Label printed as the heading of the metrics block.
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels (hard predictions, not probabilities).

    Returns:
        None. All results are written to stdout.
    """
    # zero_division=0 reports the same 0.0 scikit-learn already falls back to
    # when a model never predicts the positive class, but without emitting an
    # UndefinedMetricWarning on every metric call.
    print(model_name)
    print("Accuracy: ", accuracy_score(y_true, y_pred))
    print("Precision: ", precision_score(y_true, y_pred, zero_division=0))
    print("Recall: ", recall_score(y_true, y_pred, zero_division=0))
    print("F1: ", f1_score(y_true, y_pred, zero_division=0))
    # ROC AUC is computed from the hard labels in y_pred, not from scores.
    print("ROC AUC: ", roc_auc_score(y_true, y_pred))
    print(
        "Classification Report: \n",
        classification_report(y_true, y_pred, zero_division=0),
    )
    print()

### <u>**Step 5**</u>: Gradient Boosting Classifier Model

##### <u>**Step 5.1**</u>: Initialize the model

In [15]:
# Hyper-parameters shared by the models below.
ESTIMATORS = 400
DEPTH = 40
CORES = -1
# NOTE(review): the next two constants are defined but not passed to any model
# in this cell.
MINSPLSPLIT = 8
MINSAMPLELEAF = 4

# Each model family is built twice with identical settings: one instance for
# the original training set and one for the resampled ("balanced") set.
gradientBoostingParams = dict(
    n_estimators=ESTIMATORS,
    random_state=42,
    max_depth=DEPTH,
    verbose=1,
    n_iter_no_change=200,
)
rusboostParams = dict(n_estimators=ESTIMATORS, random_state=42, sampling_strategy=0.5)
xgboostParams = dict(
    n_estimators=ESTIMATORS,
    random_state=42,
    max_depth=DEPTH,
    verbosity=1,
    n_jobs=CORES,
)

gradient_boosting_model = GradientBoostingClassifier(**gradientBoostingParams)
balanced_gradient_boosting_model = GradientBoostingClassifier(**gradientBoostingParams)

rusboost_model = RUSBoostClassifier(**rusboostParams)
balanced_rusboost_model = RUSBoostClassifier(**rusboostParams)

xgboost_model = XGBClassifier(**xgboostParams)
balanced_xgboost_model = XGBClassifier(**xgboostParams)

##### <u>**Step 5.2**</u>: Fit the training data to the model

In [16]:
# Fit one gradient-boosting model on the original training rows and its twin
# on the resampled rows.
gbTrainX, gbTrainY = trainDf.drop(columns="downgrade"), trainDf["downgrade"]
gradient_boosting_model.fit(gbTrainX, gbTrainY)
balanced_gradient_boosting_model.fit(balancedTrainDfX, balancedTrainDfY)

      Iter       Train Loss   Remaining Time 
         1           0.1933          282.51m
         2           0.1874          280.11m
         3           0.1832          245.38m
         4           0.1800          220.05m
         5           0.1774          202.94m
         6           0.1754          192.20m
         7           0.1736          185.27m
         8           0.1722          180.29m
         9           0.1709          175.43m
        10           0.1699          171.06m
        20           0.1646          117.71m
        30           0.1630           90.22m
        40           0.1625           75.66m
        50           0.1623           66.15m
        60           0.1622           59.22m
        70           0.1622           53.87m
        80           0.1622           49.53m
        90           0.1622           45.87m
       100           0.1622           42.68m
       200           0.1622           21.66m
      Iter       Train Loss   Remaining Time 
        

In [17]:
# Fit one RUSBoost model on the original training rows and its twin on the
# resampled rows.
rusTrainX, rusTrainY = trainDf.drop(columns="downgrade"), trainDf["downgrade"]
rusboost_model.fit(rusTrainX, rusTrainY)
balanced_rusboost_model.fit(balancedTrainDfX, balancedTrainDfY)

In [18]:
# Fit one XGBoost model on the original training rows and its twin on the
# resampled rows.
xgbTrainX, xgbTrainY = trainDf.drop(columns="downgrade"), trainDf["downgrade"]
xgboost_model.fit(xgbTrainX, xgbTrainY)
balanced_xgboost_model.fit(balancedTrainDfX, balancedTrainDfY)

##### <u>**Step 5.3**</u>: Test the model on the testing dataset

In [19]:
# Discard every test row that has at least one missing value, mirroring the
# cleaning applied to the training set.
testDf = testDf.dropna(axis=0, how="any")

In [20]:
# Build the test feature matrix once (target column removed) and score it with
# every fitted model.
testFeatures = testDf.drop(columns="downgrade")

predictions_gradient_boosting = gradient_boosting_model.predict(testFeatures)
predictions_balanced_gradient_boosting = balanced_gradient_boosting_model.predict(
    testFeatures
)
predictions_rusboost = rusboost_model.predict(testFeatures)
predictions_balanced_rusboost = balanced_rusboost_model.predict(testFeatures)
predictions_xgboost = xgboost_model.predict(testFeatures)
predictions_balanced_xgboost = balanced_xgboost_model.predict(testFeatures)

In [21]:
# Show how many samples each model assigned to each class, in the same order
# the models were trained.
for predictedLabels in (
    predictions_gradient_boosting,
    predictions_balanced_gradient_boosting,
    predictions_rusboost,
    predictions_balanced_rusboost,
    predictions_xgboost,
    predictions_balanced_xgboost,
):
    print(pd.DataFrame(predictedLabels).value_counts())

False    27295
Name: count, dtype: int64
True     15808
False    11487
Name: count, dtype: int64
False    22617
True      4678
Name: count, dtype: int64
False    21000
True      6295
Name: count, dtype: int64
0    27295
Name: count, dtype: int64
0    22850
1     4445
Name: count, dtype: int64


##### <u>**Step 5.4**</u>: Evaluate models based on different metrics:
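- ACCURACY: fraction of all test samples that were classified correctly.
- PRECISION: fraction of samples predicted as downgrade that really are downgrades.
- RECALL: fraction of the real downgrades that the model found.
- F1: harmonic mean of precision and recall.
- ROC AUC: area under the ROC curve (computed here from the predicted class labels).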

In [22]:
# Report accuracy, precision, recall, F1 and ROC AUC for every model against
# the same test labels.
testLabels = testDf["downgrade"]
labelledPredictions = (
    ("sk GB imbalanced train set", predictions_gradient_boosting),
    ("imb GB balanced train set", predictions_balanced_gradient_boosting),
    ("sk RUS imbalanced train set", predictions_rusboost),
    ("imb RUS balanced train set", predictions_balanced_rusboost),
    ("sk XGB imbalanced train set", predictions_xgboost),
    ("imb XGB balanced train set", predictions_balanced_xgboost),
)
for runLabel, predictedLabels in labelledPredictions:
    printMetrics(runLabel, testLabels, predictedLabels)

sk GB imbalanced train set
Accuracy:  0.9627770653965928
Precision:  0.0
Recall:  0.0
F1:  0.0
ROC AUC:  0.5
Classification Report: 
               precision    recall  f1-score   support

       False       0.96      1.00      0.98     26279
        True       0.00      0.00      0.00      1016

    accuracy                           0.96     27295
   macro avg       0.48      0.50      0.49     27295
weighted avg       0.93      0.96      0.94     27295


imb GB balanced train set
Accuracy:  0.43791903278988825
Precision:  0.046875
Recall:  0.7293307086614174
F1:  0.08808844507845934
ROC AUC:  0.577991584400346
Classification Report: 
               precision    recall  f1-score   support

       False       0.98      0.43      0.59     26279
        True       0.05      0.73      0.09      1016

    accuracy                           0.44     27295
   macro avg       0.51      0.58      0.34     27295
weighted avg       0.94      0.44      0.57     27295


sk RUS imbalanced train se

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classification Report: 
               precision    recall  f1-score   support

       False       0.96      1.00      0.98     26279
        True       0.00      0.00      0.00      1016

    accuracy                           0.96     27295
   macro avg       0.48      0.50      0.49     27295
weighted avg       0.93      0.96      0.94     27295


imb XGB balanced train set
Accuracy:  0.8080600842645174
Precision:  0.024971878515185602
Recall:  0.10925196850393701
F1:  0.04065189525727889
ROC AUC:  0.47216470338131133
Classification Report: 
               precision    recall  f1-score   support

       False       0.96      0.84      0.89     26279
        True       0.02      0.11      0.04      1016

    accuracy                           0.81     27295
   macro avg       0.49      0.47      0.47     27295
weighted avg       0.93      0.81      0.86     27295


