how this notebook works

In [1]:
# dependencies

import pandas as pd
import sqlalchemy as sq
import sys, os
import pickle
from imblearn.combine import SMOTEENN, SMOTETomek
from xgboost import XGBRFClassifier
from sklearn.ensemble import (  # type: ignore
    RandomForestClassifier,
)
from imblearn.ensemble import (  # type: ignore
    BalancedRandomForestClassifier,
)

from sklearn.metrics import (  # type: ignore
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    classification_report,
)

# Make the directory two levels above the notebook's starting directory
# (presumably the project root -- confirm) importable and the working directory.
# The path is resolved to an absolute path exactly once: the original relative
# "../../" entry was re-interpreted against the *new* working directory after
# os.chdir, and re-running this cell moved the working directory up two more
# levels each time, breaking later relative paths such as "./.env".
if "_PROJECT_ROOT" not in globals():
    _PROJECT_ROOT = os.path.abspath("../../")
    sys.path.append(_PROJECT_ROOT)
    os.chdir(_PROJECT_ROOT)
from ModelBuilderMethods import getConn, extractYears, scaleColumns, encodeColumns
from Models.models_ensemble import getBalancedClassifier, getClassifier

In [2]:
# Display settings: never truncate cell contents, and show up to 500 rows
# when a frame or series is printed.
for _option, _value in (
    ("display.max_colwidth", None),
    ("display.max_rows", 500),
):
    pd.set_option(_option, _value)
del _option, _value

setting up a dataset

In [3]:
# SQL statements used to build the modelling dataset. Each one is wrapped in
# sqlalchemy's text() so it can be handed to pd.read_sql in the loading cell.

# All columns of the monthly weather-station table.
weatherStationQuery = sq.text(
    """
    SELECT * from dataset_cross_monthly_station
"""
)

# All columns of the monthly "sat" table (presumably satellite-derived weather;
# the read that would use this query is commented out in the loading cell).
weatherSatQuery = sq.text(
    """
    SELECT * from dataset_cross_monthly_sat
"""
)

# Per (year, district) ergot aggregates for the three previous years:
# the present_prev* and percnt_true_prev* columns.
ergotPrevYearsAggQuery = sq.text(
    """
    SELECT year, district, 
    present_prev1, present_prev2, present_prev3,
    percnt_true_prev1, percnt_true_prev2, percnt_true_prev3 
    from agg_ergot_sample_v2
"""
)

# Target rows: the `downgrade` label together with its year and district keys.
ergotTargetQuery = sq.text(
    """
    SELECT year, district, downgrade from ergot_sample_feat_eng
"""
)

In [4]:
# Open a database connection through the project's getConn helper, which is
# given the path to the .env file (presumably holding the credentials -- confirm).
conn = getConn("./.env")

try:
    # Pull each source table into its own DataFrame.
    stationDf = pd.read_sql(weatherStationQuery, conn)
    # satelliteDf = pd.read_sql(weatherSatQuery, conn)
    ergotPrevDf = pd.read_sql(ergotPrevYearsAggQuery, conn)
    ergotTargetDf = pd.read_sql(ergotTargetQuery, conn)
finally:
    # Always release the connection, even if one of the reads above raises;
    # previously a failed query left the connection open.
    conn.close()
    del conn

In [5]:
# Both joins use the same (year, district) key and are left joins, so every
# row of the left-hand table is kept even when the right-hand side has no match.
# (A satellite-based variant of the first join is currently disabled.)
mergeKeys = ["year", "district"]

# Station weather enriched with the previous-years ergot aggregates.
weatherWithHistoryDf = stationDf.merge(ergotPrevDf, on=mergeKeys, how="left")
del stationDf
del ergotPrevDf

# Attach those features to every target row.
datasetDf = ergotTargetDf.merge(weatherWithHistoryDf, on=mergeKeys, how="left")
del ergotTargetDf
del weatherWithHistoryDf
del mergeKeys

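Categorical values: the `district` column is one-hot encoded below with `pd.get_dummies` (see scikit-learn's [one-hot encoding](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html) documentation for background).  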


In [6]:
# One-hot encode `district`: build indicator columns named "district_<value>"
# (the first level is dropped), then swap them in for the original column.
districtDummies = pd.get_dummies(
    datasetDf["district"].astype("category"), prefix="district", drop_first=True
)
datasetDf = pd.concat(
    [datasetDf.drop(columns=["district"]), districtDummies], axis=1
)

del districtDummies

splitting the dataset

In [7]:
# Chronological hold-out: the 1995-2015 range is used for training and the
# 2016-2020 range for testing (bound handling is up to extractYears -- confirm).
trainDf, testDf = (
    extractYears(datasetDf, firstYear, lastYear)
    for firstYear, lastYear in ((1995, 2015), (2016, 2020))
)
del datasetDf

In [8]:
# `year` has served its purpose for the split; remove it from both frames so
# it is not part of the model inputs.
trainDf, testDf = (split.drop(columns=["year"]) for split in (trainDf, testDf))

balancing the dataset https://imbalanced-learn.org/stable/



In [9]:
# Class distribution of the `downgrade` target before any resampling:
# training split first, then test split.
print(
    trainDf["downgrade"].value_counts(),
    testDf["downgrade"].value_counts(),
    sep="\n",
)

downgrade
False    122202
True       2082
Name: count, dtype: int64
downgrade
False    26307
True      1016
Name: count, dtype: int64


In [10]:
# Report how many missing values each training column contains.
print(trainDf.isnull().sum())

# Missing values are handled by discarding every row that has at least one
# (the zero-fill alternative, trainDf.fillna(0), is left disabled).
trainDf = trainDf.dropna(axis=0, how="any")

downgrade                    0
1:min_temp_x              1246
1:max_temp_x              1246
1:mean_temp_x             1246
1:min_dew_point_temp      1246
1:max_dew_point_temp      1246
1:mean_dew_point_temp     1246
1:min_humidex             1246
1:max_humidex             1246
1:mean_humidex            1246
1:min_precip              1246
1:max_precip              1246
1:mean_precip             1246
1:min_rel_humid           1246
1:max_rel_humid           1246
1:mean_rel_humid          1246
1:min_stn_press           1246
1:max_stn_press           1246
1:mean_stn_press          1246
1:min_visibility          1246
1:max_visibility          1246
1:mean_visibility         1246
1:max_temp_y              1246
1:min_temp_y              1246
1:mean_temp_y             1246
1:min_total_rain          1246
1:max_total_rain          1246
1:mean_total_rain         1246
1:min_total_snow          1246
1:max_total_snow          1246
1:mean_total_snow         1246
1:min_total_precip        1246
1:max_to

In [11]:
# Resample the training split with SMOTEENN (fixed seed for repeatability).
# Features are every column except the `downgrade` target.
# NOTE(review): the class counts recorded after this step are still unequal
# despite sampling_strategy=1 -- confirm this is the intended outcome.
balancer = SMOTEENN(random_state=42, sampling_strategy=1)
balancedTrainDfX, balancedTrainDfY = balancer.fit_resample(
    X=trainDf.drop(columns=["downgrade"]),
    y=trainDf["downgrade"],
)

In [12]:
# Class distribution of the resampled training target.
balancedCounts = balancedTrainDfY.value_counts()
print(balancedCounts)
del balancedCounts

downgrade
False    113943
True      23138
Name: count, dtype: int64


### normalization / scaling
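Candidate scikit-learn scalers and transformers for the numeric features, listed by index. The scaling cell below is currently commented out, so no scaling is applied in this notebook.  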
0 [MinMaxScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html)             
1 [MaxAbsScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MaxAbsScaler.html)  
2 [StandardScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html)  
3 [RobustScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html)  
4 [Normalizer](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Normalizer.html)  
5 [PowerTransformer](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PowerTransformer.html)  
6 [QuantileTransformer](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.QuantileTransformer.html)  

In [13]:
# df = pd.DataFrame()
# scaled = scaleColumns(df, ['max_temp'], None, 1)

categorical values [one-hot encoding](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html)  


In [14]:
# encoded = encodeColumns(df, ['max_temp'], None)

In [15]:
def printMetrics(model_name, y_true, y_pred, y_score=None):
    """Print the standard binary-classification metrics for one model.

    Parameters
    ----------
    model_name : str
        Label printed on the first line of the report.
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Hard class predictions for the same rows.
    y_score : array-like, optional
        Positive-class scores or probabilities (e.g. ``predict_proba(X)[:, 1]``).
        When supplied, ROC AUC is computed from these scores. When omitted,
        ROC AUC falls back to ``y_pred`` exactly as before; with hard labels
        the ROC curve has a single operating point, so the value is much less
        informative than one computed from scores.

    Notes
    -----
    ``zero_division=0`` is passed to the precision/recall/F1 based metrics.
    A model that never predicts the positive class is still reported as 0.0,
    but without an undefined-metric warning for every call.
    """
    # Use real scores for the ranking metric when the caller has them.
    auc_input = y_pred if y_score is None else y_score

    print(model_name)
    print("Accuracy: ", accuracy_score(y_true, y_pred))
    print("Precision: ", precision_score(y_true, y_pred, zero_division=0))
    print("Recall: ", recall_score(y_true, y_pred, zero_division=0))
    print("F1: ", f1_score(y_true, y_pred, zero_division=0))
    print("ROC AUC: ", roc_auc_score(y_true, auc_input))
    print(
        "Classification Report: \n",
        classification_report(y_true, y_pred, zero_division=0),
    )
    print()

selecting a model

In [16]:
# Hyper-parameters shared by every forest defined in this cell.
ESTIMATORS = 400
DEPTH = 40
CORES = 10
MINSPLSPLIT = 8
MINSAMPLELEAF = 4

# Keyword arguments common to all six estimators.
_commonForestArgs = dict(
    n_estimators=ESTIMATORS,
    random_state=42,
    max_depth=DEPTH,
    n_jobs=CORES,
)
# The scikit-learn / imbalanced-learn forests additionally get the split and
# leaf size limits; the XGBoost forests are built from the common set only.
_treeLimitedForestArgs = dict(
    _commonForestArgs,
    min_samples_split=MINSPLSPLIT,
    min_samples_leaf=MINSAMPLELEAF,
)

# Each estimator family is instantiated twice with identical settings: one
# instance is later fitted on the resampled training data, the other on the
# training data as-is.
model_rf = RandomForestClassifier(**_treeLimitedForestArgs)
model_nobalance_rf = RandomForestClassifier(**_treeLimitedForestArgs)
balanced_model_rf = BalancedRandomForestClassifier(**_treeLimitedForestArgs)
balanced_model_balanced_rf = BalancedRandomForestClassifier(**_treeLimitedForestArgs)
model_xgbrf = XGBRFClassifier(**_commonForestArgs)
model_balance_xgbrf = XGBRFClassifier(**_commonForestArgs)

del _commonForestArgs, _treeLimitedForestArgs

In [17]:
# Split the un-resampled training frame into features and target once, rather
# than rebuilding the feature frame for every fit call.
rawTrainX = trainDf.drop(columns="downgrade")
rawTrainY = trainDf["downgrade"]

# scikit-learn random forest: un-resampled data, then SMOTEENN-resampled data.
model_nobalance_rf.fit(rawTrainX, rawTrainY)
model_rf.fit(balancedTrainDfX, balancedTrainDfY)

# imbalanced-learn balanced random forest: same two training sets.
balanced_model_rf.fit(rawTrainX, rawTrainY)
balanced_model_balanced_rf.fit(balancedTrainDfX, balancedTrainDfY)

# XGBoost random forest: same two training sets.
model_xgbrf.fit(rawTrainX, rawTrainY)
del rawTrainX, rawTrainY  # no longer needed; the last fit uses the resampled data
model_balance_xgbrf.fit(balancedTrainDfX, balancedTrainDfY)

  warn(
  warn(
  warn(
  warn(


eval procedure

In [18]:
# Same missing-value policy as for the training split: discard every test row
# that has at least one missing value (zero-filling, testDf.fillna(0), is left
# disabled).
testDf = testDf.dropna(axis=0, how="any")

In [19]:
# get predictions
predictions = model_rf.predict(testDf.drop(columns="downgrade"))
predictions_nobalance = model_nobalance_rf.predict(testDf.drop(columns="downgrade"))
predictions_balanced = balanced_model_rf.predict(testDf.drop(columns="downgrade"))
predictions_balanced_balanced = balanced_model_balanced_rf.predict(
    testDf.drop(columns="downgrade")
)
predictions_xgbrf = model_xgbrf.predict(testDf.drop(columns="downgrade"))
predictions_balance_xgbrf = model_balance_xgbrf.predict(
    testDf.drop(columns="downgrade")
)

In [20]:
# How many rows each model assigned to each class, in the order the
# predictions were produced.
print(
    *(
        pd.DataFrame(modelPredictions).value_counts()
        for modelPredictions in (
            predictions,
            predictions_nobalance,
            predictions_balanced,
            predictions_balanced_balanced,
            predictions_xgbrf,
            predictions_balance_xgbrf,
        )
    ),
    sep="\n",
)

True     21732
False     5563
Name: count, dtype: int64
False    27295
Name: count, dtype: int64
False    22064
True      5231
Name: count, dtype: int64
False    14978
True     12317
Name: count, dtype: int64
0    27295
Name: count, dtype: int64
0    19820
1     7475
Name: count, dtype: int64


print model performance metrics on test data

In [21]:
# Report the test-set metrics of every model, pairing each prediction array
# with the label that is printed above its metrics.
for _label, _modelPredictions in (
    ("sk RF balanced train set", predictions),
    ("sk RF imbalanced train set", predictions_nobalance),
    ("imb RF imbalanced train set", predictions_balanced),
    ("imb RF balanced train set", predictions_balanced_balanced),
    ("xgb RF imbalanced train set", predictions_xgbrf),
    ("xgb RF balanced train set", predictions_balance_xgbrf),
):
    printMetrics(_label, testDf["downgrade"], _modelPredictions)
del _label, _modelPredictions

sk RF balanced train set
Accuracy:  0.22733101300604505
Precision:  0.03814651205595435
Recall:  0.8159448818897638
F1:  0.07288552839810093
ROC AUC:  0.5102594381669985
Classification Report: 
               precision    recall  f1-score   support

       False       0.97      0.20      0.34     26279
        True       0.04      0.82      0.07      1016

    accuracy                           0.23     27295
   macro avg       0.50      0.51      0.21     27295
weighted avg       0.93      0.23      0.33     27295


sk RF imbalanced train set
Accuracy:  0.9627770653965928
Precision:  0.0
Recall:  0.0
F1:  0.0
ROC AUC:  0.5
Classification Report: 
               precision    recall  f1-score   support

       False       0.96      1.00      0.98     26279
        True       0.00      0.00      0.00      1016

    accuracy                           0.96     27295
   macro avg       0.48      0.50      0.49     27295
weighted avg       0.93      0.96      0.94     27295


imb RF imbalanc

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Precision:  0.04361204013377926
Recall:  0.32086614173228345
F1:  0.07678718643269344
ROC AUC:  0.5244119132878472
Classification Report: 
               precision    recall  f1-score   support

       False       0.97      0.73      0.83     26279
        True       0.04      0.32      0.08      1016

    accuracy                           0.71     27295
   macro avg       0.50      0.52      0.45     27295
weighted avg       0.93      0.71      0.80     27295


