# Boost.ipynb
- Implementation for building, training, and evaluating machine learning models for a specific classification task using various ensemble methods like Gradient Boosting, RUSBoost, and XGBoost. 
- The task involves predicting the "downgrade" label from a given dataset.

In [None]:
# dependencies

import pandas as pd
import sqlalchemy as sq
import sys, os
import pickle
from imblearn.combine import SMOTEENN, SMOTETomek
from xgboost import XGBClassifier
from sklearn.ensemble import (  # type: ignore
    GradientBoostingClassifier,
)
from imblearn.ensemble import (  # type: ignore
    RUSBoostClassifier,
)

from sklearn.metrics import (  # type: ignore
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    classification_report,
)

sys.path.append("../../")
os.chdir("../../")
from ModelBuilderMethods import getConn, extractYears, scaleColumns, encodeColumns
from Models.models_ensemble import getBalancedClassifier, getClassifier

In [None]:
# Display settings: never truncate the text inside a cell, and show up to
# 500 rows before pandas abbreviates a printed frame/series.
pd.options.display.max_colwidth = None
pd.options.display.max_rows = 500

setting up a dataset

Purpose :
- Define the SQL queries used to retrieve the datasets (from different tables) that train and test the model.

In [None]:
# SQL statements for the tables this notebook reads. Each one is wrapped in
# sqlalchemy's text() so it can be passed to pd.read_sql later on.

# All columns of the station-based weather dataset.
weatherStationQuery = sq.text(
    """
    SELECT * from dataset_cross_monthly_station
"""
)

# All columns of the satellite-based weather dataset.
# NOTE(review): the only read that uses this query in the visible notebook is
# commented out, so it is currently defined but not executed.
weatherSatQuery = sq.text(
    """
    SELECT * from dataset_cross_monthly_sat
"""
)

# Per (year, district) aggregates from agg_ergot_sample_v2.
# NOTE(review): the *_prev1..3 suffixes presumably refer to the 1-3 preceding
# years -- confirm against the table definition.
ergotPrevYearsAggQuery = sq.text(
    """
    SELECT year, district, 
    present_prev1, present_prev2, present_prev3,
    percnt_true_prev1, percnt_true_prev2, percnt_true_prev3 
    from agg_ergot_sample_v2
"""
)

# Merge keys (year, district) plus the "downgrade" column, which is the label
# the models in this notebook are trained to predict.
ergotTargetQuery = sq.text(
    """
    SELECT year, district, downgrade from ergot_sample_feat_eng
"""
)

Purpose :
- [To Load the data from the database directly into a DataFrame](https://pandas.pydata.org/docs/reference/api/pandas.read_sql.html) 

In [None]:
# Open the database connection configured by ./.env and load each query
# result into its own DataFrame.
conn = getConn("./.env")

try:
    stationDf = pd.read_sql(weatherStationQuery, conn)
    # satelliteDf = pd.read_sql(weatherSatQuery, conn)
    # NOTE(review): ergotPrevDf is loaded here, but the only merge that uses it
    # in the visible notebook is commented out -- confirm it is still needed.
    ergotPrevDf = pd.read_sql(ergotPrevYearsAggQuery, conn)
    ergotTargetDf = pd.read_sql(ergotTargetQuery, conn)
finally:
    # Always release the connection, even when one of the reads above fails;
    # otherwise it would stay open for the lifetime of the kernel.
    conn.close()
    del conn

Purpose :
- To merge target and temporary dataframe

In [None]:
# Feature source for this run: the station weather frame, used as-is.
# (A disabled alternative merged the satellite frame with the previous-years
# ergot aggregates on ["year", "district"] instead.)
tempdf = stationDf

# Left-join the features onto the target rows so that every target row is
# kept, matching on year and district.
datasetDf = ergotTargetDf.merge(tempdf, how="left", on=["year", "district"])

# Drop the names that are no longer needed. tempdf is only an alias, so the
# station frame itself stays reachable through stationDf.
del ergotTargetDf, tempdf

categorical values [one-hot encoding](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html)  


In [None]:
# One-hot encode the district column. drop_first=True omits the indicator of
# the first category, which therefore acts as the implicit baseline.
datasetDf["district"] = datasetDf["district"].astype("category")
districtDummies = pd.get_dummies(
    datasetDf["district"], prefix="district", drop_first=True
)

# Replace the original column with its indicator columns, appended at the end.
datasetDf = pd.concat(
    [datasetDf.drop(columns=["district"]), districtDummies], axis=1
)
del districtDummies

splitting the dataset

In [None]:
# Temporal split: the earlier years train the models, the later years are
# held out for evaluation.
# NOTE(review): whether extractYears treats both bounds as inclusive is not
# visible from this notebook -- confirm in ModelBuilderMethods.
TRAIN_YEARS = (1995, 2015)
TEST_YEARS = (2016, 2020)

trainDf = extractYears(datasetDf, *TRAIN_YEARS)
testDf = extractYears(datasetDf, *TEST_YEARS)

# The combined frame is not used after the split.
del datasetDf

In [None]:
# Remove the year column from both splits so it is not fed to the models
# as a feature.
trainDf, testDf = (frame.drop(columns=["year"]) for frame in (trainDf, testDf))

balancing the dataset https://imbalanced-learn.org/stable/



In [None]:
# Class distribution of the target before any resampling:
# training split first, then test split.
print(
    trainDf["downgrade"].value_counts(),
    testDf["downgrade"].value_counts(),
    sep="\n",
)

In [None]:
# Report how many missing values each training column contains ...
nanPerColumn = trainDf.isna().sum()
print(nanPerColumn)

# ... then replace every missing value in the training split with 0.
trainDf = trainDf.fillna(value=0)

In [None]:
# Separate the features from the "downgrade" label before resampling.
trainX = trainDf.drop(columns="downgrade")
trainY = trainDf["downgrade"]

# Resample the training split with SMOTEENN (fixed seed for repeatability).
# The test split is never passed through the balancer.
balancer = SMOTEENN(sampling_strategy=1, random_state=42)
balancedTrainDfX, balancedTrainDfY = balancer.fit_resample(trainX, trainY)

In [None]:
# Class distribution of the resampled training labels.
balancedCounts = balancedTrainDfY.value_counts()
print(balancedCounts)

### normalization / scaling
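Candidate scikit-learn scalers for feature normalization, numbered as in the list below (no scaling is currently applied in this notebook; the call in the next cell is commented out):  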
0 [MinMaxScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html)             
1 [MaxAbsScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MaxAbsScaler.html)  
2 [StandardScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html)  
3 [RobustScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html)  
4 [Normalizer](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Normalizer.html)  
5 [PowerTransformer](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PowerTransformer.html)  
6 [QuantileTransformer](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.QuantileTransformer.html)  

In [None]:
# df = pd.DataFrame()
# scaled = scaleColumns(df, ['max_temp'], None, 1)

categorical values [one-hot encoding](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html)  


In [None]:
# encoded = encodeColumns(df, ['max_temp'], None)

In [None]:
def printMetrics(model_name, y_true, y_pred, y_score=None):
    """Print a set of classification metrics for one model.

    Args:
        model_name: Label printed as the heading of the report.
        y_true: Ground-truth labels.
        y_pred: Predicted class labels, same length as ``y_true``.
        y_score: Optional continuous scores (e.g. the positive-class column of
            ``predict_proba``) used for ROC AUC. When omitted, ROC AUC is
            computed from ``y_pred`` as before; with hard labels that only
            evaluates a single operating point of the ROC curve.

    Returns:
        None. All results are written to stdout.
    """
    print(model_name)
    print("Accuracy: ", accuracy_score(y_true, y_pred))
    print("Precision: ", precision_score(y_true, y_pred))
    print("Recall: ", recall_score(y_true, y_pred))
    print("F1: ", f1_score(y_true, y_pred))

    # roc_auc_score can raise ValueError (e.g. when y_true holds a single
    # class). Report that instead of aborting, so the classification report
    # below is still printed.
    auc_input = y_pred if y_score is None else y_score
    try:
        print("ROC AUC: ", roc_auc_score(y_true, auc_input))
    except ValueError as err:
        print("ROC AUC: ", f"undefined ({err})")

    print("Classification Report: \n", classification_report(y_true, y_pred))
    print()

Purpose:
- It initializes multiple instances of different ensemble models for the classification task. 
- Models initialized:
    - [Gradient Boosting Model](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html)
    - Balanced Gradient Boosting Model
    - [RUSBoost Model](https://imbalanced-learn.org/stable/references/generated/imblearn.ensemble.RUSBoostClassifier.html)
    - Balanced RUSBoost Model
    - [XGBoost Model](https://xgboost.readthedocs.io/en/stable/python/python_api.html)
    - Balanced XGBoost Model
Note:
- The models are set up with specific hyperparameters for training and evaluation.

In [None]:
# Hyperparameters shared by the models below.
ESTIMATORS = 400
DEPTH = 40
CORES = -1
# NOTE(review): the next two constants are not passed to any model in this
# cell -- confirm whether they were meant to be wired in.
MINSPLSPLIT = 8
MINSAMPLELEAF = 4

# One keyword set per model family. Each family is instantiated twice with
# identical settings: one instance for the original training split and one
# ("balanced_*") for the resampled training split.
gradientBoostingParams = dict(
    n_estimators=ESTIMATORS,
    random_state=42,
    max_depth=DEPTH,
    verbose=1,
    n_iter_no_change=200,
)
# NOTE(review): sampling_strategy=0.5 asks the internal under-sampler for a
# 1:2 minority:majority ratio; imblearn may reject that on an already
# (near-)balanced training set -- confirm balanced_rusboost_model fits cleanly.
rusBoostParams = dict(
    n_estimators=ESTIMATORS,
    random_state=42,
    sampling_strategy=0.5,
)
xgBoostParams = dict(
    n_estimators=ESTIMATORS,
    random_state=42,
    max_depth=DEPTH,
    verbosity=1,
    n_jobs=CORES,
)

gradient_boosting_model = GradientBoostingClassifier(**gradientBoostingParams)
balanced_gradient_boosting_model = GradientBoostingClassifier(**gradientBoostingParams)

rusboost_model = RUSBoostClassifier(**rusBoostParams)
balanced_rusboost_model = RUSBoostClassifier(**rusBoostParams)

xgboost_model = XGBClassifier(**xgBoostParams)
balanced_xgboost_model = XGBClassifier(**xgBoostParams)

Purpose : 
- Train the models on data and predict the target variables.
- Evaluate the metrics retrieved.

In [None]:
# Gradient boosting: one model on the original (imbalanced) training split,
# one on the resampled split.
trainX = trainDf.drop(columns="downgrade")
trainY = trainDf["downgrade"]

gradient_boosting_model.fit(trainX, trainY)
balanced_gradient_boosting_model.fit(balancedTrainDfX, balancedTrainDfY)

In [None]:
# RUSBoost: one model on the original (imbalanced) training split,
# one on the resampled split.
trainX = trainDf.drop(columns="downgrade")
trainY = trainDf["downgrade"]

rusboost_model.fit(trainX, trainY)
balanced_rusboost_model.fit(balancedTrainDfX, balancedTrainDfY)

In [None]:
# XGBoost: one model on the original (imbalanced) training split,
# one on the resampled split.
trainX = trainDf.drop(columns="downgrade")
trainY = trainDf["downgrade"]

xgboost_model.fit(trainX, trainY)
balanced_xgboost_model.fit(balancedTrainDfX, balancedTrainDfY)

eval procedure

In [None]:
# Replace every missing value in the test split with 0, mirroring the
# treatment applied to the training split.
testDf = testDf.fillna(value=0)

In [None]:
# Predict the held-out split with every fitted model. The feature matrix
# (test split without the label) is built once and shared by all six calls.
testX = testDf.drop(columns="downgrade")

predictions_gradient_boosting = gradient_boosting_model.predict(testX)
predictions_balanced_gradient_boosting = balanced_gradient_boosting_model.predict(testX)
predictions_rusboost = rusboost_model.predict(testX)
predictions_balanced_rusboost = balanced_rusboost_model.predict(testX)
predictions_xgboost = xgboost_model.predict(testX)
predictions_balanced_xgboost = balanced_xgboost_model.predict(testX)

In [None]:
# How many test rows each model assigned to each class, in the same model
# order as the prediction cell.
print(
    *(
        pd.DataFrame(predictions).value_counts()
        for predictions in (
            predictions_gradient_boosting,
            predictions_balanced_gradient_boosting,
            predictions_rusboost,
            predictions_balanced_rusboost,
            predictions_xgboost,
            predictions_balanced_xgboost,
        )
    ),
    sep="\n",
)

print model performance metrics on test data

In [None]:
# Accuracy, precision, recall, F1, ROC AUC and the classification report for
# every model, each scored against the true labels of the test split.
for reportLabel, modelPredictions in (
    ("sk GB imbalanced train set", predictions_gradient_boosting),
    ("imb GB balanced train set", predictions_balanced_gradient_boosting),
    ("sk RUS imbalanced train set", predictions_rusboost),
    ("imb RUS balanced train set", predictions_balanced_rusboost),
    ("sk XGB imbalanced train set", predictions_xgboost),
    ("imb XGB balanced train set", predictions_balanced_xgboost),
):
    printMetrics(reportLabel, testDf["downgrade"], modelPredictions)