how this notebook works

In [None]:
# dependencies

import pandas as pd
import sqlalchemy as sq
import sys, os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from imblearn.over_sampling import RandomOverSampler

import tensorflow
from tensorflow import keras
from keras import backend as K
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout
from keras_tuner.tuners import RandomSearch, BayesianOptimization

from sklearn.metrics import (  # type: ignore
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    roc_auc_score,
    classification_report,
)

# Add the directory two levels up to the import path and make it the working
# directory, so the project-local imports below and relative paths such as
# "./.env" resolve from there.
# NOTE(review): presumably "../../" is the repository root — confirm.
# NOTE(review): os.chdir is relative, so re-running this cell without a kernel
# restart moves the working directory up two more levels.
sys.path.append("../../")
os.chdir("../../")
from ModelBuilderMethods import getConn
from Datasets.DataTestSplit import splitData

In [None]:
# Run on CPU only: expose no GPU devices to TensorFlow, then make every
# physical CPU device visible.
tensorflow.config.set_visible_devices([], 'GPU')
cpu_devices = tensorflow.config.list_physical_devices('CPU')
tensorflow.config.set_visible_devices(cpu_devices, 'CPU')

In [None]:
# Display settings: never truncate the text inside a cell, and render up to
# 500 rows before pandas abbreviates the output.
pd.options.display.max_colwidth = None
pd.options.display.max_rows = 500

setting up a dataset

In [None]:
# All columns of the monthly weather-station dataset table.
weatherStationQuery = sq.text(
    """
    SELECT * from dataset_cross_monthly_station
"""
)

# All columns of the monthly satellite dataset table.
# NOTE(review): the read that would use this query is commented out in the
# loading cell, so this query is currently unused.
weatherSatQuery = sq.text(
    """
    SELECT * from dataset_cross_monthly_sat
"""
)

# Per (year, district) ergot aggregates: the present_prev1..3 and
# percnt_true_prev1..3 columns.
# NOTE(review): presumably the prevN suffix means "N years earlier" — confirm.
ergotPrevYearsAggQuery = sq.text(
    """
    SELECT year, district, 
    present_prev1, present_prev2, present_prev3,
    percnt_true_prev1, percnt_true_prev2, percnt_true_prev3 
    from agg_ergot_sample_v2
"""
)

# Target table: the `downgrade` label together with the (year, district) join keys.
ergotTargetQuery = sq.text(
    """
    SELECT year, district, downgrade from ergot_sample_feat_eng
"""
)

In [None]:
# Open the database connection using the settings file at "./.env" and load
# each query result into its own DataFrame.
conn = getConn("./.env")

try:
    stationDf = pd.read_sql(weatherStationQuery, conn)
    # satelliteDf = pd.read_sql(weatherSatQuery, conn)
    ergotPrevDf = pd.read_sql(ergotPrevYearsAggQuery, conn)
    ergotTargetDf = pd.read_sql(ergotTargetQuery, conn)
finally:
    # Always release the connection, even when one of the reads fails;
    # otherwise a failed query leaves it open for the life of the kernel.
    conn.close()
    del conn

In [None]:
# Feature source: the weather-station frame only.
# Disabled alternatives, kept for reference:
#   tempdf = pd.merge(satelliteDf, ergotPrevDf, on=["year", "district"], how="left")
#   del satelliteDf
#   del ergotPrevDf
#   tempdf = satelliteDf
tempdf = stationDf

# Left join on (year, district): every target row is kept, and rows with no
# matching feature row get NaN in the feature columns.
datasetDf = ergotTargetDf.merge(tempdf, how="left", on=["year", "district"])

# Drop the names that are no longer needed.
del ergotTargetDf, tempdf

categorical values [one-hot encoding](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html)  


In [None]:
# One-hot encode `district`: cast it to a categorical, expand it into
# `district_*` indicator columns (drop_first omits the first category's
# column), and swap those in for the original column.
district_categorical = datasetDf["district"].astype("category")
district_dummies = pd.get_dummies(
    district_categorical, prefix="district", drop_first=True
)

datasetDf = pd.concat(
    [datasetDf.drop(columns=["district"]), district_dummies], axis=1
)

del district_categorical, district_dummies

splitting the dataset

In [None]:
# Columns passed to splitData as `drop_features`, and the name of the label column.
drop_features = ["year"]
target_variable = "downgrade"
# Split into train / validation / test feature sets and targets using the
# project helper.
# NOTE(review): the meaning of the positional arguments 2019, 0.2 and False is
# defined by splitData (Datasets/DataTestSplit); presumably a cut-off year, a
# validation fraction and a flag — confirm against its signature.
X_train, X_val, X_test, y_train, y_val, y_test = splitData(datasetDf, drop_features, target_variable, 2019,0.2, False)

balancing the dataset https://imbalanced-learn.org/stable/



In [None]:
# Class balance before resampling: print how often each `downgrade` label
# occurs in the training targets and in the test targets.
for split_target in (y_train, y_test):
    print(split_target.value_counts())

In [None]:
# oversampling data
# Resample the training split with imblearn's RandomOverSampler. The fixed
# random_state makes the resampling repeatable. Only the training split is
# resampled; the validation and test splits are left untouched.
ros = RandomOverSampler(random_state=42)
X_train_rs, y_train_rs = ros.fit_resample(X_train, y_train)

### normalization / scaling
Candidate scikit-learn scalers and transformers (the next cell uses `StandardScaler`):  
0 [MinMaxScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html)             
1 [MaxAbsScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MaxAbsScaler.html)  
2 [StandardScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html)  
3 [RobustScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html)  
4 [Normalizer](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Normalizer.html)  
5 [PowerTransformer](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PowerTransformer.html)  
6 [QuantileTransformer](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.QuantileTransformer.html)  

In [None]:
# normalizing data
# Fit the scaler on the original (pre-oversampling) training rows only, then
# apply that same transform to every feature set the model will see.
# NOTE: this cell overwrites its inputs, so re-running it without re-running
# the split/oversampling cells would scale already-scaled data.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
# The tuner search and the final fit train on the oversampled set, so it has
# to be scaled as well. Otherwise the model is trained on raw features but
# validated and tested on standardized ones.
X_train_rs = scaler.transform(X_train_rs)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

selecting a model

In [None]:
def auc(y_true, y_pred):
    """Return the ROC AUC of the scores ``y_pred`` against the labels ``y_true``.

    The previous implementation used ``tensorflow.metrics.auc`` together with
    ``K.get_session()``. Those are TF1-style APIs and are not available under
    the TensorFlow 2 API this notebook already relies on (for example
    ``tensorflow.config.set_visible_devices``), so the function is rewritten
    on top of ``tensorflow.keras.metrics.AUC``.

    Args:
        y_true: ground-truth binary labels.
        y_pred: predicted scores/probabilities, same shape as ``y_true``.

    Returns:
        A scalar tensor holding the AUC value.

    NOTE(review): a fresh metric object is created on every call, so this is
    intended for eager, one-off evaluation. To track AUC during training, pass
    ``tensorflow.keras.metrics.AUC(name='auc')`` to ``model.compile`` instead.
    """
    metric = tensorflow.keras.metrics.AUC()
    metric.update_state(y_true, y_pred)
    return metric.result()

In [None]:
# use this model if you want to see the accuracy
def build_model(hp):
    """Build and compile a dense binary classifier for one keras-tuner trial.

    Hyperparameters sampled from ``hp``:
      * ``units_input``   - width of the first Dense layer (32..256, step 32)
      * ``num_layers``    - number of extra hidden Dense layers (1..5)
      * ``units_<i>``     - width of hidden layer ``i`` (32..256, step 32)
      * ``learning_rate`` - Adam learning rate, one of 1e-2, 1e-3, 1e-4

    The input width is read from the notebook-level ``X_train``. The model
    ends in a single sigmoid unit and is compiled with binary cross-entropy
    loss and the accuracy metric.
    """
    n_features = X_train.shape[1]

    def layer_width(hp_name):
        # Every Dense layer draws its width from the same search range.
        return hp.Int(hp_name, min_value=32, max_value=256, step=32)

    model = keras.Sequential()
    model.add(
        Dense(
            units=layer_width('units_input'),
            activation='relu',
            input_shape=(n_features,),
        )
    )

    hidden_layer_count = hp.Int('num_layers', min_value=1, max_value=5)
    for layer_idx in range(hidden_layer_count):
        model.add(Dense(units=layer_width(f'units_{layer_idx}'), activation='relu'))

    model.add(Dense(1, activation='sigmoid'))

    # Compile with the sampled learning rate.
    learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return model

# # use this model if you want to see the AUC
# def build_model(hp):
#     model = keras.Sequential()
#     model.add(Dense(units=hp.Int('units_input', min_value=32, max_value=256, step=32),
#                            activation='relu', input_shape=(X_train.shape[1],)))
#     for i in range(hp.Int('num_layers', min_value=1, max_value=5)):
#         model.add(Dense(units=hp.Int(f'units_{i}', min_value=32, max_value=256, step=32),
#                                activation='relu'))
#     model.add(Dense(1, activation='sigmoid'))

#     # Compile the model with the desired optimizer, loss, and metrics
#     model.compile(optimizer=keras.optimizers.Adam(hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])),
#                   loss='binary_crossentropy', metrics=['accuracy', tensorflow.keras.metrics.AUC(name='auc')])
#     return model

In [None]:
# Bayesian-optimization hyperparameter search over build_model: 10 trials, each
# configuration built and trained twice (executions_per_trial=2).
# overwrite=True tells the tuner to overwrite rather than reuse earlier results
# for this directory / project_name.
# NOTE(review): the commented-out AUC objective refers to `kt`, which is not
# imported in the visible import cell (`import keras_tuner as kt` would be
# needed), and it only applies to the build_model variant that reports an
# 'auc' metric.
tuner = BayesianOptimization(
    build_model,
    # objective=kt.Objective("val_auc", direction="max"),           # if you want to maximize AUC
    objective="val_accuracy",                                       # if you want to maximize accuracy
    max_trials=10,
    overwrite=True,
    executions_per_trial=2,
    directory='data/BayesianOptimization',
    project_name='ergot_random_search')

In [None]:
# Number of epochs passed to the tuner search; the later model.fit call reuses
# the same constant.
EPOCHES = 20
# run the search
# Trials train on the oversampled training set and are scored on the
# validation split.
tuner.search(X_train_rs, y_train_rs, epochs=EPOCHES, validation_data=(X_val, y_val))

build the model

In [None]:
# Method : 1
# model = tuner.hypermodel.build(best_hps)
# model.fit(X_train, y_train, epochs=100, validation_data=(X_test, y_test))

# Method : 2
# Take the best model found by the tuner search.
model = tuner.get_best_models(num_models=1)[0]
# Build with an unspecified batch dimension. Passing X_train.shape would pin
# the batch size to the number of training rows, which is not part of the
# model's input signature and can conflict with the input shape already
# declared in build_model.
model.build((None, X_train.shape[1]))

In [None]:
# Print the architecture of the selected model.
model.summary()

In [None]:
# using  validation_data
# Train the selected model on the oversampled training set, evaluating on the
# validation split after each epoch. `history` holds the per-epoch loss and
# metric values that evaluate_model plots.

history = model.fit(X_train_rs, y_train_rs, epochs=EPOCHES, batch_size=64, validation_data=(X_val, y_val), verbose=1)

visualize training

In [None]:
def evaluate_model(history):
    """Plot per-epoch training vs. validation curves for loss and accuracy.

    Draws one figure with two side-by-side panels: loss on the left and
    accuracy on the right.

    Args:
        history: object whose ``history`` mapping contains the series
            'loss', 'val_loss', 'accuracy' and 'val_accuracy' (for example
            the value returned by ``model.fit``).
    """
    curves = history.history

    # Look every series up before drawing, so a missing key fails before any
    # figure has been created.
    panels = [
        ('Loss', curves['loss'], curves['val_loss']),
        ('Accuracy', curves['accuracy'], curves['val_accuracy']),
    ]

    _, axes = plt.subplots(1, 2, figsize=(20, 6))
    for ax, (metric_name, train_series, val_series) in zip(axes, panels):
        ax.plot(train_series, label=f'Training {metric_name}')
        ax.plot(val_series, label=f'Validation {metric_name}')
        ax.set_xlabel('Epoch')
        ax.set_ylabel(metric_name)
        ax.set_title(f'Training and Validation {metric_name}')
        ax.legend()

    plt.show()

In [None]:
# Plot the loss and accuracy curves recorded by the model.fit call above.
evaluate_model(history)

prediction

In [None]:
def model_predict(model: Sequential, X_test: np.ndarray, y_test: np.ndarray, threshold: float = 0.7):
    """Score ``model`` on a held-out set and print classification metrics.

    Prints accuracy, precision, recall and F1 for the thresholded predictions,
    and the ROC AUC of the raw predicted scores.

    Args:
        model: fitted model whose ``predict`` returns one score per row.
        X_test: feature matrix to score.
        y_test: true binary labels for ``X_test`` (assumed to be 0/1-like).
        threshold: a score strictly above this value is predicted as class 1.
            Defaults to 0.7, the cut-off that used to be hard-coded.

    Returns:
        None. The metrics are only printed.
    """
    # Flatten the (n, 1) model output to (n,).
    y_prob = np.asarray(model.predict(X_test)).ravel()
    y_pred = np.where(y_prob > threshold, 1, 0)

    # labels=[0, 1] forces a 2x2 matrix even if one class never appears, so the
    # unpacking below cannot fail. Rows are true labels, columns are predictions.
    conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])
    tn, fp, fn, tp = conf_matrix.ravel()

    accuracy = (tn + tp) / np.sum(conf_matrix)
    print("Accuracy: ", accuracy)

    # Report 0.0 instead of dividing by zero when nothing was predicted
    # positive or no positive sample exists.
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    print("Precision: ", precision)

    recall = tp / (tp + fn) if (tp + fn) else 0.0
    print("Recall: ", recall)

    # Named `f1` so the sklearn `f1_score` import is not shadowed.
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) else 0.0
    print("F1 Score: ", f1)

    # Use the labels passed in (the old code read the global y_val) and the
    # raw scores, which is what ROC AUC is defined on.
    auc_score = roc_auc_score(y_test, y_prob)
    print("AUC Score: ", auc_score)

In [None]:
# Report the metrics on the held-out test split. model_predict requires the
# true labels as its third argument; without them the call raises TypeError.
model_predict(model, X_test, y_test)