In [42]:
import sys
import os
from dotenv import load_dotenv

import numpy as np
import pandas as pd
import sqlalchemy as sq
import matplotlib.pyplot as plt
import seaborn as sns
from pandas_profiling import ProfileReport
from imblearn.over_sampling import RandomOverSampler

from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report, roc_curve, roc_auc_score, confusion_matrix, recall_score, precision_score, f1_score
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif, VarianceThreshold

import tensorflow
from tensorflow import keras
from tensorflow.keras.regularizers import l1, l2, l1_l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout

import kerastuner as kt
from keras_tuner.tuners import RandomSearch, Hyperband, BayesianOptimization

from ann_visualizer.visualize import ann_viz
import graphviz



sys.path.append("../Shared/")
from DataService import DataService

sys.path.append("../Datasets/")
# print(os.getcwd())
from DataCreation import getDatasetV1, getDatasetV2, getDatasetV3, getDatasetV4
from DataTestSplit import splitData

In [2]:
# disable GPU
tensorflow.config.set_visible_devices([], 'GPU')  # Hide GPU devices
tensorflow.config.set_visible_devices(tensorflow.config.list_physical_devices('CPU'), 'CPU')  # Show CPU devices

2023-07-28 14:01:04.019474: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:266] failed call to cuInit: CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE: forward compatibility was attempted on non supported HW
2023-07-28 14:01:04.019513: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:168] retrieving CUDA diagnostic information for host: woodswallow-02.cs.umanitoba.ca
2023-07-28 14:01:04.019518: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:175] hostname: woodswallow-02.cs.umanitoba.ca
2023-07-28 14:01:04.019601: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:199] libcuda reported version is: 525.125.6
2023-07-28 14:01:04.019619: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:203] kernel reported version is: 510.108.3
2023-07-28 14:01:04.019623: E tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:312] kernel version 510.108.3 does not match DSO version 525.125.6 -- cannot find working 

### fetch data

In [3]:
load_dotenv()
PG_DB = os.getenv("POSTGRES_DB")
PG_ADDR = os.getenv("POSTGRES_ADDR")
PG_PORT = os.getenv("POSTGRES_PORT")
PG_USER = os.getenv("POSTGRES_USER")
PG_PW = os.getenv("POSTGRES_PW")

if (
    PG_DB is None
    or PG_ADDR is None
    or PG_PORT is None
    or PG_USER is None
    or PG_PW is None
):
    raise ValueError("Environment variables not set")

# connecting to database
db = DataService(PG_DB, PG_ADDR, int(PG_PORT), PG_USER, PG_PW)
conn = db.connect()

In [4]:
weatherStationQuery = sq.text("""
    SELECT * from dataset_cross_monthly_station
""")

weatherSatQuery = sq.text("""
    SELECT * from dataset_monthly_sat
""")

ergotPrevYearsAggQuery = sq.text("""
    SELECT year, district, 
    present_prev1, present_prev2, present_prev3,
    percnt_true_prev1, percnt_true_prev2, percnt_true_prev3 
    from agg_ergot_sample_v2
""")

ergotTargetQuery = sq.text("""
    SELECT year, district, downgrade from ergot_sample_feat_eng
""")

In [5]:
stationDf = pd.read_sql(weatherStationQuery, conn)
satelliteDf = pd.read_sql(weatherSatQuery, conn)
ergotPrevDf = pd.read_sql(ergotPrevYearsAggQuery, conn)
ergotTargetDf = pd.read_sql(ergotTargetQuery, conn)

conn.close()
del conn

In [6]:
ergotTargetDf.drop_duplicates( inplace=True)
ergotTargetDf

Unnamed: 0,year,district,downgrade
0,1995,4810,False
48,1995,4820,False
190,1995,4830,False
230,1995,4840,False
280,1995,4840,True
...,...,...,...
157041,2022,4751,True
157126,2022,4791,True
157219,2022,4731,True
157727,2022,4604,True


In [7]:
ergotPrevDf.fillna(0, inplace=True)
ergotPrevDf

Unnamed: 0,year,district,present_prev1,present_prev2,present_prev3,percnt_true_prev1,percnt_true_prev2,percnt_true_prev3
0,1995,4810,False,False,False,0.000000,0.000000,0.000000
1,1995,4820,False,False,False,0.000000,0.000000,0.000000
2,1995,4830,False,False,False,0.000000,0.000000,0.000000
3,1995,4840,False,False,False,0.000000,0.000000,0.000000
4,1995,4850,False,False,False,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...
1087,2022,4761,False,True,True,0.010204,0.076923,0.062893
1088,2022,4771,False,True,False,0.000000,0.153333,0.013072
1089,2022,4790,True,True,True,0.025316,0.229167,0.147727
1090,2022,4791,False,True,True,0.028986,0.223881,0.328571


In [8]:
stationDf

Unnamed: 0,year,district,1:min_temp_x,1:max_temp_x,1:mean_temp_x,1:min_dew_point_temp,1:max_dew_point_temp,1:mean_dew_point_temp,1:min_humidex,1:max_humidex,...,12:mean_total_rain,12:min_total_snow,12:max_total_snow,12:mean_total_snow,12:min_total_precip,12:max_total_precip,12:mean_total_precip,12:min_snow_on_grnd,12:max_snow_on_grnd,12:mean_snow_on_grnd
0,1953,4606,-37.2,-3.3,-16.650918,-22.2,0.0,-3.170833,0.0,0.0,...,0.0,0.0,10.2,1.047826,0.0,10.2,1.047826,0.0,0.0,0.000000
1,1953,4607,-31.7,-4.4,-14.224855,-33.9,-5.0,-14.939367,0.0,0.0,...,0.0,0.0,3.3,0.508696,0.0,3.3,0.508696,0.0,0.0,0.000000
2,1953,4611,-31.7,-5.0,-13.608918,-32.2,-5.6,-14.508056,0.0,0.0,...,0.0,0.0,2.3,0.231818,0.0,2.3,0.231818,0.0,0.0,0.000000
3,1953,4612,-38.9,-7.2,-23.922143,-40.0,0.0,-23.888548,0.0,0.0,...,0.0,0.0,20.3,1.161905,0.0,20.3,1.161905,0.0,0.0,0.000000
4,1953,4710,-33.9,3.3,-11.815733,-33.9,-1.1,-12.818630,0.0,0.0,...,0.0,0.0,3.3,0.191304,0.0,3.3,0.191304,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2655,2022,4605,-36.8,1.7,-18.537634,-40.7,-1.0,-21.749866,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,9.3,0.879310,12.0,29.0,17.275862
2656,2022,4604,-37.6,-0.1,-18.692339,-41.0,-0.8,-21.278898,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,11.1,0.700000,10.0,36.0,18.966667
2657,2022,4771,-36.7,3.2,-14.102554,-41.9,-1.0,-17.953763,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,6.0,0.706667,4.0,7.0,4.966667
2658,2022,4609,-35.4,-0.8,-18.504344,-39.3,-1.9,-22.128021,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,10.2,0.566667,0.0,17.0,2.988889


In [9]:
# merge on year and district
datasetDf = pd.merge(ergotTargetDf, stationDf, on=["year", "district"], how="left")
datasetDf = pd.merge(datasetDf, ergotPrevDf, on=["year", "district"], how="left")
# del ergotTargetDf

In [12]:
mean = datasetDf.mean()
datasetDf.fillna(mean, inplace=True)
t = pd.get_dummies(datasetDf["district"], drop_first=True)
datasetDf = pd.concat([datasetDf, t], axis=1)
datasetDf.columns = datasetDf.columns.astype(str)

In [14]:
df = datasetDf.copy()

In [15]:
X_train, X_val, X_test, y_train, y_val, y_test = splitData(datasetDf, drop_features=["year"], target_variable="downgrade", pivot=2015, val_size=0.2, stratified=False)
# del datasetDf

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=drop_features, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=drop_features, inplace=True)


In [16]:
# normalizing data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# oversampling data
ros = RandomOverSampler(random_state=42)
X_train_rs, y_train_rs = ros.fit_resample(X_train, y_train)

### Model 

In [17]:
# Define the hyperparameter search space
pbounds = {
    'hidden_layer_sizes': (10, 100),  # Range for the number of neurons in the hidden layer
    'alpha': (1e-5, 1e-1),           # Range for L2 regularization parameter (alpha)
    'learning_rate_init': (1e-4, 1e-2) # Range for the initial learning rate
}

In [20]:
# Define the objective function for Bayesian optimization
def train_evaluate_model(hidden_layer_sizes, alpha, learning_rate_init):
    # Convert hidden_layer_sizes to integer values
    hidden_layer_sizes = int(hidden_layer_sizes)

    # Initialize and train the model with the specified hyperparameters
    model = MLPClassifier(hidden_layer_sizes=(hidden_layer_sizes,), alpha=alpha, learning_rate_init=learning_rate_init, random_state=42)
    model.fit(X_train_rs, y_train_rs)

    # Make predictions on the test set
    y_pred_probs = model.predict_proba(X_val)[:, 1]

    # Calculate and return the AUC score
    auc_score = roc_auc_score(y_val, y_pred_probs)
    return auc_score

In [26]:
from bayes_opt import BayesianOptimization
# %pip install bayesian-optimization


In [27]:
# Perform Bayesian optimization
optimizer = BayesianOptimization(f=train_evaluate_model, pbounds=pbounds, verbose=2)
optimizer.maximize(init_points=5, n_iter=20)

|   iter    |  target   |   alpha   | hidden... | learni... |
-------------------------------------------------------------
| [0m1        [0m | [0m0.4363   [0m | [0m0.06508  [0m | [0m29.06    [0m | [0m0.004858 [0m |
| [95m2        [0m | [95m0.5044   [0m | [95m0.07976  [0m | [95m63.71    [0m | [95m0.0042   [0m |
| [0m3        [0m | [0m0.4213   [0m | [0m0.05447  [0m | [0m61.59    [0m | [0m0.0002176[0m |
| [0m4        [0m | [0m0.463    [0m | [0m0.0821   [0m | [0m56.1     [0m | [0m0.002188 [0m |
| [0m5        [0m | [0m0.3475   [0m | [0m0.02315  [0m | [0m64.61    [0m | [0m0.001738 [0m |
| [0m6        [0m | [0m0.4578   [0m | [0m0.09069  [0m | [0m56.11    [0m | [0m0.003054 [0m |
| [95m7        [0m | [95m0.5174   [0m | [95m0.003157 [0m | [95m63.62    [0m | [95m0.005493 [0m |
| [0m8        [0m | [0m0.4268   [0m | [0m0.08104  [0m | [0m63.46    [0m | [0m0.004    [0m |
| [0m9        [0m | [0m0.4244   [0m | [0m0.00



In [28]:
# Get the best hyperparameters found by Bayesian optimization
best_params = optimizer.max['params']
best_auc_score = optimizer.max['target']

In [29]:
print("Best Hyperparameters:")
print(best_params)
print("Best AUC Score:", best_auc_score)

Best Hyperparameters:
{'alpha': 0.06826994435032743, 'hidden_layer_sizes': 11.863332027942574, 'learning_rate_init': 0.00896645653280489}
Best AUC Score: 0.5323902027027027


In [47]:
model_test = MLPClassifier(hidden_layer_sizes=(int(best_params["hidden_layer_sizes"]),), alpha=best_params["alpha"], learning_rate_init=best_params["learning_rate_init"], random_state=42)

In [48]:
model_test.fit(X_train_rs, y_train_rs)

In [49]:
# Make predictions on the test set
y_pred_probs = model_test.predict_proba(X_test)[:, 1]

In [52]:
print(f"ROC = {roc_auc_score(y_test, y_pred_probs)}")

y_pred_probs = np.where(y_pred_probs > 0.5, 1, 0)

print(f"Accuracy = {accuracy_score(y_test, y_pred_probs)}")
print(f"Precision = {precision_score(y_test, y_pred_probs)}")
print(f"Recall = {recall_score(y_test, y_pred_probs)}")
print(f"F1 Score = {f1_score(y_test, y_pred_probs)}")

ROC = 0.5625380896274327
Accuracy = 0.556989247311828
Precision = 0.38974358974358975
Recall = 0.4662576687116564
F1 Score = 0.42458100558659223


In [51]:
# y_main_log = model_test.predict(X_test)
# y_main_pred = np.where(y_main_log > 0.5, 1, 0)

y_main_pred = model_test.predict_proba(X_test)[:, 1]

conf_matrix = confusion_matrix(y_test, y_main_pred)

accuracy = (conf_matrix[0, 0] + conf_matrix[1, 1]) / np.sum(conf_matrix)
print("Accuracy: ", accuracy)

precision = conf_matrix[1, 1] / (conf_matrix[1, 1] + conf_matrix[0, 1])
print("Precision: ", precision)

recall = conf_matrix[1, 1] / (conf_matrix[1, 1] + conf_matrix[1, 0])
print("Recall: ", recall)

f1_score = 2 * (precision * recall) / (precision + recall)
print("F1 Score: ", f1_score)

auc_score = roc_auc_score(y_test, y_main_pred)
print("AUC Score: ", auc_score)

ValueError: Classification metrics can't handle a mix of binary and continuous targets