In [23]:
import sys
import os
from dotenv import load_dotenv

import numpy as np
import pandas as pd
import sqlalchemy as sq
import matplotlib.pyplot as plt
import seaborn as sns
from pandas_profiling import ProfileReport
from imblearn.over_sampling import RandomOverSampler

from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report, roc_curve, roc_auc_score, confusion_matrix, recall_score, precision_score, f1_score
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif, VarianceThreshold

import tensorflow
from tensorflow import keras
from tensorflow.keras.regularizers import l1, l2, l1_l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout

import kerastuner as kt
from keras_tuner.tuners import RandomSearch, Hyperband, BayesianOptimization

from ann_visualizer.visualize import ann_viz
import graphviz



sys.path.append("../Shared/")
from DataService import DataService

sys.path.append("../Datasets/")
# print(os.getcwd())
from DataCreation import getDatasetV1, getDatasetV2, getDatasetV3, getDatasetV4
from DataTestSplit import splitData

In [24]:
# disable GPU
tensorflow.config.set_visible_devices([], 'GPU')  # Hide GPU devices
tensorflow.config.set_visible_devices(tensorflow.config.list_physical_devices('CPU'), 'CPU')  # Show CPU devices

### fetch data

In [25]:
load_dotenv()
PG_DB = os.getenv("POSTGRES_DB")
PG_ADDR = os.getenv("POSTGRES_ADDR")
PG_PORT = os.getenv("POSTGRES_PORT")
PG_USER = os.getenv("POSTGRES_USER")
PG_PW = os.getenv("POSTGRES_PW")

if (
    PG_DB is None
    or PG_ADDR is None
    or PG_PORT is None
    or PG_USER is None
    or PG_PW is None
):
    raise ValueError("Environment variables not set")

# connecting to database
db = DataService(PG_DB, PG_ADDR, int(PG_PORT), PG_USER, PG_PW)
conn = db.connect()

In [26]:
weatherStationQuery = sq.text("""
    SELECT * from dataset_monthly_station
""")

weatherSatQuery = sq.text("""
    SELECT * from dataset_monthly_sat
""")

ergotPrevYearsAggQuery = sq.text("""
    SELECT year, district, 
    present_prev1, present_prev2, present_prev3,
    percnt_true_prev1, percnt_true_prev2, percnt_true_prev3 
    from agg_ergot_sample_v2
""")

ergotTargetQuery = sq.text("""
    SELECT year, district, downgrade from ergot_sample_feat_eng
""")

In [27]:
stationDf = pd.read_sql(weatherStationQuery, conn)
satelliteDf = pd.read_sql(weatherSatQuery, conn)
ergotPrevDf = pd.read_sql(ergotPrevYearsAggQuery, conn)
ergotTargetDf = pd.read_sql(ergotTargetQuery, conn)

conn.close()
del conn

In [28]:
ergotTargetDf.drop_duplicates( inplace=True)
ergotTargetDf

Unnamed: 0,year,district,downgrade
0,1995,4810,False
48,1995,4820,False
190,1995,4830,False
230,1995,4840,False
280,1995,4840,True
...,...,...,...
157041,2022,4751,True
157126,2022,4791,True
157219,2022,4731,True
157727,2022,4604,True


In [29]:
ergotPrevDf.fillna(0, inplace=True)
ergotPrevDf

Unnamed: 0,year,district,present_prev1,present_prev2,present_prev3,percnt_true_prev1,percnt_true_prev2,percnt_true_prev3
0,1995,4810,False,False,False,0.000000,0.000000,0.000000
1,1995,4820,False,False,False,0.000000,0.000000,0.000000
2,1995,4830,False,False,False,0.000000,0.000000,0.000000
3,1995,4840,False,False,False,0.000000,0.000000,0.000000
4,1995,4850,False,False,False,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...
1087,2022,4761,False,True,True,0.010204,0.076923,0.062893
1088,2022,4771,False,True,False,0.000000,0.153333,0.013072
1089,2022,4790,True,True,True,0.025316,0.229167,0.147727
1090,2022,4791,False,True,True,0.028986,0.223881,0.328571


In [30]:
stationDf

Unnamed: 0,year,month,district,min_temp_x,max_temp_x,mean_temp_x,min_dew_point_temp,max_dew_point_temp,mean_dew_point_temp,min_humidex,...,mean_total_rain,min_total_snow,max_total_snow,mean_total_snow,min_total_precip,max_total_precip,mean_total_precip,min_snow_on_grnd,max_snow_on_grnd,mean_snow_on_grnd
0,1953,1,4606,-37.2,-3.3,-16.650918,-22.2,0.0,-3.170833,0.0,...,0.000000,0.0,18.3,1.375000,0.0,18.3,1.375000,0.0,0.0,0.000000
1,1953,1,4607,-31.7,-4.4,-14.224855,-33.9,-5.0,-14.939367,0.0,...,0.000000,0.0,2.8,0.466667,0.0,2.8,0.466667,0.0,0.0,0.000000
2,1953,1,4611,-31.7,-5.0,-13.608918,-32.2,-5.6,-14.508056,0.0,...,0.000000,0.0,7.4,1.366667,0.0,7.4,1.366667,0.0,0.0,0.000000
3,1953,1,4612,-38.9,-7.2,-23.922143,-40.0,0.0,-23.888548,0.0,...,0.000000,0.0,3.8,0.439583,0.0,3.8,0.439583,0.0,0.0,0.000000
4,1953,1,4710,-33.9,3.3,-11.815733,-33.9,-1.1,-12.818630,0.0,...,0.000000,0.0,2.5,0.220000,0.0,2.5,0.220000,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22602,2022,12,4830,-46.5,11.7,-11.121018,-51.8,2.8,-13.209914,0.0,...,0.000357,0.0,7.2,0.028596,0.0,8.4,0.420344,0.0,49.0,4.857147
22603,2022,12,4840,-41.9,5.9,-17.592067,-46.0,2.7,-20.147237,0.0,...,0.000000,0.0,12.6,0.030833,0.0,8.7,0.501989,0.0,44.0,4.498098
22604,2022,12,4850,-41.8,9.9,-16.074466,-46.6,1.5,-19.138173,0.0,...,0.000000,0.0,2.7,0.022471,0.0,10.0,0.291207,0.0,36.0,5.410004
22605,2022,12,4860,-48.1,4.9,-18.400136,-52.4,0.4,-20.571919,0.0,...,0.000000,0.0,15.5,0.052028,0.0,10.1,0.570979,0.0,47.0,7.136807


In [31]:
# merge on year and district
datasetDf = pd.merge(ergotTargetDf, stationDf, on=["year", "district"], how="left")
datasetDf = pd.merge(datasetDf, ergotPrevDf, on=["year", "district"], how="left")
# del ergotTargetDf

In [32]:
mean = datasetDf.mean()
datasetDf.fillna(mean, inplace=True)
t = pd.get_dummies(datasetDf["district"], drop_first=True)
datasetDf = pd.concat([datasetDf, t], axis=1)
datasetDf.columns = datasetDf.columns.astype(str)

In [33]:
df = datasetDf.copy()

In [34]:
X_train, X_val, X_test, y_train, y_val, y_test = splitData(datasetDf, drop_features=["year"], target_variable="downgrade", pivot=2015, val_size=0.2, stratified=False)
# del datasetDf

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=drop_features, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=drop_features, inplace=True)


In [35]:
# normalizing data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# oversampling data
ros = RandomOverSampler(random_state=42)
X_train_rs, y_train_rs = ros.fit_resample(X_train, y_train)

### Model 

In [40]:
# Define the hyperparameter search space
pbounds = {
    'hidden_layer_sizes': (10, 200),  # Range for the number of neurons in the hidden layer
    'alpha': (1e-5, 1e-1),           # Range for L2 regularization parameter (alpha)
    'learning_rate_init': (1e-4, 1e-2) # Range for the initial learning rate
}

In [41]:
# Define the objective function for Bayesian optimization
def train_evaluate_model(hidden_layer_sizes, alpha, learning_rate_init):
    # Convert hidden_layer_sizes to integer values
    hidden_layer_sizes = int(hidden_layer_sizes)

    # Initialize and train the model with the specified hyperparameters
    model = MLPClassifier(hidden_layer_sizes=(hidden_layer_sizes,), alpha=alpha, learning_rate_init=learning_rate_init, random_state=42)
    model.fit(X_train_rs, y_train_rs)

    # Make predictions on the test set
    y_pred_probs = model.predict_proba(X_val)[:, 1]

    # Calculate and return the AUC score
    auc_score = roc_auc_score(y_val, y_pred_probs)
    return auc_score

In [42]:
from bayes_opt import BayesianOptimization
# %pip install bayesian-optimization


In [43]:
# Perform Bayesian optimization
optimizer = BayesianOptimization(f=train_evaluate_model, pbounds=pbounds, verbose=2)
optimizer.maximize(init_points=5, n_iter=20)

|   iter    |  target   |   alpha   | hidden... | learni... |
-------------------------------------------------------------
| [0m1        [0m | [0m0.6267   [0m | [0m0.09658  [0m | [0m133.8    [0m | [0m0.008146 [0m |
| [0m2        [0m | [0m0.583    [0m | [0m0.03845  [0m | [0m131.9    [0m | [0m0.007438 [0m |
| [0m3        [0m | [0m0.5679   [0m | [0m0.08937  [0m | [0m158.2    [0m | [0m0.003659 [0m |
| [0m4        [0m | [0m0.5806   [0m | [0m0.07729  [0m | [0m73.93    [0m | [0m0.004662 [0m |
| [0m5        [0m | [0m0.5692   [0m | [0m0.03451  [0m | [0m181.1    [0m | [0m0.006769 [0m |
| [95m6        [0m | [95m0.6307   [0m | [95m0.07802  [0m | [95m14.72    [0m | [95m0.006932 [0m |
| [0m7        [0m | [0m0.5428   [0m | [0m0.0159   [0m | [0m188.0    [0m | [0m0.007797 [0m |
| [0m8        [0m | [0m0.5521   [0m | [0m0.07286  [0m | [0m86.13    [0m | [0m0.001871 [0m |
| [95m9        [0m | [95m0.6346   [0m | [95m0.0760



| [0m12       [0m | [0m0.6325   [0m | [0m0.04494  [0m | [0m10.0     [0m | [0m0.0001187[0m |
| [0m13       [0m | [0m0.5966   [0m | [0m0.00112  [0m | [0m22.28    [0m | [0m0.004053 [0m |
| [0m14       [0m | [0m0.6154   [0m | [0m0.09096  [0m | [0m45.09    [0m | [0m0.007131 [0m |
| [0m15       [0m | [0m0.5439   [0m | [0m0.1      [0m | [0m51.08    [0m | [0m0.001824 [0m |
| [0m16       [0m | [0m0.6228   [0m | [0m0.06829  [0m | [0m40.15    [0m | [0m0.008026 [0m |
| [0m17       [0m | [0m0.6188   [0m | [0m0.1      [0m | [0m34.42    [0m | [0m0.006253 [0m |
| [0m18       [0m | [0m0.5907   [0m | [0m0.009852 [0m | [0m29.23    [0m | [0m0.008193 [0m |
| [0m19       [0m | [0m0.5833   [0m | [0m0.07206  [0m | [0m108.7    [0m | [0m0.003479 [0m |
| [0m20       [0m | [0m0.633    [0m | [0m0.1      [0m | [0m200.0    [0m | [0m0.01     [0m |
| [0m21       [0m | [0m0.5711   [0m | [0m0.05385  [0m | [0m196.4    [0m | 

In [44]:
# Get the best hyperparameters found by Bayesian optimization
best_params = optimizer.max['params']
best_auc_score = optimizer.max['target']

In [45]:
print("Best Hyperparameters:")
print(best_params)
print("Best AUC Score:", best_auc_score)

Best Hyperparameters:
{'alpha': 0.1, 'hidden_layer_sizes': 136.42836145520099, 'learning_rate_init': 0.00961848924042072}
Best AUC Score: 0.6418563108837925


In [46]:
model_test = MLPClassifier(hidden_layer_sizes=(int(best_params["hidden_layer_sizes"]),), alpha=best_params["alpha"], learning_rate_init=best_params["learning_rate_init"], random_state=42)

In [47]:
model_test.fit(X_train_rs, y_train_rs)

In [48]:
# Make predictions on the test set
y_pred_probs = model_test.predict_proba(X_test)[:, 1]

In [49]:
print(f"ROC = {roc_auc_score(y_test, y_pred_probs)}")

y_pred_probs = np.where(y_pred_probs > 0.5, 1, 0)

print(f"Accuracy = {accuracy_score(y_test, y_pred_probs)}")
print(f"Precision = {precision_score(y_test, y_pred_probs)}")
print(f"Recall = {recall_score(y_test, y_pred_probs)}")
print(f"F1 Score = {f1_score(y_test, y_pred_probs)}")

ROC = 0.6100076362812434
Accuracy = 0.5487915682355079
Precision = 0.4182945736434108
Recall = 0.6896728016359919
F1 Score = 0.5207488901756419


In [51]:
# y_main_log = model_test.predict(X_test)
# y_main_pred = np.where(y_main_log > 0.5, 1, 0)

y_main_pred = model_test.predict_proba(X_test)[:, 1]

conf_matrix = confusion_matrix(y_test, y_main_pred)

accuracy = (conf_matrix[0, 0] + conf_matrix[1, 1]) / np.sum(conf_matrix)
print("Accuracy: ", accuracy)

precision = conf_matrix[1, 1] / (conf_matrix[1, 1] + conf_matrix[0, 1])
print("Precision: ", precision)

recall = conf_matrix[1, 1] / (conf_matrix[1, 1] + conf_matrix[1, 0])
print("Recall: ", recall)

f1_score = 2 * (precision * recall) / (precision + recall)
print("F1 Score: ", f1_score)

auc_score = roc_auc_score(y_test, y_main_pred)
print("AUC Score: ", auc_score)

ValueError: Classification metrics can't handle a mix of binary and continuous targets