In [92]:
import sys
import os
from dotenv import load_dotenv

import numpy as np
import pandas as pd
import sqlalchemy as sq
import matplotlib.pyplot as plt
import seaborn as sns
from pandas_profiling import ProfileReport
from imblearn.over_sampling import RandomOverSampler

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report, roc_curve, roc_auc_score, confusion_matrix
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif, VarianceThreshold

import tensorflow
from tensorflow import keras
from tensorflow.keras.regularizers import l1, l2, l1_l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout

import kerastuner as kt
from keras_tuner.tuners import RandomSearch, Hyperband, BayesianOptimization

from ann_visualizer.visualize import ann_viz
import graphviz



sys.path.append("../Shared/")
from DataService import DataService

sys.path.append("../Datasets/")
# print(os.getcwd())
from DataCreation import getDatasetV1, getDatasetV2, getDatasetV3, getDatasetV4
from DataTestSplit import splitData



In [93]:
tensorflow.config.set_visible_devices([], 'GPU')  # Hide GPU devices
tensorflow.config.set_visible_devices(tensorflow.config.list_physical_devices('CPU'), 'CPU')  # Show CPU devices

### import data

In [94]:
# from typing import List, Optional, Tuple
# def extractYears(df: pd.DataFrame, year: int, yearEnd: Optional[int] = None) -> pd.DataFrame:
#     """Extract the rows of a dataframe that correspond to a given year.

#     Args:
#         df (pd.DataFrame): The dataframe to extract from.
#         year (int): The year to extract.
#         yearEnd (int, optional): The end year to extract. Defaults to None.

#     Returns:
#         pd.DataFrame: The extracted dataframe.
#     """
    
#     if yearEnd is None:
#         return df.loc[df["year"] == year]
#     else:
#         return df.loc[(df["year"] >= year) & (df["year"] <= yearEnd)]

In [95]:
# conning to database
load_dotenv()
PG_DB = os.getenv("POSTGRES_DB")
PG_ADDR = os.getenv("POSTGRES_ADDR")
PG_PORT = os.getenv("POSTGRES_PORT")
PG_USER = os.getenv("POSTGRES_USER")
PG_PW = os.getenv("POSTGRES_PW")

if (
    PG_DB is None
    or PG_ADDR is None
    or PG_PORT is None
    or PG_USER is None
    or PG_PW is None
):
    raise ValueError("Environment variables not set")

# connecting to database
db = DataService(PG_DB, PG_ADDR, int(PG_PORT), PG_USER, PG_PW)
conn = db.connect()

In [96]:
# query for weather station data
weatherStationQuery = sq.text("""
    SELECT * from dataset_monthly_station
""")
# query for sat data
weatherSatQuery = sq.text("""
    SELECT * from dataset_monthly_sat
""")
# query for ergot prev year data
ergotPrevYearsAggQuery = sq.text("""
    SELECT year, district, 
    present_prev1, present_prev2, present_prev3,
    percnt_true_prev1, percnt_true_prev2, percnt_true_prev3 
    from agg_ergot_sample_v2
""")
# query for ergot data
ergotTargetQuery = sq.text("""
    SELECT year, district, downgrade from ergot_sample_feat_eng
""")

In [97]:
# retriving data from database
stationDf = pd.read_sql(weatherStationQuery, conn)
satelliteDf = pd.read_sql(weatherSatQuery, conn)
ergotPrevDf = pd.read_sql(ergotPrevYearsAggQuery, conn)
ergotTargetDf = pd.read_sql(ergotTargetQuery, conn)

conn.close()
del conn

### preprocessing data

In [99]:
# cheching for dublicates
if ergotTargetDf.duplicated().any():
    print('dublicates found')
    ergotTargetDf.drop_duplicates( inplace=True)
if (ergotTargetDf.isna().sum().sum() > 0):
    print('NaN found')
    ergotTargetDf.dropna(inplace=True)
ergotTargetDf

Unnamed: 0,year,district,downgrade
0,1995,4810,False
48,1995,4820,False
190,1995,4830,False
230,1995,4840,False
280,1995,4840,True
...,...,...,...
157041,2022,4751,True
157126,2022,4791,True
157219,2022,4731,True
157727,2022,4604,True


In [100]:
if ergotPrevDf.duplicated().any():
    print('dublicates found')
    ergotPrevDf.drop_duplicates( inplace=True)
if (ergotPrevDf.isna().sum().sum() > 0):
    print('NaN found')
    ergotPrevDf.fillna(0, inplace=True)
ergotPrevDf

NaN found


Unnamed: 0,year,district,present_prev1,present_prev2,present_prev3,percnt_true_prev1,percnt_true_prev2,percnt_true_prev3
0,1995,4810,False,False,False,0.000000,0.000000,0.000000
1,1995,4820,False,False,False,0.000000,0.000000,0.000000
2,1995,4830,False,False,False,0.000000,0.000000,0.000000
3,1995,4840,False,False,False,0.000000,0.000000,0.000000
4,1995,4850,False,False,False,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...
1087,2022,4761,False,True,True,0.010204,0.076923,0.062893
1088,2022,4771,False,True,False,0.000000,0.153333,0.013072
1089,2022,4790,True,True,True,0.025316,0.229167,0.147727
1090,2022,4791,False,True,True,0.028986,0.223881,0.328571


In [101]:
# cheching for dublicates
if stationDf.duplicated().any():
    print('dublicates found')
    stationDf.drop_duplicates( inplace=True)
if (stationDf.isna().sum().sum() > 0):
    print('NaN found')
    stationDf.dropna(inplace=True)
stationDf

Unnamed: 0,year,month,district,min_temp_x,max_temp_x,mean_temp_x,min_dew_point_temp,max_dew_point_temp,mean_dew_point_temp,min_humidex,...,mean_total_rain,min_total_snow,max_total_snow,mean_total_snow,min_total_precip,max_total_precip,mean_total_precip,min_snow_on_grnd,max_snow_on_grnd,mean_snow_on_grnd
0,1953,1,4606,-37.2,-3.3,-16.650918,-22.2,0.0,-3.170833,0.0,...,0.000000,0.0,18.3,1.375000,0.0,18.3,1.375000,0.0,0.0,0.000000
1,1953,1,4607,-31.7,-4.4,-14.224855,-33.9,-5.0,-14.939367,0.0,...,0.000000,0.0,2.8,0.466667,0.0,2.8,0.466667,0.0,0.0,0.000000
2,1953,1,4611,-31.7,-5.0,-13.608918,-32.2,-5.6,-14.508056,0.0,...,0.000000,0.0,7.4,1.366667,0.0,7.4,1.366667,0.0,0.0,0.000000
3,1953,1,4612,-38.9,-7.2,-23.922143,-40.0,0.0,-23.888548,0.0,...,0.000000,0.0,3.8,0.439583,0.0,3.8,0.439583,0.0,0.0,0.000000
4,1953,1,4710,-33.9,3.3,-11.815733,-33.9,-1.1,-12.818630,0.0,...,0.000000,0.0,2.5,0.220000,0.0,2.5,0.220000,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22602,2022,12,4830,-46.5,11.7,-11.121018,-51.8,2.8,-13.209914,0.0,...,0.000357,0.0,7.2,0.028596,0.0,8.4,0.420344,0.0,49.0,4.857147
22603,2022,12,4840,-41.9,5.9,-17.592067,-46.0,2.7,-20.147237,0.0,...,0.000000,0.0,12.6,0.030833,0.0,8.7,0.501989,0.0,44.0,4.498098
22604,2022,12,4850,-41.8,9.9,-16.074466,-46.6,1.5,-19.138173,0.0,...,0.000000,0.0,2.7,0.022471,0.0,10.0,0.291207,0.0,36.0,5.410004
22605,2022,12,4860,-48.1,4.9,-18.400136,-52.4,0.4,-20.571919,0.0,...,0.000000,0.0,15.5,0.052028,0.0,10.1,0.570979,0.0,47.0,7.136807


In [102]:
# merge on year and district
# tempdf = pd.merge(satelliteDf, ergotPrevDf, on=["year", "district"], how="left")
# del satelliteDf
# del ergotPrevDf
# tempdf = satelliteDf
tempdf = stationDf

# merge on year and district
datasetDf = pd.merge(ergotTargetDf, tempdf, on=["year", "district"], how="left")
datasetDf = pd.merge(datasetDf, ergotPrevDf, on=["year", "district"], how="left")
del ergotTargetDf
del tempdf

In [103]:
datasetDf

Unnamed: 0,year,district,downgrade,month,min_temp_x,max_temp_x,mean_temp_x,min_dew_point_temp,max_dew_point_temp,mean_dew_point_temp,...,mean_total_precip,min_snow_on_grnd,max_snow_on_grnd,mean_snow_on_grnd,present_prev1,present_prev2,present_prev3,percnt_true_prev1,percnt_true_prev2,percnt_true_prev3
0,1995,4810,False,1.0,-29.5,11.1,-10.052509,-31.9,4.8,-12.584334,...,0.103226,0.0,9.0,1.612903,False,False,False,0.000000,0.000000,0.000000
1,1995,4810,False,2.0,-26.1,20.0,-5.029997,-29.9,4.1,-10.819124,...,0.078929,0.0,8.0,0.326786,False,False,False,0.000000,0.000000,0.000000
2,1995,4810,False,3.0,-28.4,17.3,-1.511578,-32.2,5.6,-8.330515,...,0.293871,0.0,8.0,0.804839,False,False,False,0.000000,0.000000,0.000000
3,1995,4810,False,4.0,-14.8,19.0,3.551604,-21.3,9.1,-2.984549,...,0.603000,0.0,14.0,0.471667,False,False,False,0.000000,0.000000,0.000000
4,1995,4810,False,5.0,-4.6,31.5,11.397211,-10.2,11.2,2.413164,...,1.340645,0.0,0.0,0.000000,False,False,False,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18658,2022,4830,True,8.0,-2.0,34.6,18.476221,-6.6,20.6,7.039598,...,1.161233,0.0,0.0,0.000000,False,True,True,0.025641,0.513369,0.083333
18659,2022,4830,True,9.0,-6.3,36.9,13.501864,-30.9,15.6,2.855264,...,0.615462,0.0,4.0,0.066177,False,True,True,0.025641,0.513369,0.083333
18660,2022,4830,True,10.0,-10.6,28.3,7.661494,-21.0,13.0,-0.843584,...,0.826722,0.0,45.0,0.929443,False,True,True,0.025641,0.513369,0.083333
18661,2022,4830,True,11.0,-31.5,13.4,-6.694053,-38.8,1.9,-10.254210,...,0.836543,0.0,38.0,3.963579,False,True,True,0.025641,0.513369,0.083333


In [104]:
# preprocess after merging the data 
mean = datasetDf.mean()
datasetDf.fillna(mean, inplace=True)
# one-hot encoding
t = pd.get_dummies(datasetDf["district"], drop_first=True)
datasetDf = pd.concat([datasetDf, t], axis=1)
datasetDf.columns = datasetDf.columns.astype(str)
datasetDf.drop(["district"], axis=1, inplace=True)

In [106]:
# cheching for dublicates
if datasetDf.duplicated().any():
    print('dublicates found')
    datasetDf.drop_duplicates( inplace=True)
if (datasetDf.isna().sum().sum() > 0):
    print('NaN found')
    datasetDf.dropna(inplace=True)

datasetDf

Unnamed: 0,year,downgrade,month,min_temp_x,max_temp_x,mean_temp_x,min_dew_point_temp,max_dew_point_temp,mean_dew_point_temp,min_humidex,...,4781,4790,4791,4810,4820,4830,4840,4850,4860,4870
0,1995,False,1.0,-29.5,11.1,-10.052509,-31.9,4.8,-12.584334,0.0,...,0,0,0,1,0,0,0,0,0,0
1,1995,False,2.0,-26.1,20.0,-5.029997,-29.9,4.1,-10.819124,0.0,...,0,0,0,1,0,0,0,0,0,0
2,1995,False,3.0,-28.4,17.3,-1.511578,-32.2,5.6,-8.330515,0.0,...,0,0,0,1,0,0,0,0,0,0
3,1995,False,4.0,-14.8,19.0,3.551604,-21.3,9.1,-2.984549,0.0,...,0,0,0,1,0,0,0,0,0,0
4,1995,False,5.0,-4.6,31.5,11.397211,-10.2,11.2,2.413164,0.0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18658,2022,True,8.0,-2.0,34.6,18.476221,-6.6,20.6,7.039598,0.0,...,0,0,0,0,0,1,0,0,0,0
18659,2022,True,9.0,-6.3,36.9,13.501864,-30.9,15.6,2.855264,0.0,...,0,0,0,0,0,1,0,0,0,0
18660,2022,True,10.0,-10.6,28.3,7.661494,-21.0,13.0,-0.843584,0.0,...,0,0,0,0,0,1,0,0,0,0
18661,2022,True,11.0,-31.5,13.4,-6.694053,-38.8,1.9,-10.254210,0.0,...,0,0,0,0,0,1,0,0,0,0


In [107]:
# train 1995 - 2015 test 2016 - 2020
X_train, X_val, X_test, y_train, y_val, y_test = splitData(datasetDf, drop_features=["year"], target_variable="downgrade", pivot=2015, val_size=0.2, stratified=False)
del datasetDf

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=drop_features, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=drop_features, inplace=True)


### scaling and oversampling

In [108]:
# normalizing data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# oversampling data
ros = RandomOverSampler(random_state=42)
X_train_rs, y_train_rs = ros.fit_resample(X_train, y_train)

In [109]:
y_train.value_counts()

False    6824
True     3704
Name: downgrade, dtype: int64

In [110]:
y_train_rs.value_counts()

False    6824
True     6824
Name: downgrade, dtype: int64

### Model

In [None]:
from sklearn import metrics
from keras import backend as K

def auc(y_true, y_pred):
    auc = tensorflow.metrics.auc(y_true, y_pred)[1]
    K.get_session().run(tensorflow.local_variables_initializer())
    return auc

In [111]:
def build_model(hp):
    model = Sequential()
    # model.add(Dense(input_dim=X_train.shape[1]))
    for i in range(hp.Int('num_layers', 2, 30)):
        model.add(Dense(units=hp.Int('units_' + str(i),
                                            min_value=124, #32
                                            max_value=1748, # 512
                                            step=32),
                                activation=hp.Choice('act_' + str(i), ['relu', 'sigmoid']),# , 'tanh', 'elu', 'selu', 'softplus', 'softsign', 'exponential', 'linear'])))
                                kernel_regularizer = l1_l2(0.01)))
        
    model.add(Dense(1, activation='sigmoid'))
    model.compile(
        optimizer=Adam(
            hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])),
        loss='binary_crossentropy',
        metrics=['accuracy', tensorflow.keras.metrics.AUC(name='auc')])
    return model


# def build_model(hp):
#     model = keras.Sequential()
#     model.add(Dense(units=hp.Int('units_input', min_value=32, max_value=256, step=32),
#                            activation='relu', input_shape=(X_train.shape[1],)))
#     for i in range(hp.Int('num_layers', min_value=1, max_value=5)):
#         model.add(Dense(units=hp.Int(f'units_{i}', min_value=32, max_value=256, step=32),
#                                activation='relu'))
#     model.add(Dense(1, activation='sigmoid'))

#     # Compile the model with the desired optimizer, loss, and metrics
#     model.compile(optimizer=keras.optimizers.Adam(hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])),
#                   loss='binary_crossentropy', metrics=['accuracy'])
#     return model




In [112]:
# tuner = RandomSearch(
#     build_model,
#     objective=kt.Objective("val_auc", direction="max"),
#     max_trials=5,
#     executions_per_trial=3,
#     overwrite=True,
#     directory='data/random_search',
#     project_name='ergot_random_search'
#     )

tuner = BayesianOptimization(
    build_model,
    # objective=kt.Objective("val_auc", direction="max"),
    objective="val_accuracy",
    max_trials=10,
    overwrite=True,
    executions_per_trial=2,
    directory='data/BayesianOptimization',
    project_name='ergot_random_search')

In [113]:
tuner.search_space_summary()

Search space summary
Default search space size: 6
num_layers (Int)
{'default': None, 'conditions': [], 'min_value': 2, 'max_value': 30, 'step': 1, 'sampling': 'linear'}
units_0 (Int)
{'default': None, 'conditions': [], 'min_value': 124, 'max_value': 1748, 'step': 32, 'sampling': 'linear'}
act_0 (Choice)
{'default': 'relu', 'conditions': [], 'values': ['relu', 'sigmoid'], 'ordered': False}
units_1 (Int)
{'default': None, 'conditions': [], 'min_value': 124, 'max_value': 1748, 'step': 32, 'sampling': 'linear'}
act_1 (Choice)
{'default': 'relu', 'conditions': [], 'values': ['relu', 'sigmoid'], 'ordered': False}
learning_rate (Choice)
{'default': 0.01, 'conditions': [], 'values': [0.01, 0.001, 0.0001], 'ordered': True}


In [114]:
tuner.search(X_train_rs, y_train_rs, epochs=20, validation_data=(X_val, y_val))


Search: Running Trial #1

Value             |Best Value So Far |Hyperparameter
7                 |7                 |num_layers
1180              |1180              |units_0
sigmoid           |sigmoid           |act_0
1148              |1148              |units_1
sigmoid           |sigmoid           |act_1
0.0001            |0.0001            |learning_rate

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20

KeyboardInterrupt: 

In [None]:
# results of hyper perameter tuning
tuner.results_summary()

In [None]:
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
best_hps.values



In [None]:
# Method : 1
# model = tuner.hypermodel.build(best_hps)
# model.fit(X_train, y_train, epochs=100, validation_data=(X_test, y_test))

# Method : 2
model = tuner.get_best_models(num_models=1)[0]
model.build(X_train.shape)

In [None]:
model.summary()
model

In [None]:
# using  validation_data
# history = model.fit(X_train_rs, y_train_rs, epochs=500, batch_size=64, validation_data=(X_val, y_val), verbose=1)

# using validation_split
history = model.fit(X_train_rs, y_train_rs, epochs=500, batch_size=64, validation_split=0.2, verbose=1)

In [None]:
# function to plot the training and validation loss for each epoch
def evaluate_model(history):
    # Get the training and validation loss from the history
    training_loss = history.history['loss']
    validation_loss = history.history['val_loss']

    # Get the training and validation accuracy from the history
    training_accuracy = history.history['accuracy']
    validation_accuracy = history.history['val_accuracy']

    # Plot the training and validation loss
    plt.figure(figsize=(20, 6))
    plt.subplot(1, 2, 1)
    plt.plot(training_loss, label='Training Loss')
    plt.plot(validation_loss, label='Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training and Validation Loss')
    plt.legend()

    # Plot the training and validation accuracy
    plt.subplot(1, 2, 2)
    plt.plot(training_accuracy, label='Training Accuracy')
    plt.plot(validation_accuracy, label='Validation Accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.title('Training and Validation Accuracy')
    plt.legend()

    plt.show()

    # Check if the model is overfitting, underfitting, or performing well
    # final_training_loss = np.array(training_loss).mean()
    # final_validation_loss = np.array(validation_loss).mean()

    # final_training_accuracy = np.array(training_accuracy).mean()
    # final_validation_accuracy = np.array(validation_accuracy).mean()

    # if final_training_loss < final_validation_loss:
    #     print("The model is likely underfitting.")
    # elif final_training_loss > final_validation_loss:
    #     print("The model is likely overfitting.")
    # else:
    #     print("The model is performing well and generalizing to new data.")

    # if final_training_accuracy == 1.0 and final_validation_accuracy == 1.0:
    #     print("The model has achieved 100% accuracy on both training and validation data.")

In [None]:
evaluate_model(history)

###  pred

In [121]:
def model_predict(model, X_test):
    y_log = model.predict(X_test)
    y_pred = np.where(y_log > 0.5, 1, 0)

    conf_matrix = confusion_matrix(y_val, y_pred)

    accuracy = (conf_matrix[0, 0] + conf_matrix[1, 1]) / np.sum(conf_matrix)
    print("Accuracy: ", accuracy)

    precision = conf_matrix[1, 1] / (conf_matrix[1, 1] + conf_matrix[0, 1])
    print("Precision: ", precision)

    recall = conf_matrix[1, 1] / (conf_matrix[1, 1] + conf_matrix[1, 0])
    print("Recall: ", recall)

    f1_score = 2 * (precision * recall) / (precision + recall)
    print("F1 Score: ", f1_score)

    auc_score = roc_auc_score(y_val, y_pred)
    print("AUC Score: ", auc_score)

In [122]:
model_predict(model, X_val)

 1/83 [..............................] - ETA: 0s

Accuracy:  0.6576747720364742
Precision:  nan
Recall:  0.0
F1 Score:  nan
AUC Score:  0.5


  precision = conf_matrix[1, 1] / (conf_matrix[1, 1] + conf_matrix[0, 1])


In [124]:
# predicting the validation set results
y_log = model.predict(X_val)
y_pred = np.where(y_log > 0.5, 1, 0)



In [125]:
conf_matrix = confusion_matrix(y_val, y_pred)

accuracy = (conf_matrix[0, 0] + conf_matrix[1, 1]) / np.sum(conf_matrix)
print("Accuracy: ", accuracy)

precision = conf_matrix[1, 1] / (conf_matrix[1, 1] + conf_matrix[0, 1])
print("Precision: ", precision)

recall = conf_matrix[1, 1] / (conf_matrix[1, 1] + conf_matrix[1, 0])
print("Recall: ", recall)

f1_score = 2 * (precision * recall) / (precision + recall)
print("F1 Score: ", f1_score)

auc_score = roc_auc_score(y_val, y_pred)
print("AUC Score: ", auc_score)

# print(classification_report(y_val, y_pred))

Accuracy:  0.6576747720364742
Precision:  nan
Recall:  0.0
F1 Score:  nan
AUC Score:  0.5


  precision = conf_matrix[1, 1] / (conf_matrix[1, 1] + conf_matrix[0, 1])


In [None]:
fpr, tpr, thresholds = roc_curve(y_val, y_pred)

# Plot ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f"AUC = {auc_score:.4f}")
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

# d1
[64, 16, 8, 4, 1]
- Accuracy:  0.7759562841530054
- Precision:  0.8767123287671232
- Recall:  0.847682119205298
- F1 Score:  0.861952861952862
- AUC Score:  0.642591059602649

[48, 48, 48, 48, 1]
- Accuracy:  0.7978142076502732
- Precision:  0.9014084507042254
- Recall:  0.847682119205298
- F1 Score:  0.8737201365187712
- AUC Score:  0.705091059602649

In [None]:
# main prediction

y_main_log = model.predict(X_test)
y_main_pred = np.where(y_main_log > 0.5, 1, 0)

conf_matrix = confusion_matrix(y_test, y_main_pred)

accuracy = (conf_matrix[0, 0] + conf_matrix[1, 1]) / np.sum(conf_matrix)
print("Accuracy: ", accuracy)

precision = conf_matrix[1, 1] / (conf_matrix[1, 1] + conf_matrix[0, 1])
print("Precision: ", precision)

recall = conf_matrix[1, 1] / (conf_matrix[1, 1] + conf_matrix[1, 0])
print("Recall: ", recall)

f1_score = 2 * (precision * recall) / (precision + recall)
print("F1 Score: ", f1_score)

auc_score = roc_auc_score(y_test, y_main_pred)
print("AUC Score: ", auc_score)

# print(classification_report(y_test, y_main_pred))

### get weights

In [None]:
def plot_feature_importance(model, feature_names):
    # Get the weights of the first hidden layer
    first_hidden_layer_weights = model.layers[0].get_weights()[0]
    
    # Calculate the mean absolute weight for each feature
    feature_importance = np.mean(np.abs(first_hidden_layer_weights), axis=1)
    
    # Sort the features based on their importance
    sorted_indices = np.argsort(feature_importance)[::-1]
    sorted_feature_importance = feature_importance[sorted_indices]
    sorted_feature_names = [feature_names[i] for i in sorted_indices]
    
    # Plot the feature importance
    plt.figure(figsize=(20, 6))
    plt.bar(range(len(feature_names)), sorted_feature_importance)
    plt.xticks(range(len(feature_names)), sorted_feature_names, rotation=45, ha='right')
    plt.xlabel('Feature')
    plt.ylabel('Feature Importance')
    plt.title('Feature Importance of MLP')
    plt.tight_layout()
    plt.show()


In [None]:
plot_feature_importance(model, X_train_df.columns)

### Method : 2

In [None]:
import tensorflow as tf
from tensorflow import keras
from kerastuner import HyperModel
from tensorflow.keras.metrics import AUC

class MyHyperModel(HyperModel):
    def __init__(self, input_shape, num_classes):
        self.input_shape = input_shape
        self.num_classes = num_classes

    def build(self, hp):
        model = keras.Sequential()

        # Tune the number of dense layers and units
        for i in range(hp.Int('num_dense_layers', 1, 3)):
            model.add(keras.layers.Dense(units=hp.Int('units_' + str(i),
                                                      min_value=32,
                                                      max_value=512,
                                                      step=32),
                                        activation='relu'))

        model.add(keras.layers.Dense(self.num_classes, activation='sigmoid'))

        # Tune the learning rate for the optimizer
        hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

        # model.compile(optimizer=keras.optimizers.Adam(learning_rate=hp_learning_rate),
        #               loss='binary_crossentropy',
        #               metrics=['accuracy'])

        model.compile(optimizer=keras.optimizers.Adam(learning_rate=hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])),
                      loss='binary_crossentropy',
                      metrics=['accuracy', AUC(name='auc')])  # Add AUC metric

        return model


In [None]:
from kerastuner.tuners import BayesianOptimization

input_shape = (X_train.shape[1],)
num_classes = 1


import keras_tuner

# Define your hypermodel with the input shape and number of classes
hypermodel = MyHyperModel(input_shape=input_shape, num_classes=num_classes)

# Initialize the BayesianOptimization tuner with the objective set to maximize AUC
tuner = BayesianOptimization(
    hypermodel,
    objective=keras_tuner.Objective("val_auc", direction="max"),
    max_trials=10,
    overwrite=True,
    directory='my_tuner_dir',
    project_name='my_model_tuning'
)

# Start the hyperparameter search
tuner.search(X_train, y_train,
             validation_data=(X_val, y_val),
             epochs=10,
             callbacks=[tf.keras.callbacks.EarlyStopping(patience=3)])



In [None]:
# Get the best hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

# Build the best model with the best hyperparameters
best_model = hypermodel.build(best_hps)

# Train the best model on the full training data
best_model.fit(X_train, y_train,
               validation_data=(X_val, y_val),
               epochs=50,  # Use an appropriate number of epochs
               batch_size=32)  # Set a batch size that fits your data and hardware


In [None]:
y_main_log = best_model.predict(X_val)
y_main_pred = np.where(y_main_log > 0.5, 1, 0)

conf_matrix = confusion_matrix(y_val, y_main_pred)

accuracy = (conf_matrix[0, 0] + conf_matrix[1, 1]) / np.sum(conf_matrix)
print("Accuracy: ", accuracy)

precision = conf_matrix[1, 1] / (conf_matrix[1, 1] + conf_matrix[0, 1])
print("Precision: ", precision)

recall = conf_matrix[1, 1] / (conf_matrix[1, 1] + conf_matrix[1, 0])
print("Recall: ", recall)

f1_score = 2 * (precision * recall) / (precision + recall)
print("F1 Score: ", f1_score)

auc_score = roc_auc_score(y_test, y_main_pred)
print("AUC Score: ", auc_score)
