In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.dates as mdates
import plotly.graph_objects as go
import plotly.express as px
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedStratifiedKFold, StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import IsolationForest
from sklearn.metrics import mean_squared_error, mean_absolute_error, accuracy_score, classification_report, roc_auc_score, roc_curve, confusion_matrix, roc_auc_score, auc, precision_recall_curve
from statsmodels.stats.outliers_influence import variance_inflation_factor
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import keras
from keras.optimizers import SGD, Adam
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.callbacks import EarlyStopping
from imblearn.over_sampling import SMOTE, BorderlineSMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
import time
import math
import warnings

warnings.filterwarnings("ignore")

In [None]:
# Load the pre-saved NumPy arrays: three training variants (unbalanced, SMOTE and
# under-sampled, judging by the file names) plus one shared test split.
X_train_unblnc, y_train_unblnc = np.load("X_train_unblnc_10.npy"), np.load("y_train_unblnc_10.npy")
X_train_smote, y_train_smote = np.load("X_train_smote_10.npy"), np.load("y_train_smote_10.npy")
X_train_under, y_train_under = np.load("X_train_under_10.npy"), np.load("y_train_under_10.npy")
X_test, y_test = np.load("X_test_10.npy"), np.load("y_test_10.npy")

In [None]:
# Quick look at the dimensions of the unbalanced training array.
X_train_unblnc.shape

In [None]:
def optimize_hyperparameters(X_train_data, X_test_data, y_train_data,
                             model, param_grid, n_jobs, cv=10, verbose=1,
                             scoring_fit='neg_mean_squared_error'):
    """Grid-search ``model`` over ``param_grid`` and predict on the test data.

    Args:
        X_train_data: Training features passed to ``GridSearchCV.fit``.
        X_test_data: Features to predict on after the search.
        y_train_data: Training targets passed to ``GridSearchCV.fit``.
        model: Estimator handed to ``GridSearchCV`` as ``estimator``.
        param_grid: Parameter grid handed to ``GridSearchCV``.
        n_jobs: Parallelism setting forwarded to ``GridSearchCV``.
        cv: Cross-validation setting forwarded to ``GridSearchCV``.
        verbose: Verbosity forwarded to ``GridSearchCV``.
        scoring_fit: Forwarded to ``GridSearchCV`` as ``scoring``.

    Returns:
        Tuple ``(search, preds)``: the object returned by ``GridSearchCV.fit``
        and its predictions for ``X_test_data``.
    """
    search_options = {
        'estimator': model,
        'param_grid': param_grid,
        'cv': cv,
        'n_jobs': n_jobs,
        'scoring': scoring_fit,
        'verbose': verbose,
    }
    fitted_search = GridSearchCV(**search_options).fit(X_train_data, y_train_data)
    return fitted_search, fitted_search.predict(X_test_data)

In [None]:
def prediction_interval(accuracy, z, num_samples):
    """Return the half-width and bounds of a normal-approximation interval around ``accuracy``.

    The half-width is ``z * sqrt(accuracy * (1 - accuracy) / num_samples)``.

    Args:
        accuracy: Observed accuracy as a proportion.
        z: Multiplier applied to the standard error (callers here pass 1.96).
        num_samples: Number of samples the accuracy was measured on.

    Returns:
        Tuple ``(half_width, lower, upper)`` with ``lower = accuracy - half_width``
        and ``upper = accuracy + half_width``.
    """
    half_width = z * np.sqrt((accuracy * (1 - accuracy)) / num_samples)
    return half_width, accuracy - half_width, accuracy + half_width

In [None]:
def plot_auc_roc(fpr, tpr):
    """Show a ROC curve as a filled Plotly area chart with its AUC in the title.

    Args:
        fpr: False-positive rates (x axis).
        tpr: True-positive rates (y axis), aligned with ``fpr``.

    Returns:
        None. The figure is displayed via ``show()``.
    """
    roc_title = f'ROC Curve (AUC={auc(fpr, tpr):.4f})'
    axis_labels = {'x': 'False Positive Rate', 'y': 'True Positive Rate'}
    figure = px.area(x=fpr, y=tpr, title=roc_title, labels=axis_labels,
                     width=700, height=500)

    # Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
    figure.add_shape(type='line', line={'dash': 'dash'}, x0=0, x1=1, y0=0, y1=1)

    # Tie the y scale to the x scale so the plot area stays square.
    figure.update_yaxes(scaleanchor="x", scaleratio=1)
    figure.update_xaxes(constrain='domain')
    figure.show()

In [None]:
def plot_precision_recall_auc(precision, recall, fpr=None, tpr=None):
    """Show a precision-recall curve as a filled Plotly area chart.

    The title reports the area under the precision-recall curve itself,
    computed as ``auc(recall, precision)``.

    Args:
        precision: Precision values (y axis), e.g. the first array returned
            by ``precision_recall_curve``.
        recall: Recall values (x axis), aligned with ``precision``.
        fpr: Ignored. Accepted only so four-argument calls keep working.
        tpr: Ignored, same as ``fpr``.

    Returns:
        None. The figure is displayed via ``fig.show()``.
    """
    # Integrate precision over recall. auc(fpr, tpr) would be the ROC AUC,
    # which does not belong under a "Precision-Recall Curve" title.
    pr_auc = auc(recall, precision)

    fig = px.area(
        x=recall, y=precision,
        title=f'Precision-Recall Curve (AUC={pr_auc:.4f})',
        labels=dict(x='Recall', y='Precision'),
        width=700, height=500
    )
    # Dashed anti-diagonal from (0, 1) to (1, 0) as a visual reference line.
    fig.add_shape(
        type='line', line=dict(dash='dash'),
        x0=0, x1=1, y0=1, y1=0
    )
    # Tie the y scale to the x scale so the plot area stays square.
    fig.update_yaxes(scaleanchor="x", scaleratio=1)
    fig.update_xaxes(constrain='domain')

    fig.show()

In [None]:
def train_tune_valid(X_train, y_train, X_test, y_test, model, param_grid):
    """Fit a baseline model, grid-search its hyperparameters, and report test metrics.

    The baseline accuracy comes from ``model`` fitted as given. Everything
    reported after the search (accuracy, interval, classification report, ROC
    and precision-recall plots) comes from the tuned search object.

    Args:
        X_train: Training features.
        y_train: Training targets.
        X_test: Test features; ``X_test.shape[0]`` is used as the sample count.
        y_test: Test targets.
        model: Estimator with ``fit``/``predict``. It is fitted in place for
            the baseline and then handed to ``optimize_hyperparameters``.
        param_grid: Parameter grid for the search.

    Returns:
        None. Results are printed and plotted.
    """
    # Baseline model: the estimator exactly as the caller configured it.
    baseline_model = model
    baseline_model.fit(X_train, y_train)
    y_pred_baseline = baseline_model.predict(X_test)
    baseline_accuracy = accuracy_score(y_test, y_pred_baseline)
    print("Baseline Model accuracy: {}".format(baseline_accuracy))

    # Hyperparameter optimization
    start_time = time.time()
    tuned_model, y_pred_tuned = optimize_hyperparameters(X_train, X_test, y_train, model,
                                                         param_grid, n_jobs=-1, cv=5,
                                                         scoring_fit='roc_auc', verbose=1)

    end_time = time.time()
    duration = end_time - start_time
    print("Time taken: {:.3f} seconds".format(duration))
    print("Best score: {}".format(tuned_model.best_score_))
    print("Best parameters: {}".format(tuned_model.best_params_))

    # Accuracy and prediction interval. These must use the tuned predictions;
    # reusing the baseline predictions would just repeat the baseline number.
    accuracy = accuracy_score(y_test, y_pred_tuned)
    print("Tuned Model accuracy: {}".format(accuracy))

    interval, lower, upper = prediction_interval(accuracy, 1.96, X_test.shape[0])
    print("interval: {:.4f}, lower: {:.4f}, upper: {:.4f}".format(interval, lower, upper))

    # Classification report and curves, all from the tuned search object.
    print(classification_report(y_test, y_pred_tuned))
    # Column 1 of predict_proba is used as the positive-class score; computed once.
    positive_scores = tuned_model.predict_proba(X_test)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_test, positive_scores)
    plot_auc_roc(fpr, tpr)
    precision, recall, thresholds = precision_recall_curve(y_test, positive_scores)
    plot_precision_recall_auc(precision, recall, fpr, tpr)

In [None]:
# Logistic regression: search over regularisation strength and penalty type.
# The grid contains both 'l1' and 'l2', so the solver must support both.
# liblinear does; the default lbfgs solver rejects 'l1', and those failed fits
# would go unnoticed because warnings are globally suppressed in this notebook.
model = LogisticRegression(solver='liblinear')

tuning_space = {
    'C': np.logspace(-2, 4, 10),
    'penalty': ['l1', 'l2']
}

# Same search on each training variant, always scored on the shared test split.
train_tune_valid(X_train_unblnc, y_train_unblnc, X_test, y_test, model, tuning_space)
train_tune_valid(X_train_smote, y_train_smote, X_test, y_test, model, tuning_space)
train_tune_valid(X_train_under, y_train_under, X_test, y_test, model, tuning_space)

In [None]:
# XGBoost: grid over max_depth, reg_alpha and gamma at a fixed n_estimators.
model = XGBClassifier()

tuning_space = dict(
    max_depth=range(4, 10),
    n_estimators=[60],
    reg_alpha=[1e-2, 0.1, 1],
    gamma=[step / 10.0 for step in range(5)],
)

# Same search on each training variant, always scored on the shared test split.
for X_fit, y_fit in (
    (X_train_unblnc, y_train_unblnc),
    (X_train_smote, y_train_smote),
    (X_train_under, y_train_under),
):
    train_tune_valid(X_fit, y_fit, X_test, y_test, model, tuning_space)

In [None]:
def train_xgboost(X_train, y_train, X_test, y_test, param_grid):
    """Fit a default XGBoost baseline, grid-search a second one, and report test metrics.

    Args:
        X_train: Training features.
        y_train: Training targets.
        X_test: Test features; ``X_test.shape[0]`` is used as the sample count.
        y_test: Test targets.
        param_grid: Parameter grid passed to ``optimize_hyperparameters``.

    Returns:
        None. Results are printed and plotted.
    """
    # Baseline XGBoost model
    baseline_model = XGBClassifier()
    baseline_model.fit(X_train, y_train)
    y_pred_baseline = baseline_model.predict(X_test)
    baseline_accuracy = accuracy_score(y_test, y_pred_baseline)
    print("Baseline Model Accuracy: {:.4f}".format(baseline_accuracy))

    # Hyperparameter optimization on a fresh estimator, timed end to end.
    start_time = time.time()
    tuned_model = XGBClassifier()
    tuned_model, y_pred_tuned = optimize_hyperparameters(X_train, X_test, y_train, tuned_model,
                                                         param_grid, n_jobs=-1, cv=5,
                                                         scoring_fit='roc_auc', verbose=1)

    end_time = time.time()
    duration = end_time - start_time
    print("Time taken: {:.3f} seconds".format(duration))
    print("Best score: {}".format(tuned_model.best_score_))
    print("Best parameters: {}".format(tuned_model.best_params_))

    # Accuracy of the tuned model
    tuned_accuracy = accuracy_score(y_test, y_pred_tuned)
    print("Tuned Model Accuracy: {:.4f}".format(tuned_accuracy))

    # Classification report and curves
    print(classification_report(y_test, y_pred_tuned))
    # Column 1 of predict_proba is used as the positive-class score; computed once.
    positive_scores = tuned_model.predict_proba(X_test)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_test, positive_scores)
    plot_auc_roc(fpr, tpr)
    precision, recall, thresholds = precision_recall_curve(y_test, positive_scores)
    # The plotting helper's parameter list is (precision, recall, fpr, tpr).
    plot_precision_recall_auc(precision, recall, fpr, tpr)

    # Interval around the tuned accuracy, using the module-level prediction_interval helper.
    interval, lower, upper = prediction_interval(tuned_accuracy, 1.96, X_test.shape[0])
    print("Accuracy Interval: {:.4f}, Lower: {:.4f}, Upper: {:.4f}".format(interval, lower, upper))

# Usage: run the XGBoost workflow once per training variant.

tuning_space = dict(
    max_depth=range(4, 10),
    n_estimators=[60],
    reg_alpha=[1e-2, 0.1, 1],
    gamma=[step / 10.0 for step in range(5)],
)

for X_fit, y_fit in (
    (X_train_unblnc, y_train_unblnc),
    (X_train_smote, y_train_smote),
    (X_train_under, y_train_under),
):
    train_xgboost(X_fit, y_fit, X_test, y_test, tuning_space)


In [None]:
# Baseline model: XGBoost with library defaults, scored by accuracy on the test split.
# NOTE(review): `X_train` / `y_train` are not assigned in any cell shown in this notebook
# (only the *_unblnc / *_smote / *_under arrays are loaded), so on a fresh kernel this
# raises NameError unless they are defined elsewhere. Confirm which variant was intended.
model = XGBClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Model accuracy: {}".format(accuracy))

In [None]:
# Timed grid search for XGBoost, scored by ROC AUC with 5-fold CV.
# NOTE(review): `X_train` / `y_train` are not assigned in any cell shown here; confirm source.
start_time = time.time()

param_grid = {
    'max_depth': range (4, 10, 1),
    'n_estimators': [60],
    'reg_alpha':[1e-2, 0.1, 1],
    # Evaluates to 0.0, 0.2, 0.4, 0.6.
    'gamma':[i/10.0 for i in range(0, 8, 2)]
}
# Every constructor argument below is also a key of param_grid, so the search
# supplies its own values for them.
model = XGBClassifier(max_depth=9, n_estimators=40, reg_alpha=0.1, gamma=0.3)
# `model` is rebound to the fitted search object returned by the helper.
model, preds = optimize_hyperparameters(X_train, X_test, y_train, model, 
                                 param_grid, n_jobs=-1, cv=5, scoring_fit='roc_auc')

# The test-set probability prediction is inside the timed region.
probs = model.predict_proba(X_test)
print(model.best_score_)
print(model.best_params_)

end_time = time.time()

duration = end_time - start_time
print("Time taken: {:.3f} seconds".format(duration))

In [None]:
# Refit XGBoost with hand-set hyperparameters and evaluate it on the test split.
# NOTE(review): n_estimators=100 is not a value in the grid searched above (which
# fixes 60) — presumably chosen manually; confirm.
# NOTE(review): `X_train` / `y_train` are not assigned in any cell shown here.
model = XGBClassifier(max_depth=9, n_estimators=100, reg_alpha=1, gamma=0.0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
probs = model.predict_proba(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Model accuracy: {}".format(accuracy))

In [None]:
# Interval around the accuracy above, with z = 1.96 and the test-set row count.
interval, lower, upper = prediction_interval(accuracy, 1.96, X_test.shape[0])
print("interval: {:.4f}, lower: {:.4f}, upper: {:.4f}".format(interval, lower, upper))

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
# ROC curve from column 1 of the predicted probabilities.
fpr, tpr, thresholds = roc_curve(y_test, probs[:,1])
plot_auc_roc(fpr, tpr)

In [None]:
precision, recall, thresholds = precision_recall_curve(y_test, probs[:, 1])
# The plotting helper's parameter list is (precision, recall, fpr, tpr); fpr and
# tpr are the ROC points computed in the previous cell.
plot_precision_recall_auc(precision, recall, fpr, tpr)

In [None]:
# Same XGBoost configuration as the previous fit, trained on `Xn` / `yn` instead.
# NOTE(review): `Xn` / `yn` are not assigned in any cell shown in this notebook —
# presumably a resampled training set; confirm which one (SMOTE or under-sampled).
model = XGBClassifier(max_depth=9, n_estimators=100, reg_alpha=1, gamma=0.0)
model.fit(Xn, yn)
y_pred = model.predict(X_test)
probs = model.predict_proba(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Model accuracy: {}".format(accuracy))

In [None]:
# Interval around the accuracy above, with z = 1.96 and the test-set row count.
interval, lower, upper = prediction_interval(accuracy, 1.96, X_test.shape[0])
print("interval: {:.4f}, lower: {:.4f}, upper: {:.4f}".format(interval, lower, upper))

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
# ROC curve from column 1 of the predicted probabilities.
fpr, tpr, thresholds = roc_curve(y_test, probs[:,1])
plot_auc_roc(fpr, tpr)

In [None]:
precision, recall, thresholds = precision_recall_curve(y_test, probs[:, 1])
# The plotting helper's parameter list is (precision, recall, fpr, tpr); fpr and
# tpr are the ROC points computed in the previous cell.
plot_precision_recall_auc(precision, recall, fpr, tpr)

In [None]:
# Baseline LightGBM with library defaults, scored by accuracy on the test split.
# NOTE(review): `X_train` / `y_train` are not assigned in any cell shown here; confirm source.
model = LGBMClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Model accuracy: {}".format(accuracy))

In [None]:
# Timed grid search for LightGBM, scored by ROC AUC with 5-fold CV.
# This cell calls GridSearchCV directly rather than optimize_hyperparameters.
# NOTE(review): `X_train` / `y_train` are not assigned in any cell shown here; confirm source.
start_time = time.time()

param_grid = {
    'learning_rate': [0.01, 0.1, 0.5],
    'max_depth': range(5, 9, 1),
    'num_leaves':[30, 40],
    'n_estimators': [100],
    'reg_alpha': [0.1, 1, 5]
}

model = LGBMClassifier(verbose=0)
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='roc_auc', n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best ROC AUC Score: {:.4f}".format(grid_search.best_score_))
print("Best Parameters:", grid_search.best_params_)

# The test-set probability prediction is inside the timed region.
probs = grid_search.predict_proba(X_test)

end_time = time.time()
duration = end_time - start_time
print("Time taken: {:.3f} seconds".format(duration))

In [None]:
# Refit LightGBM with hand-set hyperparameters and evaluate it on the test split.
# NOTE(review): n_estimators=1000 is not a value in the grid searched above (which
# fixes 100) — presumably chosen manually; confirm.
# NOTE(review): `X_train` / `y_train` are not assigned in any cell shown here.
model = LGBMClassifier(max_depth=8, n_estimators=1000, reg_alpha=1, num_leaves=40, learning_rate=0.1, verbose=0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
probs = model.predict_proba(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Model accuracy: {}".format(accuracy))

In [None]:
# Interval around the accuracy above, with z = 1.96 and the test-set row count.
interval, lower, upper = prediction_interval(accuracy, 1.96, X_test.shape[0])
print("interval: {:.4f}, lower: {:.4f}, upper: {:.4f}".format(interval, lower, upper))

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
# ROC curve from column 1 of the predicted probabilities.
fpr, tpr, thresholds = roc_curve(y_test, probs[:,1])
plot_auc_roc(fpr, tpr)

In [None]:
precision, recall, thresholds = precision_recall_curve(y_test, probs[:, 1])
# The plotting helper's parameter list is (precision, recall, fpr, tpr); fpr and
# tpr are the ROC points computed in the previous cell.
plot_precision_recall_auc(precision, recall, fpr, tpr)

In [None]:
# Same LightGBM configuration as the previous fit, trained on `Xn` / `yn` instead.
# NOTE(review): `Xn` / `yn` are not assigned in any cell shown in this notebook —
# presumably a resampled training set; confirm which one (SMOTE or under-sampled).
model = LGBMClassifier(max_depth=8, n_estimators=1000, reg_alpha=1, num_leaves=40, learning_rate=0.1, verbose=0)
model.fit(Xn, yn)
y_pred = model.predict(X_test)
probs = model.predict_proba(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Model accuracy: {}".format(accuracy))

In [None]:
# Interval around the accuracy above, with z = 1.96 and the test-set row count.
interval, lower, upper = prediction_interval(accuracy, 1.96, X_test.shape[0])
print("interval: {:.4f}, lower: {:.4f}, upper: {:.4f}".format(interval, lower, upper))

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
# ROC curve from column 1 of the predicted probabilities.
fpr, tpr, thresholds = roc_curve(y_test, probs[:,1])
plot_auc_roc(fpr, tpr)

In [None]:
precision, recall, thresholds = precision_recall_curve(y_test, probs[:, 1])
# The plotting helper's parameter list is (precision, recall, fpr, tpr); fpr and
# tpr are the ROC points computed in the previous cell.
plot_precision_recall_auc(precision, recall, fpr, tpr)

In [None]:
# Feed-forward network, 10 -> 7 -> 1 units: ReLU hidden layers, single sigmoid output.
# input_dim=14 assumes 14 feature columns — TODO confirm against the training array.
# NOTE(review): `X_train` / `y_train` are not assigned in any cell shown here; confirm source.
model = Sequential()
model.add(Dense(10, input_dim=14, activation='relu'))
model.add(Dense(7, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

optimizer = SGD(learning_rate=0.15, momentum=0.9)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy', 'AUC'])

# Stop when validation AUC has not improved for 5 epochs.
early_stopping = EarlyStopping(monitor='val_auc', patience=5, verbose=1, mode='max')

# validation_split=0.2 holds out 20% of the training data for the validation metrics.
history = model.fit(X_train, y_train, epochs=20, batch_size=32, callbacks=[early_stopping], validation_split=0.2)

In [None]:
# Same architecture and SGD settings as the ReLU network, with tanh hidden activations.
# input_dim=14 assumes 14 feature columns — TODO confirm against the training array.
# NOTE(review): `X_train` / `y_train` are not assigned in any cell shown here; confirm source.
model = Sequential()
model.add(Dense(10, input_dim=14, activation='tanh'))
model.add(Dense(7, activation='tanh'))
model.add(Dense(1, activation='sigmoid'))

optimizer = SGD(learning_rate=0.15, momentum=0.9)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy', 'AUC'])

# Stop when validation AUC has not improved for 5 epochs.
early_stopping = EarlyStopping(monitor='val_auc', patience=5, verbose=1, mode='max')

history = model.fit(X_train, y_train, epochs=20, batch_size=32, callbacks=[early_stopping], validation_split=0.2)

In [None]:
# tanh network again, this time optimised with Adam instead of SGD.
# input_dim=14 assumes 14 feature columns — TODO confirm against the training array.
# NOTE(review): `X_train` / `y_train` are not assigned in any cell shown here; confirm source.
model = Sequential()
model.add(Dense(10, input_dim=14, activation='tanh'))
model.add(Dense(7, activation='tanh'))
model.add(Dense(1, activation='sigmoid'))

optimizer = Adam(learning_rate=0.001, beta_1=0.9)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy', 'AUC'])

# Stop when validation AUC has not improved for 5 epochs.
early_stopping = EarlyStopping(monitor='val_auc', patience=5, verbose=1, mode='max')

history = model.fit(X_train, y_train, epochs=20, batch_size=32, callbacks=[early_stopping], validation_split=0.2)

In [None]:
class CosineLearningRateSchedule(keras.callbacks.Callback):
    """Keras callback: linear warm-up followed by cosine decay of the learning rate.

    At the start of every epoch the optimizer's learning rate is overwritten:

    * epochs ``0 .. warmup_epochs - 1``: ``max_lr * (epoch + 1) / warmup_epochs``
    * later epochs: half-cosine decay from ``max_lr`` towards 0, floored at 1e-7
      and held at the floor once ``total_epochs`` is reached.

    Args:
        max_lr: Peak learning rate, reached at the end of warm-up.
        warmup_epochs: Number of warm-up epochs; 0 disables warm-up.
        total_epochs: Epoch index at which the cosine decay reaches its floor.
    """

    def __init__(self, max_lr, warmup_epochs, total_epochs):
        super(CosineLearningRateSchedule, self).__init__()
        self.max_lr = max_lr
        self.warmup_epochs = warmup_epochs
        self.total_epochs = total_epochs
        # Per-epoch warm-up increment. With no warm-up phase it is never read,
        # so fall back to max_lr instead of dividing by zero.
        self.initial_lr = max_lr / warmup_epochs if warmup_epochs > 0 else max_lr

    def on_epoch_begin(self, epoch, logs=None):
        """Set the optimizer learning rate for ``epoch`` (0-based)."""
        if epoch < self.warmup_epochs:
            lr = self.initial_lr * (epoch + 1)
        else:
            decay_epochs = self.total_epochs - self.warmup_epochs
            if decay_epochs > 0:
                # Clamp so the cosine never runs past pi; otherwise the rate
                # would climb again when training continues beyond total_epochs.
                elapsed = min(epoch - self.warmup_epochs, decay_epochs)
                decayed_lr = 0.5 * self.max_lr * (1 + math.cos(math.pi * elapsed / decay_epochs))
            else:
                # No decay window configured: go straight to the floor.
                decayed_lr = 0.0
            lr = max(decayed_lr, 1e-7)
        keras.backend.set_value(self.model.optimizer.lr, lr)

# tanh network trained with Adam under the cosine learning-rate callback.
# input_dim=14 assumes 14 feature columns — TODO confirm against the training array.
model = Sequential()
model.add(Dense(10, input_dim=14, activation='tanh'))
model.add(Dense(7, activation='tanh'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
# The callback below overwrites the optimizer learning rate at the start of
# every epoch, so 0.001 here is only the value before the first epoch.
model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy', 'AUC'])

# Cosine Learning Rate Schedule with Warm-up
cosine_lr_schedule = CosineLearningRateSchedule(max_lr=0.01, warmup_epochs=5, total_epochs=20)

# Fit the model
# `early_stopping` is not created in this cell; it is the EarlyStopping instance
# left in the namespace by the previous cell.
# NOTE(review): `X_train` / `y_train` are not assigned in any cell shown here; confirm source.
history = model.fit(X_train, y_train, epochs=20, batch_size=32, callbacks=[cosine_lr_schedule, early_stopping], validation_split=0.2)


In [None]:
probs = model.predict(X_test)
# The network ends in a single sigmoid unit, so each row of `probs` holds one
# value. argmax over that axis is always 0, so threshold at 0.5 to get the label.
preds = (probs > 0.5).astype(int).ravel()
accuracy = accuracy_score(y_test, preds)
print("Model accuracy: {}".format(accuracy))

In [None]:
# Interval around the network's test accuracy, with z = 1.96 and the test-set row count.
interval, lower, upper = prediction_interval(accuracy, 1.96, X_test.shape[0])
print("interval: {:.4f}, lower: {:.4f}, upper: {:.4f}".format(interval, lower, upper))

In [None]:
print(classification_report(y_test, preds))

In [None]:
# ROC points from the raw model.predict output (no column selection here,
# unlike the predict_proba-based cells).
fpr, tpr, thresholds = roc_curve(y_test, probs)

In [None]:
plot_auc_roc(fpr, tpr)