## 1.1 import libraries

In [None]:
%%capture
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings 
warnings.filterwarnings('ignore')

from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
# Feature Scaling
from sklearn.preprocessing import MinMaxScaler,StandardScaler

In [None]:
# Figure theme and pandas display options used by the rest of the notebook.

# The two colours that the theme repeats.
_background = "#f7f9fc"
_ink = "#000000"

# Matplotlib rc overrides handed to seaborn below.
rc = {
    "axes.facecolor": _background,
    "figure.facecolor": _background,
    "axes.edgecolor": _ink,
    "grid.color": "#EBEBE7",
    "font.family": "serif",
    "axes.labelcolor": _ink,
    "xtick.color": _ink,
    "ytick.color": _ink,
    "grid.alpha": 0.4,
}

# Palette name; not referenced by the cells visible here.
default_palette = 'YlOrRd'

sns.set(rc=rc)

# Show up to 35 columns; render floats with thousands separators and two decimals.
pd.options.display.max_columns = 35
pd.set_option('display.float_format', '{:,.2f}'.format)

## 1.2 import data

In [None]:
# Location of the input table; change this one constant to point at another copy.
DATA_PATH = '/media/barry/DKCH/data/Thesis_data/20240410/rf-acls.csv'

original = pd.read_csv(DATA_PATH)
dataset_name = 'rf-acls'
model_name = 'xgbc'

# Binarise the outcome: recovery ratio in (-1, 0.5] -> class 0, (0.5, 1.1] -> class 1.
# Values outside (-1, 1.1], or missing, fall in no bin and become NaN.
original['recovery_class'] = pd.cut(original['recovery ratio'], bins=[-1, 0.5, 1.1], labels=[0, 1])

# Print the class balance explicitly: a bare expression in the middle of a cell is
# evaluated and then discarded. dropna=False also reveals rows that got no class.
print(original['recovery_class'].value_counts(dropna=False))

# Let's check the shape of the data
print(f'The Whole dataset has {original.shape[0]} rows and {original.shape[1]} columns')

# Pre-Processing

In [None]:
def get_variable_types(dataframe):
    """Split the column names of ``dataframe`` by dtype.

    A column whose dtype compares equal to ``'object'`` is treated as
    categorical; every other column is treated as continuous.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are classified.

    Returns
    -------
    tuple[list, list]
        ``(continuous_vars, categorical_vars)``, each in original column order.
    """
    # Key True collects object-dtype columns, key False everything else.
    by_is_object = {True: [], False: []}
    for name in dataframe.columns:
        by_is_object[bool(dataframe[name].dtype == 'object')].append(name)

    return by_is_object[False], by_is_object[True]

continuous_vars, categorical_vars = get_variable_types(original)

# Take these columns out of the predictor lists.
# list.remove raises ValueError if a name is absent, which surfaces a renamed
# or missing column immediately.
for excluded in ('JOA', 'post JOA', 'recovery ratio'):
    continuous_vars.remove(excluded)
for excluded in ('Case', 'whole cord at the max com level'):
    categorical_vars.remove(excluded)

In [None]:
# Drop every column whose name CONTAINS one of these fragments (substring match,
# not equality), keeping the original column order in `del_columns`.
excluded_fragments = (
    'Case',
    'post JOA',
    'recovery ratio',
    'whole cord at the max com level',
)
del_columns = [
    column
    for column in original.columns
    if any(fragment in column for fragment in excluded_fragments)
]
print(del_columns)
train = original.drop(columns=del_columns)

In [None]:
# One-hot encode the remaining categorical predictors. drop_first=True omits the
# indicator of the first level of each variable.
train = pd.get_dummies(data=train, columns=categorical_vars, drop_first=True)

In [None]:
# Report the dimensions of the one-hot encoded frame.
n_rows, n_cols = train.shape
print(f'The encoded Train dataset has {n_rows} rows and {n_cols} columns')

In [None]:
# Separate the target column from the predictors.
target_column = 'recovery_class'
y = train[target_column]
X = train.drop(columns=[target_column])

In [None]:
# Hold out 20% of the rows as a test set; stratify=y keeps the class proportions
# of `y` in both parts.
from sklearn.model_selection import train_test_split

# random_state is pinned so that Restart & Run All reproduces the same split.
# 42 matches the seed of the identical split performed in the Optuna tuning cell.
X_train_original, X_test, y_train_original, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Balance the training split with SMOTENC (SMOTE for mixed nominal/continuous
# features); the test split is left untouched.
from imblearn.over_sampling import SMOTENC

# Oversampler configuration; index -1 is passed as the categorical feature.
ros = SMOTENC(categorical_features=[-1], random_state=42)
X_train, y_train = ros.fit_resample(X_train_original, y_train_original)

# Class balance after resampling (label 0 is reported as "bad", label 1 as "good").
class_counts = y_train.value_counts()
n_resampled = len(y_train)
print('bad:', class_counts[0], '/', round(class_counts[0]/n_resampled * 100,2), '% of the dataset')
print('good:', class_counts[1], '/',round(class_counts[1]/n_resampled * 100,2), '% of the dataset')

# Model Building
Hyperparameter tuning for XGBClassifier using Optuna

In [None]:
# Define the objective function for Optuna optimization
import optuna
from optuna.samplers import TPESampler
from sklearn.preprocessing import LabelEncoder

def objective(trial, X_train, y_train, X_test, y_test):
    """Optuna objective: fit an XGBClassifier with sampled hyperparameters and score it.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample one hyperparameter configuration.
    X_train, y_train
        Features and labels the classifier is fitted on.
    X_test, y_test
        Features and labels the fitted classifier is scored on.

    Returns
    -------
    float
        ``XGBClassifier.score(X_test, y_test)``, the value Optuna optimises.
    """
    # Search space. The parameter names are the keys later found in
    # study.best_params, so they must stay valid XGBClassifier keyword arguments.
    param = {
        'max_depth': trial.suggest_int('max_depth', 2, 15),
        # suggest_float(..., step=q) is the replacement for the deprecated
        # suggest_discrete_uniform(name, low, high, q); the sampled grid is the same.
        'subsample': trial.suggest_float('subsample', 0.6, 1.0, step=0.05),
        # `step` is passed by keyword: it is keyword-only in the documented signature.
        'n_estimators': trial.suggest_int('n_estimators', 1000, 10000, step=100),
        'eta': trial.suggest_float('eta', 0.01, 0.1, step=0.01),
        'reg_alpha': trial.suggest_int('reg_alpha', 1, 50),
        'reg_lambda': trial.suggest_int('reg_lambda', 5, 100),
        'min_child_weight': trial.suggest_int('min_child_weight', 2, 20),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0),
        # Fixed seed (not tuned), so it does not appear in study.best_params.
        'random_state': 42,
    }

    # Build, fit and evaluate the XGBoost classifier for this configuration.
    xgb_classifier = XGBClassifier(**param)
    xgb_classifier.fit(X_train, y_train)

    return xgb_classifier.score(X_test, y_test)

# Hyperparameter search driver. Statement order matters: the training split is
# resampled first, both splits are scaled next, the labels are encoded last, and
# only then is the study run. The globals rebound here (X_train, y_train, X_test,
# y_test, sc, le, study, best_params) are read by the cells that follow.

# Stratified 80/20 train/test split with a fixed seed.
X_train_original, X_test, y_train_original, y_test = train_test_split(X, y, test_size=0.2, random_state=42,stratify=y) 

# Oversample the training split with SMOTENC (the test split is not resampled).
from imblearn.over_sampling import SMOTENC
# Oversampler configuration; index -1 is passed as the categorical feature
# (presumably meaning the last column -- confirm against the installed imblearn).
ros = SMOTENC(random_state=42,categorical_features=[-1])
# Fit the sampler on the training split and resample it.
X_train, y_train = ros.fit_resample(X_train_original, y_train_original)

# Standardise features: statistics are fitted on the resampled training data and
# then applied unchanged to the test data.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Sampler for the Optuna optimization (seeded).
sampler = optuna.samplers.TPESampler(seed=42)  # Using Tree-structured Parzen Estimator sampler for optimization

# Create a study object; the objective value is maximised.
study = optuna.create_study(direction="maximize", sampler=sampler)

# Encode the labels as integers (encoder fitted on the training labels), then run
# 50 trials of `objective`.
# NOTE(review): X_test / y_test are what `objective` scores each trial on, so the
# hyperparameters are selected on the same hold-out set that is evaluated right
# after this cell -- that hold-out accuracy is likely optimistic; consider a
# separate validation split or cross-validation inside the objective.
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)
study.optimize(lambda trial: objective(trial, X_train, y_train, X_test, y_test), n_trials=50)

# Best parameters found by the study.
best_params = study.best_params

print('='*50)
print(best_params)

XGBClassifier with the best parameters

In [None]:
# Refit on the resampled, scaled training data with the tuned hyperparameters.
# study.best_params holds only the sampled values, so the fixed random_state=42
# used by every tuning trial (and by the cross-validation cell) must be passed
# again; without it this model is trained with a different seed than the trial
# it is meant to reproduce.
# The variable keeps its historical name because later cells read it, although
# the estimator is an XGBClassifier.
lgbm_classifier = XGBClassifier(**best_params, random_state=42)
lgbm_classifier.fit(X_train, y_train)
y_pred = lgbm_classifier.predict(X_test)
# Hold-out accuracy, shown as the cell output.
accuracy_score(y_test, y_pred)

In [None]:
# Rank the features by the importance reported by the fitted model and save the
# ranking to disk.
feature_importance = lgbm_classifier.feature_importances_
feature_importance_df = (
    pd.DataFrame({'Feature': X.columns, 'Importance': feature_importance})
    .sort_values(by='Importance', ascending=False)
)
feature_importance_df.to_csv("feature_importance_xgb.csv")

# Names of the features whose importance is zero (or negative), in column order.
name_del = [
    column
    for column, importance in zip(X.columns, feature_importance)
    if importance <= 0
]

In [None]:
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.metrics import confusion_matrix, classification_report, precision_score,f1_score,recall_score, roc_curve, auc
import numpy as np
from scipy import interpolate

# Repeated stratified k-fold evaluation of the tuned XGBClassifier.
#
# For every fold the labels are integer-encoded, the TRAINING fold only is
# oversampled with SMOTENC, features are standardised with statistics fitted on
# that training fold, and a fresh model is fitted. Accuracy, recall, precision and
# F1 come from hard predictions; the ROC curve and AUC come from the predicted
# probability of class 1.
#
# NOTE: this cell rebinds X_train, X_test, y_train, y_test, le, ros and sc, so the
# hold-out split created earlier is no longer available after it runs.
N_REPEATS = 10
N_SPLITS = 5

best_params = study.best_params

# One mean value per repeat.
accuracy_train_multi = []
accuracy_test_multi = []
recall_train_multi = []
recall_test_multi = []
precision_train_multi = []
precision_test_multi = []
f1_score_train_multi = []
f1_score_test_multi = []
auc_test_multi = []
fpr_test_multi = []  # never filled; the shared grid `fpr_new` is used instead
tpr_test_multi = []  # mean interpolated TPR curve of each repeat

# One array of N_SPLITS per-fold test values per repeat.
accuracy_test_list = []
recall_test_list = []
precision_test_list = []
f1_score_test_list = []
auc_test_list = []

# Common FPR grid so that ROC curves from different folds can be averaged
# point-wise. It is loop-invariant, and the export cell reads it afterwards.
fpr_new = np.linspace(0, 1, 100)

for i in range(N_REPEATS):
    # Seed the shuffle with the repeat index: every repeat gets different folds,
    # yet re-running the notebook reproduces exactly the same folds.
    kf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=i)

    accuracy_train = []
    accuracy_test = []
    recall_train = []
    recall_test = []
    precision_train = []
    precision_test = []
    f1_score_train = []
    f1_score_test = []
    tpr_test = []
    auc_test = []
    for train_index, test_index in kf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        le = LabelEncoder()
        y_train = le.fit_transform(y_train)
        y_test = le.transform(y_test)

        # Oversample the training fold only; the test fold keeps its real balance.
        ros = SMOTENC(random_state=42, categorical_features=[-1])
        X_train, y_train = ros.fit_resample(X_train, y_train)

        sc = StandardScaler()
        X_train = sc.fit_transform(X_train)
        X_test = sc.transform(X_test)

        model = XGBClassifier(**best_params, random_state=42)
        model.fit(X_train, y_train)
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)
        # A ROC curve needs continuous scores. Feeding hard 0/1 predictions yields
        # a single operating point, and the resulting "AUC" is just balanced
        # accuracy. Column 1 is the probability of the encoded class 1.
        y_test_score = model.predict_proba(X_test)[:, 1]

        fpr, tpr, _ = roc_curve(y_test, y_test_score, pos_label=1)
        # np.interp tolerates the repeated FPR values that score-based curves
        # contain (vertical segments). Force the curve to start at (0, 0).
        tpr_new = np.interp(fpr_new, fpr, tpr)
        tpr_new[0] = 0.0
        tpr_test.append(tpr_new)
        # Exact area under this fold's curve, not the area of the resampled curve.
        auc_test.append(auc(fpr, tpr))

        accuracy_train.append(accuracy_score(y_train, y_train_pred))
        accuracy_test.append(accuracy_score(y_test, y_test_pred))
        recall_train.append(recall_score(y_train, y_train_pred))
        recall_test.append(recall_score(y_test, y_test_pred))
        precision_train.append(precision_score(y_train, y_train_pred))
        precision_test.append(precision_score(y_test, y_test_pred))
        f1_score_train.append(f1_score(y_train, y_train_pred))
        f1_score_test.append(f1_score(y_test, y_test_pred))

    # Mean over the folds of this repeat.
    accuracy_train_multi.append(np.mean(accuracy_train))
    accuracy_test_multi.append(np.mean(accuracy_test))
    recall_train_multi.append(np.mean(recall_train))
    recall_test_multi.append(np.mean(recall_test))
    precision_train_multi.append(np.mean(precision_train))
    precision_test_multi.append(np.mean(precision_test))
    f1_score_train_multi.append(np.mean(f1_score_train))
    f1_score_test_multi.append(np.mean(f1_score_test))
    auc_test_multi.append(np.mean(auc_test))
    tpr_test_multi.append(np.mean(tpr_test, axis=0))

    # Raw per-fold test values of this repeat, kept for export.
    accuracy_test_list.append(np.array(accuracy_test))
    recall_test_list.append(np.array(recall_test))
    precision_test_list.append(np.array(precision_test))
    f1_score_test_list.append(np.array(f1_score_test))
    auc_test_list.append(np.array(auc_test))

# Mean and standard deviation across the repeat means. The raw string keeps the
# literal LaTeX "\pm" without relying on an invalid "\p" escape sequence.
for metric_label, repeat_means in (
    ("accuracy", accuracy_test_multi),
    ("recall", recall_test_multi),
    ("precision", precision_test_multi),
    ("f1", f1_score_test_multi),
    ("auc", auc_test_multi),
):
    print(r"mean {} is: {:.4f} $\pm$ {:.2f}".format(metric_label, np.mean(repeat_means), np.std(repeat_means)))

In [None]:
# Write the cross-validation results to CSV files in the working directory.
# Each metric file has one row per repeat; fpr.csv holds the shared FPR grid
# that the rows of tpr.csv are sampled on.
cv_outputs = {
    'accuracy.csv': accuracy_test_list,
    'recall.csv': recall_test_list,
    'precision.csv': precision_test_list,
    'f1-score.csv': f1_score_test_list,
    'auc.csv': auc_test_list,
    'tpr.csv': tpr_test_multi,
    'fpr.csv': fpr_new,
}
for csv_name, values in cv_outputs.items():
    pd.DataFrame(values).to_csv(csv_name)