In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 100)


# Machine Learning 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import f1_score, roc_auc_score, precision_score, accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import plot_roc_curve, plot_precision_recall_curve, roc_curve, auc
from sklearn.impute import SimpleImputer, KNNImputer
import optuna
METRIC = f1_score
metric = 'f1_macro'

# METRIC = f1_score
# metric = 'f1'


# Classifiers
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
df = pd.read_csv("../input/in-hospital-mortality-prediction/data01.csv")
df

In [None]:
df.shape

In [None]:
df.describe().style.background_gradient(cmap = 'PuBu')

In [None]:
df.isnull().sum()

In [None]:
# Removing Duplicates
df.drop_duplicates(inplace=True)

In [None]:
df.shape

In [None]:
# missing = ['outcome', 'BMI', 'heart rate',
#        'Systolic blood pressure', 'Diastolic blood pressure',
#        'Respiratory rate', 'temperature', 'SP O2', 'Urine output',
#        'Neutrophils', 'Basophils', 'Lymphocyte', 'PT', 'INR',
#        'Creatine kinase','glucose', 'Blood calcium',
#        'PH', 'Lactic acid', 'PCO2']
# df.isnull().sum()
missing_values = df.isna().sum().to_dict()

In [None]:
keys = ['outcome', 'BMI', 'heart rate',
       'Systolic blood pressure', 'Diastolic blood pressure',
       'Respiratory rate', 'temperature', 'SP O2', 'Urine output',
       'Neutrophils', 'Basophils', 'Lymphocyte', 'PT', 'INR',
       'Creatine kinase','glucose', 'Blood calcium',
       'PH', 'Lactic acid', 'PCO2']
missing = {x:missing_values[x] for x in keys}

In [None]:
missing_values_df = pd.DataFrame(list(missing.items()), columns=['Column Names', 'Missing_Values'])
# fig = px.bar(missing_values_df, x = 'Column Names', y = 'Missing_Values', title = 'Missing Values')
# fig.show()
plt.figure(figsize = (14, 10))
sns.barplot(x = "Column Names", y = "Missing_Values", data = missing_values_df, palette = "Spectral")
plt.title("Missing Values", fontsize = 18, weight = "bold")
plt.ylabel('Count', fontsize=14)
plt.xticks(rotation=45, fontsize = 14)
plt.show()

In [None]:
y = df[['outcome']]
X = df.drop(columns = ['outcome'])

In [None]:
# imputerX = SimpleImputer(missing_values=np.nan, strategy='mean')
imputerX = KNNImputer(missing_values=np.nan, n_neighbors = 5)
cols = X.select_dtypes(include='float64').columns
cols

In [None]:
imputerX.fit(X[cols])

In [None]:
X[cols] = imputerX.transform(X[cols])
X.info()

In [None]:
# Imputing y with most frequent value
imputer_y = SimpleImputer(strategy = "most_frequent")
imputer_y.fit(y)
y = imputer_y.transform(y)
y

In [None]:
# Rebuild the imputed target as an integer DataFrame. SimpleImputer returned a
# bare ndarray, so the row labels are re-attached from X: X still carries the
# post-drop_duplicates index of df, and the later `df_final['outcome'] = y`
# assignment aligns on index labels rather than on position. Without the
# explicit index, rows would be mismatched (or NaN) whenever duplicates were dropped.
y = pd.DataFrame(y, columns=['outcome'], index=X.index, dtype='int64')
y

In [None]:
df_final = X.copy()
df_final['outcome'] = y
df_final.head()

In [None]:
df_final.shape

In [None]:
# Changing gendera to gender
df_final.rename(columns = {
    'gendera':'gender',
    'Hyperlipemia':'hyperlipidemia'
}, 
          inplace = True)


In [None]:
df_final.columns

# EDA

In [None]:
fig, ax = plt.subplots(figsize=(8,5), dpi=100)

patches, texts, autotexts = ax.pie(df_final['outcome'].value_counts(), autopct= '%1.1f%%', shadow=True, 
                                   startangle=90, explode=(0.1, 0), labels=['Alive','Death'])

plt.setp(autotexts, size=12, color = 'black', weight='bold')
autotexts[1].set_color('white');

plt.title('Outcome Distribution', fontsize=14)
plt.show()

In [None]:
sns.set_style("ticks")
sns.histplot(data = df_final, x = "diabetes", kde = True, hue = "outcome", palette = "dark")

In [None]:
predictors = list(df_final.columns)
predictors.remove("outcome")
target = df_final['outcome']

In [None]:
# Drop the bookkeeping columns from the feature list. Filtering (instead of
# list.remove) keeps the cell idempotent: re-running it no longer raises
# ValueError once the names are already gone.
predictors = [col for col in predictors if col not in ("group", "ID")]

In [None]:
# One histogram (with KDE) per predictor, coloured by outcome, on a two-column grid.
# The grid is sized from len(predictors) instead of a hard-coded 24 rows, so the
# cell neither raises IndexError nor silently skips features when the predictor
# list changes. With 48 predictors the layout and figure size are unchanged.
N_COLS = 2
n_rows = -(-len(predictors) // N_COLS)  # ceiling division
fig, axes = plt.subplots(nrows=n_rows, ncols=N_COLS,
                         figsize=(16, 140 * n_rows / 24), squeeze=False)
axes = axes.flatten()

for axis, column in zip(axes, predictors):
    sns.histplot(data=df_final, x=df_final[column],
                 ax=axis, hue=target, legend=True, kde=True, palette="dark")
    # The column name is shown as the panel title, so the axis labels are blanked.
    axis.set_ylabel('')
    axis.set_xlabel('')
    axis.set_title(column, fontsize=20)

# Hide the spare panel left over when the number of predictors is odd.
for axis in axes[len(predictors):]:
    axis.set_visible(False)

plt.subplots_adjust(hspace=0.4)
plt.show()

In [None]:
labels = ['No hypertensive+A', 'No hypertensive+D', 'hypertensive+A', 'hypertensive+D']
fig = go.Figure(data=[go.Pie(labels=labels, values=df_final.groupby(by=['hypertensive', 'outcome']).outcome.count(), 
                             textinfo='label+percent',
                             insidetextorientation='radial', 
                            )])
fig.show()

In [None]:
labels = ['No atrialfibrillation+A', 'No atrialfibrillation+D', 'atrialfibrillation+A', 'atrialfibrillation+D']
fig = go.Figure(data=[go.Pie(labels=labels, values=df_final.groupby(by=['atrialfibrillation', 'outcome']).outcome.count(), 
                             textinfo='label+percent',
                             insidetextorientation='radial', 
                            )])
fig.show()

In [None]:
labels = ['No CHD+A', 'No CHD+D', 'CHD+A', 'CHD+D']
fig = go.Figure(data=[go.Pie(labels=labels, values=df_final.groupby(by=['CHD with no MI', 'outcome']).outcome.count(), 
                             textinfo='label+percent',
                             insidetextorientation='radial', 
                            )])
fig.show()

In [None]:
labels = ['No Diabetic+A', 'No Diabetic+D', 'Diabetic+A', 'Diabetic+D']
fig = go.Figure(data=[go.Pie(labels=labels, values=df_final.groupby(by=['diabetes', 'outcome']).outcome.count(), 
                             textinfo='label+percent',
                             insidetextorientation='radial', 
                            )])
fig.show()

In [None]:
labels = ['No Depressed+A', 'No Depressed+D', 'Depressed+A', 'Depressed+D']
fig = go.Figure(data=[go.Pie(labels=labels, values=df_final.groupby(by=['depression', 'outcome']).outcome.count(), 
                             textinfo='label+percent',
                             insidetextorientation='radial', 
                            )])
fig.show()


In [None]:
labels = ['No Hyperlipidemia+A', 'No Hyperlipidemia+D', 'Hyperlipidemia+A', 'Hyperlipidemia+D']
fig = go.Figure(data=[go.Pie(labels=labels, values=df_final.groupby(by=['hyperlipidemia', 'outcome']).outcome.count(), 
                             textinfo='label+percent',
                             insidetextorientation='radial', 
                            )])
fig.show()


In [None]:
labels = ['No COPD+A', 'No COPD+D', 'COPD+A', 'COPD+D']
fig = go.Figure(data=[go.Pie(labels=labels, values=df_final.groupby(by=['COPD', 'outcome']).outcome.count(), 
                             textinfo='label+percent',
                             insidetextorientation='radial', 
                            )])
fig.show()

# Correlation

In [None]:
corr_cols = ['hypertensive', 'atrialfibrillation', 'CHD with no MI', 'diabetes', 'deficiencyanemias',
             'depression', 'hyperlipidemia', 'Renal failure', 'COPD', 'outcome']

cor = df_final[corr_cols].corr()

plt.figure(figsize=(15,10))
sns.heatmap(cor, annot = True, cmap = "PuBu")
plt.xticks(fontsize = 15, rotation = 45)
plt.yticks(fontsize = 15)
plt.show()

In [None]:
# Correlation heat map (lower triangle only) for a subset of continuous features.
continous_corr = ['heart rate','hematocrit', 'RBC', 'MCH',
        'Basophils', 'Lymphocyte', 'PT', 'Creatinine', 'Urea nitrogen',
       'Chloride', 'Anion gap', 'Bicarbonate',
       'Lactic acid', 'PCO2'] # BMI, Urine Output

corr2 = df_final[continous_corr].corr()
# Keep the diagonal and everything below it; the upper triangle is redundant.
# The builtin `bool` replaces `np.bool`, an alias that was deprecated and then
# removed from NumPy, where it now raises AttributeError.
lower_triangle = np.tril(np.ones(corr2.shape)).astype(bool)
df_lt = corr2.where(lower_triangle)
plt.figure(figsize=(15,10))
hmap=sns.heatmap(df_lt,cmap="Spectral", annot = True)
plt.xticks(fontsize = 15, rotation = 45)
plt.yticks(fontsize = 15)
plt.title("Correlation Map for Vital Features", fontsize=15, weight="bold")
plt.show()

In [None]:
continous_corr = ['age', 'Systolic blood pressure', 'Diastolic blood pressure',
       'Respiratory rate', 'SP O2', 'Urine output',
       'hematocrit', 'MCHC', 'Platelets', 'Neutrophils', 'PT', 'INR',
       'NT-proBNP', 'Creatine kinase', 'Urea nitrogen','glucose', 'PCO2', 'EF', 'outcome']

corr2 = df_final[continous_corr].corr()
corr2.style.background_gradient(cmap = 'PuBu')

# Without Scaling

In [None]:
# gender is left out of the model features. Filtering (instead of list.remove)
# keeps the cell idempotent: re-running it no longer raises ValueError.
predictors = [col for col in predictors if col != "gender"]

X = df_final[predictors]
X.head()

In [None]:
X.shape, y.shape

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X , y, 
                                                    test_size = 0.3, random_state = 11)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
xgb = XGBClassifier(random_state = 2)
xgb.fit(X_train, y_train)

In [None]:
accuracy_score(y_train, xgb.predict(X_train))

In [None]:
accuracy_score(y_test, xgb.predict(X_test))

Accuracy = 0.8813559322033898 (No stratification)

In [None]:
print(classification_report(y_test, xgb.predict(X_test)))

                        precision    recall  f1-score   support (No stratification)

                  0       0.90      0.97      0.93       310
                  1       0.55      0.25      0.34        44

           accuracy                           0.88       354

          macro avg       0.73      0.61      0.64       354
       weighted avg       0.86      0.88      0.86       354

In [None]:
confusion = confusion_matrix(y_test, xgb.predict(X_test), normalize='all')
sns.heatmap(confusion, annot=True, cmap = "PuBu")

Actual ALIVE -- Predicted ALIVE --> 85%

Actual ALIVE -- Predicted DEAD --> 2.5%

Actual DEAD -- Predicted ALIVE --> 9.3%

Actual DEAD -- Predicted DEAD --> 3.1%

**As mentioned earlier, this is a highly imbalanced dataset: only about 14% of the patients died, while the remaining 86% survived. Because of this imbalance the model is biased towards the majority class (0, alive) and generalises poorly to the minority class on the test data, so it predicts most patients as alive. As the classification report above shows, accuracy is 88%, but the F1 score for the minority class is only 34%. To deal with this problem we have to balance the data between the majority class (0) and the minority class (1). Several techniques are available; in this study I will use SMOTE (Synthetic Minority Oversampling Technique).**


In [None]:
# AUC ROC Curve plotting
probs = xgb.predict_proba(X_test)
preds = probs[:,1]
fpr, tpr, threshold = roc_curve(y_test, preds)
roc_auc = auc(fpr, tpr)

# plt.figure(figsize = (12, 7))
plt.title('Receiver Operating Characteristic', weight='bold')
plt.plot(fpr, tpr, 'b', label = 'XGBClassifier (AUC = %0.2f)' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.ylabel('True Positive Rate', fontsize=12)
plt.xlabel('False Positive Rate', fontsize=12)
plt.show()

In [None]:
# Sensitivity and Specificity

In [None]:
xgb.feature_importances_

In [None]:
# xgb.feature_importances_
plt.figure(figsize=(15, 12))
plt.barh(X.columns, xgb.feature_importances_)
plt.xlabel("Xgboost Feature Importance")

# Oversampling using SMOTE

In [None]:
!pip install imbalanced-learn

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample only the minority class. random_state pins the synthetic samples so
# every downstream split and score is reproducible across runs. The n_jobs
# argument is omitted because imbalanced-learn deprecated it on SMOTE (0.10);
# it only affected parallelism of the neighbour search, not the result.
smote = SMOTE(sampling_strategy='minority', random_state=42)

In [None]:
X_resampled, y_resampled = smote.fit_resample(X, y)

In [None]:
X_resampled.shape, y_resampled.shape

In [None]:
y_resampled.value_counts()

# Custom Oversampled Dataset Predictions

In [None]:
train_X, test_X, train_y, test_y = train_test_split(X_resampled, y_resampled, 
                                                             test_size=0.3, random_state = 10)

In [None]:
train_X.shape, test_X.shape, train_y.shape, test_y.shape

In [None]:
train_y.value_counts()

In [None]:
classifier = XGBClassifier(verbosity=0, random_state=42, n_jobs=-1)
classifier.fit(train_X, train_y)

In [None]:
print(classification_report(test_y, classifier.predict(test_X), digits = 5))

                precision    recall  f1-score   support
    
               0    0.93355   0.90064   0.91680       312
               1    0.90000   0.93311   0.91626       299

        accuracy                        0.91653       611
       macro avg    0.91678   0.91688   0.91653       611
    weighted avg    0.91713   0.91653   0.91654       611

In [None]:
# ROC curve for the XGBoost model trained on the SMOTE-resampled split.
# Scored on test_X/test_y, the hold-out of the split this model was fitted on.
# The previous X_test/y_test came from the earlier split of the un-resampled
# data, whose rows also feed X_resampled and therefore train_X, so the AUC was
# partly measured on training samples.
probs = classifier.predict_proba(test_X)
preds = probs[:,1]
fpr, tpr, threshold = roc_curve(test_y, preds)
roc_auc = auc(fpr, tpr)

# plt.figure(figsize = (12, 7))
plt.title('Receiver Operating Characteristic', weight='bold')
plt.plot(fpr, tpr, 'b', label = 'XGBClassifier (AUC = %0.4f)' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.ylabel('True Positive Rate', fontsize=12)
plt.xlabel('False Positive Rate', fontsize=12)
plt.show()

In [None]:
sns.set_style("ticks")
plt.figure(figsize=(10, 3))
# Indices of the TOP_N most important features, most important first.
# Replaces argsort()[:38:-1], which yields the top 8 only when the model has
# exactly 47 features (as this notebook appears to) and a different number of
# bars otherwise.
TOP_N = 8
sorted_idx = classifier.feature_importances_.argsort()[::-1][:TOP_N]
sns.barplot(x = classifier.feature_importances_[sorted_idx], y = X.columns[sorted_idx], palette = "Spectral")
plt.xlabel("Feature Importance")

In [None]:
confusion_clf = confusion_matrix(test_y, classifier.predict(test_X))
print(confusion_clf)

In [None]:
LGB_clf = LGBMClassifier(random_state = 42, n_jobs = -1)
LGB_clf.fit(train_X, train_y)

print(classification_report(test_y, LGB_clf.predict(test_X), digits = 5,output_dict=False))

In [None]:
# ROC curve for the LightGBM model trained on the SMOTE-resampled split.
# Scored on test_X/test_y, the hold-out of the split this model was fitted on.
# The previous X_test/y_test came from the earlier split of the un-resampled
# data, whose rows also feed X_resampled and therefore train_X.
probs = LGB_clf.predict_proba(test_X)
preds = probs[:,1]
fpr, tpr, threshold = roc_curve(test_y, preds)
roc_auc = auc(fpr, tpr)

# plt.figure(figsize = (12, 7))
plt.title('Receiver Operating Characteristic', weight='bold')
plt.plot(fpr, tpr, 'b', label = 'LGBClassifier (AUC = %0.4f)' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.ylabel('True Positive Rate', fontsize=12)
plt.xlabel('False Positive Rate', fontsize=12)
plt.show()

In [None]:
plt.figure(figsize=(10, 3))
# Indices of the TOP_N most important features, most important first.
# Replaces argsort()[:38:-1], which yields the top 8 only when the model has
# exactly 47 features (as this notebook appears to).
TOP_N = 8
sorted_idx = LGB_clf.feature_importances_.argsort()[::-1][:TOP_N]
sns.barplot(x = LGB_clf.feature_importances_[sorted_idx], y = X.columns[sorted_idx], palette = "Spectral")
plt.xlabel("Feature Importance")

In [None]:
CTB_clf = CatBoostClassifier(verbose=0, random_state = 42)
CTB_clf.fit(train_X, train_y)

print(classification_report(test_y, CTB_clf.predict(test_X), digits = 5))

In [None]:
# ROC curve for the CatBoost model trained on the SMOTE-resampled split.
# Scored on test_X/test_y, the hold-out of the split this model was fitted on.
# The previous X_test/y_test came from the earlier split of the un-resampled
# data, whose rows also feed X_resampled and therefore train_X.
probs = CTB_clf.predict_proba(test_X)
preds = probs[:,1]
fpr, tpr, threshold = roc_curve(test_y, preds)
roc_auc = auc(fpr, tpr)

# plt.figure(figsize = (12, 7))
plt.title('Receiver Operating Characteristic', weight='bold')
plt.plot(fpr, tpr, 'b', label = 'CTBClassifier (AUC = %0.4f)' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.ylabel('True Positive Rate', fontsize=12)
plt.xlabel('False Positive Rate', fontsize=12)
plt.show()

In [None]:
plt.figure(figsize=(10, 3))
# Indices of the TOP_N most important features, most important first.
# Replaces argsort()[:38:-1], which yields the top 8 only when the model has
# exactly 47 features (as this notebook appears to).
TOP_N = 8
sorted_idx = CTB_clf.feature_importances_.argsort()[::-1][:TOP_N]
sns.barplot(x = CTB_clf.feature_importances_[sorted_idx], y = X.columns[sorted_idx], palette = "Spectral")
plt.xlabel("Feature Importance")

In [None]:
RDG_clf = RidgeClassifier(random_state = 12)
RDG_clf.fit(train_X, train_y)

print(classification_report(test_y, RDG_clf.predict(test_X), digits = 5))

In [None]:
# ROC curve for the Ridge classifier trained on the SMOTE-resampled split.
# RidgeClassifier does not implement predict_proba, so the signed distance
# returned by decision_function is used as the ranking score; roc_curve only
# needs scores that order the samples, not calibrated probabilities.
# Scored on test_X/test_y (the hold-out of the split the model was fitted on)
# rather than X_test/y_test, whose rows also feed the resampled training data.
preds = RDG_clf.decision_function(test_X)
fpr, tpr, threshold = roc_curve(test_y, preds)
roc_auc = auc(fpr, tpr)

# plt.figure(figsize = (12, 7))
plt.title('Receiver Operating Characteristic', weight='bold')
# Legend now names the Ridge model (it previously said XGBClassifier).
plt.plot(fpr, tpr, 'b', label = 'RDGClassifier (AUC = %0.4f)' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.ylabel('True Positive Rate', fontsize=12)
plt.xlabel('False Positive Rate', fontsize=12)
plt.show()

In [None]:
SGD_clf = SGDClassifier(loss = "hinge",n_jobs=-1)
SGD_clf.fit(train_X, train_y)

print(classification_report(test_y, SGD_clf.predict(test_X), digits = 5))

In [None]:
# ROC curve for the SGD classifier trained on the SMOTE-resampled split.
# SGD_clf uses loss="hinge", for which predict_proba is not available, so the
# signed distance returned by decision_function is used as the ranking score.
# Scored on test_X/test_y (the hold-out of the split the model was fitted on)
# rather than X_test/y_test, whose rows also feed the resampled training data.
preds = SGD_clf.decision_function(test_X)
fpr, tpr, threshold = roc_curve(test_y, preds)
roc_auc = auc(fpr, tpr)

# plt.figure(figsize = (12, 7))
plt.title('Receiver Operating Characteristic', weight='bold')
plt.plot(fpr, tpr, 'b', label = 'SGDClassifier (AUC = %0.4f)' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.ylabel('True Positive Rate', fontsize=12)
plt.xlabel('False Positive Rate', fontsize=12)
plt.show()

In [None]:
GNB_clf = GaussianNB()
GNB_clf.fit(train_X, train_y)

print(classification_report(test_y, GNB_clf.predict(test_X), digits = 5))

In [None]:
# ROC curve for the Gaussian naive Bayes model trained on the SMOTE-resampled split.
# Scored on test_X/test_y, the hold-out of the split this model was fitted on.
# The previous X_test/y_test came from the earlier split of the un-resampled
# data, whose rows also feed X_resampled and therefore train_X.
probs = GNB_clf.predict_proba(test_X)
preds = probs[:,1]
fpr, tpr, threshold = roc_curve(test_y, preds)
roc_auc = auc(fpr, tpr)

# plt.figure(figsize = (12, 7))
plt.title('Receiver Operating Characteristic', weight='bold')
plt.plot(fpr, tpr, 'b', label = 'GNBClassifier (AUC = %0.4f)' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.ylabel('True Positive Rate', fontsize=12)
plt.xlabel('False Positive Rate', fontsize=12)
plt.show()

In [None]:
RFC_clf = RandomForestClassifier(n_jobs=-1)
RFC_clf.fit(train_X, train_y)

print(classification_report(test_y, RFC_clf.predict(test_X), digits = 5))

In [None]:
# ROC curve for the random forest trained on the SMOTE-resampled split.
# Scored on test_X/test_y, the hold-out of the split this model was fitted on.
# The previous X_test/y_test came from the earlier split of the un-resampled
# data, whose rows also feed X_resampled and therefore train_X.
probs = RFC_clf.predict_proba(test_X)
preds = probs[:,1]
fpr, tpr, threshold = roc_curve(test_y, preds)
roc_auc = auc(fpr, tpr)

# plt.figure(figsize = (12, 7))
plt.title('Receiver Operating Characteristic', weight='bold')
plt.plot(fpr, tpr, 'b', label = 'RFCClassifier (AUC = %0.4f)' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.ylabel('True Positive Rate', fontsize=12)
plt.xlabel('False Positive Rate', fontsize=12)
plt.show()

In [None]:
RFC_clf.feature_importances_

In [None]:
plt.figure(figsize=(10, 3))
# Indices of the TOP_N most important features, most important first.
# Replaces argsort()[:38:-1], which yields the top 8 only when the model has
# exactly 47 features (as this notebook appears to).
TOP_N = 8
sorted_idx = RFC_clf.feature_importances_.argsort()[::-1][:TOP_N]
sns.barplot(x = RFC_clf.feature_importances_[sorted_idx], y = X.columns[sorted_idx], palette = "Spectral")
plt.xlabel("Feature Importance")

In [None]:
ABC_clf = AdaBoostClassifier(n_estimators=50)
ABC_clf.fit(train_X, train_y)

print(classification_report(test_y, ABC_clf.predict(test_X), digits = 5))

In [None]:
# ROC curve for the AdaBoost model trained on the SMOTE-resampled split.
# Scored on test_X/test_y, the hold-out of the split this model was fitted on.
# The previous X_test/y_test came from the earlier split of the un-resampled
# data, whose rows also feed X_resampled and therefore train_X.
probs = ABC_clf.predict_proba(test_X)
preds = probs[:,1]
fpr, tpr, threshold = roc_curve(test_y, preds)
roc_auc = auc(fpr, tpr)

# plt.figure(figsize = (12, 7))
plt.title('Receiver Operating Characteristic', weight='bold')
plt.plot(fpr, tpr, 'b', label = 'ABCClassifier (AUC = %0.4f)' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.ylabel('True Positive Rate', fontsize=12)
plt.xlabel('False Positive Rate', fontsize=12)
plt.show()

In [None]:
GBC_clf = GradientBoostingClassifier(random_state=13)
GBC_clf.fit(train_X, train_y)

print(classification_report(test_y, GBC_clf.predict(test_X), digits = 5))

In [None]:
# ROC curve for the gradient boosting model trained on the SMOTE-resampled split.
# Scored on test_X/test_y, the hold-out of the split this model was fitted on.
# The previous X_test/y_test came from the earlier split of the un-resampled
# data, whose rows also feed X_resampled and therefore train_X.
probs = GBC_clf.predict_proba(test_X)
preds = probs[:,1]
fpr, tpr, threshold = roc_curve(test_y, preds)
roc_auc = auc(fpr, tpr)

# plt.figure(figsize = (12, 7))
plt.title('Receiver Operating Characteristic', weight='bold')
plt.plot(fpr, tpr, 'b', label = 'GBCClassifier (AUC = %0.4f)' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.ylabel('True Positive Rate', fontsize=12)
plt.xlabel('False Positive Rate', fontsize=12)
plt.show()

In [None]:
LRC_clf = LogisticRegression()
LRC_clf.fit(train_X, train_y)

print(classification_report(test_y, LRC_clf.predict(test_X), digits = 5))

In [None]:
# ROC curve for the logistic regression trained on the SMOTE-resampled split.
# Scored on test_X/test_y, the hold-out of the split this model was fitted on.
# The previous X_test/y_test came from the earlier split of the un-resampled
# data, whose rows also feed X_resampled and therefore train_X.
probs = LRC_clf.predict_proba(test_X)
preds = probs[:,1]
fpr, tpr, threshold = roc_curve(test_y, preds)
roc_auc = auc(fpr, tpr)

# plt.figure(figsize = (12, 7))
plt.title('Receiver Operating Characteristic', weight='bold')
plt.plot(fpr, tpr, 'b', label = 'LRCClassifier (AUC = %0.4f)' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.ylabel('True Positive Rate', fontsize=12)
plt.xlabel('False Positive Rate', fontsize=12)
plt.show()

# Integrated Results

In [None]:
params = {}


LGB_clf = LGBMClassifier(**params)
XGB_clf = XGBClassifier(verbosity=0, **params)
CTB_clf = CatBoostClassifier(verbose=0, **params)
RDG_clf = RidgeClassifier(**params)
SGD_clf = SGDClassifier(loss="hinge", **params)
GNB_clf = GaussianNB(**params)
RFC_clf = RandomForestClassifier(**params)
ABC_clf = AdaBoostClassifier(**params)
GBC_clf = GradientBoostingClassifier(random_state=0, **params)
LRC_clf = LogisticRegression(**params)

In [None]:
# Cross-validate every candidate model on the resampled data and print the
# mean and standard deviation of the chosen metric for each one.
algorithms = [
    LGB_clf, XGB_clf, CTB_clf, RDG_clf, SGD_clf,
    GNB_clf, RFC_clf, ABC_clf, GBC_clf, LRC_clf,
]

CV = StratifiedShuffleSplit(n_splits=5, random_state=42)
for alg in algorithms:
    scores = cross_val_score(alg, X_resampled, y_resampled, scoring=metric, cv=CV)
    summary = (metric, scores.mean(), scores.std(), alg.__class__.__name__)
    print("%s %0.2f (+/- %0.2f) %s" % summary)


    f1_macro 0.93 (+/- 0.02) LGBMClassifier
    f1_macro 0.93 (+/- 0.01) XGBClassifier
    f1_macro 0.94 (+/- 0.01) CatBoostClassifier
    f1_macro 0.84 (+/- 0.02) RidgeClassifier
    f1_macro 0.53 (+/- 0.10) SGDClassifier
    f1_macro 0.80 (+/- 0.03) GaussianNB
    f1_macro 0.94 (+/- 0.02) RandomForestClassifier
    f1_macro 0.86 (+/- 0.00) AdaBoostClassifier
    f1_macro 0.90 (+/- 0.02) GradientBoostingClassifier
    f1_macro 0.72 (+/- 0.02) LogisticRegression

In [None]:
def predict(X__train, y__train, X__pred, all_algorithms):
    """Fit every estimator and return the row-wise majority vote of their predictions.

    Parameters
    ----------
    X__train, y__train
        Training features and labels, passed unchanged to each estimator's ``fit``.
    X__pred
        Samples to predict, passed unchanged to each estimator's ``predict``.
    all_algorithms
        Iterable of estimators exposing ``fit`` (returning the fitted model) and
        ``predict``. Each estimator is fitted in place.

    Returns
    -------
    list
        One integer label per predicted row: the most frequent label across the
        estimators. On a tie the smallest label wins, because the first column
        of ``DataFrame.mode(axis=1)`` is used.

    Side effects
    ------------
    Prints the last 30 rows of the per-model prediction table.
    """
    votes = {}
    for alg in all_algorithms:
        # Columns are named after the first three letters of the class name.
        # A numeric suffix keeps two estimators that share a prefix (e.g. two
        # instances of the same class) from overwriting each other's column,
        # which previously made the vote count one model several times.
        base_name = str(alg.__class__.__name__)[:3]
        alg_name = base_name
        suffix = 2
        while alg_name in votes:
            alg_name = f"{base_name}_{suffix}"
            suffix += 1
        model = alg.fit(X__train, y__train)
        # ravel() flattens column-shaped outputs such as (n_samples, 1) to 1-D.
        votes[alg_name] = np.ravel(model.predict(X__pred))

    stacked_column_names = list(votes)
    stacked_predicts = pd.DataFrame(votes)
    stacked_predicts['final_aggregated_prediction'] = (
        stacked_predicts[stacked_column_names].mode(axis=1)[0].astype('int64')
    )
    print(stacked_predicts[-30:])
    return list(stacked_predicts.loc[:, 'final_aggregated_prediction'])


algorithms = [
              LGB_clf, 
              XGB_clf, 
              CTB_clf, 
#               RDG_clf, 
#               SGD_clf, 
#               GNB_clf, 
              RFC_clf, 
#               ABC_clf, 
#               GBC_clf, 
#               LRC_clf, 
             ]


pred = predict(train_X, train_y, test_X, algorithms)

score = METRIC(test_y, pred, average='macro')
print(f'\n{metric}: {score}\n')
# print(f'last 30 y_test: {list(test_y[target][-30:])}')
# print(f'last 30 y_pred: {pred[-30:]}\n')
print(f'Classification Report:\n')
print(classification_report(test_y, pred), '\n')


cm = confusion_matrix(test_y, pred)
sns.heatmap(cm, annot=True, cmap="Blues", fmt='.0f')

    f1_macro: 0.9328968903436989

    Classification Report:

                    precision    recall  f1-score   support
   
                0       0.95      0.91      0.93       312
                1       0.91      0.95      0.93       299

        accuracy                           0.93       611
       macro avg       0.93      0.93      0.93       611
    weighted avg       0.93      0.93      0.93       611

# By Hyperparameter Tuning

In [None]:
optuna.logging.set_verbosity(optuna.logging.WARNING)

def evaluate(clf, X, y):
    """Return the mean cross-validated score of ``clf`` on ``(X, y)``.

    Scoring uses the module-level ``metric`` string over five stratified
    shuffle splits created with ``random_state=42``.
    """
    splitter = StratifiedShuffleSplit(n_splits=5, random_state=42)
    return cross_val_score(clf, X, y, scoring=metric, cv=splitter).mean()

In [None]:
def objective(trial):
    """Optuna objective for LightGBM: sample hyperparameters and return the CV score.

    ``trial`` supplies the sampled values. The resulting LGBMClassifier is scored
    by ``evaluate`` on the SMOTE-resampled data (``X_resampled``, ``y_resampled``),
    and that score is returned for the study to maximise or minimise.
    """
    # suggest_float(..., log=True) is the current spelling of the deprecated
    # suggest_loguniform; the search range and log-scale sampling are unchanged.
    params = {
        'metric': trial.suggest_categorical('metric', ['binary_error',"binary_logloss"]),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 105, step=25),
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1.0, log=True),
        'max_depth': trial.suggest_int('max_depth', 4, 12, step=2),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-5, 10.0, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-5, 10.0, log=True),
    }

    LGB_clf = LGBMClassifier(**params)
    return evaluate(LGB_clf, X_resampled, y_resampled)
    
# Search for the hyperparameters that maximise the cross-validated score.
# The TPE sampler (Optuna's default algorithm) is seeded so the sampled trials,
# and hence best_params, are repeatable across notebook runs.
study = optuna.create_study(direction='maximize',
                            sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=3)
params = study.best_params
print(f'{metric}: {study.best_value}')
print(params)


f1_macro: 0.9283448760096817

In [None]:
params = {'metric': 'binary_logloss', 'num_leaves': 112, 
          'min_child_samples': 80, 'learning_rate': 0.2866074254648006, 
          'max_depth': 12, 'reg_lambda': 0.0001261576527556916, 
          'reg_alpha': 4.542475824392113e-05}
LGB_clf = LGBMClassifier(**params)

In [None]:
def objective(trial):
    """Optuna objective for XGBoost: sample hyperparameters and return the CV score.

    ``trial`` supplies the sampled values. The resulting XGBClassifier is scored
    by ``evaluate`` on the SMOTE-resampled data (``X_resampled``, ``y_resampled``),
    and that score is returned for the study to maximise or minimise.
    """
    # Candidate fractions shared by the column- and row-subsampling parameters.
    fractions = [.1, .2, .3, .4, .5, .6, .7, .8, .9, 1]
    # suggest_float(..., log=True) is the current spelling of the deprecated
    # suggest_loguniform; the search range and log-scale sampling are unchanged.
    params = {
        'lambda': trial.suggest_float('lambda', 1e-5, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-5, 10.0, log=True),
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1.0, log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', fractions),
        'colsample_bylevel': trial.suggest_categorical('colsample_bylevel', fractions),
        'colsample_bynode': trial.suggest_categorical('colsample_bynode', fractions),
        'subsample': trial.suggest_categorical('subsample', fractions),
        'max_depth': trial.suggest_int('max_depth', 4, 12, step=2),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 20),
    }
    XGB_clf = XGBClassifier(verbosity=0, **params)
    return evaluate(XGB_clf, X_resampled, y_resampled)
    
# Search for the hyperparameters that maximise the cross-validated score.
# The TPE sampler (Optuna's default algorithm) is seeded so the sampled trials,
# and hence best_params, are repeatable across notebook runs.
study = optuna.create_study(direction='maximize',
                            sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=3)
params = study.best_params
print(f'{metric}: {study.best_value}')
print(params)

f1_macro: 0.9333000257084603

In [None]:
params = {'lambda': 0.00017351981502448073, 'alpha': 1.1493784396911765e-05, 
        'learning_rate': 0.027346390661614303, 'colsample_bytree': 0.2, 
        'colsample_bylevel': 0.6, 'colsample_bynode': 1, 'subsample': 0.7, 
        'max_depth': 8, 'min_child_weight': 2}
XGB_clf = XGBClassifier(verbosity=0, **params)

In [None]:
algorithms = [
              LGB_clf, 
              XGB_clf, 
              CTB_clf, 
              RFC_clf, 
             ]

CV = StratifiedShuffleSplit(n_splits=5, random_state=42)

for alg in algorithms:
    scores = cross_val_score(alg, X_resampled, y_resampled, scoring=metric, cv=CV)
    print("%s %0.2f (+/- %0.2f) %s" % (metric, scores.mean(), scores.std(), alg.__class__.__name__))

In [None]:
algorithms = [
              LGB_clf, 
              XGB_clf, 
              CTB_clf, 
              RFC_clf, 
             ]


pred = predict(train_X, train_y, test_X, algorithms)

score = METRIC(test_y, pred, average='macro')
print(f'\n{metric}: {score}\n')
print(f'Classification Report:\n')
print(classification_report(test_y, pred), '\n')


cm = confusion_matrix(test_y, pred)
sns.heatmap(cm, annot=True, cmap="Blues", fmt='.0f')