## SMOTE to correct for imbalances. Comparing multiple classifiers

### This notebook based on work from this website: 
    https://practicaldatascience.co.uk/machine-learning/how-to-use-smote-for-imbalanced-classification

In [1]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt

from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler  

from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import make_scorer
from sklearn.metrics import balanced_accuracy_score
from imblearn.metrics import geometric_mean_score

In [2]:
data = pd.read_csv("OSA_complete_patients.csv", index_col = 0)

### Train Test Splt

In [3]:
features = ['Sex', 'Age', 'Current_smoker', 'Former_smoker',
       'Sedentary', 'Height', 'Weight', 'Cervical_perimeter',
       'Abdominal_perimeter', 'Systolic_BP', 'Diastolic_BP',
       'Maxillofacial_profile', 'BMI', 'High_BP', 'Asthma', 'Rhinitis', 'COPD',
       'Respiratory_fail', 'Myocardial_infarct', 'Coronary_fail',
       'Arrhythmias', 'Stroke', 'Heart_fail', 'Arteriopathy', 'Gastric_reflux',
       'Glaucoma', 'Diabetes', 'Hypercholesterolemia', 'Hypertriglyceridemia',
       'Hypo(er)thyroidism', 'Depression', 'Obesity', 'Dysmorphology',
       'Restless_Leg_Syndrome', 'Snoring', 'Diurnal_somnolence',
       'Driving_drowsiness', 'Morning_fatigue', 'Morning_headache',
       'Memory_problem', 'Nocturnal_perspiration',
       'Shortness_of_breath_on_exertion', 'Nocturia', 'Drowsiness_accident',
       'Near_miss_accident', 'Respiratory_arrest', 'Epworth_scale',
       'Pichots_scale', 'Depression_scale']
X=data[features]  # Features  
y=data['Severity']  # Labels

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=0, 
                                                    shuffle=True, 
                                                    stratify=y)

### Defining classifiers for comparison. Default settings

In [17]:
classifiers = {
    "LGBMClassifier": LGBMClassifier(),
    "XGBClassifier": XGBClassifier(),
    "ExtraTreesClassifier": ExtraTreesClassifier(),    
    "RandomForestClassifier": RandomForestClassifier(),
    "CatBoostClassifier": CatBoostClassifier(silent=True),
    "AdaBoostClassifier": AdaBoostClassifier(),
    "Logistic Regression": LogisticRegression(max_iter = 2000),
    "Naive Bayes": GaussianNB(),
    "Support Vector Machines": LinearSVC(max_iter = 2000),
    "MLP": MLPClassifier(early_stopping=True)}

### Run times and accuracies for classifiers. Average accuracy and average accuaracy standard deviation


In [6]:
df_models = pd.DataFrame(columns=['model', 'run_time', 'avg_acc', 'avg_acc_std'])

for key in classifiers:

    print('*',key)

    start_time = time.time()

    classifier = classifiers[key]
    model = classifier.fit(X_train, y_train)
    cv_scores = cross_val_score(model, X_test, y_test, cv=5, scoring='accuracy')
    y_pred = model.predict(X_test)

    row = {'model': key,
           'run_time': format(round((time.time() - start_time)/60,2)),
           'avg_acc': cv_scores.mean(),
           'avg_acc_std': cv_scores.std(),
    }

    df_models = df_models.append(row, ignore_index=True)

* LGBMClassifier


  df_models = df_models.append(row, ignore_index=True)


* XGBClassifier


  df_models = df_models.append(row, ignore_index=True)


* ExtraTreesClassifier


  df_models = df_models.append(row, ignore_index=True)


* RandomForestClassifier


  df_models = df_models.append(row, ignore_index=True)


* CatBoostClassifier


  df_models = df_models.append(row, ignore_index=True)


* AdaBoostClassifier


  df_models = df_models.append(row, ignore_index=True)


* Logistic Regression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

* Support Vector Machines


  df_models = df_models.append(row, ignore_index=True)


* Naive Bayes


  df_models = df_models.append(row, ignore_index=True)


* MLP










  df_models = df_models.append(row, ignore_index=True)


In [7]:
df_models.head(10)  

Unnamed: 0,model,run_time,avg_acc,avg_acc_std
0,LGBMClassifier,0.37,0.420712,0.008557
1,XGBClassifier,1.37,0.420708,0.016906
2,ExtraTreesClassifier,0.35,0.43767,0.015776
3,RandomForestClassifier,0.38,0.442483,0.007674
4,CatBoostClassifier,5.67,0.430794,0.011392
5,AdaBoostClassifier,0.12,0.439274,0.008652


### Performance metrics for imbalanced dataset

In [19]:
df_result = pd.DataFrame(columns=['model','accuracy', 'precision','recall', 'f1', 'bal_acc', 'g_mean'])

for key in classifiers:
    print('*',key)

    classifier = classifiers[key]
    if key == "Support Vector Machines" or key == "MLP":
        scaler = MinMaxScaler()  
        scaler.fit(X_train)  
        X_train = scaler.transform(X_train)  
        X_test = scaler.transform(X_test) 
    
    model = classifier.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    bal_accuracy = balanced_accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    g_mean = geometric_mean_score(y_test, y_pred)
    row = {'model': key,
           'accuracy': round(accuracy,3),
           'recall': round(recall,3),
           'f1': round(f1,3),
           'bal_acc': round(bal_accuracy,3),
           'precision': round(precision,3),
           'g_mean': round(g_mean,3),}

    df_result = df_result.append(row, ignore_index=True)


* LGBMClassifier


  df_result = df_result.append(row, ignore_index=True)


* XGBClassifier


  df_result = df_result.append(row, ignore_index=True)


* ExtraTreesClassifier


  df_result = df_result.append(row, ignore_index=True)


* RandomForestClassifier


  df_result = df_result.append(row, ignore_index=True)


* CatBoostClassifier


  y = column_or_1d(y, warn=True)
  df_result = df_result.append(row, ignore_index=True)


* AdaBoostClassifier


  df_result = df_result.append(row, ignore_index=True)


* Logistic Regression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  df_result = df_result.append(row, ignore_index=True)


* Naive Bayes


  df_result = df_result.append(row, ignore_index=True)


* Support Vector Machines


  df_result = df_result.append(row, ignore_index=True)


* MLP


  df_result = df_result.append(row, ignore_index=True)


In [20]:
df_result.head(10)

Unnamed: 0,model,accuracy,precision,recall,f1,bal_acc,g_mean
0,LGBMClassifier,0.464,0.429,0.464,0.428,0.387,0.32
1,XGBClassifier,0.453,0.423,0.453,0.426,0.378,0.327
2,ExtraTreesClassifier,0.456,0.416,0.456,0.415,0.368,0.298
3,RandomForestClassifier,0.463,0.429,0.463,0.422,0.377,0.307
4,CatBoostClassifier,0.46,0.428,0.46,0.43,0.383,0.327
5,AdaBoostClassifier,0.463,0.434,0.463,0.411,0.374,0.273
6,Logistic Regression,0.457,0.426,0.457,0.403,0.362,0.257
7,Naive Bayes,0.342,0.394,0.342,0.322,0.359,0.236
8,Support Vector Machines,0.452,0.418,0.452,0.366,0.334,0.148
9,MLP,0.464,0.435,0.464,0.413,0.372,0.268


## Using SMOTE for oversampling on X_train and y_train

### oversampling in training data

In [24]:
oversampled = SMOTE(random_state=0)
X_train_smote, y_train_smote = oversampled.fit_resample(X_train, y_train)
y_train_smote.value_counts()

3    7022
2    7022
1    7022
0    7022
Name: Severity, dtype: int64

### Performance metrics

In [25]:
df_result = pd.DataFrame(columns=['model','accuracy', 'precision','recall', 'f1', 'bal_acc', 'g_mean'])

for key in classifiers:
    print('*',key)

    classifier = classifiers[key]
    #if key == "Support Vector Machines" or key == "MLP":
     #   scaler = MinMaxScaler()  
      #  scaler.fit(X_train_smote)  
       # X_train_smote = scaler.transform(X_train_smote)  
        #X_test = scaler.transform(X_test) 
    
    model = classifier.fit(X_train_smote, y_train_smote)
    y_pred_smote = model.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred_smote)
    recall = recall_score(y_test, y_pred_smote, average='weighted')
    f1 = f1_score(y_test, y_pred_smote, average='weighted')
    bal_accuracy = balanced_accuracy_score(y_test, y_pred_smote)
    precision = precision_score(y_test, y_pred_smote, average='weighted')
    g_mean = geometric_mean_score(y_test, y_pred_smote)
    row = {'model': key,
           'accuracy': round(accuracy,3),
           'recall': round(recall,3),
           'f1': round(f1,3),
           'bal_acc': round(bal_accuracy,3),
           'precision': round(precision,3),
           'g_mean': round(g_mean,3),}

    df_result = df_result.append(row, ignore_index=True)

* LGBMClassifier


  _warn_prf(average, modifier, msg_start, len(result))
  df_result = df_result.append(row, ignore_index=True)


* XGBClassifier


KeyboardInterrupt: 

In [23]:
df_result.head(10)

Unnamed: 0,model,accuracy,precision,recall,f1,bal_acc,g_mean
0,LGBMClassifier,0.231,0.053,0.231,0.087,0.25,0.0
1,XGBClassifier,0.232,0.111,0.232,0.095,0.252,0.0
2,ExtraTreesClassifier,0.219,0.391,0.219,0.15,0.271,0.136
3,RandomForestClassifier,0.223,0.524,0.223,0.123,0.258,0.056
4,CatBoostClassifier,0.209,0.128,0.209,0.144,0.275,0.0
5,AdaBoostClassifier,0.152,0.058,0.152,0.069,0.262,0.0
6,Logistic Regression,0.392,0.36,0.392,0.297,0.332,0.121
7,Naive Bayes,0.301,0.347,0.301,0.226,0.279,0.0
8,Support Vector Machines,0.131,0.017,0.131,0.03,0.25,0.0
9,MLP,0.322,0.283,0.322,0.277,0.327,0.0


## Using ADASYN for oversampling on X_train and y_train

In [13]:
oversampled_a = ADASYN(random_state=0)
X_train_ada, y_train_ada = oversampled_a.fit_resample(X_train, y_train)
y_train_ada.value_counts()

2    7777
1    7536
0    7439
3    7022
Name: Severity, dtype: int64

### Performance Metrics

In [14]:
df_result = pd.DataFrame(columns=['model','accuracy', 'precision','recall', 'f1', 'bal_acc', 'g_mean'])

for key in classifiers:
    print('*',key)

    classifier = classifiers[key]
    model = classifier.fit(X_train_ada, y_train_ada)
    y_pred_smote = model.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred_smote)
    recall = recall_score(y_test, y_pred_smote, average='weighted')
    f1 = f1_score(y_test, y_pred_smote, average='weighted')
    bal_accuracy = balanced_accuracy_score(y_test, y_pred_smote)
    precision = precision_score(y_test, y_pred_smote, average='weighted')
    g_mean = geometric_mean_score(y_test, y_pred_smote)
    row = {'model': key,
           'accuracy': round(accuracy,3),
           'recall': round(recall,3),
           'f1': round(f1,3),
           'bal_acc': round(bal_accuracy,3),
           'precision': round(precision,3),
           'g_mean': round(g_mean,3),}

    df_result = df_result.append(row, ignore_index=True)

* LGBMClassifier


  df_result = df_result.append(row, ignore_index=True)


* XGBClassifier


  df_result = df_result.append(row, ignore_index=True)


* ExtraTreesClassifier


  df_result = df_result.append(row, ignore_index=True)


* RandomForestClassifier


  df_result = df_result.append(row, ignore_index=True)


* CatBoostClassifier


  y = column_or_1d(y, warn=True)
  df_result = df_result.append(row, ignore_index=True)


* AdaBoostClassifier


  df_result = df_result.append(row, ignore_index=True)


* Logistic Regression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  df_result = df_result.append(row, ignore_index=True)


* Support Vector Machines


  _warn_prf(average, modifier, msg_start, len(result))
  df_result = df_result.append(row, ignore_index=True)
  df_result = df_result.append(row, ignore_index=True)


* Naive Bayes
* MLP


  df_result = df_result.append(row, ignore_index=True)


In [16]:
df_result.head(10)

Unnamed: 0,model,accuracy,precision,recall,f1,bal_acc,g_mean
0,LGBMClassifier,0.463,0.428,0.463,0.428,0.393,0.329
1,XGBClassifier,0.456,0.424,0.456,0.429,0.389,0.338
2,ExtraTreesClassifier,0.459,0.422,0.459,0.426,0.378,0.318
3,RandomForestClassifier,0.459,0.425,0.459,0.429,0.382,0.327
4,CatBoostClassifier,0.454,0.424,0.454,0.43,0.386,0.338
5,AdaBoostClassifier,0.443,0.418,0.443,0.417,0.373,0.316
6,Logistic Regression,0.422,0.437,0.422,0.418,0.415,0.368
7,Support Vector Machines,0.184,0.071,0.184,0.095,0.294,0.0
8,Naive Bayes,0.302,0.377,0.302,0.289,0.335,0.222
9,MLP,0.411,0.459,0.411,0.39,0.408,0.284
