### Module Import

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

from sklearn.model_selection import train_test_split

from sklearn.impute import SimpleImputer, KNNImputer

from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SMOTENC
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.under_sampling import TomekLinks

from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder, LabelEncoder, MinMaxScaler

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import  XGBClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import cross_validate, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.metrics import roc_curve, roc_auc_score, f1_score, auc

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

import pickle
from tabulate import tabulate

ModuleNotFoundError: No module named 'tabulate'

### Dataset Definition

In [None]:
df = pd.read_csv('stroke_dataset.csv')
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
2,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
3,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
4,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1


### Stratification of the Dataset

In [None]:
X = df.drop('stroke', axis = 1)
y = df.stroke

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [None]:
print(Counter(y))
print(Counter(y_train))
print(Counter(y_test))

Counter({0: 4733, 1: 248})
Counter({0: 3786, 1: 198})
Counter({0: 947, 1: 50})


In [None]:
print((248)/(248 + 4733))
print((198)/(198 + 3786))
print((50)/(50 + 947))

0.049789198956032926
0.04969879518072289
0.05015045135406219


In [None]:
NOTA = 'Revisar las validaciones, si se utiliza validacion cruzada, no es necesario la division del test en train y test'

### Separation of the Test Dataset

In [None]:
data_test = pd.concat([X_test, y_test], axis = 1)

In [None]:
data_test.to_csv('test.csv')

## Null Imputation: smoking_status -> most-frequent

In [None]:
X_train_1 = X_train.copy()
y_train_1 = y_train.copy()

In [None]:
y_train_1 = y_train.copy()

In [None]:
X_train_1.smoking_status.value_counts()

never smoked       1455
Unknown            1197
formerly smoked     712
smokes              620
Name: smoking_status, dtype: int64

In [None]:
def impute_smokers_age(X_train):
    min_age = 12

    X_train.loc[(X_train['age'] <= min_age) & (X_train['smoking_status'] == 'Unknown'), 'smoking_status'] = 'never smoked'
    X_train.loc[(X_train['smoking_status'] == 'Unknown'), 'smoking_status'] = np.nan
    
    return X_train

In [None]:
X_train_1 = impute_smokers_age(X_train_1)

In [None]:
X_train_1.iloc[:,9].value_counts()

never smoked       1868
formerly smoked     712
smokes              620
Name: smoking_status, dtype: int64

In [None]:
X_train_1.isnull().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                    0
smoking_status       784
dtype: int64

In [None]:
X_train_1.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
4428,Female,24.0,0,0,No,Private,Rural,187.99,24.9,smokes
1135,Male,3.0,0,0,No,children,Rural,86.38,22.8,never smoked
2417,Female,69.0,0,0,Yes,Private,Urban,111.48,37.0,smokes
1173,Female,46.0,0,0,Yes,Private,Urban,127.75,30.5,never smoked
3696,Male,52.0,1,0,Yes,Private,Rural,100.71,37.0,never smoked


In [None]:
dicc_columns = {name_column: index for index, name_column in enumerate(X_train_1.columns)}
print(dicc_columns)

{'gender': 0, 'age': 1, 'hypertension': 2, 'heart_disease': 3, 'ever_married': 4, 'work_type': 5, 'Residence_type': 6, 'avg_glucose_level': 7, 'bmi': 8, 'smoking_status': 9}


### Oversampling training data - SMOTENC, (null input with mode)

In [None]:
def smote_balance(X_train, y_train):
    smote = SMOTENC(sampling_strategy = 'auto', random_state=42, categorical_features=[0,4,5,6,9])
    X_train, y_train = smote.fit_resample(X_train, y_train)
    return X_train, y_train

In [None]:
X_train_1, y_train_1 = smote_balance(X_train_1, y_train_1)

print(Counter(y_train_1))

Counter({0: 3786, 1: 3786})


### Simple Preprocessing

In [None]:
def cat_num_separation(X_train):
    cat = X_train.select_dtypes(include = ['object'])
    cat_columns = list(cat)

    num = X_train.select_dtypes(include = ['number'])
    num_columns = list(num)
    return cat_columns, num_columns


In [None]:
cat_columns, num_columns = cat_num_separation(X_train_1)

In [None]:
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[('most_freq', SimpleImputer(strategy = 'most_frequent')), ('onehot', OneHotEncoder(sparse=False, handle_unknown='ignore'))])

def preprocessor(transformers):
    '''
    Receives a list of tuples with the transformers.
    The tuple should follow the following structure ('name', transformer, data or columns)
    Returns the preprocessor.
    '''
    scaler = {st: StandardScaler(), mm: MinMaxScaler()}




    numeric_transformer =  ('num', StandardScaler(), num_columns)

    categorical_transformer = ('cat', categorical_transformer, cat_columns)
    transformers = numeric_transformer+categorical_transformer+
    preprocessor = ColumnTransformer(transformers=transformers, remainder = 'passthrough')
    return preprocessor

In [None]:
preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, num_columns), ('cat', categorical_transformer, cat_columns)], remainder = 'passthrough')

In [None]:
X_transformed_1 = preprocessor.fit_transform(X_train_1)

In [None]:
X_transformed_1

array([[-1.41138701, -0.32316261, -0.20730462, ...,  0.        ,
         0.        ,  1.        ],
       [-2.36502549, -0.32316261, -0.20730462, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.63212402, -0.32316261, -0.20730462, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 1.14069004, -0.32316261, -0.20730462, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.91446555, -0.32316261, -0.20730462, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.14351916, -0.32316261, -0.20730462, ...,  0.        ,
         0.        ,  1.        ]])

### Model Testing Functions

In [None]:
def prediction(model, data_to_predict):
    '''
    Esta función recibe el modelo que debe aplicarse y la matriz X (variables predictoras)
    Retorna la y que predice
    '''
    y_predicted = model.predict(data_to_predict)
    
    probability = model.predict_proba(data_to_predict)
    
    #tomamos como clase positiva (1)
    y_probs = probability[:, model.classes_.tolist().index(1)]
    
    return y_predicted, y_probs

Try stratified cross validation

In [None]:
def test(model, data_to_predict, y_predicted, y_true, y_probs):
    '''
    Esta función calcula las métricas del modelo y las imprime
    Toma como clase positiva (pos_label) a 1
    Se aplica validación cruzada para obtener todas las métricas  
    '''
    eval_metrics = ['accuracy', 'precision', 'recall', 'f1']
    #model_scores = cross_val_score(model, data_to_predict, y_true.values.ravel(), cv=3)
    model_scores = cross_validate(model, data_to_predict, y_true, cv=3, scoring = eval_metrics, return_train_score = True)
    
    
    acc = np.mean(model_scores['test_accuracy'])
    precision = np.mean(model_scores['test_precision'])
    recall = np.mean(model_scores['test_recall'])
    f1 = np.mean(model_scores['test_f1'])
    
    overfit_acc = (np.mean(model_scores['train_accuracy']) - acc) * 100
    overfit_f1 = (np.mean(model_scores['train_f1']) - f1) * 100
    
    conf_matrix = confusion_matrix(y_true, y_predicted)
    fpr, tpr, _ = roc_curve(y_true, y_probs, pos_label = 1)
    roc_auc = auc(fpr, tpr)
   
    #print('Model: {} || Accuracy: {} || Precision: {} || Recall: {} || F1: {}'.format(model, acc, precision, recall, f1))
    #print()
    #print('Model: {} || Overfitting Accuracy: {}, || Overfitting F1: {}'.format(model, overfit_acc, overfit_f1))
    #print()
    #print("Matriz de Confusión:")
    #print(conf_matrix)
    #print()
    #print("Curva ROC-AUC:")
    return {
        'Model': model,
        'Accuracy': acc,
        'Precision': precision,
        'Recall': recall,
        'F1-score': f1,
        'Overfitting Acc% ': overfit_acc,
        'Overfitting F1 %': overfit_f1,
        'Confusion Matrix': conf_matrix.tolist(),
        'ROC Curve': (fpr, tpr, auc)
    }

    #plot_roc_curve(y_true, y_probs)

def plot_roc_curve(y_true, y_probs):
    '''
    Esta función muestra la curva ROC del modelo. Recibe como parámetros 
    la y verdadera y las probabilidades calculadas en la función prediction.
    '''
    fpr, tpr, _ = roc_curve(y_true, y_probs, pos_label = 1)
    roc_auc = auc(fpr, tpr)

    plt.figure(figsize = (8, 6))
    plt.plot(fpr, tpr, color = 'darkorange', lw = 2, label = 'Curva ROC (Área (AUC) = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color = 'navy', lw = 2, linestyle = '--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Tasa de Falsos Positivos')
    plt.ylabel('Tasa de Verdaderos Positivos')
    plt.title('Curva ROC')
    plt.legend(loc = 'lower right')
    plt.show()

In [None]:
def plot_metrics(df_metrics):
    for i in range(len(df_metrics)):
        plt.figure(figsize=(12, 4))
        
        # Graph of conf matrix
        plt.subplot(1, 2, 1)
        confusion_matrix = df_metrics['Confusion Matrix'][i]
        sns.heatmap(confusion_matrix, annot=True, fmt='g', cmap='crest')
        plt.title(df_metrics['Model'][i])
        
        # Create labels 
        plt.text(0.5, 0.25, "True Positive", ha='center', va='center', fontsize=9, color='white')
        plt.text(0.5, 1.25, "False Negative", ha='center', va='center', fontsize=9, color='black')
        plt.text(1.5, 0.25, "False Positive", ha='center', va='center', fontsize=9, color='black')
        plt.text(1.5, 1.25, "True Negative", ha='center', va='center', fontsize=9, color='white')
        
        # Graph of ROC curve 
        #fpr_train, tpr_train, auc_train = df_metrics['ROC Curve'][i]
        
        # Graficar la curva ROC del conjunto de prueba
        #fpr_test, tpr_test, auc_test = df_metrics['ROC Curve Test'][i]
        fpr, tpr, roc_auc = df_metrics['ROC Curve'][i]
        
        
        plt.subplot(1, 2, 2)
        plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'AUC = {auc:.2f}')
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver Operating Characteristic')
        plt.legend(loc='lower right')
        
        plt.tight_layout()
        plt.show()



In [None]:
def train_predict_test(model, data_to_predict, y_true):
    '''
    Esta función agrupa todas las anteriores para hacer un sólo llamado
    ''' 
     # Evaluate the models
    metrics = []
    for model in models:
        model.fit(data_to_predict, y_true)
        y_predicted, y_probs = prediction(model, data_to_predict)
        model_metrics = test(model, data_to_predict, y_predicted, y_true, y_probs)
        metrics.append(model_metrics)

    return pd.DataFrame(metrics)

### Models to Test

In [None]:
models = [
    RandomForestClassifier(), 
    LogisticRegression(), 
    AdaBoostClassifier(n_estimators=50),
    GradientBoostingClassifier(learning_rate=0.3,n_estimators=50),
    LGBMClassifier(),
    XGBClassifier(),
    KNeighborsClassifier()
]

## Null Imputation: smoking_status -> KNN

In [None]:
X_train_2 = X_train.copy()
y_train_2 = y_train.copy()

In [None]:
X_train_2.smoking_status.value_counts()

never smoked       1455
Unknown            1197
formerly smoked     712
smokes              620
Name: smoking_status, dtype: int64

In [None]:
X_train_2 = impute_smokers_age(X_train_2)

In [None]:
X_train_2.iloc[:,9].value_counts()

never smoked       1868
formerly smoked     712
smokes              620
Name: smoking_status, dtype: int64

In [None]:
X_train_2.isnull().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                    0
smoking_status       784
dtype: int64

To apply a KNN imputer, we need to transform categorical columns into numerical. With the .info() method, we verify which columns should be changed later with the encoder.

In [None]:
X_train_2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3984 entries, 4428 to 2029
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             3984 non-null   object 
 1   age                3984 non-null   float64
 2   hypertension       3984 non-null   int64  
 3   heart_disease      3984 non-null   int64  
 4   ever_married       3984 non-null   object 
 5   work_type          3984 non-null   object 
 6   Residence_type     3984 non-null   object 
 7   avg_glucose_level  3984 non-null   float64
 8   bmi                3984 non-null   float64
 9   smoking_status     3200 non-null   object 
dtypes: float64(3), int64(2), object(5)
memory usage: 342.4+ KB


We have to transform the categorical columns with OneHotEncoder() and the smoking_status column with OrdinalEncoder()

In [None]:
X_train_2, y_train_2 = smote_balance(X_train_2, y_train_2)

print(Counter(y_train_2))

Counter({0: 3786, 1: 3786})


In [None]:
cat_columns, num_columns = cat_num_separation(X_train_2)
print(cat_columns)
print(num_columns)


['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi']


In [None]:
categorical_transformer = Pipeline([('ordinal', OrdinalEncoder()), ('knn', KNNImputer(n_neighbors=5))])

We could also try with OrdinalEncoder, specifying the order as: never smoked = 0, formerly smoker = 1, smokes = 2

Also change parameters  of the KNNImpute and add definition of missing values= 'Unknown' to avoid conversion clean out part of the code from function smoking_status_age 

In [None]:
num_trans = ('num', StandardScaler(), num_columns)
cat_trans = ('cat', categorical_transformer, cat_columns)
# imputer_trans = ('knn', KNNImputer(n_neighbors=5), )

preprocessor = ColumnTransformer([num_trans, cat_trans], remainder = 'passthrough')

In [None]:
X_transformed_2 = preprocessor.fit_transform(X_train_2)

### KNN Verification (was the X_train_2 correctly transformed?)

In [None]:
new = pd.DataFrame(data=X_transformed_2)
new.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-1.411387,-0.323163,-0.207305,1.214923,-0.748507,0.0,0.0,1.0,0.0,2.0
1,-2.365025,-0.323163,-0.207305,-0.604433,-1.103521,1.0,0.0,3.0,0.0,1.0
2,0.632124,-0.323163,-0.207305,-0.15501,1.297053,0.0,1.0,1.0,1.0,2.0
3,-0.412337,-0.323163,-0.207305,0.136309,0.198199,0.0,1.0,1.0,1.0,1.0
4,-0.139869,3.094417,-0.207305,-0.34785,1.297053,1.0,1.0,1.0,0.0,1.0


In [None]:
new.isnull().sum()

0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
9    0
dtype: int64

### Model Testing after KNN Imputation and Preprocessing

In [None]:
for model in models:
    df_metrics = train_predict_test(model, X_transformed_2, y_train_2)

[LightGBM] [Info] Number of positive: 3786, number of negative: 3786
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000371 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 791
[LightGBM] [Info] Number of data points in the train set: 7572, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2524, number of negative: 2524
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000239 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 791
[LightGBM] [Info] Number of data points in the train set: 5048, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[L

In [None]:
df_metrics

Unnamed: 0,Model,Accuracy,Precision,Recall,F1-score,Overfitting Acc%,Overfitting F1 %,Confusion Matrix,ROC Curve
0,"(DecisionTreeClassifier(max_features='sqrt', r...",0.938986,0.920199,0.961965,0.940168,6.101426,5.983214,"[[3786, 0], [0, 3786]]","([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
1,LogisticRegression(),0.767697,0.748715,0.806128,0.77618,0.184892,0.157538,"[[2759, 1027], [727, 3059]]","([0.0, 0.0002641310089804543, 0.00052826201796..."
2,"(DecisionTreeClassifier(max_depth=1, random_st...",0.811014,0.791116,0.845747,0.817134,1.069731,1.13312,"[[2949, 837], [518, 3268]]","([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
3,([DecisionTreeRegressor(criterion='friedman_ms...,0.883386,0.859283,0.917063,0.886981,2.713946,2.667361,"[[3298, 488], [241, 3545]]","([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
4,LGBMClassifier(),0.940438,0.930329,0.953249,0.94076,4.490227,4.462084,"[[3686, 100], [75, 3711]]","([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
5,"XGBClassifier(base_score=None, booster=None, c...",0.938854,0.923444,0.957475,0.939649,5.731643,5.653077,"[[3751, 35], [19, 3767]]","([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
6,KNeighborsClassifier(),0.871236,0.817662,0.95589,0.881257,3.796883,3.330337,"[[3272, 514], [100, 3686]]","([0.0, 0.0, 0.06735340729001585, 0.13576333861..."


## Null Elimination: smoking_status -> drop column

In [None]:
X_train_3 = X_train.copy()
y_train_3 = y_train.copy()

### Oversampling 

In [None]:
X_train_3, y_train_3 = smote_balance(X_train_3, y_train_3)

print(Counter(y_train_3))

Counter({0: 3786, 1: 3786})


### "Smoking_status" drop off

In [None]:
X_train_3 = X_train_3.drop('smoking_status', axis = 1)
X_train_3.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi
0,Female,24.0,0,0,No,Private,Rural,187.99,24.9
1,Male,3.0,0,0,No,children,Rural,86.38,22.8
2,Female,69.0,0,0,Yes,Private,Urban,111.48,37.0
3,Female,46.0,0,0,Yes,Private,Urban,127.75,30.5
4,Male,52.0,1,0,Yes,Private,Rural,100.71,37.0


In [None]:
cat_columns, num_columns = cat_num_separation(X_train_3)

In [None]:
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(sparse=False, handle_unknown='ignore'))])

In [None]:
preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, num_columns), ('cat', categorical_transformer, cat_columns)], remainder = 'passthrough')

In [None]:
X_transformed_3 = preprocessor.fit_transform(X_train_3)

### Model testing after elimination of the column "Smoking_status" and Oversampling

In [None]:
for model in models:
    df.metrics = train_predict_test(model, X_transformed_3, y_train_3)

[LightGBM] [Info] Number of positive: 3786, number of negative: 3786
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000370 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 791
[LightGBM] [Info] Number of data points in the train set: 7572, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2524, number of negative: 2524
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000458 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 791
[LightGBM] [Info] Number of data points in the train set: 5048, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2524, number of negative: 25

In [None]:
df_metrics

Unnamed: 0,Model,Accuracy,Precision,Recall,F1-score,Overfitting Acc%,Overfitting F1 %,Confusion Matrix,ROC Curve
0,"(DecisionTreeClassifier(max_features='sqrt', r...",0.938457,0.918956,0.962229,0.939683,6.154253,6.031683,"[[3786, 0], [0, 3786]]","([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
1,LogisticRegression(),0.767697,0.748715,0.806128,0.77618,0.184892,0.157538,"[[2759, 1027], [727, 3059]]","([0.0, 0.0002641310089804543, 0.00052826201796..."
2,"(DecisionTreeClassifier(max_depth=1, random_st...",0.811014,0.791116,0.845747,0.817134,1.069731,1.13312,"[[2949, 837], [518, 3268]]","([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
3,([DecisionTreeRegressor(criterion='friedman_ms...,0.883254,0.85925,0.916799,0.886841,2.727153,2.681331,"[[3298, 488], [241, 3545]]","([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
4,LGBMClassifier(),0.940438,0.930329,0.953249,0.94076,4.490227,4.462084,"[[3686, 100], [75, 3711]]","([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
5,"XGBClassifier(base_score=None, booster=None, c...",0.938854,0.923444,0.957475,0.939649,5.731643,5.653077,"[[3751, 35], [19, 3767]]","([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
6,KNeighborsClassifier(),0.871236,0.817662,0.95589,0.881257,3.796883,3.330337,"[[3272, 514], [100, 3686]]","([0.0, 0.0, 0.06735340729001585, 0.13576333861..."


## No null imputation: leave 'Unknown' values in smoking_status

In [None]:
X_train_4 = X_train.copy()
y_train_4 = y_train.copy()

In [None]:
X_train_4, y_train_4 = smote_balance(X_train_4, y_train_4)
print(Counter(y_train_4))

Counter({0: 3786, 1: 3786})


In [None]:
cat_columns, num_columns = cat_num_separation(X_train_4)

In [None]:
numeric_transformer = Pipeline(steps=[('scaler', MinMaxScaler())])
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(sparse= False, handle_unknown = 'ignore'))])


In [None]:
preprocessor = ColumnTransformer(transformers = [('num', numeric_transformer, num_columns),('cat', categorical_transformer, cat_columns)], remainder= 'passthrough')

In [None]:
X_transformed_4 = preprocessor.fit_transform(X_train_4)

In [None]:
for model in models:
    df.metrics = train_predict_test(model,X_transformed_4,y_train_4)

[LightGBM] [Info] Number of positive: 3786, number of negative: 3786
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000418 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 797
[LightGBM] [Info] Number of data points in the train set: 7572, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 2524, number of negative: 2524
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000298 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 797
[LightGBM] [Info] Number of data points in the train set: 5048, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[L

In [None]:
df_metrics

Unnamed: 0,Model,Accuracy,Precision,Recall,F1-score,Overfitting Acc%,Overfitting F1 %,Confusion Matrix,ROC Curve
0,"(DecisionTreeClassifier(max_features='sqrt', r...",0.938457,0.918956,0.962229,0.939683,6.154253,6.031683,"[[3786, 0], [0, 3786]]","([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
1,LogisticRegression(),0.767697,0.748715,0.806128,0.77618,0.184892,0.157538,"[[2759, 1027], [727, 3059]]","([0.0, 0.0002641310089804543, 0.00052826201796..."
2,"(DecisionTreeClassifier(max_depth=1, random_st...",0.811014,0.791116,0.845747,0.817134,1.069731,1.13312,"[[2949, 837], [518, 3268]]","([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
3,([DecisionTreeRegressor(criterion='friedman_ms...,0.883254,0.85925,0.916799,0.886841,2.727153,2.681331,"[[3298, 488], [241, 3545]]","([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
4,LGBMClassifier(),0.940438,0.930329,0.953249,0.94076,4.490227,4.462084,"[[3686, 100], [75, 3711]]","([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
5,"XGBClassifier(base_score=None, booster=None, c...",0.938854,0.923444,0.957475,0.939649,5.731643,5.653077,"[[3751, 35], [19, 3767]]","([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
6,KNeighborsClassifier(),0.871236,0.817662,0.95589,0.881257,3.796883,3.330337,"[[3272, 514], [100, 3686]]","([0.0, 0.0, 0.06735340729001585, 0.13576333861..."


## Balancing the dataset with combined technique SMOTE+

In [None]:
input_2 = 'This option will be tested on data set without null imputation'

In [None]:
X_train_5 = X_train.copy()
y_train_5 = y_train.copy()

### Creating function for combined  method SMOTENC+ENN

In [None]:
X_train_5, y_train_5 = smote_balance(X_train_5, y_train_5)
print(Counter(y_train_5))

Counter({0: 3786, 1: 3786})


In [None]:
def enn_balance(X_train, y_train):
    enn = EditedNearestNeighbours(sampling_strategy = 'auto', n_jobs =-1, kind_sel='all')
    X_train, y_train = enn.fit_resample(X_train, y_train)
    return X_train, y_train  

In [None]:
def tom_balance(X_train, y_train):
    tom = TomekLinks(sampling_strategy = 'auto', n_jobs =-1)
    X_train, y_train = tom.fit_resample(X_train, y_train)
    return X_train, y_train 

In [None]:
cat_columns, num_columns = cat_num_separation(X_train_5)

In [None]:
numeric_transformer = Pipeline(steps=[('scaler', MinMaxScaler())])
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(sparse= False, handle_unknown = 'ignore'))])


In [None]:
preprocessor = ColumnTransformer(transformers = [('num', numeric_transformer, num_columns),('cat', categorical_transformer, cat_columns)], remainder= 'passthrough')

In [None]:
X_transformed_5 = preprocessor.fit_transform(X_train_5)

In [None]:
X_transformed_5, y_train_5 = enn_balance(X_transformed_5, y_train_5)
print(Counter(y_train_5))

Counter({0: 3786, 1: 3397})


In [None]:
for model in models:
    df.metrics = train_predict_test(model,X_transformed_5,y_train_5)

[LightGBM] [Info] Number of positive: 3397, number of negative: 3786
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000362 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 797
[LightGBM] [Info] Number of data points in the train set: 7183, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.472922 -> initscore=-0.108417
[LightGBM] [Info] Start training from score -0.108417
[LightGBM] [Info] Number of positive: 2264, number of negative: 2524
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000237 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 797
[LightGBM] [Info] Number of data points in the train set: 4788, number of used features: 19
[LightGBM] [Info] [binary:

In [None]:
df_metrics

Unnamed: 0,Model,Accuracy,Precision,Recall,F1-score,Overfitting Acc%,Overfitting F1 %,Confusion Matrix,ROC Curve
0,"(DecisionTreeClassifier(max_features='sqrt', r...",0.938457,0.918956,0.962229,0.939683,6.154253,6.031683,"[[3786, 0], [0, 3786]]","([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
1,LogisticRegression(),0.767697,0.748715,0.806128,0.77618,0.184892,0.157538,"[[2759, 1027], [727, 3059]]","([0.0, 0.0002641310089804543, 0.00052826201796..."
2,"(DecisionTreeClassifier(max_depth=1, random_st...",0.811014,0.791116,0.845747,0.817134,1.069731,1.13312,"[[2949, 837], [518, 3268]]","([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
3,([DecisionTreeRegressor(criterion='friedman_ms...,0.883254,0.85925,0.916799,0.886841,2.727153,2.681331,"[[3298, 488], [241, 3545]]","([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
4,LGBMClassifier(),0.940438,0.930329,0.953249,0.94076,4.490227,4.462084,"[[3686, 100], [75, 3711]]","([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
5,"XGBClassifier(base_score=None, booster=None, c...",0.938854,0.923444,0.957475,0.939649,5.731643,5.653077,"[[3751, 35], [19, 3767]]","([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
6,KNeighborsClassifier(),0.871236,0.817662,0.95589,0.881257,3.796883,3.330337,"[[3272, 514], [100, 3686]]","([0.0, 0.0, 0.06735340729001585, 0.13576333861..."


## Results comparison

### 

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=af12788f-aecc-4989-a302-f8b336f386d1' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>