# EDA GRUPO PULMÓN


In [1]:
#Librerías de análisis de datos:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import pickle

from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
#from xgboost import XGBClassifier
#from lightgbm import LGBMClassifier
#from catboost import CatBoostClassifier

from pickle import dump
from pickle import load

from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, recall_score, precision_score, roc_auc_score

import warnings
warnings.filterwarnings(action='ignore')

In [2]:
import sys
!{sys.executable} -m pip install xgboost lightgbm catboost



In [3]:
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [4]:
# Buenas prácticas.
path_to_data = "./stroke_dataset.csv"
data_stroke = pd.read_csv(path_to_data)

In [5]:
#Cantidad de entradas y columnas del dataset.
data_stroke.shape

(4981, 11)

In [6]:
# Visualizamos las 5 primeras entradas de nuestro dataset
data_stroke.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
2,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
3,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
4,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1


In [7]:
# Información del tipo de variables de nuestro dataset
data_stroke.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4981 entries, 0 to 4980
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             4981 non-null   object 
 1   age                4981 non-null   float64
 2   hypertension       4981 non-null   int64  
 3   heart_disease      4981 non-null   int64  
 4   ever_married       4981 non-null   object 
 5   work_type          4981 non-null   object 
 6   Residence_type     4981 non-null   object 
 7   avg_glucose_level  4981 non-null   float64
 8   bmi                4981 non-null   float64
 9   smoking_status     4981 non-null   object 
 10  stroke             4981 non-null   int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 428.2+ KB


In [8]:
# Vemos el número de observaciones distintas de cada variable
data_stroke.nunique()

gender                  2
age                   104
hypertension            2
heart_disease           2
ever_married            2
work_type               4
Residence_type          2
avg_glucose_level    3895
bmi                   342
smoking_status          4
stroke                  2
dtype: int64

In [9]:
#Visualización del número de observaciones distintas en la variable "Work_type"
data_stroke ["work_type"].value_counts()

Private          2860
Self-employed     804
children          673
Govt_job          644
Name: work_type, dtype: int64

In [10]:
# Buscamos datos nulos 
data_stroke.isnull().sum()

gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

### Vemos que nuestro dataset no tiene datos nulos 

In [11]:
#Clasificación de variables
categoricas=["gender", "ever_married","heart_disease", "hypertension","work_type","Residence_type","smoking_status", "stroke"]
numericas=["age","avg_glucose_level","bmi"]

In [12]:
# Descripción de los datos numéricos con sus métricas estadísticas más relevantes.

data_stroke[numericas].describe()

Unnamed: 0,age,avg_glucose_level,bmi
count,4981.0,4981.0,4981.0
mean,43.419859,105.943562,28.498173
std,22.662755,45.075373,6.790464
min,0.08,55.12,14.0
25%,25.0,77.23,23.7
50%,45.0,91.85,28.1
75%,61.0,113.86,32.6
max,82.0,271.74,48.9


In [13]:
#Comprobamos que no hay datos duplicados
data_stroke.duplicated().sum()

0

## Preprocesamiento: 

In [14]:
# Creamos una función con Onehot encode para poder hacer dummies.

def onehot_encode(df,column):
    df = df.copy()
    
    dummies = pd.get_dummies( df [column],prefix=column)
    df = pd.concat([df,dummies], axis=1)
    df = df.drop(column,axis=1)
    
    return df 
    

In [15]:
# Hacemos función que hace una copia como primer paso

def preprocess_inputs(df):
    df = df.copy()

#2.- Si quicieramos eliminar la columna Id, pero en este dato no existe.
    # df = df.drop("id", axis=1)
    
#3.- Despues de identificar las clases  dentro de cada variable y transformamos los datos en binarios
    df["ever_married"] = df["ever_married"].replace({"No":0,"Yes":1})
    df["gender"] = df["gender"].replace({"Male":0,"Female":1})
    df["Residence_type"] = df["Residence_type"].replace({"Urban":0,"Rural":1})
    
#4.-Haciendo Onehot_encode con las variables categóricas
    for column in ["work_type","smoking_status"]:
        df = onehot_encode(df,column = column)
        
        
#5.- Separamos la columna stroke (variable a predecir) 
    y = df['stroke']
    X = df.drop('stroke', axis=1)
    
#6.- Definiendo Train-test split 
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=1)
    
#7.- Escalamos las variables Xtest, Xtrain  
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)
    
   
    return X_train, X_test, y_train, y_test

In [16]:
# Definimos la variable preprocess_inputs 
X_train, X_test, y_train, y_test = preprocess_inputs(data_stroke)

In [17]:
# Veridficamos el funcionamiento de preprocess_inputs 
y_train 

2117    0
332     0
3915    0
4610    0
4485    0
       ..
2895    0
2763    0
905     0
3980    0
235     0
Name: stroke, Length: 3486, dtype: int64

In [18]:
#Calculamos la varianza para comprobar la escala en X_train
X_train.var()

gender                            1.000287
age                               1.000287
hypertension                      1.000287
heart_disease                     1.000287
ever_married                      1.000287
Residence_type                    1.000287
avg_glucose_level                 1.000287
bmi                               1.000287
work_type_Govt_job                1.000287
work_type_Private                 1.000287
work_type_Self-employed           1.000287
work_type_children                1.000287
smoking_status_Unknown            1.000287
smoking_status_formerly smoked    1.000287
smoking_status_never smoked       1.000287
smoking_status_smokes             1.000287
dtype: float64

## ENTRENAMOS


### Primer entrenamiento sin balanceo 


In [19]:
# Modelos elegidos para nuestro entrenamiesto de clasificación 
models = {
    "                   Logistic Regression": LogisticRegression(),
    "                   K-Nearest Neighbors": KNeighborsClassifier(),
    "                         Decision Tree": DecisionTreeClassifier(),
    "Support Vector Machine (Linear Kernel)": LinearSVC(),
    "   Support Vector Machine (RBF Kernel)": SVC(),
    "                        Neural Network": MLPClassifier(),
    "                         Random Forest": RandomForestClassifier(),
    "                     Gradient Boosting": GradientBoostingClassifier(),
    "                               XGBoost": XGBClassifier(eval_metric='mlogloss'),
    "                              LightGBM": LGBMClassifier(),
    "                              CatBoost": CatBoostClassifier(verbose=0)
}

# entrenamiento:
for name, model in models.items():
    model.fit(X_train, y_train)
    print(name + " trained.")



                   Logistic Regression trained.
                   K-Nearest Neighbors trained.
                         Decision Tree trained.
Support Vector Machine (Linear Kernel) trained.
   Support Vector Machine (RBF Kernel) trained.
                        Neural Network trained.
                         Random Forest trained.
                     Gradient Boosting trained.
                               XGBoost trained.
                              LightGBM trained.
                              CatBoost trained.


In [20]:
# Cantidad de datos que tienen casos de stroke afirmativos y negativos donde 0=No y 1=Si
y_train.value_counts()

0    3315
1     171
Name: stroke, dtype: int64

### Nos damos cuenta que el dataset esta muy desbalanceado puesto que hay muchos casos negativos y esto puede influir sesgando los datos , pero lo solucionaremos luego


In [21]:
# Sacamos las métricas en test y train 
print("Model Performance\n-----------------")
for name, model in models.items():
    y_pred = model.predict(X_test)
    
    y_pred_train = model.predict(X_train)
    
    print(
        "\n" + name + " Accuracy: {:.3f}%\n\t\t\t\t       F1-Score: {:.5f}\n\t\t\t\t              Recall: {:.5f}\n\t\t\t\t       Precision: {:.5f}\n\t\t\t\t       Roc_Au: {:.5f}\n\t\t\t\t"\
        .format(accuracy_score(y_test, y_pred) * 100, f1_score(y_test, y_pred),recall_score(y_test, y_pred),precision_score(y_test, y_pred),roc_auc_score(y_test, y_pred) ))
    
    print( confusion_matrix(y_test, y_pred))
    
    print(
        "\n" + name + " Accuracy_train: {:.3f}%\n\t\t\t\t       F1-Score_train: {:.5f}\n\t\t\t\t              Recall_Train: {:.5f}\n\t\t\t\t       Precision_Train: {:.5f}\n\t\t\t\t       Roc_Au_Train: {:.5f}\n\t\t\t\t"\
        .format(accuracy_score(y_train, y_pred_train) * 100, f1_score(y_train, y_pred_train),recall_score(y_train, y_pred_train),precision_score(y_train, y_pred_train),roc_auc_score(y_train, y_pred_train) ))

# Imprimiendo matriz de confusión 
    print(confusion_matrix(y_train, y_pred_train))
    
# Medimos el overfitting
    print(
    np.abs(((((accuracy_score(y_train, y_pred_train))-accuracy_score(y_test, y_pred))/(accuracy_score(y_test, y_pred)) *100)))
       )
           

Model Performance
-----------------

                   Logistic Regression Accuracy: 94.916%
				       F1-Score: 0.02564
				              Recall: 0.01299
				       Precision: 1.00000
				       Roc_Au: 0.50649
				
[[1418    0]
 [  76    1]]

                   Logistic Regression Accuracy_train: 95.095%
				       F1-Score_train: 0.00000
				              Recall_Train: 0.00000
				       Precision_Train: 0.00000
				       Roc_Au_Train: 0.50000
				
[[3315    0]
 [ 171    0]]
0.18782469048650188

                   K-Nearest Neighbors Accuracy: 94.716%
				       F1-Score: 0.04819
				              Recall: 0.02597
				       Precision: 0.33333
				       Roc_Au: 0.51158
				
[[1414    4]
 [  75    2]]

                   K-Nearest Neighbors Accuracy_train: 95.209%
				       F1-Score_train: 0.07735
				              Recall_Train: 0.04094
				       Precision_Train: 0.70000
				       Roc_Au_Train: 0.52002
				
[[3312    3]
 [ 164    7]]
0.5212334406228691

                        

### Primer intento de balanceo de datos con técnica de Oversampling


In [22]:
# comprobamos en numero de casos positivos y negativos en y_train
y_train.value_counts()

0    3315
1     171
Name: stroke, dtype: int64

In [23]:
# Definimos el oversample en los datos reservados para train y hacemos una copia para mayor seguridad
oversampled_data = pd.concat([X_train, y_train], axis=1).copy()

# igualamos la cantidad de datos 
num_samples = y_train.value_counts()[0] - y_train.value_counts()[1]
new_samples = oversampled_data.query("stroke == 1").sample(num_samples, replace=True, random_state=1)

# definimos para oversamplear
oversampled_data = pd.concat([oversampled_data, new_samples], axis=0).sample(frac=1.0, random_state=1).reset_index(drop=True)

# Definimos X e y con la data banceada 
y_oversampled = oversampled_data['stroke']
X_oversampled = oversampled_data.drop('stroke', axis=1)

#Def Train-Test-Split
X_oversampled_train, X_oversampled_test, y_oversampled_train, y_oversampled_test = train_test_split(X_oversampled, y_oversampled, train_size=0.7, shuffle=True, random_state=1)

In [24]:
# Verificamos que se ha hecho bien el split
y_oversampled_train

3253    1
4972    0
2408    0
6594    1
2761    0
       ..
905     1
5192    0
3980    0
235     0
5157    0
Name: stroke, Length: 4641, dtype: int64

In [25]:
#Visualización de la data oversampleada
oversampled_data 

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,work_type_Govt_job,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,stroke
0,0.860964,0.760070,-0.329816,4.153470,0.709390,1.009801,0.301358,1.210494,2.5208,-1.147292,-0.437479,-0.395245,-0.660113,-0.465568,-0.763614,2.380208,1
1,0.860964,-0.296505,-0.329816,-0.240763,0.709390,-0.990294,-0.192419,-1.199677,-0.3967,0.871618,-0.437479,-0.395245,-0.660113,-0.465568,1.309562,-0.420131,0
2,0.860964,1.200310,-0.329816,-0.240763,0.709390,-0.990294,1.957183,0.813698,-0.3967,-1.147292,2.285826,-0.395245,-0.660113,2.147912,-0.763614,-0.420131,1
3,-1.161489,0.451902,-0.329816,-0.240763,-1.409662,1.009801,-0.005382,-0.170945,-0.3967,0.871618,-0.437479,-0.395245,-0.660113,2.147912,-0.763614,-0.420131,0
4,0.860964,1.508478,-0.329816,-0.240763,0.709390,1.009801,-1.014280,-0.215034,-0.3967,-1.147292,2.285826,-0.395245,-0.660113,2.147912,-0.763614,-0.420131,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6625,0.860964,0.319830,-0.329816,-0.240763,0.709390,1.009801,-0.209362,-1.038019,-0.3967,-1.147292,2.285826,-0.395245,-0.660113,-0.465568,1.309562,-0.420131,0
6626,-1.161489,1.068238,-0.329816,-0.240763,0.709390,-0.990294,-0.636906,-0.156249,-0.3967,-1.147292,2.285826,-0.395245,-0.660113,-0.465568,-0.763614,2.380208,1
6627,-1.161489,1.332382,-0.329816,-0.240763,0.709390,-0.990294,-0.743847,0.402205,-0.3967,0.871618,-0.437479,-0.395245,-0.660113,-0.465568,-0.763614,2.380208,1
6628,-1.161489,-0.428577,-0.329816,-0.240763,-1.409662,-0.990294,0.037526,0.269940,-0.3967,0.871618,-0.437479,-0.395245,-0.660113,-0.465568,-0.763614,2.380208,0


In [26]:
# comprobamos el balanceo de la data
y_oversampled_train.value_counts()

1    2322
0    2319
Name: stroke, dtype: int64

In [27]:
# Entrenando los modelos con la data balanceada
models = {
    "                   Logistic Regression": LogisticRegression(),
    "                   K-Nearest Neighbors": KNeighborsClassifier(),
    "                         Decision Tree": DecisionTreeClassifier(),
    "Support Vector Machine (Linear Kernel)": LinearSVC(),
    "   Support Vector Machine (RBF Kernel)": SVC(),
    "                        Neural Network": MLPClassifier(),
    "                         Random Forest": RandomForestClassifier(),
    "                     Gradient Boosting": GradientBoostingClassifier(),
    "                               XGBoost": XGBClassifier(eval_metric='mlogloss'),
    "                              LightGBM": LGBMClassifier(),
    "                              CatBoost": CatBoostClassifier(verbose=0)
}


for name, model in models.items():
    model.fit(X_oversampled_train, y_oversampled_train)
    print(name + " Entrenado.")


                   Logistic Regression Entrenado.
                   K-Nearest Neighbors Entrenado.
                         Decision Tree Entrenado.
Support Vector Machine (Linear Kernel) Entrenado.
   Support Vector Machine (RBF Kernel) Entrenado.
                        Neural Network Entrenado.
                         Random Forest Entrenado.
                     Gradient Boosting Entrenado.
                               XGBoost Entrenado.
                              LightGBM Entrenado.
                              CatBoost Entrenado.


In [28]:
# Sacamos las métricas más importantes ya con los datos de Oversampled en X e y .

print("Model Performance\n-----------------")
for name, model in models.items():
    y_oversampled_pred = model.predict(X_oversampled_test)
    
    y_oversampled_pred_train = model.predict(X_oversampled_train)


    print(
        "\n" + name + " Accuracy: {:.3f}%\n\t\t\t\t       F1-Score: {:.5f}\n\t\t\t              Recall: {:.5f}\n\t\t\t\t       Precision: {:.5f}\n\t\t\t\t       Roc_Au: {:.5f}\n\t\t\t\t"\
        .format(accuracy_score(y_oversampled_test, y_oversampled_pred) * 100, f1_score(y_oversampled_test, y_oversampled_pred),recall_score(y_oversampled_test, y_oversampled_pred),precision_score(y_oversampled_test, y_oversampled_pred),roc_auc_score(y_oversampled_test, y_oversampled_pred) ))
    
    print( confusion_matrix(y_oversampled_test, y_oversampled_pred))
    
    print(
        "\n" + name + " Accuracy_train: {:.3f}%\n\t\t\t\t       F1-Score_train: {:.5f}\n\t\t\t\t        Recall_Train: {:.5f}\n\t\t\t\t       Precision_Train: {:.5f}\n\t\t\t\t       Roc_Au_Train: {:.5f}\n\t\t\t\t"\
        .format(accuracy_score(y_oversampled_train, y_oversampled_pred_train) * 100, f1_score(y_oversampled_train, y_oversampled_pred_train),recall_score(y_oversampled_train, y_oversampled_pred_train),precision_score(y_oversampled_train, y_oversampled_pred_train),roc_auc_score(y_oversampled_train, y_oversampled_pred_train) ))
 
    print( confusion_matrix(y_oversampled_train, y_oversampled_pred_train))
    
    print(
    "\n" + name + " Overfitting:\n\t\t\t\t",
    np.abs(((((accuracy_score(y_oversampled_train, y_oversampled_pred_train))-accuracy_score(y_oversampled_test, y_oversampled_pred))/(accuracy_score(y_oversampled_test, y_oversampled_pred)) *100)))
       )

Model Performance
-----------------

                   Logistic Regression Accuracy: 78.482%
				       F1-Score: 0.79383
			              Recall: 0.82981
				       Precision: 0.76085
				       Roc_Au: 0.78488
				
[[737 259]
 [169 824]]

                   Logistic Regression Accuracy_train: 77.591%
				       F1-Score_train: 0.78819
				        Recall_Train: 0.83333
				       Precision_Train: 0.74768
				       Roc_Au_Train: 0.77587
				
[[1666  653]
 [ 387 1935]]

                   Logistic Regression Overfitting:
				 1.1348036972636548

                   K-Nearest Neighbors Accuracy: 90.447%
				       F1-Score: 0.91268
			              Recall: 1.00000
				       Precision: 0.83939
				       Roc_Au: 0.90462
				
[[806 190]
 [  0 993]]

                   K-Nearest Neighbors Accuracy_train: 94.441%
				       F1-Score_train: 0.94737
				        Recall_Train: 1.00000
				       Precision_Train: 0.90000
				       Roc_Au_Train: 0.94437
				
[[2061  258]
 [   0 2322]]

        

### Al haber usado el metodo de oversampling para balancear , no podemos confiar en que haya duplicado datos y las predicciones no esten sesgadas. 
### Para elegir un modelo , vamos a probar otro método de balanceo y comparamos las métricas .

## Segunda técnica de balanceo

In [29]:
# Segundo intento de balanceo de datos con técnica de SmoteTomek

data_stroke

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
2,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
3,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
4,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
4976,Male,41.0,0,0,No,Private,Rural,70.15,29.8,formerly smoked,0
4977,Male,40.0,0,0,Yes,Private,Urban,191.15,31.1,smokes,0
4978,Female,45.0,1,0,Yes,Govt_job,Rural,95.02,31.8,smokes,0
4979,Male,40.0,0,0,Yes,Private,Rural,83.94,30.0,smokes,0


In [30]:
# Aqui sacamos las X y y para procesarlas fuera de la función
#Eliminamos Transported de X porque es el valor a predecir. La X es mayúscula
y = data_stroke['stroke']
X = data_stroke.drop('stroke', axis=1)

In [31]:
#Comprobamos que se ha eliminado la variable Stroke del eje X
X.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked
1,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked
2,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes
3,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked
4,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked


In [32]:
# Comprobamos que Y es la variable del stroke.
y.head()

0    1
1    1
2    1
3    1
4    1
Name: stroke, dtype: int64

In [33]:
# Clasificación de las variables
categoricas=["gender", "ever_married","heart_disease", "hypertension","work_type","Residence_type","smoking_status"]
numericas=["age","avg_glucose_level","bmi"]

In [34]:
#Importanción de Sklearn
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer

In [35]:
#Creamos transformers para cada categoría de variables.
transformer_numerico = ("transformer_numerica", MinMaxScaler(), numericas)
transformer_categorico = ("transformer_categorica", OneHotEncoder(), categoricas)
transformer = ColumnTransformer([transformer_numerico, transformer_categorico], remainder="passthrough")

In [36]:
# Denominamos la X para transformer
X = transformer.fit_transform(X)

In [37]:
# Se crea un DataFrame con los transformers.
transformer_final= pd.DataFrame(X, columns = transformer.get_feature_names_out())

In [38]:
# Visualizamos el DataFrame con transformers
transformer_final

Unnamed: 0,transformer_numerica__age,transformer_numerica__avg_glucose_level,transformer_numerica__bmi,transformer_categorica__gender_Female,transformer_categorica__gender_Male,transformer_categorica__ever_married_No,transformer_categorica__ever_married_Yes,transformer_categorica__heart_disease_0,transformer_categorica__heart_disease_1,transformer_categorica__hypertension_0,...,transformer_categorica__work_type_Govt_job,transformer_categorica__work_type_Private,transformer_categorica__work_type_Self-employed,transformer_categorica__work_type_children,transformer_categorica__Residence_type_Rural,transformer_categorica__Residence_type_Urban,transformer_categorica__smoking_status_Unknown,transformer_categorica__smoking_status_formerly smoked,transformer_categorica__smoking_status_never smoked,transformer_categorica__smoking_status_smokes
0,0.816895,0.801265,0.647564,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,0.975586,0.234512,0.530086,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0.597168,0.536008,0.584527,1.0,0.0,0.0,1.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,0.963379,0.549349,0.286533,1.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,0.987793,0.605161,0.429799,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4976,0.499512,0.069384,0.452722,0.0,1.0,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4977,0.487305,0.627966,0.489971,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4978,0.548340,0.184194,0.510029,1.0,0.0,0.0,1.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4979,0.487305,0.133044,0.458453,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [39]:
# Vemos las variables numéricas y categóricas.
transformer.output_indices_

{'transformer_numerica': slice(0, 3, None),
 'transformer_categorica': slice(3, 21, None),
 'remainder': slice(0, 0, None)}

In [40]:
#Cargando el archivo de transformer a pickle para subirlo a streamlit

file_trans = open('transformer_final.pkl', 'wb')
pickle.dump(transformer, file_trans)

In [41]:
# Sumamos la variable Y en positivos para después tratarlo.
positivo = y.sum()

In [42]:
# Para sacar los valores negativos hemos de restar los positivos.
negativo = y.shape[0]-positivo

In [54]:
# Mostramos los valores positivos y negativos.
print("Rpta Positivos: ", positivo, "Rpta Negativos:", negativo)

Rpta Positivos:  248 Rpta Negativos: 4733


In [45]:
# Instalación de imblearn para utilizar el método de SmoteTomek (no desde terminal)
!pip install imblearn



In [87]:
# Importación de SmomteTomek
from imblearn.combine import SMOTETomek

smoteT = SMOTETomek()

In [88]:
# Definiendo las variables para balancear los datos

X_smoteT, y_smoteT = smoteT.fit_resample(X, y)
Positivo_smoteT = y_smoteT.sum()
Negativo_smoteT = y_smoteT.shape[0]-Positivo_smoteT


print("Rpta Positivas:",Positivo_smoteT,",Rpta Negativas:", Negativo_smoteT)
X_smoteT 

X_smoteT_train, X_smoteT_test, y_smoteT_train, y_smoteT_test = train_test_split(X_smoteT, y_smoteT, train_size=0.7, shuffle=True, random_state=1)


Rpta Positivas: 4674 ,Rpta Negativas: 4674


In [89]:
## Corriendo los modelos con balanceo SMOTETomek
models = {
    "                   Logistic Regression": LogisticRegression(),
    "                   K-Nearest Neighbors": KNeighborsClassifier(),
    "                         Decision Tree": DecisionTreeClassifier(),
    "Support Vector Machine (Linear Kernel)": LinearSVC(),
    "   Support Vector Machine (RBF Kernel)": SVC(),
   # "                        Neural Network": MLPClassifier(),
    "                         Random Forest": RandomForestClassifier(),
    "                     Gradient Boosting": GradientBoostingClassifier(),
    "                               XGBoost": XGBClassifier(eval_metric='mlogloss'),
    "                              LightGBM": LGBMClassifier(),
    "                              CatBoost": CatBoostClassifier(verbose=0)
}


for name, model in models.items():
    model.fit(X_smoteT_train, y_smoteT_train)
    print(name + " Entrenado.")


                               XGBoost Entrenado.


### Tras entrenar con los datos balanaceados aplicando SmoteTomek , comparamos y el que nos da una mejor métrica es XGBoost. 

In [90]:
# Recuento de valores positivos y negativos despues de balancear con SmoteTomek
y_smoteT_train.value_counts()

1    3283
0    3260
Name: stroke, dtype: int64

In [92]:
#Test con valores smoteT
print("Model Performance\n-----------------")
for name, model in models.items():
    y_smoteT_pred = model.predict(X_smoteT_test)
    
    y_smoteT_pred_train = model.predict(X_smoteT_train)


    print(
        "\n" + name + " Accuracy: {:.3f}%\n\t\t\t\t       F1-Score: {:.5f}\n\t\t\t              Recall: {:.5f}\n\t\t\t\t       Precision: {:.5f}\n\t\t\t\t       Roc_Au: {:.5f}\n\t\t\t\t"\
        .format(accuracy_score(y_smoteT_test, y_smoteT_pred) * 100, f1_score(y_smoteT_test, y_smoteT_pred),recall_score(y_smoteT_test, y_smoteT_pred),precision_score(y_smoteT_test, y_smoteT_pred),roc_auc_score(y_smoteT_test, y_smoteT_pred) ))
    
    print( confusion_matrix(y_smoteT_test, y_smoteT_pred))
    
    print(
        "\n" + name + " Accuracy_train: {:.3f}%\n\t\t\t\t       F1-Score_train: {:.5f}\n\t\t\t\t        Recall_Train: {:.5f}\n\t\t\t\t       Precision_Train: {:.5f}\n\t\t\t\t       Roc_Au_Train: {:.5f}\n\t\t\t\t"\
        .format(accuracy_score(y_smoteT_train, y_smoteT_pred_train) * 100, f1_score(y_smoteT_train, y_smoteT_pred_train),recall_score(y_smoteT_train, y_smoteT_pred_train),precision_score(y_smoteT_train, y_smoteT_pred_train),roc_auc_score(y_smoteT_train, y_smoteT_pred_train) ))
   
    
    print( confusion_matrix(y_smoteT_train, y_smoteT_pred_train))
    
    print(
    "\n" + name + " Overfitting accuracy:\n\t\t\t\t",
    np.abs(((((accuracy_score(y_smoteT_train, y_smoteT_pred_train))-accuracy_score(y_smoteT_test, y_smoteT_pred))/(accuracy_score(y_smoteT_test, y_smoteT_pred)) *100)))
       )
    
    print(
    "\n" + name + " Overfitting recall:\n\t\t\t\t",
    np.abs(((((recall_score(y_smoteT_train, y_smoteT_pred_train))-recall_score(y_smoteT_test, y_smoteT_pred))/(recall_score(y_smoteT_test, y_smoteT_pred)) *100)))
       )
# Overfitting        
    print(
    "\n" + name + " Overfitting presicion:\n\t\t\t\t",
    np.abs(((((precision_score(y_smoteT_train, y_smoteT_pred_train))-precision_score(y_smoteT_test, y_smoteT_pred))/(precision_score(y_smoteT_test, y_smoteT_pred)) *100)))
       )

Model Performance
-----------------

                               XGBoost Accuracy: 96.078%
				       F1-Score: 0.96026
			              Recall: 0.95543
				       Precision: 0.96514
				       Roc_Au: 0.96074
				
[[1366   48]
 [  62 1329]]

                               XGBoost Accuracy_train: 99.694%
				       F1-Score_train: 0.99695
				        Recall_Train: 0.99482
				       Precision_Train: 0.99908
				       Roc_Au_Train: 0.99695
				
[[3257    3]
 [  17 3266]]

                               XGBoost Overfitting accuracy:
				 3.763486137233427

                               XGBoost Overfitting recall:
				 4.1231856106210545

                               XGBoost Overfitting presicion:
				 3.5166524302791013


In [93]:
# Elegimos el modelo y lo guardamos en una variable para poder utilizar Pickle.
modelo_elegido = XGBClassifier()
modelo_elegido.fit(X,y)

In [94]:
# Utilizamos pickle para guardar el modelo y poder utilizar en streamlit.
modelo_pickle = 'XGBC.pkl'
pickle.dump(modelo_elegido, open(modelo_pickle, 'wb'))