# EDA GRUPO PULMÓN


In [1]:
#Librerías de análisis de datos:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import pickle

from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
#from xgboost import XGBClassifier
#from lightgbm import LGBMClassifier
#from catboost import CatBoostClassifier

from pickle import dump
from pickle import load

from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, recall_score, precision_score, roc_auc_score

import warnings
warnings.filterwarnings(action='ignore')

In [2]:
# Install xgboost, lightgbm and catboost with pip, invoked through the
# interpreter that runs this kernel (sys.executable).
import sys
!{sys.executable} -m pip install xgboost lightgbm catboost



In [3]:
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [4]:
# Keep the CSV location in a variable and load the stroke dataset from it.
path_to_data = "./stroke_dataset.csv"
data_stroke = pd.read_csv(path_to_data)

FileNotFoundError: [Errno 2] No such file or directory: './stroke_dataset.csv'

In [None]:
# Number of rows and columns in the dataset.
data_stroke.shape

In [None]:
# Show the first 5 rows of the dataset.
data_stroke.head()

In [None]:
# Column dtypes and non-null counts of the dataset.
data_stroke.info()

In [None]:
# Number of distinct values in each column.
data_stroke.nunique()

In [None]:
# Frequency of each distinct value in the "work_type" column.
data_stroke ["work_type"].value_counts()

In [None]:
# Count the missing values in each column.
data_stroke.isnull().sum()

### Vemos que nuestro dataset no tiene datos nulos 

In [None]:
# Column groups: categorical columns (this list also contains the target
# "stroke") and numeric columns.
categoricas=["gender", "ever_married","heart_disease", "hypertension","work_type","Residence_type","smoking_status", "stroke"]
numericas=["age","avg_glucose_level","bmi"]

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.

data_stroke[numericas].describe()

In [None]:
# Count fully duplicated rows.
data_stroke.duplicated().sum()

## Preprocesamiento: 

In [None]:
# Creamos una función con Onehot encode para poder hacer dummies.

def onehot_encode(df,column):
    """Return a copy of ``df`` with ``column`` replaced by indicator columns.

    The indicator columns are produced by ``pd.get_dummies`` and are named
    ``<column>_<value>``. They are appended after the existing columns and
    the original ``column`` is then dropped. The input frame is not modified.
    """
    indicators = pd.get_dummies(df[column], prefix=column)
    widened = pd.concat([df.copy(), indicators], axis=1)
    return widened.drop(column, axis=1)
    

In [None]:
# Hacemos función que hace una copia como primer paso

def preprocess_inputs(df, *, train_size=0.7, random_state=1):
    """Encode the stroke dataset, split it, and standardise the features.

    Steps, in order:

    1. Work on a copy of ``df`` so the caller's frame is untouched.
    2. Recode the two-valued text columns ``ever_married``, ``gender`` and
       ``Residence_type`` as 0/1.
    3. One-hot encode ``work_type`` and ``smoking_status``.
    4. Separate the target column ``stroke`` from the features.
    5. Make a shuffled train/test split.
    6. Fit a ``StandardScaler`` on the training features only and apply it
       to both the training and the test features.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw dataset containing the columns named above plus ``stroke``.
    train_size : float, default 0.7
        Fraction of rows assigned to the training split.
    random_state : int, default 1
        Seed passed to ``train_test_split``.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Scaled features, keeping the original index and column names.
    y_train, y_test : pandas.Series
        Target values for the corresponding rows.
    """
    df = df.copy()

    # The dataset has no "id" column; if it had one it would be dropped here:
    # df = df.drop("id", axis=1)

    # Binary text columns -> 0/1. `replace` leaves any value that is not
    # listed in the mapping unchanged.
    binary_codes = {
        "ever_married": {"No": 0, "Yes": 1},
        "gender": {"Male": 0, "Female": 1},
        "Residence_type": {"Urban": 0, "Rural": 1},
    }
    for column, codes in binary_codes.items():
        df[column] = df[column].replace(codes)

    # Multi-valued categorical columns -> indicator columns.
    for column in ["work_type", "smoking_status"]:
        df = onehot_encode(df, column=column)

    # Target to predict versus features.
    y = df["stroke"]
    X = df.drop("stroke", axis=1)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, shuffle=True, random_state=random_state
    )

    # The scaler statistics come from the training rows only; the test rows
    # are transformed with those same statistics.
    scaler = StandardScaler()
    X_train = pd.DataFrame(
        scaler.fit_transform(X_train), index=X_train.index, columns=X_train.columns
    )
    X_test = pd.DataFrame(
        scaler.transform(X_test), index=X_test.index, columns=X_test.columns
    )

    return X_train, X_test, y_train, y_test

In [None]:
# Run the preprocessing on the dataset and keep the four resulting splits.
X_train, X_test, y_train, y_test = preprocess_inputs(data_stroke)

In [None]:
# Display y_train to check that preprocess_inputs worked.
y_train 

In [None]:
# Per-column variance of X_train, used to check the scaling.
X_train.var()

## ENTRENAMOS


### Primer entrenamiento sin balanceo 


In [None]:
# Candidate classifiers, built with their default settings except for the
# arguments shown. The display names are left-padded with spaces so the
# printed reports line up.
models = {
    "                   Logistic Regression": LogisticRegression(),
    "                   K-Nearest Neighbors": KNeighborsClassifier(),
    "                         Decision Tree": DecisionTreeClassifier(),
    "Support Vector Machine (Linear Kernel)": LinearSVC(),
    "   Support Vector Machine (RBF Kernel)": SVC(),
    "                        Neural Network": MLPClassifier(),
    "                         Random Forest": RandomForestClassifier(),
    "                     Gradient Boosting": GradientBoostingClassifier(),
    "                               XGBoost": XGBClassifier(eval_metric='mlogloss'),
    "                              LightGBM": LGBMClassifier(),
    "                              CatBoost": CatBoostClassifier(verbose=0)
}

# Training: fit every candidate on the training split as returned by
# preprocess_inputs (no class balancing applied yet).
for name, model in models.items():
    model.fit(X_train, y_train)
    print(name + " trained.")



In [None]:
# Number of training rows per stroke class, where 0 = no and 1 = yes.
y_train.value_counts()

### Nos damos cuenta de que el dataset está muy desbalanceado, puesto que hay muchos más casos negativos que positivos; esto puede sesgar los modelos, pero lo solucionaremos luego.


In [None]:
# Report test and train metrics for every fitted model: accuracy, F1, recall,
# precision and ROC AUC (all computed from the hard class predictions), each
# followed by its confusion matrix, and finally the overfitting figure.
print("Model Performance\n-----------------")
for name, model in models.items():
    y_pred = model.predict(X_test)

    y_pred_train = model.predict(X_train)

    # Compute each accuracy once; both values are reused for the overfitting figure.
    acc_test = accuracy_score(y_test, y_pred)
    acc_train = accuracy_score(y_train, y_pred_train)

    print(
        "\n" + name + " Accuracy: {:.3f}%\n\t\t\t\t       F1-Score: {:.5f}\n\t\t\t\t              Recall: {:.5f}\n\t\t\t\t       Precision: {:.5f}\n\t\t\t\t       Roc_Au: {:.5f}\n\t\t\t\t"\
        .format(acc_test * 100, f1_score(y_test, y_pred),recall_score(y_test, y_pred),precision_score(y_test, y_pred),roc_auc_score(y_test, y_pred) ))

    # Confusion matrix on the test split.
    print( confusion_matrix(y_test, y_pred))

    print(
        "\n" + name + " Accuracy_train: {:.3f}%\n\t\t\t\t       F1-Score_train: {:.5f}\n\t\t\t\t              Recall_Train: {:.5f}\n\t\t\t\t       Precision_Train: {:.5f}\n\t\t\t\t       Roc_Au_Train: {:.5f}\n\t\t\t\t"\
        .format(acc_train * 100, f1_score(y_train, y_pred_train),recall_score(y_train, y_pred_train),precision_score(y_train, y_pred_train),roc_auc_score(y_train, y_pred_train) ))

    # Confusion matrix on the training split.
    print(confusion_matrix(y_train, y_pred_train))

    # Overfitting: absolute train/test accuracy gap as a percentage of the
    # test accuracy. Printed with a label so the bare number can be told
    # apart from the surrounding output.
    print(
        "\n" + name + " Overfitting:\n\t\t\t\t",
        np.abs((acc_train - acc_test) / acc_test * 100)
    )
           

### Primer intento de balanceo de datos con técnica de Oversampling


In [None]:
# Number of positive and negative cases in y_train.
y_train.value_counts()

In [None]:
# Random oversampling of the minority class, applied to the training split only.
# Features and target are joined in a copy so whole rows can be re-sampled.
oversampled_data = pd.concat([X_train, y_train], axis=1).copy()

# Number of extra positive rows needed for both classes to have the same size.
class_counts = y_train.value_counts()
num_samples = class_counts.loc[0] - class_counts.loc[1]
new_samples = oversampled_data.query("stroke == 1").sample(num_samples, replace=True, random_state=1)

# Append the duplicated positive rows and shuffle the result.
oversampled_data = pd.concat([oversampled_data, new_samples], axis=0).sample(frac=1.0, random_state=1).reset_index(drop=True)

# X and y of the balanced data.
y_oversampled = oversampled_data['stroke']
X_oversampled = oversampled_data.drop('stroke', axis=1)

# Train on every balanced row and evaluate on the original held-out split.
# Splitting the oversampled frame again would put copies of the same minority
# row on both sides of the split, so the "test" rows would already have been
# seen during training and the test metrics would be inflated.
X_oversampled_train, y_oversampled_train = X_oversampled, y_oversampled
X_oversampled_test, y_oversampled_test = X_test, y_test

In [None]:
# Display y_oversampled_train to check the split.
y_oversampled_train

In [None]:
# Display the oversampled data.
oversampled_data 

In [None]:
# Check the class balance of the oversampled training target.
y_oversampled_train.value_counts()

In [None]:
# Fresh instances of the same candidate classifiers, to be trained on the
# balanced (oversampled) data. Display names are left-padded with spaces so
# the printed reports line up.
models = {
    "                   Logistic Regression": LogisticRegression(),
    "                   K-Nearest Neighbors": KNeighborsClassifier(),
    "                         Decision Tree": DecisionTreeClassifier(),
    "Support Vector Machine (Linear Kernel)": LinearSVC(),
    "   Support Vector Machine (RBF Kernel)": SVC(),
    "                        Neural Network": MLPClassifier(),
    "                         Random Forest": RandomForestClassifier(),
    "                     Gradient Boosting": GradientBoostingClassifier(),
    "                               XGBoost": XGBClassifier(eval_metric='mlogloss'),
    "                              LightGBM": LGBMClassifier(),
    "                              CatBoost": CatBoostClassifier(verbose=0)
}


# Fit every candidate on the oversampled training data.
for name, model in models.items():
    model.fit(X_oversampled_train, y_oversampled_train)
    print(name + " Entrenado.")


In [None]:
# Evaluate every model fitted on the oversampled data: metrics and confusion
# matrix for the test part, then for the train part, then the relative
# train/test accuracy gap.

print("Model Performance\n-----------------")
for name, model in models.items():
    y_oversampled_pred = model.predict(X_oversampled_test)
    y_oversampled_pred_train = model.predict(X_oversampled_train)

    # (true labels, predictions) pairs, unpacked into each metric call below.
    test_pair = (y_oversampled_test, y_oversampled_pred)
    train_pair = (y_oversampled_train, y_oversampled_pred_train)

    test_accuracy = accuracy_score(*test_pair)
    train_accuracy = accuracy_score(*train_pair)

    test_report = " Accuracy: {:.3f}%\n\t\t\t\t       F1-Score: {:.5f}\n\t\t\t              Recall: {:.5f}\n\t\t\t\t       Precision: {:.5f}\n\t\t\t\t       Roc_Au: {:.5f}\n\t\t\t\t".format(
        test_accuracy * 100,
        f1_score(*test_pair),
        recall_score(*test_pair),
        precision_score(*test_pair),
        roc_auc_score(*test_pair),
    )
    print("\n" + name + test_report)
    print(confusion_matrix(*test_pair))

    train_report = " Accuracy_train: {:.3f}%\n\t\t\t\t       F1-Score_train: {:.5f}\n\t\t\t\t        Recall_Train: {:.5f}\n\t\t\t\t       Precision_Train: {:.5f}\n\t\t\t\t       Roc_Au_Train: {:.5f}\n\t\t\t\t".format(
        train_accuracy * 100,
        f1_score(*train_pair),
        recall_score(*train_pair),
        precision_score(*train_pair),
        roc_auc_score(*train_pair),
    )
    print("\n" + name + train_report)
    print(confusion_matrix(*train_pair))

    # Absolute train/test accuracy gap as a percentage of the test accuracy.
    accuracy_gap = np.abs((train_accuracy - test_accuracy) / test_accuracy * 100)
    print("\n" + name + " Overfitting:\n\t\t\t\t", accuracy_gap)

### Al haber usado el método de oversampling para balancear, se han duplicado datos, por lo que no podemos confiar en que las predicciones no estén sesgadas.
### Para elegir un modelo, vamos a probar otro método de balanceo y comparar las métricas.

## Segunda técnica de balanceo

In [None]:
# Second balancing attempt, using the SMOTETomek technique.
# Display the dataset as the starting point.

data_stroke

In [None]:
# Extract X and y here so they can be processed outside the preprocessing function.
# 'stroke' is removed from X because it is the value to predict.
y = data_stroke['stroke']
X = data_stroke.drop('stroke', axis=1)

In [None]:
# Check that the 'stroke' column has been removed from X.
X.head()

In [None]:
# Check that y holds the 'stroke' column.
y.head()

In [None]:
# Column groups for the feature matrix X: categorical and numeric columns
# (the target 'stroke' is no longer listed).
categoricas=["gender", "ever_married","heart_disease", "hypertension","work_type","Residence_type","smoking_status"]
numericas=["age","avg_glucose_level","bmi"]

In [None]:
#Importanción de Sklearn
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer

In [None]:
# One transformer per column group, as (name, estimator, columns) tuples:
# MinMaxScaler for the numeric columns and OneHotEncoder for the categorical ones.
# Columns in neither group are passed through unchanged (remainder="passthrough").
transformer_numerico = ("transformer_numerica", MinMaxScaler(), numericas)
transformer_categorico = ("transformer_categorica", OneHotEncoder(), categoricas)
transformer = ColumnTransformer([transformer_numerico, transformer_categorico], remainder="passthrough")

In [None]:
# Fit the transformer on X and replace X with the transformed features.
# NOTE(review): the transformer is fitted on the whole of X, before any
# train/test split of this data — confirm that this is intended.
X = transformer.fit_transform(X)

In [None]:
# Wrap the transformed features in a DataFrame labelled with the
# transformer's output feature names.
transformer_final= pd.DataFrame(X, columns = transformer.get_feature_names_out())

In [None]:
# Display the DataFrame of transformed features.
transformer_final

In [None]:
# Show which output columns belong to each transformer (numeric / categorical).
transformer.output_indices_

In [None]:
# Save the fitted transformer with pickle so it can be uploaded to Streamlit.
# The context manager closes (and therefore flushes) the file as soon as the
# dump finishes, instead of leaving the handle open.

with open('transformer_f.pkl', 'wb') as file_trans:
    pickle.dump(transformer, file_trans)

In [None]:
# Number of positive cases: the sum of the target column
# (assumes y is coded 0/1, as stated for the stroke column earlier in the notebook).
positivo = y.sum()

In [None]:
# Number of negative cases: total number of rows minus the positives.
negativo = y.shape[0]-positivo

In [None]:
# Print the number of positive and negative cases.
print("Rpta Positivos: ", positivo, "Rpta Negativos:", negativo)

In [None]:
# Instalación de imblearn para utilizar el método de SmoteTomek (no desde terminal)
!pip install imblearn

In [None]:
# Import SMOTETomek and create the resampler.
from imblearn.combine import SMOTETomek

# Fixed seed so the resampled data is reproducible between runs, in line
# with the seeded splits and sampling used elsewhere in this notebook.
smoteT = SMOTETomek(random_state=1)

In [None]:
# Balance the data with SMOTETomek.
# The test rows are held out BEFORE resampling: resampling the whole dataset
# and splitting afterwards would put synthetic rows in the test set and let
# information from test rows shape the training rows, inflating the metrics.
X_smoteT_train, X_smoteT_test, y_smoteT_train, y_smoteT_test = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=1)

# Resample the training part only; the test part keeps the original class ratio.
X_smoteT, y_smoteT = smoteT.fit_resample(X_smoteT_train, y_smoteT_train)
X_smoteT_train, y_smoteT_train = X_smoteT, y_smoteT

# Class counts after balancing (the target is summed, so it is assumed to be coded 0/1).
Positivo_smoteT = y_smoteT.sum()
Negativo_smoteT = y_smoteT.shape[0]-Positivo_smoteT

print("Rpta Positivas:",Positivo_smoteT,",Rpta Negativas:", Negativo_smoteT)


In [None]:
# Fresh instances of the candidate classifiers, to be trained on the
# SMOTETomek-balanced data. The neural network entry is commented out, so it
# is not part of this run. Display names are left-padded with spaces so the
# printed reports line up.
models = {
    "                   Logistic Regression": LogisticRegression(),
    "                   K-Nearest Neighbors": KNeighborsClassifier(),
    "                         Decision Tree": DecisionTreeClassifier(),
    "Support Vector Machine (Linear Kernel)": LinearSVC(),
    "   Support Vector Machine (RBF Kernel)": SVC(),
   # "                        Neural Network": MLPClassifier(),
    "                         Random Forest": RandomForestClassifier(),
    "                     Gradient Boosting": GradientBoostingClassifier(),
    "                               XGBoost": XGBClassifier(eval_metric='mlogloss'),
    "                              LightGBM": LGBMClassifier(),
    "                              CatBoost": CatBoostClassifier(verbose=0)
}


# Fit every candidate on the SMOTETomek training data.
for name, model in models.items():
    model.fit(X_smoteT_train, y_smoteT_train)
    print(name + " Entrenado.")


### Tras entrenar con los datos balanceados aplicando SMOTETomek, comparamos las métricas y el modelo que da mejores resultados es XGBoost.

In [None]:
# Count of positive and negative values in the SMOTETomek training target.
y_smoteT_train.value_counts()

In [None]:
# Evaluate every model fitted on the SMOTETomek data: metrics and confusion
# matrix for the test part, then for the train part, then the relative
# train/test gap for accuracy, recall and precision.
print("Model Performance\n-----------------")
for name, model in models.items():
    y_smoteT_pred = model.predict(X_smoteT_test)
    y_smoteT_pred_train = model.predict(X_smoteT_train)

    # (true labels, predictions) pairs, unpacked into each metric call below.
    test_pair = (y_smoteT_test, y_smoteT_pred)
    train_pair = (y_smoteT_train, y_smoteT_pred_train)

    # Metrics that are reused for the overfitting figures are computed once.
    test_accuracy = accuracy_score(*test_pair)
    test_recall = recall_score(*test_pair)
    test_precision = precision_score(*test_pair)
    train_accuracy = accuracy_score(*train_pair)
    train_recall = recall_score(*train_pair)
    train_precision = precision_score(*train_pair)

    test_report = " Accuracy: {:.3f}%\n\t\t\t\t       F1-Score: {:.5f}\n\t\t\t              Recall: {:.5f}\n\t\t\t\t       Precision: {:.5f}\n\t\t\t\t       Roc_Au: {:.5f}\n\t\t\t\t".format(
        test_accuracy * 100,
        f1_score(*test_pair),
        test_recall,
        test_precision,
        roc_auc_score(*test_pair),
    )
    print("\n" + name + test_report)
    print(confusion_matrix(*test_pair))

    train_report = " Accuracy_train: {:.3f}%\n\t\t\t\t       F1-Score_train: {:.5f}\n\t\t\t\t        Recall_Train: {:.5f}\n\t\t\t\t       Precision_Train: {:.5f}\n\t\t\t\t       Roc_Au_Train: {:.5f}\n\t\t\t\t".format(
        train_accuracy * 100,
        f1_score(*train_pair),
        train_recall,
        train_precision,
        roc_auc_score(*train_pair),
    )
    print("\n" + name + train_report)
    print(confusion_matrix(*train_pair))

    # Overfitting figures: absolute train/test gap as a percentage of the test value.
    accuracy_gap = np.abs((train_accuracy - test_accuracy) / test_accuracy * 100)
    print("\n" + name + " Overfitting accuracy:\n\t\t\t\t", accuracy_gap)

    recall_gap = np.abs((train_recall - test_recall) / test_recall * 100)
    print("\n" + name + " Overfitting recall:\n\t\t\t\t", recall_gap)

    precision_gap = np.abs((train_precision - test_precision) / test_precision * 100)
    print("\n" + name + " Overfitting presicion:\n\t\t\t\t", precision_gap)

In [None]:
# Chosen model, kept in a variable so it can be saved with pickle.
# It is fitted on the SMOTETomek-balanced data, i.e. the kind of data XGBoost
# was selected on in the comparison above. Fitting on the raw, heavily
# imbalanced X, y would produce a different model from the one evaluated.
modelo_elegido = XGBClassifier()
modelo_elegido.fit(X_smoteT, y_smoteT)

In [None]:
# Save the chosen model with pickle so it can be used from Streamlit.
# The context manager closes (and therefore flushes) the file as soon as the
# dump finishes, instead of leaving an anonymous handle open.
modelo_pickle = 'XGBchachi.pkl'
with open(modelo_pickle, 'wb') as model_file:
    pickle.dump(modelo_elegido, model_file)