## EQUIPO PULMON

## 1. Importacion de librerias


In [1]:
#Librerías de análisis de datos:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
#from xgboost import XGBClassifier
#from lightgbm import LGBMClassifier
#from catboost import CatBoostClassifier

from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, recall_score, precision_score, roc_auc_score

import warnings
warnings.filterwarnings(action='ignore')

In [2]:
import sys
!{sys.executable} -m pip install xgboost lightgbm catboost



In [3]:
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

## 2. Exploracion basica de los datos

Estas son las variables que contiene nuestra dataset:

 + gender: Genero	
 + age: Edad
 + hypertension: padece de Hypertencion
 + heart_disease: Tiene enfermedades cardiacas	
 + ever_married: Se ha casado
 + work_type: Tipo de trabajo	
 + Residence_type: Tipo de residencia
 + avg_glucose_level:  Promedio de glucosa en sangre
 + bmi: Indice de masa corporal
 + smoking_status: Tipo de fumador
 + stroke: Ictus

In [4]:
# Buenas prácticas.
path_to_data = "./stroke_dataset.csv"
data_stroke = pd.read_csv(path_to_data)

In [5]:
#Cantidad de entradas y columnas del dataset.
data_stroke.shape

(4981, 11)

Tenemos 4981 entradas y 11 columnas.

In [6]:
# Vemos las primeras cinco entradas.
data_stroke.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
2,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
3,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
4,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1


In [7]:
data_stroke.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4981 entries, 0 to 4980
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             4981 non-null   object 
 1   age                4981 non-null   float64
 2   hypertension       4981 non-null   int64  
 3   heart_disease      4981 non-null   int64  
 4   ever_married       4981 non-null   object 
 5   work_type          4981 non-null   object 
 6   Residence_type     4981 non-null   object 
 7   avg_glucose_level  4981 non-null   float64
 8   bmi                4981 non-null   float64
 9   smoking_status     4981 non-null   object 
 10  stroke             4981 non-null   int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 428.2+ KB


In [8]:
data_stroke.nunique()

gender                  2
age                   104
hypertension            2
heart_disease           2
ever_married            2
work_type               4
Residence_type          2
avg_glucose_level    3895
bmi                   342
smoking_status          4
stroke                  2
dtype: int64

In [9]:
data_stroke.isnull().sum()

gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [10]:
categoricas=["gender", "ever_married","work_type","Residence_type","smoking_status"]
numericas=["age", "hypertension","heart_disease","avg_glucose_level","bmi","stroke"]

In [11]:
data_stroke.describe()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,4981.0,4981.0,4981.0,4981.0,4981.0,4981.0
mean,43.419859,0.096165,0.05521,105.943562,28.498173,0.049789
std,22.662755,0.294848,0.228412,45.075373,6.790464,0.217531
min,0.08,0.0,0.0,55.12,14.0,0.0
25%,25.0,0.0,0.0,77.23,23.7,0.0
50%,45.0,0.0,0.0,91.85,28.1,0.0
75%,61.0,0.0,0.0,113.86,32.6,0.0
max,82.0,1.0,1.0,271.74,48.9,1.0



numeric_features = ['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi']
for feature in numeric_features:
    plt.figure(figsize=(18, 10), facecolor='w')
    sns.distplot(data_stroke[feature])
    plt.title('{} Distribution'.format(feature), fontsize=20)
    ##plt.show()

sns.histplot(data=data_stroke, x="age", kde=True)
fig = plt.gcf()
fig.set_size_inches(10,5)

In [12]:
data_stroke.duplicated().sum()

0

In [13]:

# Generamos una máscara para no duplicar lops valores
##mask = np.triu(np.ones_like(data_stroke.corr(), dtype=bool))

# Configuramos el matplotlib
##f, ax = plt.subplots(figsize=(8, 6))

# Ploteamos el heatmap
##sns.heatmap(data_stroke.corr(), mask=mask, vmax=1., vmin=-1., center=0,
          #  square=True, linewidths=.5, cmap="coolwarm", cbar_kws={"shrink": .5}, annot=True)

## Preprocesamiento: 

In [14]:
#Hacemos funcion para trabajar las  variables que tienen mas de 2 clases ( prefix="work_type" para que en el nombre de las clases este tambien el nombre de la columna.)

def onehot_encode(df,column):
    df = df.copy()
    
    dummies = pd.get_dummies( df [column],prefix=column)
    df = pd.concat([df,dummies], axis=1)
    df = df.drop(column,axis=1)
    
    return df 
    

In [15]:
#Hacemos función que hace una copia como primer paso

def preprocess_inputs(df):
    df = df.copy()

#2.- Si quicieramos eliminar la columna 
    # df = df.drop("id", axis=1)
    
#3.- Despues de identificar las clases  dentro de cada columna, trabajamos los binarios
    df["ever_married"] = df["ever_married"].replace({"No":0,"Yes":1})
    df["gender"] = df["gender"].replace({"Male":0,"Female":1})
    df["Residence_type"] = df["Residence_type"].replace({"Urban":0,"Rural":1})
    
#4.-Haciendo Onehot_encode
    for column in ["work_type","smoking_status"]:
        df = onehot_encode(df,column = column)
        
        
#5.- Separamos la columna stroke (variable a predecir) 
    y = df['stroke']
    X = df.drop('stroke', axis=1)
    
#6.- Definiendo Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=1)
    
#7.- Escalamos las variables Xtest, Xtrain  
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)
    
   
    return X_train, X_test, y_train, y_test

In [16]:
X_train, X_test, y_train, y_test = preprocess_inputs(data_stroke)

In [17]:
X_train.var()

gender                            1.000287
age                               1.000287
hypertension                      1.000287
heart_disease                     1.000287
ever_married                      1.000287
Residence_type                    1.000287
avg_glucose_level                 1.000287
bmi                               1.000287
work_type_Govt_job                1.000287
work_type_Private                 1.000287
work_type_Self-employed           1.000287
work_type_children                1.000287
smoking_status_Unknown            1.000287
smoking_status_formerly smoked    1.000287
smoking_status_never smoked       1.000287
smoking_status_smokes             1.000287
dtype: float64

In [18]:
y_train 

2117    0
332     0
3915    0
4610    0
4485    0
       ..
2895    0
2763    0
905     0
3980    0
235     0
Name: stroke, Length: 3486, dtype: int64

In [19]:
#3.- Encontramos las clases de cada columna:
#{column: len(X[column].unique()) for column in X.select_dtypes("object").columns}

In [20]:
#4.- Ahora trabajamos las que tienen mas de 2 clases (,prefix="work_type" para que en el nombre de las clases este tambien el nombre de la columna.
#pd.get_dummies(X["work_type"],prefix="work_type")
# Se procede a borrar por que esta función la utilizamos para definir la función onehot_encode

## ENTRENAMOS


In [21]:
models = {
    "                   Logistic Regression": LogisticRegression(),
    "                   K-Nearest Neighbors": KNeighborsClassifier(),
    "                         Decision Tree": DecisionTreeClassifier(),
    "Support Vector Machine (Linear Kernel)": LinearSVC(),
    "   Support Vector Machine (RBF Kernel)": SVC(),
    "                        Neural Network": MLPClassifier(),
    "                         Random Forest": RandomForestClassifier(),
    "                     Gradient Boosting": GradientBoostingClassifier(),
    "                               XGBoost": XGBClassifier(eval_metric='mlogloss'),
    "                              LightGBM": LGBMClassifier(),
    "                              CatBoost": CatBoostClassifier(verbose=0)
}


for name, model in models.items():
    model.fit(X_train, y_train)
    print(name + " trained.")



                   Logistic Regression trained.
                   K-Nearest Neighbors trained.
                         Decision Tree trained.
Support Vector Machine (Linear Kernel) trained.
   Support Vector Machine (RBF Kernel) trained.
                        Neural Network trained.
                         Random Forest trained.
                     Gradient Boosting trained.
                               XGBoost trained.
                              LightGBM trained.
                              CatBoost trained.


In [22]:
y_train.value_counts()

0    3315
1     171
Name: stroke, dtype: int64

In [23]:
print("Model Performance\n-----------------")
for name, model in models.items():
    y_pred = model.predict(X_test)
    
    print(
        "\n" + name + " Accuracy: {:.3f}%\n\t\t\t\t       F1-Score: {:.5f}"\
        .format(accuracy_score(y_test, y_pred) * 100, f1_score(y_test, y_pred)
           ))
    #Hemos quitado la matriz de confusion
    #,recall_score(y_train, y_pred),precision_score(y_train, y_pred)
           

Model Performance
-----------------

                   Logistic Regression Accuracy: 94.916%
				       F1-Score: 0.02564

                   K-Nearest Neighbors Accuracy: 94.716%
				       F1-Score: 0.04819

                         Decision Tree Accuracy: 91.572%
				       F1-Score: 0.16000

Support Vector Machine (Linear Kernel) Accuracy: 94.849%
				       F1-Score: 0.00000

   Support Vector Machine (RBF Kernel) Accuracy: 94.849%
				       F1-Score: 0.00000

                        Neural Network Accuracy: 94.515%
				       F1-Score: 0.10870

                         Random Forest Accuracy: 94.783%
				       F1-Score: 0.00000

                     Gradient Boosting Accuracy: 94.983%
				       F1-Score: 0.07407

                               XGBoost Accuracy: 93.913%
				       F1-Score: 0.04211

                              LightGBM Accuracy: 93.980%
				       F1-Score: 0.00000

                              CatBoost Accuracy: 94.716%
				       F1-Score: 0.04819


In [25]:
print("Model Performance\n-----------------")
for name, model in models.items():
    y_pred = model.predict(X_train)
    
    print(
        "\n" + name + " Accuracy: {:.3f}%\n\t\t\t\t       F1-Score: {:.5f}"\
        .format(accuracy_score(y_train, y_pred) * 100, f1_score(y_train, y_pred) ))

    #,confusion_matrix(y_train,y_pred),recall_score(y_train, y_pred),precision_score(y_train, y_pred),roc_auc_score(y_train, y_pred)
           
    

Model Performance
-----------------

                   Logistic Regression Accuracy: 95.095%
				       F1-Score: 0.00000

                   K-Nearest Neighbors Accuracy: 95.209%
				       F1-Score: 0.07735

                         Decision Tree Accuracy: 100.000%
				       F1-Score: 1.00000

Support Vector Machine (Linear Kernel) Accuracy: 95.095%
				       F1-Score: 0.00000

   Support Vector Machine (RBF Kernel) Accuracy: 95.095%
				       F1-Score: 0.00000

                        Neural Network Accuracy: 95.554%
				       F1-Score: 0.23645

                         Random Forest Accuracy: 100.000%
				       F1-Score: 1.00000

                     Gradient Boosting Accuracy: 95.697%
				       F1-Score: 0.21875

                               XGBoost Accuracy: 99.742%
				       F1-Score: 0.97297

                              LightGBM Accuracy: 99.828%
				       F1-Score: 0.98214

                              CatBoost Accuracy: 97.676%
				       F1-Score: 0.68966


## OVERSAMPLING


In [26]:
y_train.value_counts()

0    3315
1     171
Name: stroke, dtype: int64

DATOS DESBALANCEADOS

In [27]:
oversampled_data = pd.concat([X_train, y_train], axis=1).copy()

num_samples = y_train.value_counts()[0] - y_train.value_counts()[1]
new_samples = oversampled_data.query("stroke == 1").sample(num_samples, replace=True, random_state=1)

oversampled_data = pd.concat([oversampled_data, new_samples], axis=0).sample(frac=1.0, random_state=1).reset_index(drop=True)

y_train_oversampled = oversampled_data['stroke']
X_train_oversampled = oversampled_data.drop('stroke', axis=1)

In [28]:
## Mostramos el Data concatenado y balanceado
oversampled_data 

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,work_type_Govt_job,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,stroke
0,0.860964,0.760070,-0.329816,4.153470,0.709390,1.009801,0.301358,1.210494,2.5208,-1.147292,-0.437479,-0.395245,-0.660113,-0.465568,-0.763614,2.380208,1
1,0.860964,-0.296505,-0.329816,-0.240763,0.709390,-0.990294,-0.192419,-1.199677,-0.3967,0.871618,-0.437479,-0.395245,-0.660113,-0.465568,1.309562,-0.420131,0
2,0.860964,1.200310,-0.329816,-0.240763,0.709390,-0.990294,1.957183,0.813698,-0.3967,-1.147292,2.285826,-0.395245,-0.660113,2.147912,-0.763614,-0.420131,1
3,-1.161489,0.451902,-0.329816,-0.240763,-1.409662,1.009801,-0.005382,-0.170945,-0.3967,0.871618,-0.437479,-0.395245,-0.660113,2.147912,-0.763614,-0.420131,0
4,0.860964,1.508478,-0.329816,-0.240763,0.709390,1.009801,-1.014280,-0.215034,-0.3967,-1.147292,2.285826,-0.395245,-0.660113,2.147912,-0.763614,-0.420131,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6625,0.860964,0.319830,-0.329816,-0.240763,0.709390,1.009801,-0.209362,-1.038019,-0.3967,-1.147292,2.285826,-0.395245,-0.660113,-0.465568,1.309562,-0.420131,0
6626,-1.161489,1.068238,-0.329816,-0.240763,0.709390,-0.990294,-0.636906,-0.156249,-0.3967,-1.147292,2.285826,-0.395245,-0.660113,-0.465568,-0.763614,2.380208,1
6627,-1.161489,1.332382,-0.329816,-0.240763,0.709390,-0.990294,-0.743847,0.402205,-0.3967,0.871618,-0.437479,-0.395245,-0.660113,-0.465568,-0.763614,2.380208,1
6628,-1.161489,-0.428577,-0.329816,-0.240763,-1.409662,-0.990294,0.037526,0.269940,-0.3967,0.871618,-0.437479,-0.395245,-0.660113,-0.465568,-0.763614,2.380208,0


In [None]:
## Del data balanceado eliminamos los duplicados
## oversampled_data.drop_duplicates()

In [29]:
y_train_oversampled.value_counts()

1    3315
0    3315
Name: stroke, dtype: int64

In [30]:
## Entrenando los modelos con la data balanceada
models = {
    "                   Logistic Regression": LogisticRegression(),
    "                   K-Nearest Neighbors": KNeighborsClassifier(),
    "                         Decision Tree": DecisionTreeClassifier(),
    "Support Vector Machine (Linear Kernel)": LinearSVC(),
    "   Support Vector Machine (RBF Kernel)": SVC(),
    "                        Neural Network": MLPClassifier(),
    "                         Random Forest": RandomForestClassifier(),
    "                     Gradient Boosting": GradientBoostingClassifier(),
    "                               XGBoost": XGBClassifier(eval_metric='mlogloss'),
    "                              LightGBM": LGBMClassifier(),
    "                              CatBoost": CatBoostClassifier(verbose=0)
}


for name, model in models.items():
    model.fit(X_train_oversampled, y_train_oversampled)
    print(name + " Entrenado.")


                   Logistic Regression Entrenado.
                   K-Nearest Neighbors Entrenado.
                         Decision Tree Entrenado.
Support Vector Machine (Linear Kernel) Entrenado.
   Support Vector Machine (RBF Kernel) Entrenado.
                        Neural Network Entrenado.
                         Random Forest Entrenado.
                     Gradient Boosting Entrenado.
                               XGBoost Entrenado.
                              LightGBM Entrenado.
                              CatBoost Entrenado.


In [64]:
print("Model Performance\n-----------------")
for name, model in models.items():
    y_pred = model.predict(X_test)
    
    y_pred_train = model.predict(X_train)


    print(
        "\n" + name + " Accuracy: {:.3f}%\n\t\t\t\t       F1-Score: {:.5f}\n\t\t\t\t              Recall: {:.5f}\n\t\t\t\t       Precision: {:.5f}\n\t\t\t\t       Roc_Au: {:.5f}\n\t\t\t\t"\
        .format(accuracy_score(y_test, y_pred) * 100, f1_score(y_test, y_pred),recall_score(y_test, y_pred),precision_score(y_test, y_pred),roc_auc_score(y_test, y_pred) ))
    
    print( confusion_matrix(y_test, y_pred))
    
    print(
        "\n" + name + " Accuracy_train: {:.3f}%\n\t\t\t\t       F1-Score_train: {:.5f}\n\t\t\t\t              Recall_Train: {:.5f}\n\t\t\t\t       Precision_Train: {:.5f}\n\t\t\t\t       Roc_Au_Train: {:.5f}\n\t\t\t\t"\
        .format(accuracy_score(y_train, y_pred_train) * 100, f1_score(y_train, y_pred_train),recall_score(y_train, y_pred_train),precision_score(y_train, y_pred_train),roc_auc_score(y_train, y_pred_train) ))
   # print(
   # np.abs((((accuracy_score(y_test, y_pred))/(accuracy_score(y_train, y_pred_train)) *100)))
    #      ) Confusion_Matrix: {:.5f}\n\t\t\t\t Confusion_Matrix_Train: {:.5f}\n\t\t\t\t,confusion_matrix(y_test,y_pred),
    
    
    print(confusion_matrix(y_train, y_pred_train))
    
    print(
    np.abs(((((accuracy_score(y_train, y_pred_train))-accuracy_score(y_test, y_pred))/(accuracy_score(y_test, y_pred)) *100)))
       )
    

Model Performance
-----------------

                   Logistic Regression Accuracy: 74.247%
				       F1-Score: 0.24951
				              Recall: 0.83117
				       Precision: 0.14679
				       Roc_Au: 0.78441
				
[[1046  372]
 [  13   64]]

                   Logistic Regression Accuracy_train: 72.719%
				       F1-Score_train: 0.22871
				              Recall_Train: 0.82456
				       Precision_Train: 0.13277
				       Roc_Au_Train: 0.77337
				
[[2394  921]
 [  30  141]]
2.0580391062318726

                   K-Nearest Neighbors Accuracy: 85.619%
				       F1-Score: 0.19476
				              Recall: 0.33766
				       Precision: 0.13684
				       Roc_Au: 0.61100
				
[[1254  164]
 [  51   26]]

                   K-Nearest Neighbors Accuracy_train: 92.025%
				       F1-Score_train: 0.55161
				              Recall_Train: 1.00000
				       Precision_Train: 0.38085
				       Roc_Au_Train: 0.95807
				
[[3037  278]
 [   0  171]]
7.4826090074584055

                         