In [39]:
import pandas as pd
import numpy as np
from bokeh.plotting import figure, show
from bokeh.layouts import column
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix



# Read the raw stroke records from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='stroke_dataset.csv')

# Preview the first five rows as the cell's displayed output.
df.head(n=5)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
2,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
3,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
4,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1


In [40]:
# Display the (rows, columns) dimensions of the loaded DataFrame.
df.shape

(4981, 11)

In [41]:



# Build one 30-bin histogram figure for every float64/int64 column of `df`.
plots = []
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns

for col in numeric_cols:
    # Bin the non-missing values of the column.
    counts, bin_edges = np.histogram(df[col].dropna(), bins=30)

    fig = figure(
        title=f'Histograma de {col}',
        x_axis_label=col,
        y_axis_label='Frecuencia',
    )

    # One rectangle per bin: consecutive edges give the left/right sides,
    # the bin count gives the height.
    fig.quad(
        top=counts,
        bottom=0,
        left=bin_edges[:-1],
        right=bin_edges[1:],
        fill_color="navy",
        line_color="white",
        alpha=0.7,
    )

    plots.append(fig)

# Render all histograms stacked in a single vertical layout.
show(column(plots))

In [42]:
# Filter the records in one chained expression:
#   1. keep rows whose age is at least 14,
#   2. of those, keep rows whose smoking_status is not 'Unknown',
#   3. renumber the index from 0 so it is contiguous again.
df = (
    df.loc[lambda frame: frame['age'] >= 14]
      .loc[lambda frame: frame['smoking_status'] != 'Unknown']
      .reset_index(drop=True)
)

# Write the cleaned DataFrame to a new CSV file (without the index column).
df.to_csv('dataset_limpio.csv', index=False)

In [43]:
# Reload the cleaned dataset from disk.
df = pd.read_csv('dataset_limpio.csv')

# Split the rows by outcome class.
stroke_0 = df[df['stroke'] == 0]
stroke_1 = df[df['stroke'] == 1]

n_stroke_1 = len(stroke_1)

# Random undersampling: both classes end up with the size of the smaller one.
# Asking DataFrame.sample for more rows than a class holds (without replacement)
# raises ValueError, so the target size is the minimum of both class counts
# instead of assuming that class 1 is always the minority.
n_per_class = min(len(stroke_0), n_stroke_1)
stroke_0_subsampled = stroke_0.sample(n=n_per_class, random_state=42)

# A class that is already at the target size is left untouched, so its row
# order (and therefore the shuffled result below) stays the same as before.
stroke_1_subsampled = (
    stroke_1
    if n_stroke_1 == n_per_class
    else stroke_1.sample(n=n_per_class, random_state=42)
)

# Put both classes back together.
df_balanced = pd.concat([stroke_1_subsampled, stroke_0_subsampled])

# Shuffle the rows (fixed seed for reproducibility) and renumber the index.
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Show the per-class row counts to confirm the classes are balanced.
print(df_balanced['stroke'].value_counts())

# Persist the balanced dataset (without the index column).
df_balanced.to_csv('dataset_balanceado.csv', index=False)

stroke
0    201
1    201
Name: count, dtype: int64


In [44]:
# Print the dtype of each column in the balanced DataFrame.
print(df_balanced.dtypes)


gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object


In [45]:


# Reload the balanced dataset and separate the target from the predictors.
df = pd.read_csv('dataset_balanceado.csv')

y = df['stroke']
X = df.drop(columns='stroke')

# Column groups: the numeric group is standardized, the categorical group is
# one-hot encoded.
numeric_columns = ['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi']
categorical_columns = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']

numeric_step = ('num', StandardScaler(), numeric_columns)
# drop='first' omits the first category of each column; sparse_output=False
# makes the encoder return a dense array.
categorical_step = (
    'cat',
    OneHotEncoder(drop='first', sparse_output=False),
    categorical_columns,
)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Fit the transformer and encode the predictors in one call.
# NOTE(review): this fits on every row of X; if a train/test split is done
# afterwards, consider fitting on the training rows only - TODO confirm.
X_encoded = preprocessor.fit_transform(X)

# Recover the output column names: the numeric columns come first, followed by
# the names the fitted one-hot encoder generated.
onehot_encoder = preprocessor.named_transformers_['cat']
cat_feature_names = onehot_encoder.get_feature_names_out(categorical_columns)
feature_names = [*numeric_columns, *cat_feature_names.tolist()]

# Wrap the encoded array in a DataFrame with the recovered column names.
X_encoded_df = pd.DataFrame(data=X_encoded, columns=feature_names)

print(X_encoded_df.head())

        age  hypertension  heart_disease  avg_glucose_level       bmi  \
0 -1.351020     -0.506211      -0.389695          -0.631053 -1.254006   
1 -0.350914     -0.506211      -0.389695          -0.260976  2.011366   
2  0.122820      1.975459      -0.389695           1.597603 -1.585516   
3 -0.245640     -0.506211       2.566107          -0.455863  0.900808   
4  1.070289      1.975459      -0.389695           0.159654 -1.602091   

   gender_Male  ever_married_Yes  work_type_Private  work_type_Self-employed  \
0          0.0               1.0                1.0                      0.0   
1          1.0               0.0                1.0                      0.0   
2          1.0               1.0                0.0                      0.0   
3          1.0               1.0                1.0                      0.0   
4          0.0               0.0                1.0                      0.0   

   work_type_children  Residence_type_Urban  smoking_status_never smoked  \
0   