In [3]:
import pandas as pd
import numpy as np

In [None]:
df = pd.read_csv("data/stroke_dataset.csv")

In [None]:
from sklearn.preprocessing import LabelEncoder

def change_datatype(df: pd.DataFrame):
	df['age'] = df['age'].astype("int")
	df['ever_married'] = df['ever_married'].astype("category")
	cols = df.select_dtypes(include="object").columns
	cols = list(cols) + ['hypertension', 'heart_disease']
	df[cols] = df[cols].astype("category")

def code_data(df: pd.DataFrame):
	change_datatype(df)
	# Lista de features categóricas binarias
	cat_bin = [c for c in df.select_dtypes('category').columns if len(df[c].unique()) == 2]

	# Lista de features categóricas
	cat_no_bin = [c for c in df.select_dtypes('category').columns if len(df[c].unique()) > 2]
	
	df_cat = pd.get_dummies(df[cat_no_bin], dtype="int")
	df.drop(columns=cat_no_bin, inplace=True)

	le = LabelEncoder()

	for c in cat_bin:
		df[c] = le.fit_transform(df[c])
	return pd.concat([df, df_cat], axis=1)

df_code = code_data(df)


In [None]:
df_code.to_csv('data/dataset_code.csv')

In [5]:
df = pd.read_csv("data/dataset_code.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,stroke,work_type_Govt_job,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,0,1,67,0,1,1,1,228.69,36.6,1,0,1,0,0,0,1,0,0
1,1,1,80,0,1,1,0,105.92,32.5,1,0,1,0,0,0,0,1,0
2,2,0,49,0,0,1,1,171.23,34.4,1,0,1,0,0,0,0,0,1
3,3,0,79,1,0,1,0,174.12,24.0,1,0,0,1,0,0,0,1,0
4,4,1,81,0,0,1,1,186.21,29.0,1,0,1,0,0,0,1,0,0


In [7]:
df.tail()

Unnamed: 0.1,Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,stroke,work_type_Govt_job,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
4976,4976,1,41,0,0,0,0,70.15,29.8,0,0,1,0,0,0,1,0,0
4977,4977,1,40,0,0,1,1,191.15,31.1,0,0,1,0,0,0,0,0,1
4978,4978,0,45,1,0,1,0,95.02,31.8,0,1,0,0,0,0,0,0,1
4979,4979,1,40,0,0,1,0,83.94,30.0,0,0,1,0,0,0,0,0,1
4980,4980,0,80,1,0,1,1,83.75,29.1,0,0,1,0,0,0,0,1,0


In [6]:
df.shape

(4981, 18)

### Dividir data

In [8]:
X = df.drop('stroke', axis=1)
y = df['stroke']

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Para el desbalanceo que existe en nuestras clases se agregara pesos a cada una.

In [30]:
from sklearn.utils.class_weight import compute_class_weight

class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights_dict = {0: class_weights[0], 1: class_weights[1]}

### Random Forest

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

In [15]:
rfcl = RandomForestClassifier(class_weight=class_weights_dict, random_state=42)

# Definir hiperparámetros
param_rfcl = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'max_features': ['sqrt', 'log2']
}

grid_rfcl = GridSearchCV(
            estimator=rfcl,
            param_grid=param_rfcl,
            cv=5,
            n_jobs=-1,
            verbose=2)

grid_rfcl.fit(X_train, y_train)

Fitting 5 folds for each of 54 candidates, totalling 270 fits


[CV] END max_depth=5, max_features=sqrt, min_samples_split=2, n_estimators=100; total time=   0.6s
[CV] END max_depth=5, max_features=sqrt, min_samples_split=2, n_estimators=200; total time=   1.1s
[CV] END max_depth=5, max_features=sqrt, min_samples_split=2, n_estimators=300; total time=   1.8s
[CV] END max_depth=5, max_features=sqrt, min_samples_split=2, n_estimators=300; total time=   1.8s
[CV] END max_depth=5, max_features=sqrt, min_samples_split=5, n_estimators=200; total time=   1.1s
[CV] END max_depth=5, max_features=sqrt, min_samples_split=5, n_estimators=300; total time=   1.7s
[CV] END max_depth=5, max_features=sqrt, min_samples_split=10, n_estimators=100; total time=   0.6s
[CV] END max_depth=5, max_features=sqrt, min_samples_split=10, n_estimators=100; total time=   0.6s
[CV] END max_depth=5, max_features=sqrt, min_samples_split=10, n_estimators=200; total time=   1.1s
[CV] END max_depth=5, max_features=sqrt, min_samples_split=10, n_estimators=200; total time=   1.1s
[CV] E

In [16]:
# Mejor hiperparámetros
print("Best parameters:\n", grid_rfcl.best_params_)

Best parameters:
 {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_split': 10, 'n_estimators': 100}


In [17]:
rfcl_best = grid_rfcl.best_estimator_
y_pred = rfcl_best.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred))

Random Forest Accuracy: 0.9969909729187563


In [18]:
from sklearn.metrics import classification_report, confusion_matrix

# Matriz de confusión
conf_matrix = confusion_matrix(y_test, y_pred)
print("Matriz de confusión:")
print(conf_matrix)
print("Reporte de clasificación:")
print(classification_report(y_test, y_pred))

Matriz de confusión:
[[943   0]
 [  3  51]]
Reporte de clasificación:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       943
           1       1.00      0.94      0.97        54

    accuracy                           1.00       997
   macro avg       1.00      0.97      0.98       997
weighted avg       1.00      1.00      1.00       997



### Gradient Boosting Machines

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
gbm = GradientBoostingClassifier(random_state=42)

param_gbm = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 1.0]
}

grid_gbm = GridSearchCV(estimator=gbm,
                       param_grid=param_gbm,
                       cv=5,
                       verbose=2)

grid_gbm.fit(X_train, y_train)

In [None]:
print("Best params:", grid_gbm.best_params_)

gbm_best = grid_gbm.best_estimator_
y_pred_gbm = gbm_best.predict(X_test)
print("GBM accuracy:", accuracy_score(y_test, y_pred_gbm))

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Matriz de confusión
conf_matrix = confusion_matrix(y_test, y_pred_gbm)
print("Matriz de confusión:")
print(conf_matrix)

# Reporte de clasificación
print("Reporte de clasificación:")
print(classification_report(y_test, y_pred_gbm))

### XGBoost

In [21]:
import xgboost as xgb

In [33]:
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')

param_xgb = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'eval_metric':['auc']
}

grid_xgb = GridSearchCV(
            estimator=xgb_model,
            param_grid=param_xgb,
            cv=5,
            verbose=2)

grid_xgb.fit(X_train, y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV] END colsample_bytree=0.8, eval_metric=auc, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.8, eval_metric=auc, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.8, eval_metric=auc, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.8, eval_metric=auc, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.8, eval_metric=auc, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.8, eval_metric=auc, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1.0; total time=   0.1s
[CV] END colsample_bytree=0.8, eval_metric=auc, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1.0; total time=   0.1s
[CV] E

In [23]:
print("Best params:", grid_xgb.best_params_)
xgb_best = grid_xgb.best_estimator_
y_pred_xgb = xgb_best.predict(X_test)
print("XGBoost accuracy:", accuracy_score(y_test, y_pred_xgb))

Best params: {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 200, 'subsample': 1.0}
XGBoost accuracy: 0.9979939819458375


In [28]:
from sklearn.metrics import classification_report, confusion_matrix

conf_matrix = confusion_matrix(y_test, y_pred_xgb)
print("Matriz de confusión:\n", conf_matrix)

print("Reporte de clasificación:")
print(classification_report(y_test, y_pred_xgb))

Matriz de confusión:
 [[943   0]
 [  2  52]]
Reporte de clasificación:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       943
           1       1.00      0.96      0.98        54

    accuracy                           1.00       997
   macro avg       1.00      0.98      0.99       997
weighted avg       1.00      1.00      1.00       997

