In [116]:
pip install imbalanced-learn



In [117]:
!pip install ydata_profiling



In [118]:
!pip install optuna



In [119]:
from ydata_profiling import ProfileReport
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import optuna
import xgboost as xgb
from sklearn.metrics import f1_score
from sklearn.ensemble import VotingClassifier
from imblearn.over_sampling import SMOTE
from imblearn.ensemble import EasyEnsembleClassifier
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

In [120]:
# Cargar el dataset
df = pd.read_csv('stroke_dataset.csv')

# Encoding variables binarias
label_encoder = LabelEncoder()
df['ever_married'] = label_encoder.fit_transform(df['ever_married'])
df['gender'] = label_encoder.fit_transform(df['gender'])

# Crear dummies para variables categóricas
df = pd.get_dummies(df, columns=['work_type', 'Residence_type'], drop_first=True)

# Crear nuevas columnas
# 1. Grupos de edad
def age_group(age):
    if age <= 18:
        return '0-18'
    elif 19 <= age <= 35:
        return '19-35'
    elif 36 <= age <= 50:
        return '36-50'
    elif 51 <= age <= 65:
        return '51-65'
    else:
        return '65+'

df['age_group'] = df['age'].apply(age_group)

# 3. Combinación de hipertensión y enfermedades cardíacas
df['hypertension_heart_disease'] = ((df['hypertension'] == 1) & (df['heart_disease'] == 1)).astype(int)

# 4. Comorbilidades
df['comorbidities'] = df['hypertension'] + df['heart_disease']

# 5. Categorías de IMC
def bmi_category(bmi):
    if bmi < 18.5:
        return 'underweight'
    elif 18.5 <= bmi < 25:
        return 'normal'
    elif 25 <= bmi < 30:
        return 'overweight'
    else:
        return 'obese'

df['bmi_category'] = df['bmi'].apply(bmi_category)

# 6. Obesidad
df['obese'] = df['bmi'].apply(lambda x: 1 if x >= 30 else 0)

# 7. Fumador actual o pasado
df['smoker_status'] = df['smoking_status'].apply(lambda x: 1 if x in ['formerly smoked', 'smokes'] else 0)

# 8. Riesgo elevado
df['high_risk_lifestyle'] = ((df['smoker_status'] == 1) | (df['hypertension'] == 1) | (df['heart_disease'] == 1)).astype(int)

# 9. Glucosa elevada
df['high_glucose'] = df['avg_glucose_level'].apply(lambda x: 1 if x > 140 else 0)

# 10. Combinación de edad avanzada y comorbilidades
df['elderly_comorbid'] = ((df['age'] > 65) & (df['comorbidities'] > 0)).astype(int)




# Mostrar las primeras filas del DataFrame actualizado
print(df.head())


   gender   age  hypertension  heart_disease  ever_married  avg_glucose_level  \
0       1  67.0             0              1             1             228.69   
1       1  80.0             0              1             1             105.92   
2       0  49.0             0              0             1             171.23   
3       0  79.0             1              0             1             174.12   
4       1  81.0             0              0             1             186.21   

    bmi   smoking_status  stroke  work_type_Private  ...  \
0  36.6  formerly smoked       1               True  ...   
1  32.5     never smoked       1               True  ...   
2  34.4           smokes       1               True  ...   
3  24.0     never smoked       1              False  ...   
4  29.0  formerly smoked       1               True  ...   

   Residence_type_Urban  age_group  hypertension_heart_disease comorbidities  \
0                  True        65+                           0          

In [121]:
# Assuming X is your original DataFrame with features
# ... (your code to load and prepare X) ...

y = df['stroke']
X = df.drop('stroke', axis=1)

# 1. Identify categorical columns:
categorical_cols = X.select_dtypes(include=['object']).columns

# 2. Create OneHotEncoder:
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore') # sparse=False for compatibility with SMOTE

# 3. Fit and transform on categorical columns:
encoded_data = encoder.fit_transform(X[categorical_cols])

# 4. Create a new DataFrame with encoded features:
encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(categorical_cols))

# 5. Concatenate encoded features with numerical features:
X_encoded = pd.concat([X.drop(categorical_cols, axis=1), encoded_df], axis=1)

#Now use this new X_encoded for train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Calcular los pesos
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights_dict = {0: class_weights[0], 1: class_weights[1]}

# Optimización de hiperparámetros con Optuna



XGBoost

In [122]:


# Función objetivo para XGBoost
def objective_xgb(trial):
    param = {
        'verbosity': 0,
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'booster': 'gbtree',
        'lambda': trial.suggest_loguniform('lambda', 1e-8, 1.0),
        'alpha': trial.suggest_loguniform('alpha', 1e-8, 1.0),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.5, 0.7, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.5, 0.7, 1.0]),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-8, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 9),
        'random_state': 42,
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'scale_pos_weight': class_weights_dict[1]  # Aplicar el upweighting
    }

    # Convertir datos a DMatrix para XGBoost
    train_dmatrix = xgb.DMatrix(X_train, label=y_train)
    valid_dmatrix = xgb.DMatrix(X_test, label=y_test)

    # Entrenar el modelo
    xgb_clf = xgb.train(param, train_dmatrix)

    # Predecir en el conjunto de prueba
    preds = xgb_clf.predict(valid_dmatrix)
    pred_labels = np.rint(preds)

    # Evaluar la métrica objetivo (f1_score en este caso)
    score = f1_score(y_test, pred_labels)
    return score

# Iniciar el estudio de Optuna para XGBoost
study_xgb = optuna.create_study(direction="maximize")
study_xgb.optimize(objective_xgb, n_trials=100)

# Extraer los mejores hiperparámetros
best_params_xgb = study_xgb.best_trial.params
print("Best hyperparameters for XGBoost: ", best_params_xgb)

[I 2024-10-15 11:59:45,592] A new study created in memory with name: no-name-4463a316-30d2-478c-ada1-5c3364a5a681
  'lambda': trial.suggest_loguniform('lambda', 1e-8, 1.0),
  'alpha': trial.suggest_loguniform('alpha', 1e-8, 1.0),
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-8, 1.0),
[I 2024-10-15 11:59:45,723] Trial 0 finished with value: 0.0 and parameters: {'lambda': 0.003999404503586992, 'alpha': 3.7292140286544935e-05, 'colsample_bytree': 0.5, 'subsample': 1.0, 'learning_rate': 0.0007619683100696827, 'n_estimators': 109, 'max_depth': 8, 'min_child_weight': 6}. Best is trial 0 with value: 0.0.
  'lambda': trial.suggest_loguniform('lambda', 1e-8, 1.0),
  'alpha': trial.suggest_loguniform('alpha', 1e-8, 1.0),
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-8, 1.0),
[I 2024-10-15 11:59:45,945] Trial 1 finished with value: 0.0 and parameters: {'lambda': 2.740615835976377e-07, 'alpha': 0.00017170556155925065, 'colsample_bytree': 1.0, 'subsample': 0.7, '

Best hyperparameters for XGBoost:  {'lambda': 0.0020347988977949456, 'alpha': 6.065116885395498e-07, 'colsample_bytree': 1.0, 'subsample': 1.0, 'learning_rate': 0.07368598010119852, 'n_estimators': 273, 'max_depth': 3, 'min_child_weight': 10}


Regresión lineal

#Entrenamiento del modelo

In [123]:
# Entrenar el modelo XGBoost con los mejores hiperparámetros
xgb_model_optimized = xgb.XGBClassifier(**best_params_xgb)
xgb_model_optimized.fit(X_train, y_train)

# Predecir en el conjunto de prueba
y_pred_xgb_optimized = xgb_model_optimized.predict(X_test)

# Evaluar el rendimiento del modelo optimizado
from sklearn.metrics import classification_report

In [124]:
# Extraer importancias
importances = xgb_model_optimized.feature_importances_

# Crear un DataFrame para visualizar las importancias
feature_importances = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': importances
})

# Ordenar las características por importancia
feature_importances = feature_importances.sort_values(by='Importance', ascending=False)
print("Importancia de las características:")
print(feature_importances)

Importancia de las características:
                           Feature  Importance
1                              age    0.175878
17                elderly_comorbid    0.138191
2                     hypertension    0.075240
7                work_type_Private    0.059640
20     smoking_status_never smoked    0.053856
14                   smoker_status    0.050503
15             high_risk_lifestyle    0.049180
29         bmi_category_overweight    0.046336
21           smoking_status_smokes    0.043006
8          work_type_Self-employed    0.042966
5                avg_glucose_level    0.038371
0                           gender    0.036349
4                     ever_married    0.035495
3                    heart_disease    0.033597
12                   comorbidities    0.031188
18          smoking_status_Unknown    0.029919
6                              bmi    0.029818
10            Residence_type_Urban    0.015020
24                 age_group_36-50    0.010714
25                 age_g

In [216]:
# Seleccionar las 5 características más importantes
top_n = 5  # Cambia este valor según lo que desees
important_features = feature_importances.head(top_n)['Feature'].tolist()

# Filtrar los conjuntos de datos
X_train_important = X_train[important_features]
X_test_important = X_test[important_features]

import numpy as np
from sklearn.metrics import classification_report

# Entrenar el modelo XGBoost
xgb_model_optimized = xgb.XGBClassifier(**best_params_xgb)
xgb_model_optimized.fit(X_train_important, y_train)

# Obtener las probabilidades de las predicciones
y_prob_xgb_optimized = xgb_model_optimized.predict_proba(X_test_important)[:, 1]

# Definir un nuevo umbral
new_threshold = 0.165  # Ajusta este valor según sea necesario

# Clasificar las predicciones basadas en el nuevo umbral
y_pred_xgb_adjusted = (y_prob_xgb_optimized >= new_threshold).astype(int)

# Evaluar el rendimiento del modelo con el nuevo umbral
print("\nXGBoost Classification Report with Adjusted Threshold:")
print(classification_report(y_test, y_pred_xgb_adjusted))

from sklearn.metrics import accuracy_score

# Predecir en el conjunto de entrenamiento
y_pred_train = xgb_model_optimized.predict(X_train_important)

# Calcular la precisión en el conjunto de entrenamiento
accuracy_train = accuracy_score(y_train, y_pred_train)

# Calcular la precisión en el conjunto de prueba con el nuevo umbral
accuracy_test = accuracy_score(y_test, y_pred_xgb_adjusted)

# Calcular el porcentaje de overfitting
overfitting_percentage = (accuracy_train - accuracy_test) / accuracy_train * 100

# Imprimir los resultados
print(f"Accuracy en el conjunto de entrenamiento: {accuracy_train:.2f}")
print(f"Accuracy en el conjunto de prueba: {accuracy_test:.2f}")
print(f"Porcentaje de overfitting: {overfitting_percentage:.2f}%")


XGBoost Classification Report with Adjusted Threshold:
              precision    recall  f1-score   support

           0       0.96      0.94      0.95       943
           1       0.24      0.35      0.29        54

    accuracy                           0.91       997
   macro avg       0.60      0.64      0.62       997
weighted avg       0.92      0.91      0.91       997

Accuracy en el conjunto de entrenamiento: 0.95
Accuracy en el conjunto de prueba: 0.91
Porcentaje de overfitting: 4.79%


A PARTIR DE AQUÍ SON PRUEBAS :)

In [222]:
import pickle

# Guardar el modelo XGBoost entrenado en un archivo pickle
with open('xgb_model_optimized.pkl', 'wb') as file:
    pickle.dump(xgb_model_optimized, file)

print("Modelo XGBoost guardado exitosamente en 'xgb_model_optimized.pkl'")


Modelo XGBoost guardado exitosamente en 'xgb_model_optimized.pkl'


In [225]:
import pickle
from sklearn.metrics import classification_report, accuracy_score

# Cargar el modelo desde el archivo pickle
with open('xgb_model_optimized.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

# Obtener las probabilidades de las predicciones
y_prob_xgb = loaded_model.predict_proba(X_test_important)[:, 1]

# Definir un nuevo umbral
new_threshold = 0.165  # Ajusta este valor según sea necesario

# Clasificar las predicciones basadas en el nuevo umbral
y_pred_loaded_model = (y_prob_xgb >= new_threshold).astype(int)

# Evaluar el rendimiento del modelo cargado
accuracy = accuracy_score(y_test, y_pred_loaded_model)
print(f"Accuracy del modelo cargado: {accuracy:.4f}")

# Imprimir el informe de clasificación
print("\nClassification Report del modelo cargado:")
print(classification_report(y_test, y_pred_loaded_model))

ValueError: feature_names mismatch: ['age', 'elderly_comorbid', 'work_type_Private', 'ever_married', 'hypertension', 'smoking_status_never smoked', 'high_risk_lifestyle', 'smoking_status_smokes', 'avg_glucose_level', 'work_type_Self-employed', 'bmi_category_overweight', 'bmi', 'comorbidities', 'Residence_type_Urban'] ['age', 'elderly_comorbid', 'hypertension', 'work_type_Private', 'smoking_status_never smoked']
expected high_risk_lifestyle, comorbidities, Residence_type_Urban, bmi, smoking_status_smokes, work_type_Self-employed, ever_married, avg_glucose_level, bmi_category_overweight in input data