In [20]:
pip install imbalanced-learn

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.1 -> 24.2
[notice] To update, run: C:\Users\Administrator\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


Defaulting to user installation because normal site-packages is not writeable


In [21]:
!pip install ydata_profiling

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 24.1 -> 24.2
[notice] To update, run: C:\Users\Administrator\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [22]:
!pip install optuna

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 24.1 -> 24.2
[notice] To update, run: C:\Users\Administrator\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [23]:
from ydata_profiling import ProfileReport
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import optuna
import xgboost as xgb
from sklearn.metrics import f1_score
from sklearn.ensemble import VotingClassifier
from imblearn.over_sampling import SMOTE
from imblearn.ensemble import EasyEnsembleClassifier
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

In [24]:
# Cargar el dataset
df = pd.read_csv('stroke_dataset.csv')

# Encoding variables binarias con LabelEncoder
label_encoder = LabelEncoder()
df['ever_married'] = label_encoder.fit_transform(df['ever_married'])
df['gender'] = label_encoder.fit_transform(df['gender'])

# Crear nuevas columnas categóricas y features adicionales

# 1. Grupos de edad
def age_group(age):
    if age <= 18:
        return '0-18'
    elif 19 <= age <= 35:
        return '19-35'
    elif 36 <= age <= 50:
        return '36-50'
    elif 51 <= age <= 65:
        return '51-65'
    else:
        return '65+'

df['age_group'] = df['age'].apply(age_group)

# 2. Combinación de hipertensión y enfermedades cardíacas
df['hypertension_heart_disease'] = ((df['hypertension'] == 1) & (df['heart_disease'] == 1)).astype(int)

# 3. Comorbilidades
df['comorbidities'] = df['hypertension'] + df['heart_disease']

# 4. Categorías de IMC
def bmi_category(bmi):
    if bmi < 18.5:
        return 'underweight'
    elif 18.5 <= bmi < 25:
        return 'normal'
    elif 25 <= bmi < 30:
        return 'overweight'
    else:
        return 'obese'

df['bmi_category'] = df['bmi'].apply(bmi_category)

# 5. Obesidad
df['obese'] = df['bmi'].apply(lambda x: 1 if x >= 30 else 0)

# 6. Fumador actual o pasado
df['smoker_status'] = df['smoking_status'].apply(lambda x: 1 if x in ['formerly smoked', 'smokes'] else 0)

# 7. Riesgo elevado
df['high_risk_lifestyle'] = ((df['smoker_status'] == 1) | (df['hypertension'] == 1) | (df['heart_disease'] == 1)).astype(int)

# 8. Glucosa elevada
df['high_glucose'] = df['avg_glucose_level'].apply(lambda x: 1 if x > 140 else 0)

# 9. Combinación de edad avanzada y comorbilidades
df['elderly_comorbid'] = ((df['age'] > 65) & (df['comorbidities'] > 0)).astype(int)

# Separar las características y el objetivo
y = df['stroke']
X = df.drop('stroke', axis=1)

# 1. Identificar columnas categóricas (incluyendo smoking_status)
categorical_cols = X.select_dtypes(include=['object', 'category']).columns

# 2. Aplicar OneHotEncoder a las variables categóricas
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_data = encoder.fit_transform(X[categorical_cols])

# 3. Crear un DataFrame con las columnas codificadas
encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(categorical_cols))

# 4. Concatenar las columnas codificadas con las características numéricas
X_encoded = pd.concat([X.drop(categorical_cols, axis=1), encoded_df], axis=1)

# Dividir en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Calcular pesos de clase para manejar el desbalance
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights_dict = {0: class_weights[0], 1: class_weights[1]}

# Ahora puedes usar X_train, X_test, y_train, y_test para entrenar tu modelo



# Optimización de hiperparámetros con Optuna



XGBoost

In [25]:


# Función objetivo para XGBoost
def objective_xgb(trial):
    param = {
        'verbosity': 0,
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'booster': 'gbtree',
        'lambda': trial.suggest_loguniform('lambda', 1e-8, 1.0),
        'alpha': trial.suggest_loguniform('alpha', 1e-8, 1.0),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.5, 0.7, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.5, 0.7, 1.0]),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-8, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 9),
        'random_state': 42,
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'scale_pos_weight': class_weights_dict[1]  # Aplicar el upweighting
    }

    # Convertir datos a DMatrix para XGBoost
    train_dmatrix = xgb.DMatrix(X_train, label=y_train)
    valid_dmatrix = xgb.DMatrix(X_test, label=y_test)

    # Entrenar el modelo
    xgb_clf = xgb.train(param, train_dmatrix)

    # Predecir en el conjunto de prueba
    preds = xgb_clf.predict(valid_dmatrix)
    pred_labels = np.rint(preds)

    # Evaluar la métrica objetivo (f1_score en este caso)
    score = f1_score(y_test, pred_labels)
    return score

# Iniciar el estudio de Optuna para XGBoost
study_xgb = optuna.create_study(direction="maximize")
study_xgb.optimize(objective_xgb, n_trials=100)

# Extraer los mejores hiperparámetros
best_params_xgb = study_xgb.best_trial.params
print("Best hyperparameters for XGBoost: ", best_params_xgb)

[I 2024-10-16 10:36:00,707] A new study created in memory with name: no-name-743d9f70-0ca9-4825-ae5b-f2dc3e3247f9
  'lambda': trial.suggest_loguniform('lambda', 1e-8, 1.0),
  'alpha': trial.suggest_loguniform('alpha', 1e-8, 1.0),
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-8, 1.0),
[I 2024-10-16 10:36:00,832] Trial 0 finished with value: 0.0 and parameters: {'lambda': 0.0002637751701169119, 'alpha': 2.452886469210311e-05, 'colsample_bytree': 1.0, 'subsample': 0.5, 'learning_rate': 0.001281629158770003, 'n_estimators': 287, 'max_depth': 4, 'min_child_weight': 1}. Best is trial 0 with value: 0.0.
  'lambda': trial.suggest_loguniform('lambda', 1e-8, 1.0),
  'alpha': trial.suggest_loguniform('alpha', 1e-8, 1.0),
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-8, 1.0),
[I 2024-10-16 10:36:00,912] Trial 1 finished with value: 0.0 and parameters: {'lambda': 1.5097271719915534e-06, 'alpha': 0.6148857087826212, 'colsample_bytree': 1.0, 'subsample': 0.7, 'lear

Best hyperparameters for XGBoost:  {'lambda': 1.7150608101583486e-07, 'alpha': 8.495865316200164e-05, 'colsample_bytree': 0.7, 'subsample': 0.7, 'learning_rate': 0.15023631894267453, 'n_estimators': 266, 'max_depth': 6, 'min_child_weight': 7}


Regresión lineal

#Entrenamiento del modelo

In [26]:
# Entrenar el modelo XGBoost con los mejores hiperparámetros
xgb_model_optimized = xgb.XGBClassifier(**best_params_xgb)
xgb_model_optimized.fit(X_train, y_train)

# Predecir en el conjunto de prueba
y_pred_xgb_optimized = xgb_model_optimized.predict(X_test)

# Evaluar el rendimiento del modelo optimizado
from sklearn.metrics import classification_report

In [27]:
# Extraer importancias
importances = xgb_model_optimized.feature_importances_

# Crear un DataFrame para visualizar las importancias
feature_importances = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': importances
})

# Ordenar las características por importancia
feature_importances = feature_importances.sort_values(by='Importance', ascending=False)
print("Importancia de las características:")
print(feature_importances)

Importancia de las características:
                           Feature  Importance
28                   age_group_65+    0.093819
25                 age_group_19-35    0.083634
1                              age    0.044547
4                     ever_married    0.043095
24                  age_group_0-18    0.041301
2                     hypertension    0.037956
27                 age_group_51-65    0.037771
26                 age_group_36-50    0.032853
8                    comorbidities    0.032503
15               work_type_Private    0.030346
5                avg_glucose_level    0.030319
6                              bmi    0.030116
30              bmi_category_obese    0.029889
22     smoking_status_never smoked    0.029653
16         work_type_Self-employed    0.029330
31         bmi_category_overweight    0.027599
3                    heart_disease    0.027180
21  smoking_status_formerly smoked    0.026994
14              work_type_Govt_job    0.026599
29             bmi_categ

In [28]:
# Seleccionar las 5 características más importantes
top_n = 5  # Cambia este valor según lo que desees
important_features = feature_importances.head(top_n)['Feature'].tolist()

# Filtrar los conjuntos de datos
X_train_important = X_train[important_features]
X_test_important = X_test[important_features]

import numpy as np
from sklearn.metrics import classification_report

# Entrenar el modelo XGBoost
xgb_model_optimized = xgb.XGBClassifier(**best_params_xgb)
xgb_model_optimized.fit(X_train_important, y_train)

# Obtener las probabilidades de las predicciones
y_prob_xgb_optimized = xgb_model_optimized.predict_proba(X_test_important)[:, 1]

# Definir un nuevo umbral
new_threshold = 0.165  # Ajusta este valor según sea necesario

# Clasificar las predicciones basadas en el nuevo umbral
y_pred_xgb_adjusted = (y_prob_xgb_optimized >= new_threshold).astype(int)

# Evaluar el rendimiento del modelo con el nuevo umbral
print("\nXGBoost Classification Report with Adjusted Threshold:")
print(classification_report(y_test, y_pred_xgb_adjusted))

from sklearn.metrics import accuracy_score

# Predecir en el conjunto de entrenamiento
y_pred_train = xgb_model_optimized.predict(X_train_important)

# Calcular la precisión en el conjunto de entrenamiento
accuracy_train = accuracy_score(y_train, y_pred_train)

# Calcular la precisión en el conjunto de prueba con el nuevo umbral
accuracy_test = accuracy_score(y_test, y_pred_xgb_adjusted)

# Calcular el porcentaje de overfitting
overfitting_percentage = (accuracy_train - accuracy_test) / accuracy_train * 100

# Imprimir los resultados
print(f"Accuracy en el conjunto de entrenamiento: {accuracy_train:.2f}")
print(f"Accuracy en el conjunto de prueba: {accuracy_test:.2f}")
print(f"Porcentaje de overfitting: {overfitting_percentage:.2f}%")


XGBoost Classification Report with Adjusted Threshold:
              precision    recall  f1-score   support

           0       0.95      0.94      0.95       943
           1       0.16      0.19      0.17        54

    accuracy                           0.90       997
   macro avg       0.56      0.56      0.56       997
weighted avg       0.91      0.90      0.91       997

Accuracy en el conjunto de entrenamiento: 0.95
Accuracy en el conjunto de prueba: 0.90
Porcentaje de overfitting: 5.11%


A PARTIR DE AQUÍ SON PRUEBAS :)

In [29]:
import pickle

# Guardar el modelo XGBoost entrenado en un archivo pickle
with open('xgb_model_optimized2.pkl', 'wb') as file:
    pickle.dump(xgb_model_optimized, file)

print("Modelo XGBoost guardado exitosamente en 'xgb_model_optimized.pkl'")


Modelo XGBoost guardado exitosamente en 'xgb_model_optimized.pkl'


In [30]:
import pickle
from sklearn.metrics import classification_report, accuracy_score

# Cargar el modelo desde el archivo pickle
with open('xgb_model_optimized_1.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

# Obtener las probabilidades de las predicciones
y_prob_xgb = loaded_model.predict_proba(X_test_important)[:, 1]

# Definir un nuevo umbral
new_threshold = 0.165  # Ajusta este valor según sea necesario

# Clasificar las predicciones basadas en el nuevo umbral
y_pred_loaded_model = (y_prob_xgb >= new_threshold).astype(int)

# Evaluar el rendimiento del modelo cargado
accuracy = accuracy_score(y_test, y_pred_loaded_model)
print(f"Accuracy del modelo cargado: {accuracy:.4f}")

# Imprimir el informe de clasificación
print("\nClassification Report del modelo cargado:")
print(classification_report(y_test, y_pred_loaded_model))

ValueError: feature_names mismatch: ['age', 'elderly_comorbid', 'hypertension', 'work_type_Private', 'smoking_status_never smoked'] ['age_group_65+', 'age_group_19-35', 'age', 'ever_married', 'age_group_0-18']
expected elderly_comorbid, work_type_Private, smoking_status_never smoked, hypertension in input data
training data did not have the following fields: age_group_19-35, age_group_0-18, ever_married, age_group_65+

In [18]:
print(X_train.columns.tolist())  # Lista de columnas del conjunto de entrenamiento


['gender', 'age', 'hypertension', 'heart_disease', 'ever_married', 'avg_glucose_level', 'bmi', 'hypertension_heart_disease', 'comorbidities', 'obese', 'smoker_status', 'high_risk_lifestyle', 'high_glucose', 'elderly_comorbid', 'work_type_Govt_job', 'work_type_Private', 'work_type_Self-employed', 'work_type_children', 'Residence_type_Rural', 'Residence_type_Urban', 'smoking_status_Unknown', 'smoking_status_formerly smoked', 'smoking_status_never smoked', 'smoking_status_smokes', 'age_group_0-18', 'age_group_19-35', 'age_group_36-50', 'age_group_51-65', 'age_group_65+', 'bmi_category_normal', 'bmi_category_obese', 'bmi_category_overweight', 'bmi_category_underweight']


In [19]:
print(X_test_important.columns.tolist())  # Lista de columnas del conjunto de prueba


['age_group_19-35', 'age_group_0-18', 'age_group_65+', 'age', 'ever_married']
