In [35]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, brier_score_loss
from imblearn.over_sampling import ADASYN
from imblearn.pipeline import Pipeline as ImbPipeline

# Cargar el dataset
df = pd.read_csv(r'C:\Users\Administrator\OneDrive\Documentos\GitHub\G5_D.Scientist\data\stroke_dataset_processed.csv')

# Separar características y variable objetivo
X = df.drop('stroke', axis=1)
y = df['stroke']

# Codificación de variables categóricas
le = LabelEncoder()
X['gender'] = le.fit_transform(X['gender'])
X['smoking_status'] = le.fit_transform(X['smoking_status'])

# Ingeniería de características
X['age_squared'] = X['age'] ** 2
X['glucose_age_interaction'] = X['age'] * X['avg_glucose_level']

# División en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Función para encontrar el umbral óptimo
def find_optimal_threshold(y_true, y_pred_proba):
    thresholds = np.linspace(0, 1, 100)
    f1_scores = [f1_score(y_true, y_pred_proba >= threshold) for threshold in thresholds]
    optimal_threshold = thresholds[np.argmax(f1_scores)]
    return optimal_threshold

# Modificar el pipeline y los parámetros de búsqueda
pipeline = ImbPipeline([
    ('scaler', StandardScaler()),
    ('sampler', ADASYN(random_state=42)),
    ('classifier', LogisticRegression(random_state=42, max_iter=1000))
])

param_grid = {
    'classifier__C': np.logspace(-3, -1, 20),  # Aumentar la regularización
    'classifier__penalty': ['l1', 'l2', 'elasticnet'],
    'classifier__solver': ['saga'],
    'classifier__l1_ratio': np.linspace(0, 1, 5)
}

# Realizar la búsqueda de hiperparámetros (mantén este código igual)

# Modificar el ensemble
nb = GaussianNB()
svm = SVC(kernel='linear', probability=True, random_state=42, C=0.1)  # Aumentar regularización
rf = RandomForestClassifier(n_estimators=100, max_depth=5, min_samples_leaf=5, random_state=42)  # Limitar profundidad

ensemble = VotingClassifier(
    estimators=[
        ('lr', best_model),
        ('nb', nb),
        ('svm', svm),
        ('rf', rf)
    ],
    voting='soft'
)

# Función para evaluar el modelo
def evaluate_model(model, X, y, dataset_name):
    y_pred_proba = model.predict_proba(X)[:, 1]
    y_pred = (y_pred_proba >= 0.5).astype(int)  # Usar un umbral fijo de 0.5
    
    accuracy = accuracy_score(y, y_pred)
    auc_roc = roc_auc_score(y, y_pred_proba)
    brier = brier_score_loss(y, y_pred_proba)
    
    print(f"\n{dataset_name} Set Performance:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"ROC-AUC: {auc_roc:.4f}")
    print(f"Brier Score: {brier:.4f}")
    
    return accuracy, auc_roc, brier

# Entrenar y evaluar el ensemble
ensemble.fit(X_train, y_train)
train_accuracy, train_auc, train_brier = evaluate_model(ensemble, X_train, y_train, "Training")
test_accuracy, test_auc, test_brier = evaluate_model(ensemble, X_test, y_test, "Test")

# Calcular el overfitting
print("\nOverfitting Metrics:")
print(f"Accuracy Overfitting (Train - Test): {train_accuracy - test_accuracy:.4f}")
print(f"AUC Overfitting (Train - Test): {train_auc - test_auc:.4f}")
print(f"Brier Score Difference (Train - Test): {train_brier - test_brier:.4f}")


# Visualizar la importancia de las características usando Random Forest
rf_model = ensemble.named_estimators_['rf']
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': rf_model.feature_importances_
})
feature_importance = feature_importance.sort_values('importance', ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(x='importance', y='feature', data=feature_importance)
plt.title('Feature Importance')
plt.tight_layout()
plt.savefig('feature_importance.png')
plt.close()

# Visualizar la curva ROC
from sklearn.metrics import roc_curve

y_pred_proba = ensemble.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {test_auc:.2f})')
plt.plot([0, 1], [0, 1], linestyle='--', label='Random Classifier')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend()
plt.savefig('roc_curve.png')
plt.close()

# Guardar el modelo ensemble y el label encoder
joblib.dump(ensemble, 'best_ensemble_model.joblib')
joblib.dump(le, 'label_encoder.joblib')



Training Set Performance:
Accuracy: 0.8826
ROC-AUC: 0.8135
Brier Score: 0.0799

Test Set Performance:
Accuracy: 0.8975
ROC-AUC: 0.8260
Brier Score: 0.0737

Overfitting Metrics:
Accuracy Overfitting (Train - Test): -0.0149
AUC Overfitting (Train - Test): -0.0125
Brier Score Difference (Train - Test): 0.0063


['label_encoder.joblib']

In [38]:
import joblib
import numpy as np
import pandas as pd

# Cargar el modelo y el label encoder
model = joblib.load('best_ensemble_model.joblib')
le = joblib.load('label_encoder.joblib')

# Definir el orden correcto de las columnas
column_order = ['gender', 'age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'smoking_status']

def get_input():
    while True:
        gender = input("Género (0 para Male, 1 para Female): ")
        if gender in ['0', '1']:
            gender = 'Male' if gender == '0' else 'Female'
            break
        else:
            print("Por favor, ingrese 0 para Male o 1 para Female.")

    while True:
        try:
            age = float(input("Edad: "))
            if 0 <= age <= 120:
                break
            else:
                print("Por favor, ingrese una edad válida entre 0 y 120.")
        except ValueError:
            print("Por favor, ingrese un número válido para la edad.")

    while True:
        hypertension = input("Hipertensión (0 para No, 1 para Sí): ")
        if hypertension in ['0', '1']:
            hypertension = int(hypertension)
            break
        else:
            print("Por favor, ingrese 0 para No o 1 para Sí.")

    while True:
        heart_disease = input("Enfermedad cardíaca (0 para No, 1 para Sí): ")
        if heart_disease in ['0', '1']:
            heart_disease = int(heart_disease)
            break
        else:
            print("Por favor, ingrese 0 para No o 1 para Sí.")

    while True:
        try:
            avg_glucose_level = float(input("Nivel promedio de glucosa: "))
            if 0 <= avg_glucose_level <= 500:
                break
            else:
                print("Por favor, ingrese un nivel de glucosa válido entre 0 y 500.")
        except ValueError:
            print("Por favor, ingrese un número válido para el nivel de glucosa.")

    while True:
        smoking_status = input("Estado de fumador (0: formerly smoked, 1: never smoked, 2: smokes): ")
        if smoking_status in ['0', '1', '2']:
            smoking_status = ['formerly smoked', 'never smoked', 'smokes'][int(smoking_status)]
            break
        else:
            print("Por favor, ingrese 0 para formerly smoked, 1 para never smoked, o 2 para smokes.")

    return gender, age, hypertension, heart_disease, avg_glucose_level, smoking_status

def predict_stroke(gender, age, hypertension, heart_disease, avg_glucose_level, smoking_status):
    # Codificar variables categóricas
    try:
        gender_encoded = le.transform([gender])[0]
    except ValueError:
        # Si la etiqueta no se ha visto antes, asignamos un valor arbitrario
        gender_encoded = -1

    try:
        smoking_status_encoded = le.transform([smoking_status])[0]
    except ValueError:
        # Si la etiqueta no se ha visto antes, asignamos un valor arbitrario
        smoking_status_encoded = -1

    # Crear un diccionario con los datos de entrada
    input_data = {
        'gender': gender_encoded,
        'age': age,
        'hypertension': hypertension,
        'heart_disease': heart_disease,
        'avg_glucose_level': avg_glucose_level,
        'smoking_status': smoking_status_encoded
    }

    # Crear un DataFrame con el orden correcto de las columnas
    input_df = pd.DataFrame([input_data])[column_order]

    # Añadir características ingenieradas
    input_df['age_squared'] = input_df['age'] ** 2
    input_df['glucose_age_interaction'] = input_df['age'] * input_df['avg_glucose_level']

    # Realizar la predicción
    prediction = model.predict_proba(input_df)[0]
    stroke_probability = prediction[1]

    return stroke_probability

if __name__ == "__main__":
    while True:
        print("\nIntroduzca los datos del paciente:")
        gender, age, hypertension, heart_disease, avg_glucose_level, smoking_status = get_input()
        
        probability = predict_stroke(gender, age, hypertension, heart_disease, avg_glucose_level, smoking_status)
        
        print(f"\nLa probabilidad de ictus es: {probability:.2%}")
        if probability > 0.5:
            print("Se recomienda consultar a un médico.")
        else:
            print("El riesgo parece ser bajo, pero siempre es bueno mantener hábitos saludables.")
        
        again = input("\n¿Desea hacer otra predicción? (s/n): ")
        if again.lower() != 's':
            break

    print("Gracias por usar el predictor de ictus.")


Introduzca los datos del paciente:

La probabilidad de ictus es: 51.23%
Se recomienda consultar a un médico.
Gracias por usar el predictor de ictus.
