In [28]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, brier_score_loss
from imblearn.over_sampling import ADASYN
from imblearn.pipeline import Pipeline as ImbPipeline

# Cargar el dataset
df = pd.read_csv(r'C:\Users\Administrator\OneDrive\Documentos\GitHub\G5_D.Scientist\data\stroke_dataset_processed.csv')

# Separar características y variable objetivo
X = df.drop('stroke', axis=1)
y = df['stroke']

# Codificación de variables categóricas
le = LabelEncoder()
X['gender'] = le.fit_transform(X['gender'])
X['smoking_status'] = le.fit_transform(X['smoking_status'])

# Ingeniería de características
X['age_squared'] = X['age'] ** 2
X['glucose_age_interaction'] = X['age'] * X['avg_glucose_level']

# División en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Función para encontrar el umbral óptimo
def find_optimal_threshold(y_true, y_pred_proba):
    thresholds = np.linspace(0, 1, 100)
    f1_scores = [f1_score(y_true, y_pred_proba >= threshold) for threshold in thresholds]
    optimal_threshold = thresholds[np.argmax(f1_scores)]
    return optimal_threshold

# Modificar el pipeline y los parámetros de búsqueda
pipeline = ImbPipeline([
    ('scaler', StandardScaler()),
    ('sampler', ADASYN(random_state=42)),
    ('classifier', LogisticRegression(random_state=42, max_iter=1000))
])

param_grid = {
    'classifier__C': np.logspace(-3, -1, 20),  # Aumentar la regularización
    'classifier__penalty': ['l1', 'l2', 'elasticnet'],
    'classifier__solver': ['saga'],
    'classifier__l1_ratio': np.linspace(0, 1, 5)
}

# Realizar la búsqueda de hiperparámetros (mantén este código igual)

# Modificar el ensemble
nb = GaussianNB()
svm = SVC(kernel='linear', probability=True, random_state=42, C=0.1)  # Aumentar regularización
rf = RandomForestClassifier(n_estimators=100, max_depth=5, min_samples_leaf=5, random_state=42)  # Limitar profundidad

ensemble = VotingClassifier(
    estimators=[
        ('lr', best_model),
        ('nb', nb),
        ('svm', svm),
        ('rf', rf)
    ],
    voting='soft'
)

# Función para evaluar el modelo
def evaluate_model(model, X, y, dataset_name):
    y_pred_proba = model.predict_proba(X)[:, 1]
    y_pred = (y_pred_proba >= 0.5).astype(int)  # Usar un umbral fijo de 0.5
    
    accuracy = accuracy_score(y, y_pred)
    auc_roc = roc_auc_score(y, y_pred_proba)
    brier = brier_score_loss(y, y_pred_proba)
    
    print(f"\n{dataset_name} Set Performance:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"ROC-AUC: {auc_roc:.4f}")
    print(f"Brier Score: {brier:.4f}")
    
    return accuracy, auc_roc, brier

# Entrenar y evaluar el ensemble
ensemble.fit(X_train, y_train)
train_accuracy, train_auc, train_brier = evaluate_model(ensemble, X_train, y_train, "Training")
test_accuracy, test_auc, test_brier = evaluate_model(ensemble, X_test, y_test, "Test")

# Calcular el overfitting
print("\nOverfitting Metrics:")
print(f"Accuracy Overfitting (Train - Test): {train_accuracy - test_accuracy:.4f}")
print(f"AUC Overfitting (Train - Test): {train_auc - test_auc:.4f}")
print(f"Brier Score Difference (Train - Test): {train_brier - test_brier:.4f}")


# Visualizar la importancia de las características usando Random Forest
rf_model = ensemble.named_estimators_['rf']
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': rf_model.feature_importances_
})
feature_importance = feature_importance.sort_values('importance', ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(x='importance', y='feature', data=feature_importance)
plt.title('Feature Importance')
plt.tight_layout()
plt.savefig('feature_importance.png')
plt.close()

# Visualizar la curva ROC
from sklearn.metrics import roc_curve

y_pred_proba = ensemble.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {test_auc:.2f})')
plt.plot([0, 1], [0, 1], linestyle='--', label='Random Classifier')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend()
plt.savefig('roc_curve.png')
plt.close()

# Guardar el modelo ensemble y el label encoder
joblib.dump(ensemble, 'best_ensemble_model.joblib')
joblib.dump(le, 'label_encoder.joblib')



Training Set Performance:
Accuracy: 0.8826
ROC-AUC: 0.8135
Brier Score: 0.0799

Test Set Performance:
Accuracy: 0.8975
ROC-AUC: 0.8260
Brier Score: 0.0737

Overfitting Metrics:
Accuracy Overfitting (Train - Test): -0.0149
AUC Overfitting (Train - Test): -0.0125
Brier Score Difference (Train - Test): 0.0063


['label_encoder.joblib']