# **Heart Disease**

## Tabela de Conteúdo
1) Importação de bibliotecas + dataset
2) EDA (Análise Exploratória de Dados)
3) Pré-processamento de Dados
    - Limpeza de Dados
    - Normalização de Dados
    - Codificação de Dados
    - Engenharia de Features
    - Divisão de Dados
    - Aumento de Dados
4) Seleção do Modelo
    - Treinamento de Modelos (usando validação cruzada)
    - Comparação de Modelos (múltiplos algoritmos)
    - Otimização de Hiperparâmetros
    - Avaliação do Modelo
5) Salvamento do Melhor Modelo

## 1) Importação de bibliotecas e dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from collections import Counter
import ydata_profiling as pp
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,accuracy_score,roc_curve,classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from mlxtend.classifier import StackingCVClassifier

In [2]:
# Read the processed Cleveland heart-disease table and preview its first rows.
dataset_path = './db/processed.cleveland.csv'
data = pd.read_csv(dataset_path)
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


## 2) Análise Exploratória de Dados (EDA)

In [None]:
# Report the table dimensions, then the dtype pandas inferred for each column.
table_shape = data.shape
print(f"Dataset shape: {table_shape}")
print("\nData types:", data.dtypes, sep="\n")


In [None]:
# Count null entries per column.
null_counts = data.isnull().sum()
print("\nMissing values:", null_counts, sep="\n")

# Summary statistics as computed by DataFrame.describe().
summary_stats = data.describe()
print("\nBasic statistics:", summary_stats, sep="\n")

In [None]:
# Absolute and relative (rounded to 2 decimals) frequency of each target value.
target_column = data['target']
target_counts = target_column.value_counts()
target_shares = target_column.value_counts(normalize=True).round(2)

print("\nTarget variable distribution:")
print(target_counts)
print(target_shares)

In [None]:
def _factorize_if_object(column):
    """Return integer codes for an object-dtype column; pass other columns through."""
    if column.dtype == 'object':
        return pd.factorize(column)[0]
    return column


# Pairwise correlations over every column (text columns are integer-coded first).
correlation_matrix = data.apply(_factorize_if_object).corr()

plt.figure(figsize=(14, 10))
heatmap_axis = sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
heatmap_axis.set_title('Correlation Matrix')
plt.tight_layout()
plt.show()

In [None]:
# Histogram (with KDE overlay) of every int64/float64 column, coloured by target.
numerical_features = data.select_dtypes(include=['int64', 'float64']).columns

# Size the subplot grid from the number of columns. A fixed 4x3 grid only has
# 12 slots, and plt.subplot raises ValueError as soon as a 13th plot is
# requested (the loaded table has more numeric columns than that).
hist_grid_cols = 3
hist_grid_rows = max(1, -(-len(numerical_features) // hist_grid_cols))  # ceiling division

plt.figure(figsize=(15, 3 * hist_grid_rows))
for i, feature in enumerate(numerical_features):
    plt.subplot(hist_grid_rows, hist_grid_cols, i + 1)
    sns.histplot(data=data, x=feature, hue='target', kde=True)
    plt.title(f'Distribution of {feature}')
plt.tight_layout()
plt.show()

In [None]:
# Collect the object-dtype columns, leaving the label out if it is one of them.
object_columns = data.select_dtypes(include=['object']).columns
if 'target' in object_columns:
    categorical_features = object_columns.drop('target')
else:
    categorical_features = object_columns

# One count plot per categorical column on a 4x2 grid, bars split by target.
plt.figure(figsize=(15, 10))
for slot, feature in enumerate(categorical_features, start=1):
    plt.subplot(4, 2, slot)
    sns.countplot(data=data, x=feature, hue='target')
    plt.title(f'Distribution of {feature}')
    plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# Filled density curves of age, one per target value.
plt.figure(figsize=(10, 6))
age_axis = sns.kdeplot(data=data, x='age', hue='target', fill=True)
age_axis.set_title('Age Distribution by Heart Disease Status')
age_axis.set_xlabel('Age')
age_axis.set_ylabel('Density')
plt.show()

In [None]:
# Box plot of each numeric column (binary flags excluded) grouped by target.
binary_flags = ('sex', 'fbs', 'exang')
boxplot_features = [feature for feature in numerical_features if feature not in binary_flags]

# Size the grid from the number of plotted columns. A fixed 3x3 grid only has
# 9 slots, and plt.subplot raises ValueError once a 10th plot is requested.
box_grid_cols = 3
box_grid_rows = max(1, -(-len(boxplot_features) // box_grid_cols))  # ceiling division

plt.figure(figsize=(15, 3.5 * box_grid_rows))
for subplot_count, feature in enumerate(boxplot_features, start=1):
    plt.subplot(box_grid_rows, box_grid_cols, subplot_count)
    sns.boxplot(data=data, x='target', y=feature)
    plt.title(f'{feature} by Heart Disease Status')
plt.tight_layout()
plt.show()

In [None]:
# Build the ydata-profiling report for the raw table and embed it in the notebook.
report_title = "Heart Disease Dataset Profiling Report"
profile = pp.ProfileReport(data, title=report_title)
profile.to_notebook_iframe()

In [None]:
# Turn the '?' placeholder into a real missing value. Writing the string "NA"
# instead would leave the cells non-null, so the isnull()/fillna() calls in the
# pre-processing cells would never see them.
data.replace('?', np.nan, inplace=True)

## 3) Pré-processamento de Dados

In [None]:
# Show which columns currently hold null entries.
missing_values = data.isnull().sum()
print("Missing values in each column:")
print(missing_values[missing_values > 0])

# 'ca': force to numeric (anything unparsable becomes NaN), then fill with the
# median. The result is assigned back instead of calling fillna(inplace=True)
# on the column selection: that chained in-place form is deprecated in pandas
# and does not modify the frame when Copy-on-Write is active.
data['ca'] = pd.to_numeric(data['ca'], errors='coerce')
data['ca'] = data['ca'].fillna(data['ca'].median())

# 'thal': same coercion so textual placeholders become NaN too, then fill with
# the most frequent value.
data['thal'] = pd.to_numeric(data['thal'], errors='coerce')
data['thal'] = data['thal'].fillna(data['thal'].mode()[0])

print("\nMissing values after imputation:")
print(data.isnull().sum().sum())

### Limpeza de Dados

Converta variáveis categóricas em formato numérico para modelos de aprendizado de máquina.

In [None]:
# List the columns pandas still treats as text.
categorical_cols = data.select_dtypes(include=['object']).columns
print("Categorical columns:", categorical_cols.tolist())

# Drop rows without a label BEFORE binarising. `NaN > 0` evaluates to False,
# so binarising first would silently turn a missing label into class 0 and a
# later dropna would have nothing left to remove.
data_encoded = data.dropna(subset=['target'])

# One-hot encode 'cp' and 'thal', dropping the first level of each.
data_encoded = pd.get_dummies(data_encoded, columns=['cp', 'thal'], drop_first=True)

# Collapse the label to binary: 0 stays 0, any value above 0 becomes 1.
data_encoded['target'] = (data_encoded['target'] > 0).astype(int)

print("\nShape after encoding:", data_encoded.shape)
print("\nEncoded columns:", data_encoded.columns.tolist())

### Normalização de Dados

Padronize características numéricas para que média = 0 e desvio padrão = 1.

In [None]:
# Columns that get standardised with StandardScaler.
scaler = StandardScaler()
numerical_features_to_scale = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'ca']

# NOTE(review): the scaler is fitted on the full table here, before the
# train/test split performed further down, so test rows contribute to the
# fitted statistics. Consider fitting on the training split only.
standardised_values = scaler.fit_transform(data_encoded[numerical_features_to_scale])
data_encoded[numerical_features_to_scale] = standardised_values

# Display the summary of the scaled columns, one row per feature.
data_encoded[numerical_features_to_scale].describe().T

### Engenharia de Features

Cria novas features que possam ser úteis para a previsão.

In [None]:
# Age bands are cut from the `data` frame's age column (not the standardised
# copy in `data_encoded`), then one-hot encoded with the first band dropped.
age_bin_edges = [0, 40, 55, 65, 100]
age_bin_labels = ['Young', 'Middle-aged', 'Senior', 'Elderly']
data_encoded['AgeGroup'] = pd.cut(data['age'], bins=age_bin_edges, labels=age_bin_labels)
data_encoded = pd.get_dummies(data_encoded, columns=['AgeGroup'], drop_first=True)

# Ratio features, likewise computed from the `data` frame's columns.
age_values = data['age']
data_encoded['BP_per_Age'] = data['trestbps'] / age_values
data_encoded['HR_per_Age'] = data['thalach'] / age_values

print("Dataset shape after feature engineering:", data_encoded.shape)
data_encoded.head()

### Divisão de Dados

Divide o conjunto de dados em conjuntos de treinamento e teste.

In [None]:
# Separate the label from the predictors.
X = data_encoded.drop('target', axis=1)
# 'Unnamed: 0' is the row index that was saved into the CSV and read back as
# an ordinary column; it is not a predictor, so keep it out of the model
# inputs. errors='ignore' makes this a no-op when the column is absent.
X = X.drop(columns=['Unnamed: 0'], errors='ignore')
y = data_encoded['target']

# 80/20 split, stratified on the label so both parts keep the class ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

print("\nClass distribution in training set:")
print(y_train.value_counts(normalize=True))
print("\nClass distribution in test set:")
print(y_test.value_counts(normalize=True))

### Aumento de Dados

Implementa técnicas de aumento de dados para lidar com desequilíbrio de classes, se necessário.

In [None]:
class_counts = y_train.value_counts()
print("Class distribution before augmentation:")
print(class_counts)

# Gap between the two classes, expressed as a fraction of the training rows.
imbalance_share = abs(class_counts[0] - class_counts[1]) / len(y_train)

if imbalance_share > 0.2:  # If imbalance exceeds 20%
    from imblearn.over_sampling import SMOTE

    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
    resampled_counts = pd.Series(y_train_resampled).value_counts()

    print("\nClass distribution after SMOTE:")
    print(resampled_counts)

    # From here on the training variables refer to the resampled set.
    X_train, y_train = X_train_resampled, y_train_resampled

    print(f"\nNew X_train shape: {X_train.shape}")
else:
    print("\nNo significant class imbalance detected. Skipping augmentation.")

## 4) Seleção do Modelo

Nesta seção, construiremos e avaliaremos múltiplos modelos de aprendizado de máquina para predição de doenças cardíacas:
1. Treinar modelos usando validação cruzada
2. Comparar o desempenho dos modelos
3. Ajustar os hiperparâmetros dos melhores modelos
4. Avaliar o desempenho final do modelo

### Treinamento de Modelos (usando validação cruzada)

Treinaremos vários modelos de classificação usando validação cruzada k-fold para obter uma estimativa confiável de seu desempenho.

In [None]:
from sklearn.model_selection import cross_val_score, cross_validate, KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Candidate classifiers, keyed by the display name used in reports and plots.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(random_state=42),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Support Vector Machine': SVC(probability=True, random_state=42),
    'Naive Bayes': GaussianNB(),
    'XGBoost': XGBClassifier(random_state=42)
}

# Shuffled, stratified 5-fold splitter; the fixed seed keeps folds reproducible.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

cv_results = {}

for name, model in models.items():
    # A single cross_validate call fits each fold once and scores all three
    # metrics on that fit, instead of refitting every fold once per metric.
    fold_scores = cross_validate(
        model, X_train, y_train, cv=cv, scoring=['accuracy', 'roc_auc', 'f1']
    )
    cv_accuracy = fold_scores['test_accuracy']
    cv_roc_auc = fold_scores['test_roc_auc']
    cv_f1 = fold_scores['test_f1']

    # Per-fold score arrays, stored under the labels the comparison cells read.
    cv_results[name] = {
        'Accuracy': cv_accuracy,
        'ROC-AUC': cv_roc_auc,
        'F1 Score': cv_f1
    }

    print(f"Model: {name}")
    print(f"Mean Accuracy: {cv_accuracy.mean():.4f} (±{cv_accuracy.std():.4f})")
    print(f"Mean ROC-AUC: {cv_roc_auc.mean():.4f} (±{cv_roc_auc.std():.4f})")
    print(f"Mean F1 Score: {cv_f1.mean():.4f} (±{cv_f1.std():.4f})")
    print('-'*50)

### Comparação de Modelos

Vamos comparar o desempenho de diferentes modelos com base nos resultados da validação cruzada.

In [None]:
def _scores_by_model(metric_name):
    """Return a frame with one column per model holding its per-fold scores."""
    return pd.DataFrame(
        {model_name: fold_scores[metric_name] for model_name, fold_scores in cv_results.items()}
    )


cv_accuracy_df = _scores_by_model('Accuracy')
cv_roc_auc_df = _scores_by_model('ROC-AUC')
cv_f1_df = _scores_by_model('F1 Score')

# Mean of each metric per model, best ROC-AUC first.
mean_columns = {
    'Mean Accuracy': cv_accuracy_df.mean(),
    'Mean ROC-AUC': cv_roc_auc_df.mean(),
    'Mean F1 Score': cv_f1_df.mean(),
}
cv_means = pd.DataFrame(mean_columns).sort_values(by='Mean ROC-AUC', ascending=False)

# Fold-to-fold standard deviation of each metric per model.
std_columns = {
    'Std Accuracy': cv_accuracy_df.std(),
    'Std ROC-AUC': cv_roc_auc_df.std(),
    'Std F1 Score': cv_f1_df.std(),
}
cv_std = pd.DataFrame(std_columns)

print("Models ranked by ROC-AUC score:")
print(cv_means)

In [None]:
# (per-fold score frame, figure title, y-axis label) for each metric to draw.
cv_boxplot_specs = [
    (cv_accuracy_df, 'Cross-Validation Accuracy Comparison', 'Accuracy'),
    (cv_roc_auc_df, 'Cross-Validation ROC-AUC Comparison', 'ROC-AUC'),
    (cv_f1_df, 'Cross-Validation F1 Score Comparison', 'F1 Score'),
]

# One figure per metric: a box per model showing the spread across folds.
for score_frame, figure_title, metric_label in cv_boxplot_specs:
    plt.figure(figsize=(14, 6))
    ax = sns.boxplot(data=score_frame)
    ax.set_title(figure_title, fontsize=16)
    ax.set_xlabel('Model', fontsize=12)
    ax.set_ylabel(metric_label, fontsize=12)
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

### Otimização de Hiperparâmetros

Vamos selecionar os modelos de melhor desempenho da nossa comparação e otimizar seus hiperparâmetros usando a Pesquisa em Grade ou a Pesquisa Aleatória com um espaço de parâmetros simplificado para evitar problemas de memória.

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import make_scorer, roc_auc_score

print("Tuning XGBoost hyperparameters...")

# Deliberately small search space: only max_depth varies.
xgb_param_grid = dict(
    learning_rate=[0.1],
    max_depth=[3, 5],
    n_estimators=[100],
)

# Settings passed to the randomized search (3-fold CV, ROC-AUC, single process).
xgb_search_options = dict(
    n_iter=2,
    scoring='roc_auc',
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=1,
)

try:
    xgb_random = RandomizedSearchCV(
        estimator=XGBClassifier(random_state=42),
        param_distributions=xgb_param_grid,
        **xgb_search_options,
    )
    xgb_random.fit(X_train, y_train)

    print(f"Best parameters: {xgb_random.best_params_}")
    print(f"Best score: {xgb_random.best_score_:.4f}")
    print("-"*50)

    best_xgb = xgb_random.best_estimator_

except Exception as e:
    # Best-effort fallback: report the failure and train a fixed configuration.
    print(f"Error during XGBoost tuning: {e}")
    print("Using default XGBoost model instead")
    fallback_xgb_params = dict(random_state=42, learning_rate=0.1, max_depth=3, n_estimators=100)
    best_xgb = XGBClassifier(**fallback_xgb_params)
    best_xgb.fit(X_train, y_train)

In [None]:
print("Tuning Random Forest hyperparameters...")

# Deliberately small search space: only max_depth varies.
rf_param_grid = dict(
    n_estimators=[100],
    max_depth=[None, 10],
    min_samples_split=[2],
)

# Settings passed to the randomized search (3-fold CV, ROC-AUC, single process).
rf_search_options = dict(
    n_iter=2,
    scoring='roc_auc',
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=1,
)

try:
    rf_random = RandomizedSearchCV(
        estimator=RandomForestClassifier(random_state=42),
        param_distributions=rf_param_grid,
        **rf_search_options,
    )
    rf_random.fit(X_train, y_train)

    print(f"Best parameters: {rf_random.best_params_}")
    print(f"Best score: {rf_random.best_score_:.4f}")
    print("-"*50)

    best_rf = rf_random.best_estimator_

except Exception as e:
    # Best-effort fallback: report the failure and train a fixed configuration.
    print(f"Error during Random Forest tuning: {e}")
    print("Using default Random Forest model instead")
    fallback_rf_params = dict(random_state=42, n_estimators=100)
    best_rf = RandomForestClassifier(**fallback_rf_params)
    best_rf.fit(X_train, y_train)

In [None]:
print("Tuning Logistic Regression hyperparameters...")

# Two sub-spaces: L2-regularised fits over a few C values, and an
# unregularised fit. The unregularised case is requested with penalty=None;
# the string 'none' is no longer accepted by current scikit-learn releases and
# makes every candidate drawn from that sub-space fail to fit.
lr_param_grid = [
    {'penalty': ['l2'], 'C': [0.1, 1, 10], 'solver': ['lbfgs'], 'max_iter': [1000]},
    {'penalty': [None], 'solver': ['lbfgs'], 'max_iter': [1000]}
]

try:
    # Randomized search: 2 sampled candidates, 3-fold CV, scored by ROC-AUC.
    lr_random = RandomizedSearchCV(
        estimator=LogisticRegression(random_state=42),
        param_distributions=lr_param_grid,
        n_iter=2,
        scoring='roc_auc',
        cv=3,
        verbose=1,
        random_state=42,
        n_jobs=1
    )

    lr_random.fit(X_train, y_train)

    print(f"Best parameters: {lr_random.best_params_}")
    print(f"Best score: {lr_random.best_score_:.4f}")
    print("-"*50)

    best_lr = lr_random.best_estimator_

except Exception as e:
    # Best-effort fallback: report the failure and train a fixed configuration.
    print(f"Error during Logistic Regression tuning: {e}")
    print("Using default Logistic Regression model instead")
    best_lr = LogisticRegression(
        random_state=42,
        max_iter=1000
    )
    best_lr.fit(X_train, y_train)

### Avaliação do Modelo

Agora, vamos avaliar o desempenho dos nossos modelos ajustados no conjunto de teste.

In [None]:
def evaluate_model(model, X_test, y_test):
    """Score a fitted classifier on a held-out set.

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        X_test: Feature matrix to predict on.
        y_test: True labels for ``X_test``.

    Returns:
        dict with keys ``accuracy``, ``precision``, ``recall``, ``f1_score``,
        ``roc_auc``, ``confusion_matrix``, ``y_pred`` (predicted labels) and
        ``y_pred_proba`` (second column of ``predict_proba``).
    """
    predicted_labels = model.predict(X_test)
    # Column 1 of predict_proba is used as the score for ROC-AUC.
    positive_scores = model.predict_proba(X_test)[:, 1]

    return {
        'accuracy': accuracy_score(y_test, predicted_labels),
        'precision': precision_score(y_test, predicted_labels),
        'recall': recall_score(y_test, predicted_labels),
        'f1_score': f1_score(y_test, predicted_labels),
        'roc_auc': roc_auc_score(y_test, positive_scores),
        'confusion_matrix': confusion_matrix(y_test, predicted_labels),
        'y_pred': predicted_labels,
        'y_pred_proba': positive_scores,
    }

print("Evaluating the best models on the test set...")


def _evaluate_and_report(display_name, fitted_model):
    """Evaluate ``fitted_model`` on the test split, print its metrics, return them."""
    results = evaluate_model(fitted_model, X_test, y_test)

    print(f"{display_name} Performance:")
    metric_rows = (
        ("Accuracy", 'accuracy'),
        ("Precision", 'precision'),
        ("Recall", 'recall'),
        ("F1 Score", 'f1_score'),
        ("ROC-AUC", 'roc_auc'),
    )
    for heading, key in metric_rows:
        print(f"{heading}: {results[key]:.4f}")
    print("Confusion Matrix:")
    print(results['confusion_matrix'])
    print("-"*50)

    return results


# Same order as before: each model is evaluated and reported before the next.
xgb_eval = _evaluate_and_report("XGBoost", best_xgb)
rf_eval = _evaluate_and_report("Random Forest", best_rf)
lr_eval = _evaluate_and_report("Logistic Regression", best_lr)

In [None]:
plt.figure(figsize=(10, 8))
roc_axis = plt.gca()

# (legend name, evaluation results, line colour) for each tuned model.
models_eval = [
    ('XGBoost', xgb_eval, 'blue'),
    ('Random Forest', rf_eval, 'green'),
    ('Logistic Regression', lr_eval, 'red')
]

for name, eval_results, color in models_eval:
    fpr, tpr, _ = roc_curve(y_test, eval_results['y_pred_proba'])
    auc_value = eval_results['roc_auc']
    roc_axis.plot(fpr, tpr, color=color, lw=2, label=f'{name} (AUC = {auc_value:.3f})')

# Diagonal reference line.
roc_axis.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')

roc_axis.set_xlim([0.0, 1.0])
roc_axis.set_ylim([0.0, 1.05])
roc_axis.set_xlabel('False Positive Rate')
roc_axis.set_ylabel('True Positive Rate')
roc_axis.set_title('Receiver Operating Characteristic (ROC) Curve')
roc_axis.legend(loc="lower right")
roc_axis.grid(True)
plt.show()

In [None]:
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# Tick labels shared by both axes of every matrix.
class_names = ['Negative', 'Positive']

# (panel title prefix, evaluation results, target axes) for each tuned model.
models_eval = list(zip(
    ['XGBoost', 'Random Forest', 'Logistic Regression'],
    [xgb_eval, rf_eval, lr_eval],
    axes,
))

# Draw each confusion matrix as an annotated heatmap on its own panel.
for name, eval_results, ax in models_eval:
    sns.heatmap(eval_results['confusion_matrix'], annot=True, fmt='d', cmap='Blues', ax=ax)

    ax.set_title(f'{name} Confusion Matrix')
    ax.set_xlabel('Predicted label')
    ax.set_ylabel('True label')
    ax.set_xticklabels(class_names)
    ax.set_yticklabels(class_names)

plt.tight_layout()
plt.show()

In [None]:
# Result-dict keys, in the row order of the summary table's index below.
summary_metric_keys = ['accuracy', 'precision', 'recall', 'f1_score', 'roc_auc']

# One column per tuned model, one row per metric.
model_comparison = pd.DataFrame({
    'XGBoost': [xgb_eval[key] for key in summary_metric_keys],
    'Random Forest': [rf_eval[key] for key in summary_metric_keys],
    'Logistic Regression': [lr_eval[key] for key in summary_metric_keys],
}, index=['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC-AUC'])

print("Model Comparison Summary:")
print(model_comparison)

# DataFrame.plot creates its own figure, so no separate plt.figure() call is
# made first; doing so only left an extra empty figure in the output.
comparison_axis = model_comparison.plot(kind='bar', figsize=(12, 8))
comparison_axis.set_title('Model Performance Comparison')
comparison_axis.set_ylabel('Score')
plt.xticks(rotation=0)
comparison_axis.set_ylim(0, 1.0)
comparison_axis.grid(axis='y', linestyle='--', alpha=0.7)
comparison_axis.legend(title='Models')
plt.tight_layout()
plt.show()

### Análise de Importância de Recursos

Vamos analisar quais características contribuem mais para as previsões do nosso melhor modelo.

In [None]:
# The tuned XGBoost model is the one inspected here.
best_model = best_xgb


def _plot_feature_ranking(values, value_column, plot_title):
    """Pair ``values`` with X's column names, sort descending, and draw a bar chart."""
    ranking = pd.DataFrame({
        'Feature': X.columns,
        value_column: values
    }).sort_values(by=value_column, ascending=False)

    plt.figure(figsize=(12, 8))
    sns.barplot(x=value_column, y='Feature', data=ranking)
    plt.title(plot_title)
    plt.tight_layout()
    plt.show()
    return ranking


# Use feature_importances_ when the model has it, else coef_, else report.
if hasattr(best_model, 'feature_importances_'):
    feature_importance = _plot_feature_ranking(
        best_model.feature_importances_, 'Importance', 'Feature Importance'
    )
elif hasattr(best_model, 'coef_'):
    coefficients = _plot_feature_ranking(
        best_model.coef_[0], 'Coefficient', 'Feature Coefficients'
    )
else:
    print("Feature importance not available for this model.")

In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Predição de Doenças Cardiovasculares com MLP\n",
    "\n",
    "Este notebook implementa o proposto no TCC: treina uma rede neural MLP e compara seu desempenho com a Regressão Logística usando a base de dados pública UCI Heart Disease."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Importação de Bibliotecas e Dados"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.neural_network import MLPClassifier\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "# Carregar o dataset\n",
    "data = pd.read_csv('./processed.cleveland.csv')\n",
    "data['ca'] = pd.to_numeric(data['ca'], errors='coerce')\n",
    "data['ca'].fillna(data['ca'].median(), inplace=True)\n",
    "data['thal'].fillna(data['thal'].mode()[0], inplace=True)\n",
    "# Binarizar target: 0 = sem doença, 1 = com doença\n",
    "data['target'] = (data['target'] > 0).astype(int)\n",
    "data = data.dropna(subset=['target'])"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Pré-processamento"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "# One-hot encoding para variáveis categóricas\n",
    "data_encoded = pd.get_dummies(data, columns=['cp', 'thal'], drop_first=True)\n",
    "\n",
    "# Normalização dos dados numéricos\n",
    "numerical_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'ca']\n",
    "scaler = StandardScaler()\n",
    "data_encoded[numerical_features] = scaler.fit_transform(data_encoded[numerical_features])"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "# Separar features e target\n",
    "X = data_encoded.drop('target', axis=1)\n",
    "y = data_encoded['target']\n",
    "\n",
    "# Dividir em treino e teste\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Treinamento dos Modelos"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "# Regressão Logística\n",
    "lr = LogisticRegression(max_iter=1000, random_state=42)\n",
    "lr.fit(X_train, y_train)\n",
    "\n",
    "# MLP\n",
    "mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)\n",
    "mlp.fit(X_train, y_train)"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Avaliação dos Modelos"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "def evaluate_model(model, X_test, y_test):\n",
    "    y_pred = model.predict(X_test)\n",
    "    y_pred_proba = model.predict_proba(X_test)[:, 1]\n",
    "    return {\n",
    "        'accuracy': accuracy_score(y_test, y_pred),\n",
    "        'precision': precision_score(y_test, y_pred),\n",
    "        'recall': recall_score(y_test, y_pred),\n",
    "        'f1': f1_score(y_test, y_pred),\n",
    "        'roc_auc': roc_auc_score(y_test, y_pred_proba),\n",
    "        'confusion_matrix': confusion_matrix(y_test, y_pred),\n",
    "        'y_pred_proba': y_pred_proba\n",
    "    }\n",
    "\n",
    "lr_eval = evaluate_model(lr, X_test, y_test)\n",
    "mlp_eval = evaluate_model(mlp, X_test, y_test)\n",
    "\n",
    "print('Logistic Regression:', lr_eval)\n",
    "print('MLP:', mlp_eval)"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Comparação Visual"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "# ROC Curve\n",
    "plt.figure(figsize=(8,6))\n",
    "for name, eval_result, color in [\n",
    "    ('Logistic Regression', lr_eval, 'blue'),\n",
    "    ('MLP', mlp_eval, 'red')]:\n",
    "    fpr, tpr, _ = roc_curve(y_test, eval_result['y_pred_proba'])\n",
    "    plt.plot(fpr, tpr, label=f'{name} (AUC={eval_result[\"roc_auc\"]:.2f})', color=color)\n",
    "plt.plot([0,1],[0,1],'k--')\n",
    "plt.xlabel('False Positive Rate')\n",
    "plt.ylabel('True Positive Rate')\n",
    "plt.title('ROC Curve')\n",
    "plt.legend()\n",
    "plt.grid(True)\n",
    "plt.show()"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "# Matriz de confusão\n",
    "fig, axes = plt.subplots(1, 2, figsize=(12, 5))\n",
    "for ax, (name, eval_result) in zip(axes, [('Logistic Regression', lr_eval), ('MLP', mlp_eval)]):\n",
    "    sns.heatmap(eval_result['confusion_matrix'], annot=True, fmt='d', cmap='Blues', ax=ax)\n",
    "    ax.set_title(f'{name} Confusion Matrix')\n",
    "    ax.set_xlabel('Predicted')\n",
    "    ax.set_ylabel('True')\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ],
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6. Conclusão\n",
    "\n",
    "- O notebook treina e compara uma MLP e uma Regressão Logística para predição de doença cardíaca.\n",
    "- Métricas e gráficos permitem avaliar qual modelo é mais adequado para o problema."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": ""
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}

## 5) Salvamento do Melhor Modelo

Vamos salvar nosso modelo de melhor desempenho em disco para que ele possa ser usado em produção ou compartilhado com outros.

In [None]:
import joblib
import os

# Test-set ROC-AUC of each tuned model.
# NOTE(review): the winner is chosen with the same test split used to report
# performance; consider selecting on cross-validation scores instead.
model_performances = {
    'Logistic Regression': lr_eval['roc_auc'],
    'Random Forest': rf_eval['roc_auc'],
    'XGBoost': xgb_eval['roc_auc']
}

best_model_name = max(model_performances, key=model_performances.get)
print(f"The best performing model is: {best_model_name} with ROC-AUC: {model_performances[best_model_name]:.4f}")

# Fitted estimators under the same names as the scores above.
model_objects = {
    'Logistic Regression': best_lr,
    'Random Forest': best_rf,
    'XGBoost': best_xgb
}

best_model = model_objects[best_model_name]

# Create the output directory if needed. exist_ok=True replaces the separate
# "does it exist?" check, so there is no window between check and creation.
save_dir = 'saved_models'
os.makedirs(save_dir, exist_ok=True)

# File name is derived from the model name, e.g. "random_forest_model.joblib".
model_filename = os.path.join(save_dir, f'{best_model_name.replace(" ", "_").lower()}_model.joblib')
joblib.dump(best_model, model_filename)

# Persist the fitted scaler alongside the model.
scaler_filename = os.path.join(save_dir, 'scaler.joblib')
joblib.dump(scaler, scaler_filename)

# Persist the list of scaled columns and the column order of X.
preprocessing_info = {
    'numerical_features_to_scale': numerical_features_to_scale,
    'feature_names': list(X.columns)
}
preprocessing_filename = os.path.join(save_dir, 'preprocessing_info.joblib')
joblib.dump(preprocessing_info, preprocessing_filename)

print(f"Model saved to {model_filename}")
print(f"Scaler saved to {scaler_filename}")
print(f"Preprocessing info saved to {preprocessing_filename}")

In [None]:
# Read the three saved artefacts back from disk.
loaded_model, loaded_scaler, loaded_preprocessing_info = (
    joblib.load(artefact_path)
    for artefact_path in (model_filename, scaler_filename, preprocessing_filename)
)

# Take one row of the test split (kept as a one-row frame) and its true label.
sample_index = 0
sample_X = X_test.iloc[[sample_index]]
sample_y = y_test.iloc[sample_index]

# Predict with the reloaded model.
sample_pred = loaded_model.predict(sample_X)[0]
sample_pred_proba = loaded_model.predict_proba(sample_X)[0, 1]

true_label_text = 'Heart Disease' if sample_y == 1 else 'No Heart Disease'
predicted_label_text = 'Heart Disease' if sample_pred == 1 else 'No Heart Disease'

print(f"Sample features: {sample_X.values[0]}")
print(f"True label: {true_label_text}")
print(f"Predicted label: {predicted_label_text}")
print(f"Predicted probability of Heart Disease: {sample_pred_proba:.4f}")