## 1. Setup & Environment

In [None]:
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.inspection import permutation_importance

from pathlib import Path
sys.path.append(str(Path.cwd().parent))

from data_processing.disease_dataset_process import DataProcessor, CATEGORICAL_COLUMNS
from models_classes.mlp_disease_neural_net import DengueTabularNN
from models_classes.lgbm_classifier import LGBMDiseaseClassifier

# Report the installed PyTorch build and whether a CUDA device can be used.
cuda_available = torch.cuda.is_available()
print(f'PyTorch: {torch.__version__}')
print(f'CUDA built with: {torch.version.cuda}')
print(f'CUDA available: {cuda_available}')

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if cuda_available else 'cpu')
print(f'Using device: {device}')

## 2. Data Loading & Feature Engineering

`DataProcessor` handles loading the raw CSVs, renaming columns, dropping leaky/irrelevant fields, deriving date features, encoding categorical columns, binarizing symptoms/comorbidities, and mapping the target — all in one call.

In [None]:
# Which disease dataset to load and process.
type_disease = 'chikungunya'  # Change to 'dengue' for Dengue dataset

data_processor = DataProcessor(type_disease=type_disease)
# load_data_process() returns three values; the third one is not used in this notebook.
df, categorical_columns, _ = data_processor.load_data_process()

# Every column that is neither categorical nor the target is treated as numerical.
non_numerical_columns = categorical_columns + ['final_classification']
numerical_columns = df.drop(columns=non_numerical_columns).columns

## 3. Model Training — Neural Network

### 3.1 Convert to tensors and split

In [None]:
# Convert the processed frame into tensors plus the embedding sizes the model needs.
(
    categorical_tensors,
    numerical_tensors,
    target_tensor,
    embedding_sizes,
) = DengueTabularNN._prepare_data(df, categorical_columns)

# 90/10 shuffled split with a fixed seed so the same rows land in each partition.
split_tensors = train_test_split(
    categorical_tensors,
    numerical_tensors,
    target_tensor,
    test_size=0.1,
    shuffle=True,
    random_state=42,
)
x_train_cat, x_test_cat, x_train_num, x_test_num, y_train, y_test = split_tensors

print(f'Train size: {len(y_train)} | Test size: {len(y_test)}')

### 3.2 Instantiate model

In [None]:
# Seed PyTorch's global RNG (same seed as the train/test split) so that any random
# initialisation done while building the model, and subsequent draws from the global
# RNG when the cells run in order, are repeatable across Restart & Run All.
torch.manual_seed(42)

# Build the tabular network and move it to the selected device.
# NOTE(review): two dropout probabilities are given for four hidden layers —
# confirm how DengueTabularNN maps `probability_dropout` onto the layers.
dengue_model = DengueTabularNN(
    numericals_shape=x_train_num.shape[1],
    embedding_sizes=embedding_sizes,
    hidden_layers=[2048, 1024, 512, 256],
    probability_dropout=[0.1, 0.2],
).to(device)

### 3.3 Model architecture — `DengueTabularNN`

Tabular neural network with:
- **Embeddings** for categorical columns (each category gets a dense vector)
- **BatchNorm** for numerical inputs
- **4 hidden layers** (2048 → 1024 → 512 → 256) with LeakyReLU, BatchNorm, Dropout
- **Single logit output** for BCEWithLogitsLoss

### 3.4 Training loop

**Setup:** 90/10 train/test split, batch size 4096  
**Loss:** BCEWithLogitsLoss with `pos_weight` to handle class imbalance  
**Optimizer:** AdamW (lr=1e-4, weight_decay=1e-4)  
**Scheduler:** ReduceLROnPlateau (patience=3, factor=0.5, min_lr=1e-6)  
**Early stopping:** patience=8 on validation loss (computed on the 10% test split), saves best checkpoint

In [None]:
# Wrap the split tensors in datasets/loaders; only the training loader shuffles.
train_dataset = TensorDataset(x_train_cat, x_train_num, y_train)
train_loader = DataLoader(train_dataset, batch_size=4096, shuffle=True)

# NOTE(review): the test split doubles as the validation set for LR scheduling,
# early stopping and checkpoint selection, so metrics later reported on it are
# optimistically biased. Consider carving out a separate validation split.
test_dataset = TensorDataset(x_test_cat, x_test_num, y_test)
test_loader = DataLoader(test_dataset, batch_size=4096, shuffle=False)

# Weight the positive class by the negative/positive ratio of the training labels.
# Without positives the ratio would silently become inf and poison the loss.
n_negative = (y_train == 0).sum().float()
n_positive = (y_train == 1).sum().float()
if n_positive.item() == 0:
    raise ValueError('y_train contains no positive samples; cannot compute pos_weight')
pos_weight = n_negative / n_positive

criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight.to(device))
optimizer = torch.optim.AdamW(params=dengue_model.parameters(), lr=1e-4, weight_decay=1e-4)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, factor=0.5, min_lr=1e-6)

epochs = 150
train_losses = []
val_losses = []
patience = 8
counter = 0
best_val_loss = float('inf')

# NOTE(review): hardcoded absolute, machine-specific path. It is kept unchanged because
# the evaluation cell loads the checkpoint from this exact location; both should be
# switched together to a path derived from a configurable project directory.
checkpoint_path = f'C:\\Users\\angej\\Documents\\2_Programação\\health_index_project\\models_saved\\best_{type_disease}_model.pth'


def _run_epoch(loader, training):
    """Run one pass over `loader` and return the sample-weighted mean loss.

    Args:
        loader: DataLoader yielding (categorical, numerical, target) batches.
        training: when True the model is put in train mode and the optimizer is
            stepped after every batch; when False the model is put in eval mode
            and gradients are disabled.

    Returns:
        The summed per-batch loss (each weighted by its batch size) divided by
        the number of samples in `loader.dataset`.
    """
    dengue_model.train(training)
    running_loss = 0.0
    with torch.set_grad_enabled(training):
        for cat, num, target in loader:
            cat, num, target = cat.to(device), num.to(device), target.to(device)
            if training:
                optimizer.zero_grad()
            pred = dengue_model(cat, num)
            # Targets get a trailing dimension and float dtype before being compared
            # with the predictions (presumably (batch, 1) logits — confirm in the model).
            loss = criterion(pred, target.unsqueeze(1).float())
            if training:
                loss.backward()
                optimizer.step()
            # criterion returns a batch mean, so re-weight by batch size.
            running_loss += loss.item() * len(cat)
    return running_loss / len(loader.dataset)


for epoch in range(epochs):
    avg_train_loss = _run_epoch(train_loader, training=True)
    train_losses.append(avg_train_loss)

    avg_val_loss = _run_epoch(test_loader, training=False)
    val_losses.append(avg_val_loss)
    scheduler.step(avg_val_loss)

    # Read the learning rate through the optimizer's public API instead of the
    # scheduler's private `_last_lr` attribute.
    current_lr = optimizer.param_groups[0]['lr']
    # Log before the early-stopping check so the final epoch's metrics are shown too.
    print(f'Epoch: {epoch:3d} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f} | LR: {current_lr:.6f}')

    if avg_val_loss < best_val_loss:
        # New best validation loss: reset the patience counter and checkpoint the weights.
        best_val_loss = avg_val_loss
        counter = 0
        torch.save(dengue_model.state_dict(), checkpoint_path)
    else:
        counter += 1
        if counter >= patience:
            print(f'Early stopping at epoch {epoch}')
            break

### 3.5 Loss curves

In [None]:
# Plot the per-epoch training and validation loss recorded by the training loop.
sns.set_style('whitegrid')

fig, ax = plt.subplots(figsize=(9, 5), dpi=100)
sns.lineplot(x=range(1, len(train_losses) + 1), y=train_losses, label='Train Loss', ax=ax)
sns.lineplot(x=range(1, len(val_losses) + 1), y=val_losses, label='Validation Loss', ax=ax)

# Hide the frame and keep only a light dashed grid.
for spine in ax.spines.values():
    spine.set_visible(False)
ax.grid(True, which='both', linestyle='--', linewidth=0.5)
ax.set_xlabel('Epoch')
# The model is trained with BCEWithLogitsLoss, not mean squared error.
ax.set_ylabel('BCE Loss')
ax.legend()
ax.set_title('Training Loss')
plt.show()

## 4. Evaluation

### 4.1 Threshold sweep — Neural Network

Loads the best checkpoint and evaluates Accuracy, Precision, Recall, and F1 across thresholds from 0.30 to 0.60.

In [None]:
# NOTE(review): hardcoded absolute, machine-specific path. It is kept unchanged because
# the training cell saves the checkpoint to this exact location; both should be switched
# together to a path derived from a configurable project directory.
checkpoint_path = f'C:\\Users\\angej\\Documents\\2_Programação\\health_index_project\\models_saved\\best_{type_disease}_model.pth'

# map_location remaps the saved tensors onto the current device, so a checkpoint
# written on a GPU can still be restored on a CPU-only machine.
best_state_dict = torch.load(checkpoint_path, map_location=device, weights_only=True)
dengue_model.load_state_dict(best_state_dict)
display(dengue_model.evaluate(test_loader, y_test))

### 4.2 Permutation feature importance — Neural Network

Subsamples 2000 test records and computes permutation importance (100 repeats) via a sklearn-compatible wrapper inside `DengueTabularNN.plot_feature_importance`.

In [None]:
# Feature importance of the neural network, computed on the held-out test tensors.
dengue_model.plot_feature_importance(
    x_test_cat,
    x_test_num,
    y_test,
    categorical_columns,
    numerical_columns,
)

## 5. Baseline — LightGBM

Trains a gradient-boosted tree model on the same train/test split as a performance baseline.

In [None]:
# Fit the LightGBM baseline on the same training tensors used for the neural network.
lgbm = LGBMDiseaseClassifier()
lgbm.fit(
    x_train_cat,
    x_train_num,
    y_train,
    categorical_columns,
    numerical_columns,
)

### 5.1 Feature importance — LightGBM

In [None]:
# Show the 30 highest-ranked features of the fitted LightGBM baseline.
lgbm.plot_feature_importance(
    top_n=30,
)

### 5.2 Threshold sweep — LightGBM

In [None]:
# Evaluate the LightGBM baseline on the held-out test tensors.
lgbm.evaluate(
    x_test_cat,
    x_test_num,
    y_test,
    categorical_columns,
    numerical_columns,
)