### Model Training & evaluation notebook

### Create dataloaders for out datasets

In [1]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader

df_train = pd.read_csv('../data/processed/train.csv')
df_val = pd.read_csv('../data/processed/validation.csv')

In [24]:
batch_size = 512

In [25]:
df_train = df_train.drop(columns=['Unnamed: 0'])
X_train_tensor = torch.tensor(df_train.drop(columns=['loan_status']).values, dtype=torch.float)
y_train_tensor = torch.tensor(df_train['loan_status'].values, dtype=torch.float)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, persistent_workers=True, num_workers=10)

KeyError: "['Unnamed: 0'] not found in axis"

In [None]:
df_val = df_val.drop(columns=['Unnamed: 0'])
X_val_tensor = torch.tensor(df_val.drop(columns=['loan_status']).values, dtype=torch.float)
y_val_tensor = torch.tensor(df_val['loan_status'].values, dtype=torch.float)

val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, persistent_workers=True, num_workers=10)

In [None]:
import torch.nn as nn

pos_weight = 2.0
criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(pos_weight))

### Define model architecture with pytorch lightning

In [None]:
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl
from torch.optim import Adam
from sklearn.metrics import recall_score

In [26]:
class CreditRiskModel (pl.LightningModule):
    def __init__(self, input_dim = 23, hidden = 64, sigmoid_threashold=0.5):
        super().__init__()
        self.save_hyperparameters()
        self.model = nn.Sequential(
            nn.Linear(input_dim, hidden),
            nn.ReLU(), 
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, 1)
        )
        self.threashold = sigmoid_threashold
    
    def forward(self, x):
        logits = self.model(x)
        return logits.squeeze(1)
    
    def training_step(self, batch, batch_idx):
        X, y = batch
        logits = self.forward(X)
        # loss = F.binary_cross_entropy_with_logits(logits, y)
        loss = criterion(logits, y)
        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
        return loss
    
    def validation_step(self, batch, batch_idx):
        X, y = batch
        logits = self.forward(X)
        # loss = F.binary_cross_entropy_with_logits(logits, y)
        loss = criterion(logits, y)

        probs = torch.sigmoid(logits)
        preds = (probs > self.threashold).float()

        acc = (preds == y).float().mean()
        preds_np = preds.detach().cpu().numpy()
        y_np = y.detach().cpu().numpy()

        fnr = 1. - recall_score(y_np, preds_np)

        self.log('val_loss', loss, on_epoch=True, prog_bar=True)
        self.log('val_acc', acc, on_epoch=True, prog_bar=True)
        self.log('val_fnr', fnr, on_epoch=True, prog_bar=True)

        return loss
    
    def configure_optimizers(self):
        return Adam(self.parameters(), lr=1e-3)

In [28]:
from lightning.pytorch.loggers import TensorBoardLogger

model = CreditRiskModel(X_train_tensor.shape[1], 64, 0.4)

logger = TensorBoardLogger(save_dir="../models/model/lightning_logs", name="credit_risk_model")

trainer = pl.Trainer(
    max_epochs=80,
    accelerator='auto',
    default_root_dir="../models/model/",
    logger=logger
)
trainer.fit(model, train_loader, val_dataloaders=val_loader)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name  | Type       | Params | Mode 
---------------------------------------------
0 | model | Sequential | 5.8 K  | train
---------------------------------------------
5.8 K     Trainable params
0         Non-trainable params
5.8 K     Total params
0.023     Total estimated model params size (MB)
6         Modules in train mode
0         Modules in eval mode


Epoch 79: 100%|██████████| 90/90 [00:00<00:00, 167.48it/s, v_num=12, train_loss_step=0.307, val_loss=0.376, val_acc=0.881, val_fnr=0.205, train_loss_epoch=0.339]

`Trainer.fit` stopped: `max_epochs=80` reached.


Epoch 79: 100%|██████████| 90/90 [00:00<00:00, 165.20it/s, v_num=12, train_loss_step=0.307, val_loss=0.376, val_acc=0.881, val_fnr=0.205, train_loss_epoch=0.339]
