### Imports

In [113]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_curve, roc_auc_score
from sklearn.preprocessing import StandardScaler

In [None]:
# Load data and preprocess
data = pd.read_csv("Data/Dev_data_to_be_shared.csv")
features = data.drop(['account_number', 'bad_flag'], axis=1)

# Handle missing values and normalize data
scaler = StandardScaler()
for column in features.columns:
    features[column] = features[column].fillna(features[column].mean() if features[column].dtype.kind in 'fc' else features[column].mode()[0])

# Scale the features and save the scaler for later use
X = scaler.fit_transform(features.values)
y = data['bad_flag'].values

# Split and convert to tensors
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


### Model Definition

In [116]:
class CreditDefaultNN(nn.Module):
    def __init__(self, input_dim):
        super(CreditDefaultNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 32)
        self.fc4 = nn.Linear(32, 1)
        self.dropout = nn.Dropout(0.3)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.relu(self.fc3(x))
        x = self.dropout(x)
        x = self.fc4(x)  # Remove sigmoid here since BCEWithLogitsLoss includes it
        return x
    
    def train_model(self, device, train_loader, criterion, optimizer, epochs=20):
        self.train()
        for epoch in range(1, epochs + 1):
            for batch_idx, (X_batch, y_batch) in enumerate(train_loader):
                X_batch, y_batch = X_batch.to(self.device), y_batch.to(self.device)
                
                optimizer.zero_grad()
                y_pred = self(X_batch)
                loss = criterion(y_pred, y_batch.unsqueeze(1).float())
                
                loss.backward()
                optimizer.step()
                
                if batch_idx % 100 == 0:
                    print(f'Train Epoch: {epoch} [{batch_idx * len(X_batch)}/{len(train_loader.dataset)}] Loss: {loss.item():.6f}')
            
            # Test after each epoch
            self.eval()
            test_loss = 0
            correct = 0
            with torch.no_grad():
                X_test_tensor = torch.FloatTensor(X_test).to(self.device)
                y_test_tensor = torch.FloatTensor(y_test).to(self.device)
                outputs = self(X_test_tensor)
                test_loss += criterion(outputs, y_test_tensor.unsqueeze(1)).item()
                pred = (torch.sigmoid(outputs) >= 0.5).float()
                correct += pred.eq(y_test_tensor.unsqueeze(1)).sum().item()
            
            test_loss /= len(X_test)
            accuracy = 100. * correct / len(X_test)
            print(f'Test set: Average loss: {test_loss:.4f}, Accuracy: {correct}/{len(X_test)} ({accuracy:.2f}%)')
            
            self.train()

    def predict(self, X):
        self.eval()
        with torch.no_grad():
            # If X is already a tensor, just move it to the right device
            if isinstance(X, torch.Tensor):
                X = X.to(next(self.parameters()).device)
            else:
                # If X is not a tensor, convert it and move to right device
                X = torch.tensor(X, dtype=torch.float32).to(next(self.parameters()).device)
            
            outputs = self(X)
            # Apply sigmoid and move to CPU before converting to numpy
            return torch.sigmoid(outputs).cpu().squeeze().numpy()

    def get_probabilities(self, X):
        self.eval()
        with torch.no_grad():
            # Move input tensor to same device as model
            X = torch.tensor(X, dtype=torch.float32).to(next(self.parameters()).device)
            logits = self(X)
            # Move back to CPU for numpy conversion
            return torch.sigmoid(logits).cpu().numpy()

    def get_probabilities(model, X_test, device):
        model.eval()
        with torch.no_grad():
            X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)
            logits = model(X_test_tensor)
            probabilities = torch.sigmoid(logits)
        return probabilities.cpu().numpy()

### Training the Model

In [117]:
# Initialize model and training components
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_dim = X_train.shape[1]
model = CreditDefaultNN(input_dim).to(device)
criterion = nn.BCEWithLogitsLoss()  # Changed from BCELoss to BCEWithLogitsLoss
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Ensure no NaN values in the tensors
X_train_tensor = torch.nan_to_num(X_train_tensor).to(device)
X_test_tensor = torch.nan_to_num(X_test_tensor).to(device)
y_train_tensor = torch.nan_to_num(y_train_tensor).to(device)
y_test_tensor = torch.nan_to_num(y_test_tensor).to(device)

# Training loop
epochs = 20
batch_size = 256
for epoch in range(epochs):
    model.train()
    for i in range(0, len(X_train_tensor), batch_size):
        batch_X = X_train_tensor[i:i+batch_size].to(device)
        batch_y = y_train_tensor[i:i+batch_size].view(-1, 1).to(device)
        
        optimizer.zero_grad()
        outputs = model(batch_X)  # Remove sigmoid here since BCEWithLogitsLoss includes it
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
    
    # Evaluate progress
    model.eval()
    with torch.no_grad():
        train_outputs = model(X_train_tensor)
        train_preds = torch.sigmoid(train_outputs).squeeze()
        train_loss = criterion(train_outputs, y_train_tensor.view(-1, 1))
        
        test_outputs = model(X_test_tensor)
        test_preds = torch.sigmoid(test_outputs).squeeze()
        auc = roc_auc_score(y_test, test_preds.cpu().numpy())
    print(f"Epoch {epoch+1}/{epochs}, Loss: {train_loss.item():.4f}, Test AUC: {auc:.4f}")

# Modify predict methods to apply sigmoid threshold
y_pred_proba = CreditDefaultNN.get_probabilities(model, X_test, device)
y_pred = (y_pred_proba >= 0.5).astype(int)


Epoch 1/20, Loss: 0.0636, Test AUC: 0.7660
Epoch 2/20, Loss: 0.0591, Test AUC: 0.7715
Epoch 3/20, Loss: 0.0566, Test AUC: 0.7670
Epoch 4/20, Loss: 0.0539, Test AUC: 0.7649
Epoch 5/20, Loss: 0.0516, Test AUC: 0.7659
Epoch 6/20, Loss: 0.0486, Test AUC: 0.7614
Epoch 7/20, Loss: 0.0462, Test AUC: 0.7601
Epoch 8/20, Loss: 0.0437, Test AUC: 0.7527
Epoch 9/20, Loss: 0.0413, Test AUC: 0.7445
Epoch 10/20, Loss: 0.0384, Test AUC: 0.7426
Epoch 11/20, Loss: 0.0360, Test AUC: 0.7390
Epoch 12/20, Loss: 0.0331, Test AUC: 0.7402
Epoch 13/20, Loss: 0.0320, Test AUC: 0.7325
Epoch 14/20, Loss: 0.0287, Test AUC: 0.7267
Epoch 15/20, Loss: 0.0279, Test AUC: 0.7236
Epoch 16/20, Loss: 0.0254, Test AUC: 0.7229
Epoch 17/20, Loss: 0.0242, Test AUC: 0.7172
Epoch 18/20, Loss: 0.0218, Test AUC: 0.7164
Epoch 19/20, Loss: 0.0215, Test AUC: 0.7130
Epoch 20/20, Loss: 0.0194, Test AUC: 0.7176


### Evaluate Validation Data and Save Predictions

In [119]:
# Load and preprocess validation data
val_data = pd.read_csv("Data/Dev_data_to_be_shared.csv")
account_numbers = val_data['account_number']
val_features = val_data.drop(['account_number', 'bad_flag'], axis=1)

# Handle missing values the same way as training data
for column in val_features.columns:
    val_features[column] = val_features[column].fillna(
        val_features[column].mean() if val_features[column].dtype.kind in 'fc' else val_features[column].mode()[0]
    )

# Scale validation data using the same scaler
X_val = scaler.transform(val_features.values)

# Verify no NaN values
X_val = np.nan_to_num(X_val, nan=0.0)

# Get predictions
model.eval()  # Ensure model is in evaluation mode
with torch.no_grad():
    predictions = model.predict(X_val)

# Ensure predictions are valid probabilities
predictions = np.clip(predictions, 0, 1)  # Clip values between 0 and 1
predictions = np.nan_to_num(predictions, nan=0.5)  # Replace any remaining NaNs with 0.5

# Save results
predictions_df = pd.DataFrame({
    'account_number': account_numbers,
    'default_probability': predictions
})
predictions_df.to_csv('Data/credit_default_predictions.csv', index=False)

print("\nVerification:")
print("Number of NaN predictions:", np.isnan(predictions).sum())
print("Min prediction value:", predictions.min())
print("Max prediction value:", predictions.max())
print("\nFirst few predictions:")
print(predictions_df.head())
print(f"\nPredictions saved to 'credit_default_predictions.csv'")
print(f"Total accounts evaluated: {len(predictions_df)}")


Verification:
Number of NaN predictions: 0
Min prediction value: 0.0
Max prediction value: 1.0

First few predictions:
   account_number  default_probability
0               1         4.240636e-12
1               2         6.612599e-23
2               3         2.224915e-03
3               4         6.843677e-10
4               5         1.543895e-06

Predictions saved to 'credit_default_predictions.csv'
Total accounts evaluated: 96806
