In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Load data
input_csv = 'path/to/your/data.csv' # Replace with your data path
df = pd.read_csv(input_csv)

continuous_features = ['LIMIT_BAL', 'AGE', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 
                       'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 
                       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']
categorical_features = ['SEX', 'EDUCATION', 'MARRIAGE', 'PAY_0', 'PAY_2', 
                        'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']
class_col = 'default.payment.next.month'

# Preprocess data
X_continuous = df[continuous_features].values
X_categorical = pd.get_dummies(df[categorical_features], drop_first=True).values
X = np.hstack([X_continuous, X_categorical])
y = df[class_col].values

# Standardize continuous features
scaler = StandardScaler()
X_continuous = scaler.fit_transform(X_continuous)

# Combine continuous and one-hot encoded categorical features
X = np.hstack([X_continuous, X_categorical])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert to PyTorch tensors and DataLoader
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Calculate class weights
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
class_weights = torch.tensor(class_weights, dtype=torch.float32)

# Define VAE with integrated classifier
class VAEClassifier(nn.Module):
    def __init__(self, input_dim, latent_dim, output_dim):
        super(VAEClassifier, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, latent_dim * 2)  # latent_dim * 2 for mean and log-variance
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 128),
            nn.ReLU(),
            nn.Linear(128, input_dim)
        )
        self.classifier = nn.Sequential(
            nn.Linear(latent_dim, 128),
            nn.ReLU(),
            nn.Linear(128, output_dim),
            nn.Sigmoid()
        )
        self.latent_dim = latent_dim

    def encode(self, x):
        h = self.encoder(x)
        mu, logvar = h.chunk(2, dim=-1)
        return mu, logvar

    def reparameterize(self, mu, logvar):
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std

    def decode(self, z):
        return self.decoder(z)

    def classify(self, z):
        return self.classifier(z)

    def forward(self, x):
        mu, logvar = self.encode(x)
        z = self.reparameterize(mu, logvar)
        return self.decode(z), self.classify(z), mu, logvar

def loss_function(recon_x, x, pred_y, y, mu, logvar, class_weights, alpha):
    BCE = nn.functional.binary_cross_entropy(pred_y, y, weight=class_weights[y.long()].view(-1, 1), reduction='sum')
    MSE = nn.functional.mse_loss(recon_x, x, reduction='sum')
    KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    return MSE + KLD + alpha * BCE

def train_vae_classifier(model, dataloader, optimizer, class_weights, alpha, epochs=50):
    model.train()
    for epoch in range(epochs):
        train_loss = 0
        for data, target in dataloader:
            optimizer.zero_grad()
            recon_batch, pred_y, mu, logvar = model(data)
            loss = loss_function(recon_batch, data, pred_y, target, mu, logvar, class_weights, alpha)
            loss.backward()
            train_loss += loss.item()
            optimizer.step()
        print(f'Epoch {epoch+1}, Loss: {train_loss / len(dataloader.dataset)}')

# Parameters for model
input_dim = X_train.shape[1]
latent_dim = 12  # You can change this value based on your experiment
output_dim = 1  # Binary classification
alpha = 4.0  # You can change this value based on your experiment

# Initialize and train the VAEClassifier
vae_classifier = VAEClassifier(input_dim, latent_dim, output_dim)
optimizer = optim.Adam(vae_classifier.parameters(), lr=1e-3)

train_vae_classifier(vae_classifier, train_dataloader, optimizer, class_weights, alpha)


# Predict and evaluate on the test set
with torch.no_grad():
    mu_test, logvar_test = vae_classifier.encode(X_test_tensor)
    latent_test = vae_classifier.reparameterize(mu_test, logvar_test)
    pred_y_test = vae_classifier.classify(latent_test).numpy().squeeze()

# Evaluate the model
y_pred = (pred_y_test > 0.5).astype(int)
auc = roc_auc_score(y_test, pred_y_test)

print(f'VAE Classifier AUC on Test Set: {auc}')

# Calculate confusion matrix
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
type_II_error = fn / (fn + tp)

print(f'Type II Error (False Negative Rate): {type_II_error}')



Epoch 1, Loss: 18.128470605214435
Epoch 2, Loss: 14.313530550638834
Epoch 3, Loss: 13.486757625579834
Epoch 4, Loss: 13.083155586242675
Epoch 5, Loss: 12.778535640716553
Epoch 6, Loss: 12.588304039001464
Epoch 7, Loss: 12.473267387390136
Epoch 8, Loss: 12.390573603312175
Epoch 9, Loss: 12.215712656656901
Epoch 10, Loss: 12.14114464823405
Epoch 11, Loss: 12.212037745157877
Epoch 12, Loss: 11.997419469197592
Epoch 13, Loss: 12.093397855122884
Epoch 14, Loss: 12.060405234018962
Epoch 15, Loss: 12.02161243311564
Epoch 16, Loss: 12.037288511912028
Epoch 17, Loss: 11.926370989481608
Epoch 18, Loss: 11.901849361419679
Epoch 19, Loss: 11.89214650217692
Epoch 20, Loss: 11.926893332163493
Epoch 21, Loss: 11.818083700815837
Epoch 22, Loss: 11.846307713826498
Epoch 23, Loss: 11.796888298034668
Epoch 24, Loss: 11.833888449350994
Epoch 25, Loss: 11.934447603861491
Epoch 26, Loss: 11.77062213007609
Epoch 27, Loss: 11.748546326955159
Epoch 28, Loss: 11.732370301564535
Epoch 29, Loss: 11.86362199529012