In [23]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt
from itertools import product
import json
from datetime import datetime
import os
from torch.optim import lr_scheduler

class StudentDataset(Dataset):
    """In-memory dataset pairing a feature matrix with per-sample targets.

    Both inputs are converted to ``float32`` tensors once, at construction
    time, so ``__getitem__`` is a plain indexing operation.

    Args:
        X: Array-like of features, indexed by sample along the first axis.
        y: Array-like of targets with one entry per sample in ``X``.

    Raises:
        ValueError: If ``X`` and ``y`` hold a different number of samples.
    """

    def __init__(self, X, y):
        # ``as_tensor`` replaces the legacy typed constructor and states the
        # target dtype explicitly.
        self.X = torch.as_tensor(X, dtype=torch.float32)
        self.y = torch.as_tensor(y, dtype=torch.float32)

        # Fail here rather than with an index error in the middle of an epoch.
        if len(self.X) != len(self.y):
            raise ValueError(
                f"X and y must have the same number of samples, "
                f"got {len(self.X)} and {len(self.y)}"
            )

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

class TabR(nn.Module):
    """MLP binary classifier with auxiliary input-reconstruction heads.

    The input is projected to ``hidden_size`` and passed through
    ``num_hidden`` Linear -> ReLU -> Dropout stages.  Before each stage, a
    linear head maps the current activations back to ``input_size`` and the
    mean-squared error against the raw input is accumulated.

    Args:
        input_size: Number of input features per sample.
        hidden_size: Width of the hidden representation.
        num_hidden: Number of hidden stages (and reconstruction heads).
        dropout: Dropout probability used in the stages and the output head.
    """

    def __init__(self, input_size, hidden_size=64, num_hidden=3, dropout=0.1):
        super().__init__()
        self.num_hidden = num_hidden

        self.input_proj = nn.Linear(input_size, hidden_size)

        stages = [nn.Linear(hidden_size, hidden_size) for _ in range(num_hidden)]
        self.hidden_layers = nn.ModuleList(stages)

        decoders = [nn.Linear(hidden_size, input_size) for _ in range(num_hidden)]
        self.replicate = nn.ModuleList(decoders)

        half_width = hidden_size // 2
        self.output = nn.Sequential(
            nn.Linear(hidden_size, half_width),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(half_width, 1),
            nn.Sigmoid(),
        )
        self.dropout = nn.Dropout(dropout)
        self.activation = nn.ReLU()

    def forward(self, x):
        """Return ``(probabilities, replication_loss)`` for a batch ``x``.

        ``probabilities`` is the sigmoid output of the head (last dimension
        of size 1).  ``replication_loss`` is the sum over stages of the MSE
        between each reconstruction and ``x``; it stays the integer ``0``
        when ``num_hidden`` is 0.
        """
        hidden = self.activation(self.input_proj(x))
        replication_loss = 0
        for depth in range(self.num_hidden):
            # Reconstruct the input from the activations entering this stage.
            reconstruction = self.replicate[depth](hidden)
            replication_loss = replication_loss + nn.functional.mse_loss(reconstruction, x)
            hidden = self.dropout(self.activation(self.hidden_layers[depth](hidden)))
        return self.output(hidden), replication_loss

class FTTransformer(nn.Module):
    """Transformer-encoder binary classifier over a single embedded token.

    The whole feature vector is linearly embedded into one ``d_model``-wide
    token, run through a ``nn.TransformerEncoder`` as a length-1 sequence,
    and classified by a small sigmoid head.

    Args:
        input_size: Number of input features per sample.
        d_model: Embedding width used throughout the encoder.
        nhead: Number of attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
        dropout: Dropout probability for the encoder and the head.
    """

    def __init__(self, input_size, d_model=128, nhead=8, num_layers=3, dropout=0.1):
        super().__init__()
        self.feature_embeddings = nn.Linear(input_size, d_model)

        layer_options = dict(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=d_model * 4,
            dropout=dropout,
            batch_first=True,
        )
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(**layer_options),
            num_layers=num_layers,
        )

        head_width = d_model // 2
        self.fc = nn.Sequential(
            nn.Linear(d_model, head_width),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(head_width, 1),
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Return sigmoid probabilities (last dimension of size 1) for ``x``."""
        # Insert a sequence axis of length 1 so the encoder sees one token per row.
        token = self.feature_embeddings(x).unsqueeze(1)
        encoded = self.transformer(token)
        return self.fc(encoded.squeeze(1))

class SAINT(nn.Module):
    """Attention-based binary classifier with a learned positional table.

    Each row is embedded into a single ``d_model`` vector; adding the
    ``(1, input_size, d_model)`` positional parameter broadcasts it into a
    sequence of ``input_size`` tokens.  A multi-head attention block with a
    residual connection and a stack of encoder layers process that sequence,
    which is then mean-pooled and classified by a sigmoid head.

    NOTE(review): ``intersample_attention`` is applied with
    ``batch_first=True`` to the per-row sequence built above, so as written
    it attends along the sequence axis rather than across the samples of a
    batch -- confirm this matches the intended design.

    Args:
        input_size: Number of input features per sample.
        d_model: Embedding width.
        nhead: Number of attention heads.
        num_layers: Number of encoder layers after the attention block.
        dropout: Dropout probability for attention, encoder layers and head.
    """

    def __init__(self, input_size, d_model=128, nhead=8, num_layers=3, dropout=0.1):
        super().__init__()
        self.feature_embeddings = nn.Linear(input_size, d_model)
        self.pos_embedding = nn.Parameter(torch.randn(1, input_size, d_model))

        self.intersample_attention = nn.MultiheadAttention(
            embed_dim=d_model,
            num_heads=nhead,
            dropout=dropout,
            batch_first=True,
        )

        encoder_stack = []
        for _ in range(num_layers):
            encoder_stack.append(
                nn.TransformerEncoderLayer(
                    d_model=d_model,
                    nhead=nhead,
                    dim_feedforward=d_model * 4,
                    dropout=dropout,
                    batch_first=True,
                )
            )
        self.transformer_layers = nn.ModuleList(encoder_stack)

        head_width = d_model // 2
        self.fc = nn.Sequential(
            nn.Linear(d_model, head_width),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(head_width, 1),
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Return sigmoid probabilities (last dimension of size 1) for ``x``."""
        # The length-1 token axis broadcasts against the positional table.
        tokens = self.feature_embeddings(x).unsqueeze(1) + self.pos_embedding

        # Residual self-attention; the attention weights are not needed.
        attended, _ = self.intersample_attention(tokens, tokens, tokens)
        tokens = tokens + attended

        for encoder in self.transformer_layers:
            tokens = encoder(tokens)

        pooled = tokens.mean(dim=1)
        return self.fc(pooled)

class TabNet(nn.Module):
    """Simplified multi-step classifier with sequential feature masking.

    Each step applies Linear -> BatchNorm -> ReLU -> Dropout to a masked copy
    of the input and contributes the first ``n_d`` channels of the result to
    the final representation.  A sigmoid attention head turns each step's
    output into a per-feature mask that gates the raw input for the *next*
    step.  The concatenated step outputs feed a small sigmoid head.

    Previously the mask was computed but never used, so the attention heads
    had no effect on the prediction and received no gradient.

    The attention head of the final step has no following step to gate; it is
    kept in ``self.attention`` so state dicts saved by earlier versions still
    load, but it is not evaluated.

    Args:
        input_size: Number of input features per sample.
        n_d: Channels each step contributes to the final representation.
        n_steps: Number of sequential steps.
        dropout: Dropout probability for the steps and the output head.
    """

    def __init__(self, input_size, n_d=64, n_steps=3, dropout=0.1):
        super().__init__()
        self.n_d = n_d
        self.n_steps = n_steps

        self.feature_transforms = nn.ModuleList([
            nn.Sequential(
                nn.Linear(input_size, 2 * n_d),
                nn.BatchNorm1d(2 * n_d),
                nn.ReLU(),
                nn.Dropout(dropout)
            ) for _ in range(n_steps)
        ])

        self.attention = nn.ModuleList([
            nn.Sequential(
                nn.Linear(2 * n_d, input_size),
                nn.Sigmoid()
            ) for _ in range(n_steps)
        ])

        self.output = nn.Sequential(
            nn.Linear(n_d * n_steps, n_d),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(n_d, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        """Return sigmoid probabilities (last dimension of size 1) for ``x``."""
        features = []
        step_input = x  # the first step sees the unmasked features
        final_step = self.n_steps - 1

        for step in range(self.n_steps):
            transformed = self.feature_transforms[step](step_input)
            features.append(transformed[:, :self.n_d])

            # Gate the raw input for the following step with this step's mask.
            if step < final_step:
                mask = self.attention[step](transformed)
                step_input = x * mask

        combined_features = torch.cat(features, dim=1)
        return self.output(combined_features)

def _batch_probabilities(model, batch_X):
    """Run ``model`` on one batch and return ``(flat_outputs, auxiliary_loss)``.

    ``TabR`` returns an extra replication loss; every other model returns only
    its outputs, in which case ``auxiliary_loss`` is ``None``.
    """
    if isinstance(model, TabR):
        outputs, auxiliary_loss = model(batch_X)
    else:
        outputs, auxiliary_loss = model(batch_X), None
    # ``reshape(-1)`` keeps a 1-D tensor even for a batch of one, where a bare
    # ``squeeze()`` would produce a 0-d tensor that cannot be matched against
    # the targets or iterated over.
    return outputs.reshape(-1), auxiliary_loss


def train_model(model, train_loader, test_loader, criterion, optimizer, scheduler, 
                epochs, device, model_name, save_dir, early_stopping_patience=10):
    """Train ``model``, evaluating on ``test_loader`` after every epoch.

    Whenever the balanced accuracy on ``test_loader`` improves, the model
    state is saved to ``{save_dir}/{model_name}_best.pth`` and a confusion
    matrix plot is written next to it.  Training stops early after
    ``early_stopping_patience`` consecutive epochs without improvement.

    NOTE(review): ``test_loader`` drives the scheduler, checkpoint selection
    and early stopping, so the reported best accuracy is selected on the same
    data it is measured on.

    Args:
        model: Module producing per-sample probabilities (``TabR`` also
            returns a replication loss, weighted by 0.1 in the objective).
        train_loader: Iterable of ``(features, targets)`` training batches.
        test_loader: Iterable of ``(features, targets)`` evaluation batches.
        criterion: Loss called as ``criterion(outputs, targets)``.
        optimizer: Optimizer over the model parameters.
        scheduler: Learning-rate scheduler or ``None``.  A
            ``ReduceLROnPlateau`` is stepped with the test accuracy; any
            other scheduler is stepped without arguments.
        epochs: Maximum number of epochs.
        device: Device the model and batches are moved to.
        model_name: Label used in log output and output file names.
        save_dir: Directory for the checkpoint and confusion matrix.
        early_stopping_patience: Epochs without improvement before stopping.

    Returns:
        dict with per-epoch lists ``train_loss``, ``train_acc``, ``test_acc``
        and the scalar ``best_acc``.
    """
    model = model.to(device)
    best_test_acc = 0
    no_improve = 0
    history = {'train_loss': [], 'train_acc': [], 'test_acc': [], 'best_acc': 0}
    
    for epoch in range(epochs):
        # Training
        model.train()
        train_loss = 0
        train_preds, train_true = [], []
        
        for batch_X, batch_y in train_loader:
            batch_X = batch_X.to(device)
            batch_y = batch_y.to(device).reshape(-1)
            optimizer.zero_grad()
            
            outputs, replication_loss = _batch_probabilities(model, batch_X)
            loss = criterion(outputs, batch_y)
            if replication_loss is not None:
                loss = loss + 0.1 * replication_loss
            
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            
            predicted = (outputs > 0.5).float()
            train_preds.extend(predicted.cpu().numpy())
            train_true.extend(batch_y.cpu().numpy())
        
        # Evaluation
        model.eval()
        test_preds, test_true = [], []
        
        with torch.no_grad():
            for batch_X, batch_y in test_loader:
                batch_X = batch_X.to(device)
                batch_y = batch_y.to(device).reshape(-1)
                
                outputs, _ = _batch_probabilities(model, batch_X)
                predicted = (outputs > 0.5).float()
                test_preds.extend(predicted.cpu().numpy())
                test_true.extend(batch_y.cpu().numpy())
        
        train_acc = balanced_accuracy_score(train_true, train_preds)
        test_acc = balanced_accuracy_score(test_true, test_preds)
        mean_train_loss = train_loss / len(train_loader)
        
        history['train_loss'].append(mean_train_loss)
        history['train_acc'].append(train_acc)
        history['test_acc'].append(test_acc)
        
        if scheduler is not None:
            # Only ReduceLROnPlateau accepts a metric; others take no argument.
            if isinstance(scheduler, lr_scheduler.ReduceLROnPlateau):
                scheduler.step(test_acc)
            else:
                scheduler.step()
        
        if test_acc > best_test_acc:
            best_test_acc = test_acc
            history['best_acc'] = test_acc
            no_improve = 0
            
            # Save best model and create confusion matrix
            os.makedirs(save_dir, exist_ok=True)
            
            torch.save({
                'model_state_dict': model.state_dict(),
                'test_acc': test_acc,
                'epoch': epoch,
            }, f'{save_dir}/{model_name}_best.pth')
            
            cm = confusion_matrix(test_true, test_preds)
            plt.figure(figsize=(10,8))
            sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
            plt.title(f'{model_name} Confusion Matrix (Best Model)')
            plt.savefig(f'{save_dir}/{model_name}_confusion_matrix.png')
            plt.close()
            
        else:
            no_improve += 1
        
        if epoch % 5 == 0:
            print(f'\n{model_name} - Epoch {epoch}:')
            print(f'Train Loss: {mean_train_loss:.4f}')
            print(f'Train Acc: {100 * train_acc:.2f}%')
            print(f'Test Acc: {100 * test_acc:.2f}%')
            print(f'Best Test Acc: {100 * best_test_acc:.2f}%')
            
            # Print classification report
            print('\nClassification Report:')
            print(classification_report(test_true, test_preds))
        
        if no_improve >= early_stopping_patience:
            print(f'Early stopping at epoch {epoch}')
            break
    
    return history

def run_experiments(
    data_path=r'C:\Users\andre\OneDrive\Desktop\CS485-Final-Project\dataset\dataset_pruned.csv',
    save_dir='tabular_models_results',
):
    """Train and compare all tabular models on one dataset.

    Loads the CSV, splits it 80/20, standardises the features, trains
    ``TabR``, ``FTTransformer``, ``SAINT`` and ``TabNet`` with identical
    optimisation settings, and writes checkpoints, plots, ``results.json``,
    ``training_params.json`` and ``summary.txt`` into ``save_dir``.

    The model hyper-parameters are defined once and used both to build the
    models and to fill ``training_params.json``, so the recorded
    configuration always matches what was trained.

    Args:
        data_path: CSV file containing the features and a ``Target_encoded``
            column.  Defaults to the original author's local path.
        save_dir: Output directory; created if missing.

    Returns:
        Tuple ``(results, params_used)``: ``results`` maps each model name to
        the history returned by ``train_model``; ``params_used`` is the dict
        written to ``training_params.json``.
    """
    # Create save directory
    os.makedirs(save_dir, exist_ok=True)
    
    # Set device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")
    
    # Load and preprocess data
    df = pd.read_csv(data_path)
    X = df.drop('Target_encoded', axis=1)
    y = df['Target_encoded']
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Fit the scaler on the training split only to avoid leaking test statistics.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    # Create datasets
    train_dataset = StudentDataset(X_train, y_train.values)
    test_dataset = StudentDataset(X_test, y_test.values)
    
    # Training parameters (single source for both training and the saved record)
    params = {
        'batch_size': 64,
        'epochs': 100,
        'learning_rate': 0.001,
        'early_stopping_patience': 10,
    }
    scheduler_params = {'mode': 'max', 'patience': 5, 'factor': 0.5}
    
    # Create dataloaders
    train_loader = DataLoader(train_dataset, batch_size=params['batch_size'], shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=params['batch_size'])
    
    # Models to test.  The values equal the constructor defaults that were
    # used before; they are spelled out so the saved record is accurate.
    model_classes = {
        'TabR': TabR,
        'FTTransformer': FTTransformer,
        'SAINT': SAINT,
        'TabNet': TabNet,
    }
    model_configurations = {
        'TabR': {'hidden_size': 64, 'num_hidden': 3, 'dropout': 0.1},
        'FTTransformer': {'d_model': 128, 'nhead': 8, 'num_layers': 3, 'dropout': 0.1},
        'SAINT': {'d_model': 128, 'nhead': 8, 'num_layers': 3, 'dropout': 0.1},
        'TabNet': {'n_d': 64, 'n_steps': 3, 'dropout': 0.1},
    }
    input_size = X_train.shape[1]
    models = {
        name: model_classes[name](input_size=input_size, **config)
        for name, config in model_configurations.items()
    }
    
    # Results storage
    results = {}
    
    # Train each model
    for model_name, model in models.items():
        print(f"\n{'='*50}")
        print(f"Training {model_name}")
        print(f"{'='*50}")
        
        criterion = nn.BCELoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=params['learning_rate'])
        scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, **scheduler_params)
        
        results[model_name] = train_model(
            model=model,
            train_loader=train_loader,
            test_loader=test_loader,
            criterion=criterion,
            optimizer=optimizer,
            scheduler=scheduler,
            epochs=params['epochs'],
            device=device,
            model_name=model_name,
            save_dir=save_dir,
            early_stopping_patience=params['early_stopping_patience'],
        )
    
    # Print final comparative results
    print(f"\n{'='*50}")
    print("FINAL RESULTS COMPARISON")
    print(f"{'='*50}")
    
    for model_name, history in results.items():
        print(f"\n{model_name}:")
        print(f"Best Test Accuracy: {history['best_acc']*100:.2f}%")
        print(f"Final Train Accuracy: {history['train_acc'][-1]*100:.2f}%")
        print(f"Final Test Accuracy: {history['test_acc'][-1]*100:.2f}%")
    
    # Plot comparative results
    plt.figure(figsize=(15, 5))
    
    plt.subplot(1, 2, 1)
    for model_name, history in results.items():
        plt.plot(history['test_acc'], label=f'{model_name}')
    plt.title('Test Accuracy Comparison')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()
    
    plt.subplot(1, 2, 2)
    for model_name, history in results.items():
        plt.plot(history['train_loss'], label=f'{model_name}')
    plt.title('Training Loss Comparison')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    
    plt.tight_layout()
    plt.savefig(f'{save_dir}/model_comparison.png')
    plt.close()
    
    # Save results to JSON (cast to plain floats so numpy scalars serialise)
    results_json = {
        model_name: {
            'best_accuracy': float(history['best_acc']),
            'final_train_accuracy': float(history['train_acc'][-1]),
            'final_test_accuracy': float(history['test_acc'][-1]),
            'training_history': {
                'train_loss': [float(x) for x in history['train_loss']],
                'train_acc': [float(x) for x in history['train_acc']],
                'test_acc': [float(x) for x in history['test_acc']],
            },
        }
        for model_name, history in results.items()
    }
    
    with open(f'{save_dir}/results.json', 'w', encoding='utf-8') as f:
        json.dump(results_json, f, indent=4)
    
    # Save parameters used
    params_used = {
        'batch_size': params['batch_size'],
        'epochs': params['epochs'],
        'learning_rate': params['learning_rate'],
        'early_stopping_patience': params['early_stopping_patience'],
        'model_configurations': model_configurations,
        'optimizer': 'Adam',
        'scheduler': 'ReduceLROnPlateau',
        'scheduler_params': scheduler_params,
        'loss_function': 'BCELoss'
    }
    
    with open(f'{save_dir}/training_params.json', 'w', encoding='utf-8') as f:
        json.dump(params_used, f, indent=4)
    
    # Create a summary text file
    with open(f'{save_dir}/summary.txt', 'w', encoding='utf-8') as f:
        f.write("=== Model Performance Summary ===\n\n")
        for model_name, history in results.items():
            f.write(f"\n{model_name}:\n")
            f.write(f"Best Test Accuracy: {history['best_acc']*100:.2f}%\n")
            f.write(f"Final Train Accuracy: {history['train_acc'][-1]*100:.2f}%\n")
            f.write(f"Final Test Accuracy: {history['test_acc'][-1]*100:.2f}%\n")
            f.write("-" * 40 + "\n")
    
    print("\nResults saved successfully:")
    print(f"- Model checkpoints: {save_dir}/*_best.pth")
    print(f"- Confusion matrices: {save_dir}/*_confusion_matrix.png")
    print(f"- Results comparison: {save_dir}/model_comparison.png")
    print(f"- Detailed results: {save_dir}/results.json")
    print(f"- Training parameters: {save_dir}/training_params.json")
    print(f"- Summary: {save_dir}/summary.txt")
    
    return results, params_used

if __name__ == "__main__":
    # Script entry point: run every experiment, then tell the user where the
    # artefacts were written.
    print("\nStarting experiments with multiple tabular models...")
    results, params = run_experiments()
    for closing_line in (
        "\nExperiments completed. Results saved in 'tabular_models_results' directory.",
        "Check the results.json and training_params.json files for detailed information.",
    ):
        print(closing_line)


Starting experiments with multiple tabular models...
Using device: cpu

Training TabR

TabR - Epoch 0:
Train Loss: 0.7757
Train Acc: 69.47%
Test Acc: 88.43%
Best Test Acc: 88.43%

Classification Report:
              precision    recall  f1-score   support

         0.0       0.89      0.96      0.92       449
         1.0       0.93      0.81      0.86       277

    accuracy                           0.90       726
   macro avg       0.91      0.88      0.89       726
weighted avg       0.90      0.90      0.90       726


TabR - Epoch 5:
Train Loss: 0.2872
Train Acc: 90.67%
Test Acc: 89.64%
Best Test Acc: 90.27%

Classification Report:
              precision    recall  f1-score   support

         0.0       0.90      0.97      0.93       449
         1.0       0.95      0.82      0.88       277

    accuracy                           0.91       726
   macro avg       0.92      0.90      0.91       726
weighted avg       0.92      0.91      0.91       726


TabR - Epoch 10:
Train L