### Libraries

In [1]:
# Python default library
import random

# Libraries for data manipulation
import pandas as pd
import numpy as np

# Sklearn libraries
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix

# Torch libraries
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset, Subset
from torch.optim import Adam

# Libraries for visualizations
import seaborn as sns
import matplotlib.pyplot as plt

# To make the results are reproducible
torch.manual_seed(42)
random.seed(42)
np.random.seed(42)

# Check if GPU is available and set the device accordingly
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f'Using device: {device}')


Using device: cuda:0


### Preparing the data

In [10]:
# Load datasets
df_train_val = pd.read_csv('Csv/third.csv')
df_test = pd.read_csv('Csv/third_test.csv')

# Drop the first column by index
df_train_val.drop(df_train_val.columns[0], axis=1, inplace=True)
df_test.drop(df_test.columns[0], axis=1, inplace=True)

# Split features and target
X_train_val = df_train_val.iloc[:, :-1].values
y_train_val = df_train_val.iloc[:, -1].values
X_test = df_test.iloc[:, :-1].values
y_test = df_test.iloc[:, -1].values

# Define Stratified K-Folds cross-validator
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

### Define TabNet model

In [4]:
class TabNet(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(TabNet, self).__init__()
        self.bn0 = nn.BatchNorm1d(num_features=input_dim)
        self.fc1 = nn.Linear(input_dim, 64)
        self.bn1 = nn.BatchNorm1d(num_features=64)
        self.fc2 = nn.Linear(64, output_dim)

    def forward(self, x):
        x = self.bn0(x)
        x = F.relu(self.fc1(x))
        x = self.bn1(x)
        x = F.relu(self.fc2(x))
        return x

### Define taining and evaluation function

In [47]:
def train_and_evaluate_model(model, train_loader, val_loader, criterion, optimizer, device, epochs=600):
    model.to(device)  # Move model to the appropriate device
    train_loss_list, val_loss_list = [], []
    train_accuracy_list, val_accuracy_list = [], []
    
    for epoch in range(epochs):
        # Training step
        model.train()
        total_loss, total_correct = 0, 0
        for data, target in train_loader:
            # Move data to the device
            data, target = data.to(device), target.to(device) 
            
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            _, preds = torch.max(output, 1)
            total_correct += torch.sum(preds == target).item()

        train_loss = total_loss / len(train_loader.dataset)
        train_accuracy = total_correct / len(train_loader.dataset)
        
        # Evaluation step
        model.eval()
        total_loss, total_correct = 0, 0
        with torch.no_grad():
            for data, target in val_loader:
                data, target = data.to(device), target.to(device)
                output = model(data)
                loss = criterion(output, target)
                total_loss += loss.item()
                _, preds = torch.max(output, 1)
                total_correct += torch.sum(preds == target).item()
                
        val_loss = total_loss / len(val_loader.dataset)
        val_accuracy = total_correct / len(val_loader.dataset)
        
        # Append current epoch metrics to lists
        train_loss_list.append(train_loss)
        val_loss_list.append(val_loss)
        train_accuracy_list.append(train_accuracy)
        val_accuracy_list.append(val_accuracy)
        
        # Optional: Print epoch results here or implement early stopping

    return {
        'train_loss': train_loss_list,
        'val_loss': val_loss_list,
        'train_accuracy': train_accuracy_list,
        'val_accuracy': val_accuracy_list
    }

### Stratified K-fold validation

In [48]:
# Placeholder for fold performance
# Initialize lists to store the aggregated results across folds
aggregated_train_losses, aggregated_val_losses = [], []
aggregated_train_accuracies, aggregated_val_accuracies = [], []

for fold, (train_index, val_index) in enumerate(skf.split(X_train_val, y_train_val)):
    print(f'FOLD {fold}')
    print('--------------------------------')
    
    # Split data into training and validation folds
    X_train, X_val = X_train_val[train_index], X_train_val[val_index]
    y_train, y_val = y_train_val[train_index], y_train_val[val_index]
    
    # Normalize features (fit on training data, transform both training and validation data)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)
    
    # Convert to PyTorch tensors and create DataLoaders
    train_dataset = TensorDataset(torch.tensor(X_train).float(), torch.tensor(y_train).long())
    val_dataset = TensorDataset(torch.tensor(X_val).float(), torch.tensor(y_val).long())
    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)
    
    # Model initialization (re-initialize  model here inside the loop)
    model = TabNet(input_dim=408, output_dim=len(np.unique(y_train_val))).to(device)
    optimizer = Adam(model.parameters(), lr=0.002)
    criterion = nn.CrossEntropyLoss()
    
    # Train and evaluate the model for this fold
    metrics = train_and_evaluate_model(model, train_loader, val_loader, criterion, optimizer, device)
    
    # Print results for the current fold
    print(f"Results for fold {fold}:")
    print(f"Training Loss: {np.mean(metrics['train_loss'])}, Validation Loss: {np.mean(metrics['val_loss'])}")
    print(f"Training Accuracy: {np.mean(metrics['train_accuracy'])}, Validation Accuracy: {np.mean(metrics['val_accuracy'])}\n")

    # Append fold results to aggregated lists
    aggregated_train_losses.append(np.mean(metrics['train_loss']))
    aggregated_val_losses.append(np.mean(metrics['val_loss']))
    aggregated_train_accuracies.append(np.mean(metrics['train_accuracy']))
    aggregated_val_accuracies.append(np.mean(metrics['val_accuracy']))



FOLD 0
--------------------------------
Results for fold 0:
Training Loss: 0.01670611488435379, Validation Loss: 0.031032352759083382
Training Accuracy: 0.5960845563929656, Validation Accuracy: 0.5713240349021682

FOLD 1
--------------------------------
Results for fold 1:
Training Loss: 0.016691288179363474, Validation Loss: 0.018216422121776297
Training Accuracy: 0.5945267136041604, Validation Accuracy: 0.5769024637489533

FOLD 2
--------------------------------
Results for fold 2:
Training Loss: 0.016661496762194538, Validation Loss: 0.01803798614976555
Training Accuracy: 0.5965111999911854, Validation Accuracy: 0.5729157300894707

FOLD 3
--------------------------------
Results for fold 3:
Training Loss: 0.016653761398575676, Validation Loss: 0.046304941444224985
Training Accuracy: 0.5967142101609757, Validation Accuracy: 0.5705791352637842

FOLD 4
--------------------------------
Results for fold 4:
Training Loss: 0.016692202946201107, Validation Loss: 0.018623637561679562
Trainin

### Results

In [49]:
# Calculate and print the average results across all folds
avg_train_loss = np.mean(aggregated_train_losses)
avg_val_loss = np.mean(aggregated_val_losses)
avg_train_accuracy = np.mean(aggregated_train_accuracies)
avg_val_accuracy = np.mean(aggregated_val_accuracies)

print('Average results across all folds:')
print(f'Average Training Loss: {avg_train_loss}')
print(f'Average Validation Loss: {avg_val_loss}')
print(f'Average Training Accuracy: {avg_train_accuracy}')
print(f'Average Validation Accuracy: {avg_val_accuracy}')

Average results across all folds:
Average Training Loss: 0.016680972834137716
Average Validation Loss: 0.026443068007305955
Average Training Accuracy: 0.5957280561788234
Average Validation Accuracy: 0.5733218390223923
