In [1]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
from rff.layers import GaussianEncoding #pip install random-fourier-features-pytorch
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
import os
import optuna
from optuna.trial import TrialState
from tab_transformer_pytorch import TabTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Run regardless if you do or do not have GPU so all tensors are moved to right location later on
if torch.cuda.is_available():
    device_in_use = torch.device("cuda")
    print("GPU is available and being used")
else:
    device_in_use = torch.device("cpu")
    print("GPU is not available, using CPU instead")

GPU is available and being used


# LOAD AND PROCESS DATA
**EXAMPLE WITH California Sklearn DATASET**

In [13]:
# df_train = pd.read_csv('/home/cscadmin/CyberResearch/CAT-Transformer/datasets/california/train.csv')
# df_test = pd.read_csv('/home/cscadmin/CyberResearch/CAT-Transformer/datasets/california/test.csv')
# df_val = pd.read_csv('/home/cscadmin/CyberResearch/CAT-Transformer/datasets/california/validation.csv') #READ FROM RIGHT SPOT

df_train = pd.read_csv(r'C:\Users\smbm2\projects\CAT-Transformer\datasets\california\train.csv')
df_test = pd.read_csv(r'C:\Users\smbm2\projects\CAT-Transformer\datasets\california\test.csv')
df_val = pd.read_csv(r'C:\Users\smbm2\projects\CAT-Transformer\datasets\california\validation.csv') #READ FROM RIGHT SPOT

#Take a look at what the datasets look like initially to get an idea
df_train

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,2.8631,20.0,4.401210,1.076613,999.0,2.014113,32.79,-117.09
1,4.2026,24.0,5.617544,0.989474,731.0,2.564912,34.59,-120.14
2,3.1094,14.0,5.869565,1.094203,302.0,2.188406,39.26,-121.00
3,3.3068,52.0,4.801205,1.066265,1526.0,2.298193,37.77,-122.45
4,4.0791,11.0,5.878902,1.098493,4773.0,2.568891,33.17,-117.33
...,...,...,...,...,...,...,...,...
14442,6.3700,35.0,6.129032,0.926267,658.0,3.032258,33.78,-117.96
14443,3.0500,33.0,6.868597,1.269488,1753.0,3.904232,34.02,-117.43
14444,2.9344,36.0,3.986717,1.079696,1756.0,3.332068,34.03,-118.38
14445,5.7192,15.0,6.395349,1.067979,1777.0,3.178891,37.58,-121.96


In [14]:
cont_columns = ['HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup',
       'Latitude', 'Longitude']
target = ['MedInc']

#CHECKING TO MAKE SURE YOUR LIST IS CORRECT (NO NEED TO TOUCH)
yourlist = cont_columns + target
yourlist.sort()
oglist = list(df_train.columns)
oglist.sort()

assert(yourlist == oglist), "You may of spelled feature name wrong or you forgot to put on of them in the list"

In [15]:
# Create a StandardScaler and fit it to the cont features
scaler = StandardScaler()
scaler.fit(df_train[cont_columns])

# Transform the training, test, and validation datasets
df_train[cont_columns] = scaler.transform(df_train[cont_columns])
df_test[cont_columns] = scaler.transform(df_test[cont_columns])
df_val[cont_columns] = scaler.transform(df_val[cont_columns])

In [16]:
class SingleTaskDataset(Dataset):
    def __init__(self, df : pd.DataFrame, num_columns,task1_column):
        self.n = df.shape[0]
        
        self.task1_labels = df[task1_column].astype(np.float32).values

        self.num = df[num_columns].astype(np.float32).values


    def __len__(self):
        return self.n
    
    def __getitem__(self, idx):
        # Retrieve features and labels from the dataframe using column names
        num_features = self.num[idx]
        labels_task1 = self.task1_labels[idx]

        return num_features, labels_task1

#Wrapping in Dataset
train_dataset = SingleTaskDataset(df_train, cont_columns, 'MedInc')
val_dataset = SingleTaskDataset(df_val, cont_columns, 'MedInc')
test_dataset = SingleTaskDataset(df_test, cont_columns, 'MedInc')

#This is a hyperparameter that is not tuned. Maybe mess with what makes sense here
#Also try looking to see what other papers have done
batch_size = 256

# Wrapping with DataLoader for easy batch extraction
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

In [17]:
# each task loss is scaled by its own learnable parameter, then regularization is applied 
class LossFunctions(nn.Module):
    def __init__(self, num_tasks):
        super(LossFunctions, self).__init__()
        self.num_tasks = num_tasks

        self.loss_fns = [nn.MSELoss() for x in range(num_tasks)] 

    def forward(self, predictions, labels_task1):

        #task 1
        target = labels_task1
        prediction = predictions
        loss_fn = self.loss_fns[0]
        task_loss = loss_fn(prediction, target)
        
        return task_loss
    
# Training and Testing Loops
def train(dataloader, model, loss_function, optimizer, device_in_use):
    model.train()

    total_loss = 0
    total_r2_score = 0
    root_mean_squared_error_total = 0

    for (numerical, labels_task1) in dataloader:
        numerical, labels_task1 = numerical.to(device_in_use), labels_task1.to(device_in_use)
        #FT requires categorical, so must pass empty tensor since there are no cat features in this dataset
        categorical = torch.tensor([]).to(device_in_use)
        #running them through model and modifying the shape slightly for the loss function
        task_predictions = model(categorical, numerical)
        task_predictions = task_predictions.squeeze(1)
        
        loss = loss_function(task_predictions, labels_task1)
        total_loss += loss.item()
        
        # Calculate R^2 score for the regression task
        r2 = r2_score_manual(labels_task1, task_predictions)
        total_r2_score += r2

        # Calculate RMSE score for the regression task
        rmse_value = rmse(labels_task1, task_predictions)
        root_mean_squared_error_total+=rmse_value

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    avg_loss = total_loss / len(dataloader)
    avg_r2_score = total_r2_score / len(dataloader)
    avg_rmse_score = root_mean_squared_error_total / len(dataloader)

    return avg_loss, avg_r2_score, avg_rmse_score

def test(dataloader, model, loss_function, device_in_use):
  model.eval()
  total_loss = 0
  
  total_loss = 0
  total_r2_score = 0
  root_mean_squared_error_total = 0

  with torch.no_grad():
    for (numerical, labels_task1) in dataloader:
        numerical, labels_task1 = numerical.to(device_in_use), labels_task1.to(device_in_use)

        categorical = torch.tensor([]).to(device_in_use)
        task_predictions = model(categorical, numerical)
        task_predictions = task_predictions.squeeze(1)
        loss = loss_function(task_predictions, labels_task1)
        total_loss += loss.item()
        
        # Calculate R^2 score for the regression task
        r2 = r2_score_manual(labels_task1, task_predictions)
        total_r2_score += r2
        
        # Calculate RMSE score for the regression task
        rmse_value = rmse(labels_task1, task_predictions)
        root_mean_squared_error_total+=rmse_value

    avg_loss = total_loss / len(dataloader)
    avg_r2_score = total_r2_score / len(dataloader)
    avg_rmse_score = root_mean_squared_error_total / len(dataloader)

    return avg_loss, avg_r2_score, avg_rmse_score

def format_metric(value): # Used to format the metrics output
    return f"{value:.4f}"

def r2_score_manual(y_true, y_pred):
    # Calculate the mean of true labels
    y_mean = torch.mean(y_true)

    # Calculate the total sum of squares
    total_ss = torch.sum((y_true - y_mean)**2)

    # Calculate the residual sum of squares
    residual_ss = torch.sum((y_true - y_pred)**2)

    # Calculate R-squared
    r2 = 1 - (residual_ss / total_ss)

    return r2.item()  # Convert to a Python float

def rmse(y_true, y_pred):
    # Calculate the squared differences
    squared_diff = (y_true - y_pred)**2

    # Calculate the mean of the squared differences
    mean_squared_diff = torch.mean(squared_diff)

    # Calculate the square root to obtain RMSE
    rmse = torch.sqrt(mean_squared_diff)

    return rmse.item()  # Convert to a Python float

# RUN EXPERIMENTS

1. Using Optuna to optimize FT-Transformers hyperparameters for your dataset

In [20]:
# Define the early stopping mechanism
class EarlyStopping:
    def __init__(self, patience=5):
        self.patience = patience
        self.counter = 0
        self.best_metric = float('-inf')
        self.early_stop = False

    def __call__(self, metric):
        if metric > self.best_metric:
            self.best_metric = metric
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True

# Function to log results to a text file
def log_to_file(filename, text):
    with open(filename, 'a') as f:
        f.write(text + '\n')

def objective(trial):
    trial_number = trial.number
    '''
        attn_dropout = 0.,
        ff_dropout = 0.
    '''

    # Define hyperparameters to search over
    depth = trial.suggest_int('depth', 1, 2,3,4,5,6,7,8)
    # Ensure that embed_size is divisible by num_layers
    dim = trial.suggest_categorical("dim", [50, 60, 70, 80, 90, 100, 120, 140, 160])
    heads = trial.suggest_categorical("heads", [1, 5, 10])
    dim_head = trial.suggest_categorical("dim_head", [5,10,15,20,25,30])
    attn_dropout = trial.suggest_categorical('attn_dropout', [0,.1,.2,.5])
    ff_dropout = trial.suggest_categorical('ff_dropout', [0,.1,.2,.5])
    mlp_act = trial.suggest_categorical('mlp_act', [])

    learning_rate = trial.suggest_categorical('learning_rate', [0.0001, 0.001, 0.01])

    num_epochs = 2


    # Create your model with the sampled hyperparameters
    model = TabTransformer(categories=(),
                           num_continuous=len(cont_columns),
                           dim=dim,
                           depth=depth,
                           heads=heads,
                           dim_head=dim_head,
                           dim_out=1,
                           mlp_hidden_mults=,
                            mlp_act: Any | None = None,
                            num_special_tokens: int = 2,
                            continuous_mean_std: Any | None = None,
                            attn_dropout: float = 0,
                            ff_dropout: float = 0
                           )

    # Define loss function and optimizer
    loss_function = LossFunctions(1)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Initialize early stopping
    early_stopping = EarlyStopping(patience=10)  # Adjust patience as needed

    # Training loop with a large number of epochs
    for epoch in range(num_epochs):
        train_loss, r2_train, rmse_train = train(train_dataloader, model, loss_function, optimizer, device_in_use)
        
        # Validation loop
        test_loss, r2_test, rmse_test = test(val_dataloader, model, loss_function, device_in_use)
        
        # Check if we should early stop based on validation rmse
        if early_stopping(rmse_test):
            break
    
    # Log the final test accuracy for this trial to a shared log file
    final_log = f"Trial {trial_number} completed. Validation RMSE = {rmse_test:.4f}"
    log_to_file('all_trials_log.txt', final_log)

    # Return the test accuracy as the objective to optimize
    return rmse_test

In [22]:
# Set the number of optimization trials
num_trials = 3

# Create an Optuna study
study = optuna.create_study(direction='minimize')  # Maximize validation accuracy

# Start the optimization process
study.optimize(objective, n_trials=num_trials, show_progress_bar=True)

# Get the best hyperparameters and the validation accuracy at the point of early stopping
best_params = study.best_params
best_val_rmse = study.best_value

print("Best Hyperparameters:", best_params)
print("Best Validation RMSE (at Early Stopping):", best_val_rmse)

[I 2023-10-30 17:17:32,695] A new study created in memory with name: no-name-7d0d0827-4ac9-42bd-9699-146be859bef4
Best trial: 0. Best value: 1.85345:  33%|███▎      | 1/3 [00:00<00:01,  1.00it/s]

[I 2023-10-30 17:17:33,695] Trial 0 finished with value: 1.8534508485060472 and parameters: {'depth': 1, 'dim': 160, 'heads': 5, 'dim_head': 15, 'attn_dropout': 0, 'ff_dropout': 0.2, 'learning_rate': 0.01}. Best is trial 0 with value: 1.8534508485060472.


Best trial: 0. Best value: 1.85345:  67%|██████▋   | 2/3 [00:02<00:01,  1.05s/it]

[I 2023-10-30 17:17:34,782] Trial 1 finished with value: 2.040074797777029 and parameters: {'depth': 2, 'dim': 80, 'heads': 1, 'dim_head': 10, 'attn_dropout': 0.5, 'ff_dropout': 0.2, 'learning_rate': 0.0001}. Best is trial 0 with value: 1.8534508485060472.


Best trial: 2. Best value: 1.30279: 100%|██████████| 3/3 [00:03<00:00,  1.09s/it]

[I 2023-10-30 17:17:35,980] Trial 2 finished with value: 1.3027858092234685 and parameters: {'depth': 2, 'dim': 80, 'heads': 10, 'dim_head': 25, 'attn_dropout': 0, 'ff_dropout': 0.5, 'learning_rate': 0.001}. Best is trial 2 with value: 1.3027858092234685.
Best Hyperparameters: {'depth': 2, 'dim': 80, 'heads': 10, 'dim_head': 25, 'attn_dropout': 0, 'ff_dropout': 0.5, 'learning_rate': 0.001}
Best Validation RMSE (at Early Stopping): 1.3027858092234685





In [23]:
#Testing against the test dataset
model = FTTransformer(categories=(),
                          num_continuous=len(cont_columns),
                          dim=best_params['dim'],
                          depth=best_params['depth'],
                          heads=best_params['heads'],
                          dim_head=best_params['dim_head'],
                          dim_out=1,
                          attn_dropout=best_params['attn_dropout'],
                          ff_dropout=best_params['ff_dropout']).to(device_in_use)
loss_functions = LossFunctions(1)
optimizer = torch.optim.Adam(params=model.parameters(), lr = best_params['learning_rate']) # Maybe try messing around with optimizers. try other torch optimizers with different configurations.
epochs = 75 #Set the number of epochs

train_losses = []
train_accuracies_1 = [] 
train_accuracies_2 = []
train_recalls = [] 
train_f1_scores = [] 
test_losses = []
test_accuracies_1 = []
test_accuracies_2 = []
test_recalls = []  
test_f1_scores = [] 
all_attention_scores = []

for t in range(epochs):
  train_loss, r2_train, rmse_train = train(train_dataloader, model, loss_functions, optimizer, device_in_use=device_in_use)
  test_loss, r2_test, rmse_test = test(test_dataloader, model, loss_functions, device_in_use=device_in_use)
  train_losses.append(train_loss)

  # train_accuracies_2.append(train_accuracy_2)
  # train_recalls.append(train_recall) 
  # train_f1_scores.append(train_f1)
  test_losses.append(test_loss)

  # test_accuracies_2.append(test_accuracy_2)
  # test_recalls.append(test_recall)
  # test_f1_scores.append(test_f1)
  # Formatting for easier reading
  epoch_str = f"Epoch [{t+1:2}/{epochs}]"
  train_metrics = f"Train: Loss {format_metric(train_loss)}, R2 {format_metric(r2_train)}, RMSE {format_metric(rmse_train)}"
  test_metrics = f"Test: Loss {format_metric(test_loss)}, R2 {format_metric(r2_test)}, RMSE {format_metric(rmse_test)}"
  print(f"{epoch_str:20} | {train_metrics:65} | {test_metrics}")


torch.save(model.state_dict(), 'final_model_trained.pth')

# Plotting the loss curves
plt.figure(figsize=(15, 5))
plt.subplot(1, 1, 1)
plt.plot(range(1, epochs+1), train_losses, label='Train Loss')
plt.plot(range(1, epochs+1), [l for l in test_losses], label='Test Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Test Loss Curve')
plt.legend()


Epoch [ 1/75]        | Train: Loss 3.9265, R2 -0.0887, RMSE 1.9477                       | Test: Loss 2.5701, R2 0.2847, RMSE 1.5962
Epoch [ 2/75]        | Train: Loss 1.9876, R2 0.4561, RMSE 1.3922                        | Test: Loss 1.5746, R2 0.5792, RMSE 1.2507
Epoch [ 3/75]        | Train: Loss 1.2633, R2 0.6523, RMSE 1.1154                        | Test: Loss 1.1124, R2 0.6911, RMSE 1.0498
Epoch [ 4/75]        | Train: Loss 1.0668, R2 0.7062, RMSE 1.0273                        | Test: Loss 1.2599, R2 0.7102, RMSE 1.0760
Epoch [ 5/75]        | Train: Loss 0.9745, R2 0.7318, RMSE 0.9786                        | Test: Loss 0.9202, R2 0.7518, RMSE 0.9544
Epoch [ 6/75]        | Train: Loss 0.9247, R2 0.7437, RMSE 0.9560                        | Test: Loss 0.9212, R2 0.7473, RMSE 0.9536
Epoch [ 7/75]        | Train: Loss 0.8987, R2 0.7496, RMSE 0.9439                        | Test: Loss 0.8648, R2 0.7584, RMSE 0.9247
Epoch [ 8/75]        | Train: Loss 0.8615, R2 0.7620, RMSE 0.9234    

KeyboardInterrupt: 