In [1]:
import sys
# sys.path.insert(0, '/home/wdwatson2/projects/CAT-Transformer/model')
sys.path.insert(0, r'C:\Users\smbm2\projects\CAT-Transformer\model')
# sys.path.insert(0, '/home/warin/projects/CAT-Transformer/model')
from testingModel import CATTransformer, MyFTTransformer, Combined_Dataset, train, test, EarlyStopping
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, f1_score, recall_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import optuna
from optuna.trial import TrialState

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device_in_use = 'cuda'
else:
    device_in_use = 'cpu'
print(device_in_use)

cuda


In [2]:
# Dataset locations are machine-specific; the commented paths are the
# alternative checkout used on another machine.
# df_train = pd.read_csv('/home/wdwatson2/projects/CAT-Transformer/datasets/california/train.csv')
# df_test = pd.read_csv('/home/wdwatson2/projects/CAT-Transformer/datasets/california/test.csv')
# df_val = pd.read_csv('/home/wdwatson2/projects/CAT-Transformer/datasets/california/validation.csv')

df_train = pd.read_csv(r'C:\Users\smbm2\projects\CAT-Transformer\datasets\california\train.csv')
df_test = pd.read_csv(r'C:\Users\smbm2\projects\CAT-Transformer\datasets\california\test.csv')
df_val = pd.read_csv(r'C:\Users\smbm2\projects\CAT-Transformer\datasets\california\validation.csv') #READ FROM RIGHT SPOT

# Column roles: continuous inputs, the regression target, and (none here) categorical inputs.
cont_columns = [ 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup',
       'Latitude', 'Longitude']
target = ['MedInc']
cat_columns=[]

# Sanity check (no need to touch): the declared columns must account for
# every column in the training CSV. Categorical columns are included so the
# check stays valid once cat_columns is non-empty.
yourlist = sorted(cont_columns + cat_columns + target)
oglist = sorted(df_train.columns)

cat_features=()

assert yourlist == oglist, "You may of spelled feature name wrong or you forgot to put on of them in the list"

# Largest number of distinct target values seen in any split.
# NOTE(review): for a continuous target this is just a count of distinct
# values; whether CATTransformer uses it when regression_on=True is not
# visible from this notebook -- confirm.
target_classes = [max(len(df_train[target].value_counts()), len(df_val[target].value_counts()),len(df_test[target].value_counts()))]
print(target_classes)

# Fit the scaler on the training split only, so no statistics leak from
# the validation/test splits into preprocessing.
scaler = StandardScaler()
scaler.fit(df_train[cont_columns])

# Apply the training-set scaling to all three splits
df_train[cont_columns] = scaler.transform(df_train[cont_columns])
df_test[cont_columns] = scaler.transform(df_test[cont_columns])
df_val[cont_columns] = scaler.transform(df_val[cont_columns])

# Wrapping in Dataset
train_dataset = Combined_Dataset(df_train, cat_columns=cat_columns, num_columns=cont_columns, task1_column=target[0])
val_dataset = Combined_Dataset(df_val, cat_columns=cat_columns, num_columns=cont_columns, task1_column=target[0])
test_dataset = Combined_Dataset(df_test, cat_columns=cat_columns, num_columns=cont_columns, task1_column=target[0])

batch_size = 256

# Wrapping with DataLoader for easy batch extraction.
# Only the training loader shuffles. Evaluation loaders keep a fixed order so
# the batches (and any per-batch metric aggregation) are identical on every
# pass, which keeps early stopping and the tuning objective comparable.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

[9851]


In [3]:
def objective(trial, epochs=400, patience=20):
    """Optuna objective: train one sampled CATTransformer and score it on validation data.

    Samples a hyperparameter configuration from ``trial``, builds a
    ``CATTransformer`` with ``regression_on=True``, and trains it with AdamW
    and MSE loss on the module-level ``train_dataloader``. After each epoch it
    evaluates on ``val_dataloader`` and feeds the validation metric to
    ``EarlyStopping``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        epochs: Maximum number of training epochs. Defaults to 400, the
            value that used to be hard-coded.
        patience: ``patience`` argument forwarded to ``EarlyStopping``.
            Defaults to 20, the value that used to be hard-coded.

    Returns:
        The second value returned by ``test`` (treated here as the validation
        RMSE) for the last epoch that ran.

    Raises:
        ValueError: If ``epochs`` is smaller than 1, since no validation
            score would exist to return.
    """
    if epochs < 1:
        raise ValueError("epochs must be at least 1")

    # --- Hyperparameter search space -------------------------------------
    # Registered with Optuna as 'sigma' but passed to the model as `alpha`;
    # the key is kept so existing study.best_params consumers keep working.
    alpha = trial.suggest_float('sigma', 0.001, 5, log=True)
    num_layers = trial.suggest_int('num_layers', 1, 5)
    # Every candidate embed_size is a multiple of 10, so it divides evenly by
    # each `heads` option (1, 5, 10) -- presumably a requirement of the
    # model's multi-head attention; confirm in testingModel.
    embed_size = trial.suggest_categorical("embed_size", [50, 60, 70, 80, 90, 100, 120, 140, 160, 180, 200, 250, 350, 500])
    heads = trial.suggest_categorical("heads", [1, 5, 10])
    forward_expansion = trial.suggest_int('forward_expansion', 1, 8)
    pre_norm_on = trial.suggest_categorical('prenorm_on', [True, False])
    mlp_scale_classification = trial.suggest_int('mlp_scale_classification', 1, 8)
    decoder_dropout = trial.suggest_categorical('decoder_dropout', [0,.1,.2,.5])
    classification_dropout = trial.suggest_categorical('class_drop', [0,.1,.2,.5])

    learning_rate = trial.suggest_float('learning_rate', 0.00001, 0.001, log=True)
    weight_decay = trial.suggest_float('weight_decay', 0.000001, 0.001, log=True)

    # --- Model, loss and optimizer for this trial ------------------------
    model = CATTransformer(alpha = alpha,
                           embed_size= embed_size,
                           n_cont = len(cont_columns),
                           cat_feat=cat_columns,
                           num_layers=num_layers,
                           heads=heads,
                           forward_expansion=forward_expansion,
                           decoder_dropout=decoder_dropout,
                           classification_dropout=classification_dropout,
                           pre_norm_on=pre_norm_on,
                           mlp_scale_classification=mlp_scale_classification,
                           targets_classes=target_classes,
                           regression_on=True
                           ).to(device_in_use)

    loss_function = nn.MSELoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr = learning_rate, weight_decay=weight_decay)

    # mode='min' because a lower validation RMSE is better.
    early_stopping = EarlyStopping(patience=patience, mode='min', verbose=False)

    # --- Training loop: `epochs` is only an upper bound ------------------
    for _ in range(epochs):
        # Training metrics are not used for model selection, but the
        # two-value unpacking is kept so the expected return shape of
        # train() is still enforced.
        _train_loss, _train_rmse = train(regression_on=True,
                                         get_attn=False,
                                         dataloader=train_dataloader,
                                         model=model,
                                         loss_function=loss_function,
                                         optimizer=optimizer,
                                         device_in_use=device_in_use)
        _val_loss, val_rmse = test(regression_on=True,
                                   get_attn=False,
                                   dataloader=val_dataloader,
                                   model=model,
                                   loss_function=loss_function,
                                   device_in_use=device_in_use)

        # Early stopping is driven by the validation RMSE
        early_stopping(val_rmse)
        if early_stopping.early_stop:
            print("Early stopping")
            break

    # NOTE(review): this is the validation RMSE of the last epoch run, i.e.
    # the value at the moment early stopping fired, not the best value seen
    # during training. Optuna itself logs the per-trial result.
    return val_rmse

In [4]:
# Search budget: how many hyperparameter configurations to evaluate
num_trials = 150

# The objective returns a validation RMSE, so the study minimises it
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=num_trials, show_progress_bar=True)

# Winning configuration and its objective value (the validation RMSE that
# the corresponding trial returned)
best_params, best_val_rmse = study.best_params, study.best_value

print("Best Hyperparameters:", best_params)
print("Best Validation RMSE (at Early Stopping):", best_val_rmse)

[I 2024-01-06 15:58:16,669] A new study created in memory with name: no-name-ae8aed08-80df-451a-851c-f0bbd78f683d


  0%|          | 0/150 [00:00<?, ?it/s]

Early stopping
[I 2024-01-06 15:59:14,323] Trial 0 finished with value: 1.8552341277782733 and parameters: {'sigma': 0.07238185835822583, 'num_layers': 2, 'embed_size': 140, 'heads': 10, 'forward_expansion': 2, 'prenorm_on': True, 'mlp_scale_classification': 3, 'decoder_dropout': 0.1, 'class_drop': 0.5, 'learning_rate': 1.676847038341081e-05, 'weight_decay': 0.0002899079579516983}. Best is trial 0 with value: 1.8552341277782733.
Early stopping
[I 2024-01-06 16:02:52,310] Trial 1 finished with value: 0.9835986907665546 and parameters: {'sigma': 0.0174490961493367, 'num_layers': 4, 'embed_size': 60, 'heads': 10, 'forward_expansion': 6, 'prenorm_on': True, 'mlp_scale_classification': 5, 'decoder_dropout': 0, 'class_drop': 0.1, 'learning_rate': 2.5461021227494194e-05, 'weight_decay': 0.00019337611397390493}. Best is trial 1 with value: 0.9835986907665546.
Early stopping
[I 2024-01-06 16:03:28,859] Trial 2 finished with value: 1.9012051362257738 and parameters: {'sigma': 0.07978026647801911