In [5]:
import sys
sys.path.insert(0, '/home/wdwatson2/projects/CAT-Transformer/model')
# sys.path.insert(0, r'C:\Users\smbm2\projects\CAT-Transformer\model')
# sys.path.insert(0, '/home/warin/projects/CAT-Transformer/model')
from testingModel import CATTransformer, MyFTTransformer, Combined_Dataset, train, test, EarlyStopping
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, f1_score, recall_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import optuna
from optuna.trial import TrialState

# Prefer the GPU whenever CUDA is usable; otherwise run everything on the CPU.
if torch.cuda.is_available():
    device_in_use = 'cuda'
else:
    device_in_use = 'cpu'
print(device_in_use)

cuda


In [6]:
# Folder holding the pre-split California housing CSVs. Kept in one place so the
# notebook can be pointed at another checkout by editing a single line.
DATA_DIR = '/home/wdwatson2/projects/CAT-Transformer/datasets/california'
df_train = pd.read_csv(f'{DATA_DIR}/train.csv')
df_test = pd.read_csv(f'{DATA_DIR}/test.csv')
df_val = pd.read_csv(f'{DATA_DIR}/validation.csv')

# Feature roles: continuous inputs, the regression target, and (none here) categorical inputs.
cont_columns = [ 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup',
       'Latitude', 'Longitude']
target = ['MedInc']
cat_columns=[]

#CHECKING TO MAKE SURE YOUR LIST IS CORRECT (NO NEED TO TOUCH)
# Every column of the training frame must be declared exactly once across the
# continuous, categorical and target lists. Categorical columns are included so
# the check keeps working when cat_columns is non-empty.
yourlist = sorted(cont_columns + cat_columns + target)
oglist = sorted(df_train.columns)

cat_features=()

assert(yourlist == oglist), "You may of spelled feature name wrong or you forgot to put on of them in the list"

# Largest number of distinct target values found in any split. For this continuous
# target it is simply a count of unique values; it is forwarded to the model
# constructor in the objective cell.
# NOTE(review): whether the model uses this when regression_on=True is not visible here - confirm.
target_classes = [max(len(df_train[target].value_counts()), len(df_val[target].value_counts()),len(df_test[target].value_counts()))]
print(target_classes)
# Create a StandardScaler and fit it to the cont features
# (fit on the training split only, so no validation/test statistics leak into it)
scaler = StandardScaler()
scaler.fit(df_train[cont_columns])

# Transform the training, test, and validation datasets
df_train[cont_columns] = scaler.transform(df_train[cont_columns])
df_test[cont_columns] = scaler.transform(df_test[cont_columns])
df_val[cont_columns] = scaler.transform(df_val[cont_columns])

#Wrapping in Dataset
train_dataset = Combined_Dataset(df_train, cat_columns=cat_columns, num_columns=cont_columns, task1_column=target[0])
val_dataset = Combined_Dataset(df_val, cat_columns=cat_columns, num_columns=cont_columns, task1_column=target[0])
test_dataset = Combined_Dataset(df_test, cat_columns=cat_columns, num_columns=cont_columns, task1_column=target[0])

batch_size = 256

# Wrapping with DataLoader for easy batch extraction
# Only the training loader shuffles. Evaluation loaders keep a fixed order so
# validation/test metrics do not depend on a random batch composition.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

[9851]


In [7]:
def objective(trial):
    """Optuna objective: train one CATTransformer configuration and score it.

    Samples a hyperparameter configuration from ``trial``, builds a
    CATTransformer in regression mode, and trains it with AdamW and MSE loss
    for at most ``epochs`` epochs. Training stops as soon as the EarlyStopping
    helper raises its ``early_stop`` flag on the validation RMSE.

    Reads these notebook-level names: ``cont_columns``, ``cat_columns``,
    ``target_classes``, ``train_dataloader``, ``val_dataloader`` and
    ``device_in_use``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The validation RMSE of the last epoch that ran. This is the value at
        the moment training stopped, not necessarily the lowest value seen
        during training.
    """
    # Define hyperparameters to search over
    # NOTE: the Optuna parameter is named 'sigma' although it feeds the model's `alpha` argument.
    alpha = trial.suggest_float('sigma', 0.001, 5, log=True)
    num_layers = trial.suggest_int('num_layers', 1, 5)
    # Every embed_size candidate is a multiple of 10, hence divisible by each
    # `heads` option (1, 5, 10) - presumably required by the attention layers.
    embed_size = trial.suggest_categorical("embed_size", [50, 60, 70, 80, 90, 100, 120, 140, 160, 180, 200, 250, 350, 500])
    heads = trial.suggest_categorical("heads", [1, 5, 10])
    forward_expansion = trial.suggest_int('forward_expansion', 1, 8)
    pre_norm_on = trial.suggest_categorical('prenorm_on', [True, False])
    mlp_scale_classification = trial.suggest_int('mlp_scale_classification', 1, 8)
    decoder_dropout = trial.suggest_categorical('decoder_dropout', [0,.1,.2,.5])
    classification_dropout = trial.suggest_categorical('class_drop', [0,.1,.2,.5])

    learning_rate = trial.suggest_float('learning_rate', 0.00001, 0.001, log=True)
    weight_decay = trial.suggest_float('weight_decay', 0.000001, 0.001, log=True)

    # Upper bound on training length; early stopping normally ends a trial sooner.
    epochs = 400

    # Create your model with the sampled hyperparameters
    model = CATTransformer(alpha = alpha,
                           embed_size= embed_size,
                           n_cont = len(cont_columns),
                           cat_feat=cat_columns,
                           num_layers=num_layers,
                           heads=heads,
                           forward_expansion=forward_expansion,
                           decoder_dropout=decoder_dropout,
                           classification_dropout=classification_dropout,
                           pre_norm_on=pre_norm_on,
                           mlp_scale_classification=mlp_scale_classification,
                           targets_classes=target_classes,
                           regression_on=True
                           ).to(device_in_use)

    # Define loss function and optimizer
    loss_function = nn.MSELoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr = learning_rate, weight_decay=weight_decay)

    # Initialize early stopping
    # (presumably stops after 20 epochs without a lower validation RMSE - confirm in testingModel)
    early_stopping = EarlyStopping(patience=20, mode='min', verbose=False)  # Adjust patience as needed

    # Training loop with a large number of epochs
    for t in range(epochs):
        train_loss, train_rmse = train(regression_on=True,
                                    get_attn=False,
                                    dataloader=train_dataloader,
                                    model=model,
                                    loss_function=loss_function,
                                    optimizer=optimizer,
                                    device_in_use=device_in_use)
        val_loss, val_rmse = test(regression_on=True,
                                  get_attn=False,
                                   dataloader=val_dataloader,
                                   model=model,
                                   loss_function=loss_function,
                                   device_in_use=device_in_use)
        # Check if we should early stop based on validation rmse
        early_stopping(val_rmse)

        if early_stopping.early_stop:
            print("Early stopping")
            break

    # Return the validation rmse of the final epoch as the objective to minimize
    return val_rmse

In [8]:
# Search budget: how many hyperparameter configurations Optuna evaluates
num_trials = 100

# The objective returns a validation RMSE, so the study minimizes it
study = optuna.create_study(direction='minimize')

# Run the search, with a progress bar while the trials execute
study.optimize(objective, n_trials=num_trials, show_progress_bar=True)

# Read the winning configuration and its validation RMSE back from the study
best_params, best_val_rmse = study.best_params, study.best_value

print("Best Hyperparameters:", best_params)
print("Best Validation RMSE (at Early Stopping):", best_val_rmse)

[I 2024-01-06 15:46:54,615] A new study created in memory with name: no-name-377a34eb-ed99-47e0-a8ad-52699356170a
Best trial: 0. Best value: 0.944113:   1%|          | 1/100 [00:06<10:15,  6.22s/it]

Early stopping
[I 2024-01-06 15:47:00,835] Trial 0 finished with value: 0.9441127502001249 and parameters: {'sigma': 5, 'num_layers': 2, 'embed_size': 200, 'heads': 1, 'forward_expansion': 7, 'prenorm_on': True, 'mlp_scale_classification': 7, 'decoder_dropout': 0.2, 'class_drop': 0, 'learning_rate': 0.00017877835206345802, 'weight_decay': 0.00032184106357890674}. Best is trial 0 with value: 0.9441127502001249.


Best trial: 1. Best value: 0.835411:   2%|▏         | 2/100 [00:14<12:20,  7.56s/it]

Early stopping
[I 2024-01-06 15:47:09,328] Trial 1 finished with value: 0.8354114981798025 and parameters: {'sigma': 1.5, 'num_layers': 2, 'embed_size': 200, 'heads': 1, 'forward_expansion': 1, 'prenorm_on': True, 'mlp_scale_classification': 5, 'decoder_dropout': 0, 'class_drop': 0.5, 'learning_rate': 0.00011427919398924068, 'weight_decay': 4.170626692543791e-06}. Best is trial 1 with value: 0.8354114981798025.


Best trial: 1. Best value: 0.835411:   3%|▎         | 3/100 [00:25<14:56,  9.24s/it]

Early stopping
[I 2024-01-06 15:47:20,570] Trial 2 finished with value: 0.9838140331781827 and parameters: {'sigma': 1, 'num_layers': 2, 'embed_size': 90, 'heads': 5, 'forward_expansion': 6, 'prenorm_on': False, 'mlp_scale_classification': 3, 'decoder_dropout': 0.2, 'class_drop': 0, 'learning_rate': 9.801491910477277e-05, 'weight_decay': 4.8136222423471674e-06}. Best is trial 1 with value: 0.8354114981798025.


Best trial: 1. Best value: 0.835411:   4%|▍         | 4/100 [00:36<15:27,  9.66s/it]

Early stopping
[I 2024-01-06 15:47:30,879] Trial 3 finished with value: 0.8818699946770301 and parameters: {'sigma': 2, 'num_layers': 1, 'embed_size': 90, 'heads': 10, 'forward_expansion': 7, 'prenorm_on': False, 'mlp_scale_classification': 1, 'decoder_dropout': 0.1, 'class_drop': 0.5, 'learning_rate': 0.0005656440801888712, 'weight_decay': 8.264462139093583e-05}. Best is trial 1 with value: 0.8354114981798025.


Best trial: 1. Best value: 0.835411:   5%|▌         | 5/100 [00:46<15:47,  9.97s/it]

Early stopping
[I 2024-01-06 15:47:41,398] Trial 4 finished with value: 1.008298479593717 and parameters: {'sigma': 0.1, 'num_layers': 1, 'embed_size': 350, 'heads': 1, 'forward_expansion': 1, 'prenorm_on': False, 'mlp_scale_classification': 1, 'decoder_dropout': 0.1, 'class_drop': 0.1, 'learning_rate': 0.00033812146661804237, 'weight_decay': 1.4826048915235332e-05}. Best is trial 1 with value: 0.8354114981798025.


Best trial: 1. Best value: 0.835411:   6%|▌         | 6/100 [01:03<18:58, 12.11s/it]

Early stopping
[I 2024-01-06 15:47:57,655] Trial 5 finished with value: 2.049694675665635 and parameters: {'sigma': 2, 'num_layers': 2, 'embed_size': 500, 'heads': 1, 'forward_expansion': 7, 'prenorm_on': False, 'mlp_scale_classification': 4, 'decoder_dropout': 0.5, 'class_drop': 0.1, 'learning_rate': 4.6176678378892705e-05, 'weight_decay': 6.699805149429496e-05}. Best is trial 1 with value: 0.8354114981798025.


Best trial: 1. Best value: 0.835411:   7%|▋         | 7/100 [01:16<19:22, 12.50s/it]

Early stopping
[I 2024-01-06 15:48:10,962] Trial 6 finished with value: 0.8638921059094943 and parameters: {'sigma': 1.5, 'num_layers': 1, 'embed_size': 60, 'heads': 10, 'forward_expansion': 3, 'prenorm_on': True, 'mlp_scale_classification': 3, 'decoder_dropout': 0.1, 'class_drop': 0.1, 'learning_rate': 0.0002043562186881004, 'weight_decay': 0.0009112040194628541}. Best is trial 1 with value: 0.8354114981798025.


Best trial: 1. Best value: 0.835411:   8%|▊         | 8/100 [01:25<17:44, 11.57s/it]

Early stopping
[I 2024-01-06 15:48:20,528] Trial 7 finished with value: 1.1236858459619374 and parameters: {'sigma': 5, 'num_layers': 1, 'embed_size': 90, 'heads': 1, 'forward_expansion': 8, 'prenorm_on': False, 'mlp_scale_classification': 3, 'decoder_dropout': 0.5, 'class_drop': 0.2, 'learning_rate': 0.0007454078455488071, 'weight_decay': 2.0644149689118215e-05}. Best is trial 1 with value: 0.8354114981798025.


Best trial: 1. Best value: 0.835411:   9%|▉         | 9/100 [01:41<19:20, 12.75s/it]

Early stopping
[I 2024-01-06 15:48:35,896] Trial 8 finished with value: 1.0585916867622962 and parameters: {'sigma': 0.001, 'num_layers': 1, 'embed_size': 100, 'heads': 5, 'forward_expansion': 3, 'prenorm_on': True, 'mlp_scale_classification': 8, 'decoder_dropout': 0.1, 'class_drop': 0, 'learning_rate': 0.00035446565874941443, 'weight_decay': 2.7355143661714135e-05}. Best is trial 1 with value: 0.8354114981798025.


Best trial: 1. Best value: 0.835411:  10%|█         | 10/100 [01:51<17:47, 11.86s/it]

Early stopping
[I 2024-01-06 15:48:45,747] Trial 9 finished with value: 1.4076829781899085 and parameters: {'sigma': 0.001, 'num_layers': 2, 'embed_size': 160, 'heads': 1, 'forward_expansion': 5, 'prenorm_on': True, 'mlp_scale_classification': 1, 'decoder_dropout': 0.1, 'class_drop': 0.1, 'learning_rate': 5.8767756837626887e-05, 'weight_decay': 3.31668063444782e-05}. Best is trial 1 with value: 0.8354114981798025.


Best trial: 1. Best value: 0.835411:  11%|█         | 11/100 [02:07<19:36, 13.22s/it]

Early stopping
[I 2024-01-06 15:49:02,068] Trial 10 finished with value: 0.8488559906299298 and parameters: {'sigma': 3, 'num_layers': 2, 'embed_size': 200, 'heads': 5, 'forward_expansion': 1, 'prenorm_on': True, 'mlp_scale_classification': 6, 'decoder_dropout': 0, 'class_drop': 0.5, 'learning_rate': 1.3967027913184602e-05, 'weight_decay': 1.0727651164457411e-06}. Best is trial 1 with value: 0.8354114981798025.


Best trial: 1. Best value: 0.835411:  12%|█▏        | 12/100 [02:24<21:02, 14.35s/it]

Early stopping
[I 2024-01-06 15:49:18,980] Trial 11 finished with value: 0.8852759141188401 and parameters: {'sigma': 3, 'num_layers': 2, 'embed_size': 200, 'heads': 5, 'forward_expansion': 1, 'prenorm_on': True, 'mlp_scale_classification': 6, 'decoder_dropout': 0, 'class_drop': 0.5, 'learning_rate': 1.1692226936423669e-05, 'weight_decay': 1.1130471991899249e-06}. Best is trial 1 with value: 0.8354114981798025.


Best trial: 1. Best value: 0.835411:  13%|█▎        | 13/100 [02:46<24:13, 16.71s/it]

Early stopping
[I 2024-01-06 15:49:41,128] Trial 12 finished with value: 0.8429606602742121 and parameters: {'sigma': 4, 'num_layers': 2, 'embed_size': 180, 'heads': 5, 'forward_expansion': 3, 'prenorm_on': True, 'mlp_scale_classification': 6, 'decoder_dropout': 0, 'class_drop': 0.5, 'learning_rate': 1.012525103100322e-05, 'weight_decay': 1.0611785198711922e-06}. Best is trial 1 with value: 0.8354114981798025.


Best trial: 1. Best value: 0.835411:  14%|█▍        | 14/100 [03:11<27:25, 19.14s/it]

Early stopping
[I 2024-01-06 15:50:05,880] Trial 13 finished with value: 0.8455723478243902 and parameters: {'sigma': 4, 'num_layers': 2, 'embed_size': 120, 'heads': 5, 'forward_expansion': 3, 'prenorm_on': True, 'mlp_scale_classification': 5, 'decoder_dropout': 0, 'class_drop': 0.5, 'learning_rate': 2.5744126186095003e-05, 'weight_decay': 4.049400073724855e-06}. Best is trial 1 with value: 0.8354114981798025.


Best trial: 1. Best value: 0.835411:  15%|█▌        | 15/100 [03:37<30:04, 21.22s/it]

Early stopping
[I 2024-01-06 15:50:31,937] Trial 14 finished with value: 0.8724183761156522 and parameters: {'sigma': 0.5, 'num_layers': 2, 'embed_size': 180, 'heads': 10, 'forward_expansion': 2, 'prenorm_on': True, 'mlp_scale_classification': 5, 'decoder_dropout': 0, 'class_drop': 0.5, 'learning_rate': 3.174120543975353e-05, 'weight_decay': 3.3870847116262672e-06}. Best is trial 1 with value: 0.8354114981798025.


Best trial: 1. Best value: 0.835411:  15%|█▌        | 15/100 [03:44<21:13, 14.98s/it]


[W 2024-01-06 15:50:39,350] Trial 15 failed with parameters: {'sigma': 4, 'num_layers': 2, 'embed_size': 140, 'heads': 1, 'forward_expansion': 2, 'prenorm_on': True, 'mlp_scale_classification': 7, 'decoder_dropout': 0, 'class_drop': 0.2, 'learning_rate': 9.633500006839301e-05, 'weight_decay': 2.2411469185416494e-06} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/home/wdwatson2/miniconda3/envs/ml-env/lib/python3.11/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/tmp/ipykernel_122358/1535139487.py", line 46, in objective
    train_loss, train_rmse = train(regression_on=True,
                            ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/wdwatson2/projects/CAT-Transformer/model/testingModel.py", line 748, in train
    loss = loss_function(predictions, labels.unsqueeze(1))
    ^^^^^^^^^^^^^^^
  File "/home/wdwatson2/miniconda3/envs/ml-env/li

KeyboardInterrupt: 