In [4]:
import sys
sys.path.insert(0, '/home/wdwatson2/projects/CAT-Transformer/model')
# sys.path.insert(0, r'C:\Users\smbm2\projects\CAT-Transformer\model')
# sys.path.insert(0, '/home/warin/projects/CAT-Transformer/model')
from testingModel import CATTransformer, MyFTTransformer, Combined_Dataset, train, test, EarlyStopping
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, f1_score, recall_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import optuna
from optuna.trial import TrialState

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device_in_use = 'cuda'
else:
    device_in_use = 'cpu'
print(device_in_use)

cuda


In [5]:
# Directory holding the pre-split HIGGS CSV files; defined once so the three
# reads below cannot drift apart.
DATA_DIR = '/home/wdwatson2/projects/CAT-Transformer/datasets/higgs'

df_train = pd.read_csv(f'{DATA_DIR}/train.csv')
df_test = pd.read_csv(f'{DATA_DIR}/test.csv')
df_val = pd.read_csv(f'{DATA_DIR}/validation.csv')

# Continuous (numeric) input features.
cont_columns = ['lepton_pT', 'lepton_eta', 'lepton_phi',
       'missing_energy_magnitude', 'missing_energy_phi', 'jet1pt', 'jet1eta',
       'jet1phi', 'jet1b-tag', 'jet2pt', 'jet2eta', 'jet2phi', 'jet2b-tag',
       'jet3pt', 'jet3eta', 'jet3phi', 'jet3b-tag', 'jet4pt', 'jet4eta',
       'jet4phi', 'jet4b-tag', 'm_jj', 'm_jjj', 'm_lv', 'm_jlv', 'm_bb',
       'm_wbb', 'm_wwbb']
target = ['class']
# No categorical features are declared for this dataset.
cat_columns = []


# Sanity check (no need to touch): the declared feature/target names must
# match the training frame's columns exactly, ignoring order.
yourlist = sorted(cont_columns + cat_columns + target)
oglist = sorted(df_train.columns)

assert yourlist == oglist, "You may of spelled feature name wrong or you forgot to put on of them in the list"

cat_features = ()

# One entry per prediction task: the largest number of distinct target values
# observed in any of the three splits.
target_classes = [max(len(split[target].value_counts())
                      for split in (df_train, df_val, df_test))]
print(target_classes)

# Fit the scaler on the training split only, then apply the same
# transformation to every split.
scaler = StandardScaler()
scaler.fit(df_train[cont_columns])

df_train[cont_columns] = scaler.transform(df_train[cont_columns])
df_test[cont_columns] = scaler.transform(df_test[cont_columns])
df_val[cont_columns] = scaler.transform(df_val[cont_columns])

# Wrapping in Dataset
train_dataset = Combined_Dataset(df_train, cat_columns=cat_columns, num_columns=cont_columns, task1_column=target[0])
val_dataset = Combined_Dataset(df_val, cat_columns=cat_columns, num_columns=cont_columns, task1_column=target[0])
test_dataset = Combined_Dataset(df_test, cat_columns=cat_columns, num_columns=cont_columns, task1_column=target[0])

batch_size = 256

# Wrapping with DataLoader for easy batch extraction.
# Only the training loader is shuffled. The evaluation loaders keep a fixed
# order so batch composition is identical from run to run.
# NOTE(review): if `test` averages per-batch metrics, a shuffled loader would
# also make the reported value depend on which rows land in the final partial
# batch -- confirm against testingModel.test.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

[2]


In [8]:
def objective(trial, num_epochs=400, patience=10):
    """Optuna objective: train one CATTransformer configuration and score it.

    Samples a hyperparameter configuration from ``trial``, builds the model on
    ``device_in_use``, and trains it on ``train_dataloader`` while evaluating
    on ``val_dataloader`` after every epoch. Training stops when the
    ``EarlyStopping`` helper signals a stop or after ``num_epochs`` epochs,
    whichever comes first.

    Args:
        trial: The ``optuna`` trial used to sample hyperparameters.
        num_epochs: Upper bound on the number of training epochs (default 400).
        patience: Value passed to ``EarlyStopping(patience=...)`` (default 10).

    Returns:
        The validation accuracy measured in the last epoch that was run
        (i.e. at the point of early stopping, not the best value seen).

    Raises:
        ValueError: If ``num_epochs`` is smaller than 1, since no validation
            accuracy would exist to return.
    """
    if num_epochs < 1:
        raise ValueError("num_epochs must be at least 1")

    # Define hyperparameters to search over.
    # NOTE: `alpha` is registered with Optuna under the name 'sigma'; the name
    # is kept so parameter names stay comparable with previously logged trials.
    alpha = trial.suggest_categorical('sigma', [.001, 0.01, 0.1, .5, 1, 1.5, 2, 3, 4, 5])
    num_layers = trial.suggest_int('num_layers', 1, 2)
    # Every embed_size choice is a multiple of 10, so it is divisible by every
    # `heads` choice below (1, 5, 10) -- presumably required by the model's
    # multi-head attention split; confirm in CATTransformer.
    embed_size = trial.suggest_categorical("embed_size", [50, 60, 70, 80, 90, 100, 120, 140, 160, 180, 200, 250, 350, 500])
    heads = trial.suggest_categorical("heads", [1, 5, 10])
    forward_expansion = trial.suggest_int('forward_expansion', 1, 8)
    pre_norm_on = trial.suggest_categorical('prenorm_on', [True, False])
    mlp_scale_classification = trial.suggest_int('mlp_scale_classification', 1, 8)
    decoder_dropout = trial.suggest_categorical('decoder_dropout', [0,.1,.2,.5])
    classification_dropout = trial.suggest_categorical('class_drop', [0,.1,.2,.5])

    learning_rate = trial.suggest_categorical('learning_rate', [0.00001, 0.0001, 0.001, 0.01, 0.1])

    # Create the model with the sampled hyperparameters
    model = CATTransformer(alpha=alpha,
                           embed_size=embed_size,
                           n_cont=len(cont_columns),
                           cat_feat=cat_columns,
                           num_layers=num_layers,
                           heads=heads,
                           forward_expansion=forward_expansion,
                           decoder_dropout=decoder_dropout,
                           classification_dropout=classification_dropout,
                           pre_norm_on=pre_norm_on,
                           mlp_scale_classification=mlp_scale_classification,
                           targets_classes=target_classes,
                           ).to(device_in_use)

    # Define loss function and optimizer
    loss_function = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Initialize early stopping
    early_stopping = EarlyStopping(patience=patience)

    # Train for up to `num_epochs` epochs, evaluating on the validation split
    # after each one.
    for epoch in range(num_epochs):
        train_loss, train_acc = train(regression_on=False,
                                      get_attn=False,
                                      dataloader=train_dataloader,
                                      model=model,
                                      loss_function=loss_function,
                                      optimizer=optimizer,
                                      device_in_use=device_in_use)
        val_loss, val_acc = test(regression_on=False,
                                 get_attn=False,
                                 dataloader=val_dataloader,
                                 model=model,
                                 loss_function=loss_function,
                                 device_in_use=device_in_use)
        # Stop as soon as the early-stopping helper, fed the validation
        # accuracy, says so.
        if early_stopping(val_acc):
            break

    # Return the validation accuracy of the last epoch run as the value
    # Optuna optimizes.
    return val_acc

In [9]:
# How many hyperparameter configurations the search will evaluate.
num_trials = 75

# Build an in-memory study whose goal is the highest validation accuracy,
# then run the search with a progress bar.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=num_trials, show_progress_bar=True)

# Pull out the winning configuration and its score (the validation accuracy
# recorded when that trial stopped training).
best_params, best_val_accuracy = study.best_params, study.best_value

print("Best Hyperparameters:", best_params)
print("Best Validation Accuracy (at Early Stopping):", best_val_accuracy)

[I 2024-01-02 17:30:53,240] A new study created in memory with name: no-name-b09bb0b6-d55c-44f5-b4d6-561f5664bfde
Best trial: 0. Best value: 0.726729:   1%|▏         | 1/75 [08:32<10:32:32, 512.88s/it]

[I 2024-01-02 17:39:26,116] Trial 0 finished with value: 0.7267287686135854 and parameters: {'sigma': 0.5, 'num_layers': 2, 'embed_size': 70, 'heads': 10, 'forward_expansion': 2, 'prenorm_on': True, 'mlp_scale_classification': 8, 'decoder_dropout': 0.5, 'class_drop': 0.1, 'learning_rate': 0.0001}. Best is trial 0 with value: 0.7267287686135854.


Best trial: 0. Best value: 0.726729:   3%|▎         | 2/75 [16:15<9:47:55, 483.23s/it] 

[I 2024-01-02 17:47:08,598] Trial 1 finished with value: 0.7027265927789488 and parameters: {'sigma': 0.01, 'num_layers': 2, 'embed_size': 60, 'heads': 1, 'forward_expansion': 4, 'prenorm_on': False, 'mlp_scale_classification': 8, 'decoder_dropout': 0.1, 'class_drop': 0.1, 'learning_rate': 0.001}. Best is trial 0 with value: 0.7267287686135854.


Best trial: 0. Best value: 0.726729:   4%|▍         | 3/75 [34:46<15:23:43, 769.77s/it]

[I 2024-01-02 18:05:39,339] Trial 2 finished with value: 0.6768205616373155 and parameters: {'sigma': 2, 'num_layers': 1, 'embed_size': 500, 'heads': 5, 'forward_expansion': 1, 'prenorm_on': False, 'mlp_scale_classification': 1, 'decoder_dropout': 0.2, 'class_drop': 0.5, 'learning_rate': 0.001}. Best is trial 0 with value: 0.7267287686135854.


Best trial: 0. Best value: 0.726729:   5%|▌         | 4/75 [45:45<14:19:17, 726.16s/it]

[I 2024-01-02 18:16:38,647] Trial 3 finished with value: 0.6041340858094785 and parameters: {'sigma': 1, 'num_layers': 2, 'embed_size': 180, 'heads': 10, 'forward_expansion': 8, 'prenorm_on': False, 'mlp_scale_classification': 4, 'decoder_dropout': 0, 'class_drop': 0.2, 'learning_rate': 0.01}. Best is trial 0 with value: 0.7267287686135854.


Best trial: 0. Best value: 0.726729:   7%|▋         | 5/75 [55:43<13:13:17, 679.97s/it]

[I 2024-01-02 18:26:36,718] Trial 4 finished with value: 0.6673012851023322 and parameters: {'sigma': 3, 'num_layers': 2, 'embed_size': 160, 'heads': 10, 'forward_expansion': 5, 'prenorm_on': False, 'mlp_scale_classification': 8, 'decoder_dropout': 0, 'class_drop': 0.5, 'learning_rate': 0.0001}. Best is trial 0 with value: 0.7267287686135854.


Best trial: 0. Best value: 0.726729:   8%|▊         | 6/75 [1:16:32<16:44:19, 873.33s/it]

[I 2024-01-02 18:47:25,397] Trial 5 finished with value: 0.5330794859590671 and parameters: {'sigma': 5, 'num_layers': 2, 'embed_size': 500, 'heads': 10, 'forward_expansion': 1, 'prenorm_on': False, 'mlp_scale_classification': 8, 'decoder_dropout': 0, 'class_drop': 0.1, 'learning_rate': 0.1}. Best is trial 0 with value: 0.7267287686135854.


Best trial: 0. Best value: 0.726729:   9%|▉         | 7/75 [1:25:11<14:18:35, 757.58s/it]

[I 2024-01-02 18:56:04,679] Trial 6 finished with value: 0.7224450941728429 and parameters: {'sigma': 2, 'num_layers': 1, 'embed_size': 140, 'heads': 1, 'forward_expansion': 6, 'prenorm_on': False, 'mlp_scale_classification': 4, 'decoder_dropout': 0.1, 'class_drop': 0.5, 'learning_rate': 1e-05}. Best is trial 0 with value: 0.7267287686135854.


Best trial: 0. Best value: 0.726729:  11%|█         | 8/75 [1:38:09<14:13:23, 764.23s/it]

[I 2024-01-02 19:09:03,135] Trial 7 finished with value: 0.6541782824505338 and parameters: {'sigma': 1, 'num_layers': 2, 'embed_size': 250, 'heads': 5, 'forward_expansion': 1, 'prenorm_on': False, 'mlp_scale_classification': 8, 'decoder_dropout': 0.5, 'class_drop': 0.1, 'learning_rate': 0.01}. Best is trial 0 with value: 0.7267287686135854.


Best trial: 0. Best value: 0.726729:  12%|█▏        | 9/75 [1:47:47<12:56:25, 705.84s/it]

[I 2024-01-02 19:18:40,585] Trial 8 finished with value: 0.6648534711361936 and parameters: {'sigma': 1, 'num_layers': 1, 'embed_size': 120, 'heads': 1, 'forward_expansion': 4, 'prenorm_on': True, 'mlp_scale_classification': 7, 'decoder_dropout': 0, 'class_drop': 0, 'learning_rate': 0.0001}. Best is trial 0 with value: 0.7267287686135854.


Best trial: 0. Best value: 0.726729:  13%|█▎        | 10/75 [1:56:03<11:34:24, 641.00s/it]

[I 2024-01-02 19:26:56,389] Trial 9 finished with value: 0.4669205140409329 and parameters: {'sigma': 0.5, 'num_layers': 1, 'embed_size': 60, 'heads': 1, 'forward_expansion': 6, 'prenorm_on': False, 'mlp_scale_classification': 1, 'decoder_dropout': 0.1, 'class_drop': 0.1, 'learning_rate': 0.1}. Best is trial 0 with value: 0.7267287686135854.


Best trial: 0. Best value: 0.726729:  15%|█▍        | 11/75 [2:05:36<11:01:45, 620.40s/it]

[I 2024-01-02 19:36:30,072] Trial 10 finished with value: 0.7171414972462092 and parameters: {'sigma': 1.5, 'num_layers': 2, 'embed_size': 70, 'heads': 10, 'forward_expansion': 3, 'prenorm_on': True, 'mlp_scale_classification': 6, 'decoder_dropout': 0.5, 'class_drop': 0, 'learning_rate': 0.0001}. Best is trial 0 with value: 0.7267287686135854.


Best trial: 0. Best value: 0.726729:  16%|█▌        | 12/75 [2:15:24<10:40:58, 610.45s/it]

[I 2024-01-02 19:46:17,787] Trial 11 finished with value: 0.7082341742027606 and parameters: {'sigma': 4, 'num_layers': 1, 'embed_size': 140, 'heads': 1, 'forward_expansion': 7, 'prenorm_on': True, 'mlp_scale_classification': 4, 'decoder_dropout': 0.1, 'class_drop': 0.5, 'learning_rate': 1e-05}. Best is trial 0 with value: 0.7267287686135854.


Best trial: 0. Best value: 0.726729:  17%|█▋        | 13/75 [2:24:37<10:12:55, 593.15s/it]

[I 2024-01-02 19:55:31,128] Trial 12 finished with value: 0.6360236622016727 and parameters: {'sigma': 0.001, 'num_layers': 1, 'embed_size': 90, 'heads': 10, 'forward_expansion': 3, 'prenorm_on': True, 'mlp_scale_classification': 3, 'decoder_dropout': 0.5, 'class_drop': 0.2, 'learning_rate': 1e-05}. Best is trial 0 with value: 0.7267287686135854.


Best trial: 13. Best value: 0.729109:  19%|█▊        | 14/75 [2:41:13<12:06:39, 714.74s/it]

[I 2024-01-02 20:12:06,836] Trial 13 finished with value: 0.7291085877473312 and parameters: {'sigma': 0.5, 'num_layers': 1, 'embed_size': 350, 'heads': 1, 'forward_expansion': 6, 'prenorm_on': True, 'mlp_scale_classification': 6, 'decoder_dropout': 0.2, 'class_drop': 0.5, 'learning_rate': 1e-05}. Best is trial 13 with value: 0.7291085877473312.


Best trial: 13. Best value: 0.729109:  20%|██        | 15/75 [2:58:12<13:26:26, 806.44s/it]

[I 2024-01-02 20:29:05,793] Trial 14 finished with value: 0.7280886652614401 and parameters: {'sigma': 0.5, 'num_layers': 1, 'embed_size': 350, 'heads': 5, 'forward_expansion': 3, 'prenorm_on': True, 'mlp_scale_classification': 6, 'decoder_dropout': 0.2, 'class_drop': 0.1, 'learning_rate': 1e-05}. Best is trial 13 with value: 0.7291085877473312.


Best trial: 13. Best value: 0.729109:  21%|██▏       | 16/75 [3:15:07<14:14:39, 869.14s/it]

[I 2024-01-02 20:46:00,530] Trial 15 finished with value: 0.7240769701502686 and parameters: {'sigma': 0.5, 'num_layers': 1, 'embed_size': 350, 'heads': 5, 'forward_expansion': 5, 'prenorm_on': True, 'mlp_scale_classification': 6, 'decoder_dropout': 0.2, 'class_drop': 0.5, 'learning_rate': 1e-05}. Best is trial 13 with value: 0.7291085877473312.


Best trial: 13. Best value: 0.729109:  23%|██▎       | 17/75 [3:32:08<14:44:18, 914.80s/it]

[I 2024-01-02 21:03:01,515] Trial 16 finished with value: 0.7277486910994765 and parameters: {'sigma': 0.1, 'num_layers': 1, 'embed_size': 350, 'heads': 5, 'forward_expansion': 8, 'prenorm_on': True, 'mlp_scale_classification': 6, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 1e-05}. Best is trial 13 with value: 0.7291085877473312.


Best trial: 13. Best value: 0.729109:  24%|██▍       | 18/75 [3:40:12<12:26:13, 785.50s/it]

[I 2024-01-02 21:11:06,023] Trial 17 finished with value: 0.7157136057659618 and parameters: {'sigma': 0.5, 'num_layers': 1, 'embed_size': 50, 'heads': 5, 'forward_expansion': 6, 'prenorm_on': True, 'mlp_scale_classification': 5, 'decoder_dropout': 0.2, 'class_drop': 0, 'learning_rate': 1e-05}. Best is trial 13 with value: 0.7291085877473312.


Best trial: 18. Best value: 0.736452:  25%|██▌       | 19/75 [3:51:09<11:36:59, 746.78s/it]

[I 2024-01-02 21:22:02,600] Trial 18 finished with value: 0.736452029645747 and parameters: {'sigma': 0.5, 'num_layers': 1, 'embed_size': 200, 'heads': 5, 'forward_expansion': 3, 'prenorm_on': True, 'mlp_scale_classification': 5, 'decoder_dropout': 0.2, 'class_drop': 0.1, 'learning_rate': 1e-05}. Best is trial 18 with value: 0.736452029645747.


Best trial: 18. Best value: 0.736452:  27%|██▋       | 20/75 [4:01:20<10:47:04, 705.91s/it]

[I 2024-01-02 21:32:13,247] Trial 19 finished with value: 0.7049024274155165 and parameters: {'sigma': 4, 'num_layers': 1, 'embed_size': 200, 'heads': 1, 'forward_expansion': 7, 'prenorm_on': True, 'mlp_scale_classification': 3, 'decoder_dropout': 0.2, 'class_drop': 0.5, 'learning_rate': 1e-05}. Best is trial 18 with value: 0.736452029645747.


Best trial: 18. Best value: 0.736452:  28%|██▊       | 21/75 [4:12:06<10:19:21, 688.17s/it]

[I 2024-01-02 21:43:00,056] Trial 20 finished with value: 0.6585979465560617 and parameters: {'sigma': 5, 'num_layers': 1, 'embed_size': 200, 'heads': 5, 'forward_expansion': 2, 'prenorm_on': True, 'mlp_scale_classification': 5, 'decoder_dropout': 0.2, 'class_drop': 0.1, 'learning_rate': 0.001}. Best is trial 18 with value: 0.736452029645747.


Best trial: 18. Best value: 0.736452:  29%|██▉       | 22/75 [4:28:46<11:30:24, 781.60s/it]

[I 2024-01-02 21:59:39,536] Trial 21 finished with value: 0.7272047324403346 and parameters: {'sigma': 0.5, 'num_layers': 1, 'embed_size': 350, 'heads': 5, 'forward_expansion': 3, 'prenorm_on': True, 'mlp_scale_classification': 6, 'decoder_dropout': 0.2, 'class_drop': 0.1, 'learning_rate': 1e-05}. Best is trial 18 with value: 0.736452029645747.


Best trial: 18. Best value: 0.736452:  31%|███       | 23/75 [4:38:15<10:22:11, 717.91s/it]

[I 2024-01-02 22:09:08,889] Trial 22 finished with value: 0.7293805670769021 and parameters: {'sigma': 0.5, 'num_layers': 1, 'embed_size': 100, 'heads': 5, 'forward_expansion': 4, 'prenorm_on': True, 'mlp_scale_classification': 7, 'decoder_dropout': 0.2, 'class_drop': 0.1, 'learning_rate': 1e-05}. Best is trial 18 with value: 0.736452029645747.


Best trial: 18. Best value: 0.736452:  32%|███▏      | 24/75 [4:47:51<9:34:05, 675.40s/it] 

[I 2024-01-02 22:18:45,116] Trial 23 finished with value: 0.730468484395186 and parameters: {'sigma': 0.5, 'num_layers': 1, 'embed_size': 100, 'heads': 5, 'forward_expansion': 4, 'prenorm_on': True, 'mlp_scale_classification': 7, 'decoder_dropout': 0.2, 'class_drop': 0.1, 'learning_rate': 1e-05}. Best is trial 18 with value: 0.736452029645747.


Best trial: 18. Best value: 0.736452:  33%|███▎      | 25/75 [4:57:38<9:00:36, 648.72s/it]

[I 2024-01-02 22:28:31,623] Trial 24 finished with value: 0.7114299313252193 and parameters: {'sigma': 0.1, 'num_layers': 1, 'embed_size': 100, 'heads': 5, 'forward_expansion': 4, 'prenorm_on': True, 'mlp_scale_classification': 7, 'decoder_dropout': 0.2, 'class_drop': 0.1, 'learning_rate': 0.01}. Best is trial 18 with value: 0.736452029645747.


Best trial: 18. Best value: 0.736452:  35%|███▍      | 26/75 [5:06:59<8:28:26, 622.58s/it]

[I 2024-01-02 22:37:53,219] Trial 25 finished with value: 0.727068742775549 and parameters: {'sigma': 1.5, 'num_layers': 1, 'embed_size': 100, 'heads': 5, 'forward_expansion': 2, 'prenorm_on': True, 'mlp_scale_classification': 7, 'decoder_dropout': 0.2, 'class_drop': 0.1, 'learning_rate': 1e-05}. Best is trial 18 with value: 0.736452029645747.


Best trial: 18. Best value: 0.736452:  36%|███▌      | 27/75 [5:16:21<8:03:22, 604.22s/it]

[I 2024-01-02 22:47:14,610] Trial 26 finished with value: 0.4669205140409329 and parameters: {'sigma': 3, 'num_layers': 1, 'embed_size': 100, 'heads': 5, 'forward_expansion': 5, 'prenorm_on': True, 'mlp_scale_classification': 7, 'decoder_dropout': 0.2, 'class_drop': 0.1, 'learning_rate': 0.1}. Best is trial 18 with value: 0.736452029645747.


Best trial: 18. Best value: 0.736452:  37%|███▋      | 28/75 [5:23:50<7:16:51, 557.68s/it]

[I 2024-01-02 22:54:43,698] Trial 27 finished with value: 0.7144896987828925 and parameters: {'sigma': 0.01, 'num_layers': 1, 'embed_size': 80, 'heads': 5, 'forward_expansion': 4, 'prenorm_on': True, 'mlp_scale_classification': 5, 'decoder_dropout': 0.2, 'class_drop': 0.1, 'learning_rate': 1e-05}. Best is trial 18 with value: 0.736452029645747.


Best trial: 18. Best value: 0.736452:  39%|███▊      | 29/75 [5:32:28<6:58:33, 545.94s/it]

[I 2024-01-02 23:03:22,231] Trial 28 finished with value: 0.729924525736044 and parameters: {'sigma': 0.5, 'num_layers': 1, 'embed_size': 100, 'heads': 5, 'forward_expansion': 3, 'prenorm_on': True, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0.1, 'learning_rate': 1e-05}. Best is trial 18 with value: 0.736452029645747.


Best trial: 18. Best value: 0.736452:  40%|████      | 30/75 [5:42:34<7:02:50, 563.80s/it]

[I 2024-01-02 23:13:27,702] Trial 29 finished with value: 0.678520432447134 and parameters: {'sigma': 0.001, 'num_layers': 1, 'embed_size': 200, 'heads': 5, 'forward_expansion': 2, 'prenorm_on': True, 'mlp_scale_classification': 2, 'decoder_dropout': 0.5, 'class_drop': 0.1, 'learning_rate': 1e-05}. Best is trial 18 with value: 0.736452029645747.


Best trial: 18. Best value: 0.736452:  41%|████▏     | 31/75 [5:51:13<6:43:30, 550.24s/it]

[I 2024-01-02 23:22:06,320] Trial 30 finished with value: 0.7161215747603182 and parameters: {'sigma': 0.5, 'num_layers': 1, 'embed_size': 100, 'heads': 5, 'forward_expansion': 3, 'prenorm_on': True, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0.1, 'learning_rate': 0.01}. Best is trial 18 with value: 0.736452029645747.


Best trial: 18. Best value: 0.736452:  43%|████▎     | 32/75 [6:00:01<6:29:42, 543.78s/it]

[I 2024-01-02 23:30:55,030] Trial 31 finished with value: 0.7309444482219352 and parameters: {'sigma': 0.5, 'num_layers': 1, 'embed_size': 100, 'heads': 5, 'forward_expansion': 4, 'prenorm_on': True, 'mlp_scale_classification': 3, 'decoder_dropout': 0.2, 'class_drop': 0.1, 'learning_rate': 1e-05}. Best is trial 18 with value: 0.736452029645747.


Best trial: 18. Best value: 0.736452:  44%|████▍     | 33/75 [6:08:41<6:15:36, 536.58s/it]

[I 2024-01-02 23:39:34,811] Trial 32 finished with value: 0.7294485619092949 and parameters: {'sigma': 0.5, 'num_layers': 1, 'embed_size': 100, 'heads': 5, 'forward_expansion': 3, 'prenorm_on': True, 'mlp_scale_classification': 3, 'decoder_dropout': 0.2, 'class_drop': 0.1, 'learning_rate': 1e-05}. Best is trial 18 with value: 0.736452029645747.


Best trial: 18. Best value: 0.736452:  45%|████▌     | 34/75 [6:17:31<6:05:17, 534.58s/it]

[I 2024-01-02 23:48:24,717] Trial 33 finished with value: 0.7002107839804175 and parameters: {'sigma': 0.5, 'num_layers': 1, 'embed_size': 100, 'heads': 5, 'forward_expansion': 5, 'prenorm_on': True, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0.1, 'learning_rate': 0.001}. Best is trial 18 with value: 0.736452029645747.


Best trial: 18. Best value: 0.736452:  47%|████▋     | 35/75 [6:26:47<6:00:35, 540.88s/it]

[I 2024-01-02 23:57:40,295] Trial 34 finished with value: 0.7305364792275787 and parameters: {'sigma': 0.01, 'num_layers': 1, 'embed_size': 180, 'heads': 5, 'forward_expansion': 4, 'prenorm_on': True, 'mlp_scale_classification': 1, 'decoder_dropout': 0.2, 'class_drop': 0.1, 'learning_rate': 1e-05}. Best is trial 18 with value: 0.736452029645747.


Best trial: 18. Best value: 0.736452:  48%|████▊     | 36/75 [6:37:25<6:10:36, 570.17s/it]

[I 2024-01-03 00:08:18,817] Trial 35 finished with value: 0.6977629700142789 and parameters: {'sigma': 0.01, 'num_layers': 2, 'embed_size': 180, 'heads': 5, 'forward_expansion': 4, 'prenorm_on': True, 'mlp_scale_classification': 1, 'decoder_dropout': 0.1, 'class_drop': 0.2, 'learning_rate': 0.0001}. Best is trial 18 with value: 0.736452029645747.


Best trial: 18. Best value: 0.736452:  49%|████▉     | 37/75 [6:46:38<5:57:52, 565.07s/it]

[I 2024-01-03 00:17:31,996] Trial 36 finished with value: 0.6744407425035698 and parameters: {'sigma': 0.01, 'num_layers': 1, 'embed_size': 180, 'heads': 5, 'forward_expansion': 4, 'prenorm_on': True, 'mlp_scale_classification': 3, 'decoder_dropout': 0, 'class_drop': 0.1, 'learning_rate': 0.001}. Best is trial 18 with value: 0.736452029645747.


Best trial: 18. Best value: 0.736452:  51%|█████     | 38/75 [6:55:31<5:42:30, 555.41s/it]

[I 2024-01-03 00:26:24,858] Trial 37 finished with value: 0.5330794859590671 and parameters: {'sigma': 2, 'num_layers': 1, 'embed_size': 160, 'heads': 10, 'forward_expansion': 5, 'prenorm_on': False, 'mlp_scale_classification': 4, 'decoder_dropout': 0.2, 'class_drop': 0, 'learning_rate': 0.1}. Best is trial 18 with value: 0.736452029645747.


Best trial: 18. Best value: 0.736452:  52%|█████▏    | 39/75 [7:06:21<5:50:13, 583.71s/it]

[I 2024-01-03 00:37:14,608] Trial 38 finished with value: 0.7295165567416876 and parameters: {'sigma': 0.01, 'num_layers': 2, 'embed_size': 180, 'heads': 5, 'forward_expansion': 4, 'prenorm_on': True, 'mlp_scale_classification': 1, 'decoder_dropout': 0.2, 'class_drop': 0.1, 'learning_rate': 1e-05}. Best is trial 18 with value: 0.736452029645747.


Best trial: 18. Best value: 0.736452:  53%|█████▎    | 40/75 [7:17:05<5:51:05, 601.86s/it]

[I 2024-01-03 00:47:58,822] Trial 39 finished with value: 0.6934113007411437 and parameters: {'sigma': 3, 'num_layers': 1, 'embed_size': 250, 'heads': 5, 'forward_expansion': 2, 'prenorm_on': False, 'mlp_scale_classification': 5, 'decoder_dropout': 0, 'class_drop': 0.1, 'learning_rate': 1e-05}. Best is trial 18 with value: 0.736452029645747.


Best trial: 18. Best value: 0.736452:  55%|█████▍    | 41/75 [7:41:51<8:11:17, 866.99s/it]

[I 2024-01-03 01:12:44,435] Trial 40 finished with value: 0.5330794859590671 and parameters: {'sigma': 0.01, 'num_layers': 2, 'embed_size': 500, 'heads': 10, 'forward_expansion': 5, 'prenorm_on': True, 'mlp_scale_classification': 4, 'decoder_dropout': 0.1, 'class_drop': 0.2, 'learning_rate': 0.01}. Best is trial 18 with value: 0.736452029645747.


Best trial: 18. Best value: 0.736452:  56%|█████▌    | 42/75 [7:50:39<7:00:55, 765.31s/it]

[I 2024-01-03 01:21:32,482] Trial 41 finished with value: 0.7331202828585027 and parameters: {'sigma': 0.5, 'num_layers': 1, 'embed_size': 120, 'heads': 5, 'forward_expansion': 3, 'prenorm_on': True, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0.1, 'learning_rate': 1e-05}. Best is trial 18 with value: 0.736452029645747.


Best trial: 18. Best value: 0.736452:  57%|█████▋    | 43/75 [7:59:28<6:10:21, 694.42s/it]

[I 2024-01-03 01:30:21,502] Trial 42 finished with value: 0.7085061535323316 and parameters: {'sigma': 5, 'num_layers': 1, 'embed_size': 120, 'heads': 5, 'forward_expansion': 3, 'prenorm_on': True, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0.1, 'learning_rate': 1e-05}. Best is trial 18 with value: 0.736452029645747.


Best trial: 18. Best value: 0.736452:  59%|█████▊    | 44/75 [8:08:18<5:33:21, 645.22s/it]

[I 2024-01-03 01:39:11,931] Trial 43 finished with value: 0.7293805670769021 and parameters: {'sigma': 0.5, 'num_layers': 1, 'embed_size': 120, 'heads': 5, 'forward_expansion': 4, 'prenorm_on': True, 'mlp_scale_classification': 1, 'decoder_dropout': 0.2, 'class_drop': 0.1, 'learning_rate': 1e-05}. Best is trial 18 with value: 0.736452029645747.


Best trial: 18. Best value: 0.736452:  60%|██████    | 45/75 [8:15:38<4:51:50, 583.70s/it]

[I 2024-01-03 01:46:32,065] Trial 44 finished with value: 0.7066702930577277 and parameters: {'sigma': 1, 'num_layers': 1, 'embed_size': 60, 'heads': 5, 'forward_expansion': 3, 'prenorm_on': False, 'mlp_scale_classification': 1, 'decoder_dropout': 0.5, 'class_drop': 0.1, 'learning_rate': 1e-05}. Best is trial 18 with value: 0.736452029645747.


Best trial: 18. Best value: 0.736452:  61%|██████▏   | 46/75 [8:23:15<4:23:42, 545.60s/it]

[I 2024-01-03 01:54:08,764] Trial 45 finished with value: 0.7199292853743116 and parameters: {'sigma': 1.5, 'num_layers': 1, 'embed_size': 70, 'heads': 5, 'forward_expansion': 1, 'prenorm_on': True, 'mlp_scale_classification': 3, 'decoder_dropout': 0.2, 'class_drop': 0.1, 'learning_rate': 0.0001}. Best is trial 18 with value: 0.736452029645747.


Best trial: 18. Best value: 0.736452:  63%|██████▎   | 47/75 [8:32:02<4:12:03, 540.14s/it]

[I 2024-01-03 02:02:56,180] Trial 46 finished with value: 0.5330794859590671 and parameters: {'sigma': 2, 'num_layers': 1, 'embed_size': 120, 'heads': 5, 'forward_expansion': 4, 'prenorm_on': True, 'mlp_scale_classification': 2, 'decoder_dropout': 0, 'class_drop': 0, 'learning_rate': 0.1}. Best is trial 18 with value: 0.736452029645747.


Best trial: 18. Best value: 0.736452:  64%|██████▍   | 48/75 [8:39:35<3:51:14, 513.88s/it]

[I 2024-01-03 02:10:28,786] Trial 47 finished with value: 0.7094580811858299 and parameters: {'sigma': 4, 'num_layers': 1, 'embed_size': 90, 'heads': 1, 'forward_expansion': 2, 'prenorm_on': False, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0.1, 'learning_rate': 1e-05}. Best is trial 18 with value: 0.736452029645747.


Best trial: 18. Best value: 0.736452:  65%|██████▌   | 49/75 [8:46:58<3:33:27, 492.60s/it]

[I 2024-01-03 02:17:51,715] Trial 48 finished with value: 0.7154416264363909 and parameters: {'sigma': 0.5, 'num_layers': 1, 'embed_size': 50, 'heads': 10, 'forward_expansion': 3, 'prenorm_on': True, 'mlp_scale_classification': 8, 'decoder_dropout': 0.1, 'class_drop': 0.1, 'learning_rate': 1e-05}. Best is trial 18 with value: 0.736452029645747.


Best trial: 18. Best value: 0.736452:  67%|██████▋   | 50/75 [8:56:03<3:31:46, 508.27s/it]

[I 2024-01-03 02:26:56,552] Trial 49 finished with value: 0.6955871353777113 and parameters: {'sigma': 0.5, 'num_layers': 1, 'embed_size': 140, 'heads': 5, 'forward_expansion': 4, 'prenorm_on': True, 'mlp_scale_classification': 1, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 0.001}. Best is trial 18 with value: 0.736452029645747.


Best trial: 18. Best value: 0.736452:  68%|██████▊   | 51/75 [9:03:31<3:16:08, 490.35s/it]

[I 2024-01-03 02:34:25,096] Trial 50 finished with value: 0.634391786224247 and parameters: {'sigma': 0.001, 'num_layers': 1, 'embed_size': 80, 'heads': 5, 'forward_expansion': 5, 'prenorm_on': True, 'mlp_scale_classification': 3, 'decoder_dropout': 0.5, 'class_drop': 0.5, 'learning_rate': 1e-05}. Best is trial 18 with value: 0.736452029645747.


Best trial: 18. Best value: 0.736452:  69%|██████▉   | 52/75 [9:13:46<3:22:17, 527.71s/it]

[I 2024-01-03 02:44:39,974] Trial 51 finished with value: 0.7335962466852519 and parameters: {'sigma': 0.5, 'num_layers': 1, 'embed_size': 200, 'heads': 5, 'forward_expansion': 3, 'prenorm_on': True, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0.1, 'learning_rate': 1e-05}. Best is trial 18 with value: 0.736452029645747.


Best trial: 52. Best value: 0.73652:  71%|███████   | 53/75 [9:24:02<3:23:08, 554.01s/it] 

[I 2024-01-03 02:54:55,343] Trial 52 finished with value: 0.7365200244781397 and parameters: {'sigma': 0.5, 'num_layers': 1, 'embed_size': 200, 'heads': 5, 'forward_expansion': 3, 'prenorm_on': True, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0.1, 'learning_rate': 1e-05}. Best is trial 52 with value: 0.7365200244781397.


Best trial: 52. Best value: 0.73652:  72%|███████▏  | 54/75 [9:34:16<3:20:13, 572.08s/it]

[I 2024-01-03 03:05:09,588] Trial 53 finished with value: 0.7334602570204665 and parameters: {'sigma': 0.5, 'num_layers': 1, 'embed_size': 200, 'heads': 5, 'forward_expansion': 3, 'prenorm_on': True, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0.1, 'learning_rate': 1e-05}. Best is trial 52 with value: 0.7365200244781397.


Best trial: 52. Best value: 0.73652:  73%|███████▎  | 55/75 [9:44:18<3:13:42, 581.13s/it]

[I 2024-01-03 03:15:11,828] Trial 54 finished with value: 0.7309444482219352 and parameters: {'sigma': 0.5, 'num_layers': 1, 'embed_size': 200, 'heads': 1, 'forward_expansion': 2, 'prenorm_on': True, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0.1, 'learning_rate': 1e-05}. Best is trial 52 with value: 0.7365200244781397.


Best trial: 52. Best value: 0.73652:  75%|███████▍  | 56/75 [9:54:34<3:07:18, 591.49s/it]

[I 2024-01-03 03:25:27,492] Trial 55 finished with value: 0.6932073162439655 and parameters: {'sigma': 0.5, 'num_layers': 1, 'embed_size': 200, 'heads': 5, 'forward_expansion': 3, 'prenorm_on': True, 'mlp_scale_classification': 3, 'decoder_dropout': 0.2, 'class_drop': 0.1, 'learning_rate': 0.0001}. Best is trial 52 with value: 0.7365200244781397.


Best trial: 52. Best value: 0.73652:  76%|███████▌  | 57/75 [10:04:48<2:59:27, 598.18s/it]

[I 2024-01-03 03:35:41,277] Trial 56 finished with value: 0.7312844223838988 and parameters: {'sigma': 0.5, 'num_layers': 1, 'embed_size': 200, 'heads': 5, 'forward_expansion': 3, 'prenorm_on': True, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0, 'learning_rate': 1e-05}. Best is trial 52 with value: 0.7365200244781397.


Best trial: 52. Best value: 0.73652:  77%|███████▋  | 58/75 [10:15:00<2:50:41, 602.45s/it]

[I 2024-01-03 03:45:53,699] Trial 57 finished with value: 0.7335962466852519 and parameters: {'sigma': 0.1, 'num_layers': 1, 'embed_size': 200, 'heads': 5, 'forward_expansion': 1, 'prenorm_on': True, 'mlp_scale_classification': 4, 'decoder_dropout': 0.2, 'class_drop': 0, 'learning_rate': 1e-05}. Best is trial 52 with value: 0.7365200244781397.


Best trial: 52. Best value: 0.73652:  79%|███████▊  | 59/75 [10:23:52<2:35:03, 581.45s/it]

[I 2024-01-03 03:54:46,142] Trial 58 finished with value: 0.7223091045080574 and parameters: {'sigma': 0.1, 'num_layers': 1, 'embed_size': 200, 'heads': 10, 'forward_expansion': 1, 'prenorm_on': False, 'mlp_scale_classification': 4, 'decoder_dropout': 0.2, 'class_drop': 0, 'learning_rate': 1e-05}. Best is trial 52 with value: 0.7365200244781397.


Best trial: 52. Best value: 0.73652:  80%|████████  | 60/75 [10:34:03<2:27:32, 590.17s/it]

[I 2024-01-03 04:04:56,674] Trial 59 finished with value: 0.6432311144353029 and parameters: {'sigma': 0.1, 'num_layers': 1, 'embed_size': 200, 'heads': 5, 'forward_expansion': 1, 'prenorm_on': True, 'mlp_scale_classification': 5, 'decoder_dropout': 0, 'class_drop': 0, 'learning_rate': 0.01}. Best is trial 52 with value: 0.7365200244781397.


Best trial: 52. Best value: 0.73652:  81%|████████▏ | 61/75 [10:44:05<2:18:32, 593.72s/it]

[I 2024-01-03 04:14:58,671] Trial 60 finished with value: 0.7278166859318692 and parameters: {'sigma': 0.1, 'num_layers': 1, 'embed_size': 200, 'heads': 1, 'forward_expansion': 2, 'prenorm_on': True, 'mlp_scale_classification': 4, 'decoder_dropout': 0.2, 'class_drop': 0, 'learning_rate': 1e-05}. Best is trial 52 with value: 0.7365200244781397.


Best trial: 52. Best value: 0.73652:  83%|████████▎ | 62/75 [10:54:19<2:09:56, 599.74s/it]

[I 2024-01-03 04:25:12,443] Trial 61 finished with value: 0.7355001019922486 and parameters: {'sigma': 0.5, 'num_layers': 1, 'embed_size': 200, 'heads': 5, 'forward_expansion': 3, 'prenorm_on': True, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0, 'learning_rate': 1e-05}. Best is trial 52 with value: 0.7365200244781397.


Best trial: 52. Best value: 0.73652:  84%|████████▍ | 63/75 [11:04:22<2:00:09, 600.76s/it]

[I 2024-01-03 04:35:15,595] Trial 62 finished with value: 0.7291085877473312 and parameters: {'sigma': 1, 'num_layers': 1, 'embed_size': 200, 'heads': 5, 'forward_expansion': 2, 'prenorm_on': True, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0, 'learning_rate': 1e-05}. Best is trial 52 with value: 0.7365200244781397.


Best trial: 52. Best value: 0.73652:  85%|████████▌ | 64/75 [11:14:37<1:50:54, 604.95s/it]

[I 2024-01-03 04:45:30,329] Trial 63 finished with value: 0.736180050316176 and parameters: {'sigma': 0.5, 'num_layers': 1, 'embed_size': 200, 'heads': 5, 'forward_expansion': 3, 'prenorm_on': True, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0, 'learning_rate': 1e-05}. Best is trial 52 with value: 0.7365200244781397.


Best trial: 52. Best value: 0.73652:  87%|████████▋ | 65/75 [11:24:51<1:41:19, 607.94s/it]

[I 2024-01-03 04:55:45,236] Trial 64 finished with value: 0.7354321071598559 and parameters: {'sigma': 0.5, 'num_layers': 1, 'embed_size': 200, 'heads': 5, 'forward_expansion': 3, 'prenorm_on': True, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0, 'learning_rate': 1e-05}. Best is trial 52 with value: 0.7365200244781397.


Best trial: 52. Best value: 0.73652:  88%|████████▊ | 66/75 [11:35:04<1:31:23, 609.33s/it]

[I 2024-01-03 05:05:57,800] Trial 65 finished with value: 0.6985789080029918 and parameters: {'sigma': 5, 'num_layers': 1, 'embed_size': 200, 'heads': 5, 'forward_expansion': 1, 'prenorm_on': True, 'mlp_scale_classification': 3, 'decoder_dropout': 0.2, 'class_drop': 0, 'learning_rate': 1e-05}. Best is trial 52 with value: 0.7365200244781397.


Best trial: 52. Best value: 0.73652:  89%|████████▉ | 67/75 [11:45:11<1:21:08, 608.53s/it]

[I 2024-01-03 05:16:04,486] Trial 66 finished with value: 0.5330794859590671 and parameters: {'sigma': 0.1, 'num_layers': 1, 'embed_size': 200, 'heads': 5, 'forward_expansion': 2, 'prenorm_on': True, 'mlp_scale_classification': 5, 'decoder_dropout': 0.2, 'class_drop': 0, 'learning_rate': 0.1}. Best is trial 52 with value: 0.7365200244781397.


Best trial: 52. Best value: 0.73652:  91%|█████████ | 68/75 [11:55:26<1:11:14, 610.60s/it]

[I 2024-01-03 05:26:19,915] Trial 67 finished with value: 0.7275447066022982 and parameters: {'sigma': 0.5, 'num_layers': 1, 'embed_size': 200, 'heads': 5, 'forward_expansion': 3, 'prenorm_on': True, 'mlp_scale_classification': 1, 'decoder_dropout': 0.5, 'class_drop': 0, 'learning_rate': 1e-05}. Best is trial 52 with value: 0.7365200244781397.


Best trial: 52. Best value: 0.73652:  92%|█████████▏| 69/75 [12:05:32<1:00:54, 609.06s/it]

[I 2024-01-03 05:36:25,383] Trial 68 finished with value: 0.7036105256000544 and parameters: {'sigma': 4, 'num_layers': 1, 'embed_size': 200, 'heads': 5, 'forward_expansion': 2, 'prenorm_on': True, 'mlp_scale_classification': 3, 'decoder_dropout': 0.2, 'class_drop': 0, 'learning_rate': 1e-05}. Best is trial 52 with value: 0.7365200244781397.


Best trial: 52. Best value: 0.73652:  93%|█████████▎| 70/75 [12:15:46<50:53, 610.72s/it]  

[I 2024-01-03 05:46:39,960] Trial 69 finished with value: 0.6976269803494934 and parameters: {'sigma': 3, 'num_layers': 1, 'embed_size': 200, 'heads': 5, 'forward_expansion': 3, 'prenorm_on': True, 'mlp_scale_classification': 2, 'decoder_dropout': 0.1, 'class_drop': 0, 'learning_rate': 1e-05}. Best is trial 52 with value: 0.7365200244781397.


Best trial: 52. Best value: 0.73652:  95%|█████████▍| 71/75 [12:24:39<39:09, 587.26s/it]

[I 2024-01-03 05:55:32,485] Trial 70 finished with value: 0.687087781328619 and parameters: {'sigma': 0.5, 'num_layers': 1, 'embed_size': 160, 'heads': 5, 'forward_expansion': 7, 'prenorm_on': True, 'mlp_scale_classification': 6, 'decoder_dropout': 0.2, 'class_drop': 0, 'learning_rate': 0.001}. Best is trial 52 with value: 0.7365200244781397.


Best trial: 52. Best value: 0.73652:  96%|█████████▌| 72/75 [12:34:53<29:46, 595.34s/it]

[I 2024-01-03 06:05:46,683] Trial 71 finished with value: 0.736452029645747 and parameters: {'sigma': 0.5, 'num_layers': 1, 'embed_size': 200, 'heads': 5, 'forward_expansion': 3, 'prenorm_on': True, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0, 'learning_rate': 1e-05}. Best is trial 52 with value: 0.7365200244781397.


Best trial: 72. Best value: 0.73924:  97%|█████████▋| 73/75 [12:45:07<20:01, 600.86s/it]

[I 2024-01-03 06:16:00,420] Trial 72 finished with value: 0.7392398177738492 and parameters: {'sigma': 0.5, 'num_layers': 1, 'embed_size': 200, 'heads': 5, 'forward_expansion': 3, 'prenorm_on': True, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0, 'learning_rate': 1e-05}. Best is trial 72 with value: 0.7392398177738492.


Best trial: 72. Best value: 0.73924:  99%|█████████▊| 74/75 [12:55:22<10:05, 605.28s/it]

[I 2024-01-03 06:26:16,014] Trial 73 finished with value: 0.7344801795063575 and parameters: {'sigma': 0.5, 'num_layers': 1, 'embed_size': 200, 'heads': 5, 'forward_expansion': 3, 'prenorm_on': True, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0, 'learning_rate': 1e-05}. Best is trial 72 with value: 0.7392398177738492.


Best trial: 72. Best value: 0.73924: 100%|██████████| 75/75 [13:03:00<00:00, 626.41s/it]

[I 2024-01-03 06:33:53,659] Trial 74 finished with value: 0.7206772285306317 and parameters: {'sigma': 0.5, 'num_layers': 1, 'embed_size': 70, 'heads': 5, 'forward_expansion': 3, 'prenorm_on': True, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0, 'learning_rate': 1e-05}. Best is trial 72 with value: 0.7392398177738492.
Best Hyperparameters: {'sigma': 0.5, 'num_layers': 1, 'embed_size': 200, 'heads': 5, 'forward_expansion': 3, 'prenorm_on': True, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0, 'learning_rate': 1e-05}
Best Validation Accuracy (at Early Stopping): 0.7392398177738492



