In [1]:
import sys
sys.path.insert(0, '/home/wdwatson2/projects/CAT-Transformer/model')
# sys.path.insert(0, r'C:\Users\smbm2\projects\CAT-Transformer\model')
# sys.path.insert(0, '/home/warin/projects/CAT-Transformer/model')
from testingModel import CATTransformer, MyFTTransformer, Combined_Dataset, train, test, EarlyStopping
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, f1_score, recall_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import optuna
from optuna.trial import TrialState

# Run on the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device_in_use = 'cuda'
else:
    device_in_use = 'cpu'
print(device_in_use)

cuda


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Directory holding the pre-split income dataset. This is the only path that
# has to be edited to run the notebook on another machine.
DATA_DIR = '/home/wdwatson2/projects/CAT-Transformer/datasets/income'
df_train = pd.read_csv(f'{DATA_DIR}/train.csv')
df_test = pd.read_csv(f'{DATA_DIR}/test.csv')
df_val = pd.read_csv(f'{DATA_DIR}/validation.csv')

cont_columns = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss',
       'hours-per-week']
cat_columns = ['workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'sex', 'native-country']
# One entry per column in cat_columns, in the same order
# (presumably the number of categories of each column — TODO confirm).
cat_features = (10,16,7,16,6,5,2,43)
target = ['income']

# Sanity check (no need to touch): the declared continuous + categorical +
# target columns must be exactly the columns present in the training CSV.
yourlist = sorted(cont_columns + cat_columns + target)
oglist = sorted(df_train.columns)

assert(yourlist == oglist), "You may of spelled feature name wrong or you forgot to put on of them in the list"

# NOTE(review): this discards the tuple defined above, so every later use of
# cat_features (e.g. the model's cat_feat argument) receives an empty tuple,
# while cat_columns is still handed to Combined_Dataset below. Kept as-is to
# preserve the behaviour the recorded results were produced with — confirm
# that dropping the categorical specification is intentional.
cat_features = ()

# Number of distinct target values, taken as the maximum over the three splits
# so that a class missing from one split is still counted.
target_classes = [max(len(split[target].value_counts()) for split in (df_train, df_val, df_test))]
print(target_classes)

# Standardise the continuous features. The scaler is fitted on the training
# split only and then applied to every split, so no statistics from the
# validation/test data leak into preprocessing.
scaler = StandardScaler()
scaler.fit(df_train[cont_columns])

df_train[cont_columns] = scaler.transform(df_train[cont_columns])
df_test[cont_columns] = scaler.transform(df_test[cont_columns])
df_val[cont_columns] = scaler.transform(df_val[cont_columns])

# Wrapping in Dataset
train_dataset = Combined_Dataset(df_train, cat_columns=cat_columns, num_columns=cont_columns, task1_column=target[0])
val_dataset = Combined_Dataset(df_val, cat_columns=cat_columns, num_columns=cont_columns, task1_column=target[0])
test_dataset = Combined_Dataset(df_test, cat_columns=cat_columns, num_columns=cont_columns, task1_column=target[0])

batch_size = 256

# Wrapping with DataLoader for easy batch extraction.
# Only the training loader is shuffled: shuffling evaluation data brings no
# benefit and makes the batch composition differ on every evaluation pass.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

[2]


In [5]:
def objective(trial, num_epochs=400, patience=10):
    """Optuna objective: train one CATTransformer configuration and score it.

    Samples a hyperparameter configuration from ``trial``, trains the model on
    ``train_dataloader`` and evaluates it on ``val_dataloader`` after every
    epoch. Training ends after ``num_epochs`` epochs or as soon as the
    ``EarlyStopping`` helper returns a truthy value for the current
    validation accuracy.

    Args:
        trial: The Optuna trial used to sample hyperparameters.
        num_epochs: Upper bound on the number of training epochs.
        patience: Value forwarded to ``EarlyStopping(patience=...)``.

    Returns:
        The validation accuracy measured in the last epoch that was run.
        The best validation accuracy seen and the number of epochs run are
        additionally stored on the trial as the user attributes
        ``best_val_acc`` and ``epochs_run``.

    Raises:
        ValueError: If ``num_epochs`` is smaller than 1 (no epoch would run,
            so there would be no validation accuracy to return).
    """
    if num_epochs < 1:
        raise ValueError("num_epochs must be at least 1")

    # Define hyperparameters to search over.
    # NOTE: the model argument is called alpha but the Optuna parameter is
    # registered as 'sigma'; the key is kept so study.best_params is unchanged.
    alpha = trial.suggest_float('sigma', 0.001, 5, log=True)
    num_layers = trial.suggest_int('num_layers', 1, 5)
    # Every candidate embed_size is a multiple of 10, so it is divisible by
    # every candidate number of heads (1, 5, 10)
    # (presumably a requirement of the attention layers — TODO confirm).
    embed_size = trial.suggest_categorical("embed_size", [50, 60, 70, 80, 90, 100, 120, 140, 160, 180, 200, 250, 350, 500])
    heads = trial.suggest_categorical("heads", [1, 5, 10])
    forward_expansion = trial.suggest_int('forward_expansion', 1, 8)
    pre_norm_on = trial.suggest_categorical('prenorm_on', [True, False])
    mlp_scale_classification = trial.suggest_int('mlp_scale_classification', 1, 8)
    decoder_dropout = trial.suggest_categorical('decoder_dropout', [0,.1,.2,.5])
    classification_dropout = trial.suggest_categorical('class_drop', [0,.1,.2,.5])

    learning_rate = trial.suggest_float('learning_rate', 0.00001, 0.001, log=True)
    weight_decay = trial.suggest_float('weight_decay', 0.000001, 0.001, log=True)

    # Create the model with the sampled hyperparameters
    model = CATTransformer(alpha=alpha,
                           embed_size=embed_size,
                           n_cont=len(cont_columns),
                           cat_feat=cat_features,
                           num_layers=num_layers,
                           heads=heads,
                           forward_expansion=forward_expansion,
                           decoder_dropout=decoder_dropout,
                           classification_dropout=classification_dropout,
                           pre_norm_on=pre_norm_on,
                           mlp_scale_classification=mlp_scale_classification,
                           targets_classes=target_classes,
                           ).to(device_in_use)

    # Define loss function and optimizer
    loss_function = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    # Early stopping is driven by the validation accuracy of each epoch
    early_stopping = EarlyStopping(patience=patience)

    best_val_acc = float('-inf')
    epochs_run = 0

    for epoch in range(num_epochs):
        # The training metrics are not needed for the search, so the return
        # value of train() is intentionally discarded.
        train(regression_on=False,
              get_attn=False,
              dataloader=train_dataloader,
              model=model,
              loss_function=loss_function,
              optimizer=optimizer,
              device_in_use=device_in_use)
        # Only the accuracy (second returned value) is used from the
        # validation pass.
        _, val_acc, _ = test(regression_on=False,
                             get_attn=False,
                             dataloader=val_dataloader,
                             model=model,
                             loss_function=loss_function,
                             device_in_use=device_in_use)

        epochs_run = epoch + 1
        best_val_acc = max(best_val_acc, float(val_acc))

        # Check if we should early stop based on validation accuracy
        if early_stopping(val_acc):
            break

    # Keep the outcome of the run with the trial so it can be inspected later
    # through study.trials / study.best_trial.user_attrs.
    trial.set_user_attr('best_val_acc', best_val_acc)
    trial.set_user_attr('epochs_run', epochs_run)

    # The objective value is the validation accuracy of the last epoch run
    return val_acc

In [6]:
# Search budget: how many hyperparameter configurations get evaluated
num_trials = 150

# The objective returns a validation accuracy, so larger values are better
study = optuna.create_study(direction='maximize')

# Run the search; each trial is one call to objective(trial)
study.optimize(func=objective, n_trials=num_trials, show_progress_bar=True)

# Pull out the winning configuration together with the objective value it reached
best_params, best_val_accuracy = study.best_params, study.best_value

print("Best Hyperparameters:", best_params)
print("Best Validation Accuracy (at Early Stopping):", best_val_accuracy)

[I 2024-01-06 16:10:26,243] A new study created in memory with name: no-name-77eb95a6-00ec-4067-8b4c-8f4159bd7132
Best trial: 0. Best value: 0.831599:   1%|          | 1/150 [04:34<11:20:37, 274.07s/it]

[I 2024-01-06 16:15:00,317] Trial 0 finished with value: 0.8315987032929534 and parameters: {'sigma': 0.46744698750436525, 'num_layers': 3, 'embed_size': 160, 'heads': 5, 'forward_expansion': 7, 'prenorm_on': False, 'mlp_scale_classification': 4, 'decoder_dropout': 0.1, 'class_drop': 0, 'learning_rate': 2.9917460732420438e-05, 'weight_decay': 5.964995167167225e-05}. Best is trial 0 with value: 0.8315987032929534.


Best trial: 0. Best value: 0.831599:   1%|▏         | 2/150 [09:29<11:47:32, 286.84s/it]

[I 2024-01-06 16:19:56,094] Trial 1 finished with value: 0.8259682648012284 and parameters: {'sigma': 0.046281318975548484, 'num_layers': 3, 'embed_size': 250, 'heads': 5, 'forward_expansion': 5, 'prenorm_on': True, 'mlp_scale_classification': 4, 'decoder_dropout': 0.2, 'class_drop': 0, 'learning_rate': 1.0263869059749554e-05, 'weight_decay': 0.000283847144236642}. Best is trial 0 with value: 0.8315987032929534.


Best trial: 0. Best value: 0.831599:   2%|▏         | 3/150 [12:45<10:01:00, 245.31s/it]

[I 2024-01-06 16:23:11,988] Trial 2 finished with value: 0.8215321617471422 and parameters: {'sigma': 0.20904670879132264, 'num_layers': 2, 'embed_size': 50, 'heads': 10, 'forward_expansion': 5, 'prenorm_on': False, 'mlp_scale_classification': 4, 'decoder_dropout': 0.1, 'class_drop': 0, 'learning_rate': 1.4626663441930123e-05, 'weight_decay': 0.000664772919537542}. Best is trial 0 with value: 0.8315987032929534.


Best trial: 0. Best value: 0.831599:   3%|▎         | 4/150 [15:35<8:44:14, 215.44s/it] 

[I 2024-01-06 16:26:01,640] Trial 3 finished with value: 0.8247739293635898 and parameters: {'sigma': 0.6385357900257153, 'num_layers': 1, 'embed_size': 120, 'heads': 10, 'forward_expansion': 5, 'prenorm_on': False, 'mlp_scale_classification': 8, 'decoder_dropout': 0.2, 'class_drop': 0, 'learning_rate': 0.0008988145349060302, 'weight_decay': 0.0009514163985018221}. Best is trial 0 with value: 0.8315987032929534.


Best trial: 0. Best value: 0.831599:   3%|▎         | 5/150 [19:30<8:58:02, 222.64s/it]

[I 2024-01-06 16:29:57,037] Trial 4 finished with value: 0.8215321617471422 and parameters: {'sigma': 0.003972960895485397, 'num_layers': 2, 'embed_size': 140, 'heads': 10, 'forward_expansion': 8, 'prenorm_on': False, 'mlp_scale_classification': 5, 'decoder_dropout': 0.1, 'class_drop': 0.2, 'learning_rate': 0.000709647708252516, 'weight_decay': 0.0001136059680255276}. Best is trial 0 with value: 0.8315987032929534.


Best trial: 0. Best value: 0.831599:   4%|▍         | 6/150 [25:55<11:06:12, 277.58s/it]

[I 2024-01-06 16:36:21,279] Trial 5 finished with value: 0.8302337485070806 and parameters: {'sigma': 1.3656249589726233, 'num_layers': 5, 'embed_size': 100, 'heads': 5, 'forward_expansion': 7, 'prenorm_on': True, 'mlp_scale_classification': 7, 'decoder_dropout': 0.1, 'class_drop': 0.5, 'learning_rate': 3.110772284168387e-05, 'weight_decay': 2.6051702489206002e-05}. Best is trial 0 with value: 0.8315987032929534.


Best trial: 0. Best value: 0.831599:   5%|▍         | 7/150 [31:19<11:37:49, 292.79s/it]

[I 2024-01-06 16:41:45,386] Trial 6 finished with value: 0.8281863163282717 and parameters: {'sigma': 0.011840427957944284, 'num_layers': 4, 'embed_size': 140, 'heads': 10, 'forward_expansion': 3, 'prenorm_on': True, 'mlp_scale_classification': 8, 'decoder_dropout': 0.1, 'class_drop': 0.2, 'learning_rate': 0.00014262746498538726, 'weight_decay': 0.0007003305635340298}. Best is trial 0 with value: 0.8315987032929534.


Best trial: 0. Best value: 0.831599:   5%|▌         | 8/150 [36:48<12:00:23, 304.39s/it]

[I 2024-01-06 16:47:14,620] Trial 7 finished with value: 0.8157311039071831 and parameters: {'sigma': 0.03129179957193105, 'num_layers': 2, 'embed_size': 350, 'heads': 1, 'forward_expansion': 7, 'prenorm_on': True, 'mlp_scale_classification': 8, 'decoder_dropout': 0, 'class_drop': 0.1, 'learning_rate': 0.000303176040784912, 'weight_decay': 0.00030377616371203813}. Best is trial 0 with value: 0.8315987032929534.


Best trial: 0. Best value: 0.831599:   6%|▌         | 9/150 [40:55<11:13:00, 286.39s/it]

[I 2024-01-06 16:51:21,408] Trial 8 finished with value: 0.8278450776318035 and parameters: {'sigma': 0.739219381558773, 'num_layers': 3, 'embed_size': 100, 'heads': 10, 'forward_expansion': 8, 'prenorm_on': False, 'mlp_scale_classification': 5, 'decoder_dropout': 0.5, 'class_drop': 0.2, 'learning_rate': 4.9784646489850094e-05, 'weight_decay': 0.00033984166342273744}. Best is trial 0 with value: 0.8315987032929534.


Best trial: 0. Best value: 0.831599:   7%|▋         | 10/150 [45:23<10:55:22, 280.87s/it]

[I 2024-01-06 16:55:49,937] Trial 9 finished with value: 0.7882613888414947 and parameters: {'sigma': 4.830594396233215, 'num_layers': 3, 'embed_size': 160, 'heads': 5, 'forward_expansion': 6, 'prenorm_on': False, 'mlp_scale_classification': 5, 'decoder_dropout': 0, 'class_drop': 0.1, 'learning_rate': 0.00035737292547085, 'weight_decay': 1.797723954875346e-05}. Best is trial 0 with value: 0.8315987032929534.


Best trial: 0. Best value: 0.831599:   7%|▋         | 11/150 [50:13<10:56:51, 283.54s/it]

[I 2024-01-06 17:00:39,515] Trial 10 finished with value: 0.8172666780412898 and parameters: {'sigma': 0.1772188161016428, 'num_layers': 5, 'embed_size': 60, 'heads': 1, 'forward_expansion': 1, 'prenorm_on': False, 'mlp_scale_classification': 1, 'decoder_dropout': 0.5, 'class_drop': 0.5, 'learning_rate': 6.812121140525134e-05, 'weight_decay': 5.471638502877206e-06}. Best is trial 0 with value: 0.8315987032929534.


Best trial: 0. Best value: 0.831599:   8%|▊         | 12/150 [56:16<11:47:45, 307.72s/it]

[I 2024-01-06 17:06:42,544] Trial 11 finished with value: 0.8184610134789285 and parameters: {'sigma': 3.666893971803549, 'num_layers': 5, 'embed_size': 200, 'heads': 5, 'forward_expansion': 7, 'prenorm_on': True, 'mlp_scale_classification': 2, 'decoder_dropout': 0.1, 'class_drop': 0.5, 'learning_rate': 2.9111700697175368e-05, 'weight_decay': 3.732700393524817e-05}. Best is trial 0 with value: 0.8315987032929534.


Best trial: 0. Best value: 0.831599:   9%|▊         | 13/150 [1:01:22<11:41:40, 307.31s/it]

[I 2024-01-06 17:11:48,895] Trial 12 finished with value: 0.8240914519706535 and parameters: {'sigma': 0.9591817189766739, 'num_layers': 4, 'embed_size': 80, 'heads': 5, 'forward_expansion': 3, 'prenorm_on': True, 'mlp_scale_classification': 6, 'decoder_dropout': 0.1, 'class_drop': 0.5, 'learning_rate': 2.5841602905825046e-05, 'weight_decay': 1.8859222516427222e-06}. Best is trial 0 with value: 0.8315987032929534.


Best trial: 13. Best value: 0.834158:   9%|▉         | 14/150 [1:07:30<12:17:48, 325.51s/it]

[I 2024-01-06 17:17:56,457] Trial 13 finished with value: 0.8341579935164648 and parameters: {'sigma': 0.19570857353637475, 'num_layers': 4, 'embed_size': 180, 'heads': 5, 'forward_expansion': 7, 'prenorm_on': True, 'mlp_scale_classification': 3, 'decoder_dropout': 0.1, 'class_drop': 0, 'learning_rate': 3.430982375163862e-05, 'weight_decay': 2.9896919928539906e-05}. Best is trial 13 with value: 0.8341579935164648.


Best trial: 13. Best value: 0.834158:  10%|█         | 15/150 [1:12:44<12:04:41, 322.08s/it]

[I 2024-01-06 17:23:10,613] Trial 14 finished with value: 0.8249445487118239 and parameters: {'sigma': 0.16932910978527146, 'num_layers': 4, 'embed_size': 180, 'heads': 5, 'forward_expansion': 6, 'prenorm_on': False, 'mlp_scale_classification': 3, 'decoder_dropout': 0.1, 'class_drop': 0, 'learning_rate': 0.00012020047329849325, 'weight_decay': 5.7810520167967805e-05}. Best is trial 13 with value: 0.8341579935164648.


Best trial: 13. Best value: 0.834158:  11%|█         | 16/150 [1:20:29<13:35:25, 365.12s/it]

[I 2024-01-06 17:30:55,657] Trial 15 finished with value: 0.8191434908718649 and parameters: {'sigma': 0.0019100087021830794, 'num_layers': 4, 'embed_size': 500, 'heads': 5, 'forward_expansion': 3, 'prenorm_on': True, 'mlp_scale_classification': 3, 'decoder_dropout': 0.1, 'class_drop': 0, 'learning_rate': 5.472609702028712e-05, 'weight_decay': 6.2572609305348e-06}. Best is trial 13 with value: 0.8341579935164648.


Best trial: 13. Best value: 0.834158:  11%|█▏        | 17/150 [1:23:24<11:22:52, 308.07s/it]

[I 2024-01-06 17:33:51,048] Trial 16 finished with value: 0.8254564067565262 and parameters: {'sigma': 0.26287389716104675, 'num_layers': 1, 'embed_size': 90, 'heads': 1, 'forward_expansion': 8, 'prenorm_on': False, 'mlp_scale_classification': 3, 'decoder_dropout': 0, 'class_drop': 0, 'learning_rate': 1.7145375930929865e-05, 'weight_decay': 9.71942980967648e-05}. Best is trial 13 with value: 0.8341579935164648.


Best trial: 13. Best value: 0.834158:  12%|█▏        | 18/150 [1:28:22<11:10:49, 304.92s/it]

[I 2024-01-06 17:38:48,641] Trial 17 finished with value: 0.8193141102200989 and parameters: {'sigma': 0.014457919493142918, 'num_layers': 4, 'embed_size': 70, 'heads': 5, 'forward_expansion': 6, 'prenorm_on': True, 'mlp_scale_classification': 1, 'decoder_dropout': 0.5, 'class_drop': 0, 'learning_rate': 8.607616996903652e-05, 'weight_decay': 1.3643989282368645e-05}. Best is trial 13 with value: 0.8341579935164648.


Best trial: 13. Best value: 0.834158:  13%|█▎        | 19/150 [1:33:08<10:53:08, 299.15s/it]

[I 2024-01-06 17:43:34,350] Trial 18 finished with value: 0.8314280839447193 and parameters: {'sigma': 0.09309343719863633, 'num_layers': 3, 'embed_size': 180, 'heads': 5, 'forward_expansion': 4, 'prenorm_on': True, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0, 'learning_rate': 4.290637325975868e-05, 'weight_decay': 9.212494953876957e-06}. Best is trial 13 with value: 0.8341579935164648.


Best trial: 13. Best value: 0.834158:  13%|█▎        | 20/150 [1:36:42<9:52:55, 273.66s/it] 

[I 2024-01-06 17:47:08,589] Trial 19 finished with value: 0.8242620713188875 and parameters: {'sigma': 0.44494543200076925, 'num_layers': 2, 'embed_size': 160, 'heads': 5, 'forward_expansion': 1, 'prenorm_on': False, 'mlp_scale_classification': 6, 'decoder_dropout': 0.1, 'class_drop': 0.1, 'learning_rate': 0.00018203818262212313, 'weight_decay': 2.2169683783780304e-06}. Best is trial 13 with value: 0.8341579935164648.


Best trial: 20. Best value: 0.838423:  14%|█▍        | 21/150 [1:41:27<9:55:41, 277.06s/it]

[I 2024-01-06 17:51:53,602] Trial 20 finished with value: 0.838423477222317 and parameters: {'sigma': 2.094486508964175, 'num_layers': 3, 'embed_size': 160, 'heads': 1, 'forward_expansion': 7, 'prenorm_on': False, 'mlp_scale_classification': 2, 'decoder_dropout': 0.1, 'class_drop': 0, 'learning_rate': 1.971951125183475e-05, 'weight_decay': 0.0001092633471314676}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  15%|█▍        | 22/150 [1:46:05<9:51:46, 277.39s/it]

[I 2024-01-06 17:56:31,766] Trial 21 finished with value: 0.8290394130694421 and parameters: {'sigma': 2.1047252760615067, 'num_layers': 3, 'embed_size': 160, 'heads': 1, 'forward_expansion': 7, 'prenorm_on': False, 'mlp_scale_classification': 2, 'decoder_dropout': 0.1, 'class_drop': 0, 'learning_rate': 1.534103675718188e-05, 'weight_decay': 0.00010842516928784937}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  15%|█▌        | 23/150 [1:50:26<9:36:46, 272.49s/it]

[I 2024-01-06 18:00:52,812] Trial 22 finished with value: 0.8339873741682307 and parameters: {'sigma': 1.9732567790423907, 'num_layers': 3, 'embed_size': 160, 'heads': 1, 'forward_expansion': 6, 'prenorm_on': False, 'mlp_scale_classification': 3, 'decoder_dropout': 0.1, 'class_drop': 0, 'learning_rate': 2.2349111661366296e-05, 'weight_decay': 4.738559670361841e-05}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  16%|█▌        | 24/150 [1:55:26<9:49:39, 280.79s/it]

[I 2024-01-06 18:05:52,970] Trial 23 finished with value: 0.8285275550247398 and parameters: {'sigma': 1.7155881204222792, 'num_layers': 4, 'embed_size': 180, 'heads': 1, 'forward_expansion': 6, 'prenorm_on': False, 'mlp_scale_classification': 3, 'decoder_dropout': 0.1, 'class_drop': 0, 'learning_rate': 1.010458681766805e-05, 'weight_decay': 0.00018465891063994307}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  17%|█▋        | 25/150 [1:59:47<9:32:20, 274.73s/it]

[I 2024-01-06 18:10:13,549] Trial 24 finished with value: 0.8310868452482512 and parameters: {'sigma': 2.499741702510256, 'num_layers': 3, 'embed_size': 160, 'heads': 1, 'forward_expansion': 6, 'prenorm_on': False, 'mlp_scale_classification': 2, 'decoder_dropout': 0.1, 'class_drop': 0, 'learning_rate': 2.033785398046814e-05, 'weight_decay': 3.664663577408386e-05}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  17%|█▋        | 26/150 [2:02:59<8:36:27, 249.90s/it]

[I 2024-01-06 18:13:25,538] Trial 25 finished with value: 0.8290394130694421 and parameters: {'sigma': 0.07499094931293278, 'num_layers': 2, 'embed_size': 50, 'heads': 1, 'forward_expansion': 8, 'prenorm_on': False, 'mlp_scale_classification': 1, 'decoder_dropout': 0.1, 'class_drop': 0, 'learning_rate': 3.893556480557328e-05, 'weight_decay': 6.634633685749828e-05}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  18%|█▊        | 27/150 [2:08:08<9:08:36, 267.62s/it]

[I 2024-01-06 18:18:34,481] Trial 26 finished with value: 0.8271626002388671 and parameters: {'sigma': 4.753700921502038, 'num_layers': 4, 'embed_size': 120, 'heads': 1, 'forward_expansion': 4, 'prenorm_on': True, 'mlp_scale_classification': 3, 'decoder_dropout': 0.2, 'class_drop': 0, 'learning_rate': 1.9712924695120747e-05, 'weight_decay': 3.1545218550334985e-05}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  19%|█▊        | 28/150 [2:17:30<12:03:42, 355.92s/it]

[I 2024-01-06 18:27:56,435] Trial 27 finished with value: 0.8194847295683331 and parameters: {'sigma': 1.1421015100671805, 'num_layers': 3, 'embed_size': 500, 'heads': 1, 'forward_expansion': 7, 'prenorm_on': False, 'mlp_scale_classification': 2, 'decoder_dropout': 0, 'class_drop': 0.2, 'learning_rate': 1.3301604458286227e-05, 'weight_decay': 0.0001605146106365673}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  19%|█▉        | 29/150 [2:24:01<12:18:59, 366.44s/it]

[I 2024-01-06 18:34:27,430] Trial 28 finished with value: 0.8327930387305921 and parameters: {'sigma': 0.4110298853050399, 'num_layers': 5, 'embed_size': 250, 'heads': 1, 'forward_expansion': 5, 'prenorm_on': True, 'mlp_scale_classification': 4, 'decoder_dropout': 0.5, 'class_drop': 0.1, 'learning_rate': 6.905167637250562e-05, 'weight_decay': 1.838642301856102e-05}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  20%|██        | 30/150 [2:28:53<11:28:13, 344.12s/it]

[I 2024-01-06 18:39:19,447] Trial 29 finished with value: 0.8343286128646988 and parameters: {'sigma': 0.3400531430714865, 'num_layers': 4, 'embed_size': 160, 'heads': 1, 'forward_expansion': 6, 'prenorm_on': False, 'mlp_scale_classification': 3, 'decoder_dropout': 0.1, 'class_drop': 0, 'learning_rate': 3.449132139309548e-05, 'weight_decay': 5.428915868318288e-05}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  21%|██        | 31/150 [2:33:29<10:42:01, 323.71s/it]

[I 2024-01-06 18:43:55,541] Trial 30 finished with value: 0.8317693226411875 and parameters: {'sigma': 0.35166322834229446, 'num_layers': 4, 'embed_size': 80, 'heads': 1, 'forward_expansion': 8, 'prenorm_on': False, 'mlp_scale_classification': 1, 'decoder_dropout': 0.1, 'class_drop': 0, 'learning_rate': 3.516617931886132e-05, 'weight_decay': 7.477094826240004e-05}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  21%|██▏       | 32/150 [2:37:48<9:58:16, 304.21s/it] 

[I 2024-01-06 18:48:14,259] Trial 31 finished with value: 0.8302337485070806 and parameters: {'sigma': 0.11136617761551615, 'num_layers': 3, 'embed_size': 160, 'heads': 1, 'forward_expansion': 6, 'prenorm_on': False, 'mlp_scale_classification': 4, 'decoder_dropout': 0.1, 'class_drop': 0, 'learning_rate': 2.3286394726989576e-05, 'weight_decay': 4.4674802197437546e-05}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  22%|██▏       | 33/150 [2:42:57<9:56:18, 305.80s/it]

[I 2024-01-06 18:53:23,762] Trial 32 finished with value: 0.8302337485070806 and parameters: {'sigma': 0.04062551204905288, 'num_layers': 4, 'embed_size': 160, 'heads': 1, 'forward_expansion': 7, 'prenorm_on': False, 'mlp_scale_classification': 3, 'decoder_dropout': 0.1, 'class_drop': 0, 'learning_rate': 2.4249813658876763e-05, 'weight_decay': 0.0001886705524405487}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  23%|██▎       | 34/150 [2:47:23<9:28:12, 293.90s/it]

[I 2024-01-06 18:57:49,905] Trial 33 finished with value: 0.8327930387305921 and parameters: {'sigma': 0.7042360448327722, 'num_layers': 3, 'embed_size': 160, 'heads': 1, 'forward_expansion': 5, 'prenorm_on': False, 'mlp_scale_classification': 4, 'decoder_dropout': 0.1, 'class_drop': 0, 'learning_rate': 1.2229838559211111e-05, 'weight_decay': 7.2367844629499e-05}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  23%|██▎       | 35/150 [2:51:12<8:46:01, 274.45s/it]

[I 2024-01-06 19:01:38,955] Trial 34 finished with value: 0.8362054256952739 and parameters: {'sigma': 2.948530041107636, 'num_layers': 3, 'embed_size': 70, 'heads': 1, 'forward_expansion': 7, 'prenorm_on': False, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0, 'learning_rate': 5.832575638519451e-05, 'weight_decay': 2.111449085927614e-05}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  24%|██▍       | 36/150 [2:54:29<7:57:24, 251.27s/it]

[I 2024-01-06 19:04:56,133] Trial 35 finished with value: 0.8360348063470397 and parameters: {'sigma': 2.613205537311924, 'num_layers': 2, 'embed_size': 70, 'heads': 1, 'forward_expansion': 7, 'prenorm_on': False, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0, 'learning_rate': 6.373984264215136e-05, 'weight_decay': 2.3006364965797787e-05}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  25%|██▍       | 37/150 [2:57:04<6:58:37, 222.28s/it]

[I 2024-01-06 19:07:30,784] Trial 36 finished with value: 0.8322811806858897 and parameters: {'sigma': 2.8025716906191542, 'num_layers': 1, 'embed_size': 70, 'heads': 1, 'forward_expansion': 7, 'prenorm_on': False, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0, 'learning_rate': 7.703248247096035e-05, 'weight_decay': 1.2770404927630402e-05}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  25%|██▌       | 38/150 [3:00:14<6:37:04, 212.72s/it]

[I 2024-01-06 19:10:41,187] Trial 37 finished with value: 0.8315987032929534 and parameters: {'sigma': 1.1991081288654613, 'num_layers': 2, 'embed_size': 70, 'heads': 1, 'forward_expansion': 8, 'prenorm_on': False, 'mlp_scale_classification': 1, 'decoder_dropout': 0.2, 'class_drop': 0, 'learning_rate': 5.155213513607803e-05, 'weight_decay': 2.1809054904342128e-05}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  26%|██▌       | 39/150 [3:03:29<6:23:41, 207.41s/it]

[I 2024-01-06 19:13:56,196] Trial 38 finished with value: 0.834669851561167 and parameters: {'sigma': 3.0743702472468812, 'num_layers': 2, 'embed_size': 70, 'heads': 10, 'forward_expansion': 7, 'prenorm_on': False, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 0.00010468967452734259, 'weight_decay': 3.5257916267276895e-06}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  27%|██▋       | 40/150 [3:06:41<6:11:38, 202.71s/it]

[I 2024-01-06 19:17:07,955] Trial 39 finished with value: 0.8261388841494626 and parameters: {'sigma': 3.1900753147444623, 'num_layers': 2, 'embed_size': 70, 'heads': 10, 'forward_expansion': 5, 'prenorm_on': False, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 0.00017765802148200644, 'weight_decay': 1.0008083373051827e-06}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  27%|██▋       | 41/150 [3:09:53<6:02:26, 199.51s/it]

[I 2024-01-06 19:20:19,998] Trial 40 finished with value: 0.8304043678553148 and parameters: {'sigma': 1.395114914101002, 'num_layers': 2, 'embed_size': 70, 'heads': 10, 'forward_expansion': 8, 'prenorm_on': False, 'mlp_scale_classification': 1, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 0.00011046765189679316, 'weight_decay': 3.314734424071984e-06}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  28%|██▊       | 42/150 [3:12:28<5:34:47, 186.00s/it]

[I 2024-01-06 19:22:54,465] Trial 41 finished with value: 0.8322811806858897 and parameters: {'sigma': 0.5847063393041448, 'num_layers': 1, 'embed_size': 70, 'heads': 10, 'forward_expansion': 7, 'prenorm_on': False, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 9.74831472105174e-05, 'weight_decay': 7.907306736771065e-06}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  29%|██▊       | 43/150 [3:16:54<6:14:37, 210.07s/it]

[I 2024-01-06 19:27:20,703] Trial 42 finished with value: 0.800375362566115 and parameters: {'sigma': 3.490910556283025, 'num_layers': 2, 'embed_size': 350, 'heads': 10, 'forward_expansion': 7, 'prenorm_on': False, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 6.340231820985387e-05, 'weight_decay': 3.84841023906094e-06}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  29%|██▉       | 44/150 [3:20:07<6:02:14, 205.04s/it]

[I 2024-01-06 19:30:34,015] Trial 43 finished with value: 0.8324518000341239 and parameters: {'sigma': 0.9124792277173375, 'num_layers': 2, 'embed_size': 60, 'heads': 10, 'forward_expansion': 6, 'prenorm_on': False, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 0.00015324983986931794, 'weight_decay': 0.000380841134567454}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  30%|███       | 45/150 [3:23:53<6:09:26, 211.11s/it]

[I 2024-01-06 19:34:19,267] Trial 44 finished with value: 0.8135130523801399 and parameters: {'sigma': 4.830273387060313, 'num_layers': 3, 'embed_size': 200, 'heads': 1, 'forward_expansion': 7, 'prenorm_on': False, 'mlp_scale_classification': 1, 'decoder_dropout': 0.2, 'class_drop': 0.5, 'learning_rate': 0.00024503177867530966, 'weight_decay': 1.4699583446578058e-05}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  31%|███       | 46/150 [3:26:43<5:44:36, 198.81s/it]

[I 2024-01-06 19:37:09,392] Trial 45 finished with value: 0.8304043678553148 and parameters: {'sigma': 1.913415099558946, 'num_layers': 1, 'embed_size': 70, 'heads': 10, 'forward_expansion': 8, 'prenorm_on': False, 'mlp_scale_classification': 4, 'decoder_dropout': 0.2, 'class_drop': 0.1, 'learning_rate': 4.330717482122647e-05, 'weight_decay': 0.0001415801936925559}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  31%|███▏      | 47/150 [3:30:15<5:48:00, 202.73s/it]

[I 2024-01-06 19:40:41,257] Trial 46 finished with value: 0.8329636580788261 and parameters: {'sigma': 1.3670043657501754, 'num_layers': 2, 'embed_size': 140, 'heads': 1, 'forward_expansion': 7, 'prenorm_on': False, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0, 'learning_rate': 2.9899422254146768e-05, 'weight_decay': 2.4488033014426615e-05}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  32%|███▏      | 48/150 [3:34:18<6:05:20, 214.91s/it]

[I 2024-01-06 19:44:44,579] Trial 47 finished with value: 0.8297218904623784 and parameters: {'sigma': 3.023802940356423, 'num_layers': 3, 'embed_size': 100, 'heads': 1, 'forward_expansion': 6, 'prenorm_on': False, 'mlp_scale_classification': 3, 'decoder_dropout': 0.2, 'class_drop': 0.5, 'learning_rate': 5.5018084255046876e-05, 'weight_decay': 0.00026157056043128843}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  33%|███▎      | 49/150 [3:37:35<5:52:57, 209.68s/it]

[I 2024-01-06 19:48:02,063] Trial 48 finished with value: 0.7911619177614742 and parameters: {'sigma': 0.8073381024476305, 'num_layers': 2, 'embed_size': 90, 'heads': 10, 'forward_expansion': 5, 'prenorm_on': False, 'mlp_scale_classification': 1, 'decoder_dropout': 0, 'class_drop': 0.2, 'learning_rate': 0.00047571897391098054, 'weight_decay': 1.0444618810942656e-05}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  33%|███▎      | 50/150 [3:42:08<6:20:47, 228.48s/it]

[I 2024-01-06 19:52:34,407] Trial 49 finished with value: 0.8297218904623784 and parameters: {'sigma': 0.29760971643219825, 'num_layers': 3, 'embed_size': 250, 'heads': 1, 'forward_expansion': 2, 'prenorm_on': False, 'mlp_scale_classification': 3, 'decoder_dropout': 0.5, 'class_drop': 0, 'learning_rate': 0.00012893771483507823, 'weight_decay': 0.000510782900082504}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  34%|███▍      | 51/150 [3:46:47<6:42:21, 243.85s/it]

[I 2024-01-06 19:57:14,130] Trial 50 finished with value: 0.830745606551783 and parameters: {'sigma': 0.5660906016645155, 'num_layers': 5, 'embed_size': 50, 'heads': 1, 'forward_expansion': 8, 'prenorm_on': False, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0, 'learning_rate': 9.182441396596671e-05, 'weight_decay': 4.926979118968187e-05}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  35%|███▍      | 52/150 [3:52:38<7:30:25, 275.77s/it]

[I 2024-01-06 20:03:04,391] Trial 51 finished with value: 0.8305749872035488 and parameters: {'sigma': 0.13996620100538767, 'num_layers': 4, 'embed_size': 180, 'heads': 5, 'forward_expansion': 7, 'prenorm_on': True, 'mlp_scale_classification': 3, 'decoder_dropout': 0.1, 'class_drop': 0, 'learning_rate': 3.3889626599068896e-05, 'weight_decay': 2.9831448809933523e-05}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  35%|███▌      | 53/150 [3:57:21<7:29:39, 278.14s/it]

[I 2024-01-06 20:07:48,063] Trial 52 finished with value: 0.8297218904623784 and parameters: {'sigma': 0.24545262962303774, 'num_layers': 4, 'embed_size': 70, 'heads': 5, 'forward_expansion': 7, 'prenorm_on': True, 'mlp_scale_classification': 4, 'decoder_dropout': 0.2, 'class_drop': 0, 'learning_rate': 4.602478279316843e-05, 'weight_decay': 8.878722486838109e-05}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  36%|███▌      | 54/150 [4:03:24<8:05:27, 303.41s/it]

[I 2024-01-06 20:13:50,434] Trial 53 finished with value: 0.8292100324176762 and parameters: {'sigma': 0.05932274934839896, 'num_layers': 5, 'embed_size': 120, 'heads': 5, 'forward_expansion': 7, 'prenorm_on': True, 'mlp_scale_classification': 2, 'decoder_dropout': 0.1, 'class_drop': 0, 'learning_rate': 2.82603155708964e-05, 'weight_decay': 1.1820157140800763e-06}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  37%|███▋      | 55/150 [4:09:08<8:19:39, 315.58s/it]

[I 2024-01-06 20:19:34,391] Trial 54 finished with value: 0.8225558778365467 and parameters: {'sigma': 0.025811127164714415, 'num_layers': 4, 'embed_size': 180, 'heads': 10, 'forward_expansion': 6, 'prenorm_on': True, 'mlp_scale_classification': 3, 'decoder_dropout': 0, 'class_drop': 0, 'learning_rate': 6.0940126376272575e-05, 'weight_decay': 1.9088995096612886e-05}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  37%|███▋      | 56/150 [4:14:43<8:23:37, 321.47s/it]

[I 2024-01-06 20:25:09,608] Trial 55 finished with value: 0.8302337485070806 and parameters: {'sigma': 2.2896083790868427, 'num_layers': 3, 'embed_size': 350, 'heads': 1, 'forward_expansion': 6, 'prenorm_on': False, 'mlp_scale_classification': 2, 'decoder_dropout': 0.5, 'class_drop': 0.1, 'learning_rate': 8.325578855150185e-05, 'weight_decay': 5.939844010390189e-06}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  38%|███▊      | 57/150 [4:18:54<7:45:29, 300.31s/it]

[I 2024-01-06 20:29:20,566] Trial 56 finished with value: 0.8292100324176762 and parameters: {'sigma': 0.20034723422908698, 'num_layers': 3, 'embed_size': 70, 'heads': 5, 'forward_expansion': 7, 'prenorm_on': True, 'mlp_scale_classification': 7, 'decoder_dropout': 0.1, 'class_drop': 0, 'learning_rate': 3.807917311151229e-05, 'weight_decay': 5.414695964622428e-05}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  39%|███▊      | 58/150 [4:23:19<7:24:20, 289.79s/it]

[I 2024-01-06 20:33:45,792] Trial 57 finished with value: 0.8338167548199966 and parameters: {'sigma': 1.5785621679052297, 'num_layers': 4, 'embed_size': 200, 'heads': 1, 'forward_expansion': 8, 'prenorm_on': False, 'mlp_scale_classification': 3, 'decoder_dropout': 0.1, 'class_drop': 0.5, 'learning_rate': 1.7836750895036698e-05, 'weight_decay': 0.00012732373163050283}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  39%|███▉      | 59/150 [4:26:33<6:35:55, 261.05s/it]

[I 2024-01-06 20:36:59,778] Trial 58 finished with value: 0.8300631291588466 and parameters: {'sigma': 4.024996565065716, 'num_layers': 2, 'embed_size': 60, 'heads': 5, 'forward_expansion': 6, 'prenorm_on': False, 'mlp_scale_classification': 5, 'decoder_dropout': 0.2, 'class_drop': 0, 'learning_rate': 7.48337532243345e-05, 'weight_decay': 3.808645823347131e-05}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  40%|████      | 60/150 [4:29:34<5:55:38, 237.09s/it]

[I 2024-01-06 20:40:00,965] Trial 59 finished with value: 0.8290394130694421 and parameters: {'sigma': 0.00946535852342731, 'num_layers': 1, 'embed_size': 140, 'heads': 1, 'forward_expansion': 7, 'prenorm_on': True, 'mlp_scale_classification': 1, 'decoder_dropout': 0.1, 'class_drop': 0, 'learning_rate': 4.882274516430135e-05, 'weight_decay': 2.7110650038866562e-05}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  41%|████      | 61/150 [4:33:54<6:01:44, 243.87s/it]

[I 2024-01-06 20:44:20,661] Trial 60 finished with value: 0.8343286128646988 and parameters: {'sigma': 1.1053247461545752, 'num_layers': 4, 'embed_size': 80, 'heads': 1, 'forward_expansion': 4, 'prenorm_on': False, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 0.0009559448248119488, 'weight_decay': 8.204570165582039e-05}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  41%|████▏     | 62/150 [4:38:13<6:04:24, 248.46s/it]

[I 2024-01-06 20:48:39,834] Trial 61 finished with value: 0.8314280839447193 and parameters: {'sigma': 0.9835507403156913, 'num_layers': 4, 'embed_size': 80, 'heads': 1, 'forward_expansion': 4, 'prenorm_on': False, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 0.0008027653786400088, 'weight_decay': 8.74669575704445e-05}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  42%|████▏     | 63/150 [4:42:30<6:03:46, 250.87s/it]

[I 2024-01-06 20:52:56,335] Trial 62 finished with value: 0.8363760450435079 and parameters: {'sigma': 2.388166420345755, 'num_layers': 4, 'embed_size': 80, 'heads': 1, 'forward_expansion': 3, 'prenorm_on': False, 'mlp_scale_classification': 3, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 0.0005773020108514673, 'weight_decay': 6.180063781659496e-05}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  43%|████▎     | 64/150 [4:46:48<6:02:59, 253.25s/it]

[I 2024-01-06 20:57:15,136] Trial 63 finished with value: 0.8278450776318035 and parameters: {'sigma': 2.3148610376840413, 'num_layers': 4, 'embed_size': 80, 'heads': 1, 'forward_expansion': 3, 'prenorm_on': False, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 0.0005584061794928545, 'weight_decay': 0.0002467359676294955}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  43%|████▎     | 65/150 [4:51:27<6:09:34, 260.88s/it]

[I 2024-01-06 21:01:53,813] Trial 64 finished with value: 0.830745606551783 and parameters: {'sigma': 1.5452435603104684, 'num_layers': 5, 'embed_size': 80, 'heads': 1, 'forward_expansion': 2, 'prenorm_on': False, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 0.0008385964966362385, 'weight_decay': 6.102074756251319e-05}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  44%|████▍     | 66/150 [4:55:44<6:03:45, 259.83s/it]

[I 2024-01-06 21:06:11,195] Trial 65 finished with value: 0.8273332195871012 and parameters: {'sigma': 3.793679717221894, 'num_layers': 4, 'embed_size': 80, 'heads': 1, 'forward_expansion': 3, 'prenorm_on': False, 'mlp_scale_classification': 3, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 0.0006792672249333779, 'weight_decay': 0.00011855987400561826}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  45%|████▍     | 67/150 [5:02:25<6:57:40, 301.93s/it]

[I 2024-01-06 21:12:51,352] Trial 66 finished with value: 0.7773417505545129 and parameters: {'sigma': 2.412667294945624, 'num_layers': 3, 'embed_size': 500, 'heads': 1, 'forward_expansion': 4, 'prenorm_on': False, 'mlp_scale_classification': 1, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 0.0002955171226517989, 'weight_decay': 3.970531540532068e-05}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  45%|████▌     | 68/150 [5:06:45<6:35:31, 289.41s/it]

[I 2024-01-06 21:17:11,549] Trial 67 finished with value: 0.8370585224364443 and parameters: {'sigma': 1.852366048474635, 'num_layers': 4, 'embed_size': 80, 'heads': 1, 'forward_expansion': 2, 'prenorm_on': False, 'mlp_scale_classification': 3, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 0.00042029366405686556, 'weight_decay': 7.618032383023261e-05}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  46%|████▌     | 69/150 [5:10:53<6:14:06, 277.11s/it]

[I 2024-01-06 21:21:19,976] Trial 68 finished with value: 0.8285275550247398 and parameters: {'sigma': 1.8332909803695763, 'num_layers': 3, 'embed_size': 160, 'heads': 1, 'forward_expansion': 1, 'prenorm_on': False, 'mlp_scale_classification': 4, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 0.0004335247707729921, 'weight_decay': 6.01931297437489e-05}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  47%|████▋     | 70/150 [5:14:12<5:38:13, 253.67s/it]

[I 2024-01-06 21:24:38,955] Trial 69 finished with value: 0.8199965876130353 and parameters: {'sigma': 4.899530059587809, 'num_layers': 2, 'embed_size': 100, 'heads': 1, 'forward_expansion': 2, 'prenorm_on': False, 'mlp_scale_classification': 3, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 0.0006291053723562275, 'weight_decay': 0.00010618415167816067}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  47%|████▋     | 71/150 [5:18:28<5:34:42, 254.21s/it]

[I 2024-01-06 21:28:54,419] Trial 70 finished with value: 0.7926974918955809 and parameters: {'sigma': 3.006210508115181, 'num_layers': 4, 'embed_size': 90, 'heads': 1, 'forward_expansion': 2, 'prenorm_on': False, 'mlp_scale_classification': 3, 'decoder_dropout': 0, 'class_drop': 0.1, 'learning_rate': 0.0003759403162777276, 'weight_decay': 0.00017282683335977483}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  48%|████▊     | 72/150 [5:22:47<5:32:30, 255.78s/it]

[I 2024-01-06 21:33:13,855] Trial 71 finished with value: 0.8331342774270603 and parameters: {'sigma': 1.1373966568805354, 'num_layers': 4, 'embed_size': 80, 'heads': 1, 'forward_expansion': 3, 'prenorm_on': False, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 0.0007377053667355427, 'weight_decay': 7.681371541554115e-05}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  49%|████▊     | 73/150 [5:27:05<5:29:04, 256.43s/it]

[I 2024-01-06 21:37:31,798] Trial 72 finished with value: 0.8350110902576352 and parameters: {'sigma': 1.9605448142690893, 'num_layers': 4, 'embed_size': 80, 'heads': 1, 'forward_expansion': 4, 'prenorm_on': False, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 0.0005101620584697714, 'weight_decay': 8.858540318676153e-05}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  49%|████▉     | 74/150 [5:31:45<5:33:45, 263.49s/it]

[I 2024-01-06 21:42:11,775] Trial 73 finished with value: 0.8348404709094012 and parameters: {'sigma': 2.6482882611549026, 'num_layers': 5, 'embed_size': 80, 'heads': 1, 'forward_expansion': 3, 'prenorm_on': False, 'mlp_scale_classification': 3, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 0.0005334911443633879, 'weight_decay': 4.620291372474127e-05}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  50%|█████     | 75/150 [5:36:27<5:36:14, 269.00s/it]

[I 2024-01-06 21:46:53,625] Trial 74 finished with value: 0.8333048967752943 and parameters: {'sigma': 2.6485691156197855, 'num_layers': 5, 'embed_size': 80, 'heads': 1, 'forward_expansion': 2, 'prenorm_on': False, 'mlp_scale_classification': 3, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 0.0005725390349925817, 'weight_decay': 4.534972422683457e-05}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  51%|█████     | 76/150 [5:41:12<5:37:42, 273.82s/it]

[I 2024-01-06 21:51:38,698] Trial 75 finished with value: 0.8205084456577376 and parameters: {'sigma': 0.0016524108192473076, 'num_layers': 5, 'embed_size': 80, 'heads': 1, 'forward_expansion': 3, 'prenorm_on': False, 'mlp_scale_classification': 1, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 0.0002461857355244411, 'weight_decay': 0.0002120672775558067}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  51%|█████▏    | 77/150 [5:45:57<5:37:17, 277.23s/it]

[I 2024-01-06 21:56:23,875] Trial 76 finished with value: 0.834669851561167 and parameters: {'sigma': 3.882220615396427, 'num_layers': 5, 'embed_size': 80, 'heads': 1, 'forward_expansion': 3, 'prenorm_on': False, 'mlp_scale_classification': 4, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 0.00046845191241832686, 'weight_decay': 4.530808780391675e-06}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  52%|█████▏    | 78/150 [5:49:43<5:14:12, 261.84s/it]

[I 2024-01-06 22:00:09,808] Trial 77 finished with value: 0.8184610134789285 and parameters: {'sigma': 1.916727014839858, 'num_layers': 3, 'embed_size': 70, 'heads': 10, 'forward_expansion': 4, 'prenorm_on': False, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 0.0003529785259973671, 'weight_decay': 3.279674160348684e-05}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  53%|█████▎    | 79/150 [5:54:22<5:15:50, 266.91s/it]

[I 2024-01-06 22:04:48,559] Trial 78 finished with value: 0.8317693226411875 and parameters: {'sigma': 3.2831842741157957, 'num_layers': 5, 'embed_size': 80, 'heads': 1, 'forward_expansion': 3, 'prenorm_on': False, 'mlp_scale_classification': 3, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 0.0005312795198490117, 'weight_decay': 2.559518193954389e-06}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  53%|█████▎    | 80/150 [5:58:12<4:58:31, 255.87s/it]

[I 2024-01-06 22:08:38,666] Trial 79 finished with value: 0.8326224193823579 and parameters: {'sigma': 1.498633646977752, 'num_layers': 3, 'embed_size': 70, 'heads': 1, 'forward_expansion': 2, 'prenorm_on': False, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 0.0004094574776236709, 'weight_decay': 9.851077049885861e-05}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  54%|█████▍    | 81/150 [6:01:18<4:30:04, 234.85s/it]

[I 2024-01-06 22:11:44,470] Trial 80 finished with value: 0.8312574645964852 and parameters: {'sigma': 0.823299414933157, 'num_layers': 2, 'embed_size': 50, 'heads': 1, 'forward_expansion': 3, 'prenorm_on': False, 'mlp_scale_classification': 1, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 0.00027549642001183685, 'weight_decay': 6.699137154367524e-05}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  55%|█████▍    | 82/150 [6:05:57<4:41:17, 248.21s/it]

[I 2024-01-06 22:16:23,837] Trial 81 finished with value: 0.8276744582835693 and parameters: {'sigma': 4.354934320944713, 'num_layers': 5, 'embed_size': 80, 'heads': 1, 'forward_expansion': 3, 'prenorm_on': False, 'mlp_scale_classification': 4, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 0.00033854408432796635, 'weight_decay': 4.340734927127108e-06}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  55%|█████▌    | 83/150 [6:10:54<4:53:32, 262.87s/it]

[I 2024-01-06 22:21:20,933] Trial 82 finished with value: 0.8327930387305921 and parameters: {'sigma': 2.649718093953104, 'num_layers': 5, 'embed_size': 80, 'heads': 1, 'forward_expansion': 4, 'prenorm_on': False, 'mlp_scale_classification': 4, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 0.0004818195862535085, 'weight_decay': 0.0009915599323794923}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  56%|█████▌    | 84/150 [6:15:39<4:56:25, 269.47s/it]

[I 2024-01-06 22:26:05,811] Trial 83 finished with value: 0.8237502132741853 and parameters: {'sigma': 3.658881230973506, 'num_layers': 5, 'embed_size': 80, 'heads': 1, 'forward_expansion': 3, 'prenorm_on': False, 'mlp_scale_classification': 5, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 0.00021116063301045826, 'weight_decay': 7.158820307192383e-06}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  57%|█████▋    | 85/150 [6:20:41<5:02:30, 279.25s/it]

[I 2024-01-06 22:31:07,855] Trial 84 finished with value: 0.8341579935164648 and parameters: {'sigma': 1.9703529755970386, 'num_layers': 5, 'embed_size': 120, 'heads': 1, 'forward_expansion': 2, 'prenorm_on': False, 'mlp_scale_classification': 3, 'decoder_dropout': 0.5, 'class_drop': 0.2, 'learning_rate': 0.00044676880915198175, 'weight_decay': 2.5582129484529105e-06}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  57%|█████▋    | 86/150 [6:26:27<5:19:17, 299.34s/it]

[I 2024-01-06 22:36:54,090] Trial 85 finished with value: 0.7952567821190923 and parameters: {'sigma': 3.8178447689391706, 'num_layers': 5, 'embed_size': 250, 'heads': 10, 'forward_expansion': 1, 'prenorm_on': False, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 0.0006641097067253503, 'weight_decay': 0.00013911088674103448}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  58%|█████▊    | 87/150 [6:29:39<4:40:29, 267.13s/it]

[I 2024-01-06 22:40:06,067] Trial 86 finished with value: 0.8201672069612694 and parameters: {'sigma': 0.0025582315388674804, 'num_layers': 2, 'embed_size': 80, 'heads': 1, 'forward_expansion': 3, 'prenorm_on': False, 'mlp_scale_classification': 4, 'decoder_dropout': 0.2, 'class_drop': 0.5, 'learning_rate': 0.0006054150079948872, 'weight_decay': 1.584791981504953e-05}. Best is trial 20 with value: 0.838423477222317.


Best trial: 20. Best value: 0.838423:  59%|█████▊    | 88/150 [6:33:21<4:22:02, 253.59s/it]

[I 2024-01-06 22:43:48,048] Trial 87 finished with value: 0.8273332195871012 and parameters: {'sigma': 2.2455418867138324, 'num_layers': 3, 'embed_size': 70, 'heads': 1, 'forward_expansion': 4, 'prenorm_on': False, 'mlp_scale_classification': 6, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 0.000498241158938868, 'weight_decay': 1.1962056278990347e-05}. Best is trial 20 with value: 0.838423477222317.


Best trial: 88. Best value: 0.839618:  59%|█████▉    | 89/150 [6:43:24<6:04:14, 358.27s/it]

[I 2024-01-06 22:53:50,568] Trial 88 finished with value: 0.8396178126599556 and parameters: {'sigma': 1.2680519381758064, 'num_layers': 4, 'embed_size': 500, 'heads': 1, 'forward_expansion': 5, 'prenorm_on': False, 'mlp_scale_classification': 5, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 0.00010502262647115873, 'weight_decay': 2.134300590129885e-05}. Best is trial 88 with value: 0.8396178126599556.


Best trial: 88. Best value: 0.839618:  60%|██████    | 90/150 [6:48:52<5:49:14, 349.24s/it]

[I 2024-01-06 22:59:18,729] Trial 89 finished with value: 0.7913325371097082 and parameters: {'sigma': 1.28296859269752, 'num_layers': 4, 'embed_size': 350, 'heads': 10, 'forward_expansion': 5, 'prenorm_on': False, 'mlp_scale_classification': 5, 'decoder_dropout': 0.2, 'class_drop': 0.1, 'learning_rate': 0.00010569442821255368, 'weight_decay': 2.206190307741846e-05}. Best is trial 88 with value: 0.8396178126599556.


Best trial: 88. Best value: 0.839618:  61%|██████    | 91/150 [6:58:47<6:56:00, 423.06s/it]

[I 2024-01-06 23:09:14,044] Trial 90 finished with value: 0.8235795939259511 and parameters: {'sigma': 1.6696521740042696, 'num_layers': 4, 'embed_size': 500, 'heads': 1, 'forward_expansion': 5, 'prenorm_on': False, 'mlp_scale_classification': 5, 'decoder_dropout': 0, 'class_drop': 0.2, 'learning_rate': 0.00012124016077799558, 'weight_decay': 1.4574111209109497e-06}. Best is trial 88 with value: 0.8396178126599556.


Best trial: 88. Best value: 0.839618:  61%|██████▏   | 92/150 [7:07:15<7:13:29, 448.43s/it]

[I 2024-01-06 23:17:41,682] Trial 91 finished with value: 0.8109537621566285 and parameters: {'sigma': 2.8482828180816013, 'num_layers': 4, 'embed_size': 500, 'heads': 1, 'forward_expansion': 3, 'prenorm_on': False, 'mlp_scale_classification': 6, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 0.00032392850269999694, 'weight_decay': 4.394370804860259e-06}. Best is trial 88 with value: 0.8396178126599556.


Best trial: 88. Best value: 0.839618:  62%|██████▏   | 93/150 [7:15:52<7:25:27, 468.90s/it]

[I 2024-01-06 23:26:18,344] Trial 92 finished with value: 0.8239208326224193 and parameters: {'sigma': 2.065991933331065, 'num_layers': 4, 'embed_size': 500, 'heads': 1, 'forward_expansion': 4, 'prenorm_on': False, 'mlp_scale_classification': 3, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 0.00018920317600779385, 'weight_decay': 9.199842925689675e-06}. Best is trial 88 with value: 0.8396178126599556.


Best trial: 88. Best value: 0.839618:  63%|██████▎   | 94/150 [7:19:42<6:10:47, 397.28s/it]

[I 2024-01-06 23:30:08,493] Trial 93 finished with value: 0.8360348063470397 and parameters: {'sigma': 3.215092611053231, 'num_layers': 3, 'embed_size': 70, 'heads': 1, 'forward_expansion': 7, 'prenorm_on': False, 'mlp_scale_classification': 5, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 0.0001579106529835832, 'weight_decay': 1.6955520920759456e-05}. Best is trial 88 with value: 0.8396178126599556.


Best trial: 88. Best value: 0.839618:  63%|██████▎   | 95/150 [7:23:29<5:17:19, 346.17s/it]

[I 2024-01-06 23:33:55,416] Trial 94 finished with value: 0.8310868452482512 and parameters: {'sigma': 3.1583372098932703, 'num_layers': 3, 'embed_size': 70, 'heads': 1, 'forward_expansion': 7, 'prenorm_on': False, 'mlp_scale_classification': 6, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 0.00013889489549320948, 'weight_decay': 1.6627612475850204e-05}. Best is trial 88 with value: 0.8396178126599556.


Best trial: 88. Best value: 0.839618:  64%|██████▍   | 96/150 [7:27:18<4:40:07, 311.25s/it]

[I 2024-01-06 23:37:45,183] Trial 95 finished with value: 0.8341579935164648 and parameters: {'sigma': 2.5229437692813, 'num_layers': 3, 'embed_size': 70, 'heads': 1, 'forward_expansion': 7, 'prenorm_on': False, 'mlp_scale_classification': 5, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 0.00015326179892640403, 'weight_decay': 2.3648628172331854e-05}. Best is trial 88 with value: 0.8396178126599556.


Best trial: 88. Best value: 0.839618:  65%|██████▍   | 97/150 [7:31:09<4:13:32, 287.02s/it]

[I 2024-01-06 23:41:35,678] Trial 96 finished with value: 0.8290394130694421 and parameters: {'sigma': 0.975728129639006, 'num_layers': 3, 'embed_size': 70, 'heads': 1, 'forward_expansion': 8, 'prenorm_on': False, 'mlp_scale_classification': 5, 'decoder_dropout': 0.5, 'class_drop': 0.5, 'learning_rate': 5.8645749478638593e-05, 'weight_decay': 2.0208234270875665e-05}. Best is trial 88 with value: 0.8396178126599556.


Best trial: 88. Best value: 0.839618:  65%|██████▌   | 98/150 [7:34:49<3:51:20, 266.94s/it]

[I 2024-01-06 23:45:15,749] Trial 97 finished with value: 0.8263095034976966 and parameters: {'sigma': 0.5274346489718055, 'num_layers': 3, 'embed_size': 60, 'heads': 1, 'forward_expansion': 6, 'prenorm_on': False, 'mlp_scale_classification': 5, 'decoder_dropout': 0.2, 'class_drop': 0, 'learning_rate': 9.066896314921806e-05, 'weight_decay': 3.260968186726388e-05}. Best is trial 88 with value: 0.8396178126599556.


Best trial: 88. Best value: 0.839618:  66%|██████▌   | 99/150 [7:39:46<3:54:27, 275.84s/it]

[I 2024-01-06 23:50:12,353] Trial 98 finished with value: 0.8367172837399761 and parameters: {'sigma': 1.3339857904319516, 'num_layers': 4, 'embed_size': 140, 'heads': 10, 'forward_expansion': 7, 'prenorm_on': False, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 7.483414103113569e-05, 'weight_decay': 2.8103347919221165e-05}. Best is trial 88 with value: 0.8396178126599556.


Best trial: 88. Best value: 0.839618:  67%|██████▋   | 100/150 [7:44:34<3:53:01, 279.63s/it]

[I 2024-01-06 23:55:00,839] Trial 99 finished with value: 0.8375703804811465 and parameters: {'sigma': 0.638547663566504, 'num_layers': 4, 'embed_size': 140, 'heads': 1, 'forward_expansion': 7, 'prenorm_on': False, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 7.851123115671388e-05, 'weight_decay': 2.7731381711579072e-05}. Best is trial 88 with value: 0.8396178126599556.


Best trial: 88. Best value: 0.839618:  67%|██████▋   | 101/150 [7:49:22<3:50:24, 282.13s/it]

[I 2024-01-06 23:59:48,802] Trial 100 finished with value: 0.8268213615423989 and parameters: {'sigma': 1.405645334155634, 'num_layers': 4, 'embed_size': 140, 'heads': 1, 'forward_expansion': 7, 'prenorm_on': False, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0, 'learning_rate': 7.9175459033733e-05, 'weight_decay': 2.475551162683305e-05}. Best is trial 88 with value: 0.8396178126599556.


Best trial: 88. Best value: 0.839618:  68%|██████▊   | 102/150 [7:54:11<3:47:15, 284.08s/it]

[I 2024-01-07 00:04:37,437] Trial 101 finished with value: 0.8326224193823579 and parameters: {'sigma': 0.7520249135640832, 'num_layers': 4, 'embed_size': 140, 'heads': 1, 'forward_expansion': 7, 'prenorm_on': False, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 6.527672556507183e-05, 'weight_decay': 2.8922741588278164e-05}. Best is trial 88 with value: 0.8396178126599556.


Best trial: 88. Best value: 0.839618:  69%|██████▊   | 103/150 [7:58:59<3:43:37, 285.48s/it]

[I 2024-01-07 00:09:26,194] Trial 102 finished with value: 0.8273332195871012 and parameters: {'sigma': 1.6887397705912914, 'num_layers': 4, 'embed_size': 140, 'heads': 1, 'forward_expansion': 7, 'prenorm_on': False, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 7.09348472469152e-05, 'weight_decay': 4.029518488979596e-05}. Best is trial 88 with value: 0.8396178126599556.


Best trial: 88. Best value: 0.839618:  69%|██████▉   | 104/150 [8:04:15<3:45:45, 294.47s/it]

[I 2024-01-07 00:14:41,634] Trial 103 finished with value: 0.8290394130694421 and parameters: {'sigma': 1.0610626197115558, 'num_layers': 4, 'embed_size': 140, 'heads': 1, 'forward_expansion': 8, 'prenorm_on': False, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 1.1535448543041597e-05, 'weight_decay': 1.3880076309051716e-05}. Best is trial 88 with value: 0.8396178126599556.


Best trial: 88. Best value: 0.839618:  70%|███████   | 105/150 [8:09:08<3:40:33, 294.08s/it]

[I 2024-01-07 00:19:34,794] Trial 104 finished with value: 0.8271626002388671 and parameters: {'sigma': 1.280757292307079, 'num_layers': 4, 'embed_size': 140, 'heads': 1, 'forward_expansion': 6, 'prenorm_on': False, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 0.00039539258003635906, 'weight_decay': 5.463652671479032e-05}. Best is trial 88 with value: 0.8396178126599556.


Best trial: 88. Best value: 0.839618:  71%|███████   | 106/150 [8:14:17<3:38:58, 298.60s/it]

[I 2024-01-07 00:24:43,946] Trial 105 finished with value: 0.8317693226411875 and parameters: {'sigma': 2.2361278147485235, 'num_layers': 4, 'embed_size': 160, 'heads': 1, 'forward_expansion': 7, 'prenorm_on': False, 'mlp_scale_classification': 6, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 9.639620605144757e-05, 'weight_decay': 1.9534247712614662e-05}. Best is trial 88 with value: 0.8396178126599556.


Best trial: 88. Best value: 0.839618:  71%|███████▏  | 107/150 [8:19:14<3:33:31, 297.95s/it]

[I 2024-01-07 00:29:40,376] Trial 106 finished with value: 0.8324518000341239 and parameters: {'sigma': 0.6853527469598666, 'num_layers': 4, 'embed_size': 200, 'heads': 10, 'forward_expansion': 7, 'prenorm_on': False, 'mlp_scale_classification': 1, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 5.581165396600811e-05, 'weight_decay': 3.4020743127322024e-05}. Best is trial 88 with value: 0.8396178126599556.


Best trial: 88. Best value: 0.839618:  72%|███████▏  | 108/150 [8:23:08<3:15:16, 278.96s/it]

[I 2024-01-07 00:33:35,041] Trial 107 finished with value: 0.8293806517659102 and parameters: {'sigma': 1.7954816829533626, 'num_layers': 3, 'embed_size': 100, 'heads': 1, 'forward_expansion': 7, 'prenorm_on': False, 'mlp_scale_classification': 3, 'decoder_dropout': 0.2, 'class_drop': 0, 'learning_rate': 0.00011507655582920424, 'weight_decay': 1.1417006525470371e-05}. Best is trial 88 with value: 0.8396178126599556.


Best trial: 88. Best value: 0.839618:  73%|███████▎  | 109/150 [8:27:39<3:08:56, 276.51s/it]

[I 2024-01-07 00:38:05,829] Trial 108 finished with value: 0.8249445487118239 and parameters: {'sigma': 4.292607448555641, 'num_layers': 4, 'embed_size': 90, 'heads': 1, 'forward_expansion': 7, 'prenorm_on': False, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 8.215093338898038e-05, 'weight_decay': 5.180507599790481e-05}. Best is trial 88 with value: 0.8396178126599556.


Best trial: 88. Best value: 0.839618:  73%|███████▎  | 110/150 [8:32:04<3:01:55, 272.90s/it]

[I 2024-01-07 00:42:30,301] Trial 109 finished with value: 0.8141955297730763 and parameters: {'sigma': 0.8469157184408392, 'num_layers': 3, 'embed_size': 140, 'heads': 1, 'forward_expansion': 8, 'prenorm_on': False, 'mlp_scale_classification': 3, 'decoder_dropout': 0, 'class_drop': 0.1, 'learning_rate': 7.457434359359435e-05, 'weight_decay': 2.7123900222512e-05}. Best is trial 88 with value: 0.8396178126599556.


Best trial: 88. Best value: 0.839618:  74%|███████▍  | 111/150 [8:43:30<4:17:56, 396.85s/it]

[I 2024-01-07 00:53:56,353] Trial 110 finished with value: 0.7839959051356424 and parameters: {'sigma': 1.4713731242999457, 'num_layers': 4, 'embed_size': 500, 'heads': 1, 'forward_expansion': 6, 'prenorm_on': False, 'mlp_scale_classification': 5, 'decoder_dropout': 0.2, 'class_drop': 0, 'learning_rate': 0.000771418346163245, 'weight_decay': 4.334802565378244e-05}. Best is trial 88 with value: 0.8396178126599556.


Best trial: 88. Best value: 0.839618:  75%|███████▍  | 112/150 [8:46:45<3:33:01, 336.35s/it]

[I 2024-01-07 00:57:11,545] Trial 111 finished with value: 0.8336461354717625 and parameters: {'sigma': 3.1861046262615296, 'num_layers': 2, 'embed_size': 70, 'heads': 10, 'forward_expansion': 7, 'prenorm_on': False, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 0.0001022701544988322, 'weight_decay': 1.7334306458774035e-05}. Best is trial 88 with value: 0.8396178126599556.


Best trial: 88. Best value: 0.839618:  75%|███████▌  | 113/150 [8:49:20<2:53:51, 281.94s/it]

[I 2024-01-07 00:59:46,536] Trial 112 finished with value: 0.8315987032929534 and parameters: {'sigma': 2.4365898382164537, 'num_layers': 1, 'embed_size': 70, 'heads': 10, 'forward_expansion': 7, 'prenorm_on': False, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 6.877207391269813e-05, 'weight_decay': 6.228669922553685e-05}. Best is trial 88 with value: 0.8396178126599556.


Best trial: 88. Best value: 0.839618:  76%|███████▌  | 114/150 [8:53:37<2:44:42, 274.52s/it]

[I 2024-01-07 01:04:03,747] Trial 113 finished with value: 0.8334755161235284 and parameters: {'sigma': 2.640783869840058, 'num_layers': 4, 'embed_size': 50, 'heads': 10, 'forward_expansion': 7, 'prenorm_on': False, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 0.00013363572346230615, 'weight_decay': 7.084837344040304e-05}. Best is trial 88 with value: 0.8396178126599556.


Best trial: 88. Best value: 0.839618:  77%|███████▋  | 115/150 [8:57:22<2:31:32, 259.79s/it]

[I 2024-01-07 01:07:49,178] Trial 114 finished with value: 0.7998635045214127 and parameters: {'sigma': 2.052641711473346, 'num_layers': 2, 'embed_size': 250, 'heads': 10, 'forward_expansion': 1, 'prenorm_on': False, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 0.00015506800487216684, 'weight_decay': 9.169168653174192e-05}. Best is trial 88 with value: 0.8396178126599556.


Best trial: 88. Best value: 0.839618:  77%|███████▋  | 116/150 [9:01:15<2:22:34, 251.59s/it]

[I 2024-01-07 01:11:41,625] Trial 115 finished with value: 0.8275038389353353 and parameters: {'sigma': 4.9866090127541565, 'num_layers': 3, 'embed_size': 120, 'heads': 10, 'forward_expansion': 2, 'prenorm_on': False, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 8.949740541632066e-05, 'weight_decay': 3.7559253711363195e-05}. Best is trial 88 with value: 0.8396178126599556.


Best trial: 88. Best value: 0.839618:  78%|███████▊  | 117/150 [9:04:32<2:09:27, 235.38s/it]

[I 2024-01-07 01:14:59,167] Trial 116 finished with value: 0.8314280839447193 and parameters: {'sigma': 3.3852852139688228, 'num_layers': 2, 'embed_size': 70, 'heads': 1, 'forward_expansion': 7, 'prenorm_on': False, 'mlp_scale_classification': 1, 'decoder_dropout': 0.5, 'class_drop': 0.2, 'learning_rate': 5.183945210755327e-05, 'weight_decay': 0.00010938129613849253}. Best is trial 88 with value: 0.8396178126599556.


Best trial: 88. Best value: 0.839618:  79%|███████▊  | 118/150 [9:09:33<2:16:00, 255.02s/it]

[I 2024-01-07 01:20:00,015] Trial 117 finished with value: 0.8304043678553148 and parameters: {'sigma': 1.2297736822819965, 'num_layers': 4, 'embed_size': 160, 'heads': 1, 'forward_expansion': 6, 'prenorm_on': False, 'mlp_scale_classification': 2, 'decoder_dropout': 0.1, 'class_drop': 0, 'learning_rate': 4.3120069099194344e-05, 'weight_decay': 2.193095032520486e-05}. Best is trial 88 with value: 0.8396178126599556.


Best trial: 88. Best value: 0.839618:  79%|███████▉  | 119/150 [9:14:01<2:13:47, 258.95s/it]

[I 2024-01-07 01:24:28,125] Trial 118 finished with value: 0.8099300460672241 and parameters: {'sigma': 2.8242390477578603, 'num_layers': 4, 'embed_size': 80, 'heads': 10, 'forward_expansion': 8, 'prenorm_on': False, 'mlp_scale_classification': 1, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 0.0008930418221963546, 'weight_decay': 0.00014226968323651922}. Best is trial 88 with value: 0.8396178126599556.


Best trial: 88. Best value: 0.839618:  80%|████████  | 120/150 [9:19:09<2:16:44, 273.49s/it]

[I 2024-01-07 01:29:35,560] Trial 119 finished with value: 0.8254564067565262 and parameters: {'sigma': 1.6150873113834365, 'num_layers': 4, 'embed_size': 180, 'heads': 1, 'forward_expansion': 7, 'prenorm_on': False, 'mlp_scale_classification': 3, 'decoder_dropout': 0.2, 'class_drop': 0.5, 'learning_rate': 0.0005349048813013641, 'weight_decay': 2.7351822246348757e-05}. Best is trial 88 with value: 0.8396178126599556.


Best trial: 88. Best value: 0.839618:  81%|████████  | 121/150 [9:23:47<2:12:47, 274.75s/it]

[I 2024-01-07 01:34:13,249] Trial 120 finished with value: 0.8186316328271626 and parameters: {'sigma': 1.963908846219742, 'num_layers': 3, 'embed_size': 140, 'heads': 5, 'forward_expansion': 7, 'prenorm_on': True, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 0.00012161250146232834, 'weight_decay': 4.7715772131188786e-05}. Best is trial 88 with value: 0.8396178126599556.


Best trial: 88. Best value: 0.839618:  81%|████████▏ | 122/150 [9:28:32<2:09:45, 278.04s/it]

[I 2024-01-07 01:38:58,972] Trial 121 finished with value: 0.8285275550247398 and parameters: {'sigma': 4.496098496210261, 'num_layers': 5, 'embed_size': 80, 'heads': 1, 'forward_expansion': 3, 'prenorm_on': False, 'mlp_scale_classification': 4, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 0.00043727467463689824, 'weight_decay': 3.175898020232291e-06}. Best is trial 88 with value: 0.8396178126599556.


Best trial: 88. Best value: 0.839618:  82%|████████▏ | 123/150 [9:33:18<2:06:07, 280.26s/it]

[I 2024-01-07 01:43:44,412] Trial 122 finished with value: 0.8317693226411875 and parameters: {'sigma': 3.6050086371130807, 'num_layers': 5, 'embed_size': 80, 'heads': 1, 'forward_expansion': 3, 'prenorm_on': False, 'mlp_scale_classification': 5, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 0.000580802579015752, 'weight_decay': 7.77622759145423e-05}. Best is trial 88 with value: 0.8396178126599556.


Best trial: 88. Best value: 0.839618:  83%|████████▎ | 124/150 [9:38:14<2:03:35, 285.20s/it]

[I 2024-01-07 01:48:41,122] Trial 123 finished with value: 0.8312574645964852 and parameters: {'sigma': 4.069929028510573, 'num_layers': 5, 'embed_size': 80, 'heads': 1, 'forward_expansion': 4, 'prenorm_on': False, 'mlp_scale_classification': 4, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 6.277887717861248e-05, 'weight_decay': 3.119188173660301e-05}. Best is trial 88 with value: 0.8396178126599556.


Best trial: 88. Best value: 0.839618:  83%|████████▎ | 125/150 [9:42:34<1:55:38, 277.55s/it]

[I 2024-01-07 01:53:00,824] Trial 124 finished with value: 0.828868793721208 and parameters: {'sigma': 2.9043598066938694, 'num_layers': 4, 'embed_size': 80, 'heads': 1, 'forward_expansion': 2, 'prenorm_on': False, 'mlp_scale_classification': 3, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 2.600546624282305e-05, 'weight_decay': 1.5257277732627627e-05}. Best is trial 88 with value: 0.8396178126599556.


Best trial: 88. Best value: 0.839618:  84%|████████▍ | 126/150 [9:46:12<1:43:48, 259.53s/it]

[I 2024-01-07 01:56:38,313] Trial 125 finished with value: 0.8387647159187852 and parameters: {'sigma': 2.359321893054142, 'num_layers': 3, 'embed_size': 70, 'heads': 1, 'forward_expansion': 3, 'prenorm_on': False, 'mlp_scale_classification': 4, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 0.0006463803784811749, 'weight_decay': 1.6327902776108565e-06}. Best is trial 88 with value: 0.8396178126599556.


Best trial: 88. Best value: 0.839618:  85%|████████▍ | 127/150 [9:50:00<1:35:54, 250.21s/it]

[I 2024-01-07 02:00:26,771] Trial 126 finished with value: 0.8368879030882101 and parameters: {'sigma': 1.749941439290794, 'num_layers': 3, 'embed_size': 70, 'heads': 1, 'forward_expansion': 5, 'prenorm_on': False, 'mlp_scale_classification': 3, 'decoder_dropout': 0.2, 'class_drop': 0, 'learning_rate': 0.0007349419584856398, 'weight_decay': 1.707561456875766e-06}. Best is trial 88 with value: 0.8396178126599556.


Best trial: 88. Best value: 0.839618:  85%|████████▌ | 128/150 [9:53:43<1:28:46, 242.13s/it]

[I 2024-01-07 02:04:10,067] Trial 127 finished with value: 0.8319399419894216 and parameters: {'sigma': 1.6789985302883081, 'num_layers': 3, 'embed_size': 70, 'heads': 1, 'forward_expansion': 5, 'prenorm_on': False, 'mlp_scale_classification': 3, 'decoder_dropout': 0.2, 'class_drop': 0, 'learning_rate': 0.0007005582316395188, 'weight_decay': 1.484743048358772e-06}. Best is trial 88 with value: 0.8396178126599556.


Best trial: 88. Best value: 0.839618:  86%|████████▌ | 129/150 [9:57:32<1:23:19, 238.06s/it]

[I 2024-01-07 02:07:58,631] Trial 128 finished with value: 0.8298925098106126 and parameters: {'sigma': 2.3008782169630937, 'num_layers': 3, 'embed_size': 70, 'heads': 1, 'forward_expansion': 5, 'prenorm_on': False, 'mlp_scale_classification': 3, 'decoder_dropout': 0.1, 'class_drop': 0, 'learning_rate': 0.0009833305528577994, 'weight_decay': 1.2685014562953562e-06}. Best is trial 88 with value: 0.8396178126599556.


Best trial: 88. Best value: 0.839618:  87%|████████▋ | 130/150 [10:01:15<1:17:52, 233.62s/it]

[I 2024-01-07 02:11:41,889] Trial 129 finished with value: 0.8341579935164648 and parameters: {'sigma': 1.0784620296668235, 'num_layers': 3, 'embed_size': 70, 'heads': 1, 'forward_expansion': 5, 'prenorm_on': False, 'mlp_scale_classification': 3, 'decoder_dropout': 0.2, 'class_drop': 0, 'learning_rate': 0.0008592305096247397, 'weight_decay': 4.1646409121130935e-05}. Best is trial 88 with value: 0.8396178126599556.


Best trial: 88. Best value: 0.839618:  87%|████████▋ | 131/150 [10:06:30<1:21:42, 258.04s/it]

[I 2024-01-07 02:16:56,906] Trial 130 finished with value: 0.8191434908718649 and parameters: {'sigma': 2.017336015571903, 'num_layers': 3, 'embed_size': 350, 'heads': 1, 'forward_expansion': 4, 'prenorm_on': False, 'mlp_scale_classification': 4, 'decoder_dropout': 0.2, 'class_drop': 0, 'learning_rate': 0.0006091582917874355, 'weight_decay': 1.6786784454883367e-06}. Best is trial 88 with value: 0.8396178126599556.


Best trial: 88. Best value: 0.839618:  88%|████████▊ | 132/150 [10:10:14<1:14:21, 247.87s/it]

[I 2024-01-07 02:20:41,045] Trial 131 finished with value: 0.7561849513734857 and parameters: {'sigma': 0.0011456289080968287, 'num_layers': 3, 'embed_size': 70, 'heads': 1, 'forward_expansion': 5, 'prenorm_on': False, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0, 'learning_rate': 0.000753454939909742, 'weight_decay': 1.7672302651467986e-06}. Best is trial 88 with value: 0.8396178126599556.


Best trial: 88. Best value: 0.839618:  89%|████████▊ | 133/150 [10:14:04<1:08:43, 242.54s/it]

[I 2024-01-07 02:24:31,150] Trial 132 finished with value: 0.8305749872035488 and parameters: {'sigma': 1.4052474198161609, 'num_layers': 3, 'embed_size': 70, 'heads': 1, 'forward_expansion': 7, 'prenorm_on': False, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 0.0006597036838432825, 'weight_decay': 1.1112568498984492e-06}. Best is trial 88 with value: 0.8396178126599556.


Best trial: 88. Best value: 0.839618:  89%|████████▉ | 134/150 [10:17:57<1:03:53, 239.60s/it]

[I 2024-01-07 02:28:23,882] Trial 133 finished with value: 0.8341579935164648 and parameters: {'sigma': 2.518575986015412, 'num_layers': 3, 'embed_size': 70, 'heads': 1, 'forward_expansion': 5, 'prenorm_on': False, 'mlp_scale_classification': 5, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 0.0005386028601912577, 'weight_decay': 1.927509182600745e-05}. Best is trial 88 with value: 0.8396178126599556.


Best trial: 88. Best value: 0.839618:  90%|█████████ | 135/150 [10:21:33<58:06, 232.43s/it]  

[I 2024-01-07 02:31:59,571] Trial 134 finished with value: 0.8372291417846783 and parameters: {'sigma': 3.3214545354872094, 'num_layers': 3, 'embed_size': 60, 'heads': 1, 'forward_expansion': 6, 'prenorm_on': False, 'mlp_scale_classification': 3, 'decoder_dropout': 0.2, 'class_drop': 0, 'learning_rate': 8.114217143379185e-05, 'weight_decay': 2.4854108828085296e-05}. Best is trial 88 with value: 0.8396178126599556.


Best trial: 88. Best value: 0.839618:  91%|█████████ | 136/150 [10:25:08<53:00, 227.20s/it]

[I 2024-01-07 02:35:34,586] Trial 135 finished with value: 0.8217027810953762 and parameters: {'sigma': 0.00584500015163369, 'num_layers': 3, 'embed_size': 60, 'heads': 1, 'forward_expansion': 6, 'prenorm_on': False, 'mlp_scale_classification': 3, 'decoder_dropout': 0.2, 'class_drop': 0, 'learning_rate': 8.379593970420024e-05, 'weight_decay': 3.4568020082602524e-05}. Best is trial 88 with value: 0.8396178126599556.


Best trial: 88. Best value: 0.839618:  91%|█████████▏| 137/150 [10:28:44<48:29, 223.84s/it]

[I 2024-01-07 02:39:10,584] Trial 136 finished with value: 0.8275038389353353 and parameters: {'sigma': 1.7572753294526675, 'num_layers': 3, 'embed_size': 60, 'heads': 1, 'forward_expansion': 5, 'prenorm_on': False, 'mlp_scale_classification': 3, 'decoder_dropout': 0.2, 'class_drop': 0, 'learning_rate': 0.0005122528909715498, 'weight_decay': 2.0581668263247865e-06}. Best is trial 88 with value: 0.8396178126599556.


Best trial: 88. Best value: 0.839618:  92%|█████████▏| 138/150 [10:32:16<44:03, 220.28s/it]

[I 2024-01-07 02:42:42,546] Trial 137 finished with value: 0.8112950008530967 and parameters: {'sigma': 3.2714266521264657, 'num_layers': 3, 'embed_size': 60, 'heads': 1, 'forward_expansion': 6, 'prenorm_on': False, 'mlp_scale_classification': 3, 'decoder_dropout': 0, 'class_drop': 0, 'learning_rate': 7.61047471844481e-05, 'weight_decay': 2.5007888706887347e-05}. Best is trial 88 with value: 0.8396178126599556.


Best trial: 88. Best value: 0.839618:  93%|█████████▎| 139/150 [10:36:34<42:28, 231.71s/it]

[I 2024-01-07 02:47:00,917] Trial 138 finished with value: 0.8305749872035488 and parameters: {'sigma': 0.02547721241773181, 'num_layers': 3, 'embed_size': 200, 'heads': 1, 'forward_expansion': 4, 'prenorm_on': False, 'mlp_scale_classification': 4, 'decoder_dropout': 0.2, 'class_drop': 0, 'learning_rate': 4.8122390983577604e-05, 'weight_decay': 2.2065582608232806e-05}. Best is trial 88 with value: 0.8396178126599556.


Best trial: 88. Best value: 0.839618:  93%|█████████▎| 140/150 [10:42:44<45:31, 273.16s/it]

[I 2024-01-07 02:53:10,815] Trial 139 finished with value: 0.8123187169425012 and parameters: {'sigma': 2.1079615334227206, 'num_layers': 3, 'embed_size': 500, 'heads': 1, 'forward_expansion': 3, 'prenorm_on': False, 'mlp_scale_classification': 3, 'decoder_dropout': 0.2, 'class_drop': 0, 'learning_rate': 0.0006388695602282543, 'weight_decay': 9.626616485276828e-05}. Best is trial 88 with value: 0.8396178126599556.


Best trial: 88. Best value: 0.839618:  94%|█████████▍| 141/150 [10:46:58<40:07, 267.50s/it]

[I 2024-01-07 02:57:25,116] Trial 140 finished with value: 0.8283569356765057 and parameters: {'sigma': 1.2823053842097614, 'num_layers': 3, 'embed_size': 160, 'heads': 1, 'forward_expansion': 2, 'prenorm_on': False, 'mlp_scale_classification': 5, 'decoder_dropout': 0.1, 'class_drop': 0.1, 'learning_rate': 1.5343291913552198e-05, 'weight_decay': 0.00012651629259095664}. Best is trial 88 with value: 0.8396178126599556.


Best trial: 88. Best value: 0.839618:  95%|█████████▍| 142/150 [10:51:21<35:28, 266.06s/it]

[I 2024-01-07 03:01:47,794] Trial 141 finished with value: 0.8292100324176762 and parameters: {'sigma': 2.7666900072574387, 'num_layers': 4, 'embed_size': 70, 'heads': 10, 'forward_expansion': 7, 'prenorm_on': False, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 9.479251894891942e-05, 'weight_decay': 2.951615017224732e-05}. Best is trial 88 with value: 0.8396178126599556.


Best trial: 88. Best value: 0.839618:  95%|█████████▌| 143/150 [10:54:39<28:40, 245.74s/it]

[I 2024-01-07 03:05:06,135] Trial 142 finished with value: 0.8348404709094012 and parameters: {'sigma': 3.6033855182257795, 'num_layers': 2, 'embed_size': 70, 'heads': 1, 'forward_expansion': 7, 'prenorm_on': False, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 0.00010235703243465053, 'weight_decay': 2.217682092898413e-06}. Best is trial 88 with value: 0.8396178126599556.


Best trial: 88. Best value: 0.839618:  96%|█████████▌| 144/150 [10:59:07<25:13, 252.33s/it]

[I 2024-01-07 03:09:33,836] Trial 143 finished with value: 0.8261388841494626 and parameters: {'sigma': 3.623370010529247, 'num_layers': 4, 'embed_size': 100, 'heads': 1, 'forward_expansion': 7, 'prenorm_on': False, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 0.00010813442646639882, 'weight_decay': 2.5989144022377708e-06}. Best is trial 88 with value: 0.8396178126599556.


Best trial: 88. Best value: 0.839618:  97%|█████████▋| 145/150 [11:02:39<20:01, 240.23s/it]

[I 2024-01-07 03:13:05,821] Trial 144 finished with value: 0.8327930387305921 and parameters: {'sigma': 2.4455567009378325, 'num_layers': 2, 'embed_size': 140, 'heads': 1, 'forward_expansion': 7, 'prenorm_on': False, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0, 'learning_rate': 6.796442008170598e-05, 'weight_decay': 1.8073452660330332e-05}. Best is trial 88 with value: 0.8396178126599556.


Best trial: 88. Best value: 0.839618:  97%|█████████▋| 146/150 [11:06:29<15:49, 237.25s/it]

[I 2024-01-07 03:16:56,129] Trial 145 finished with value: 0.8302337485070806 and parameters: {'sigma': 3.025753584035052, 'num_layers': 3, 'embed_size': 90, 'heads': 1, 'forward_expansion': 6, 'prenorm_on': False, 'mlp_scale_classification': 3, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 0.00016907463576301487, 'weight_decay': 1.4466298587745976e-06}. Best is trial 88 with value: 0.8396178126599556.


Best trial: 88. Best value: 0.839618:  98%|█████████▊| 147/150 [11:10:26<11:51, 237.13s/it]

[I 2024-01-07 03:20:52,982] Trial 146 finished with value: 0.8259682648012284 and parameters: {'sigma': 1.8329298135702157, 'num_layers': 3, 'embed_size': 70, 'heads': 1, 'forward_expansion': 7, 'prenorm_on': True, 'mlp_scale_classification': 3, 'decoder_dropout': 0.5, 'class_drop': 0.2, 'learning_rate': 8.429192082667896e-05, 'weight_decay': 2.0587543143971832e-06}. Best is trial 88 with value: 0.8396178126599556.


Best trial: 88. Best value: 0.839618:  99%|█████████▊| 148/150 [11:14:43<08:05, 242.90s/it]

[I 2024-01-07 03:25:09,357] Trial 147 finished with value: 0.8377409998293807 and parameters: {'sigma': 4.133380161504655, 'num_layers': 4, 'embed_size': 60, 'heads': 1, 'forward_expansion': 7, 'prenorm_on': False, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0, 'learning_rate': 5.710637493314308e-05, 'weight_decay': 8.185557969002911e-05}. Best is trial 88 with value: 0.8396178126599556.


Best trial: 88. Best value: 0.839618:  99%|█████████▉| 149/150 [11:18:56<04:06, 246.15s/it]

[I 2024-01-07 03:29:23,080] Trial 148 finished with value: 0.8321105613376557 and parameters: {'sigma': 1.5077718540682061, 'num_layers': 4, 'embed_size': 60, 'heads': 1, 'forward_expansion': 5, 'prenorm_on': False, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0, 'learning_rate': 5.581621827450652e-05, 'weight_decay': 8.29446490943233e-05}. Best is trial 88 with value: 0.8396178126599556.


Best trial: 88. Best value: 0.839618: 100%|██████████| 150/150 [11:23:12<00:00, 273.28s/it]

[I 2024-01-07 03:33:38,858] Trial 149 finished with value: 0.8298925098106126 and parameters: {'sigma': 2.2423435489310974, 'num_layers': 4, 'embed_size': 60, 'heads': 1, 'forward_expansion': 4, 'prenorm_on': False, 'mlp_scale_classification': 8, 'decoder_dropout': 0.2, 'class_drop': 0, 'learning_rate': 7.396791421711799e-05, 'weight_decay': 6.651081700060878e-05}. Best is trial 88 with value: 0.8396178126599556.
Best Hyperparameters: {'sigma': 1.2680519381758064, 'num_layers': 4, 'embed_size': 500, 'heads': 1, 'forward_expansion': 5, 'prenorm_on': False, 'mlp_scale_classification': 5, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 0.00010502262647115873, 'weight_decay': 2.134300590129885e-05}
Best Validation Accuracy (at Early Stopping): 0.8396178126599556



