In [1]:
import sys
sys.path.insert(0, '/home/wdwatson2/projects/CAT-Transformer/model')
# sys.path.insert(0, r'C:\Users\smbm2\projects\CAT-Transformer\model')
# sys.path.insert(0, '/home/warin/projects/CAT-Transformer/model')
from testingModel import CATTransformer, MyFTTransformer, Combined_Dataset, train, test, EarlyStopping
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, f1_score, recall_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import optuna
from optuna.trial import TrialState

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device_in_use = 'cuda'
else:
    device_in_use = 'cpu'
print(device_in_use)

cuda


  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Directory holding the pre-split income dataset. Edit this single line to
# point the notebook at a different checkout.
DATA_DIR = '/home/wdwatson2/projects/CAT-Transformer/datasets/income'

df_train = pd.read_csv(f'{DATA_DIR}/train.csv')
df_test = pd.read_csv(f'{DATA_DIR}/test.csv')
df_val = pd.read_csv(f'{DATA_DIR}/validation.csv')

cont_columns = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss',
       'hours-per-week']
cat_columns = ['workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'sex', 'native-country']
cat_features = (10,16,7,16,6,5,2,43)
target = ['income']

# Sanity check (no need to touch): the declared feature/target names must be
# exactly the columns present in the training CSV.
yourlist = sorted(cont_columns + cat_columns + target)
oglist = sorted(df_train.columns)

assert yourlist == oglist, "You may of spelled feature name wrong or you forgot to put on of them in the list"

# NOTE(review): this overwrites the tuple assigned above with an empty one, so
# every later use of `cat_features` (e.g. `cat_feat=cat_features` in
# `objective`) sees `()`. Kept as-is because it may be a deliberate
# continuous-features-only experiment -- confirm before relying on it.
cat_features = ()

# Number of distinct target values, taken as the largest count seen in any split.
target_classes = [max(len(frame[target].value_counts()) for frame in (df_train, df_val, df_test))]
print(target_classes)

# Fit the scaler on the training split only, then apply the same transform to
# every split so validation/test statistics never influence the scaling.
scaler = StandardScaler()
scaler.fit(df_train[cont_columns])

df_train[cont_columns] = scaler.transform(df_train[cont_columns])
df_test[cont_columns] = scaler.transform(df_test[cont_columns])
df_val[cont_columns] = scaler.transform(df_val[cont_columns])

# Wrapping in Dataset
train_dataset = Combined_Dataset(df_train, cat_columns=cat_columns, num_columns=cont_columns, task1_column=target[0])
val_dataset = Combined_Dataset(df_val, cat_columns=cat_columns, num_columns=cont_columns, task1_column=target[0])
test_dataset = Combined_Dataset(df_test, cat_columns=cat_columns, num_columns=cont_columns, task1_column=target[0])

batch_size = 256

# Wrapping with DataLoader for easy batch extraction.
# Only the training loader shuffles; the evaluation loaders iterate in a fixed
# order so validation/test passes are repeatable.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

[2]


In [9]:
def objective(trial, num_epochs=400, patience=10):
    """Optuna objective: train one CATTransformer and score it on validation data.

    Samples a set of hyperparameters from ``trial``, builds a ``CATTransformer``
    on ``device_in_use``, and alternates one ``train`` pass over
    ``train_dataloader`` with one ``test`` pass over ``val_dataloader`` until
    either ``num_epochs`` epochs have run or ``EarlyStopping`` signals a stop.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        num_epochs: Upper bound on the number of training epochs (must be >= 1).
        patience: Value forwarded to ``EarlyStopping(patience=...)``.

    Returns:
        The validation accuracy reported by ``test`` for the last epoch that
        ran (i.e. the epoch at which training stopped).

    Raises:
        ValueError: If ``num_epochs`` is smaller than 1, since no validation
            accuracy would exist to return.
    """
    if num_epochs < 1:
        raise ValueError("num_epochs must be at least 1")

    # Define hyperparameters to search over.
    # The Optuna name of this parameter is 'sigma' although it is passed to the
    # model as `alpha`; the name is kept because it is the key that appears in
    # `study.best_params` and in existing logs.
    alpha = trial.suggest_categorical('sigma', [.001, 0.01, 0.1, .5, 1, 1.5, 2, 3, 4, 5])
    num_layers = trial.suggest_int('num_layers', 1, 2)
    # Every embed_size option is a multiple of 10, so it is divisible by each
    # `heads` option (1, 5, 10) -- presumably required by the model's
    # multi-head attention; verify against CATTransformer.
    embed_size = trial.suggest_categorical("embed_size", [50, 60, 70, 80, 90, 100, 120, 140, 160, 180, 200, 250, 350, 500])
    heads = trial.suggest_categorical("heads", [1, 5, 10])
    forward_expansion = trial.suggest_int('forward_expansion', 1, 8)
    pre_norm_on = trial.suggest_categorical('prenorm_on', [True, False])
    mlp_scale_classification = trial.suggest_int('mlp_scale_classification', 1, 8)
    decoder_dropout = trial.suggest_categorical('decoder_dropout', [0,.1,.2,.5])
    classification_dropout = trial.suggest_categorical('class_drop', [0,.1,.2,.5])

    learning_rate = trial.suggest_categorical('learning_rate', [0.00001, 0.0001, 0.001, 0.01, 0.1])

    # Create the model with the sampled hyperparameters
    model = CATTransformer(alpha=alpha,
                           embed_size=embed_size,
                           n_cont=len(cont_columns),
                           cat_feat=cat_features,
                           num_layers=num_layers,
                           heads=heads,
                           forward_expansion=forward_expansion,
                           decoder_dropout=decoder_dropout,
                           classification_dropout=classification_dropout,
                           pre_norm_on=pre_norm_on,
                           mlp_scale_classification=mlp_scale_classification,
                           targets_classes=target_classes,
                           ).to(device_in_use)

    # Define loss function and optimizer
    loss_function = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Early stopping is driven by the validation accuracy of each epoch
    early_stopping = EarlyStopping(patience=patience)

    # Train for up to `num_epochs` epochs, validating after each one
    for epoch in range(num_epochs):
        # Training metrics are not needed for the objective value
        train(regression_on=False,
              get_attn=False,
              dataloader=train_dataloader,
              model=model,
              loss_function=loss_function,
              optimizer=optimizer,
              device_in_use=device_in_use)
        _, val_acc = test(regression_on=False,
                          get_attn=False,
                          dataloader=val_dataloader,
                          model=model,
                          loss_function=loss_function,
                          device_in_use=device_in_use)
        # Stop as soon as the early-stopping helper says so
        if early_stopping(val_acc):
            break

    # The objective to maximize is the validation accuracy of the last epoch run
    return val_acc

In [10]:
# How many hyperparameter configurations Optuna will evaluate
num_trials = 75

# In-memory study that maximizes the value returned by `objective`
# (the validation accuracy)
study = optuna.create_study(direction='maximize')

# Run the search, showing a progress bar while trials execute
study.optimize(
    objective,
    n_trials=num_trials,
    show_progress_bar=True,
)

# Pull out the winning configuration together with its score
best_params, best_val_accuracy = study.best_params, study.best_value

print("Best Hyperparameters:", best_params)
print("Best Validation Accuracy (at Early Stopping):", best_val_accuracy)

[I 2024-01-03 19:23:24,787] A new study created in memory with name: no-name-280ea317-c6c8-4c4e-93b7-0ac1fca2c8af
Best trial: 0. Best value: 0.832793:   1%|▏         | 1/75 [02:29<3:04:28, 149.57s/it]

[I 2024-01-03 19:25:54,360] Trial 0 finished with value: 0.8327930387305921 and parameters: {'sigma': 1.5, 'num_layers': 1, 'embed_size': 100, 'heads': 1, 'forward_expansion': 2, 'prenorm_on': True, 'mlp_scale_classification': 3, 'decoder_dropout': 0.2, 'class_drop': 0.5, 'learning_rate': 0.001}. Best is trial 0 with value: 0.8327930387305921.


Best trial: 0. Best value: 0.832793:   3%|▎         | 2/75 [06:29<4:06:51, 202.90s/it]

[I 2024-01-03 19:29:54,583] Trial 1 finished with value: 0.7858727179662174 and parameters: {'sigma': 4, 'num_layers': 1, 'embed_size': 500, 'heads': 1, 'forward_expansion': 1, 'prenorm_on': True, 'mlp_scale_classification': 8, 'decoder_dropout': 0.1, 'class_drop': 0.1, 'learning_rate': 0.001}. Best is trial 0 with value: 0.8327930387305921.


Best trial: 0. Best value: 0.832793:   4%|▍         | 3/75 [09:37<3:55:12, 196.01s/it]

[I 2024-01-03 19:33:02,395] Trial 2 finished with value: 0.8230677358812489 and parameters: {'sigma': 3, 'num_layers': 2, 'embed_size': 120, 'heads': 1, 'forward_expansion': 5, 'prenorm_on': True, 'mlp_scale_classification': 1, 'decoder_dropout': 0.2, 'class_drop': 0.1, 'learning_rate': 0.0001}. Best is trial 0 with value: 0.8327930387305921.


Best trial: 0. Best value: 0.832793:   5%|▌         | 4/75 [12:10<3:31:50, 179.03s/it]

[I 2024-01-03 19:35:35,387] Trial 3 finished with value: 0.8275038389353353 and parameters: {'sigma': 1, 'num_layers': 1, 'embed_size': 70, 'heads': 5, 'forward_expansion': 7, 'prenorm_on': False, 'mlp_scale_classification': 4, 'decoder_dropout': 0.2, 'class_drop': 0.5, 'learning_rate': 0.001}. Best is trial 0 with value: 0.8327930387305921.


Best trial: 0. Best value: 0.832793:   7%|▋         | 5/75 [14:45<3:18:40, 170.30s/it]

[I 2024-01-03 19:38:10,212] Trial 4 finished with value: 0.8235795939259511 and parameters: {'sigma': 5, 'num_layers': 1, 'embed_size': 90, 'heads': 10, 'forward_expansion': 1, 'prenorm_on': False, 'mlp_scale_classification': 1, 'decoder_dropout': 0.1, 'class_drop': 0, 'learning_rate': 0.01}. Best is trial 0 with value: 0.8327930387305921.


Best trial: 0. Best value: 0.832793:   8%|▊         | 6/75 [18:12<3:30:11, 182.78s/it]

[I 2024-01-03 19:41:37,211] Trial 5 finished with value: 0.8300631291588466 and parameters: {'sigma': 0.1, 'num_layers': 1, 'embed_size': 350, 'heads': 1, 'forward_expansion': 5, 'prenorm_on': True, 'mlp_scale_classification': 3, 'decoder_dropout': 0, 'class_drop': 0, 'learning_rate': 1e-05}. Best is trial 0 with value: 0.8327930387305921.


Best trial: 0. Best value: 0.832793:   9%|▉         | 7/75 [20:40<3:14:06, 171.27s/it]

[I 2024-01-03 19:44:04,803] Trial 6 finished with value: 0.821361542398908 and parameters: {'sigma': 2, 'num_layers': 1, 'embed_size': 70, 'heads': 5, 'forward_expansion': 1, 'prenorm_on': False, 'mlp_scale_classification': 7, 'decoder_dropout': 0.1, 'class_drop': 0.5, 'learning_rate': 0.001}. Best is trial 0 with value: 0.8327930387305921.


Best trial: 0. Best value: 0.832793:  11%|█         | 8/75 [23:27<3:10:03, 170.20s/it]

[I 2024-01-03 19:46:52,699] Trial 7 finished with value: 0.8281863163282717 and parameters: {'sigma': 0.1, 'num_layers': 1, 'embed_size': 140, 'heads': 1, 'forward_expansion': 4, 'prenorm_on': True, 'mlp_scale_classification': 6, 'decoder_dropout': 0.5, 'class_drop': 0.2, 'learning_rate': 0.001}. Best is trial 0 with value: 0.8327930387305921.


Best trial: 0. Best value: 0.832793:  12%|█▏        | 9/75 [26:33<3:12:34, 175.07s/it]

[I 2024-01-03 19:49:58,468] Trial 8 finished with value: 0.8286981743729739 and parameters: {'sigma': 1, 'num_layers': 2, 'embed_size': 160, 'heads': 1, 'forward_expansion': 7, 'prenorm_on': True, 'mlp_scale_classification': 7, 'decoder_dropout': 0.5, 'class_drop': 0.1, 'learning_rate': 0.001}. Best is trial 0 with value: 0.8327930387305921.


Best trial: 0. Best value: 0.832793:  13%|█▎        | 10/75 [29:39<3:13:09, 178.31s/it]

[I 2024-01-03 19:53:04,027] Trial 9 finished with value: 0.8188022521753967 and parameters: {'sigma': 2, 'num_layers': 2, 'embed_size': 140, 'heads': 1, 'forward_expansion': 4, 'prenorm_on': True, 'mlp_scale_classification': 3, 'decoder_dropout': 0.1, 'class_drop': 0.5, 'learning_rate': 0.001}. Best is trial 0 with value: 0.8327930387305921.


Best trial: 0. Best value: 0.832793:  15%|█▍        | 11/75 [32:35<3:09:32, 177.69s/it]

[I 2024-01-03 19:56:00,333] Trial 10 finished with value: 0.8314280839447193 and parameters: {'sigma': 1.5, 'num_layers': 2, 'embed_size': 100, 'heads': 10, 'forward_expansion': 3, 'prenorm_on': False, 'mlp_scale_classification': 5, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 0.1}. Best is trial 0 with value: 0.8327930387305921.


Best trial: 0. Best value: 0.832793:  16%|█▌        | 12/75 [35:31<3:06:09, 177.30s/it]

[I 2024-01-03 19:58:56,719] Trial 11 finished with value: 0.8252857874082921 and parameters: {'sigma': 1.5, 'num_layers': 2, 'embed_size': 100, 'heads': 10, 'forward_expansion': 3, 'prenorm_on': False, 'mlp_scale_classification': 5, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 0.1}. Best is trial 0 with value: 0.8327930387305921.


Best trial: 0. Best value: 0.832793:  17%|█▋        | 13/75 [38:26<3:02:23, 176.51s/it]

[I 2024-01-03 20:01:51,435] Trial 12 finished with value: 0.8228971165330149 and parameters: {'sigma': 1.5, 'num_layers': 2, 'embed_size': 100, 'heads': 10, 'forward_expansion': 3, 'prenorm_on': False, 'mlp_scale_classification': 3, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 0.1}. Best is trial 0 with value: 0.8327930387305921.


Best trial: 0. Best value: 0.832793:  19%|█▊        | 14/75 [41:07<2:54:34, 171.71s/it]

[I 2024-01-03 20:04:32,035] Trial 13 finished with value: 0.8297218904623784 and parameters: {'sigma': 1.5, 'num_layers': 2, 'embed_size': 50, 'heads': 10, 'forward_expansion': 2, 'prenorm_on': False, 'mlp_scale_classification': 5, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 0.1}. Best is trial 0 with value: 0.8327930387305921.


Best trial: 0. Best value: 0.832793:  20%|██        | 15/75 [43:45<2:47:42, 167.71s/it]

[I 2024-01-03 20:07:10,478] Trial 14 finished with value: 0.8104419041119263 and parameters: {'sigma': 0.001, 'num_layers': 1, 'embed_size': 100, 'heads': 10, 'forward_expansion': 3, 'prenorm_on': True, 'mlp_scale_classification': 2, 'decoder_dropout': 0, 'class_drop': 0.5, 'learning_rate': 1e-05}. Best is trial 0 with value: 0.8327930387305921.


Best trial: 0. Best value: 0.832793:  21%|██▏       | 16/75 [46:34<2:45:10, 167.97s/it]

[I 2024-01-03 20:09:59,057] Trial 15 finished with value: 0.8222146391400785 and parameters: {'sigma': 0.01, 'num_layers': 2, 'embed_size': 60, 'heads': 5, 'forward_expansion': 2, 'prenorm_on': False, 'mlp_scale_classification': 4, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 0.0001}. Best is trial 0 with value: 0.8327930387305921.


Best trial: 0. Best value: 0.832793:  23%|██▎       | 17/75 [50:00<2:53:24, 179.38s/it]

[I 2024-01-03 20:13:24,979] Trial 16 finished with value: 0.8293806517659102 and parameters: {'sigma': 0.5, 'num_layers': 2, 'embed_size': 250, 'heads': 10, 'forward_expansion': 6, 'prenorm_on': True, 'mlp_scale_classification': 6, 'decoder_dropout': 0.2, 'class_drop': 0.5, 'learning_rate': 0.01}. Best is trial 0 with value: 0.8327930387305921.


Best trial: 0. Best value: 0.832793:  24%|██▍       | 18/75 [53:02<2:51:06, 180.12s/it]

[I 2024-01-03 20:16:26,819] Trial 17 finished with value: 0.8153898652107149 and parameters: {'sigma': 1.5, 'num_layers': 1, 'embed_size': 180, 'heads': 10, 'forward_expansion': 2, 'prenorm_on': False, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0, 'learning_rate': 0.1}. Best is trial 0 with value: 0.8327930387305921.


Best trial: 0. Best value: 0.832793:  25%|██▌       | 19/75 [55:27<2:38:21, 169.68s/it]

[I 2024-01-03 20:18:52,170] Trial 18 finished with value: 0.7561849513734857 and parameters: {'sigma': 1.5, 'num_layers': 1, 'embed_size': 200, 'heads': 1, 'forward_expansion': 8, 'prenorm_on': False, 'mlp_scale_classification': 4, 'decoder_dropout': 0, 'class_drop': 0.2, 'learning_rate': 0.1}. Best is trial 0 with value: 0.8327930387305921.


Best trial: 0. Best value: 0.832793:  27%|██▋       | 20/75 [58:30<2:39:18, 173.79s/it]

[I 2024-01-03 20:21:55,553] Trial 19 finished with value: 0.8217027810953762 and parameters: {'sigma': 0.01, 'num_layers': 2, 'embed_size': 100, 'heads': 5, 'forward_expansion': 3, 'prenorm_on': True, 'mlp_scale_classification': 6, 'decoder_dropout': 0.5, 'class_drop': 0.5, 'learning_rate': 0.01}. Best is trial 0 with value: 0.8327930387305921.


Best trial: 0. Best value: 0.832793:  28%|██▊       | 21/75 [1:01:31<2:38:22, 175.96s/it]

[I 2024-01-03 20:24:56,581] Trial 20 finished with value: 0.8266507421941648 and parameters: {'sigma': 4, 'num_layers': 2, 'embed_size': 80, 'heads': 10, 'forward_expansion': 4, 'prenorm_on': True, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 1e-05}. Best is trial 0 with value: 0.8327930387305921.


Best trial: 0. Best value: 0.832793:  29%|██▉       | 22/75 [1:05:00<2:44:12, 185.90s/it]

[I 2024-01-03 20:28:25,648] Trial 21 finished with value: 0.8297218904623784 and parameters: {'sigma': 0.1, 'num_layers': 1, 'embed_size': 350, 'heads': 1, 'forward_expansion': 5, 'prenorm_on': True, 'mlp_scale_classification': 3, 'decoder_dropout': 0, 'class_drop': 0, 'learning_rate': 1e-05}. Best is trial 0 with value: 0.8327930387305921.


Best trial: 0. Best value: 0.832793:  31%|███       | 23/75 [1:08:30<2:47:18, 193.04s/it]

[I 2024-01-03 20:31:55,357] Trial 22 finished with value: 0.830745606551783 and parameters: {'sigma': 0.1, 'num_layers': 1, 'embed_size': 350, 'heads': 1, 'forward_expansion': 5, 'prenorm_on': True, 'mlp_scale_classification': 3, 'decoder_dropout': 0, 'class_drop': 0, 'learning_rate': 1e-05}. Best is trial 0 with value: 0.8327930387305921.


Best trial: 0. Best value: 0.832793:  32%|███▏      | 24/75 [1:12:01<2:48:46, 198.55s/it]

[I 2024-01-03 20:35:26,764] Trial 23 finished with value: 0.8193141102200989 and parameters: {'sigma': 0.001, 'num_layers': 1, 'embed_size': 350, 'heads': 1, 'forward_expansion': 6, 'prenorm_on': True, 'mlp_scale_classification': 5, 'decoder_dropout': 0, 'class_drop': 0, 'learning_rate': 1e-05}. Best is trial 0 with value: 0.8327930387305921.


Best trial: 0. Best value: 0.832793:  33%|███▎      | 25/75 [1:14:40<2:35:30, 186.62s/it]

[I 2024-01-03 20:38:05,528] Trial 24 finished with value: 0.8112950008530967 and parameters: {'sigma': 3, 'num_layers': 1, 'embed_size': 100, 'heads': 1, 'forward_expansion': 4, 'prenorm_on': True, 'mlp_scale_classification': 4, 'decoder_dropout': 0, 'class_drop': 0, 'learning_rate': 0.0001}. Best is trial 0 with value: 0.8327930387305921.


Best trial: 0. Best value: 0.832793:  35%|███▍      | 26/75 [1:17:10<2:23:28, 175.68s/it]

[I 2024-01-03 20:40:35,708] Trial 25 finished with value: 0.7561849513734857 and parameters: {'sigma': 5, 'num_layers': 1, 'embed_size': 80, 'heads': 1, 'forward_expansion': 2, 'prenorm_on': True, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0, 'learning_rate': 0.1}. Best is trial 0 with value: 0.8327930387305921.


Best trial: 0. Best value: 0.832793:  36%|███▌      | 27/75 [1:20:06<2:20:34, 175.73s/it]

[I 2024-01-03 20:43:31,530] Trial 26 finished with value: 0.8280156969800375 and parameters: {'sigma': 0.5, 'num_layers': 1, 'embed_size': 200, 'heads': 1, 'forward_expansion': 3, 'prenorm_on': False, 'mlp_scale_classification': 3, 'decoder_dropout': 0, 'class_drop': 0.5, 'learning_rate': 1e-05}. Best is trial 0 with value: 0.8327930387305921.


Best trial: 0. Best value: 0.832793:  37%|███▋      | 28/75 [1:22:52<2:15:15, 172.67s/it]

[I 2024-01-03 20:46:17,057] Trial 27 finished with value: 0.8249445487118239 and parameters: {'sigma': 0.1, 'num_layers': 1, 'embed_size': 120, 'heads': 1, 'forward_expansion': 6, 'prenorm_on': True, 'mlp_scale_classification': 5, 'decoder_dropout': 0.5, 'class_drop': 0.1, 'learning_rate': 1e-05}. Best is trial 0 with value: 0.8327930387305921.


Best trial: 0. Best value: 0.832793:  39%|███▊      | 29/75 [1:26:30<2:22:58, 186.48s/it]

[I 2024-01-03 20:49:55,779] Trial 28 finished with value: 0.7561849513734857 and parameters: {'sigma': 1.5, 'num_layers': 2, 'embed_size': 500, 'heads': 5, 'forward_expansion': 2, 'prenorm_on': False, 'mlp_scale_classification': 4, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 0.1}. Best is trial 0 with value: 0.8327930387305921.


Best trial: 0. Best value: 0.832793:  40%|████      | 30/75 [1:29:38<2:20:00, 186.68s/it]

[I 2024-01-03 20:53:02,912] Trial 29 finished with value: 0.7872376727520901 and parameters: {'sigma': 4, 'num_layers': 1, 'embed_size': 250, 'heads': 10, 'forward_expansion': 1, 'prenorm_on': True, 'mlp_scale_classification': 7, 'decoder_dropout': 0.1, 'class_drop': 0.1, 'learning_rate': 0.001}. Best is trial 0 with value: 0.8327930387305921.


Best trial: 0. Best value: 0.832793:  41%|████▏     | 31/75 [1:32:29<2:13:35, 182.17s/it]

[I 2024-01-03 20:55:54,566] Trial 30 finished with value: 0.8010578399590513 and parameters: {'sigma': 1.5, 'num_layers': 1, 'embed_size': 160, 'heads': 1, 'forward_expansion': 5, 'prenorm_on': True, 'mlp_scale_classification': 8, 'decoder_dropout': 0, 'class_drop': 0, 'learning_rate': 0.001}. Best is trial 0 with value: 0.8327930387305921.


Best trial: 0. Best value: 0.832793:  43%|████▎     | 32/75 [1:35:58<2:16:15, 190.12s/it]

[I 2024-01-03 20:59:23,242] Trial 31 finished with value: 0.8290394130694421 and parameters: {'sigma': 0.1, 'num_layers': 1, 'embed_size': 350, 'heads': 1, 'forward_expansion': 5, 'prenorm_on': True, 'mlp_scale_classification': 3, 'decoder_dropout': 0, 'class_drop': 0, 'learning_rate': 1e-05}. Best is trial 0 with value: 0.8327930387305921.


Best trial: 32. Best value: 0.833134:  44%|████▍     | 33/75 [1:39:27<2:17:02, 195.77s/it]

[I 2024-01-03 21:02:52,183] Trial 32 finished with value: 0.8331342774270603 and parameters: {'sigma': 0.1, 'num_layers': 1, 'embed_size': 350, 'heads': 1, 'forward_expansion': 5, 'prenorm_on': True, 'mlp_scale_classification': 1, 'decoder_dropout': 0, 'class_drop': 0, 'learning_rate': 1e-05}. Best is trial 32 with value: 0.8331342774270603.


Best trial: 32. Best value: 0.833134:  45%|████▌     | 34/75 [1:42:57<2:16:46, 200.16s/it]

[I 2024-01-03 21:06:22,595] Trial 33 finished with value: 0.830916225900017 and parameters: {'sigma': 0.1, 'num_layers': 1, 'embed_size': 350, 'heads': 1, 'forward_expansion': 6, 'prenorm_on': True, 'mlp_scale_classification': 1, 'decoder_dropout': 0, 'class_drop': 0, 'learning_rate': 1e-05}. Best is trial 32 with value: 0.8331342774270603.


Best trial: 32. Best value: 0.833134:  47%|████▋     | 35/75 [1:46:09<2:11:48, 197.72s/it]

[I 2024-01-03 21:09:34,617] Trial 34 finished with value: 0.8111243815048627 and parameters: {'sigma': 3, 'num_layers': 1, 'embed_size': 180, 'heads': 1, 'forward_expansion': 7, 'prenorm_on': True, 'mlp_scale_classification': 1, 'decoder_dropout': 0.2, 'class_drop': 0, 'learning_rate': 0.0001}. Best is trial 32 with value: 0.8331342774270603.


Best trial: 32. Best value: 0.833134:  48%|████▊     | 36/75 [1:48:54<2:01:59, 187.68s/it]

[I 2024-01-03 21:12:18,872] Trial 35 finished with value: 0.8252857874082921 and parameters: {'sigma': 0.1, 'num_layers': 1, 'embed_size': 90, 'heads': 1, 'forward_expansion': 6, 'prenorm_on': True, 'mlp_scale_classification': 1, 'decoder_dropout': 0, 'class_drop': 0.5, 'learning_rate': 1e-05}. Best is trial 32 with value: 0.8331342774270603.


Best trial: 32. Best value: 0.833134:  49%|████▉     | 37/75 [1:51:40<1:54:51, 181.35s/it]

[I 2024-01-03 21:15:05,466] Trial 36 finished with value: 0.7561849513734857 and parameters: {'sigma': 1, 'num_layers': 1, 'embed_size': 350, 'heads': 1, 'forward_expansion': 7, 'prenorm_on': False, 'mlp_scale_classification': 1, 'decoder_dropout': 0.2, 'class_drop': 0, 'learning_rate': 0.01}. Best is trial 32 with value: 0.8331342774270603.


Best trial: 32. Best value: 0.833134:  51%|█████     | 38/75 [1:54:20<1:47:50, 174.88s/it]

[I 2024-01-03 21:17:45,236] Trial 37 finished with value: 0.80225217539669 and parameters: {'sigma': 5, 'num_layers': 1, 'embed_size': 70, 'heads': 5, 'forward_expansion': 4, 'prenorm_on': True, 'mlp_scale_classification': 1, 'decoder_dropout': 0.1, 'class_drop': 0.1, 'learning_rate': 0.001}. Best is trial 32 with value: 0.8331342774270603.


Best trial: 32. Best value: 0.833134:  52%|█████▏    | 39/75 [1:56:56<1:41:28, 169.12s/it]

[I 2024-01-03 21:20:20,919] Trial 38 finished with value: 0.8286981743729739 and parameters: {'sigma': 0.1, 'num_layers': 1, 'embed_size': 60, 'heads': 1, 'forward_expansion': 1, 'prenorm_on': True, 'mlp_scale_classification': 2, 'decoder_dropout': 0, 'class_drop': 0.5, 'learning_rate': 1e-05}. Best is trial 32 with value: 0.8331342774270603.


Best trial: 32. Best value: 0.833134:  53%|█████▎    | 40/75 [1:59:31<1:36:10, 164.86s/it]

[I 2024-01-03 21:22:55,840] Trial 39 finished with value: 0.8305749872035488 and parameters: {'sigma': 2, 'num_layers': 1, 'embed_size': 50, 'heads': 10, 'forward_expansion': 8, 'prenorm_on': False, 'mlp_scale_classification': 1, 'decoder_dropout': 0.5, 'class_drop': 0, 'learning_rate': 0.001}. Best is trial 32 with value: 0.8331342774270603.


Best trial: 32. Best value: 0.833134:  55%|█████▍    | 41/75 [2:02:38<1:37:17, 171.70s/it]

[I 2024-01-03 21:26:03,515] Trial 40 finished with value: 0.828868793721208 and parameters: {'sigma': 1.5, 'num_layers': 2, 'embed_size': 100, 'heads': 1, 'forward_expansion': 6, 'prenorm_on': True, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 0.0001}. Best is trial 32 with value: 0.8331342774270603.


Best trial: 32. Best value: 0.833134:  56%|█████▌    | 42/75 [2:06:11<1:41:11, 183.98s/it]

[I 2024-01-03 21:29:36,141] Trial 41 finished with value: 0.830916225900017 and parameters: {'sigma': 0.1, 'num_layers': 1, 'embed_size': 350, 'heads': 1, 'forward_expansion': 5, 'prenorm_on': True, 'mlp_scale_classification': 1, 'decoder_dropout': 0, 'class_drop': 0, 'learning_rate': 1e-05}. Best is trial 32 with value: 0.8331342774270603.


Best trial: 32. Best value: 0.833134:  57%|█████▋    | 43/75 [2:09:45<1:42:58, 193.09s/it]

[I 2024-01-03 21:33:10,483] Trial 42 finished with value: 0.8302337485070806 and parameters: {'sigma': 0.1, 'num_layers': 1, 'embed_size': 350, 'heads': 1, 'forward_expansion': 5, 'prenorm_on': True, 'mlp_scale_classification': 1, 'decoder_dropout': 0, 'class_drop': 0, 'learning_rate': 1e-05}. Best is trial 32 with value: 0.8331342774270603.


Best trial: 32. Best value: 0.833134:  59%|█████▊    | 44/75 [2:13:14<1:42:07, 197.67s/it]

[I 2024-01-03 21:36:38,849] Trial 43 finished with value: 0.8305749872035488 and parameters: {'sigma': 0.1, 'num_layers': 1, 'embed_size': 350, 'heads': 1, 'forward_expansion': 4, 'prenorm_on': True, 'mlp_scale_classification': 1, 'decoder_dropout': 0, 'class_drop': 0, 'learning_rate': 1e-05}. Best is trial 32 with value: 0.8331342774270603.


Best trial: 32. Best value: 0.833134:  60%|██████    | 45/75 [2:17:16<1:45:30, 211.02s/it]

[I 2024-01-03 21:40:41,029] Trial 44 finished with value: 0.8292100324176762 and parameters: {'sigma': 0.1, 'num_layers': 1, 'embed_size': 500, 'heads': 1, 'forward_expansion': 6, 'prenorm_on': True, 'mlp_scale_classification': 2, 'decoder_dropout': 0, 'class_drop': 0, 'learning_rate': 1e-05}. Best is trial 32 with value: 0.8331342774270603.


Best trial: 32. Best value: 0.833134:  61%|██████▏   | 46/75 [2:20:07<1:36:14, 199.11s/it]

[I 2024-01-03 21:43:32,343] Trial 45 finished with value: 0.8280156969800375 and parameters: {'sigma': 1, 'num_layers': 1, 'embed_size': 100, 'heads': 1, 'forward_expansion': 7, 'prenorm_on': True, 'mlp_scale_classification': 1, 'decoder_dropout': 0.1, 'class_drop': 0.5, 'learning_rate': 0.001}. Best is trial 32 with value: 0.8331342774270603.


Best trial: 32. Best value: 0.833134:  63%|██████▎   | 47/75 [2:22:47<1:27:28, 187.44s/it]

[I 2024-01-03 21:46:12,553] Trial 46 finished with value: 0.7561849513734857 and parameters: {'sigma': 0.001, 'num_layers': 1, 'embed_size': 140, 'heads': 10, 'forward_expansion': 3, 'prenorm_on': False, 'mlp_scale_classification': 2, 'decoder_dropout': 0, 'class_drop': 0, 'learning_rate': 0.1}. Best is trial 32 with value: 0.8331342774270603.


Best trial: 32. Best value: 0.833134:  64%|██████▍   | 48/75 [2:25:56<1:24:29, 187.76s/it]

[I 2024-01-03 21:49:21,054] Trial 47 finished with value: 0.8194847295683331 and parameters: {'sigma': 0.01, 'num_layers': 2, 'embed_size': 90, 'heads': 5, 'forward_expansion': 5, 'prenorm_on': True, 'mlp_scale_classification': 6, 'decoder_dropout': 0.2, 'class_drop': 0.2, 'learning_rate': 1e-05}. Best is trial 32 with value: 0.8331342774270603.


Best trial: 32. Best value: 0.833134:  65%|██████▌   | 49/75 [2:28:33<1:17:26, 178.71s/it]

[I 2024-01-03 21:51:58,646] Trial 48 finished with value: 0.8228971165330149 and parameters: {'sigma': 1.5, 'num_layers': 1, 'embed_size': 120, 'heads': 1, 'forward_expansion': 4, 'prenorm_on': False, 'mlp_scale_classification': 1, 'decoder_dropout': 0, 'class_drop': 0.1, 'learning_rate': 0.01}. Best is trial 32 with value: 0.8331342774270603.


Best trial: 32. Best value: 0.833134:  67%|██████▋   | 50/75 [2:32:53<1:24:34, 202.98s/it]

[I 2024-01-03 21:56:18,263] Trial 49 finished with value: 0.8090769493260536 and parameters: {'sigma': 2, 'num_layers': 2, 'embed_size': 350, 'heads': 10, 'forward_expansion': 1, 'prenorm_on': True, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0.5, 'learning_rate': 0.1}. Best is trial 32 with value: 0.8331342774270603.


Best trial: 32. Best value: 0.833134:  68%|██████▊   | 51/75 [2:35:31<1:15:46, 189.44s/it]

[I 2024-01-03 21:58:56,115] Trial 50 finished with value: 0.8300631291588466 and parameters: {'sigma': 0.5, 'num_layers': 1, 'embed_size': 100, 'heads': 1, 'forward_expansion': 2, 'prenorm_on': False, 'mlp_scale_classification': 4, 'decoder_dropout': 0.5, 'class_drop': 0.2, 'learning_rate': 0.001}. Best is trial 32 with value: 0.8331342774270603.


Best trial: 32. Best value: 0.833134:  69%|██████▉   | 52/75 [2:39:05<1:15:29, 196.94s/it]

[I 2024-01-03 22:02:30,553] Trial 51 finished with value: 0.8300631291588466 and parameters: {'sigma': 0.1, 'num_layers': 1, 'embed_size': 350, 'heads': 1, 'forward_expansion': 5, 'prenorm_on': True, 'mlp_scale_classification': 3, 'decoder_dropout': 0, 'class_drop': 0, 'learning_rate': 1e-05}. Best is trial 32 with value: 0.8331342774270603.


Best trial: 32. Best value: 0.833134:  71%|███████   | 53/75 [2:42:40<1:14:11, 202.35s/it]

[I 2024-01-03 22:06:05,517] Trial 52 finished with value: 0.8298925098106126 and parameters: {'sigma': 0.1, 'num_layers': 1, 'embed_size': 350, 'heads': 1, 'forward_expansion': 5, 'prenorm_on': True, 'mlp_scale_classification': 5, 'decoder_dropout': 0, 'class_drop': 0, 'learning_rate': 1e-05}. Best is trial 32 with value: 0.8331342774270603.


Best trial: 32. Best value: 0.833134:  72%|███████▏  | 54/75 [2:46:18<1:12:24, 206.88s/it]

[I 2024-01-03 22:09:42,984] Trial 53 finished with value: 0.8305749872035488 and parameters: {'sigma': 0.1, 'num_layers': 1, 'embed_size': 350, 'heads': 1, 'forward_expansion': 6, 'prenorm_on': True, 'mlp_scale_classification': 2, 'decoder_dropout': 0, 'class_drop': 0, 'learning_rate': 1e-05}. Best is trial 32 with value: 0.8331342774270603.


Best trial: 32. Best value: 0.833134:  73%|███████▎  | 55/75 [2:49:15<1:05:57, 197.87s/it]

[I 2024-01-03 22:12:39,827] Trial 54 finished with value: 0.8304043678553148 and parameters: {'sigma': 0.1, 'num_layers': 1, 'embed_size': 160, 'heads': 1, 'forward_expansion': 5, 'prenorm_on': True, 'mlp_scale_classification': 3, 'decoder_dropout': 0, 'class_drop': 0, 'learning_rate': 1e-05}. Best is trial 32 with value: 0.8331342774270603.


Best trial: 32. Best value: 0.833134:  75%|███████▍  | 56/75 [2:51:56<59:10, 186.87s/it]  

[I 2024-01-03 22:15:21,029] Trial 55 finished with value: 0.8271626002388671 and parameters: {'sigma': 4, 'num_layers': 1, 'embed_size': 70, 'heads': 10, 'forward_expansion': 4, 'prenorm_on': True, 'mlp_scale_classification': 4, 'decoder_dropout': 0, 'class_drop': 0, 'learning_rate': 1e-05}. Best is trial 32 with value: 0.8331342774270603.


Best trial: 32. Best value: 0.833134:  76%|███████▌  | 57/75 [2:55:32<58:44, 195.83s/it]

[I 2024-01-03 22:18:57,754] Trial 56 finished with value: 0.7561849513734857 and parameters: {'sigma': 1.5, 'num_layers': 1, 'embed_size': 350, 'heads': 1, 'forward_expansion': 6, 'prenorm_on': True, 'mlp_scale_classification': 5, 'decoder_dropout': 0.2, 'class_drop': 0, 'learning_rate': 0.1}. Best is trial 32 with value: 0.8331342774270603.


Best trial: 32. Best value: 0.833134:  77%|███████▋  | 58/75 [2:58:04<51:43, 182.58s/it]

[I 2024-01-03 22:21:29,427] Trial 57 finished with value: 0.8298925098106126 and parameters: {'sigma': 3, 'num_layers': 1, 'embed_size': 50, 'heads': 1, 'forward_expansion': 3, 'prenorm_on': True, 'mlp_scale_classification': 1, 'decoder_dropout': 0, 'class_drop': 0.5, 'learning_rate': 1e-05}. Best is trial 32 with value: 0.8331342774270603.


Best trial: 32. Best value: 0.833134:  79%|███████▊  | 59/75 [3:01:00<48:06, 180.44s/it]

[I 2024-01-03 22:24:24,858] Trial 58 finished with value: 0.8182903941306944 and parameters: {'sigma': 0.1, 'num_layers': 2, 'embed_size': 100, 'heads': 5, 'forward_expansion': 5, 'prenorm_on': False, 'mlp_scale_classification': 6, 'decoder_dropout': 0.1, 'class_drop': 0.2, 'learning_rate': 1e-05}. Best is trial 32 with value: 0.8331342774270603.


Best trial: 32. Best value: 0.833134:  80%|████████  | 60/75 [3:04:01<45:09, 180.63s/it]

[I 2024-01-03 22:27:25,926] Trial 59 finished with value: 0.7955980208155605 and parameters: {'sigma': 5, 'num_layers': 1, 'embed_size': 200, 'heads': 10, 'forward_expansion': 2, 'prenorm_on': True, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0, 'learning_rate': 0.01}. Best is trial 32 with value: 0.8331342774270603.


Best trial: 32. Best value: 0.833134:  81%|████████▏ | 61/75 [3:07:05<42:25, 181.84s/it]

[I 2024-01-03 22:30:30,606] Trial 60 finished with value: 0.8029346527896264 and parameters: {'sigma': 1.5, 'num_layers': 1, 'embed_size': 250, 'heads': 1, 'forward_expansion': 3, 'prenorm_on': True, 'mlp_scale_classification': 3, 'decoder_dropout': 0, 'class_drop': 0, 'learning_rate': 0.0001}. Best is trial 32 with value: 0.8331342774270603.


Best trial: 32. Best value: 0.833134:  83%|████████▎ | 62/75 [3:09:34<37:15, 171.96s/it]

[I 2024-01-03 22:32:59,508] Trial 61 finished with value: 0.8275038389353353 and parameters: {'sigma': 2, 'num_layers': 1, 'embed_size': 50, 'heads': 10, 'forward_expansion': 8, 'prenorm_on': False, 'mlp_scale_classification': 1, 'decoder_dropout': 0.5, 'class_drop': 0, 'learning_rate': 0.001}. Best is trial 32 with value: 0.8331342774270603.


Best trial: 32. Best value: 0.833134:  84%|████████▍ | 63/75 [3:12:12<33:31, 167.59s/it]

[I 2024-01-03 22:35:36,895] Trial 62 finished with value: 0.8215321617471422 and parameters: {'sigma': 2, 'num_layers': 1, 'embed_size': 50, 'heads': 10, 'forward_expansion': 8, 'prenorm_on': False, 'mlp_scale_classification': 1, 'decoder_dropout': 0.5, 'class_drop': 0, 'learning_rate': 0.001}. Best is trial 32 with value: 0.8331342774270603.


Best trial: 32. Best value: 0.833134:  85%|████████▌ | 64/75 [3:14:51<30:17, 165.20s/it]

[I 2024-01-03 22:38:16,511] Trial 63 finished with value: 0.8244326906671217 and parameters: {'sigma': 2, 'num_layers': 1, 'embed_size': 80, 'heads': 10, 'forward_expansion': 7, 'prenorm_on': False, 'mlp_scale_classification': 1, 'decoder_dropout': 0.5, 'class_drop': 0, 'learning_rate': 0.001}. Best is trial 32 with value: 0.8331342774270603.


Best trial: 32. Best value: 0.833134:  87%|████████▋ | 65/75 [3:17:32<27:18, 163.88s/it]

[I 2024-01-03 22:40:57,326] Trial 64 finished with value: 0.8319399419894216 and parameters: {'sigma': 2, 'num_layers': 1, 'embed_size': 50, 'heads': 10, 'forward_expansion': 4, 'prenorm_on': False, 'mlp_scale_classification': 2, 'decoder_dropout': 0.5, 'class_drop': 0, 'learning_rate': 0.001}. Best is trial 32 with value: 0.8331342774270603.


Best trial: 32. Best value: 0.833134:  88%|████████▊ | 66/75 [3:20:26<25:02, 166.95s/it]

[I 2024-01-03 22:43:51,431] Trial 65 finished with value: 0.8246033100153557 and parameters: {'sigma': 0.01, 'num_layers': 1, 'embed_size': 140, 'heads': 10, 'forward_expansion': 4, 'prenorm_on': False, 'mlp_scale_classification': 2, 'decoder_dropout': 0.5, 'class_drop': 0.2, 'learning_rate': 0.001}. Best is trial 32 with value: 0.8331342774270603.


Best trial: 32. Best value: 0.833134:  89%|████████▉ | 67/75 [3:23:39<23:18, 174.86s/it]

[I 2024-01-03 22:47:04,762] Trial 66 finished with value: 0.7561849513734857 and parameters: {'sigma': 0.1, 'num_layers': 1, 'embed_size': 180, 'heads': 1, 'forward_expansion': 4, 'prenorm_on': False, 'mlp_scale_classification': 2, 'decoder_dropout': 0.2, 'class_drop': 0.5, 'learning_rate': 0.1}. Best is trial 32 with value: 0.8331342774270603.


Best trial: 32. Best value: 0.833134:  91%|█████████ | 68/75 [3:26:48<20:52, 178.98s/it]

[I 2024-01-03 22:50:13,362] Trial 67 finished with value: 0.8065176591025423 and parameters: {'sigma': 0.001, 'num_layers': 2, 'embed_size': 60, 'heads': 10, 'forward_expansion': 5, 'prenorm_on': True, 'mlp_scale_classification': 4, 'decoder_dropout': 0, 'class_drop': 0.1, 'learning_rate': 1e-05}. Best is trial 32 with value: 0.8331342774270603.


Best trial: 32. Best value: 0.833134:  92%|█████████▏| 69/75 [3:30:16<18:46, 187.71s/it]

[I 2024-01-03 22:53:41,417] Trial 68 finished with value: 0.8027640334413922 and parameters: {'sigma': 1.5, 'num_layers': 1, 'embed_size': 350, 'heads': 1, 'forward_expansion': 4, 'prenorm_on': False, 'mlp_scale_classification': 3, 'decoder_dropout': 0.5, 'class_drop': 0, 'learning_rate': 0.001}. Best is trial 32 with value: 0.8331342774270603.


Best trial: 32. Best value: 0.833134:  93%|█████████▎| 70/75 [3:33:07<15:13, 182.76s/it]

[I 2024-01-03 22:56:32,635] Trial 69 finished with value: 0.8290394130694421 and parameters: {'sigma': 0.5, 'num_layers': 1, 'embed_size': 100, 'heads': 1, 'forward_expansion': 3, 'prenorm_on': True, 'mlp_scale_classification': 3, 'decoder_dropout': 0.2, 'class_drop': 0, 'learning_rate': 1e-05}. Best is trial 32 with value: 0.8331342774270603.


Best trial: 32. Best value: 0.833134:  95%|█████████▍| 71/75 [3:36:43<12:50, 192.58s/it]

[I 2024-01-03 23:00:08,125] Trial 70 finished with value: 0.7561849513734857 and parameters: {'sigma': 1, 'num_layers': 1, 'embed_size': 350, 'heads': 5, 'forward_expansion': 3, 'prenorm_on': True, 'mlp_scale_classification': 5, 'decoder_dropout': 0, 'class_drop': 0.2, 'learning_rate': 0.1}. Best is trial 32 with value: 0.8331342774270603.


Best trial: 32. Best value: 0.833134:  96%|█████████▌| 72/75 [3:39:26<09:11, 183.88s/it]

[I 2024-01-03 23:02:51,715] Trial 71 finished with value: 0.8242620713188875 and parameters: {'sigma': 2, 'num_layers': 1, 'embed_size': 50, 'heads': 10, 'forward_expansion': 5, 'prenorm_on': False, 'mlp_scale_classification': 1, 'decoder_dropout': 0.5, 'class_drop': 0, 'learning_rate': 0.001}. Best is trial 32 with value: 0.8331342774270603.


Best trial: 32. Best value: 0.833134:  97%|█████████▋| 73/75 [3:42:13<05:57, 178.68s/it]

[I 2024-01-03 23:05:38,256] Trial 72 finished with value: 0.8290394130694421 and parameters: {'sigma': 2, 'num_layers': 1, 'embed_size': 50, 'heads': 10, 'forward_expansion': 8, 'prenorm_on': False, 'mlp_scale_classification': 1, 'decoder_dropout': 0.5, 'class_drop': 0, 'learning_rate': 0.001}. Best is trial 32 with value: 0.8331342774270603.


Best trial: 32. Best value: 0.833134:  99%|█████████▊| 74/75 [3:44:58<02:54, 174.59s/it]

[I 2024-01-03 23:08:23,306] Trial 73 finished with value: 0.8280156969800375 and parameters: {'sigma': 2, 'num_layers': 1, 'embed_size': 50, 'heads': 10, 'forward_expansion': 6, 'prenorm_on': False, 'mlp_scale_classification': 1, 'decoder_dropout': 0.5, 'class_drop': 0, 'learning_rate': 0.001}. Best is trial 32 with value: 0.8331342774270603.


Best trial: 32. Best value: 0.833134: 100%|██████████| 75/75 [3:48:39<00:00, 182.93s/it]

[I 2024-01-03 23:12:04,590] Trial 74 finished with value: 0.8247739293635898 and parameters: {'sigma': 0.1, 'num_layers': 1, 'embed_size': 500, 'heads': 10, 'forward_expansion': 4, 'prenorm_on': False, 'mlp_scale_classification': 2, 'decoder_dropout': 0.5, 'class_drop': 0, 'learning_rate': 0.001}. Best is trial 32 with value: 0.8331342774270603.
Best Hyperparameters: {'sigma': 0.1, 'num_layers': 1, 'embed_size': 350, 'heads': 1, 'forward_expansion': 5, 'prenorm_on': True, 'mlp_scale_classification': 1, 'decoder_dropout': 0, 'class_drop': 0, 'learning_rate': 1e-05}
Best Validation Accuracy (at Early Stopping): 0.8331342774270603



