In [1]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from torch.utils.data import Dataset,DataLoader
import matplotlib.pyplot as plt
from rff.layers import GaussianEncoding #pip install random-fourier-features-pytorch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, confusion_matrix
from sklearn.preprocessing import StandardScaler, LabelEncoder

import os

import optuna
from optuna.trial import TrialState

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Run regardless if you do or do not have GPU so all tensors are moved to right location later on
if torch.cuda.is_available():
    device_in_use = torch.device("cuda")
    print("GPU is available and being used")
else:
    device_in_use = torch.device("cpu")
    print("GPU is not available, using CPU instead")

GPU is available and being used


In [3]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
df_val = pd.read_csv('validation.csv')

In [6]:
num_features = len(df_train.columns)-1
class_count = 2

In [4]:
class SingleTaskDataset(Dataset):
    def __init__(self, df : pd.DataFrame, task1_column):
        self.n = df.shape[0]
        
        self.task1_labels = df[task1_column].astype(np.int64).values

        # self.scalar = StandardScaler()
        # self.x = self.scalar.fit_transform(df.drop(columns=[task1_column])).astype(np.float32)
        self.x = df.drop(task1_column, axis=1).astype(np.float32).values

    def __len__(self):
        return self.n
    
    def __getitem__(self, idx):
        # Retrieve features and labels from the dataframe using column names
        features = self.x[idx]
        labels_task1 = self.task1_labels[idx]

        return features, labels_task1
        # return self.x[index], self.task1_labels[index], self.task2_labels[index]

train_dataset = SingleTaskDataset(df_train, 'income')
val_dataset = SingleTaskDataset(df_val, 'income')
test_dataset = SingleTaskDataset(df_test, 'income')

batch_size = 256

# Wrapping with DataLoader for easy batch extraction
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

In [5]:
# each task loss is scaled by its own learnable parameter, then regularization is applied 
class UncertaintyLoss(nn.Module):
    def __init__(self, num_tasks):
        super(UncertaintyLoss, self).__init__()
        self.num_tasks = num_tasks

        self.loss_fns = [nn.CrossEntropyLoss() for x in range(num_tasks)] 

    def forward(self, predictions, labels_task1):

        #task 1
        target = labels_task1
        prediction = predictions[0]
        loss_fn = self.loss_fns[0]
        task_loss = loss_fn(prediction, target)
        
        return task_loss
    
#All layers of the model
class MultiHeadAttention(nn.Module):
    def __init__(self, embed_size, heads):
        super(MultiHeadAttention, self).__init__()

        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads
        assert(self.head_dim * heads == embed_size), "Embed size needs to be div by heads"
        self.values = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.keys =nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.queries = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.fc_out = nn.Linear(heads*self.head_dim, embed_size)


    def forward(self, values, keys, query):
        N = query.shape[0]
        value_len, key_len, query_len = values.shape[1], keys.shape[1], query.shape[1]

        values = values.reshape(N, value_len, self.heads, self.head_dim)
        keys = keys.reshape(N, key_len, self.heads, self.head_dim)
        queries = query.reshape(N, query_len, self.heads, self.head_dim)

        values = self.values(values)
        keys = self.keys(keys)
        queries = self.queries(queries)

        energy = torch.einsum("nqhd,nkhd->nhqk", [queries, keys])

        attention = torch.softmax(energy / (self.embed_size ** (1/2)), dim=3) #(batch_size, head_dim, #query_embeddings, #key_embeddings)

        # Calculate simplified attention scores
        avg_attention = attention.mean(dim=0)  # Average across batches
        # print("batch average", avg_attention.shape)
        avg_attention = avg_attention.mean(dim=0).squeeze(dim=0)
        # print("head average", avg_attention.shape)

        out = torch.einsum("nhql,nlhd->nqhd", [attention, values]).reshape(N, query_len, self.heads*self.head_dim) #(batch_size, n_features, embed_size)
        out = self.fc_out(out)

        return out, avg_attention
    
class TransformerBlock(nn.Module):
    def __init__(self, embed_size, heads, dropout, forward_expansion, pre_norm_on):
        super(TransformerBlock, self).__init__()

        self.pre_norm_on = pre_norm_on
        if self.pre_norm_on:
            self.pre_norm = nn.LayerNorm(embed_size)
        self.attention = MultiHeadAttention(embed_size, heads)
        self.norm1 = nn.LayerNorm(embed_size)
        self.norm2 = nn.LayerNorm(embed_size)

        self.feed_forward = nn.Sequential(nn.Linear(embed_size, forward_expansion*embed_size),
                                          nn.ReLU(),
                                          nn.Linear(forward_expansion*embed_size, embed_size)
                                          )
        self.dropout = nn.Dropout(dropout)

    def forward(self,value,key,query):
        if self.pre_norm_on:
            query = self.pre_norm(query)
            key = self.pre_norm(key)
            value = self.pre_norm(value)
            
        attention, avg_attention = self.attention(value, key, query)
        x = self.dropout(self.norm1(attention + query))
        forward = self.feed_forward(x)
        out = self.dropout(self.norm2(forward + x))
        return out, avg_attention
    
class DecoderBlock(nn.Module):
    def __init__(self, embed_size, heads, forward_expansion, dropout, pre_norm_on):
        super(DecoderBlock, self).__init__()

        self.attention = MultiHeadAttention(embed_size, heads)
        self.norm = nn.LayerNorm(embed_size)
        self.transformer_block = TransformerBlock(embed_size, heads, dropout, forward_expansion, pre_norm_on)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, value, key):
        out, avg_attention = self.transformer_block(value, key, x)

        return out, avg_attention

class Decoder(nn.Module):
    def __init__(self,
                 embed_size,
                 num_layers,
                 heads,
                 forward_expansion,
                 decoder_dropout,
                 pre_norm_on
    ):
        super(Decoder, self).__init__()

        self.layers = nn.ModuleList(
                [
                    DecoderBlock(
                        embed_size,
                        heads,
                        dropout=decoder_dropout,
                        forward_expansion=forward_expansion,
                        pre_norm_on=pre_norm_on
                    )
                    for _ in range(num_layers)
                ]
            )
        self.avg_attention = None

    def forward(self, class_embed, context):
        for layer in self.layers:
            # x is the classification embedding (CLS Token)
            # context are the feature embeddings that will be used as key and value
            x, self.avg_attention = layer(class_embed, context, context)
  
        return x 

class Embeddings(nn.Module):
    def __init__(self, sigma, embed_size, input_size, embedding_dropout, n_features, num_target_labels, rff_on):
        super(Embeddings, self).__init__()

        self.rff_on = rff_on

        if self.rff_on:
            self.rffs = nn.ModuleList([GaussianEncoding(sigma=sigma, input_size=input_size, encoded_size=embed_size//2) for _ in range(n_features)])
            self.dropout = nn.Dropout(embedding_dropout)
            self.mlp_in = embed_size
        else:
            self.mlp_in = input_size

        self.embeddings = nn.ModuleList([nn.Linear(in_features=self.mlp_in, out_features=embed_size) for _ in range(n_features)])

        # Classifcation Embeddings for each target label
        self.target_label_embeddings = nn.ModuleList([nn.Embedding(1, embed_size) for _ in range(num_target_labels)])


    def forward(self, x):
        x = x.unsqueeze(2) #(batch_size, n_features) -> (batch_size, n_features, 1)
        rff_vectors = []
        if self.rff_on:
            for i, r in enumerate(self.rffs):
                input = x[:,i,:]
                out = r(input)
                rff_vectors.append(out)
        
            x = torch.stack(rff_vectors, dim=1)
        
        embeddings = []
        for i, e in enumerate(self.embeddings):
            goin_in = x[:,i,:]
            goin_out = e(goin_in)
            embeddings.append(goin_out)

        target_label_embeddings_ = []
        for e in self.target_label_embeddings:
            input = torch.tensor([0], device=x.device)
            temp = e(input)
            temp = temp.repeat(x.size(0), 1)
            tmep = temp.unsqueeze(1)
            target_label_embeddings_.append(temp)

        class_embeddings = torch.stack(target_label_embeddings_, dim=1)
        
        # class_embed = self.classification_embedding(torch.tensor([0], device=x.device))  # use index 0 for the classification embedding
        # class_embed = class_embed.repeat(x.size(0), 1) # -> (batch_size, embed_size)
        # class_embed = class_embed.unsqueeze(1)

        context = torch.stack(embeddings, dim=1)

        return class_embeddings, context

class classificationHead(nn.Module):
    def __init__(self, embed_size, dropout, mlp_scale_classification, num_target_classes):
        super(classificationHead, self).__init__()
        
        #flattening the embeddings out so each sample in batch is represented with a 460 dimensional vector
        self.input = embed_size
        self.lin1 = nn.Linear(self.input, mlp_scale_classification*self.input)
        self.drop = nn.Dropout(dropout)
        self.lin2 = nn.Linear(mlp_scale_classification*self.input, mlp_scale_classification*self.input)
        self.lin3 = nn.Linear(mlp_scale_classification*self.input, self.input)
        self.lin4 = nn.Linear(self.input, num_target_classes)
        self.relu = nn.ReLU()
        self.initialize_weights()

    def initialize_weights(self): #he_initialization.
        torch.nn.init.kaiming_normal_(self.lin1.weight, nonlinearity='relu')
        torch.nn.init.zeros_(self.lin1.bias)

        torch.nn.init.kaiming_normal_(self.lin3.weight, nonlinearity='relu')
        torch.nn.init.zeros_(self.lin3.bias)

    def forward(self, x):

        x= torch.reshape(x, (-1, self.input))

        x = self.lin1(x)
        x = self.relu(x)
        x = self.drop(x)
        x = self.lin2(x)
        x = self.relu(x)
        x = self.drop(x)
        x = self.lin3(x)
        x = self.relu(x)
        x = self.drop(x)
        x = self.lin4(x)
  
        return x


# DEFAULT PARAMETERS SET UP FOR VPN DATASET. BE CAREFUL AND MAKE SURE YOU SET THEM UP HOW YOU WANT.
class Classifier(nn.Module):
    def __init__(self, 
                 rff_on = False,
                 sigma=4,
                 embed_size=20,
                 input_size=1,
                 embedding_dropout = 0,
                 n_features=23, # YOU WILL PROBABLY NEED TO CHANGE
                 num_layers=1,
                 heads=1,
                 forward_expansion=4, # Determines how wide the MLP is in the encoder. Its a scaling factor. 
                 decoder_dropout=0,
                 classification_dropout = 0,
                 pre_norm_on = False,
                 mlp_scale_classification = 4,
                 targets_classes : list=  [3,8]
                 ):
        super(Classifier, self).__init__()

        self.embeddings = Embeddings(rff_on=rff_on, sigma=sigma, embed_size=embed_size, input_size=input_size, embedding_dropout=embedding_dropout, n_features=n_features, num_target_labels=len(targets_classes))
        self.decoder = Decoder(embed_size=embed_size, num_layers=num_layers, heads=heads, forward_expansion=forward_expansion, decoder_dropout=decoder_dropout, pre_norm_on=pre_norm_on)
        self.classifying_heads = nn.ModuleList([classificationHead(embed_size=embed_size, dropout=classification_dropout, mlp_scale_classification=mlp_scale_classification, num_target_classes=x) for x in targets_classes])
        
    def forward(self, x):
        class_embed, context = self.embeddings(x)

        x = self.decoder(class_embed, context)
        
        probability_dist_raw = []
        for i, e in enumerate(self.classifying_heads):
            input = x[:, i,:]
            output = e(input)
            probability_dist_raw.append(output)
        
        return probability_dist_raw

# Training and Testing Loops
def train(dataloader, model, loss_function, optimizer, device_in_use):
    model.train()

    total_loss = 0

    total_correct_1 = 0
    total_samples_1 = 0
    all_targets_1 = []
    all_predictions_1 = []

    total_correct_2 = 0
    total_samples_2 = 0
    all_targets_2 = []
    all_predictions_2 = []

    for (features,labels_task1,) in dataloader:
        features,labels_task1 = features.to(device_in_use),labels_task1.to(device_in_use)


        task_predictions = model(features) #contains a list of the tensor outputs for each task

        loss = loss_function(task_predictions, labels_task1)
        total_loss += loss.item()

        #computing accuracy for first target
        y_pred_softmax_1 = torch.softmax(task_predictions[0], dim=1)
        _, y_pred_labels_1 = torch.max(y_pred_softmax_1, dim=1)
        total_correct_1 += (y_pred_labels_1 == labels_task1).sum().item()
        total_samples_1 += labels_task1.size(0)
        all_targets_1.extend(labels_task1.cpu().numpy())
        all_predictions_1.extend(y_pred_labels_1.cpu().numpy())

        # #computing accuaracy for second target
        # y_pred_softmax_2 = torch.softmax(task_predictions[1], dim=1)
        # _, y_pred_labels_2 = torch.max(y_pred_softmax_2, dim=1)
        # total_correct_2 += (y_pred_labels_2 == labels_task2).sum().item()
        # total_samples_2 += labels_task2.size(0)
        # all_targets_2.extend(labels_task2.cpu().numpy())
        # all_predictions_2.extend(y_pred_labels_2.cpu().numpy())

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    avg_loss = total_loss/len(dataloader)
    accuracy_1 = total_correct_1 / total_samples_1
    # accuracy_2 = total_correct_2 / total_samples_2

    # # precision = precision_score(all_targets, all_predictions, average='weighted')
    # recall = recall_score(all_targets, all_predictions, average='weighted')
    # f1 = f1_score(all_targets, all_predictions, average='weighted')

    return avg_loss, accuracy_1

def test(dataloader, model, loss_function, device_in_use):
  model.eval()
  total_loss = 0
  
  total_correct_1 = 0
  total_samples_1 = 0
  all_targets_1 = []
  all_predictions_1 = []

  total_correct_2 = 0
  total_samples_2 = 0
  all_targets_2 = []
  all_predictions_2 = []

  with torch.no_grad():
    for (features,labels_task1) in dataloader:
      features,labels_task1 = features.to(device_in_use),labels_task1.to(device_in_use)

      #compute prediction error
      task_predictions = model(features) #contains a list of the tensor outputs for each task

      loss = loss_function(task_predictions, labels_task1)
      total_loss += loss.item()

      #computing accuracy for first target
      y_pred_softmax_1 = torch.softmax(task_predictions[0], dim=1)
      _, y_pred_labels_1 = torch.max(y_pred_softmax_1, dim=1)
      total_correct_1 += (y_pred_labels_1 == labels_task1).sum().item()
      total_samples_1 += labels_task1.size(0)
      all_targets_1.extend(labels_task1.cpu().numpy())
      all_predictions_1.extend(y_pred_labels_1.cpu().numpy())

      # #computing accuaracy for second target
      # y_pred_softmax_2 = torch.softmax(task_predictions[1], dim=1)
      # _, y_pred_labels_2 = torch.max(y_pred_softmax_2, dim=1)
      # total_correct_2 += (y_pred_labels_2 == labels_task2).sum().item()
      # total_samples_2 += labels_task2.size(0)
      # all_targets_2.extend(labels_task2.cpu().numpy())
      # all_predictions_2.extend(y_pred_labels_2.cpu().numpy())

  avg = total_loss/len(dataloader)
  accuracy_1 = total_correct_1 / total_samples_1
  # accuracy_2 = total_correct_2 / total_samples_2
  # recall = recall_score(all_targets, all_predictions, average='weighted')
  f1_1 = f1_score(all_targets_1, all_predictions_1, average='weighted')
  # f1_2 = f1_score(all_targets_2, all_predictions_2, average="weighted")

  return avg, accuracy_1, all_predictions_1, all_targets_1, f1_1

def format_metric(value): # Used to format the metrics output
    return f"{value:.4f}"

In [10]:
# Define the early stopping mechanism
class EarlyStopping:
    def __init__(self, patience=5):
        self.patience = patience
        self.counter = 0
        self.best_metric = float('-inf')
        self.early_stop = False

    def __call__(self, metric):
        if metric > self.best_metric:
            self.best_metric = metric
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True

# Function to log results to a text file
def log_to_file(filename, text):
    with open(filename, 'a') as f:
        f.write(text + '\n')

def objective(trial):
    trial_number = trial.number

    # Define hyperparameters to search over
    rff_on = trial.suggest_categorical('rff_on', [True, False])
    sigma = trial.suggest_categorical('sigma', [.001, 0.1, 1, 2, 3, 5, 10])
    num_layers = trial.suggest_int('num_layers', 1, 2)
    # Ensure that embed_size is divisible by num_layers
    embed_size = trial.suggest_categorical("embed_size", [50, 60, 70, 80, 90, 100, 120, 140, 160])
    heads = trial.suggest_categorical("heads", [1, 5, 10])
    forward_expansion = trial.suggest_int('forward_expansion', 1, 8)
    prenorm_on = trial.suggest_categorical('prenorm_on', [True, False])
    mlp_scale_classification = trial.suggest_int('mlp_scale_classification', 1, 8)

    learning_rate = trial.suggest_categorical('learning_rate', [0.001, 0.01])

    num_epochs = 75

    # Create your model with the sampled hyperparameters
    model = Classifier(
        n_features=num_features,
        targets_classes=[class_count],
        rff_on=rff_on,
        sigma=sigma,
        embed_size=embed_size,
        num_layers=num_layers,
        heads=heads,
        forward_expansion=forward_expansion,
        pre_norm_on=prenorm_on,
        mlp_scale_classification=mlp_scale_classification
    ).to(device_in_use)

    # Define loss function and optimizer
    loss_function = UncertaintyLoss(1)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Initialize early stopping
    early_stopping = EarlyStopping(patience=4)  # Adjust patience as needed

    # Training loop with a large number of epochs
    for epoch in range(num_epochs):
        train_loss, train_accuracy = train(train_dataloader, model, loss_function, optimizer, device_in_use)
        
        # Validation loop
        val_loss, val_accuracy, _, _, _ = test(test_dataloader, model, loss_function, device_in_use)
        
        # Check if we should early stop based on validation accuracy
        if early_stopping(val_accuracy):
            break

    # # Evaluate the model on the test set
    # test_loss, test_accuracy, _, _, _ = test(test_dataloader, model, loss_function, device_in_use)
    
    # Log the final test accuracy for this trial to a shared log file
    final_log = f"Trial {trial_number} completed. Validation Accuracy = {val_accuracy:.4f}"
    log_to_file('all_trials_log.txt', final_log)

    # Return the test accuracy as the objective to optimize
    return val_accuracy

In [11]:
# Set the number of optimization trials
num_trials = 50

# Create an Optuna study
study = optuna.create_study(direction='maximize')  # Maximize validation accuracy

# Start the optimization process
study.optimize(objective, n_trials=num_trials, show_progress_bar=True)

# Get the best hyperparameters and the validation accuracy at the point of early stopping
best_params = study.best_params
best_val_accuracy = study.best_value

print("Best Hyperparameters:", best_params)
print("Best Validation Accuracy (at Early Stopping):", best_val_accuracy)

[I 2023-10-03 19:50:14,706] A new study created in memory with name: no-name-343c4c21-5c14-4ab2-a242-66de2d5f0367
Best trial: 0. Best value: 0.852756:   2%|▏         | 1/50 [02:25<1:59:04, 145.81s/it]

[I 2023-10-03 19:52:40,512] Trial 0 finished with value: 0.8527560264087211 and parameters: {'rff_on': False, 'sigma': 1, 'num_layers': 1, 'embed_size': 100, 'heads': 10, 'forward_expansion': 3, 'prenorm_on': False, 'mlp_scale_classification': 3, 'learning_rate': 0.01}. Best is trial 0 with value: 0.8527560264087211.


Best trial: 0. Best value: 0.852756:   4%|▍         | 2/50 [04:37<1:50:08, 137.67s/it]

[I 2023-10-03 19:54:52,492] Trial 1 finished with value: 0.8470750806080147 and parameters: {'rff_on': False, 'sigma': 2, 'num_layers': 1, 'embed_size': 160, 'heads': 1, 'forward_expansion': 4, 'prenorm_on': True, 'mlp_scale_classification': 8, 'learning_rate': 0.001}. Best is trial 0 with value: 0.8527560264087211.


Best trial: 2. Best value: 0.854906:   6%|▌         | 3/50 [07:15<1:54:50, 146.60s/it]

[I 2023-10-03 19:57:29,711] Trial 2 finished with value: 0.8549055734684478 and parameters: {'rff_on': True, 'sigma': 10, 'num_layers': 2, 'embed_size': 100, 'heads': 5, 'forward_expansion': 8, 'prenorm_on': False, 'mlp_scale_classification': 5, 'learning_rate': 0.01}. Best is trial 2 with value: 0.8549055734684478.


Best trial: 2. Best value: 0.854906:   8%|▊         | 4/50 [09:23<1:46:53, 139.42s/it]

[I 2023-10-03 19:59:38,130] Trial 3 finished with value: 0.8541378780899739 and parameters: {'rff_on': False, 'sigma': 1, 'num_layers': 1, 'embed_size': 90, 'heads': 5, 'forward_expansion': 5, 'prenorm_on': False, 'mlp_scale_classification': 2, 'learning_rate': 0.01}. Best is trial 2 with value: 0.8549055734684478.


Best trial: 2. Best value: 0.854906:  10%|█         | 5/50 [12:02<1:49:47, 146.39s/it]

[I 2023-10-03 20:02:16,879] Trial 4 finished with value: 0.7640104406571473 and parameters: {'rff_on': True, 'sigma': 1, 'num_layers': 2, 'embed_size': 70, 'heads': 1, 'forward_expansion': 8, 'prenorm_on': True, 'mlp_scale_classification': 2, 'learning_rate': 0.01}. Best is trial 2 with value: 0.8549055734684478.


Best trial: 5. Best value: 0.857823:  12%|█▏        | 6/50 [14:31<1:48:05, 147.40s/it]

[I 2023-10-03 20:04:46,240] Trial 5 finished with value: 0.8578228159066482 and parameters: {'rff_on': True, 'sigma': 3, 'num_layers': 2, 'embed_size': 60, 'heads': 1, 'forward_expansion': 8, 'prenorm_on': False, 'mlp_scale_classification': 8, 'learning_rate': 0.001}. Best is trial 5 with value: 0.8578228159066482.


Best trial: 5. Best value: 0.857823:  14%|█▍        | 7/50 [16:50<1:43:41, 144.69s/it]

[I 2023-10-03 20:07:05,358] Trial 6 finished with value: 0.7640104406571473 and parameters: {'rff_on': False, 'sigma': 3, 'num_layers': 2, 'embed_size': 60, 'heads': 1, 'forward_expansion': 4, 'prenorm_on': True, 'mlp_scale_classification': 6, 'learning_rate': 0.01}. Best is trial 5 with value: 0.8578228159066482.


Best trial: 7. Best value: 0.861047:  16%|█▌        | 8/50 [19:13<1:40:56, 144.19s/it]

[I 2023-10-03 20:09:28,481] Trial 7 finished with value: 0.8610471364962383 and parameters: {'rff_on': True, 'sigma': 0.1, 'num_layers': 1, 'embed_size': 70, 'heads': 5, 'forward_expansion': 2, 'prenorm_on': False, 'mlp_scale_classification': 1, 'learning_rate': 0.01}. Best is trial 7 with value: 0.8610471364962383.


Best trial: 7. Best value: 0.861047:  18%|█▊        | 9/50 [21:25<1:35:52, 140.30s/it]

[I 2023-10-03 20:11:40,220] Trial 8 finished with value: 0.8418547520343928 and parameters: {'rff_on': False, 'sigma': 1, 'num_layers': 1, 'embed_size': 100, 'heads': 5, 'forward_expansion': 7, 'prenorm_on': True, 'mlp_scale_classification': 3, 'learning_rate': 0.001}. Best is trial 7 with value: 0.8610471364962383.


Best trial: 7. Best value: 0.861047:  20%|██        | 10/50 [23:38<1:31:59, 137.99s/it]

[I 2023-10-03 20:13:53,029] Trial 9 finished with value: 0.8027022877322278 and parameters: {'rff_on': False, 'sigma': 5, 'num_layers': 2, 'embed_size': 50, 'heads': 1, 'forward_expansion': 6, 'prenorm_on': False, 'mlp_scale_classification': 1, 'learning_rate': 0.01}. Best is trial 7 with value: 0.8610471364962383.


Best trial: 7. Best value: 0.861047:  22%|██▏       | 11/50 [26:02<1:30:57, 139.93s/it]

[I 2023-10-03 20:16:17,352] Trial 10 finished with value: 0.8496852448948258 and parameters: {'rff_on': True, 'sigma': 0.1, 'num_layers': 1, 'embed_size': 120, 'heads': 10, 'forward_expansion': 1, 'prenorm_on': False, 'mlp_scale_classification': 1, 'learning_rate': 0.001}. Best is trial 7 with value: 0.8610471364962383.


Best trial: 7. Best value: 0.861047:  24%|██▍       | 12/50 [28:37<1:31:25, 144.36s/it]

[I 2023-10-03 20:18:51,863] Trial 11 finished with value: 0.8507600184246891 and parameters: {'rff_on': True, 'sigma': 0.1, 'num_layers': 2, 'embed_size': 70, 'heads': 5, 'forward_expansion': 1, 'prenorm_on': False, 'mlp_scale_classification': 8, 'learning_rate': 0.001}. Best is trial 7 with value: 0.8610471364962383.


Best trial: 7. Best value: 0.861047:  26%|██▌       | 13/50 [30:57<1:28:16, 143.16s/it]

[I 2023-10-03 20:21:12,256] Trial 12 finished with value: 0.8435436818670352 and parameters: {'rff_on': True, 'sigma': 0.001, 'num_layers': 1, 'embed_size': 60, 'heads': 1, 'forward_expansion': 2, 'prenorm_on': False, 'mlp_scale_classification': 6, 'learning_rate': 0.001}. Best is trial 7 with value: 0.8610471364962383.


Best trial: 7. Best value: 0.861047:  28%|██▊       | 14/50 [33:32<1:27:57, 146.59s/it]

[I 2023-10-03 20:23:46,782] Trial 13 finished with value: 0.8337171810225702 and parameters: {'rff_on': True, 'sigma': 3, 'num_layers': 2, 'embed_size': 140, 'heads': 5, 'forward_expansion': 6, 'prenorm_on': False, 'mlp_scale_classification': 7, 'learning_rate': 0.001}. Best is trial 7 with value: 0.8610471364962383.


Best trial: 7. Best value: 0.861047:  30%|███       | 15/50 [35:56<1:25:05, 145.87s/it]

[I 2023-10-03 20:26:10,974] Trial 14 finished with value: 0.8377091969906341 and parameters: {'rff_on': True, 'sigma': 3, 'num_layers': 1, 'embed_size': 80, 'heads': 10, 'forward_expansion': 3, 'prenorm_on': False, 'mlp_scale_classification': 4, 'learning_rate': 0.01}. Best is trial 7 with value: 0.8610471364962383.


Best trial: 7. Best value: 0.861047:  32%|███▏      | 16/50 [38:27<1:23:38, 147.59s/it]

[I 2023-10-03 20:28:42,568] Trial 15 finished with value: 0.8598188238906802 and parameters: {'rff_on': True, 'sigma': 0.1, 'num_layers': 2, 'embed_size': 60, 'heads': 1, 'forward_expansion': 2, 'prenorm_on': False, 'mlp_scale_classification': 5, 'learning_rate': 0.001}. Best is trial 7 with value: 0.8610471364962383.


Best trial: 7. Best value: 0.861047:  34%|███▍      | 17/50 [41:06<1:23:01, 150.95s/it]

[I 2023-10-03 20:31:21,334] Trial 16 finished with value: 0.8570551205281745 and parameters: {'rff_on': True, 'sigma': 0.1, 'num_layers': 2, 'embed_size': 70, 'heads': 5, 'forward_expansion': 2, 'prenorm_on': False, 'mlp_scale_classification': 5, 'learning_rate': 0.01}. Best is trial 7 with value: 0.8610471364962383.


Best trial: 7. Best value: 0.861047:  36%|███▌      | 18/50 [43:29<1:19:15, 148.60s/it]

[I 2023-10-03 20:33:44,449] Trial 17 finished with value: 0.8487640104406572 and parameters: {'rff_on': True, 'sigma': 0.1, 'num_layers': 1, 'embed_size': 80, 'heads': 5, 'forward_expansion': 2, 'prenorm_on': False, 'mlp_scale_classification': 4, 'learning_rate': 0.001}. Best is trial 7 with value: 0.8610471364962383.


Best trial: 7. Best value: 0.861047:  38%|███▊      | 19/50 [45:56<1:16:27, 147.99s/it]

[I 2023-10-03 20:36:11,018] Trial 18 finished with value: 0.8576692768309535 and parameters: {'rff_on': True, 'sigma': 0.1, 'num_layers': 1, 'embed_size': 90, 'heads': 1, 'forward_expansion': 3, 'prenorm_on': True, 'mlp_scale_classification': 6, 'learning_rate': 0.001}. Best is trial 7 with value: 0.8610471364962383.


Best trial: 7. Best value: 0.861047:  40%|████      | 20/50 [48:33<1:15:23, 150.79s/it]

[I 2023-10-03 20:38:48,340] Trial 19 finished with value: 0.8395516658989712 and parameters: {'rff_on': True, 'sigma': 10, 'num_layers': 2, 'embed_size': 140, 'heads': 10, 'forward_expansion': 1, 'prenorm_on': False, 'mlp_scale_classification': 3, 'learning_rate': 0.01}. Best is trial 7 with value: 0.8610471364962383.


Best trial: 7. Best value: 0.861047:  42%|████▏     | 21/50 [51:06<1:13:08, 151.31s/it]

[I 2023-10-03 20:41:20,869] Trial 20 finished with value: 0.7640104406571473 and parameters: {'rff_on': True, 'sigma': 0.001, 'num_layers': 2, 'embed_size': 160, 'heads': 1, 'forward_expansion': 2, 'prenorm_on': False, 'mlp_scale_classification': 2, 'learning_rate': 0.01}. Best is trial 7 with value: 0.8610471364962383.


Best trial: 7. Best value: 0.861047:  44%|████▍     | 22/50 [53:35<1:10:24, 150.86s/it]

[I 2023-10-03 20:43:50,687] Trial 21 finished with value: 0.8555197297712268 and parameters: {'rff_on': True, 'sigma': 2, 'num_layers': 2, 'embed_size': 60, 'heads': 1, 'forward_expansion': 5, 'prenorm_on': False, 'mlp_scale_classification': 7, 'learning_rate': 0.001}. Best is trial 7 with value: 0.8610471364962383.


Best trial: 7. Best value: 0.861047:  46%|████▌     | 23/50 [56:05<1:07:40, 150.40s/it]

[I 2023-10-03 20:46:20,002] Trial 22 finished with value: 0.855366190695532 and parameters: {'rff_on': True, 'sigma': 5, 'num_layers': 2, 'embed_size': 60, 'heads': 1, 'forward_expansion': 7, 'prenorm_on': False, 'mlp_scale_classification': 7, 'learning_rate': 0.001}. Best is trial 7 with value: 0.8610471364962383.


Best trial: 7. Best value: 0.861047:  48%|████▊     | 24/50 [58:36<1:05:19, 150.75s/it]

[I 2023-10-03 20:48:51,575] Trial 23 finished with value: 0.8575157377552587 and parameters: {'rff_on': True, 'sigma': 3, 'num_layers': 2, 'embed_size': 60, 'heads': 1, 'forward_expansion': 4, 'prenorm_on': False, 'mlp_scale_classification': 5, 'learning_rate': 0.001}. Best is trial 7 with value: 0.8610471364962383.


Best trial: 24. Best value: 0.861201:  50%|█████     | 25/50 [1:01:07<1:02:50, 150.83s/it]

[I 2023-10-03 20:51:22,573] Trial 24 finished with value: 0.861200675571933 and parameters: {'rff_on': True, 'sigma': 0.1, 'num_layers': 2, 'embed_size': 120, 'heads': 1, 'forward_expansion': 3, 'prenorm_on': False, 'mlp_scale_classification': 4, 'learning_rate': 0.001}. Best is trial 24 with value: 0.861200675571933.


Best trial: 24. Best value: 0.861201:  52%|█████▏    | 26/50 [1:03:41<1:00:36, 151.53s/it]

[I 2023-10-03 20:53:55,747] Trial 25 finished with value: 0.8578228159066482 and parameters: {'rff_on': True, 'sigma': 0.1, 'num_layers': 2, 'embed_size': 50, 'heads': 5, 'forward_expansion': 3, 'prenorm_on': False, 'mlp_scale_classification': 4, 'learning_rate': 0.001}. Best is trial 24 with value: 0.861200675571933.


Best trial: 26. Best value: 0.861508:  54%|█████▍    | 27/50 [1:06:07<57:26, 149.87s/it]  

[I 2023-10-03 20:56:21,742] Trial 26 finished with value: 0.8615077537233226 and parameters: {'rff_on': True, 'sigma': 0.1, 'num_layers': 1, 'embed_size': 120, 'heads': 1, 'forward_expansion': 2, 'prenorm_on': True, 'mlp_scale_classification': 1, 'learning_rate': 0.001}. Best is trial 26 with value: 0.8615077537233226.


Best trial: 27. Best value: 0.864886:  56%|█████▌    | 28/50 [1:08:31<54:20, 148.22s/it]

[I 2023-10-03 20:58:46,098] Trial 27 finished with value: 0.8648856133886074 and parameters: {'rff_on': True, 'sigma': 0.1, 'num_layers': 1, 'embed_size': 120, 'heads': 1, 'forward_expansion': 3, 'prenorm_on': True, 'mlp_scale_classification': 1, 'learning_rate': 0.001}. Best is trial 27 with value: 0.8648856133886074.


Best trial: 27. Best value: 0.864886:  58%|█████▊    | 29/50 [1:10:58<51:43, 147.80s/it]

[I 2023-10-03 21:01:12,943] Trial 28 finished with value: 0.8584369722094273 and parameters: {'rff_on': True, 'sigma': 0.1, 'num_layers': 1, 'embed_size': 120, 'heads': 1, 'forward_expansion': 3, 'prenorm_on': True, 'mlp_scale_classification': 2, 'learning_rate': 0.001}. Best is trial 27 with value: 0.8648856133886074.


Best trial: 27. Best value: 0.864886:  60%|██████    | 30/50 [1:13:14<48:08, 144.42s/it]

[I 2023-10-03 21:03:29,477] Trial 29 finished with value: 0.846307385229541 and parameters: {'rff_on': False, 'sigma': 0.1, 'num_layers': 1, 'embed_size': 120, 'heads': 10, 'forward_expansion': 3, 'prenorm_on': True, 'mlp_scale_classification': 1, 'learning_rate': 0.001}. Best is trial 27 with value: 0.8648856133886074.


Best trial: 27. Best value: 0.864886:  62%|██████▏   | 31/50 [1:15:41<45:57, 145.11s/it]

[I 2023-10-03 21:05:56,176] Trial 30 finished with value: 0.8602794411177644 and parameters: {'rff_on': True, 'sigma': 0.1, 'num_layers': 1, 'embed_size': 120, 'heads': 1, 'forward_expansion': 4, 'prenorm_on': True, 'mlp_scale_classification': 3, 'learning_rate': 0.001}. Best is trial 27 with value: 0.8648856133886074.


Best trial: 27. Best value: 0.864886:  64%|██████▍   | 32/50 [1:18:06<43:34, 145.23s/it]

[I 2023-10-03 21:08:21,700] Trial 31 finished with value: 0.8622754491017964 and parameters: {'rff_on': True, 'sigma': 0.1, 'num_layers': 1, 'embed_size': 120, 'heads': 1, 'forward_expansion': 2, 'prenorm_on': True, 'mlp_scale_classification': 1, 'learning_rate': 0.001}. Best is trial 27 with value: 0.8648856133886074.


Best trial: 27. Best value: 0.864886:  66%|██████▌   | 33/50 [1:20:30<40:59, 144.69s/it]

[I 2023-10-03 21:10:45,120] Trial 32 finished with value: 0.846307385229541 and parameters: {'rff_on': True, 'sigma': 2, 'num_layers': 1, 'embed_size': 120, 'heads': 1, 'forward_expansion': 1, 'prenorm_on': True, 'mlp_scale_classification': 1, 'learning_rate': 0.001}. Best is trial 27 with value: 0.8648856133886074.


Best trial: 27. Best value: 0.864886:  68%|██████▊   | 34/50 [1:22:54<38:31, 144.47s/it]

[I 2023-10-03 21:13:09,088] Trial 33 finished with value: 0.8585905112851221 and parameters: {'rff_on': True, 'sigma': 0.1, 'num_layers': 1, 'embed_size': 120, 'heads': 1, 'forward_expansion': 4, 'prenorm_on': True, 'mlp_scale_classification': 2, 'learning_rate': 0.001}. Best is trial 27 with value: 0.8648856133886074.


Best trial: 27. Best value: 0.864886:  70%|███████   | 35/50 [1:25:10<35:30, 142.03s/it]

[I 2023-10-03 21:15:25,436] Trial 34 finished with value: 0.8507600184246891 and parameters: {'rff_on': False, 'sigma': 0.1, 'num_layers': 1, 'embed_size': 120, 'heads': 1, 'forward_expansion': 3, 'prenorm_on': True, 'mlp_scale_classification': 1, 'learning_rate': 0.001}. Best is trial 27 with value: 0.8648856133886074.


Best trial: 27. Best value: 0.864886:  72%|███████▏  | 36/50 [1:27:40<33:41, 144.36s/it]

[I 2023-10-03 21:17:55,229] Trial 35 finished with value: 0.8433901427913404 and parameters: {'rff_on': True, 'sigma': 10, 'num_layers': 1, 'embed_size': 120, 'heads': 1, 'forward_expansion': 2, 'prenorm_on': True, 'mlp_scale_classification': 2, 'learning_rate': 0.001}. Best is trial 27 with value: 0.8648856133886074.


Best trial: 27. Best value: 0.864886:  74%|███████▍  | 37/50 [1:30:15<31:58, 147.57s/it]

[I 2023-10-03 21:20:30,281] Trial 36 finished with value: 0.8476892369107938 and parameters: {'rff_on': True, 'sigma': 5, 'num_layers': 1, 'embed_size': 120, 'heads': 1, 'forward_expansion': 3, 'prenorm_on': True, 'mlp_scale_classification': 2, 'learning_rate': 0.001}. Best is trial 27 with value: 0.8648856133886074.


Best trial: 27. Best value: 0.864886:  76%|███████▌  | 38/50 [1:32:46<29:41, 148.44s/it]

[I 2023-10-03 21:23:00,767] Trial 37 finished with value: 0.7784431137724551 and parameters: {'rff_on': True, 'sigma': 0.001, 'num_layers': 1, 'embed_size': 160, 'heads': 1, 'forward_expansion': 5, 'prenorm_on': True, 'mlp_scale_classification': 1, 'learning_rate': 0.001}. Best is trial 27 with value: 0.8648856133886074.


Best trial: 27. Best value: 0.864886:  78%|███████▊  | 39/50 [1:35:08<26:53, 146.65s/it]

[I 2023-10-03 21:25:23,244] Trial 38 finished with value: 0.8472286196837095 and parameters: {'rff_on': False, 'sigma': 1, 'num_layers': 1, 'embed_size': 120, 'heads': 1, 'forward_expansion': 1, 'prenorm_on': True, 'mlp_scale_classification': 3, 'learning_rate': 0.001}. Best is trial 27 with value: 0.8648856133886074.


Best trial: 27. Best value: 0.864886:  80%|████████  | 40/50 [1:37:42<24:48, 148.86s/it]

[I 2023-10-03 21:27:57,248] Trial 39 finished with value: 0.8638108398587441 and parameters: {'rff_on': True, 'sigma': 0.1, 'num_layers': 1, 'embed_size': 100, 'heads': 1, 'forward_expansion': 4, 'prenorm_on': True, 'mlp_scale_classification': 2, 'learning_rate': 0.001}. Best is trial 27 with value: 0.8648856133886074.


Best trial: 27. Best value: 0.864886:  82%|████████▏ | 41/50 [1:40:03<21:59, 146.58s/it]

[I 2023-10-03 21:30:18,514] Trial 40 finished with value: 0.8545984953170582 and parameters: {'rff_on': False, 'sigma': 2, 'num_layers': 1, 'embed_size': 100, 'heads': 1, 'forward_expansion': 4, 'prenorm_on': True, 'mlp_scale_classification': 2, 'learning_rate': 0.001}. Best is trial 27 with value: 0.8648856133886074.


Best trial: 27. Best value: 0.864886:  84%|████████▍ | 42/50 [1:42:38<19:51, 148.89s/it]

[I 2023-10-03 21:32:52,780] Trial 41 finished with value: 0.8578228159066482 and parameters: {'rff_on': True, 'sigma': 0.1, 'num_layers': 1, 'embed_size': 100, 'heads': 1, 'forward_expansion': 5, 'prenorm_on': True, 'mlp_scale_classification': 1, 'learning_rate': 0.001}. Best is trial 27 with value: 0.8648856133886074.


Best trial: 27. Best value: 0.864886:  86%|████████▌ | 43/50 [1:45:09<17:27, 149.70s/it]

[I 2023-10-03 21:35:24,375] Trial 42 finished with value: 0.861200675571933 and parameters: {'rff_on': True, 'sigma': 0.1, 'num_layers': 1, 'embed_size': 100, 'heads': 1, 'forward_expansion': 2, 'prenorm_on': True, 'mlp_scale_classification': 1, 'learning_rate': 0.001}. Best is trial 27 with value: 0.8648856133886074.


Best trial: 27. Best value: 0.864886:  88%|████████▊ | 44/50 [1:47:41<15:01, 150.31s/it]

[I 2023-10-03 21:37:56,121] Trial 43 finished with value: 0.8590511285122063 and parameters: {'rff_on': True, 'sigma': 0.1, 'num_layers': 1, 'embed_size': 90, 'heads': 1, 'forward_expansion': 3, 'prenorm_on': True, 'mlp_scale_classification': 2, 'learning_rate': 0.001}. Best is trial 27 with value: 0.8648856133886074.


Best trial: 27. Best value: 0.864886:  90%|█████████ | 45/50 [1:50:06<12:24, 148.87s/it]

[I 2023-10-03 21:40:21,631] Trial 44 finished with value: 0.8622754491017964 and parameters: {'rff_on': True, 'sigma': 0.1, 'num_layers': 1, 'embed_size': 120, 'heads': 1, 'forward_expansion': 4, 'prenorm_on': True, 'mlp_scale_classification': 3, 'learning_rate': 0.001}. Best is trial 27 with value: 0.8648856133886074.


Best trial: 27. Best value: 0.864886:  92%|█████████▏| 46/50 [1:52:39<09:59, 149.90s/it]

[I 2023-10-03 21:42:53,937] Trial 45 finished with value: 0.8261937663135268 and parameters: {'rff_on': True, 'sigma': 1, 'num_layers': 1, 'embed_size': 100, 'heads': 10, 'forward_expansion': 4, 'prenorm_on': True, 'mlp_scale_classification': 3, 'learning_rate': 0.001}. Best is trial 27 with value: 0.8648856133886074.


Best trial: 27. Best value: 0.864886:  94%|█████████▍| 47/50 [1:55:09<07:30, 150.08s/it]

[I 2023-10-03 21:45:24,439] Trial 46 finished with value: 0.8515277138031629 and parameters: {'rff_on': True, 'sigma': 10, 'num_layers': 1, 'embed_size': 120, 'heads': 1, 'forward_expansion': 5, 'prenorm_on': True, 'mlp_scale_classification': 1, 'learning_rate': 0.001}. Best is trial 27 with value: 0.8648856133886074.


Best trial: 27. Best value: 0.864886:  96%|█████████▌| 48/50 [1:57:39<04:59, 149.90s/it]

[I 2023-10-03 21:47:53,909] Trial 47 finished with value: 0.8555197297712268 and parameters: {'rff_on': True, 'sigma': 0.1, 'num_layers': 1, 'embed_size': 50, 'heads': 1, 'forward_expansion': 4, 'prenorm_on': True, 'mlp_scale_classification': 2, 'learning_rate': 0.001}. Best is trial 27 with value: 0.8648856133886074.


Best trial: 27. Best value: 0.864886:  98%|█████████▊| 49/50 [2:00:06<02:29, 149.00s/it]

[I 2023-10-03 21:50:20,822] Trial 48 finished with value: 0.8569015814524796 and parameters: {'rff_on': True, 'sigma': 0.1, 'num_layers': 1, 'embed_size': 140, 'heads': 1, 'forward_expansion': 2, 'prenorm_on': True, 'mlp_scale_classification': 1, 'learning_rate': 0.001}. Best is trial 27 with value: 0.8648856133886074.


Best trial: 27. Best value: 0.864886: 100%|██████████| 50/50 [2:02:28<00:00, 146.96s/it]

[I 2023-10-03 21:52:42,845] Trial 49 finished with value: 0.7640104406571473 and parameters: {'rff_on': False, 'sigma': 0.1, 'num_layers': 1, 'embed_size': 80, 'heads': 1, 'forward_expansion': 4, 'prenorm_on': True, 'mlp_scale_classification': 3, 'learning_rate': 0.01}. Best is trial 27 with value: 0.8648856133886074.
Best Hyperparameters: {'rff_on': True, 'sigma': 0.1, 'num_layers': 1, 'embed_size': 120, 'heads': 1, 'forward_expansion': 3, 'prenorm_on': True, 'mlp_scale_classification': 1, 'learning_rate': 0.001}
Best Validation Accuracy (at Early Stopping): 0.8648856133886074



