In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import matplotlib.pyplot as plt
from rff.layers import GaussianEncoding #pip install random-fourier-features-pytorch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, confusion_matrix
from sklearn.preprocessing import StandardScaler

import os

import optuna
from optuna.trial import TrialState

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Run regardless if you do or do not have GPU so all tensors are moved to right location later on
if torch.cuda.is_available():
    device_in_use = torch.device("cuda")
    print("GPU is available and being used")
else:
    device_in_use = torch.device("cpu")
    print("GPU is not available, using CPU instead")

GPU is available and being used


In [9]:
df = pd.read_csv(r'/home/cscadmin/CyberResearch/FlowClassification/datasets/covertype/covtype.csv')

le = LabelEncoder()
df['Cover_Type'] = le.fit_transform(df['Cover_Type'])

df['Cover_Type'].value_counts()

class SingleTaskDataset(Dataset):
    def __init__(self, df : pd.DataFrame, task1_column):
        self.n = df.shape[0]
        
        self.task1_labels = df[task1_column].astype(np.int64).values

        self.scalar = StandardScaler()
        self.x = self.scalar.fit_transform(df.drop(columns=[task1_column])).astype(np.float32)
        # self.x = df.drop(task1_column, axis=1).astype(np.float32).values

    def __len__(self):
        return self.n
    
    def __getitem__(self, idx):
        # Retrieve features and labels from the dataframe using column names
        features = self.x[idx]
        labels_task1 = self.task1_labels[idx]

        return features, labels_task1
        # return self.x[index], self.task1_labels[index], self.task2_labels[index]

# df = df.sample(frac=.1, random_state=42)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(test_df, test_size=0.4, random_state=42)

print(f"{len(train_df)}, {len(val_df)}, {len(test_df)}")

# Create datasets for train, validation, and test
train_dataset = SingleTaskDataset(train_df, "Cover_Type")
val_dataset = SingleTaskDataset(val_df, "Cover_Type")
test_dataset = SingleTaskDataset(test_df, "Cover_Type")

# Define batch size
batch_size = 256

# Wrapping with DataLoader for easy batch extraction
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)  # No need to shuffle validation data
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)  # No need to shuffle test data

464809, 69721, 46482


In [10]:
# each task loss is scaled by its own learnable parameter, then regularization is applied 
class UncertaintyLoss(nn.Module):
    def __init__(self, num_tasks):
        super(UncertaintyLoss, self).__init__()
        self.num_tasks = num_tasks

        self.loss_fns = [nn.CrossEntropyLoss() for x in range(num_tasks)] 

    def forward(self, predictions, labels_task1):

        #task 1
        target = labels_task1
        prediction = predictions[0]
        loss_fn = self.loss_fns[0]
        task_loss = loss_fn(prediction, target)
        
        return task_loss
    
#All layers of the model
class MultiHeadAttention(nn.Module):
    def __init__(self, embed_size, heads):
        super(MultiHeadAttention, self).__init__()

        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads
        assert(self.head_dim * heads == embed_size), "Embed size needs to be div by heads"
        self.values = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.keys =nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.queries = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.fc_out = nn.Linear(heads*self.head_dim, embed_size)


    def forward(self, values, keys, query):
        N = query.shape[0]
        value_len, key_len, query_len = values.shape[1], keys.shape[1], query.shape[1]

        values = values.reshape(N, value_len, self.heads, self.head_dim)
        keys = keys.reshape(N, key_len, self.heads, self.head_dim)
        queries = query.reshape(N, query_len, self.heads, self.head_dim)

        values = self.values(values)
        keys = self.keys(keys)
        queries = self.queries(queries)

        energy = torch.einsum("nqhd,nkhd->nhqk", [queries, keys])

        attention = torch.softmax(energy / (self.embed_size ** (1/2)), dim=3) #(batch_size, head_dim, #query_embeddings, #key_embeddings)

        # Calculate simplified attention scores
        avg_attention = attention.mean(dim=0)  # Average across batches
        # print("batch average", avg_attention.shape)
        avg_attention = avg_attention.mean(dim=0).squeeze(dim=0)
        # print("head average", avg_attention.shape)

        out = torch.einsum("nhql,nlhd->nqhd", [attention, values]).reshape(N, query_len, self.heads*self.head_dim) #(batch_size, n_features, embed_size)
        out = self.fc_out(out)

        return out, avg_attention
    
class TransformerBlock(nn.Module):
    def __init__(self, embed_size, heads, dropout, forward_expansion, pre_norm_on):
        super(TransformerBlock, self).__init__()

        self.pre_norm_on = pre_norm_on
        if self.pre_norm_on:
            self.pre_norm = nn.LayerNorm(embed_size)
        self.attention = MultiHeadAttention(embed_size, heads)
        self.norm1 = nn.LayerNorm(embed_size)
        self.norm2 = nn.LayerNorm(embed_size)

        self.feed_forward = nn.Sequential(nn.Linear(embed_size, forward_expansion*embed_size),
                                          nn.ReLU(),
                                          nn.Linear(forward_expansion*embed_size, embed_size)
                                          )
        self.dropout = nn.Dropout(dropout)

    def forward(self,value,key,query):
        if self.pre_norm_on:
            query = self.pre_norm(query)
            key = self.pre_norm(key)
            value = self.pre_norm(value)
            
        attention, avg_attention = self.attention(value, key, query)
        x = self.dropout(self.norm1(attention + query))
        forward = self.feed_forward(x)
        out = self.dropout(self.norm2(forward + x))
        return out, avg_attention
    
class DecoderBlock(nn.Module):
    def __init__(self, embed_size, heads, forward_expansion, dropout, pre_norm_on):
        super(DecoderBlock, self).__init__()

        self.attention = MultiHeadAttention(embed_size, heads)
        self.norm = nn.LayerNorm(embed_size)
        self.transformer_block = TransformerBlock(embed_size, heads, dropout, forward_expansion, pre_norm_on)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, value, key):
        out, avg_attention = self.transformer_block(value, key, x)

        return out, avg_attention

class Decoder(nn.Module):
    def __init__(self,
                 embed_size,
                 num_layers,
                 heads,
                 forward_expansion,
                 decoder_dropout,
                 pre_norm_on
    ):
        super(Decoder, self).__init__()

        self.layers = nn.ModuleList(
                [
                    DecoderBlock(
                        embed_size,
                        heads,
                        dropout=decoder_dropout,
                        forward_expansion=forward_expansion,
                        pre_norm_on=pre_norm_on
                    )
                    for _ in range(num_layers)
                ]
            )
        self.avg_attention = None

    def forward(self, class_embed, context):
        for layer in self.layers:
            # x is the classification embedding (CLS Token)
            # context are the feature embeddings that will be used as key and value
            x, self.avg_attention = layer(class_embed, context, context)
  
        return x 

class Embeddings(nn.Module):
    def __init__(self, sigma, embed_size, input_size, embedding_dropout, n_features, num_target_labels, rff_on):
        super(Embeddings, self).__init__()

        self.rff_on = rff_on

        if self.rff_on:
            self.rffs = nn.ModuleList([GaussianEncoding(sigma=sigma, input_size=input_size, encoded_size=embed_size//2) for _ in range(n_features)])
            self.dropout = nn.Dropout(embedding_dropout)
            self.mlp_in = embed_size
        else:
            self.mlp_in = input_size

        self.embeddings = nn.ModuleList([nn.Linear(in_features=self.mlp_in, out_features=embed_size) for _ in range(n_features)])

        # Classifcation Embeddings for each target label
        self.target_label_embeddings = nn.ModuleList([nn.Embedding(1, embed_size) for _ in range(num_target_labels)])


    def forward(self, x):
        x = x.unsqueeze(2) #(batch_size, n_features) -> (batch_size, n_features, 1)
        rff_vectors = []
        if self.rff_on:
            for i, r in enumerate(self.rffs):
                input = x[:,i,:]
                out = r(input)
                rff_vectors.append(out)
        
            x = torch.stack(rff_vectors, dim=1)
        
        embeddings = []
        for i, e in enumerate(self.embeddings):
            goin_in = x[:,i,:]
            goin_out = e(goin_in)
            embeddings.append(goin_out)

        target_label_embeddings_ = []
        for e in self.target_label_embeddings:
            input = torch.tensor([0], device=x.device)
            temp = e(input)
            temp = temp.repeat(x.size(0), 1)
            tmep = temp.unsqueeze(1)
            target_label_embeddings_.append(temp)

        class_embeddings = torch.stack(target_label_embeddings_, dim=1)
        
        # class_embed = self.classification_embedding(torch.tensor([0], device=x.device))  # use index 0 for the classification embedding
        # class_embed = class_embed.repeat(x.size(0), 1) # -> (batch_size, embed_size)
        # class_embed = class_embed.unsqueeze(1)

        context = torch.stack(embeddings, dim=1)

        return class_embeddings, context

class classificationHead(nn.Module):
    def __init__(self, embed_size, dropout, mlp_scale_classification, num_target_classes):
        super(classificationHead, self).__init__()
        
        #flattening the embeddings out so each sample in batch is represented with a 460 dimensional vector
        self.input = embed_size
        self.lin1 = nn.Linear(self.input, mlp_scale_classification*self.input)
        self.drop = nn.Dropout(dropout)
        self.lin2 = nn.Linear(mlp_scale_classification*self.input, mlp_scale_classification*self.input)
        self.lin3 = nn.Linear(mlp_scale_classification*self.input, self.input)
        self.lin4 = nn.Linear(self.input, num_target_classes)
        self.relu = nn.ReLU()
        self.initialize_weights()

    def initialize_weights(self): #he_initialization.
        torch.nn.init.kaiming_normal_(self.lin1.weight, nonlinearity='relu')
        torch.nn.init.zeros_(self.lin1.bias)

        torch.nn.init.kaiming_normal_(self.lin3.weight, nonlinearity='relu')
        torch.nn.init.zeros_(self.lin3.bias)

    def forward(self, x):

        x= torch.reshape(x, (-1, self.input))

        x = self.lin1(x)
        x = self.relu(x)
        x = self.drop(x)
        x = self.lin2(x)
        x = self.relu(x)
        x = self.drop(x)
        x = self.lin3(x)
        x = self.relu(x)
        x = self.drop(x)
        x = self.lin4(x)
  
        return x


# DEFAULT PARAMETERS SET UP FOR VPN DATASET. BE CAREFUL AND MAKE SURE YOU SET THEM UP HOW YOU WANT.
class Classifier(nn.Module):
    def __init__(self, 
                 rff_on = False,
                 sigma=4,
                 embed_size=20,
                 input_size=1,
                 embedding_dropout = 0,
                 n_features=23, # YOU WILL PROBABLY NEED TO CHANGE
                 num_layers=1,
                 heads=1,
                 forward_expansion=4, # Determines how wide the MLP is in the encoder. Its a scaling factor. 
                 decoder_dropout=0,
                 classification_dropout = 0,
                 pre_norm_on = False,
                 mlp_scale_classification = 4,
                 targets_classes : list=  [3,8]
                 ):
        super(Classifier, self).__init__()

        self.embeddings = Embeddings(rff_on=rff_on, sigma=sigma, embed_size=embed_size, input_size=input_size, embedding_dropout=embedding_dropout, n_features=n_features, num_target_labels=len(targets_classes))
        self.decoder = Decoder(embed_size=embed_size, num_layers=num_layers, heads=heads, forward_expansion=forward_expansion, decoder_dropout=decoder_dropout, pre_norm_on=pre_norm_on)
        self.classifying_heads = nn.ModuleList([classificationHead(embed_size=embed_size, dropout=classification_dropout, mlp_scale_classification=mlp_scale_classification, num_target_classes=x) for x in targets_classes])
        
    def forward(self, x):
        class_embed, context = self.embeddings(x)

        x = self.decoder(class_embed, context)
        
        probability_dist_raw = []
        for i, e in enumerate(self.classifying_heads):
            input = x[:, i,:]
            output = e(input)
            probability_dist_raw.append(output)
        
        return probability_dist_raw

# Training and Testing Loops
def train(dataloader, model, loss_function, optimizer, device_in_use):
    model.train()

    total_loss = 0

    total_correct_1 = 0
    total_samples_1 = 0
    all_targets_1 = []
    all_predictions_1 = []

    total_correct_2 = 0
    total_samples_2 = 0
    all_targets_2 = []
    all_predictions_2 = []

    for (features,labels_task1,) in dataloader:
        features,labels_task1 = features.to(device_in_use),labels_task1.to(device_in_use)


        task_predictions = model(features) #contains a list of the tensor outputs for each task

        loss = loss_function(task_predictions, labels_task1)
        total_loss += loss.item()

        #computing accuracy for first target
        y_pred_softmax_1 = torch.softmax(task_predictions[0], dim=1)
        _, y_pred_labels_1 = torch.max(y_pred_softmax_1, dim=1)
        total_correct_1 += (y_pred_labels_1 == labels_task1).sum().item()
        total_samples_1 += labels_task1.size(0)
        all_targets_1.extend(labels_task1.cpu().numpy())
        all_predictions_1.extend(y_pred_labels_1.cpu().numpy())

        # #computing accuaracy for second target
        # y_pred_softmax_2 = torch.softmax(task_predictions[1], dim=1)
        # _, y_pred_labels_2 = torch.max(y_pred_softmax_2, dim=1)
        # total_correct_2 += (y_pred_labels_2 == labels_task2).sum().item()
        # total_samples_2 += labels_task2.size(0)
        # all_targets_2.extend(labels_task2.cpu().numpy())
        # all_predictions_2.extend(y_pred_labels_2.cpu().numpy())

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    avg_loss = total_loss/len(dataloader)
    accuracy_1 = total_correct_1 / total_samples_1
    # accuracy_2 = total_correct_2 / total_samples_2

    # # precision = precision_score(all_targets, all_predictions, average='weighted')
    # recall = recall_score(all_targets, all_predictions, average='weighted')
    # f1 = f1_score(all_targets, all_predictions, average='weighted')

    return avg_loss, accuracy_1

def test(dataloader, model, loss_function, device_in_use):
  model.eval()
  total_loss = 0
  
  total_correct_1 = 0
  total_samples_1 = 0
  all_targets_1 = []
  all_predictions_1 = []

  total_correct_2 = 0
  total_samples_2 = 0
  all_targets_2 = []
  all_predictions_2 = []

  with torch.no_grad():
    for (features,labels_task1) in dataloader:
      features,labels_task1 = features.to(device_in_use),labels_task1.to(device_in_use)

      #compute prediction error
      task_predictions = model(features) #contains a list of the tensor outputs for each task

      loss = loss_function(task_predictions, labels_task1)
      total_loss += loss.item()

      #computing accuracy for first target
      y_pred_softmax_1 = torch.softmax(task_predictions[0], dim=1)
      _, y_pred_labels_1 = torch.max(y_pred_softmax_1, dim=1)
      total_correct_1 += (y_pred_labels_1 == labels_task1).sum().item()
      total_samples_1 += labels_task1.size(0)
      all_targets_1.extend(labels_task1.cpu().numpy())
      all_predictions_1.extend(y_pred_labels_1.cpu().numpy())

      # #computing accuaracy for second target
      # y_pred_softmax_2 = torch.softmax(task_predictions[1], dim=1)
      # _, y_pred_labels_2 = torch.max(y_pred_softmax_2, dim=1)
      # total_correct_2 += (y_pred_labels_2 == labels_task2).sum().item()
      # total_samples_2 += labels_task2.size(0)
      # all_targets_2.extend(labels_task2.cpu().numpy())
      # all_predictions_2.extend(y_pred_labels_2.cpu().numpy())

  avg = total_loss/len(dataloader)
  accuracy_1 = total_correct_1 / total_samples_1
  # accuracy_2 = total_correct_2 / total_samples_2
  # recall = recall_score(all_targets, all_predictions, average='weighted')
  f1_1 = f1_score(all_targets_1, all_predictions_1, average='weighted')
  # f1_2 = f1_score(all_targets_2, all_predictions_2, average="weighted")

  return avg, accuracy_1, all_predictions_1, all_targets_1, f1_1

def format_metric(value): # Used to format the metrics output
    return f"{value:.4f}"

In [11]:
# Define the early stopping mechanism
class EarlyStopping:
    def __init__(self, patience=5):
        self.patience = patience
        self.counter = 0
        self.best_metric = float('-inf')
        self.early_stop = False

    def __call__(self, metric):
        if metric > self.best_metric:
            self.best_metric = metric
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True

# Function to log results to a text file
def log_to_file(filename, text):
    with open(filename, 'a') as f:
        f.write(text + '\n')

def objective(trial):
    trial_number = trial.number

    # Define hyperparameters to search over
    rff_on = trial.suggest_categorical('rff_on', [True, False])
    sigma = trial.suggest_float('sigma', .001, 10)
    num_layers = trial.suggest_int('num_layers', 1, 2)
    # Ensure that embed_size is divisible by num_layers
    embed_size = trial.suggest_categorical("embed_size", [50, 60, 70, 80, 90, 100, 120, 140, 160])
    heads = trial.suggest_categorical("heads", [1, 5, 10])
    forward_expansion = trial.suggest_int('forward_expansion', 1, 8)
    prenorm_on = trial.suggest_categorical('prenorm_on', [True, False])
    mlp_scale_classification = trial.suggest_int('mlp_scale_classification', 1, 8)

    learning_rate = trial.suggest_float('learning_rate', 0.0001, 0.01)

    num_epochs = 75

    # Create your model with the sampled hyperparameters
    model = Classifier(
        n_features=54,
        targets_classes=[7],
        rff_on=rff_on,
        sigma=sigma,
        embed_size=embed_size,
        num_layers=num_layers,
        heads=heads,
        forward_expansion=forward_expansion,
        pre_norm_on=prenorm_on,
        mlp_scale_classification=mlp_scale_classification
    ).to(device_in_use)

    # Define loss function and optimizer
    loss_function = UncertaintyLoss(1)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Initialize early stopping
    early_stopping = EarlyStopping(patience=5)  # Adjust patience as needed

    # Training loop with a large number of epochs
    for epoch in range(num_epochs):
        train_loss, train_accuracy = train(train_dataloader, model, loss_function, optimizer, device_in_use)
        
        # Validation loop
        val_loss, val_accuracy, _, _, _ = test(val_dataloader, model, loss_function, device_in_use)
        
        # Check if we should early stop based on validation accuracy
        if early_stopping(val_accuracy):
            break

    # # Evaluate the model on the test set
    # test_loss, test_accuracy, _, _, _ = test(test_dataloader, model, loss_function, device_in_use)
    
    # Log the final test accuracy for this trial to a shared log file
    final_log = f"Trial {trial_number} completed. Validation Accuracy = {val_accuracy:.4f}"
    log_to_file('all_trials_log.txt', final_log)

    # Return the test accuracy as the objective to optimize
    return val_accuracy

In [12]:
# Set the number of optimization trials
num_trials = 30

# Create an Optuna study
study = optuna.create_study(direction='maximize')  # Maximize validation accuracy

# Start the optimization process
study.optimize(objective, n_trials=num_trials, show_progress_bar=True)

# Get the best hyperparameters and the validation accuracy at the point of early stopping
best_params = study.best_params
best_val_accuracy = study.best_value

print("Best Hyperparameters:", best_params)
print("Best Validation Accuracy (at Early Stopping):", best_val_accuracy)

[I 2023-09-29 18:15:45,296] A new study created in memory with name: no-name-c1f0eec8-21f6-464d-b262-ba2388e8bbb7
Best trial: 0. Best value: 0.957057:   3%|▎         | 1/30 [1:17:50<37:37:25, 4670.54s/it]

[I 2023-09-29 19:33:35,840] Trial 0 finished with value: 0.957057414552287 and parameters: {'rff_on': False, 'sigma': 4.011390427575556, 'num_layers': 1, 'embed_size': 160, 'heads': 1, 'forward_expansion': 5, 'prenorm_on': False, 'mlp_scale_classification': 2, 'learning_rate': 0.0008848654005003334}. Best is trial 0 with value: 0.957057414552287.


Best trial: 0. Best value: 0.957057:   7%|▋         | 2/30 [2:50:31<40:24:01, 5194.35s/it]

[I 2023-09-29 21:06:16,848] Trial 1 finished with value: 0.4844451456519557 and parameters: {'rff_on': True, 'sigma': 8.941070434236297, 'num_layers': 1, 'embed_size': 140, 'heads': 1, 'forward_expansion': 4, 'prenorm_on': True, 'mlp_scale_classification': 1, 'learning_rate': 0.007780863279031983}. Best is trial 0 with value: 0.957057414552287.


Best trial: 0. Best value: 0.957057:  10%|█         | 3/30 [4:25:04<40:35:52, 5413.05s/it]

[I 2023-09-29 22:40:50,156] Trial 2 finished with value: 0.4844451456519557 and parameters: {'rff_on': True, 'sigma': 1.2336214175959943, 'num_layers': 2, 'embed_size': 70, 'heads': 1, 'forward_expansion': 7, 'prenorm_on': True, 'mlp_scale_classification': 4, 'learning_rate': 0.008838141126083846}. Best is trial 0 with value: 0.957057414552287.


Best trial: 0. Best value: 0.957057:  13%|█▎        | 4/30 [6:00:26<39:58:23, 5534.76s/it]

[I 2023-09-30 00:16:11,492] Trial 3 finished with value: 0.9435033920913355 and parameters: {'rff_on': True, 'sigma': 3.7123289754033304, 'num_layers': 2, 'embed_size': 70, 'heads': 5, 'forward_expansion': 2, 'prenorm_on': True, 'mlp_scale_classification': 3, 'learning_rate': 0.0032157896359951777}. Best is trial 0 with value: 0.957057414552287.


Best trial: 0. Best value: 0.957057:  17%|█▋        | 5/30 [7:19:47<36:29:57, 5255.89s/it]

[I 2023-09-30 01:35:32,932] Trial 4 finished with value: 0.6705009968302234 and parameters: {'rff_on': False, 'sigma': 1.8260017163732565, 'num_layers': 1, 'embed_size': 90, 'heads': 1, 'forward_expansion': 8, 'prenorm_on': True, 'mlp_scale_classification': 6, 'learning_rate': 0.002187575773182042}. Best is trial 0 with value: 0.957057414552287.


Best trial: 0. Best value: 0.957057:  20%|██        | 6/30 [8:49:34<35:20:13, 5300.56s/it]

[I 2023-09-30 03:05:20,207] Trial 5 finished with value: 0.4844451456519557 and parameters: {'rff_on': True, 'sigma': 3.1932507415543023, 'num_layers': 1, 'embed_size': 90, 'heads': 1, 'forward_expansion': 1, 'prenorm_on': False, 'mlp_scale_classification': 4, 'learning_rate': 0.005685925321724773}. Best is trial 0 with value: 0.957057414552287.


Best trial: 0. Best value: 0.957057:  23%|██▎       | 7/30 [10:25:21<34:47:49, 5446.51s/it]

[I 2023-09-30 04:41:07,206] Trial 6 finished with value: 0.9473903128182327 and parameters: {'rff_on': True, 'sigma': 2.647376783501273, 'num_layers': 2, 'embed_size': 90, 'heads': 5, 'forward_expansion': 5, 'prenorm_on': True, 'mlp_scale_classification': 1, 'learning_rate': 0.00926307432928551}. Best is trial 0 with value: 0.957057414552287.


Best trial: 0. Best value: 0.957057:  27%|██▋       | 8/30 [11:44:56<31:58:37, 5232.60s/it]

[I 2023-09-30 06:00:41,765] Trial 7 finished with value: 0.9497425452876465 and parameters: {'rff_on': False, 'sigma': 3.978649294361454, 'num_layers': 1, 'embed_size': 70, 'heads': 5, 'forward_expansion': 2, 'prenorm_on': True, 'mlp_scale_classification': 4, 'learning_rate': 0.003235226366473099}. Best is trial 0 with value: 0.957057414552287.


Best trial: 0. Best value: 0.957057:  30%|███       | 9/30 [13:16:46<31:01:42, 5319.17s/it]

[I 2023-09-30 07:32:31,303] Trial 8 finished with value: 0.9209420404182385 and parameters: {'rff_on': True, 'sigma': 3.9213375133404424, 'num_layers': 1, 'embed_size': 70, 'heads': 10, 'forward_expansion': 8, 'prenorm_on': False, 'mlp_scale_classification': 6, 'learning_rate': 0.0017807834515944326}. Best is trial 0 with value: 0.957057414552287.


Best trial: 0. Best value: 0.957057:  33%|███▎      | 10/30 [14:34:35<28:26:09, 5118.49s/it]

[I 2023-09-30 08:50:20,419] Trial 9 finished with value: 0.6721360852540841 and parameters: {'rff_on': False, 'sigma': 4.3616962900003715, 'num_layers': 1, 'embed_size': 160, 'heads': 1, 'forward_expansion': 2, 'prenorm_on': False, 'mlp_scale_classification': 7, 'learning_rate': 0.0020601164892285907}. Best is trial 0 with value: 0.957057414552287.


Best trial: 0. Best value: 0.957057:  37%|███▋      | 11/30 [15:54:21<26:28:38, 5016.75s/it]

[I 2023-09-30 10:10:06,492] Trial 10 finished with value: 0.9294617116794079 and parameters: {'rff_on': False, 'sigma': 6.217539056214692, 'num_layers': 2, 'embed_size': 60, 'heads': 10, 'forward_expansion': 5, 'prenorm_on': False, 'mlp_scale_classification': 2, 'learning_rate': 0.00021041930400566764}. Best is trial 0 with value: 0.957057414552287.


Best trial: 0. Best value: 0.957057:  40%|████      | 12/30 [17:13:52<24:42:37, 4942.08s/it]

[I 2023-09-30 11:29:37,773] Trial 11 finished with value: 0.9299780553922061 and parameters: {'rff_on': False, 'sigma': 0.18226567822626594, 'num_layers': 1, 'embed_size': 50, 'heads': 5, 'forward_expansion': 4, 'prenorm_on': False, 'mlp_scale_classification': 3, 'learning_rate': 0.004241000001534702}. Best is trial 0 with value: 0.957057414552287.


Best trial: 12. Best value: 0.959266:  43%|████▎     | 13/30 [18:32:44<23:02:11, 4878.32s/it]

[I 2023-09-30 12:48:29,399] Trial 12 finished with value: 0.9592662182125902 and parameters: {'rff_on': False, 'sigma': 5.477016766106461, 'num_layers': 1, 'embed_size': 100, 'heads': 5, 'forward_expansion': 6, 'prenorm_on': False, 'mlp_scale_classification': 5, 'learning_rate': 0.0006365067852907736}. Best is trial 12 with value: 0.9592662182125902.


Best trial: 12. Best value: 0.959266:  47%|████▋     | 14/30 [19:51:00<21:26:14, 4823.43s/it]

[I 2023-09-30 14:06:45,974] Trial 13 finished with value: 0.9526254643507694 and parameters: {'rff_on': False, 'sigma': 5.819112790061075, 'num_layers': 1, 'embed_size': 100, 'heads': 5, 'forward_expansion': 6, 'prenorm_on': False, 'mlp_scale_classification': 8, 'learning_rate': 0.00010947451434157283}. Best is trial 12 with value: 0.9592662182125902.


Best trial: 14. Best value: 0.95994:  50%|█████     | 15/30 [21:09:44<19:58:19, 4793.32s/it] 

[I 2023-09-30 15:25:29,518] Trial 14 finished with value: 0.95994033361541 and parameters: {'rff_on': False, 'sigma': 5.557877185420446, 'num_layers': 1, 'embed_size': 100, 'heads': 10, 'forward_expansion': 6, 'prenorm_on': False, 'mlp_scale_classification': 5, 'learning_rate': 0.0009738854295804682}. Best is trial 14 with value: 0.95994033361541.


Best trial: 14. Best value: 0.95994:  53%|█████▎    | 16/30 [22:27:51<18:30:59, 4761.41s/it]

[I 2023-09-30 16:43:36,840] Trial 15 finished with value: 0.955164154272027 and parameters: {'rff_on': False, 'sigma': 6.9593719325173335, 'num_layers': 1, 'embed_size': 100, 'heads': 10, 'forward_expansion': 6, 'prenorm_on': False, 'mlp_scale_classification': 6, 'learning_rate': 0.0012407659685258147}. Best is trial 14 with value: 0.95994033361541.


Best trial: 14. Best value: 0.95994:  57%|█████▋    | 17/30 [23:45:59<17:06:52, 4739.45s/it]

[I 2023-09-30 18:01:45,219] Trial 16 finished with value: 0.9551354685102049 and parameters: {'rff_on': False, 'sigma': 5.196250445924256, 'num_layers': 1, 'embed_size': 100, 'heads': 10, 'forward_expansion': 7, 'prenorm_on': False, 'mlp_scale_classification': 5, 'learning_rate': 0.00022705657381929048}. Best is trial 14 with value: 0.95994033361541.


Best trial: 14. Best value: 0.95994:  60%|██████    | 18/30 [25:06:40<15:53:56, 4769.74s/it]

[I 2023-09-30 19:22:25,470] Trial 17 finished with value: 0.950832604236887 and parameters: {'rff_on': False, 'sigma': 7.081037841930927, 'num_layers': 2, 'embed_size': 120, 'heads': 10, 'forward_expansion': 6, 'prenorm_on': False, 'mlp_scale_classification': 5, 'learning_rate': 0.0028294932214317945}. Best is trial 14 with value: 0.95994033361541.


Best trial: 14. Best value: 0.95994:  63%|██████▎   | 19/30 [26:26:13<14:34:39, 4770.84s/it]

[I 2023-09-30 20:41:58,859] Trial 18 finished with value: 0.9564263277922004 and parameters: {'rff_on': False, 'sigma': 7.8539858105670906, 'num_layers': 1, 'embed_size': 80, 'heads': 5, 'forward_expansion': 7, 'prenorm_on': False, 'mlp_scale_classification': 8, 'learning_rate': 0.0014249986655708697}. Best is trial 14 with value: 0.95994033361541.


Best trial: 14. Best value: 0.95994:  67%|██████▋   | 20/30 [27:46:03<13:16:05, 4776.57s/it]

[I 2023-09-30 22:01:48,796] Trial 19 finished with value: 0.945310595086129 and parameters: {'rff_on': False, 'sigma': 4.976252542487756, 'num_layers': 1, 'embed_size': 100, 'heads': 10, 'forward_expansion': 3, 'prenorm_on': False, 'mlp_scale_classification': 7, 'learning_rate': 0.004423775195538675}. Best is trial 14 with value: 0.95994033361541.


Best trial: 14. Best value: 0.95994:  70%|███████   | 21/30 [29:07:03<12:00:14, 4801.61s/it]

[I 2023-09-30 23:22:48,787] Trial 20 finished with value: 0.9596534759971888 and parameters: {'rff_on': False, 'sigma': 9.978006192694869, 'num_layers': 2, 'embed_size': 100, 'heads': 10, 'forward_expansion': 6, 'prenorm_on': False, 'mlp_scale_classification': 5, 'learning_rate': 0.0009653647909318935}. Best is trial 14 with value: 0.95994033361541.


Best trial: 14. Best value: 0.95994:  73%|███████▎  | 22/30 [30:27:05<10:40:13, 4801.73s/it]

[I 2023-10-01 00:42:50,790] Trial 21 finished with value: 0.9566271281249552 and parameters: {'rff_on': False, 'sigma': 8.58485430590904, 'num_layers': 2, 'embed_size': 100, 'heads': 10, 'forward_expansion': 6, 'prenorm_on': False, 'mlp_scale_classification': 5, 'learning_rate': 0.001048911446210212}. Best is trial 14 with value: 0.95994033361541.


Best trial: 14. Best value: 0.95994:  77%|███████▋  | 23/30 [31:47:23<9:20:46, 4806.66s/it] 

[I 2023-10-01 02:03:08,966] Trial 22 finished with value: 0.9511051189741971 and parameters: {'rff_on': False, 'sigma': 9.796700766760342, 'num_layers': 2, 'embed_size': 100, 'heads': 10, 'forward_expansion': 7, 'prenorm_on': False, 'mlp_scale_classification': 5, 'learning_rate': 0.0023447139133374022}. Best is trial 14 with value: 0.95994033361541.


Best trial: 14. Best value: 0.95994:  80%|████████  | 24/30 [33:08:23<8:02:16, 4822.74s/it]

[I 2023-10-01 03:24:09,222] Trial 23 finished with value: 0.9573442721705082 and parameters: {'rff_on': False, 'sigma': 9.872880800853046, 'num_layers': 2, 'embed_size': 100, 'heads': 10, 'forward_expansion': 5, 'prenorm_on': False, 'mlp_scale_classification': 6, 'learning_rate': 0.0009309100803511188}. Best is trial 14 with value: 0.95994033361541.


Best trial: 14. Best value: 0.95994:  83%|████████▎ | 25/30 [34:29:06<6:42:23, 4828.61s/it]

[I 2023-10-01 04:44:51,518] Trial 24 finished with value: 0.9496851737640022 and parameters: {'rff_on': False, 'sigma': 6.272477676622113, 'num_layers': 2, 'embed_size': 80, 'heads': 10, 'forward_expansion': 6, 'prenorm_on': False, 'mlp_scale_classification': 3, 'learning_rate': 0.0017614775394873645}. Best is trial 14 with value: 0.95994033361541.


Best trial: 14. Best value: 0.95994:  87%|████████▋ | 26/30 [35:48:40<5:20:48, 4812.19s/it]

[I 2023-10-01 06:04:25,398] Trial 25 finished with value: 0.948910658194805 and parameters: {'rff_on': False, 'sigma': 7.531362190319852, 'num_layers': 1, 'embed_size': 60, 'heads': 5, 'forward_expansion': 4, 'prenorm_on': False, 'mlp_scale_classification': 7, 'learning_rate': 0.002788698216050522}. Best is trial 14 with value: 0.95994033361541.


Best trial: 14. Best value: 0.95994:  90%|█████████ | 27/30 [37:09:08<4:00:51, 4817.01s/it]

[I 2023-10-01 07:24:53,642] Trial 26 finished with value: 0.9497999168112907 and parameters: {'rff_on': False, 'sigma': 5.058404885844179, 'num_layers': 2, 'embed_size': 50, 'heads': 10, 'forward_expansion': 8, 'prenorm_on': False, 'mlp_scale_classification': 4, 'learning_rate': 0.0007574980339746953}. Best is trial 14 with value: 0.95994033361541.


Best trial: 14. Best value: 0.95994:  93%|█████████▎| 28/30 [38:28:18<2:39:53, 4796.86s/it]

[I 2023-10-01 08:44:03,489] Trial 27 finished with value: 0.9536007802527215 and parameters: {'rff_on': False, 'sigma': 8.192240487004712, 'num_layers': 1, 'embed_size': 120, 'heads': 5, 'forward_expansion': 7, 'prenorm_on': False, 'mlp_scale_classification': 5, 'learning_rate': 0.00010925562490736495}. Best is trial 14 with value: 0.95994033361541.


Best trial: 14. Best value: 0.95994:  97%|█████████▋| 29/30 [39:48:44<1:20:05, 4805.64s/it]

[I 2023-10-01 10:04:29,634] Trial 28 finished with value: 0.9585921028097704 and parameters: {'rff_on': False, 'sigma': 8.985106716438043, 'num_layers': 2, 'embed_size': 140, 'heads': 10, 'forward_expansion': 6, 'prenorm_on': False, 'mlp_scale_classification': 6, 'learning_rate': 0.0016476413263101287}. Best is trial 14 with value: 0.95994033361541.


Best trial: 14. Best value: 0.95994: 100%|██████████| 30/30 [41:07:55<00:00, 4935.85s/it]  

[I 2023-10-01 11:23:40,912] Trial 29 finished with value: 0.9594526756644339 and parameters: {'rff_on': False, 'sigma': 6.859421898616931, 'num_layers': 1, 'embed_size': 160, 'heads': 5, 'forward_expansion': 5, 'prenorm_on': False, 'mlp_scale_classification': 3, 'learning_rate': 0.0008542519100732909}. Best is trial 14 with value: 0.95994033361541.
Best Hyperparameters: {'rff_on': False, 'sigma': 5.557877185420446, 'num_layers': 1, 'embed_size': 100, 'heads': 10, 'forward_expansion': 6, 'prenorm_on': False, 'mlp_scale_classification': 5, 'learning_rate': 0.0009738854295804682}
Best Validation Accuracy (at Early Stopping): 0.95994033361541



