In [1]:
import torch 
import torch.nn as nn
import torch.nn.functional as F
import lightning.pytorch as pl
import optuna
from optuna.pruners import HyperbandPruner
from optuna.integration import PyTorchLightningPruningCallback
# Session-wide setting: select the "high" float32 matmul precision mode in PyTorch.
torch.set_float32_matmul_precision("high")

In [2]:
from BPE_Encoder import token2id, train_dataset, val_dataset
from Transformer import MLLM

In [None]:
# Load the pre-tokenized training and validation token ids from disk.
#
# The paths deliberately use forward slashes. In a normal (non-raw) string
# literal, "data\train_texttokens.pt" contains the escape "\t" (a TAB) and
# "data\val_texttokens.pt" contains "\v" (a vertical tab), so neither literal
# names the intended file. Forward slashes are accepted on Windows as well as
# on Linux/macOS, so the notebook stays portable.

# Train tokens (loaded onto the CPU, then converted with .tolist()).
train_ids = torch.load("data/train_texttokens.pt", map_location="cpu")
train_ids_stream = train_ids.tolist()
print(len(train_ids_stream))

# Validation tokens (same treatment as the training tokens).
val_ids = torch.load("data/val_texttokens.pt", map_location="cpu")
val_ids_stream = val_ids.tolist()
print(len(val_ids_stream))

2851449
480708


In [4]:

def objective(trial):
    """Optuna objective: train one ``MLLM`` configuration and return its validation loss.

    Hyperparameters are sampled from ``trial``, an ``MLLM`` is built from them and
    fitted with a Lightning ``Trainer`` that carries two callbacks, both watching
    the ``"Val_Loss"`` metric: an Optuna pruning callback and early stopping
    (patience 5, mode "min").

    Args:
        trial: The Optuna trial used to sample hyperparameters and to prune.

    Returns:
        float: ``trainer.callback_metrics["Val_Loss"]`` converted with ``.item()``
        after ``trainer.fit`` has returned.
    """
    # Each trial gets its own seed, derived from the trial number.
    pl.seed_everything(2025 + trial.number, workers=True)

    # ---- search space (the order of the suggest calls is kept as-is) ----
    learning_rate = trial.suggest_float("lr", 1e-4, 1e-3, log=True)
    weight_decay = trial.suggest_float("weight_decay", 1e-6, 1e-2, log=True)
    dropout = trial.suggest_float("dropout_percentage", 0, 0.15)

    n_heads = trial.suggest_categorical("heads", [2,4])
    n_layers = trial.suggest_categorical("num_layers", [2,3,4,6])
    model_dim = trial.suggest_categorical("dim", [48,64])

    label_smoothing = trial.suggest_float("label_smoothing", 0, 0.06)
    warmup_fraction = trial.suggest_float("pct_start", 0.1, 0.5)
    activation = trial.suggest_categorical("activation", ["relu", "gelu"])
    ffn_multiplier = trial.suggest_categorical('ffn_internal', [2,4])

    # ---- model ----
    model = MLLM(
        vocab=len(token2id),
        dim=model_dim,
        pad_idx=token2id['<pad>'],
        max_pos=256,
        # Per-head width: the model dimension split evenly across the heads.
        QKV_dim=model_dim // n_heads,
        heads=n_heads,
        num_layers=n_layers,
        dropout_percentage=dropout,
        learning_rate=learning_rate,
        wd=weight_decay,
        ls=label_smoothing,
        pct_start=warmup_fraction,
        act=activation,
        ffn_internal=ffn_multiplier,
    )

    # ---- callbacks: pruning first, then early stopping (list order preserved) ----
    pruning_callback = PyTorchLightningPruningCallback(trial, monitor="Val_Loss")
    early_stopping = pl.callbacks.EarlyStopping(monitor="Val_Loss", patience=5, mode="min")

    # ---- trainer: no logger and no checkpoints are kept for tuning runs ----
    trainer = pl.Trainer(
        max_epochs=30,
        accelerator="gpu",
        precision="16-mixed",
        gradient_clip_val=1.0,
        callbacks=[pruning_callback, early_stopping],
        logger=False,
        enable_checkpointing=False,
    )

    # NOTE(review): train_dataset / val_dataset are imported from BPE_Encoder and
    # are passed positionally after the model; presumably they are DataLoaders
    # despite their names -- confirm against BPE_Encoder.
    trainer.fit(model, train_dataset, val_dataset)

    # NOTE(review): presumably this is the most recently logged "Val_Loss", not
    # necessarily the lowest one seen during the run -- confirm if that matters.
    return trainer.callback_metrics["Val_Loss"].item()



# Hyperband pruner: resource range 1..30 with a reduction factor of 3.
pruner = HyperbandPruner(
    min_resource=1,
    max_resource=30,
    reduction_factor=3,
)

# Create a study that minimizes the value returned by `objective`,
# then run 25 trials of it.
study = optuna.create_study(pruner=pruner, direction="minimize")
study.optimize(func=objective, n_trials=25)

# Show the hyperparameters of the best trial found.
print("Best trial:", study.best_trial.params)


[I 2025-07-11 19:35:21,879] A new study created in memory with name: no-name-6e9c7620-1c8f-4734-a3bc-2e48a5edd92e
Seed set to 2025
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name       | Type             | Params | Mode 
--------------------------------------------------------
0 | act        | ReLU             | 0      | train
1 | embedtoken | Embedding        | 1.3 M  | train
2 | embedpos   | Embedding        | 16.4 K | train
3 | dropout    | Dropout          | 0      | train
4 | layers     | ModuleList       | 200 K  | train
5 | model_head | Linear           | 1.3 M  | train
6 | loss       | CrossEntropyLoss | 0      | train
--------------------------------------------------------
1.5 M     Trainable params
0         Non-trainable params
1.5 M     Total par

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[I 2025-07-11 19:48:49,336] Trial 0 finished with value: 4.5322041511535645 and parameters: {'lr': 0.00021746823599620488, 'weight_decay': 5.1877394286615515e-05, 'dropout_percentage': 0.10072702360599266, 'heads': 4, 'num_layers': 6, 'dim': 64, 'label_smoothing': 0.0013309586742343237, 'pct_start': 0.338822490334006, 'activation': 'relu', 'ffn_internal': 2}. Best is trial 0 with value: 4.5322041511535645.
Seed set to 2026
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name       | Type             | Params | Mode 
--------------------------------------------------------
0 | act        | GELU             | 0      | train
1 | embedtoken | Embedding        | 961 K  | train
2 | embedpos   | Embedding        | 12.3 K | train
3 | dropout    | Dropout          | 0     

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[I 2025-07-11 20:03:19,610] Trial 1 finished with value: 4.718948841094971 and parameters: {'lr': 0.00020139972145127592, 'weight_decay': 0.00012065133407249857, 'dropout_percentage': 0.013634998500582956, 'heads': 2, 'num_layers': 6, 'dim': 48, 'label_smoothing': 0.031029734371152327, 'pct_start': 0.38174997147680834, 'activation': 'gelu', 'ffn_internal': 2}. Best is trial 0 with value: 4.5322041511535645.
Seed set to 2027
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name       | Type             | Params | Mode 
--------------------------------------------------------
0 | act        | ReLU             | 0      | train
1 | embedtoken | Embedding        | 1.3 M  | train
2 | embedpos   | Embedding        | 16.4 K | train
3 | dropout    | Dropout          | 0    

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=30` reached.
[I 2025-07-11 20:19:30,959] Trial 2 finished with value: 4.667603015899658 and parameters: {'lr': 0.00011501449113485001, 'weight_decay': 0.002948075689238747, 'dropout_percentage': 0.08772043346256737, 'heads': 4, 'num_layers': 6, 'dim': 64, 'label_smoothing': 0.01684875948674721, 'pct_start': 0.457239289574232, 'activation': 'relu', 'ffn_internal': 2}. Best is trial 0 with value: 4.5322041511535645.
Seed set to 2028
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name       | Type             | Params | Mode 
--------------------------------------------------------
0 | act        | ReLU             | 0      | train
1 | embedtoken | Embedding        | 961 K  | train
2 | embedpos   | Embedding        | 12.3 K | train

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=30` reached.
[I 2025-07-11 20:33:39,169] Trial 3 finished with value: 4.636911392211914 and parameters: {'lr': 0.0002776671942077388, 'weight_decay': 0.005116344435400404, 'dropout_percentage': 0.14726821140618984, 'heads': 2, 'num_layers': 4, 'dim': 48, 'label_smoothing': 0.01934340604741214, 'pct_start': 0.2488045078782576, 'activation': 'relu', 'ffn_internal': 4}. Best is trial 0 with value: 4.5322041511535645.
Seed set to 2029
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name       | Type             | Params | Mode 
--------------------------------------------------------
0 | act        | GELU             | 0      | train
1 | embedtoken | Embedding        | 1.3 M  | train
2 | embedpos   | Embedding        | 16.4 K | train

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=30` reached.
[I 2025-07-11 20:49:53,222] Trial 4 finished with value: 4.2817511558532715 and parameters: {'lr': 0.0008846917512346465, 'weight_decay': 0.00029475907910799643, 'dropout_percentage': 0.0010306170289606452, 'heads': 4, 'num_layers': 6, 'dim': 64, 'label_smoothing': 0.019019051540349757, 'pct_start': 0.35980068988151803, 'activation': 'gelu', 'ffn_internal': 4}. Best is trial 4 with value: 4.2817511558532715.
Seed set to 2030
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name       | Type             | Params  | Mode 
---------------------------------------------------------
0 | act        | GELU             | 0       | train
1 | embedtoken | Embedding        | 1.3 M   | train
2 | embedpos   | Embedding        | 16.

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[I 2025-07-11 20:54:30,559] Trial 5 pruned. Trial was pruned at epoch 9.
Seed set to 2031
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name       | Type             | Params | Mode 
--------------------------------------------------------
0 | act        | GELU             | 0      | train
1 | embedtoken | Embedding        | 961 K  | train
2 | embedpos   | Embedding        | 12.3 K | train
3 | dropout    | Dropout          | 0      | train
4 | layers     | ModuleList       | 113 K  | train
5 | model_head | Linear           | 961 K  | train
6 | loss       | CrossEntropyLoss | 0      | train
--------------------------------------------------------
1.1 M     Trainable params
0         Non-trainable params
1.1 M     Total params
4.347     Total estimated model param

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=30` reached.
[I 2025-07-11 21:08:31,407] Trial 6 finished with value: 4.463203430175781 and parameters: {'lr': 0.0005640122583699303, 'weight_decay': 1.3457395837580279e-05, 'dropout_percentage': 0.1336835522972976, 'heads': 2, 'num_layers': 4, 'dim': 48, 'label_smoothing': 0.008661997562690418, 'pct_start': 0.3456905449850259, 'activation': 'gelu', 'ffn_internal': 4}. Best is trial 4 with value: 4.2817511558532715.
Seed set to 2032
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name       | Type             | Params | Mode 
--------------------------------------------------------
0 | act        | ReLU             | 0      | train
1 | embedtoken | Embedding        | 961 K  | train
2 | embedpos   | Embedding        | 12.3 K | tra

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[I 2025-07-11 21:13:20,067] Trial 7 pruned. Trial was pruned at epoch 9.
Seed set to 2033
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name       | Type             | Params | Mode 
--------------------------------------------------------
0 | act        | GELU             | 0      | train
1 | embedtoken | Embedding        | 961 K  | train
2 | embedpos   | Embedding        | 12.3 K | train
3 | dropout    | Dropout          | 0      | train
4 | layers     | ModuleList       | 84.8 K | train
5 | model_head | Linear           | 961 K  | train
6 | loss       | CrossEntropyLoss | 0      | train
--------------------------------------------------------
1.1 M     Trainable params
0         Non-trainable params
1.1 M     Total params
4.234     Total estimated model param

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[I 2025-07-11 21:14:21,841] Trial 8 pruned. Trial was pruned at epoch 1.
Seed set to 2034
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name       | Type             | Params  | Mode 
---------------------------------------------------------
0 | act        | ReLU             | 0       | train
1 | embedtoken | Embedding        | 1.3 M   | train
2 | embedpos   | Embedding        | 16.4 K  | train
3 | dropout    | Dropout          | 0       | train
4 | layers     | ModuleList       | 100.0 K | train
5 | model_head | Linear           | 1.3 M   | train
6 | loss       | CrossEntropyLoss | 0       | train
---------------------------------------------------------
1.4 M     Trainable params
0         Non-trainable params
1.4 M     Total params
5.593     Total estimated m

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[I 2025-07-11 21:22:17,851] Trial 9 finished with value: 4.79119873046875 and parameters: {'lr': 0.0005343630561407035, 'weight_decay': 2.1603690700776898e-06, 'dropout_percentage': 0.014637328824834943, 'heads': 2, 'num_layers': 2, 'dim': 64, 'label_smoothing': 0.03962477166320927, 'pct_start': 0.3444818334864448, 'activation': 'relu', 'ffn_internal': 4}. Best is trial 4 with value: 4.2817511558532715.
Seed set to 2035
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name       | Type             | Params | Mode 
--------------------------------------------------------
0 | act        | GELU             | 0      | train
1 | embedtoken | Embedding        | 1.3 M  | train
2 | embedpos   | Embedding        | 16.4 K | train
3 | dropout    | Dropout          | 0      | 

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=30` reached.
[I 2025-07-11 21:38:29,363] Trial 10 finished with value: 4.550512790679932 and parameters: {'lr': 0.0009913031913405107, 'weight_decay': 0.0005958930748641575, 'dropout_percentage': 0.053794475184024027, 'heads': 4, 'num_layers': 6, 'dim': 64, 'label_smoothing': 0.053259140046544536, 'pct_start': 0.11236768177217438, 'activation': 'gelu', 'ffn_internal': 4}. Best is trial 4 with value: 4.2817511558532715.
Seed set to 2036
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name       | Type             | Params | Mode 
--------------------------------------------------------
0 | act        | GELU             | 0      | train
1 | embedtoken | Embedding        | 961 K  | train
2 | embedpos   | Embedding        | 12.3 K | 

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[I 2025-07-11 21:43:19,884] Trial 11 pruned. Trial was pruned at epoch 9.
Seed set to 2037
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name       | Type             | Params | Mode 
--------------------------------------------------------
0 | act        | GELU             | 0      | train
1 | embedtoken | Embedding        | 961 K  | train
2 | embedpos   | Embedding        | 12.3 K | train
3 | dropout    | Dropout          | 0      | train
4 | layers     | ModuleList       | 113 K  | train
5 | model_head | Linear           | 961 K  | train
6 | loss       | CrossEntropyLoss | 0      | train
--------------------------------------------------------
1.1 M     Trainable params
0         Non-trainable params
1.1 M     Total params
4.347     Total estimated model para

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[I 2025-07-11 21:44:27,358] Trial 12 pruned. Trial was pruned at epoch 1.
Seed set to 2038
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name       | Type             | Params | Mode 
--------------------------------------------------------
0 | act        | GELU             | 0      | train
1 | embedtoken | Embedding        | 1.3 M  | train
2 | embedpos   | Embedding        | 16.4 K | train
3 | dropout    | Dropout          | 0      | train
4 | layers     | ModuleList       | 199 K  | train
5 | model_head | Linear           | 1.3 M  | train
6 | loss       | CrossEntropyLoss | 0      | train
--------------------------------------------------------
1.5 M     Trainable params
0         Non-trainable params
1.5 M     Total params
5.993     Total estimated model para

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[I 2025-07-11 21:57:37,556] Trial 13 pruned. Trial was pruned at epoch 27.
Seed set to 2039
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name       | Type             | Params | Mode 
--------------------------------------------------------
0 | act        | GELU             | 0      | train
1 | embedtoken | Embedding        | 1.3 M  | train
2 | embedpos   | Embedding        | 16.4 K | train
3 | dropout    | Dropout          | 0      | train
4 | layers     | ModuleList       | 299 K  | train
5 | model_head | Linear           | 1.3 M  | train
6 | loss       | CrossEntropyLoss | 0      | train
--------------------------------------------------------
1.6 M     Trainable params
0         Non-trainable params
1.6 M     Total params
6.393     Total estimated model par

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[I 2025-07-11 21:59:44,526] Trial 14 pruned. Trial was pruned at epoch 3.
Seed set to 2040
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name       | Type             | Params | Mode 
--------------------------------------------------------
0 | act        | GELU             | 0      | train
1 | embedtoken | Embedding        | 961 K  | train
2 | embedpos   | Embedding        | 12.3 K | train
3 | dropout    | Dropout          | 0      | train
4 | layers     | ModuleList       | 113 K  | train
5 | model_head | Linear           | 961 K  | train
6 | loss       | CrossEntropyLoss | 0      | train
--------------------------------------------------------
1.1 M     Trainable params
0         Non-trainable params
1.1 M     Total params
4.347     Total estimated model para

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[I 2025-07-11 22:00:50,793] Trial 15 pruned. Trial was pruned at epoch 1.
Seed set to 2041
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name       | Type             | Params | Mode 
--------------------------------------------------------
0 | act        | GELU             | 0      | train
1 | embedtoken | Embedding        | 1.3 M  | train
2 | embedpos   | Embedding        | 16.4 K | train
3 | dropout    | Dropout          | 0      | train
4 | layers     | ModuleList       | 149 K  | train
5 | model_head | Linear           | 1.3 M  | train
6 | loss       | CrossEntropyLoss | 0      | train
--------------------------------------------------------
1.4 M     Trainable params
0         Non-trainable params
1.4 M     Total params
5.793     Total estimated model para

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=30` reached.
[I 2025-07-11 22:14:32,160] Trial 16 finished with value: 4.512755870819092 and parameters: {'lr': 0.0009703416627155193, 'weight_decay': 0.00010745130274326574, 'dropout_percentage': 0.12722766073471703, 'heads': 2, 'num_layers': 3, 'dim': 64, 'label_smoothing': 0.0247968251596083, 'pct_start': 0.4206637083460757, 'activation': 'gelu', 'ffn_internal': 4}. Best is trial 4 with value: 4.2817511558532715.
Seed set to 2042
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name       | Type             | Params | Mode 
--------------------------------------------------------
0 | act        | GELU             | 0      | train
1 | embedtoken | Embedding        | 961 K  | train
2 | embedpos   | Embedding        | 12.3 K | tra

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[I 2025-07-11 22:24:34,153] Trial 17 finished with value: 4.470570087432861 and parameters: {'lr': 0.0006857280709451328, 'weight_decay': 1.0241945068231227e-05, 'dropout_percentage': 0.08067343375330015, 'heads': 4, 'num_layers': 2, 'dim': 48, 'label_smoothing': 0.0005714742130586078, 'pct_start': 0.1955417944847811, 'activation': 'gelu', 'ffn_internal': 4}. Best is trial 4 with value: 4.2817511558532715.
Seed set to 2043
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name       | Type             | Params | Mode 
--------------------------------------------------------
0 | act        | GELU             | 0      | train
1 | embedtoken | Embedding        | 1.3 M  | train
2 | embedpos   | Embedding        | 16.4 K | train
3 | dropout    | Dropout          | 0     

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[I 2025-07-11 22:25:42,944] Trial 18 pruned. Trial was pruned at epoch 1.
Seed set to 2044
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name       | Type             | Params | Mode 
--------------------------------------------------------
0 | act        | GELU             | 0      | train
1 | embedtoken | Embedding        | 961 K  | train
2 | embedpos   | Embedding        | 12.3 K | train
3 | dropout    | Dropout          | 0      | train
4 | layers     | ModuleList       | 169 K  | train
5 | model_head | Linear           | 961 K  | train
6 | loss       | CrossEntropyLoss | 0      | train
--------------------------------------------------------
1.1 M     Trainable params
0         Non-trainable params
1.1 M     Total params
4.573     Total estimated model para

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[I 2025-07-11 22:27:50,290] Trial 19 pruned. Trial was pruned at epoch 3.
Seed set to 2045
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name       | Type             | Params | Mode 
--------------------------------------------------------
0 | act        | GELU             | 0      | train
1 | embedtoken | Embedding        | 1.3 M  | train
2 | embedpos   | Embedding        | 16.4 K | train
3 | dropout    | Dropout          | 0      | train
4 | layers     | ModuleList       | 199 K  | train
5 | model_head | Linear           | 1.3 M  | train
6 | loss       | CrossEntropyLoss | 0      | train
--------------------------------------------------------
1.5 M     Trainable params
0         Non-trainable params
1.5 M     Total params
5.993     Total estimated model para

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[I 2025-07-11 22:29:50,748] Trial 20 pruned. Trial was pruned at epoch 3.
Seed set to 2046
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name       | Type             | Params | Mode 
--------------------------------------------------------
0 | act        | GELU             | 0      | train
1 | embedtoken | Embedding        | 961 K  | train
2 | embedpos   | Embedding        | 12.3 K | train
3 | dropout    | Dropout          | 0      | train
4 | layers     | ModuleList       | 56.5 K | train
5 | model_head | Linear           | 961 K  | train
6 | loss       | CrossEntropyLoss | 0      | train
--------------------------------------------------------
1.0 M     Trainable params
0         Non-trainable params
1.0 M     Total params
4.121     Total estimated model para

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[I 2025-07-11 22:42:31,251] Trial 21 pruned. Trial was pruned at epoch 27.
Seed set to 2047
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name       | Type             | Params | Mode 
--------------------------------------------------------
0 | act        | GELU             | 0      | train
1 | embedtoken | Embedding        | 961 K  | train
2 | embedpos   | Embedding        | 12.3 K | train
3 | dropout    | Dropout          | 0      | train
4 | layers     | ModuleList       | 56.5 K | train
5 | model_head | Linear           | 961 K  | train
6 | loss       | CrossEntropyLoss | 0      | train
--------------------------------------------------------
1.0 M     Trainable params
0         Non-trainable params
1.0 M     Total params
4.121     Total estimated model par

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=30` reached.
[I 2025-07-11 22:56:07,601] Trial 22 finished with value: 4.416449069976807 and parameters: {'lr': 0.0008208450301452836, 'weight_decay': 7.428802847070393e-06, 'dropout_percentage': 0.08128484873544577, 'heads': 4, 'num_layers': 2, 'dim': 48, 'label_smoothing': 0.005097502009981244, 'pct_start': 0.20554149208610936, 'activation': 'gelu', 'ffn_internal': 4}. Best is trial 4 with value: 4.2817511558532715.
Seed set to 2048
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name       | Type             | Params | Mode 
--------------------------------------------------------
0 | act        | GELU             | 0      | train
1 | embedtoken | Embedding        | 961 K  | train
2 | embedpos   | Embedding        | 12.3 K | t

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[I 2025-07-11 22:58:25,874] Trial 23 pruned. Trial was pruned at epoch 1.
Seed set to 2049
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.

  | Name       | Type             | Params | Mode 
--------------------------------------------------------
0 | act        | GELU             | 0      | train
1 | embedtoken | Embedding        | 961 K  | train
2 | embedpos   | Embedding        | 12.3 K | train
3 | dropout    | Dropout          | 0      | train
4 | layers     | ModuleList       | 56.5 K | train
5 | model_head | Linear           | 961 K  | train
6 | loss       | CrossEntropyLoss | 0      | train
--------------------------------------------------------
1.0 M     Trainable params
0         Non-trainable params
1.0 M     Total params
4.121     Total estimated model para

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[I 2025-07-11 22:59:27,055] Trial 24 pruned. Trial was pruned at epoch 1.


Best trial: {'lr': 0.0008846917512346465, 'weight_decay': 0.00029475907910799643, 'dropout_percentage': 0.0010306170289606452, 'heads': 4, 'num_layers': 6, 'dim': 64, 'label_smoothing': 0.019019051540349757, 'pct_start': 0.35980068988151803, 'activation': 'gelu', 'ffn_internal': 4}
