In [2]:
# Imports
import gc
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import BertTokenizer, BertForSequenceClassification, T5Tokenizer, T5ForConditionalGeneration
import optuna
import os
import json
from tqdm import tqdm
import numpy as np
import time
import shutil

2025-04-15 21:44:53.707352: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744753494.002555      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744753494.091067      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# Device setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
print(f"Number of GPUs available: {torch.cuda.device_count()}")

# Load dataset (JSON is UTF-8 by specification, so do not rely on the locale default)
with open('/kaggle/input/moe-dataset/combined_scientific_papers.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Domain to label mapping.
# The domain names are sorted before numbering: iterating a bare set of strings
# has no guaranteed order between interpreter runs, which would silently
# reassign label ids (and therefore expert ids) on every kernel restart.
domain_to_label = {
    domain: idx
    for idx, domain in enumerate(sorted({entry['domain'] for entry in data}))
}
num_labels = len(domain_to_label)
num_experts = num_labels  # one expert per domain
print(f"Number of experts/domains: {num_experts}")

Using device: cuda
Number of GPUs available: 2
Number of experts/domains: 3


In [4]:
class ScientificDataset(Dataset):
    """Dataset of ``(query, label, response)`` triples built from paper records.

    Every record must provide a ``'text'`` and a ``'domain'`` entry. The query is
    the first 100 characters of the text, the label is the integer found for the
    record's domain in ``domain_to_label``, and the response is the full text.
    """

    def __init__(self, data, domain_to_label):
        queries, labels, responses = [], [], []
        for record in data:
            full_text = record['text']
            queries.append(full_text[:100])
            labels.append(domain_to_label[record['domain']])
            responses.append(full_text)
        self.queries = queries
        self.labels = labels
        self.responses = responses

    def __len__(self):
        # One sample per record.
        return len(self.queries)

    def __getitem__(self, idx):
        query = self.queries[idx]
        label = self.labels[idx]
        response = self.responses[idx]
        return query, label, response

# Collate functions
def gating_collate_fn(batch):
    """Collate ``(query, label, response)`` samples for the gating classifier.

    The queries are tokenized together with the module-level ``bert_tokenizer``
    (padded, truncated, returned as PyTorch tensors); the responses are dropped.

    Returns:
        A ``(tokenized_queries, label_tensor)`` pair.
    """
    query_column, label_column, _unused_responses = zip(*batch)
    encoded_queries = bert_tokenizer(
        list(query_column),
        padding=True,
        truncation=True,
        return_tensors='pt',
    )
    label_tensor = torch.tensor(label_column)
    return encoded_queries, label_tensor

def expert_collate_fn(batch):
    """Collate ``(query, response)`` pairs for an expert model.

    Both sides are tokenized with the module-level ``t5_tokenizer`` (padded,
    truncated, returned as PyTorch tensors).

    Returns:
        A ``(tokenized_queries, response_input_ids)`` pair; only the token ids of
        the responses are kept.
    """
    query_column, response_column = zip(*batch)
    tokenizer_options = dict(padding=True, truncation=True, return_tensors='pt')
    encoded_queries = t5_tokenizer(list(query_column), **tokenizer_options)
    encoded_responses = t5_tokenizer(list(response_column), **tokenizer_options)
    return encoded_queries, encoded_responses['input_ids']

# Expert dataset
class TextDataset(Dataset):
    """Minimal ``Dataset`` adapter around an already-indexable collection.

    Items are handed back exactly as stored; the wrapped collection is kept by
    reference in ``self.data``.
    """

    def __init__(self, data):
        self.data = data

    def __len__(self):
        item_count = len(self.data)
        return item_count

    def __getitem__(self, idx):
        item = self.data[idx]
        return item

# Extract expert data
def get_expert_data(dataset, expert_id):
    """Collect the ``(query, response)`` pairs whose label equals ``expert_id``.

    ``dataset`` is iterated once and must yield ``(query, label, response)``
    triples; the original order of the matching samples is preserved.
    """
    selected_pairs = []
    for query, label, response in dataset:
        if label == expert_id:
            selected_pairs.append((query, response))
    return selected_pairs

# Training functions with gradient accumulation
def train_gating_model(model, train_loader, val_loader, lr, epochs, accumulation_steps=4):
    """Fine-tune the gating classifier using gradient accumulation.

    Args:
        model: Classifier whose forward accepts ``input_ids`` and
            ``attention_mask`` keyword arguments and returns an object exposing
            ``.logits``.
        train_loader: Sized iterable of ``(inputs, labels)`` batches, where
            ``inputs`` is a mapping holding ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        val_loader: Not used by this function; kept so existing callers keep
            working.
        lr: Learning rate passed to AdamW.
        epochs: Number of passes over ``train_loader``.
        accumulation_steps: Number of batches whose gradients are summed before
            each optimizer step.

    Returns:
        The model wrapped in ``nn.DataParallel`` and moved to the module-level
        ``device``.
    """
    optimizer = optim.AdamW(model.parameters(), lr=lr)
    model = nn.DataParallel(model)
    model.to(device)
    loss_fct = nn.CrossEntropyLoss()
    num_batches = len(train_loader)
    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        optimizer.zero_grad()
        for i, (inputs, labels) in enumerate(train_loader):
            inputs = {k: v.to(device) for k, v in inputs.items()}
            labels = labels.to(device)
            outputs = model(input_ids=inputs['input_ids'],
                            attention_mask=inputs['attention_mask'])
            loss = loss_fct(outputs.logits, labels)
            total_loss += loss.item()
            # The final accumulation group of an epoch may hold fewer than
            # `accumulation_steps` batches; average over its real size.
            group_start = (i // accumulation_steps) * accumulation_steps
            group_size = min(accumulation_steps, num_batches - group_start)
            (loss / group_size).backward()
            # Also step on the last batch: otherwise the trailing gradients are
            # wiped by the next zero_grad(), and a loader with fewer than
            # `accumulation_steps` batches would never update the model at all.
            if (i + 1) % accumulation_steps == 0 or (i + 1) == num_batches:
                optimizer.step()
                optimizer.zero_grad()
        avg_loss = total_loss / num_batches if num_batches else float('nan')
        print(f"Gating Epoch {epoch+1}, Train Loss: {avg_loss:.4f}")
        gc.collect()
        torch.cuda.empty_cache()
    return model

def train_expert(model, train_loader, val_loader, lr, epochs, expert_id, accumulation_steps=4):
    """Fine-tune one expert sequence-to-sequence model using gradient accumulation.

    Args:
        model: Model exposing ``config.pad_token_id`` whose forward accepts
            ``input_ids``, ``attention_mask`` and ``decoder_input_ids`` keyword
            arguments and returns an object exposing ``.logits``.
        train_loader: Sized iterable of ``(inputs, labels)`` batches, where
            ``inputs`` is a mapping holding ``'input_ids'`` and
            ``'attention_mask'`` tensors and ``labels`` holds target token ids.
        val_loader: Not used by this function; kept so existing callers keep
            working.
        lr: Learning rate passed to AdamW.
        epochs: Number of passes over ``train_loader``.
        expert_id: Only used to tag the progress messages.
        accumulation_steps: Number of batches whose gradients are summed before
            each optimizer step.

    Returns:
        The model wrapped in ``nn.DataParallel`` and moved to the module-level
        ``device``.
    """
    optimizer = optim.AdamW(model.parameters(), lr=lr)
    model = nn.DataParallel(model)
    model.to(device)
    pad_token_id = model.module.config.pad_token_id
    # Padded target positions do not contribute to the loss.
    loss_fct = nn.CrossEntropyLoss(ignore_index=pad_token_id)
    num_batches = len(train_loader)
    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        optimizer.zero_grad()
        for i, (inputs, labels) in enumerate(train_loader):
            inputs = {k: v.to(device) for k, v in inputs.items()}
            labels = labels.to(device)
            # Teacher forcing: feed target tokens [0, T-1) and predict [1, T).
            # NOTE(review): no decoder start token is prepended here — confirm
            # this matches how the experts are decoded at inference time.
            decoder_input_ids = labels[:, :-1].clone()
            target_labels = labels[:, 1:].clone()
            outputs = model(input_ids=inputs['input_ids'],
                            attention_mask=inputs['attention_mask'],
                            decoder_input_ids=decoder_input_ids)
            logits = outputs.logits
            loss = loss_fct(logits.view(-1, logits.size(-1)), target_labels.view(-1))
            total_loss += loss.item()
            # The final accumulation group of an epoch may hold fewer than
            # `accumulation_steps` batches; average over its real size.
            group_start = (i // accumulation_steps) * accumulation_steps
            group_size = min(accumulation_steps, num_batches - group_start)
            (loss / group_size).backward()
            # Also step on the last batch: otherwise the trailing gradients are
            # wiped by the next zero_grad(), and a loader with fewer than
            # `accumulation_steps` batches would never update the model at all.
            if (i + 1) % accumulation_steps == 0 or (i + 1) == num_batches:
                optimizer.step()
                optimizer.zero_grad()
        avg_loss = total_loss / num_batches if num_batches else float('nan')
        print(f"Expert {expert_id} Epoch {epoch+1}, Train Loss: {avg_loss:.4f}")
        gc.collect()
        torch.cuda.empty_cache()
    return model

# Evaluation functions
def evaluate_gating_model(model, val_loader):
    """Compute the classification accuracy of the gating model.

    Args:
        model: Classifier whose forward accepts ``input_ids`` and
            ``attention_mask`` keyword arguments and returns an object exposing
            ``.logits``.
        val_loader: Iterable of ``(inputs, labels)`` batches, where ``inputs`` is
            a mapping holding ``'input_ids'`` and ``'attention_mask'`` tensors.

    Returns:
        Fraction of correctly classified samples, or ``0.0`` when ``val_loader``
        yields no samples (the same convention ``evaluate_expert`` uses for an
        empty loader, instead of dividing by zero).
    """
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs = {k: v.to(device) for k, v in inputs.items()}
            labels = labels.to(device)
            outputs = model(input_ids=inputs['input_ids'],
                            attention_mask=inputs['attention_mask'])
            # Index of the highest logit per sample is the predicted class.
            _, predicted = torch.max(outputs.logits, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    if total == 0:
        return 0.0
    return correct / total

def evaluate_expert(model, val_loader):
    """Compute the mean per-batch validation loss of an expert model.

    Args:
        model: Sequence-to-sequence model, either bare or wrapped (e.g. in
            ``nn.DataParallel``) so that the real model sits behind ``.module``.
            It must expose ``config.pad_token_id`` and its forward must accept
            ``input_ids``, ``attention_mask`` and ``decoder_input_ids``.
        val_loader: Sized iterable of ``(inputs, labels)`` batches.

    Returns:
        Average of the per-batch cross-entropy losses (padding ignored), or
        ``0.0`` when ``val_loader`` is empty.
    """
    if len(val_loader) == 0:
        return 0.0
    model.eval()
    total_loss = 0
    # A wrapper such as DataParallel hides `config` behind `.module`; fall back
    # to the model itself so unwrapped experts can be evaluated too.
    base_model = getattr(model, 'module', model)
    pad_token_id = base_model.config.pad_token_id
    loss_fct = nn.CrossEntropyLoss(ignore_index=pad_token_id)
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs = {k: v.to(device) for k, v in inputs.items()}
            labels = labels.to(device)
            # Same shifting as in training: feed tokens [0, T-1), predict [1, T).
            decoder_input_ids = labels[:, :-1].clone()
            target_labels = labels[:, 1:].clone()
            outputs = model(input_ids=inputs['input_ids'],
                            attention_mask=inputs['attention_mask'],
                            decoder_input_ids=decoder_input_ids)
            logits = outputs.logits
            loss = loss_fct(logits.view(-1, logits.size(-1)), target_labels.view(-1))
            total_loss += loss.item()
    return total_loss / len(val_loader)

# Optuna objective
def objective(trial):
    """Optuna objective: train one gating model plus one expert per domain.

    Samples learning rates and epoch counts for the gating model and the
    experts, trains them, stores their weights under ``trial_<number>/`` and
    records the metrics and file locations as trial user attributes.

    Relies on module-level state: ``num_experts``, ``device``,
    ``train_loader_gating``, ``val_loader_gating``, ``train_expert_data`` and
    ``val_expert_data``.

    Args:
        trial: The Optuna trial used to sample hyperparameters and store
            user attributes.

    Returns:
        ``gating_accuracy - 0.1 * avg_expert_loss`` (higher is better).
    """
    trial_dir = f'trial_{trial.number}'
    os.makedirs(trial_dir, exist_ok=True)

    gating_lr = trial.suggest_float('gating_lr', 1e-5, 1e-3, log=True)
    gating_epochs = trial.suggest_int('gating_epochs', 3, 10)
    expert_lr = trial.suggest_float('expert_lr', 1e-5, 1e-3, log=True)
    expert_epochs = trial.suggest_int('expert_epochs', 3, 10)

    gating_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_experts).to(device)
    gating_model = train_gating_model(gating_model, train_loader_gating, val_loader_gating, gating_lr, gating_epochs)
    gating_accuracy = evaluate_gating_model(gating_model, val_loader_gating)

    # NOTE(review): the trained model comes back wrapped in nn.DataParallel, so
    # the saved state dict keys carry the wrapper's "module." prefix.
    gating_model_path = os.path.join(trial_dir, 'gating.pt')
    torch.save(gating_model.state_dict(), gating_model_path)
    # Release GPU memory before the experts are trained.
    gating_model.to('cpu')
    torch.cuda.empty_cache()
    del gating_model

    experts_paths = {}
    expert_losses = []
    for expert_id in range(num_experts):
        train_data = train_expert_data[expert_id]
        val_data = val_expert_data[expert_id]
        if not train_data:
            # Nothing to train on for this domain.
            continue
        expert_model = T5ForConditionalGeneration.from_pretrained('t5-small').to(device)
        train_dataset_expert = TextDataset(train_data)
        val_dataset_expert = TextDataset(val_data)
        train_loader_expert = DataLoader(train_dataset_expert, batch_size=2, shuffle=True, collate_fn=expert_collate_fn)
        val_loader_expert = DataLoader(val_dataset_expert, batch_size=2, shuffle=False, collate_fn=expert_collate_fn)
        expert_model = train_expert(expert_model, train_loader_expert, val_loader_expert, expert_lr, expert_epochs, expert_id)
        # An expert without validation samples has no measurable loss.
        # evaluate_expert reports 0.0 for an empty loader, which would count as
        # a perfect score and inflate the trial's metric, so leave it out of
        # the average instead.
        if val_data:
            loss = evaluate_expert(expert_model, val_loader_expert)
            expert_losses.append(loss)

        expert_path = os.path.join(trial_dir, f'expert_{expert_id}.pt')
        torch.save(expert_model.state_dict(), expert_path)
        experts_paths[expert_id] = expert_path
        # Release GPU memory before the next expert is loaded.
        expert_model.to('cpu')
        torch.cuda.empty_cache()
        del expert_model

    avg_expert_loss = sum(expert_losses) / len(expert_losses) if expert_losses else 0
    # Reward routing accuracy, penalise expert loss with a fixed 0.1 weight.
    combined_metric = gating_accuracy - 0.1 * avg_expert_loss

    trial.set_user_attr('gating_accuracy', gating_accuracy)
    trial.set_user_attr('avg_expert_loss', avg_expert_loss)
    trial.set_user_attr('trial_dir', trial_dir)
    trial.set_user_attr('experts_paths', experts_paths)

    return combined_metric

In [5]:
if __name__ == "__main__":
    # Driver: split the data, run the Optuna study, then keep the five best
    # trials as MOE_1..MOE_5 and delete the working directories of the rest.
    # The names assigned here are module-level globals that the collate
    # functions and `objective` read, so they must keep these exact names.
    SPLIT_SEED = 42  # fixed seed so the train/validation split is reproducible

    full_dataset = ScientificDataset(data, domain_to_label)
    train_size = int(0.8 * len(full_dataset))
    val_size = len(full_dataset) - train_size
    train_dataset, val_dataset = random_split(
        full_dataset,
        [train_size, val_size],
        generator=torch.Generator().manual_seed(SPLIT_SEED),
    )

    bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    t5_tokenizer = T5Tokenizer.from_pretrained('t5-small', legacy=False)

    train_loader_gating = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=gating_collate_fn)
    val_loader_gating = DataLoader(val_dataset, batch_size=8, shuffle=False, collate_fn=gating_collate_fn)

    # Per-expert (query, response) pairs, keyed by expert id.
    train_expert_data = {i: get_expert_data(train_dataset, i) for i in range(num_experts)}
    val_expert_data = {i: get_expert_data(val_dataset, i) for i in range(num_experts)}

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=20)

    print("\nBenchmarking Top 5 Configurations:")
    # Trials without a value (failed or pruned) cannot be ranked; sorting them
    # together with finished trials would raise a TypeError on None.
    scored_trials = [t for t in study.trials if t.value is not None]
    top_trials = sorted(scored_trials, key=lambda t: t.value, reverse=True)[:5]
    for i, trial in enumerate(top_trials):
        print(f"\nMOE {i+1}:")
        print(f"  Trial Number: {trial.number}")
        print(f"  Combined Metric: {trial.value:.4f}")
        print(f"  Gating Accuracy: {trial.user_attrs['gating_accuracy']:.4f}")
        print(f"  Avg Expert Loss: {trial.user_attrs['avg_expert_loss']:.4f}")
        print(f"  Hyperparameters: {trial.params}")

    # Copy the weights of each top trial into its own MOE_<rank> directory,
    # together with its metrics and hyperparameters.
    for i, trial in enumerate(top_trials):
        moe_dir = f'MOE_{i+1}'
        os.makedirs(moe_dir, exist_ok=True)
        trial_dir = trial.user_attrs['trial_dir']

        shutil.copy(os.path.join(trial_dir, 'gating.pt'), os.path.join(moe_dir, 'gating.pt'))

        experts_paths = trial.user_attrs['experts_paths']
        for expert_id, expert_path in experts_paths.items():
            shutil.copy(expert_path, os.path.join(moe_dir, f'expert_{expert_id}.pt'))

        metrics = {
            'gating_accuracy': trial.user_attrs['gating_accuracy'],
            'avg_expert_loss': trial.user_attrs['avg_expert_loss'],
            'combined_metric': trial.value
        }
        with open(os.path.join(moe_dir, 'metrics.json'), 'w') as f:
            json.dump(metrics, f)
        with open(os.path.join(moe_dir, 'hyperparams.json'), 'w') as f:
            json.dump(trial.params, f)

    # Remove the working directories of every trial that did not make the top 5.
    top_trial_numbers = {trial.number for trial in top_trials}
    for trial in study.trials:
        if trial.number not in top_trial_numbers:
            trial_dir = trial.user_attrs.get('trial_dir')
            if trial_dir and os.path.exists(trial_dir):
                shutil.rmtree(trial_dir)

    print("\nTop 5 MOE models saved.")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

[I 2025-04-15 21:45:08,933] A new study created in memory with name: no-name-24f4c29d-9c17-4f33-8228-49fd2ec7feed


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Gating Epoch 1, Train Loss: 1.2060
Gating Epoch 2, Train Loss: 1.2355
Gating Epoch 3, Train Loss: 1.1713
Gating Epoch 4, Train Loss: 0.9235
Gating Epoch 5, Train Loss: 1.1551
Gating Epoch 6, Train Loss: 0.9173
Gating Epoch 7, Train Loss: 0.9830
Gating Epoch 8, Train Loss: 0.9875
Gating Epoch 9, Train Loss: 1.1635


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Expert 0 Epoch 1, Train Loss: 8.2312
Expert 0 Epoch 2, Train Loss: 7.7561
Expert 0 Epoch 3, Train Loss: 7.4886
Expert 0 Epoch 4, Train Loss: 7.8170
Expert 0 Epoch 5, Train Loss: 8.2777
Expert 0 Epoch 6, Train Loss: 9.2387
Expert 0 Epoch 7, Train Loss: 8.2158
Expert 0 Epoch 8, Train Loss: 8.2186
Expert 0 Epoch 9, Train Loss: 8.4419
Expert 0 Epoch 10, Train Loss: 8.0054
Expert 1 Epoch 1, Train Loss: 9.1898
Expert 1 Epoch 2, Train Loss: 5.7921
Expert 1 Epoch 3, Train Loss: 5.7660
Expert 1 Epoch 4, Train Loss: 5.4415
Expert 1 Epoch 5, Train Loss: 5.3396
Expert 1 Epoch 6, Train Loss: 5.3139
Expert 1 Epoch 7, Train Loss: 5.2141
Expert 1 Epoch 8, Train Loss: 5.1516
Expert 1 Epoch 9, Train Loss: 5.0151
Expert 1 Epoch 10, Train Loss: 5.0468
Expert 2 Epoch 1, Train Loss: 9.2142
Expert 2 Epoch 2, Train Loss: 8.5644
Expert 2 Epoch 3, Train Loss: 8.8171
Expert 2 Epoch 4, Train Loss: 8.7581
Expert 2 Epoch 5, Train Loss: 9.2824
Expert 2 Epoch 6, Train Loss: 8.2846
Expert 2 Epoch 7, Train Loss: 8.6929

[I 2025-04-15 21:45:56,393] Trial 0 finished with value: -0.2897994836171468 and parameters: {'gating_lr': 0.0009911161304626812, 'gating_epochs': 9, 'expert_lr': 0.0005427763810654496, 'expert_epochs': 10}. Best is trial 0 with value: -0.2897994836171468.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Gating Epoch 1, Train Loss: 1.1303
Gating Epoch 2, Train Loss: 1.0080
Gating Epoch 3, Train Loss: 1.0009
Gating Epoch 4, Train Loss: 1.1083
Expert 0 Epoch 1, Train Loss: 7.6656
Expert 0 Epoch 2, Train Loss: 8.3227
Expert 0 Epoch 3, Train Loss: 7.6131
Expert 0 Epoch 4, Train Loss: 7.7927
Expert 1 Epoch 1, Train Loss: 9.1275
Expert 1 Epoch 2, Train Loss: 8.3206
Expert 1 Epoch 3, Train Loss: 7.7673
Expert 1 Epoch 4, Train Loss: 7.2721
Expert 2 Epoch 1, Train Loss: 8.7710
Expert 2 Epoch 2, Train Loss: 8.7408
Expert 2 Epoch 3, Train Loss: 9.0971
Expert 2 Epoch 4, Train Loss: 8.6869


[I 2025-04-15 21:46:15,400] Trial 1 finished with value: -0.31526645024617517 and parameters: {'gating_lr': 0.000506561536060624, 'gating_epochs': 4, 'expert_lr': 3.393877654448231e-05, 'expert_epochs': 4}. Best is trial 0 with value: -0.2897994836171468.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Gating Epoch 1, Train Loss: 1.2853
Gating Epoch 2, Train Loss: 1.2600
Gating Epoch 3, Train Loss: 1.1154
Gating Epoch 4, Train Loss: 1.3207
Gating Epoch 5, Train Loss: 1.2868
Gating Epoch 6, Train Loss: 1.3152
Gating Epoch 7, Train Loss: 1.1370
Expert 0 Epoch 1, Train Loss: 8.8826
Expert 0 Epoch 2, Train Loss: 7.7605
Expert 0 Epoch 3, Train Loss: 8.3009
Expert 0 Epoch 4, Train Loss: 7.9384
Expert 0 Epoch 5, Train Loss: 7.7097
Expert 0 Epoch 6, Train Loss: 8.2612
Expert 0 Epoch 7, Train Loss: 8.8336
Expert 0 Epoch 8, Train Loss: 7.6416
Expert 0 Epoch 9, Train Loss: 7.9992
Expert 1 Epoch 1, Train Loss: 10.2315
Expert 1 Epoch 2, Train Loss: 6.1280
Expert 1 Epoch 3, Train Loss: 5.9215
Expert 1 Epoch 4, Train Loss: 5.7991
Expert 1 Epoch 5, Train Loss: 5.6710
Expert 1 Epoch 6, Train Loss: 5.6090
Expert 1 Epoch 7, Train Loss: 5.4465
Expert 1 Epoch 8, Train Loss: 5.3993
Expert 1 Epoch 9, Train Loss: 5.2974
Expert 2 Epoch 1, Train Loss: 8.2756
Expert 2 Epoch 2, Train Loss: 10.2850
Expert 2 Epoc

[I 2025-04-15 21:46:52,077] Trial 2 finished with value: -0.29424751599629717 and parameters: {'gating_lr': 0.0003180065817088647, 'gating_epochs': 7, 'expert_lr': 0.0002766300816983877, 'expert_epochs': 9}. Best is trial 0 with value: -0.2897994836171468.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Gating Epoch 1, Train Loss: 1.0681
Gating Epoch 2, Train Loss: 1.0708
Gating Epoch 3, Train Loss: 1.1274
Gating Epoch 4, Train Loss: 1.1591
Gating Epoch 5, Train Loss: 1.0243
Gating Epoch 6, Train Loss: 1.1033
Gating Epoch 7, Train Loss: 1.1006
Expert 0 Epoch 1, Train Loss: 8.3030
Expert 0 Epoch 2, Train Loss: 8.0816
Expert 0 Epoch 3, Train Loss: 8.5177
Expert 0 Epoch 4, Train Loss: 8.2028
Expert 1 Epoch 1, Train Loss: 9.5439
Expert 1 Epoch 2, Train Loss: 10.0005
Expert 1 Epoch 3, Train Loss: 7.8773
Expert 1 Epoch 4, Train Loss: 8.0169
Expert 2 Epoch 1, Train Loss: 8.9574
Expert 2 Epoch 2, Train Loss: 9.2676
Expert 2 Epoch 3, Train Loss: 8.7881
Expert 2 Epoch 4, Train Loss: 9.4242


[I 2025-04-15 21:47:13,430] Trial 3 finished with value: -0.5206979910532634 and parameters: {'gating_lr': 1.654524998237207e-05, 'gating_epochs': 7, 'expert_lr': 1.856809070715814e-05, 'expert_epochs': 4}. Best is trial 0 with value: -0.2897994836171468.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Gating Epoch 1, Train Loss: 1.0548
Gating Epoch 2, Train Loss: 1.1568
Gating Epoch 3, Train Loss: 1.0073
Gating Epoch 4, Train Loss: 1.1348
Expert 0 Epoch 1, Train Loss: 8.5807
Expert 0 Epoch 2, Train Loss: 7.6978
Expert 0 Epoch 3, Train Loss: 8.1332
Expert 0 Epoch 4, Train Loss: 8.2297
Expert 0 Epoch 5, Train Loss: 9.4223
Expert 0 Epoch 6, Train Loss: 7.8461
Expert 0 Epoch 7, Train Loss: 7.5361
Expert 1 Epoch 1, Train Loss: 8.8145
Expert 1 Epoch 2, Train Loss: 8.6456
Expert 1 Epoch 3, Train Loss: 7.6634
Expert 1 Epoch 4, Train Loss: 7.1211
Expert 1 Epoch 5, Train Loss: 6.8866
Expert 1 Epoch 6, Train Loss: 6.4203
Expert 1 Epoch 7, Train Loss: 6.2186
Expert 2 Epoch 1, Train Loss: 9.7169
Expert 2 Epoch 2, Train Loss: 8.9009
Expert 2 Epoch 3, Train Loss: 8.6333
Expert 2 Epoch 4, Train Loss: 8.9676
Expert 2 Epoch 5, Train Loss: 9.1477
Expert 2 Epoch 6, Train Loss: 8.4958
Expert 2 Epoch 7, Train Loss: 9.0224


[I 2025-04-15 21:47:41,889] Trial 4 finished with value: -0.5084701379140217 and parameters: {'gating_lr': 1.027185267388475e-05, 'gating_epochs': 4, 'expert_lr': 4.2237230602717784e-05, 'expert_epochs': 7}. Best is trial 0 with value: -0.2897994836171468.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Gating Epoch 1, Train Loss: 1.2152
Gating Epoch 2, Train Loss: 1.1378
Gating Epoch 3, Train Loss: 1.1765
Gating Epoch 4, Train Loss: 1.1772
Gating Epoch 5, Train Loss: 1.1087
Gating Epoch 6, Train Loss: 1.0621
Gating Epoch 7, Train Loss: 1.1538
Gating Epoch 8, Train Loss: 1.1559
Gating Epoch 9, Train Loss: 1.2132
Gating Epoch 10, Train Loss: 1.1737
Expert 0 Epoch 1, Train Loss: 7.6325
Expert 0 Epoch 2, Train Loss: 8.2443
Expert 0 Epoch 3, Train Loss: 7.2584
Expert 0 Epoch 4, Train Loss: 7.6072
Expert 0 Epoch 5, Train Loss: 8.4710
Expert 0 Epoch 6, Train Loss: 8.8078
Expert 0 Epoch 7, Train Loss: 7.5961
Expert 0 Epoch 8, Train Loss: 8.7542
Expert 0 Epoch 9, Train Loss: 7.6933
Expert 0 Epoch 10, Train Loss: 8.5945
Expert 1 Epoch 1, Train Loss: 8.7822
Expert 1 Epoch 2, Train Loss: 6.0136
Expert 1 Epoch 3, Train Loss: 5.6983
Expert 1 Epoch 4, Train Loss: 5.4442
Expert 1 Epoch 5, Train Loss: 5.4564
Expert 1 Epoch 6, Train Loss: 5.2446
Expert 1 Epoch 7, Train Loss: 5.2197
Expert 1 Epoch 8, T

[I 2025-04-15 21:48:23,632] Trial 5 finished with value: -0.28928737640380864 and parameters: {'gating_lr': 0.00040774988128485385, 'gating_epochs': 10, 'expert_lr': 0.0008169895028253811, 'expert_epochs': 10}. Best is trial 5 with value: -0.28928737640380864.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Gating Epoch 1, Train Loss: 1.1421
Gating Epoch 2, Train Loss: 1.0873
Gating Epoch 3, Train Loss: 1.1492
Gating Epoch 4, Train Loss: 1.1058
Expert 0 Epoch 1, Train Loss: 7.6777
Expert 0 Epoch 2, Train Loss: 8.3658
Expert 0 Epoch 3, Train Loss: 8.1979
Expert 0 Epoch 4, Train Loss: 8.0754
Expert 0 Epoch 5, Train Loss: 7.6848
Expert 0 Epoch 6, Train Loss: 8.2257
Expert 0 Epoch 7, Train Loss: 8.0889
Expert 0 Epoch 8, Train Loss: 8.8126
Expert 0 Epoch 9, Train Loss: 8.5274
Expert 0 Epoch 10, Train Loss: 7.6637
Expert 1 Epoch 1, Train Loss: 8.7390
Expert 1 Epoch 2, Train Loss: 5.9587
Expert 1 Epoch 3, Train Loss: 5.6661
Expert 1 Epoch 4, Train Loss: 5.5740
Expert 1 Epoch 5, Train Loss: 5.4383
Expert 1 Epoch 6, Train Loss: 5.2106
Expert 1 Epoch 7, Train Loss: 5.1748
Expert 1 Epoch 8, Train Loss: 5.1967
Expert 1 Epoch 9, Train Loss: 5.0256
Expert 1 Epoch 10, Train Loss: 5.0589
Expert 2 Epoch 1, Train Loss: 8.4180
Expert 2 Epoch 2, Train Loss: 8.8140
Expert 2 Epoch 3, Train Loss: 9.3249
Expert 

[I 2025-04-15 21:49:01,203] Trial 6 finished with value: -0.48967701594034835 and parameters: {'gating_lr': 0.0003836214473827223, 'gating_epochs': 4, 'expert_lr': 0.00043798355049078384, 'expert_epochs': 10}. Best is trial 5 with value: -0.28928737640380864.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Gating Epoch 1, Train Loss: 1.2126
Gating Epoch 2, Train Loss: 1.0999
Gating Epoch 3, Train Loss: 1.0643
Gating Epoch 4, Train Loss: 1.1759
Gating Epoch 5, Train Loss: 1.1019
Gating Epoch 6, Train Loss: 1.0282
Gating Epoch 7, Train Loss: 1.0804
Expert 0 Epoch 1, Train Loss: 7.8414
Expert 0 Epoch 2, Train Loss: 8.3951
Expert 0 Epoch 3, Train Loss: 7.6248
Expert 1 Epoch 1, Train Loss: 8.9294
Expert 1 Epoch 2, Train Loss: 8.0333
Expert 1 Epoch 3, Train Loss: 7.2104
Expert 2 Epoch 1, Train Loss: 9.0886
Expert 2 Epoch 2, Train Loss: 9.0734
Expert 2 Epoch 3, Train Loss: 9.2205


[I 2025-04-15 21:49:19,724] Trial 7 finished with value: -0.3131669203440349 and parameters: {'gating_lr': 4.296749905535592e-05, 'gating_epochs': 7, 'expert_lr': 6.058681485871738e-05, 'expert_epochs': 3}. Best is trial 5 with value: -0.28928737640380864.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Gating Epoch 1, Train Loss: 0.9881
Gating Epoch 2, Train Loss: 1.0262
Gating Epoch 3, Train Loss: 1.0384
Expert 0 Epoch 1, Train Loss: 7.5086
Expert 0 Epoch 2, Train Loss: 7.1770
Expert 0 Epoch 3, Train Loss: 8.8499
Expert 0 Epoch 4, Train Loss: 8.4360
Expert 0 Epoch 5, Train Loss: 8.2254
Expert 1 Epoch 1, Train Loss: 8.9629
Expert 1 Epoch 2, Train Loss: 8.7145
Expert 1 Epoch 3, Train Loss: 8.4195
Expert 1 Epoch 4, Train Loss: 8.1604
Expert 1 Epoch 5, Train Loss: 8.8194
Expert 2 Epoch 1, Train Loss: 9.1794
Expert 2 Epoch 2, Train Loss: 8.7201
Expert 2 Epoch 3, Train Loss: 8.5351
Expert 2 Epoch 4, Train Loss: 8.3182
Expert 2 Epoch 5, Train Loss: 8.4079


[I 2025-04-15 21:49:41,611] Trial 8 finished with value: -0.3215205669403076 and parameters: {'gating_lr': 3.716219741163235e-05, 'gating_epochs': 3, 'expert_lr': 1.3429533478405216e-05, 'expert_epochs': 5}. Best is trial 5 with value: -0.28928737640380864.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Gating Epoch 1, Train Loss: 1.2183
Gating Epoch 2, Train Loss: 1.0117
Gating Epoch 3, Train Loss: 1.1764
Gating Epoch 4, Train Loss: 1.0414
Gating Epoch 5, Train Loss: 1.0853
Gating Epoch 6, Train Loss: 1.1459
Expert 0 Epoch 1, Train Loss: 7.9300
Expert 0 Epoch 2, Train Loss: 7.5311
Expert 0 Epoch 3, Train Loss: 7.1269
Expert 0 Epoch 4, Train Loss: 8.1766
Expert 1 Epoch 1, Train Loss: 8.9009
Expert 1 Epoch 2, Train Loss: 9.0852
Expert 1 Epoch 3, Train Loss: 8.5183
Expert 1 Epoch 4, Train Loss: 7.8121
Expert 2 Epoch 1, Train Loss: 9.6349
Expert 2 Epoch 2, Train Loss: 9.4274
Expert 2 Epoch 3, Train Loss: 8.5067
Expert 2 Epoch 4, Train Loss: 8.9772


[I 2025-04-15 21:50:02,214] Trial 9 finished with value: -0.5259895006815594 and parameters: {'gating_lr': 0.0003021363581110624, 'gating_epochs': 6, 'expert_lr': 1.2429756702914153e-05, 'expert_epochs': 4}. Best is trial 5 with value: -0.28928737640380864.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Gating Epoch 1, Train Loss: 1.1132
Gating Epoch 2, Train Loss: 1.1241
Gating Epoch 3, Train Loss: 1.0881
Gating Epoch 4, Train Loss: 1.1610
Gating Epoch 5, Train Loss: 1.0685
Gating Epoch 6, Train Loss: 1.0695
Gating Epoch 7, Train Loss: 1.0169
Gating Epoch 8, Train Loss: 1.0784
Gating Epoch 9, Train Loss: 1.1131
Gating Epoch 10, Train Loss: 1.1049
Expert 0 Epoch 1, Train Loss: 7.6051
Expert 0 Epoch 2, Train Loss: 8.9445
Expert 0 Epoch 3, Train Loss: 8.4976
Expert 0 Epoch 4, Train Loss: 8.3767
Expert 0 Epoch 5, Train Loss: 8.2155
Expert 0 Epoch 6, Train Loss: 7.9460
Expert 0 Epoch 7, Train Loss: 8.1620
Expert 0 Epoch 8, Train Loss: 8.1974
Expert 1 Epoch 1, Train Loss: 9.4135
Expert 1 Epoch 2, Train Loss: 5.7588
Expert 1 Epoch 3, Train Loss: 5.6523
Expert 1 Epoch 4, Train Loss: 5.5078
Expert 1 Epoch 5, Train Loss: 5.3447
Expert 1 Epoch 6, Train Loss: 5.2133
Expert 1 Epoch 7, Train Loss: 5.0942
Expert 1 Epoch 8, Train Loss: 5.0731
Expert 2 Epoch 1, Train Loss: 8.5985
Expert 2 Epoch 2, Tr

[I 2025-04-15 21:50:37,930] Trial 10 finished with value: -0.2947918891906739 and parameters: {'gating_lr': 9.795984896871263e-05, 'gating_epochs': 10, 'expert_lr': 0.0009987505771563818, 'expert_epochs': 8}. Best is trial 5 with value: -0.28928737640380864.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Gating Epoch 1, Train Loss: 1.3967
Gating Epoch 2, Train Loss: 1.2883
Gating Epoch 3, Train Loss: 1.3922
Gating Epoch 4, Train Loss: 0.9851
Gating Epoch 5, Train Loss: 1.0155
Gating Epoch 6, Train Loss: 1.0420
Gating Epoch 7, Train Loss: 1.2935
Gating Epoch 8, Train Loss: 1.3461
Gating Epoch 9, Train Loss: 1.0469
Gating Epoch 10, Train Loss: 1.1898
Expert 0 Epoch 1, Train Loss: 7.7791
Expert 0 Epoch 2, Train Loss: 8.4497
Expert 0 Epoch 3, Train Loss: 8.6435
Expert 0 Epoch 4, Train Loss: 7.9820
Expert 0 Epoch 5, Train Loss: 8.1726
Expert 0 Epoch 6, Train Loss: 7.8955
Expert 0 Epoch 7, Train Loss: 8.1099
Expert 0 Epoch 8, Train Loss: 7.9911
Expert 0 Epoch 9, Train Loss: 7.8059
Expert 0 Epoch 10, Train Loss: 7.5531
Expert 1 Epoch 1, Train Loss: 9.0302
Expert 1 Epoch 2, Train Loss: 6.7024
Expert 1 Epoch 3, Train Loss: 6.4476
Expert 1 Epoch 4, Train Loss: 6.0891
Expert 1 Epoch 5, Train Loss: 5.9259
Expert 1 Epoch 6, Train Loss: 5.7702
Expert 1 Epoch 7, Train Loss: 5.7009
Expert 1 Epoch 8, T

[I 2025-04-15 21:51:19,459] Trial 11 finished with value: -0.4963113625844319 and parameters: {'gating_lr': 0.000994050459357635, 'gating_epochs': 10, 'expert_lr': 0.0001989565542994661, 'expert_epochs': 10}. Best is trial 5 with value: -0.28928737640380864.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Gating Epoch 1, Train Loss: 1.2488
Gating Epoch 2, Train Loss: 1.2804
Gating Epoch 3, Train Loss: 1.0100
Gating Epoch 4, Train Loss: 0.9873
Gating Epoch 5, Train Loss: 1.0504
Gating Epoch 6, Train Loss: 1.1079
Gating Epoch 7, Train Loss: 1.0846
Gating Epoch 8, Train Loss: 0.9970
Gating Epoch 9, Train Loss: 1.1526
Expert 0 Epoch 1, Train Loss: 7.8261
Expert 0 Epoch 2, Train Loss: 8.5040
Expert 0 Epoch 3, Train Loss: 7.6919
Expert 0 Epoch 4, Train Loss: 7.6483
Expert 0 Epoch 5, Train Loss: 8.2610
Expert 0 Epoch 6, Train Loss: 8.3433
Expert 0 Epoch 7, Train Loss: 7.5100
Expert 0 Epoch 8, Train Loss: 7.7797
Expert 1 Epoch 1, Train Loss: 8.4910
Expert 1 Epoch 2, Train Loss: 5.9717
Expert 1 Epoch 3, Train Loss: 5.5762
Expert 1 Epoch 4, Train Loss: 5.4418
Expert 1 Epoch 5, Train Loss: 5.3413
Expert 1 Epoch 6, Train Loss: 5.2629
Expert 1 Epoch 7, Train Loss: 5.1099
Expert 1 Epoch 8, Train Loss: 5.0861
Expert 2 Epoch 1, Train Loss: 8.8225
Expert 2 Epoch 2, Train Loss: 8.7558
Expert 2 Epoch 3, T

[I 2025-04-15 21:51:54,237] Trial 12 finished with value: -0.4905422369639079 and parameters: {'gating_lr': 0.0009833292875574562, 'gating_epochs': 9, 'expert_lr': 0.0006783403396056237, 'expert_epochs': 8}. Best is trial 5 with value: -0.28928737640380864.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Gating Epoch 1, Train Loss: 1.0576
Gating Epoch 2, Train Loss: 1.1032
Gating Epoch 3, Train Loss: 1.1949
Gating Epoch 4, Train Loss: 1.0198
Gating Epoch 5, Train Loss: 1.0426
Gating Epoch 6, Train Loss: 1.1575
Gating Epoch 7, Train Loss: 1.0976
Gating Epoch 8, Train Loss: 1.0301
Gating Epoch 9, Train Loss: 1.1381
Expert 0 Epoch 1, Train Loss: 8.5423
Expert 0 Epoch 2, Train Loss: 8.4561
Expert 0 Epoch 3, Train Loss: 8.1778
Expert 0 Epoch 4, Train Loss: 8.8518
Expert 0 Epoch 5, Train Loss: 8.2354
Expert 0 Epoch 6, Train Loss: 8.0333
Expert 0 Epoch 7, Train Loss: 8.7001
Expert 0 Epoch 8, Train Loss: 8.1543
Expert 0 Epoch 9, Train Loss: 8.3933
Expert 0 Epoch 10, Train Loss: 8.3865
Expert 1 Epoch 1, Train Loss: 8.2817
Expert 1 Epoch 2, Train Loss: 6.6001
Expert 1 Epoch 3, Train Loss: 6.2879
Expert 1 Epoch 4, Train Loss: 6.0260
Expert 1 Epoch 5, Train Loss: 5.9625
Expert 1 Epoch 6, Train Loss: 5.6018
Expert 1 Epoch 7, Train Loss: 5.6865
Expert 1 Epoch 8, Train Loss: 5.5057
Expert 1 Epoch 9, 

[I 2025-04-15 21:52:35,114] Trial 13 finished with value: -0.49651103019714354 and parameters: {'gating_lr': 0.0001741872896918878, 'gating_epochs': 9, 'expert_lr': 0.0001629267280785093, 'expert_epochs': 10}. Best is trial 5 with value: -0.28928737640380864.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Gating Epoch 1, Train Loss: 1.0607
Gating Epoch 2, Train Loss: 1.0815
Gating Epoch 3, Train Loss: 1.1205
Gating Epoch 4, Train Loss: 1.1136
Gating Epoch 5, Train Loss: 1.0972
Gating Epoch 6, Train Loss: 1.0898
Gating Epoch 7, Train Loss: 1.1257
Gating Epoch 8, Train Loss: 1.0783
Gating Epoch 9, Train Loss: 1.1284
Expert 0 Epoch 1, Train Loss: 8.3226
Expert 0 Epoch 2, Train Loss: 8.6080
Expert 0 Epoch 3, Train Loss: 7.2992
Expert 0 Epoch 4, Train Loss: 7.7756
Expert 0 Epoch 5, Train Loss: 8.4865
Expert 0 Epoch 6, Train Loss: 7.8952
Expert 1 Epoch 1, Train Loss: 8.6131
Expert 1 Epoch 2, Train Loss: 5.9880
Expert 1 Epoch 3, Train Loss: 5.6021
Expert 1 Epoch 4, Train Loss: 5.5244
Expert 1 Epoch 5, Train Loss: 5.3978
Expert 1 Epoch 6, Train Loss: 5.3135
Expert 2 Epoch 1, Train Loss: 9.1075
Expert 2 Epoch 2, Train Loss: 8.8960
Expert 2 Epoch 3, Train Loss: 9.2101
Expert 2 Epoch 4, Train Loss: 9.3131
Expert 2 Epoch 5, Train Loss: 9.1056
Expert 2 Epoch 6, Train Loss: 8.6106


[I 2025-04-15 21:53:03,600] Trial 14 finished with value: -0.49393436113993333 and parameters: {'gating_lr': 0.0006863260181887924, 'gating_epochs': 9, 'expert_lr': 0.0004388065864122191, 'expert_epochs': 6}. Best is trial 5 with value: -0.28928737640380864.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Gating Epoch 1, Train Loss: 1.1278
Gating Epoch 2, Train Loss: 1.0568
Gating Epoch 3, Train Loss: 1.0186
Gating Epoch 4, Train Loss: 1.1151
Gating Epoch 5, Train Loss: 1.0359
Gating Epoch 6, Train Loss: 1.0413
Gating Epoch 7, Train Loss: 1.1252
Gating Epoch 8, Train Loss: 1.1745
Expert 0 Epoch 1, Train Loss: 9.3577
Expert 0 Epoch 2, Train Loss: 7.6722
Expert 0 Epoch 3, Train Loss: 7.7286
Expert 0 Epoch 4, Train Loss: 8.5534
Expert 0 Epoch 5, Train Loss: 7.7976
Expert 0 Epoch 6, Train Loss: 8.6076
Expert 0 Epoch 7, Train Loss: 7.7441
Expert 0 Epoch 8, Train Loss: 8.0094
Expert 0 Epoch 9, Train Loss: 7.9515
Expert 1 Epoch 1, Train Loss: 9.5000
Expert 1 Epoch 2, Train Loss: 8.2520
Expert 1 Epoch 3, Train Loss: 6.4228
Expert 1 Epoch 4, Train Loss: 6.2376
Expert 1 Epoch 5, Train Loss: 5.7721
Expert 1 Epoch 6, Train Loss: 5.6864
Expert 1 Epoch 7, Train Loss: 5.7890
Expert 1 Epoch 8, Train Loss: 5.6950
Expert 1 Epoch 9, Train Loss: 5.7203
Expert 2 Epoch 1, Train Loss: 8.3122
Expert 2 Epoch 2,

[I 2025-04-15 21:53:41,018] Trial 15 finished with value: -0.5010258356730144 and parameters: {'gating_lr': 0.00015337804958992412, 'gating_epochs': 8, 'expert_lr': 9.798445358342019e-05, 'expert_epochs': 9}. Best is trial 5 with value: -0.28928737640380864.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Gating Epoch 1, Train Loss: 1.0588
Gating Epoch 2, Train Loss: 0.9437
Gating Epoch 3, Train Loss: 1.0575
Gating Epoch 4, Train Loss: 1.1194
Gating Epoch 5, Train Loss: 1.0354
Gating Epoch 6, Train Loss: 1.0459
Gating Epoch 7, Train Loss: 1.0716
Gating Epoch 8, Train Loss: 1.1385
Gating Epoch 9, Train Loss: 1.2019
Gating Epoch 10, Train Loss: 1.1667
Expert 0 Epoch 1, Train Loss: 8.4275
Expert 0 Epoch 2, Train Loss: 8.0243
Expert 0 Epoch 3, Train Loss: 7.7832
Expert 0 Epoch 4, Train Loss: 7.9420
Expert 0 Epoch 5, Train Loss: 7.7846
Expert 0 Epoch 6, Train Loss: 8.3761
Expert 0 Epoch 7, Train Loss: 8.4376
Expert 0 Epoch 8, Train Loss: 8.1266
Expert 1 Epoch 1, Train Loss: 9.0074
Expert 1 Epoch 2, Train Loss: 5.7064
Expert 1 Epoch 3, Train Loss: 5.5980
Expert 1 Epoch 4, Train Loss: 5.3785
Expert 1 Epoch 5, Train Loss: 5.3279
Expert 1 Epoch 6, Train Loss: 5.1897
Expert 1 Epoch 7, Train Loss: 5.0970
Expert 1 Epoch 8, Train Loss: 4.9211
Expert 2 Epoch 1, Train Loss: 9.2033
Expert 2 Epoch 2, Tr

[I 2025-04-15 21:54:16,491] Trial 16 finished with value: -0.08947065671284993 and parameters: {'gating_lr': 0.0002121563783901395, 'gating_epochs': 10, 'expert_lr': 0.0008240674624446471, 'expert_epochs': 8}. Best is trial 16 with value: -0.08947065671284993.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Gating Epoch 1, Train Loss: 1.2795
Gating Epoch 2, Train Loss: 1.1124
Gating Epoch 3, Train Loss: 1.0275
Gating Epoch 4, Train Loss: 1.0027
Gating Epoch 5, Train Loss: 1.2186
Gating Epoch 6, Train Loss: 1.2887
Gating Epoch 7, Train Loss: 1.1691
Gating Epoch 8, Train Loss: 1.1721
Gating Epoch 9, Train Loss: 1.2138
Gating Epoch 10, Train Loss: 1.2679
Expert 0 Epoch 1, Train Loss: 8.8489
Expert 0 Epoch 2, Train Loss: 7.8500
Expert 0 Epoch 3, Train Loss: 8.3981
Expert 0 Epoch 4, Train Loss: 8.5941
Expert 0 Epoch 5, Train Loss: 8.1092
Expert 0 Epoch 6, Train Loss: 8.2189
Expert 0 Epoch 7, Train Loss: 7.7611
Expert 0 Epoch 8, Train Loss: 8.7595
Expert 1 Epoch 1, Train Loss: 8.1021
Expert 1 Epoch 2, Train Loss: 5.9502
Expert 1 Epoch 3, Train Loss: 5.5623
Expert 1 Epoch 4, Train Loss: 5.4477
Expert 1 Epoch 5, Train Loss: 5.3773
Expert 1 Epoch 6, Train Loss: 5.1596
Expert 1 Epoch 7, Train Loss: 5.1114
Expert 1 Epoch 8, Train Loss: 5.0824
Expert 2 Epoch 1, Train Loss: 8.7942
Expert 2 Epoch 2, Tr

[W 2025-04-15 21:54:52,807] Trial 17 failed with parameters: {'gating_lr': 8.249007070428141e-05, 'gating_epochs': 10, 'expert_lr': 0.0009826494734997409, 'expert_epochs': 8} because of the following error: RuntimeError('[enforce fail at inline_container.cc:603] . unexpected pos 224245888 vs 224245776').
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/serialization.py", line 850, in save
    _save(
  File "/usr/local/lib/python3.11/dist-packages/torch/serialization.py", line 1114, in _save
    zip_file.write_record(name, storage, num_bytes)
RuntimeError: [enforce fail at inline_container.cc:778] . PytorchStreamWriter failed writing file data/86: file write failed

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/tmp/i

RuntimeError: [enforce fail at inline_container.cc:603] . unexpected pos 224245888 vs 224245776