In [34]:
# import libraries
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import TensorDataset, DataLoader, Subset
from sklearn.model_selection import KFold
import optuna
from datetime import datetime
from optuna.trial import TrialState
import numpy as np

In [35]:
class LayerNormLSTM(nn.Module):
    """LSTM layer with layer normalization applied to gates"""
    def __init__(self, input_node, hidden_node):
        super().__init__()
        self.input_node = input_node
        self.hidden_node = hidden_node

        self.w_i = nn.Linear(input_node, 4 * hidden_node, bias=False)
        self.w_h = nn.Linear(hidden_node, 4 * hidden_node, bias=False)

        self.ln_i = nn.LayerNorm(hidden_node)
        self.ln_h = nn.LayerNorm(hidden_node)
        self.ln_g = nn.LayerNorm(hidden_node)
        self.ln_o = nn.LayerNorm(hidden_node)
        self.ln_c = nn.LayerNorm(hidden_node)

    def forward(self, input, hidden):
        h_prev, c_prev = hidden
        gi = self.w_i(input)
        gh = self.w_h(h_prev)
        i_i, i_f, i_g, i_o = gi.chunk(4, dim=-1)
        h_i, h_f, h_g, h_o = gh.chunk(4, dim=-1)

        i_g = torch.sigmoid(self.ln_i(i_i + h_i))
        f_g = torch.sigmoid(self.ln_h(i_f + h_f))
        g_g = torch.tanh(self.ln_g(i_g + h_g))
        o_g = torch.sigmoid(self.ln_o(i_o + h_o))

        c_new = f_g * c_prev + i_g * g_g
        c_new = self.ln_c(c_new)
        h_new = o_g * torch.tanh(c_new)

        return h_new, c_new

In [36]:
class StateExtr(nn.Module):
    def __init__(self, input_node, hidden_node, n_layer, dropout):
        super().__init__()
        self.hidden_node = hidden_node
        self.n_layer = n_layer
        self.input_node = input_node

        self.lstm_cells = nn.ModuleList()
        self.lstm_cells.append(LayerNormLSTM(input_node, hidden_node))
        for _ in range(n_layer - 1):
            self.lstm_cells.append(LayerNormLSTM(hidden_node, hidden_node))

        self.dropout = nn.Dropout(dropout)
        self.final_layer_norm = nn.LayerNorm(hidden_node)
        self.final_dropout = nn.Dropout(dropout)

    def forward(self, x, seq_len):
        batch_size, max_len, input_node = x.size()
        device = x.device

        h_states = []
        c_states = []
        for _ in range(self.n_layer):
            h_states.append(torch.zeros(batch_size, self.hidden_node, device=device))
            c_states.append(torch.zeros(batch_size, self.hidden_node, device=device))
        
        outputs = []
        for t in range(max_len):
            x_t = x[:, t, :]
            layer_input = x_t
            for layer_idx, lstm_cell in enumerate(self.lstm_cells):
                h_new, c_new = lstm_cell(layer_input, (h_states[layer_idx], c_states[layer_idx]))
                h_states[layer_idx] = h_new
                c_states[layer_idx] = c_new

                if layer_idx < len(self.lstm_cells) - 1:
                    layer_input = self.dropout(h_new)
                else:
                    layer_input = h_new
            outputs.append(layer_input)
        
        output_tensor = torch.stack(outputs, dim=1)
        seq_len_cpu = seq_len.detach().cpu().long()
        mask = torch.arange(max_len, device='cpu')[None, :] < seq_len_cpu[:, None]
        mask = mask.float().to(device).unsqueeze(-1)
        masked_output = output_tensor * mask
        normalized = self.final_layer_norm(masked_output)
        return self.final_dropout(normalized)

In [37]:
class PhysicalChangeDecoder(nn.Module):
    def __init__(self, input_node, output_node, n_layer, hidden_node, dropout):
        super().__init__()
        self.layers = nn.ModuleList()
        
        self.layers.append(nn.Linear(input_node, hidden_node))
        self.layers.append(nn.LayerNorm(hidden_node))
        self.layers.append(nn.ReLU())
        self.layers.append(nn.Dropout(dropout))

        for i in range(n_layer - 1):
            self.layers.append(nn.Linear(hidden_node, hidden_node))
            self.layers.append(nn.LayerNorm(hidden_node))
            self.layers.append(nn.ReLU())
            self.layers.append(nn.Dropout(dropout))

        self.layers.append(nn.Linear(hidden_node, output_node))
    
    def forward(self, hidden_states):
        x = hidden_states
        for layer in self.layers:
            x = layer(x)
        return x

In [38]:
class CurrentPredictor(nn.Module):
    def __init__(self, input_node, hidden_node, n_layer, dropout):
        super().__init__()
        self.layers = nn.ModuleList()
        
        self.layers.append(nn.Linear(input_node, hidden_node))
        self.layers.append(nn.LayerNorm(hidden_node))
        self.layers.append(nn.ReLU())
        self.layers.append(nn.Dropout(dropout))
        
        for i in range(n_layer - 1):
            self.layers.append(nn.Linear(hidden_node, hidden_node))
            self.layers.append(nn.LayerNorm(hidden_node))
            self.layers.append(nn.ReLU())
            self.layers.append(nn.Dropout(dropout))
        
        self.layers.append(nn.Linear(hidden_node, 1))
    
    def forward(self, new_state):
        x = new_state
        for layer in self.layers:
            x = layer(x)
        return x

In [39]:
class PhysicsConstraintLayer(nn.Module):
    def __init__(self, range_mm, current_predictor, eps=1e-2):
        super().__init__()
        self.sps = eps
        self.current_predictor = current_predictor
        self.register_buffer('range_mm_tensor', self._convert_range_to_tensor(range_mm))

    def _convert_range_to_tensor(self, range_mm):
        feature_names = ['V','E','VF','VA','VB','CFLA','CALA','CFK','CBK','I']
        ranges = torch.zeros(len(feature_names),2)
        for i, name in enumerate(feature_names):
            if name in range_mm:
                ranges[i, 0] = range_mm[name]['min']
                ranges[i, 1] = range_mm[name]['max']
        return ranges
    
    def normalize(self, data, feature_idx):
        min_val = self.range_mm_tensor[feature_idx, 0]
        max_val = self.range_mm_tensor[feature_idx, 1]
        return (data - min_val) / (max_val - min_val)

    def denormalize(self, data, feature_idx):
        min_val = self.range_mm_tensor[feature_idx, 0]
        max_val = self.range_mm_tensor[feature_idx, 1]
        return data * (max_val - min_val) + min_val

    def forward(self, physical_changes, current_state):
        V_idx, E_idx, VF_idx, VA_idx, VB_idx = 0, 1, 2, 3, 4
        CFLA_idx, CALA_idx, CFK_idx, CBK_idx, I_idx = 5, 6, 7, 8, 9

        VF = self.denormalize(current_state[..., 2:3], VF_idx)
        VA = self.denormalize(current_state[..., 3:4], VA_idx)
        VB = self.denormalize(current_state[..., 4:5], VB_idx)
        CFLA = self.denormalize(current_state[..., 5:6], CFLA_idx)
        CALA = self.denormalize(current_state[..., 6:7], CALA_idx)
        CFK = self.denormalize(current_state[..., 7:8], CFK_idx)
        CBK = self.denormalize(current_state[..., 8:9], CBK_idx)

        dVA = physical_changes[..., 0:1]
        dVB = physical_changes[..., 1:2]
        rratio = physical_changes[..., 2:3]
        dNBK = physical_changes[..., 3:4]

        ratio = torch.sigmoid(rratio)
        dNALA = ratio * dNBK

        NFLA = CFLA * VF
        NALA = CALA * VA
        NFK = CFK * VF
        NBK = CBK * VB

        nVF = VF - dVA - dVB
        nVA = VA + dVA
        nVB = VB + dVB

        nVF = torch.clamp(nVF, min=self.sps)
        nVA = torch.clamp(nVA, min=self.sps)
        nVB = torch.clamp(nVB, min=self.sps)
        
        nNFLA = NFLA - torch.clamp(dNALA, min=0.0)
        nNALA = NALA + torch.clamp(dNALA, min=0.0)
        nNFK = NFK - torch.clamp(dNBK, min=0.0)
        nNBK = NBK + torch.clamp(dNBK, min=0.0)

        nNFLA = torch.clamp(nNFLA, min=0.0)
        nNALA = torch.clamp(nNALA, min=0.0)
        nNFK = torch.clamp(nNFK, min=0.0)
        nNBK = torch.clamp(nNBK, min=0.0)

        nCFLA = nNFLA / nVF
        nCALA = nNALA / nVA
        nCFK = nNFK / nVF
        nCBK = nNBK / nVB

        V = current_state[..., 0:1]
        E = current_state[..., 1:2]
        nVF_norm = self.normalize(nVF, VF_idx)
        nVA_norm = self.normalize(nVA, VA_idx)
        nVB_norm = self.normalize(nVB, VB_idx)
        nCFLA_norm = self.normalize(nCFLA, CFLA_idx)
        nCALA_norm = self.normalize(nCALA, CALA_idx)
        nCFK_norm = self.normalize(nCFK, CFK_idx)
        nCBK_norm = self.normalize(nCBK, CBK_idx)

        temp_state = torch.cat([
            V, E, nVF_norm, nVA_norm, nVB_norm, nCFLA_norm, nCALA_norm, nCFK_norm, nCBK_norm
        ], dim=-1)
        
        nI_pred_norm = self.current_predictor(temp_state)
        nI_real = self.denormalize(nI_pred_norm, I_idx)
        nI_real = torch.clamp(nI_real, min=0.0)
        nI_norm = self.normalize(nI_real, I_idx)

        next_state = torch.cat([
            V, E, nVF_norm, nVA_norm, nVB_norm, nCFLA_norm, nCALA_norm, nCFK_norm, nCBK_norm, nI_norm
        ], dim=-1)
        
        return next_state

In [40]:
class BMEDAutoregressiveModel(nn.Module):
    def __init__(self, state_extr_params, decoder_params, current_predictor_params, range_mm):
        super().__init__()
        self.state_extr = StateExtr(**state_extr_params)
        self.physical_decoder = PhysicalChangeDecoder(**decoder_params)
        self.current_predictor = CurrentPredictor(**current_predictor_params)
        self.physics_constraint = PhysicsConstraintLayer(range_mm, self.current_predictor)

    def forward(self, x, seq_len):
        hidden_states = self.state_extr(x, seq_len)
        physical_changes = self.physical_decoder(hidden_states)
        new_x = self.physics_constraint(physical_changes, x)
        return new_x

In [41]:
class NoamScheduler:
    def __init__(self, optimizer, model_size, warmup_epochs, factor=1.0):
        self.optimizer = optimizer
        self.model_size = model_size
        self.warmup_epochs = warmup_epochs
        self.factor = factor
        self.epoch_num = 0

    def step_epoch(self):
        self.epoch_num += 1
        lr = self.factor * (
            self.model_size ** (-0.5) *
            min(self.epoch_num ** (-0.5), self.epoch_num * self.warmup_epochs ** (-1.5))
        )
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = lr
        return lr

In [42]:
# 유틸리티 함수들
def df_treat(name):
    df = pd.read_csv(name)
    ndf = pd.DataFrame()
    range_mm={
        'V': {'min':df['V'].min()*0.8, 'max': df['V'].max()*1.2},
        'E': {'min':df['E'].min()*0.8, 'max': df['E'].max()*1.2},
        'VF': {'min':df['VF'].min()*0.8, 'max': df['VF'].max()*1.2},
        'VA': {'min':df['VA'].min()*0.8, 'max': df['VA'].max()*1.2},
        'VB': {'min':df['VB'].min()*0.8, 'max': df['VB'].max()*1.2},
        'CFLA': {'min':0, 'max': df['CFLA'].max()*1.2},
        'CALA': {'min':0, 'max': df['CALA'].max()*1.2},
        'CFK': {'min':0, 'max': df['CFK'].max()*1.2},
        'CBK': {'min':0, 'max': df['CBK'].max()*1.2},
        'I': {'min':0, 'max': df['I'].max()*1.2},
    }
    ndf['exp'] = df['exp']; ndf['t'] = df['t']

    for col in ['V', 'E', 'VF', 'VA', 'VB', 'CFLA', 'CALA', 'CFK', 'CBK', 'I']:
        if col in range_mm:
            ndf[col] = (df[col] - range_mm[col]['min'])/(range_mm[col]['max'] - range_mm[col]['min'])
        else:
            ndf[col] = df[col]

    exp_num_list = sorted(ndf['exp'].unique())
    return df, ndf, range_mm, exp_num_list

def seq_data(ndf, exp_num_list):
    seq = []
    feature_cols = ['V', 'E', 'VF', 'VA', 'VB', 'CFLA', 'CALA', 'CFK', 'CBK', 'I']
    for exp in exp_num_list:
        exp_df = ndf[ndf['exp'] == exp]
        seq.append(exp_df[feature_cols].values)
    return seq

def pad_seq(seq):
    max_len = max([len(s) for s in seq])
    seq_len = [len(s) for s in seq]
    pad_seq = pad_sequence([torch.tensor(s) for s in seq], batch_first=True, padding_value=-1)
    return pad_seq, seq_len, max_len

def gen_dataset(pad_seq, seq_len):
    input_tensor = pad_seq.float()
    seq_len_tensor = torch.tensor(seq_len)
    dataset = TensorDataset(input_tensor, seq_len_tensor)
    return dataset

def masked_mse_loss(pred, target, seq_len):
    batch_size, max_len, features = pred.shape
    seq_len_cpu = seq_len.detach().cpu().long()
    mask = torch.arange(max_len, device='cpu')[None, :] < seq_len_cpu[:, None]
    mask = mask.float().to(pred.device)
    loss = F.mse_loss(pred, target, reduction='none')
    masked_loss = loss * mask.unsqueeze(-1)
    total_loss = masked_loss.sum()
    total_elements = mask.sum()
    masked_loss = total_loss / total_elements
    return masked_loss

def tf_data(input_seq, seq_len):
    inputs = input_seq[:, :-1, :-1]
    targets = input_seq[:, 1:, :]
    target_seq_len = seq_len - 1
    return inputs, targets, target_seq_len

In [43]:
# Optuna 목적 함수
def objective(trial):
    """
    Optuna trial을 위한 목적 함수
    K-fold cross validation을 사용하여 하이퍼파라미터 최적화
    """
    
    # 1. 하이퍼파라미터 제안
    # LSTM StateExtractor 파라미터
    lstm_hidden_size = trial.suggest_categorical('lstm_hidden_size', [16, 32, 48, 64, 72, 96, 128])
    lstm_n_layers = trial.suggest_int('lstm_n_layers', 2, 6, step=1)
    lstm_dropout = trial.suggest_float('lstm_dropout', 0.1, 0.5, step=0.1)
    
    # PhysicalChangeDecoder 파라미터
    decoder_hidden_size = trial.suggest_categorical('decoder_hidden_size', [16, 32, 48, 64, 72, 96, 128])
    decoder_n_layers = trial.suggest_int('decoder_n_layers', 2, 6, step=1)
    decoder_dropout = trial.suggest_float('decoder_dropout', 0.1, 0.6, step=0.1)
    
    # CurrentPredictor 파라미터
    current_hidden_size = trial.suggest_categorical('current_hidden_size', [16, 32, 48, 64, 72, 96, 128])
    current_n_layers = trial.suggest_int('current_n_layers', 2, 6, step=1)
    current_dropout = trial.suggest_float('current_dropout', 0.1, 0.6, step=0.1)
    
    # NoamScheduler 파라미터
    noam_factor = trial.suggest_float('noam_factor', 0.5, 2.0, step=0.1)
    warmup_ratio = trial.suggest_float('warmup_ratio', 0.05, 0.3, step=0.05)
    
    # Batch size 파라미터
    batch_size = trial.suggest_categorical('batch_size', [3, 5, 15])
    
    # 2. K-fold Cross Validation
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    n_splits = 5
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    fold_losses = []
    
    # 데이터 로드 (global 변수 사용)
    indices = list(range(len(dataset)))
    
    for fold, (train_idx, val_idx) in enumerate(kfold.split(indices)):
        print(f"  🔄 Trial {trial.number}, Fold {fold+1}/{n_splits}")
        
        # 폴드별 데이터셋 준비
        train_subset = Subset(dataset, train_idx)
        val_subset = Subset(dataset, val_idx)
        
        train_loader = DataLoader(train_subset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_subset, batch_size=batch_size, shuffle=False)
        
        # 3. 모델 파라미터 설정
        state_extr_params = {
            'input_node': 9,
            'hidden_node': lstm_hidden_size,
            'n_layer': lstm_n_layers,
            'dropout': lstm_dropout
        }
        
        decoder_params = {
            'input_node': lstm_hidden_size,
            'hidden_node': decoder_hidden_size,
            'n_layer': decoder_n_layers,
            'dropout': decoder_dropout,
            'output_node': 4
        }
        
        current_predictor_params = {
            'input_node': 9,
            'hidden_node': current_hidden_size,
            'n_layer': current_n_layers,
            'dropout': current_dropout
        }
        
        # 4. 모델 초기화
        model = BMEDAutoregressiveModel(state_extr_params, decoder_params, current_predictor_params, range_mm)
        model = model.to(device)
        
        # 5. 옵티마이저 및 스케줄러 설정
        optimizer = torch.optim.AdamW(model.parameters(), lr=1.0)
        
        # 총 에포크 수와 warmup 에포크 계산
        total_epochs = 100  # Optuna 최적화를 위해 에포크 수 감소
        warmup_epochs = int(total_epochs * warmup_ratio)
        
        scheduler = NoamScheduler(
            optimizer, 
            model_size=lstm_hidden_size,
            warmup_epochs=warmup_epochs,
            factor=noam_factor
        )
        
        # 6. 훈련
        best_total_loss = float('inf')
        
        for epoch in range(total_epochs):
            # Learning rate 업데이트
            current_lr = scheduler.step_epoch()
            
            # 훈련
            model.train()
            train_loss = 0.0
            train_batches = 0
            
            for input_seq, seq_len in train_loader:
                try:
                    input_seq = input_seq.to(device)
                    seq_len = seq_len.to(device)
                    
                    inputs, targets, target_seq_len = tf_data(input_seq, seq_len)
                    
                    optimizer.zero_grad()
                    pred = model(inputs, target_seq_len)
                    loss = masked_mse_loss(pred, targets, target_seq_len)
                    
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                    optimizer.step()
                    
                    train_loss += loss.item()
                    train_batches += 1
                    
                except Exception as e:
                    continue
            
            if train_batches == 0:
                break
                
            train_loss = train_loss / train_batches
            
            # 검증
            model.eval()
            val_loss = 0.0
            val_batches = 0
            
            with torch.no_grad():
                for input_seq, seq_len in val_loader:
                    try:
                        input_seq = input_seq.to(device)
                        seq_len = seq_len.to(device)
                        
                        inputs, targets, target_seq_len = tf_data(input_seq, seq_len)
                        
                        pred = model(inputs, target_seq_len)
                        loss = masked_mse_loss(pred, targets, target_seq_len)
                        
                        val_loss += loss.item()
                        val_batches += 1
                        
                    except Exception as e:
                        continue
            
            if val_batches == 0:
                break
                
            val_loss = val_loss / val_batches
            
            # Calculate total loss
            total_loss = train_loss + val_loss
            
            # Early stopping
            if total_loss < best_total_loss:
                best_total_loss = total_loss
        
        fold_losses.append(best_total_loss)
        print(f"    Fold {fold+1} best total loss: {best_total_loss:.6f}")
        
        # 메모리 정리
        del model, optimizer, scheduler
        torch.cuda.empty_cache()
    
    # 7. K-fold 평균 손실 반환
    avg_loss = np.mean(fold_losses)
    std_loss = np.std(fold_losses)
    
    print(f"  📊 Trial {trial.number} - Average CV Loss: {avg_loss:.6f} (±{std_loss:.6f})")
    
    return avg_loss

In [44]:
# 메인 최적화 함수
def run_optuna_optimization():
    """Optuna를 사용한 하이퍼파라미터 최적화 실행"""
    
    print("🚀 BMED TF Model Hyperparameter Optimization with Optuna")
    print("="*80)
    
    # 전역 데이터 로드
    global dataset, range_mm
    
    print("📋 데이터 로드 중...")
    df, ndf, range_mm, exp_num_list = df_treat('BMED_DATA_AG.csv')
    seq = seq_data(ndf, exp_num_list)
    pad, seq_len, max_len = pad_seq(seq)
    dataset = gen_dataset(pad, seq_len)
    
    print(f"   - 총 실험 개수: {len(exp_num_list)}")
    print(f"   - 총 데이터 포인트: {len(dataset)}")
    print(f"   - 최대 시퀀스 길이: {max_len}")
    
    # SQLite 데이터베이스를 사용한 Optuna study 생성
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    db_url = f"sqlite:///bmed_optuna_study_{timestamp}.db"
    
    study = optuna.create_study(
        direction='minimize',
        study_name='bmed_tf_optimization',
        sampler=optuna.samplers.TPESampler(seed=42),
        storage=db_url,
        load_if_exists=True
    )
    
    # 최적화 실행
    n_trials = 100
    print(f"🔍 최적화 시작 (총 {n_trials} trials)")
    
    try:
        study.optimize(objective, n_trials=n_trials, timeout=None)
    except KeyboardInterrupt:
        print("\n⚠️ 최적화가 사용자에 의해 중단되었습니다.")
    
    # 결과 분석
    print("\n" + "="*80)
    print("📊 OPTIMIZATION RESULTS")
    print("="*80)
    
    print(f"✅ 완료된 trials: {len(study.trials)}")
    print(f"🏆 최고 성능 trial: {study.best_trial.number}")
    print(f"💯 최고 성능 값: {study.best_value:.6f}")
    
    print(f"\n🎯 최적 하이퍼파라미터:")
    for key, value in study.best_params.items():
        print(f"   {key}: {value}")
    
    # 상위 5개 trial 정보
    print(f"\n📈 상위 5개 Trials:")
    trials_df = study.trials_dataframe().sort_values('value').head(5)
    for idx, (_, trial) in enumerate(trials_df.iterrows()):
        print(f"   {idx+1}. Trial {int(trial['number'])}: {trial['value']:.6f}")
    
    # 결과 저장
    result_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    
    # Trials 결과 CSV로 저장
    trials_file = f"bmed_optuna_trials_{result_timestamp}.csv"
    trials_df = study.trials_dataframe()
    trials_df.to_csv(trials_file, index=False)
    print(f"💾 모든 trials 결과가 저장되었습니다: {trials_file}")
    
    # SQLite 데이터베이스 정보
    print(f"💾 SQLite 데이터베이스에 실시간 저장됨: {db_url}")
    print(f"   - 중단 후 재시작 시 자동으로 기존 결과를 불러옵니다")
    print(f"   - 다른 프로세스에서 진행상황 모니터링 가능합니다")
    
    print("="*80)
    print("🎉 하이퍼파라미터 최적화 완료!")
    
    return study

if __name__ == "__main__":
    study = run_optuna_optimization()

[I 2025-09-08 22:25:47,057] A new study created in RDB with name: bmed_tf_optimization


🚀 BMED TF Model Hyperparameter Optimization with Optuna
📋 데이터 로드 중...
   - 총 실험 개수: 15
   - 총 데이터 포인트: 15
   - 최대 시퀀스 길이: 37
🔍 최적화 시작 (총 100 trials)
  🔄 Trial 0, Fold 1/5
    Fold 1 best total loss: 0.297867
  🔄 Trial 0, Fold 2/5
    Fold 2 best total loss: 0.300687
  🔄 Trial 0, Fold 3/5
    Fold 3 best total loss: 0.025844
  🔄 Trial 0, Fold 4/5
    Fold 4 best total loss: 0.171848
  🔄 Trial 0, Fold 5/5


[I 2025-09-08 22:33:01,680] Trial 0 finished with value: 0.22576341001937789 and parameters: {'lstm_hidden_size': 32, 'lstm_n_layers': 6, 'lstm_dropout': 0.4, 'decoder_hidden_size': 48, 'decoder_n_layers': 3, 'decoder_dropout': 0.4, 'current_hidden_size': 48, 'current_n_layers': 5, 'current_dropout': 0.2, 'noam_factor': 1.3, 'warmup_ratio': 0.2, 'batch_size': 5}. Best is trial 0 with value: 0.22576341001937789.


    Fold 5 best total loss: 0.332571
  📊 Trial 0 - Average CV Loss: 0.225763 (±0.114101)
  🔄 Trial 1, Fold 1/5
    Fold 1 best total loss: 0.008599
  🔄 Trial 1, Fold 2/5
    Fold 2 best total loss: 0.024000
  🔄 Trial 1, Fold 3/5
    Fold 3 best total loss: 0.019576
  🔄 Trial 1, Fold 4/5
    Fold 4 best total loss: 0.012959
  🔄 Trial 1, Fold 5/5


[I 2025-09-08 22:37:56,473] Trial 1 finished with value: 0.016060342127457262 and parameters: {'lstm_hidden_size': 48, 'lstm_n_layers': 4, 'lstm_dropout': 0.1, 'decoder_hidden_size': 48, 'decoder_n_layers': 4, 'decoder_dropout': 0.2, 'current_hidden_size': 16, 'current_n_layers': 2, 'current_dropout': 0.1, 'noam_factor': 1.0, 'warmup_ratio': 0.15000000000000002, 'batch_size': 5}. Best is trial 1 with value: 0.016060342127457262.


    Fold 5 best total loss: 0.015168
  📊 Trial 1 - Average CV Loss: 0.016060 (±0.005320)
  🔄 Trial 2, Fold 1/5
    Fold 1 best total loss: 0.296965
  🔄 Trial 2, Fold 2/5
    Fold 2 best total loss: 0.298020
  🔄 Trial 2, Fold 3/5
    Fold 3 best total loss: 0.200516
  🔄 Trial 2, Fold 4/5
    Fold 4 best total loss: 0.173378
  🔄 Trial 2, Fold 5/5


[I 2025-09-08 22:40:37,943] Trial 2 finished with value: 0.25978163542846844 and parameters: {'lstm_hidden_size': 96, 'lstm_n_layers': 2, 'lstm_dropout': 0.1, 'decoder_hidden_size': 16, 'decoder_n_layers': 6, 'decoder_dropout': 0.4, 'current_hidden_size': 128, 'current_n_layers': 4, 'current_dropout': 0.1, 'noam_factor': 1.6, 'warmup_ratio': 0.25, 'batch_size': 5}. Best is trial 1 with value: 0.016060342127457262.


    Fold 5 best total loss: 0.330029
  📊 Trial 2 - Average CV Loss: 0.259782 (±0.061249)
  🔄 Trial 3, Fold 1/5
    Fold 1 best total loss: 0.324544
  🔄 Trial 3, Fold 2/5
    Fold 2 best total loss: 0.327052
  🔄 Trial 3, Fold 3/5
    Fold 3 best total loss: 0.234401
  🔄 Trial 3, Fold 4/5
    Fold 4 best total loss: 0.213981
  🔄 Trial 3, Fold 5/5


[I 2025-09-08 22:42:38,467] Trial 3 finished with value: 0.2911373309791088 and parameters: {'lstm_hidden_size': 96, 'lstm_n_layers': 4, 'lstm_dropout': 0.5, 'decoder_hidden_size': 48, 'decoder_n_layers': 6, 'decoder_dropout': 0.5, 'current_hidden_size': 72, 'current_n_layers': 6, 'current_dropout': 0.2, 'noam_factor': 0.6, 'warmup_ratio': 0.1, 'batch_size': 15}. Best is trial 1 with value: 0.016060342127457262.


    Fold 5 best total loss: 0.355709
  📊 Trial 3 - Average CV Loss: 0.291137 (±0.056120)
  🔄 Trial 4, Fold 1/5
    Fold 1 best total loss: 0.018738
  🔄 Trial 4, Fold 2/5
    Fold 2 best total loss: 0.028830
  🔄 Trial 4, Fold 3/5
    Fold 3 best total loss: 0.034731
  🔄 Trial 4, Fold 4/5
    Fold 4 best total loss: 0.028867
  🔄 Trial 4, Fold 5/5


[I 2025-09-08 22:47:36,224] Trial 4 finished with value: 0.02819486283697188 and parameters: {'lstm_hidden_size': 128, 'lstm_n_layers': 3, 'lstm_dropout': 0.30000000000000004, 'decoder_hidden_size': 48, 'decoder_n_layers': 3, 'decoder_dropout': 0.1, 'current_hidden_size': 72, 'current_n_layers': 4, 'current_dropout': 0.6, 'noam_factor': 0.8, 'warmup_ratio': 0.25, 'batch_size': 3}. Best is trial 1 with value: 0.016060342127457262.


    Fold 5 best total loss: 0.029808
  📊 Trial 4 - Average CV Loss: 0.028195 (±0.005208)
  🔄 Trial 5, Fold 1/5
    Fold 1 best total loss: 0.106601
  🔄 Trial 5, Fold 2/5
    Fold 2 best total loss: 0.113648
  🔄 Trial 5, Fold 3/5
    Fold 3 best total loss: 0.079170
  🔄 Trial 5, Fold 4/5
    Fold 4 best total loss: 0.092028
  🔄 Trial 5, Fold 5/5


[I 2025-09-08 22:48:38,923] Trial 5 finished with value: 0.10318072065711022 and parameters: {'lstm_hidden_size': 96, 'lstm_n_layers': 2, 'lstm_dropout': 0.1, 'decoder_hidden_size': 32, 'decoder_n_layers': 5, 'decoder_dropout': 0.30000000000000004, 'current_hidden_size': 16, 'current_n_layers': 5, 'current_dropout': 0.5, 'noam_factor': 1.3, 'warmup_ratio': 0.2, 'batch_size': 15}. Best is trial 1 with value: 0.016060342127457262.


    Fold 5 best total loss: 0.124456
  📊 Trial 5 - Average CV Loss: 0.103181 (±0.015967)
  🔄 Trial 6, Fold 1/5
    Fold 1 best total loss: 0.317850
  🔄 Trial 6, Fold 2/5
    Fold 2 best total loss: 0.238262
  🔄 Trial 6, Fold 3/5
    Fold 3 best total loss: 0.224433
  🔄 Trial 6, Fold 4/5
    Fold 4 best total loss: 0.089120
  🔄 Trial 6, Fold 5/5


[I 2025-09-08 22:56:27,142] Trial 6 finished with value: 0.24337829556316137 and parameters: {'lstm_hidden_size': 16, 'lstm_n_layers': 5, 'lstm_dropout': 0.4, 'decoder_hidden_size': 48, 'decoder_n_layers': 2, 'decoder_dropout': 0.1, 'current_hidden_size': 72, 'current_n_layers': 5, 'current_dropout': 0.4, 'noam_factor': 1.8, 'warmup_ratio': 0.2, 'batch_size': 3}. Best is trial 1 with value: 0.016060342127457262.


    Fold 5 best total loss: 0.347226
  📊 Trial 6 - Average CV Loss: 0.243378 (±0.090012)
  🔄 Trial 7, Fold 1/5
    Fold 1 best total loss: 0.324394
  🔄 Trial 7, Fold 2/5
    Fold 2 best total loss: 0.327049
  🔄 Trial 7, Fold 3/5
    Fold 3 best total loss: 0.234337
  🔄 Trial 7, Fold 4/5
    Fold 4 best total loss: 0.213723
  🔄 Trial 7, Fold 5/5


[I 2025-09-08 22:58:28,590] Trial 7 finished with value: 0.2910341702401638 and parameters: {'lstm_hidden_size': 48, 'lstm_n_layers': 4, 'lstm_dropout': 0.30000000000000004, 'decoder_hidden_size': 48, 'decoder_n_layers': 6, 'decoder_dropout': 0.6, 'current_hidden_size': 96, 'current_n_layers': 6, 'current_dropout': 0.2, 'noam_factor': 1.1, 'warmup_ratio': 0.3, 'batch_size': 15}. Best is trial 1 with value: 0.016060342127457262.


    Fold 5 best total loss: 0.355668
  📊 Trial 7 - Average CV Loss: 0.291034 (±0.056177)
  🔄 Trial 8, Fold 1/5
    Fold 1 best total loss: 0.324481
  🔄 Trial 8, Fold 2/5
    Fold 2 best total loss: 0.327135
  🔄 Trial 8, Fold 3/5
    Fold 3 best total loss: 0.234403
  🔄 Trial 8, Fold 4/5
    Fold 4 best total loss: 0.213954
  🔄 Trial 8, Fold 5/5


[I 2025-09-08 23:00:26,155] Trial 8 finished with value: 0.29115027114748954 and parameters: {'lstm_hidden_size': 96, 'lstm_n_layers': 4, 'lstm_dropout': 0.5, 'decoder_hidden_size': 128, 'decoder_n_layers': 6, 'decoder_dropout': 0.6, 'current_hidden_size': 128, 'current_n_layers': 3, 'current_dropout': 0.30000000000000004, 'noam_factor': 0.6, 'warmup_ratio': 0.2, 'batch_size': 15}. Best is trial 1 with value: 0.016060342127457262.


    Fold 5 best total loss: 0.355779
  📊 Trial 8 - Average CV Loss: 0.291150 (±0.056147)
  🔄 Trial 9, Fold 1/5
    Fold 1 best total loss: 0.285348
  🔄 Trial 9, Fold 2/5
    Fold 2 best total loss: 0.325334
  🔄 Trial 9, Fold 3/5
    Fold 3 best total loss: 0.234461
  🔄 Trial 9, Fold 4/5
    Fold 4 best total loss: 8.377290
  🔄 Trial 9, Fold 5/5


[I 2025-09-08 23:02:22,977] Trial 9 finished with value: 1.9152314707636833 and parameters: {'lstm_hidden_size': 72, 'lstm_n_layers': 4, 'lstm_dropout': 0.4, 'decoder_hidden_size': 128, 'decoder_n_layers': 5, 'decoder_dropout': 0.6, 'current_hidden_size': 48, 'current_n_layers': 6, 'current_dropout': 0.6, 'noam_factor': 1.6, 'warmup_ratio': 0.15000000000000002, 'batch_size': 15}. Best is trial 1 with value: 0.016060342127457262.


    Fold 5 best total loss: 0.353725
  📊 Trial 9 - Average CV Loss: 1.915231 (±3.231278)
  🔄 Trial 10, Fold 1/5
    Fold 1 best total loss: 4.594150
  🔄 Trial 10, Fold 2/5
    Fold 2 best total loss: 0.018593
  🔄 Trial 10, Fold 3/5
    Fold 3 best total loss: 0.199501
  🔄 Trial 10, Fold 4/5
    Fold 4 best total loss: 0.013941
  🔄 Trial 10, Fold 5/5


[I 2025-09-08 23:09:26,925] Trial 10 finished with value: 0.968204159848392 and parameters: {'lstm_hidden_size': 48, 'lstm_n_layers': 6, 'lstm_dropout': 0.2, 'decoder_hidden_size': 64, 'decoder_n_layers': 4, 'decoder_dropout': 0.2, 'current_hidden_size': 16, 'current_n_layers': 2, 'current_dropout': 0.1, 'noam_factor': 1.1, 'warmup_ratio': 0.05, 'batch_size': 5}. Best is trial 1 with value: 0.016060342127457262.


    Fold 5 best total loss: 0.014836
  📊 Trial 10 - Average CV Loss: 0.968204 (±1.814369)
  🔄 Trial 11, Fold 1/5
    Fold 1 best total loss: 0.019652
  🔄 Trial 11, Fold 2/5
    Fold 2 best total loss: 0.024776
  🔄 Trial 11, Fold 3/5
    Fold 3 best total loss: 0.027485
  🔄 Trial 11, Fold 4/5
    Fold 4 best total loss: 0.023500
  🔄 Trial 11, Fold 5/5


[I 2025-09-08 23:14:21,089] Trial 11 finished with value: 0.0248192482162267 and parameters: {'lstm_hidden_size': 128, 'lstm_n_layers': 3, 'lstm_dropout': 0.2, 'decoder_hidden_size': 96, 'decoder_n_layers': 3, 'decoder_dropout': 0.1, 'current_hidden_size': 32, 'current_n_layers': 2, 'current_dropout': 0.6, 'noam_factor': 0.8, 'warmup_ratio': 0.3, 'batch_size': 3}. Best is trial 1 with value: 0.016060342127457262.


    Fold 5 best total loss: 0.028683
  📊 Trial 11 - Average CV Loss: 0.024819 (±0.003177)
  🔄 Trial 12, Fold 1/5
    Fold 1 best total loss: 0.009664
  🔄 Trial 12, Fold 2/5
    Fold 2 best total loss: 0.020942
  🔄 Trial 12, Fold 3/5
    Fold 3 best total loss: 0.022193
  🔄 Trial 12, Fold 4/5
    Fold 4 best total loss: 0.023272
  🔄 Trial 12, Fold 5/5


[I 2025-09-08 23:19:14,774] Trial 12 finished with value: 0.018885088921524584 and parameters: {'lstm_hidden_size': 64, 'lstm_n_layers': 3, 'lstm_dropout': 0.2, 'decoder_hidden_size': 96, 'decoder_n_layers': 3, 'decoder_dropout': 0.2, 'current_hidden_size': 32, 'current_n_layers': 2, 'current_dropout': 0.4, 'noam_factor': 0.9, 'warmup_ratio': 0.1, 'batch_size': 3}. Best is trial 1 with value: 0.016060342127457262.


    Fold 5 best total loss: 0.018354
  📊 Trial 12 - Average CV Loss: 0.018885 (±0.004894)
  🔄 Trial 13, Fold 1/5
    Fold 1 best total loss: 0.316381
  🔄 Trial 13, Fold 2/5
    Fold 2 best total loss: 0.021336
  🔄 Trial 13, Fold 3/5
    Fold 3 best total loss: 0.223397
  🔄 Trial 13, Fold 4/5
    Fold 4 best total loss: 0.200010
  🔄 Trial 13, Fold 5/5


[I 2025-09-08 23:24:07,874] Trial 13 finished with value: 0.22150124390609563 and parameters: {'lstm_hidden_size': 64, 'lstm_n_layers': 3, 'lstm_dropout': 0.2, 'decoder_hidden_size': 72, 'decoder_n_layers': 4, 'decoder_dropout': 0.2, 'current_hidden_size': 64, 'current_n_layers': 3, 'current_dropout': 0.4, 'noam_factor': 1.0, 'warmup_ratio': 0.1, 'batch_size': 3}. Best is trial 1 with value: 0.016060342127457262.


    Fold 5 best total loss: 0.346382
  📊 Trial 13 - Average CV Loss: 0.221501 (±0.114131)
  🔄 Trial 14, Fold 1/5
    Fold 1 best total loss: 0.295806
  🔄 Trial 14, Fold 2/5
    Fold 2 best total loss: 0.020641
  🔄 Trial 14, Fold 3/5
    Fold 3 best total loss: 0.205355
  🔄 Trial 14, Fold 4/5
    Fold 4 best total loss: 0.018460
  🔄 Trial 14, Fold 5/5


[I 2025-09-08 23:30:05,443] Trial 14 finished with value: 0.11117777686255674 and parameters: {'lstm_hidden_size': 64, 'lstm_n_layers': 5, 'lstm_dropout': 0.1, 'decoder_hidden_size': 96, 'decoder_n_layers': 2, 'decoder_dropout': 0.30000000000000004, 'current_hidden_size': 32, 'current_n_layers': 2, 'current_dropout': 0.30000000000000004, 'noam_factor': 0.9, 'warmup_ratio': 0.1, 'batch_size': 5}. Best is trial 1 with value: 0.016060342127457262.


    Fold 5 best total loss: 0.015626
  📊 Trial 14 - Average CV Loss: 0.111178 (±0.117372)
  🔄 Trial 15, Fold 1/5
    Fold 1 best total loss: 0.315273
  🔄 Trial 15, Fold 2/5
    Fold 2 best total loss: 0.307139
  🔄 Trial 15, Fold 3/5
    Fold 3 best total loss: 0.033421
  🔄 Trial 15, Fold 4/5
    Fold 4 best total loss: 0.197569
  🔄 Trial 15, Fold 5/5


[I 2025-09-08 23:34:56,164] Trial 15 finished with value: 0.23945311047136783 and parameters: {'lstm_hidden_size': 64, 'lstm_n_layers': 3, 'lstm_dropout': 0.2, 'decoder_hidden_size': 96, 'decoder_n_layers': 4, 'decoder_dropout': 0.2, 'current_hidden_size': 32, 'current_n_layers': 3, 'current_dropout': 0.5, 'noam_factor': 1.4, 'warmup_ratio': 0.05, 'batch_size': 3}. Best is trial 1 with value: 0.016060342127457262.


    Fold 5 best total loss: 0.343862
  📊 Trial 15 - Average CV Loss: 0.239453 (±0.114398)
  🔄 Trial 16, Fold 1/5
    Fold 1 best total loss: 0.020053
  🔄 Trial 16, Fold 2/5
    Fold 2 best total loss: 0.026035
  🔄 Trial 16, Fold 3/5
    Fold 3 best total loss: 0.027117
  🔄 Trial 16, Fold 4/5
    Fold 4 best total loss: 0.021202
  🔄 Trial 16, Fold 5/5


[I 2025-09-08 23:40:50,177] Trial 16 finished with value: 0.023704127377520007 and parameters: {'lstm_hidden_size': 48, 'lstm_n_layers': 5, 'lstm_dropout': 0.1, 'decoder_hidden_size': 16, 'decoder_n_layers': 3, 'decoder_dropout': 0.30000000000000004, 'current_hidden_size': 16, 'current_n_layers': 2, 'current_dropout': 0.4, 'noam_factor': 0.5, 'warmup_ratio': 0.15000000000000002, 'batch_size': 5}. Best is trial 1 with value: 0.016060342127457262.


    Fold 5 best total loss: 0.024115
  📊 Trial 16 - Average CV Loss: 0.023704 (±0.002714)
  🔄 Trial 17, Fold 1/5
    Fold 1 best total loss: 0.295481
  🔄 Trial 17, Fold 2/5
    Fold 2 best total loss: 0.299948
  🔄 Trial 17, Fold 3/5
    Fold 3 best total loss: 0.017680
  🔄 Trial 17, Fold 4/5
    Fold 4 best total loss: 0.011888
  🔄 Trial 17, Fold 5/5


[I 2025-09-08 23:43:28,793] Trial 17 finished with value: 0.19116326981845 and parameters: {'lstm_hidden_size': 16, 'lstm_n_layers': 2, 'lstm_dropout': 0.2, 'decoder_hidden_size': 32, 'decoder_n_layers': 5, 'decoder_dropout': 0.2, 'current_hidden_size': 96, 'current_n_layers': 3, 'current_dropout': 0.30000000000000004, 'noam_factor': 0.8, 'warmup_ratio': 0.15000000000000002, 'batch_size': 5}. Best is trial 1 with value: 0.016060342127457262.


    Fold 5 best total loss: 0.330819
  📊 Trial 17 - Average CV Loss: 0.191163 (±0.144538)
  🔄 Trial 18, Fold 1/5
    Fold 1 best total loss: 0.318050
  🔄 Trial 18, Fold 2/5
    Fold 2 best total loss: 0.024385
  🔄 Trial 18, Fold 3/5
    Fold 3 best total loss: 0.021074
  🔄 Trial 18, Fold 4/5
    Fold 4 best total loss: 0.198803
  🔄 Trial 18, Fold 5/5


[I 2025-09-08 23:48:16,380] Trial 18 finished with value: 0.18204652338754385 and parameters: {'lstm_hidden_size': 32, 'lstm_n_layers': 3, 'lstm_dropout': 0.1, 'decoder_hidden_size': 64, 'decoder_n_layers': 2, 'decoder_dropout': 0.2, 'current_hidden_size': 64, 'current_n_layers': 2, 'current_dropout': 0.5, 'noam_factor': 1.1, 'warmup_ratio': 0.1, 'batch_size': 3}. Best is trial 1 with value: 0.016060342127457262.


    Fold 5 best total loss: 0.347920
  📊 Trial 18 - Average CV Loss: 0.182047 (±0.139328)
  🔄 Trial 19, Fold 1/5
    Fold 1 best total loss: 0.296316
  🔄 Trial 19, Fold 2/5
    Fold 2 best total loss: 0.024961
  🔄 Trial 19, Fold 3/5
    Fold 3 best total loss: 0.030342
  🔄 Trial 19, Fold 4/5
    Fold 4 best total loss: 0.013002
  🔄 Trial 19, Fold 5/5


[I 2025-09-08 23:55:07,490] Trial 19 finished with value: 0.07746099115659794 and parameters: {'lstm_hidden_size': 72, 'lstm_n_layers': 5, 'lstm_dropout': 0.30000000000000004, 'decoder_hidden_size': 72, 'decoder_n_layers': 3, 'decoder_dropout': 0.4, 'current_hidden_size': 16, 'current_n_layers': 4, 'current_dropout': 0.1, 'noam_factor': 0.7, 'warmup_ratio': 0.05, 'batch_size': 5}. Best is trial 1 with value: 0.016060342127457262.


    Fold 5 best total loss: 0.022684
  📊 Trial 19 - Average CV Loss: 0.077461 (±0.109571)
  🔄 Trial 20, Fold 1/5
    Fold 1 best total loss: 0.010686
  🔄 Trial 20, Fold 2/5
    Fold 2 best total loss: 0.021828
  🔄 Trial 20, Fold 3/5
    Fold 3 best total loss: 0.018276
  🔄 Trial 20, Fold 4/5
    Fold 4 best total loss: 0.014285
  🔄 Trial 20, Fold 5/5


[I 2025-09-09 00:00:56,288] Trial 20 finished with value: 0.01600659122923389 and parameters: {'lstm_hidden_size': 48, 'lstm_n_layers': 3, 'lstm_dropout': 0.2, 'decoder_hidden_size': 96, 'decoder_n_layers': 4, 'decoder_dropout': 0.1, 'current_hidden_size': 32, 'current_n_layers': 3, 'current_dropout': 0.2, 'noam_factor': 1.5, 'warmup_ratio': 0.15000000000000002, 'batch_size': 3}. Best is trial 20 with value: 0.01600659122923389.


    Fold 5 best total loss: 0.014958
  📊 Trial 20 - Average CV Loss: 0.016007 (±0.003779)
  🔄 Trial 21, Fold 1/5
    Fold 1 best total loss: 0.012517
  🔄 Trial 21, Fold 2/5
    Fold 2 best total loss: 0.318701
  🔄 Trial 21, Fold 3/5
    Fold 3 best total loss: 0.015580
  🔄 Trial 21, Fold 4/5
    Fold 4 best total loss: 4.714937
  🔄 Trial 21, Fold 5/5


[I 2025-09-09 00:06:41,418] Trial 21 finished with value: 1.966023700265214 and parameters: {'lstm_hidden_size': 48, 'lstm_n_layers': 3, 'lstm_dropout': 0.2, 'decoder_hidden_size': 96, 'decoder_n_layers': 4, 'decoder_dropout': 0.1, 'current_hidden_size': 32, 'current_n_layers': 3, 'current_dropout': 0.2, 'noam_factor': 2.0, 'warmup_ratio': 0.15000000000000002, 'batch_size': 3}. Best is trial 20 with value: 0.01600659122923389.


    Fold 5 best total loss: 4.768384
  📊 Trial 21 - Average CV Loss: 1.966024 (±2.269090)
  🔄 Trial 22, Fold 1/5
    Fold 1 best total loss: 0.009513
  🔄 Trial 22, Fold 2/5
    Fold 2 best total loss: 0.320064
  🔄 Trial 22, Fold 3/5
    Fold 3 best total loss: 0.224359
  🔄 Trial 22, Fold 4/5
    Fold 4 best total loss: 4.699104
  🔄 Trial 22, Fold 5/5


[I 2025-09-09 00:14:19,073] Trial 22 finished with value: 1.1198883177479728 and parameters: {'lstm_hidden_size': 48, 'lstm_n_layers': 4, 'lstm_dropout': 0.2, 'decoder_hidden_size': 96, 'decoder_n_layers': 5, 'decoder_dropout': 0.1, 'current_hidden_size': 32, 'current_n_layers': 2, 'current_dropout': 0.1, 'noam_factor': 1.6, 'warmup_ratio': 0.1, 'batch_size': 3}. Best is trial 20 with value: 0.01600659122923389.


    Fold 5 best total loss: 0.346402
  📊 Trial 22 - Average CV Loss: 1.119888 (±1.793527)
  🔄 Trial 23, Fold 1/5
    Fold 1 best total loss: 0.008635
  🔄 Trial 23, Fold 2/5
    Fold 2 best total loss: 0.020152
  🔄 Trial 23, Fold 3/5
    Fold 3 best total loss: 0.017114
  🔄 Trial 23, Fold 4/5
    Fold 4 best total loss: 0.011654
  🔄 Trial 23, Fold 5/5


[I 2025-09-09 00:18:16,600] Trial 23 finished with value: 0.014941661176271737 and parameters: {'lstm_hidden_size': 64, 'lstm_n_layers': 2, 'lstm_dropout': 0.1, 'decoder_hidden_size': 96, 'decoder_n_layers': 4, 'decoder_dropout': 0.2, 'current_hidden_size': 32, 'current_n_layers': 3, 'current_dropout': 0.2, 'noam_factor': 1.4, 'warmup_ratio': 0.15000000000000002, 'batch_size': 3}. Best is trial 23 with value: 0.014941661176271737.


    Fold 5 best total loss: 0.017154
  📊 Trial 23 - Average CV Loss: 0.014942 (±0.004180)
  🔄 Trial 24, Fold 1/5
    Fold 1 best total loss: 0.013651
  🔄 Trial 24, Fold 2/5
    Fold 2 best total loss: 0.019599
  🔄 Trial 24, Fold 3/5
    Fold 3 best total loss: 0.018254
  🔄 Trial 24, Fold 4/5
    Fold 4 best total loss: 0.014591
  🔄 Trial 24, Fold 5/5


[I 2025-09-09 00:22:21,518] Trial 24 finished with value: 0.01724691520212218 and parameters: {'lstm_hidden_size': 48, 'lstm_n_layers': 2, 'lstm_dropout': 0.1, 'decoder_hidden_size': 96, 'decoder_n_layers': 4, 'decoder_dropout': 0.30000000000000004, 'current_hidden_size': 32, 'current_n_layers': 3, 'current_dropout': 0.2, 'noam_factor': 1.5, 'warmup_ratio': 0.15000000000000002, 'batch_size': 3}. Best is trial 23 with value: 0.014941661176271737.


    Fold 5 best total loss: 0.020139
  📊 Trial 24 - Average CV Loss: 0.017247 (±0.002642)
  🔄 Trial 25, Fold 1/5
    Fold 1 best total loss: 0.009901
  🔄 Trial 25, Fold 2/5
    Fold 2 best total loss: 0.018289
  🔄 Trial 25, Fold 3/5
    Fold 3 best total loss: 0.020910
  🔄 Trial 25, Fold 4/5
    Fold 4 best total loss: 0.013141
  🔄 Trial 25, Fold 5/5


[I 2025-09-09 00:26:26,254] Trial 25 finished with value: 0.015392155782319606 and parameters: {'lstm_hidden_size': 48, 'lstm_n_layers': 2, 'lstm_dropout': 0.1, 'decoder_hidden_size': 96, 'decoder_n_layers': 4, 'decoder_dropout': 0.1, 'current_hidden_size': 16, 'current_n_layers': 3, 'current_dropout': 0.1, 'noam_factor': 1.8, 'warmup_ratio': 0.25, 'batch_size': 3}. Best is trial 23 with value: 0.014941661176271737.


    Fold 5 best total loss: 0.014718
  📊 Trial 25 - Average CV Loss: 0.015392 (±0.003860)
  🔄 Trial 26, Fold 1/5
    Fold 1 best total loss: 0.011043
  🔄 Trial 26, Fold 2/5
    Fold 2 best total loss: 0.020642
  🔄 Trial 26, Fold 3/5
    Fold 3 best total loss: 0.021087
  🔄 Trial 26, Fold 4/5
    Fold 4 best total loss: 0.021018
  🔄 Trial 26, Fold 5/5


[I 2025-09-09 00:30:34,985] Trial 26 finished with value: 0.017632700805552303 and parameters: {'lstm_hidden_size': 64, 'lstm_n_layers': 2, 'lstm_dropout': 0.1, 'decoder_hidden_size': 96, 'decoder_n_layers': 5, 'decoder_dropout': 0.1, 'current_hidden_size': 32, 'current_n_layers': 4, 'current_dropout': 0.2, 'noam_factor': 1.8, 'warmup_ratio': 0.25, 'batch_size': 3}. Best is trial 23 with value: 0.014941661176271737.


    Fold 5 best total loss: 0.014373
  📊 Trial 26 - Average CV Loss: 0.017633 (±0.004159)
  🔄 Trial 27, Fold 1/5
    Fold 1 best total loss: 0.011417
  🔄 Trial 27, Fold 2/5
    Fold 2 best total loss: 0.317965
  🔄 Trial 27, Fold 3/5
    Fold 3 best total loss: 0.020854
  🔄 Trial 27, Fold 4/5
    Fold 4 best total loss: 0.016433
  🔄 Trial 27, Fold 5/5


[I 2025-09-09 00:34:43,813] Trial 27 finished with value: 0.076582099404186 and parameters: {'lstm_hidden_size': 48, 'lstm_n_layers': 2, 'lstm_dropout': 0.1, 'decoder_hidden_size': 96, 'decoder_n_layers': 4, 'decoder_dropout': 0.1, 'current_hidden_size': 64, 'current_n_layers': 3, 'current_dropout': 0.30000000000000004, 'noam_factor': 1.8, 'warmup_ratio': 0.25, 'batch_size': 3}. Best is trial 23 with value: 0.014941661176271737.


    Fold 5 best total loss: 0.016242
  📊 Trial 27 - Average CV Loss: 0.076582 (±0.120728)
  🔄 Trial 28, Fold 1/5
    Fold 1 best total loss: 0.317180
  🔄 Trial 28, Fold 2/5
    Fold 2 best total loss: 4.466065
  🔄 Trial 28, Fold 3/5
    Fold 3 best total loss: 0.226520
  🔄 Trial 28, Fold 4/5
    Fold 4 best total loss: 0.202863
  🔄 Trial 28, Fold 5/5


[I 2025-09-09 00:38:44,912] Trial 28 finished with value: 1.1119010254275055 and parameters: {'lstm_hidden_size': 16, 'lstm_n_layers': 2, 'lstm_dropout': 0.2, 'decoder_hidden_size': 96, 'decoder_n_layers': 5, 'decoder_dropout': 0.1, 'current_hidden_size': 128, 'current_n_layers': 4, 'current_dropout': 0.1, 'noam_factor': 2.0, 'warmup_ratio': 0.2, 'batch_size': 3}. Best is trial 23 with value: 0.014941661176271737.


    Fold 5 best total loss: 0.346878
  📊 Trial 28 - Average CV Loss: 1.111901 (±1.677945)
  🔄 Trial 29, Fold 1/5
    Fold 1 best total loss: 0.009705
  🔄 Trial 29, Fold 2/5
    Fold 2 best total loss: 0.022907
  🔄 Trial 29, Fold 3/5
    Fold 3 best total loss: 0.017617
  🔄 Trial 29, Fold 4/5
    Fold 4 best total loss: 0.010752
  🔄 Trial 29, Fold 5/5


[I 2025-09-09 00:41:58,059] Trial 29 finished with value: 0.014544721913989634 and parameters: {'lstm_hidden_size': 128, 'lstm_n_layers': 2, 'lstm_dropout': 0.30000000000000004, 'decoder_hidden_size': 96, 'decoder_n_layers': 3, 'decoder_dropout': 0.2, 'current_hidden_size': 48, 'current_n_layers': 3, 'current_dropout': 0.2, 'noam_factor': 1.3, 'warmup_ratio': 0.2, 'batch_size': 3}. Best is trial 29 with value: 0.014544721913989634.


    Fold 5 best total loss: 0.011743
  📊 Trial 29 - Average CV Loss: 0.014545 (±0.005001)
  🔄 Trial 30, Fold 1/5
    Fold 1 best total loss: 0.009897
  🔄 Trial 30, Fold 2/5
    Fold 2 best total loss: 0.025612
  🔄 Trial 30, Fold 3/5
    Fold 3 best total loss: 0.025587
  🔄 Trial 30, Fold 4/5
    Fold 4 best total loss: 0.015730
  🔄 Trial 30, Fold 5/5


[I 2025-09-09 00:45:17,043] Trial 30 finished with value: 0.01915533172432333 and parameters: {'lstm_hidden_size': 128, 'lstm_n_layers': 2, 'lstm_dropout': 0.30000000000000004, 'decoder_hidden_size': 16, 'decoder_n_layers': 3, 'decoder_dropout': 0.4, 'current_hidden_size': 48, 'current_n_layers': 5, 'current_dropout': 0.2, 'noam_factor': 1.3, 'warmup_ratio': 0.3, 'batch_size': 3}. Best is trial 29 with value: 0.014544721913989634.


    Fold 5 best total loss: 0.018950
  📊 Trial 30 - Average CV Loss: 0.019155 (±0.006009)
  🔄 Trial 31, Fold 1/5
    Fold 1 best total loss: 0.011590
  🔄 Trial 31, Fold 2/5
    Fold 2 best total loss: 0.016196
  🔄 Trial 31, Fold 3/5
    Fold 3 best total loss: 0.021661
  🔄 Trial 31, Fold 4/5
    Fold 4 best total loss: 0.205474
  🔄 Trial 31, Fold 5/5


[I 2025-09-09 00:48:36,330] Trial 31 finished with value: 0.054561087884940206 and parameters: {'lstm_hidden_size': 32, 'lstm_n_layers': 2, 'lstm_dropout': 0.30000000000000004, 'decoder_hidden_size': 96, 'decoder_n_layers': 3, 'decoder_dropout': 0.2, 'current_hidden_size': 48, 'current_n_layers': 3, 'current_dropout': 0.2, 'noam_factor': 1.4, 'warmup_ratio': 0.2, 'batch_size': 3}. Best is trial 29 with value: 0.014544721913989634.


    Fold 5 best total loss: 0.017884
  📊 Trial 31 - Average CV Loss: 0.054561 (±0.075526)
  🔄 Trial 32, Fold 1/5
    Fold 1 best total loss: 0.008202
  🔄 Trial 32, Fold 2/5
    Fold 2 best total loss: 0.017565
  🔄 Trial 32, Fold 3/5
    Fold 3 best total loss: 0.224468
  🔄 Trial 32, Fold 4/5
    Fold 4 best total loss: 0.011916
  🔄 Trial 32, Fold 5/5


[I 2025-09-09 00:51:54,705] Trial 32 finished with value: 0.05459297182969749 and parameters: {'lstm_hidden_size': 128, 'lstm_n_layers': 2, 'lstm_dropout': 0.4, 'decoder_hidden_size': 96, 'decoder_n_layers': 4, 'decoder_dropout': 0.2, 'current_hidden_size': 48, 'current_n_layers': 3, 'current_dropout': 0.1, 'noam_factor': 1.7000000000000002, 'warmup_ratio': 0.25, 'batch_size': 3}. Best is trial 29 with value: 0.014544721913989634.


    Fold 5 best total loss: 0.010814
  📊 Trial 32 - Average CV Loss: 0.054593 (±0.084993)
  🔄 Trial 33, Fold 1/5
    Fold 1 best total loss: 0.010955
  🔄 Trial 33, Fold 2/5
    Fold 2 best total loss: 0.024247
  🔄 Trial 33, Fold 3/5
    Fold 3 best total loss: 0.021782
  🔄 Trial 33, Fold 4/5
    Fold 4 best total loss: 0.015842
  🔄 Trial 33, Fold 5/5


[I 2025-09-09 00:56:39,678] Trial 33 finished with value: 0.01844671805156395 and parameters: {'lstm_hidden_size': 128, 'lstm_n_layers': 3, 'lstm_dropout': 0.1, 'decoder_hidden_size': 96, 'decoder_n_layers': 4, 'decoder_dropout': 0.1, 'current_hidden_size': 48, 'current_n_layers': 4, 'current_dropout': 0.30000000000000004, 'noam_factor': 1.2000000000000002, 'warmup_ratio': 0.2, 'batch_size': 3}. Best is trial 29 with value: 0.014544721913989634.


    Fold 5 best total loss: 0.019407
  📊 Trial 33 - Average CV Loss: 0.018447 (±0.004661)
  🔄 Trial 34, Fold 1/5
    Fold 1 best total loss: 0.009785
  🔄 Trial 34, Fold 2/5
    Fold 2 best total loss: 0.024053
  🔄 Trial 34, Fold 3/5
    Fold 3 best total loss: 0.017350
  🔄 Trial 34, Fold 4/5
    Fold 4 best total loss: 0.016693
  🔄 Trial 34, Fold 5/5


[I 2025-09-09 00:59:55,087] Trial 34 finished with value: 0.016511898033786564 and parameters: {'lstm_hidden_size': 48, 'lstm_n_layers': 2, 'lstm_dropout': 0.30000000000000004, 'decoder_hidden_size': 72, 'decoder_n_layers': 4, 'decoder_dropout': 0.2, 'current_hidden_size': 16, 'current_n_layers': 3, 'current_dropout': 0.1, 'noam_factor': 1.5, 'warmup_ratio': 0.15000000000000002, 'batch_size': 3}. Best is trial 29 with value: 0.014544721913989634.


    Fold 5 best total loss: 0.014679
  📊 Trial 34 - Average CV Loss: 0.016512 (±0.004609)
  🔄 Trial 35, Fold 1/5
    Fold 1 best total loss: 0.317680
  🔄 Trial 35, Fold 2/5
    Fold 2 best total loss: 0.319469
  🔄 Trial 35, Fold 3/5
    Fold 3 best total loss: 0.225759
  🔄 Trial 35, Fold 4/5
    Fold 4 best total loss: 0.196867
  🔄 Trial 35, Fold 5/5


[I 2025-09-09 01:04:37,284] Trial 35 finished with value: 0.21467932417290286 and parameters: {'lstm_hidden_size': 72, 'lstm_n_layers': 3, 'lstm_dropout': 0.1, 'decoder_hidden_size': 128, 'decoder_n_layers': 3, 'decoder_dropout': 0.30000000000000004, 'current_hidden_size': 96, 'current_n_layers': 4, 'current_dropout': 0.2, 'noam_factor': 1.4, 'warmup_ratio': 0.25, 'batch_size': 3}. Best is trial 29 with value: 0.014544721913989634.


    Fold 5 best total loss: 0.013622
  📊 Trial 35 - Average CV Loss: 0.214679 (±0.111762)
  🔄 Trial 36, Fold 1/5
    Fold 1 best total loss: 0.316824
  🔄 Trial 36, Fold 2/5
    Fold 2 best total loss: 0.319132
  🔄 Trial 36, Fold 3/5
    Fold 3 best total loss: 0.223100
  🔄 Trial 36, Fold 4/5
    Fold 4 best total loss: 0.012668
  🔄 Trial 36, Fold 5/5


[I 2025-09-09 01:07:54,001] Trial 36 finished with value: 0.17666960114147515 and parameters: {'lstm_hidden_size': 128, 'lstm_n_layers': 2, 'lstm_dropout': 0.2, 'decoder_hidden_size': 32, 'decoder_n_layers': 2, 'decoder_dropout': 0.5, 'current_hidden_size': 72, 'current_n_layers': 3, 'current_dropout': 0.2, 'noam_factor': 1.2000000000000002, 'warmup_ratio': 0.2, 'batch_size': 3}. Best is trial 29 with value: 0.014544721913989634.


    Fold 5 best total loss: 0.011624
  📊 Trial 36 - Average CV Loss: 0.176670 (±0.138731)
  🔄 Trial 37, Fold 1/5
    Fold 1 best total loss: 0.011665
  🔄 Trial 37, Fold 2/5
    Fold 2 best total loss: 0.020214
  🔄 Trial 37, Fold 3/5
    Fold 3 best total loss: 0.017485
  🔄 Trial 37, Fold 4/5
    Fold 4 best total loss: 0.016392
  🔄 Trial 37, Fold 5/5


[I 2025-09-09 01:11:16,201] Trial 37 finished with value: 0.01638549685012549 and parameters: {'lstm_hidden_size': 32, 'lstm_n_layers': 2, 'lstm_dropout': 0.4, 'decoder_hidden_size': 64, 'decoder_n_layers': 4, 'decoder_dropout': 0.1, 'current_hidden_size': 16, 'current_n_layers': 3, 'current_dropout': 0.1, 'noam_factor': 1.9000000000000001, 'warmup_ratio': 0.2, 'batch_size': 3}. Best is trial 29 with value: 0.014544721913989634.


    Fold 5 best total loss: 0.016172
  📊 Trial 37 - Average CV Loss: 0.016385 (±0.002764)
  🔄 Trial 38, Fold 1/5
    Fold 1 best total loss: 0.324392
  🔄 Trial 38, Fold 2/5
    Fold 2 best total loss: 0.327046
  🔄 Trial 38, Fold 3/5
    Fold 3 best total loss: 0.234302
  🔄 Trial 38, Fold 4/5
    Fold 4 best total loss: 0.213753
  🔄 Trial 38, Fold 5/5


[I 2025-09-09 01:12:43,096] Trial 38 finished with value: 0.29104147776961325 and parameters: {'lstm_hidden_size': 64, 'lstm_n_layers': 3, 'lstm_dropout': 0.1, 'decoder_hidden_size': 96, 'decoder_n_layers': 5, 'decoder_dropout': 0.1, 'current_hidden_size': 48, 'current_n_layers': 4, 'current_dropout': 0.2, 'noam_factor': 1.5, 'warmup_ratio': 0.15000000000000002, 'batch_size': 15}. Best is trial 29 with value: 0.014544721913989634.


    Fold 5 best total loss: 0.355715
  📊 Trial 38 - Average CV Loss: 0.291041 (±0.056185)
  🔄 Trial 39, Fold 1/5
    Fold 1 best total loss: 0.317246
  🔄 Trial 39, Fold 2/5
    Fold 2 best total loss: 0.319768
  🔄 Trial 39, Fold 3/5
    Fold 3 best total loss: 0.016386
  🔄 Trial 39, Fold 4/5
    Fold 4 best total loss: 0.202172
  🔄 Trial 39, Fold 5/5


[I 2025-09-09 01:16:01,188] Trial 39 finished with value: 0.1740022792830132 and parameters: {'lstm_hidden_size': 96, 'lstm_n_layers': 2, 'lstm_dropout': 0.30000000000000004, 'decoder_hidden_size': 16, 'decoder_n_layers': 3, 'decoder_dropout': 0.2, 'current_hidden_size': 128, 'current_n_layers': 3, 'current_dropout': 0.30000000000000004, 'noam_factor': 1.7000000000000002, 'warmup_ratio': 0.25, 'batch_size': 3}. Best is trial 29 with value: 0.014544721913989634.


    Fold 5 best total loss: 0.014439
  📊 Trial 39 - Average CV Loss: 0.174002 (±0.136282)
  🔄 Trial 40, Fold 1/5
    Fold 1 best total loss: 0.024275
  🔄 Trial 40, Fold 2/5
    Fold 2 best total loss: 0.022371
  🔄 Trial 40, Fold 3/5
    Fold 3 best total loss: 0.031920
  🔄 Trial 40, Fold 4/5
    Fold 4 best total loss: 0.012409
  🔄 Trial 40, Fold 5/5


[I 2025-09-09 01:17:27,007] Trial 40 finished with value: 0.022771335393190383 and parameters: {'lstm_hidden_size': 48, 'lstm_n_layers': 3, 'lstm_dropout': 0.5, 'decoder_hidden_size': 48, 'decoder_n_layers': 4, 'decoder_dropout': 0.30000000000000004, 'current_hidden_size': 32, 'current_n_layers': 5, 'current_dropout': 0.1, 'noam_factor': 1.6, 'warmup_ratio': 0.25, 'batch_size': 15}. Best is trial 29 with value: 0.014544721913989634.


    Fold 5 best total loss: 0.022881
  📊 Trial 40 - Average CV Loss: 0.022771 (±0.006221)
  🔄 Trial 41, Fold 1/5
    Fold 1 best total loss: 0.009447
  🔄 Trial 41, Fold 2/5
    Fold 2 best total loss: 0.021105
  🔄 Trial 41, Fold 3/5
    Fold 3 best total loss: 0.020687
  🔄 Trial 41, Fold 4/5
    Fold 4 best total loss: 0.016289
  🔄 Trial 41, Fold 5/5


[I 2025-09-09 01:22:11,279] Trial 41 finished with value: 0.016806533777465424 and parameters: {'lstm_hidden_size': 48, 'lstm_n_layers': 4, 'lstm_dropout': 0.1, 'decoder_hidden_size': 96, 'decoder_n_layers': 4, 'decoder_dropout': 0.2, 'current_hidden_size': 16, 'current_n_layers': 2, 'current_dropout': 0.1, 'noam_factor': 1.2000000000000002, 'warmup_ratio': 0.15000000000000002, 'batch_size': 5}. Best is trial 29 with value: 0.014544721913989634.


    Fold 5 best total loss: 0.016505
  📊 Trial 41 - Average CV Loss: 0.016807 (±0.004197)
  🔄 Trial 42, Fold 1/5
    Fold 1 best total loss: 0.011384
  🔄 Trial 42, Fold 2/5
    Fold 2 best total loss: 0.019877
  🔄 Trial 42, Fold 3/5
    Fold 3 best total loss: 0.020695
  🔄 Trial 42, Fold 4/5
    Fold 4 best total loss: 0.023083
  🔄 Trial 42, Fold 5/5


[I 2025-09-09 01:26:54,759] Trial 42 finished with value: 0.018482477683573962 and parameters: {'lstm_hidden_size': 48, 'lstm_n_layers': 4, 'lstm_dropout': 0.1, 'decoder_hidden_size': 48, 'decoder_n_layers': 5, 'decoder_dropout': 0.1, 'current_hidden_size': 16, 'current_n_layers': 2, 'current_dropout': 0.2, 'noam_factor': 1.3, 'warmup_ratio': 0.15000000000000002, 'batch_size': 5}. Best is trial 29 with value: 0.014544721913989634.


    Fold 5 best total loss: 0.017373
  📊 Trial 42 - Average CV Loss: 0.018482 (±0.003991)
  🔄 Trial 43, Fold 1/5
    Fold 1 best total loss: 0.295817
  🔄 Trial 43, Fold 2/5
    Fold 2 best total loss: 0.018144
  🔄 Trial 43, Fold 3/5
    Fold 3 best total loss: 0.195076
  🔄 Trial 43, Fold 4/5
    Fold 4 best total loss: 0.174412
  🔄 Trial 43, Fold 5/5


[I 2025-09-09 01:33:48,829] Trial 43 finished with value: 0.139532830628256 and parameters: {'lstm_hidden_size': 48, 'lstm_n_layers': 6, 'lstm_dropout': 0.1, 'decoder_hidden_size': 48, 'decoder_n_layers': 3, 'decoder_dropout': 0.2, 'current_hidden_size': 72, 'current_n_layers': 3, 'current_dropout': 0.1, 'noam_factor': 1.0, 'warmup_ratio': 0.2, 'batch_size': 5}. Best is trial 29 with value: 0.014544721913989634.


    Fold 5 best total loss: 0.014215
  📊 Trial 43 - Average CV Loss: 0.139533 (±0.108781)
  🔄 Trial 44, Fold 1/5
    Fold 1 best total loss: 0.010161
  🔄 Trial 44, Fold 2/5
    Fold 2 best total loss: 0.301134
  🔄 Trial 44, Fold 3/5
    Fold 3 best total loss: 0.018695
  🔄 Trial 44, Fold 4/5
    Fold 4 best total loss: 0.013100
  🔄 Trial 44, Fold 5/5


[I 2025-09-09 01:36:19,001] Trial 44 finished with value: 0.07141417941699425 and parameters: {'lstm_hidden_size': 96, 'lstm_n_layers': 2, 'lstm_dropout': 0.2, 'decoder_hidden_size': 48, 'decoder_n_layers': 4, 'decoder_dropout': 0.5, 'current_hidden_size': 16, 'current_n_layers': 2, 'current_dropout': 0.1, 'noam_factor': 1.7000000000000002, 'warmup_ratio': 0.15000000000000002, 'batch_size': 5}. Best is trial 29 with value: 0.014544721913989634.


    Fold 5 best total loss: 0.013981
  📊 Trial 44 - Average CV Loss: 0.071414 (±0.114893)
  🔄 Trial 45, Fold 1/5
    Fold 1 best total loss: 0.106049
  🔄 Trial 45, Fold 2/5
    Fold 2 best total loss: 0.049768
  🔄 Trial 45, Fold 3/5
    Fold 3 best total loss: 0.234378
  🔄 Trial 45, Fold 4/5
    Fold 4 best total loss: 0.014781
  🔄 Trial 45, Fold 5/5


[I 2025-09-09 01:37:45,831] Trial 45 finished with value: 0.08869002889841796 and parameters: {'lstm_hidden_size': 128, 'lstm_n_layers': 3, 'lstm_dropout': 0.1, 'decoder_hidden_size': 128, 'decoder_n_layers': 4, 'decoder_dropout': 0.30000000000000004, 'current_hidden_size': 16, 'current_n_layers': 3, 'current_dropout': 0.2, 'noam_factor': 1.4, 'warmup_ratio': 0.1, 'batch_size': 15}. Best is trial 29 with value: 0.014544721913989634.


    Fold 5 best total loss: 0.038475
  📊 Trial 45 - Average CV Loss: 0.088690 (±0.078772)
  🔄 Trial 46, Fold 1/5
    Fold 1 best total loss: 0.011830
  🔄 Trial 46, Fold 2/5
    Fold 2 best total loss: 0.026048
  🔄 Trial 46, Fold 3/5
    Fold 3 best total loss: 0.015658
  🔄 Trial 46, Fold 4/5
    Fold 4 best total loss: 0.019634
  🔄 Trial 46, Fold 5/5


[I 2025-09-09 01:41:01,736] Trial 46 finished with value: 0.017703988775610923 and parameters: {'lstm_hidden_size': 48, 'lstm_n_layers': 2, 'lstm_dropout': 0.2, 'decoder_hidden_size': 32, 'decoder_n_layers': 4, 'decoder_dropout': 0.1, 'current_hidden_size': 32, 'current_n_layers': 4, 'current_dropout': 0.2, 'noam_factor': 1.0, 'warmup_ratio': 0.3, 'batch_size': 3}. Best is trial 29 with value: 0.014544721913989634.


    Fold 5 best total loss: 0.015350
  📊 Trial 46 - Average CV Loss: 0.017704 (±0.004849)
  🔄 Trial 47, Fold 1/5
    Fold 1 best total loss: 0.006688
  🔄 Trial 47, Fold 2/5
    Fold 2 best total loss: 0.318914
  🔄 Trial 47, Fold 3/5
    Fold 3 best total loss: 0.224443
  🔄 Trial 47, Fold 4/5
    Fold 4 best total loss: 0.010932
  🔄 Trial 47, Fold 5/5


[I 2025-09-09 01:47:17,300] Trial 47 finished with value: 0.11399698656750842 and parameters: {'lstm_hidden_size': 64, 'lstm_n_layers': 4, 'lstm_dropout': 0.4, 'decoder_hidden_size': 48, 'decoder_n_layers': 6, 'decoder_dropout': 0.2, 'current_hidden_size': 96, 'current_n_layers': 2, 'current_dropout': 0.1, 'noam_factor': 1.1, 'warmup_ratio': 0.2, 'batch_size': 3}. Best is trial 29 with value: 0.014544721913989634.


    Fold 5 best total loss: 0.009008
  📊 Trial 47 - Average CV Loss: 0.113997 (±0.132174)
  🔄 Trial 48, Fold 1/5
    Fold 1 best total loss: 0.295751
  🔄 Trial 48, Fold 2/5
    Fold 2 best total loss: 0.299367
  🔄 Trial 48, Fold 3/5
    Fold 3 best total loss: 0.027372
  🔄 Trial 48, Fold 4/5
    Fold 4 best total loss: 0.171279
  🔄 Trial 48, Fold 5/5


[I 2025-09-09 01:50:54,105] Trial 48 finished with value: 0.22470373790711165 and parameters: {'lstm_hidden_size': 16, 'lstm_n_layers': 3, 'lstm_dropout': 0.1, 'decoder_hidden_size': 96, 'decoder_n_layers': 5, 'decoder_dropout': 0.1, 'current_hidden_size': 48, 'current_n_layers': 4, 'current_dropout': 0.30000000000000004, 'noam_factor': 1.5, 'warmup_ratio': 0.15000000000000002, 'batch_size': 5}. Best is trial 29 with value: 0.014544721913989634.


    Fold 5 best total loss: 0.329750
  📊 Trial 48 - Average CV Loss: 0.224704 (±0.112651)
  🔄 Trial 49, Fold 1/5
    Fold 1 best total loss: 0.315746
  🔄 Trial 49, Fold 2/5
    Fold 2 best total loss: 0.320902
  🔄 Trial 49, Fold 3/5
    Fold 3 best total loss: 0.014109
  🔄 Trial 49, Fold 4/5
    Fold 4 best total loss: 0.199945
  🔄 Trial 49, Fold 5/5


[I 2025-09-09 01:58:28,715] Trial 49 finished with value: 0.2393215410062112 and parameters: {'lstm_hidden_size': 72, 'lstm_n_layers': 5, 'lstm_dropout': 0.30000000000000004, 'decoder_hidden_size': 64, 'decoder_n_layers': 3, 'decoder_dropout': 0.2, 'current_hidden_size': 128, 'current_n_layers': 2, 'current_dropout': 0.2, 'noam_factor': 1.2000000000000002, 'warmup_ratio': 0.2, 'batch_size': 3}. Best is trial 29 with value: 0.014544721913989634.


    Fold 5 best total loss: 0.345905
  📊 Trial 49 - Average CV Loss: 0.239322 (±0.123392)
  🔄 Trial 50, Fold 1/5
    Fold 1 best total loss: 0.029075
  🔄 Trial 50, Fold 2/5
    Fold 2 best total loss: 0.327007
  🔄 Trial 50, Fold 3/5
    Fold 3 best total loss: 0.097670
  🔄 Trial 50, Fold 4/5
    Fold 4 best total loss: 0.023543
  🔄 Trial 50, Fold 5/5


[I 2025-09-09 02:00:20,224] Trial 50 finished with value: 0.16402089316397905 and parameters: {'lstm_hidden_size': 48, 'lstm_n_layers': 4, 'lstm_dropout': 0.2, 'decoder_hidden_size': 96, 'decoder_n_layers': 2, 'decoder_dropout': 0.30000000000000004, 'current_hidden_size': 16, 'current_n_layers': 3, 'current_dropout': 0.1, 'noam_factor': 1.3, 'warmup_ratio': 0.1, 'batch_size': 15}. Best is trial 29 with value: 0.014544721913989634.


    Fold 5 best total loss: 0.342810
  📊 Trial 50 - Average CV Loss: 0.164021 (±0.142040)
  🔄 Trial 51, Fold 1/5
    Fold 1 best total loss: 0.011677
  🔄 Trial 51, Fold 2/5
    Fold 2 best total loss: 0.251614
  🔄 Trial 51, Fold 3/5
    Fold 3 best total loss: 0.025528
  🔄 Trial 51, Fold 4/5
    Fold 4 best total loss: 0.013930
  🔄 Trial 51, Fold 5/5


[I 2025-09-09 02:03:37,607] Trial 51 finished with value: 0.06397097813896835 and parameters: {'lstm_hidden_size': 32, 'lstm_n_layers': 2, 'lstm_dropout': 0.4, 'decoder_hidden_size': 64, 'decoder_n_layers': 4, 'decoder_dropout': 0.1, 'current_hidden_size': 16, 'current_n_layers': 3, 'current_dropout': 0.1, 'noam_factor': 1.9000000000000001, 'warmup_ratio': 0.2, 'batch_size': 3}. Best is trial 29 with value: 0.014544721913989634.


    Fold 5 best total loss: 0.017106
  📊 Trial 51 - Average CV Loss: 0.063971 (±0.093939)
  🔄 Trial 52, Fold 1/5
    Fold 1 best total loss: 0.011403
  🔄 Trial 52, Fold 2/5
    Fold 2 best total loss: 0.023063
  🔄 Trial 52, Fold 3/5
    Fold 3 best total loss: 0.020360
  🔄 Trial 52, Fold 4/5
    Fold 4 best total loss: 0.014357
  🔄 Trial 52, Fold 5/5


[I 2025-09-09 02:06:53,386] Trial 52 finished with value: 0.016862348862923683 and parameters: {'lstm_hidden_size': 32, 'lstm_n_layers': 2, 'lstm_dropout': 0.5, 'decoder_hidden_size': 64, 'decoder_n_layers': 4, 'decoder_dropout': 0.1, 'current_hidden_size': 16, 'current_n_layers': 3, 'current_dropout': 0.1, 'noam_factor': 1.9000000000000001, 'warmup_ratio': 0.2, 'batch_size': 3}. Best is trial 29 with value: 0.014544721913989634.


    Fold 5 best total loss: 0.015128
  📊 Trial 52 - Average CV Loss: 0.016862 (±0.004237)
  🔄 Trial 53, Fold 1/5
    Fold 1 best total loss: 0.012066
  🔄 Trial 53, Fold 2/5
    Fold 2 best total loss: 0.019194
  🔄 Trial 53, Fold 3/5
    Fold 3 best total loss: 0.022622
  🔄 Trial 53, Fold 4/5
    Fold 4 best total loss: 0.014895
  🔄 Trial 53, Fold 5/5


[I 2025-09-09 02:10:06,355] Trial 53 finished with value: 0.017333247419446706 and parameters: {'lstm_hidden_size': 32, 'lstm_n_layers': 2, 'lstm_dropout': 0.4, 'decoder_hidden_size': 64, 'decoder_n_layers': 4, 'decoder_dropout': 0.1, 'current_hidden_size': 16, 'current_n_layers': 3, 'current_dropout': 0.1, 'noam_factor': 1.9000000000000001, 'warmup_ratio': 0.15000000000000002, 'batch_size': 3}. Best is trial 29 with value: 0.014544721913989634.


    Fold 5 best total loss: 0.017890
  📊 Trial 53 - Average CV Loss: 0.017333 (±0.003618)
  🔄 Trial 54, Fold 1/5
    Fold 1 best total loss: 0.012686
  🔄 Trial 54, Fold 2/5
    Fold 2 best total loss: 0.319048
  🔄 Trial 54, Fold 3/5
    Fold 3 best total loss: 0.018692
  🔄 Trial 54, Fold 4/5
    Fold 4 best total loss: 0.013266
  🔄 Trial 54, Fold 5/5


[I 2025-09-09 02:13:19,584] Trial 54 finished with value: 0.07643508678302169 and parameters: {'lstm_hidden_size': 32, 'lstm_n_layers': 2, 'lstm_dropout': 0.4, 'decoder_hidden_size': 64, 'decoder_n_layers': 4, 'decoder_dropout': 0.2, 'current_hidden_size': 32, 'current_n_layers': 3, 'current_dropout': 0.2, 'noam_factor': 1.7000000000000002, 'warmup_ratio': 0.2, 'batch_size': 3}. Best is trial 29 with value: 0.014544721913989634.


    Fold 5 best total loss: 0.018485
  📊 Trial 54 - Average CV Loss: 0.076435 (±0.121332)
  🔄 Trial 55, Fold 1/5
    Fold 1 best total loss: 0.315788
  🔄 Trial 55, Fold 2/5
    Fold 2 best total loss: 0.319090
  🔄 Trial 55, Fold 3/5
    Fold 3 best total loss: 0.016558
  🔄 Trial 55, Fold 4/5
    Fold 4 best total loss: 0.202222
  🔄 Trial 55, Fold 5/5


[I 2025-09-09 02:18:01,894] Trial 55 finished with value: 0.24003451981116086 and parameters: {'lstm_hidden_size': 64, 'lstm_n_layers': 3, 'lstm_dropout': 0.1, 'decoder_hidden_size': 72, 'decoder_n_layers': 5, 'decoder_dropout': 0.1, 'current_hidden_size': 64, 'current_n_layers': 2, 'current_dropout': 0.1, 'noam_factor': 2.0, 'warmup_ratio': 0.15000000000000002, 'batch_size': 3}. Best is trial 29 with value: 0.014544721913989634.


    Fold 5 best total loss: 0.346515
  📊 Trial 55 - Average CV Loss: 0.240035 (±0.122228)
  🔄 Trial 56, Fold 1/5
    Fold 1 best total loss: 0.097056
  🔄 Trial 56, Fold 2/5
    Fold 2 best total loss: 1.085958
  🔄 Trial 56, Fold 3/5
    Fold 3 best total loss: 0.033701
  🔄 Trial 56, Fold 4/5
    Fold 4 best total loss: 0.026497
  🔄 Trial 56, Fold 5/5


[I 2025-09-09 02:20:33,346] Trial 56 finished with value: 0.2719079023227095 and parameters: {'lstm_hidden_size': 128, 'lstm_n_layers': 2, 'lstm_dropout': 0.30000000000000004, 'decoder_hidden_size': 96, 'decoder_n_layers': 4, 'decoder_dropout': 0.2, 'current_hidden_size': 16, 'current_n_layers': 5, 'current_dropout': 0.30000000000000004, 'noam_factor': 1.4, 'warmup_ratio': 0.25, 'batch_size': 5}. Best is trial 29 with value: 0.014544721913989634.


    Fold 5 best total loss: 0.116328
  📊 Trial 56 - Average CV Loss: 0.271908 (±0.408516)
  🔄 Trial 57, Fold 1/5
    Fold 1 best total loss: 0.013738
  🔄 Trial 57, Fold 2/5
    Fold 2 best total loss: 0.151306
  🔄 Trial 57, Fold 3/5
    Fold 3 best total loss: 0.018689
  🔄 Trial 57, Fold 4/5
    Fold 4 best total loss: 0.021470
  🔄 Trial 57, Fold 5/5


[I 2025-09-09 02:25:13,722] Trial 57 finished with value: 0.044436062895692886 and parameters: {'lstm_hidden_size': 48, 'lstm_n_layers': 3, 'lstm_dropout': 0.2, 'decoder_hidden_size': 128, 'decoder_n_layers': 3, 'decoder_dropout': 0.1, 'current_hidden_size': 32, 'current_n_layers': 3, 'current_dropout': 0.2, 'noam_factor': 1.8, 'warmup_ratio': 0.15000000000000002, 'batch_size': 3}. Best is trial 29 with value: 0.014544721913989634.


    Fold 5 best total loss: 0.016978
  📊 Trial 57 - Average CV Loss: 0.044436 (±0.053494)
  🔄 Trial 58, Fold 1/5
    Fold 1 best total loss: 0.007674
  🔄 Trial 58, Fold 2/5
    Fold 2 best total loss: 0.319473
  🔄 Trial 58, Fold 3/5
    Fold 3 best total loss: 0.016766
  🔄 Trial 58, Fold 4/5
    Fold 4 best total loss: 0.196507
  🔄 Trial 58, Fold 5/5


[I 2025-09-09 02:28:30,095] Trial 58 finished with value: 0.17800884788739496 and parameters: {'lstm_hidden_size': 96, 'lstm_n_layers': 2, 'lstm_dropout': 0.4, 'decoder_hidden_size': 16, 'decoder_n_layers': 4, 'decoder_dropout': 0.2, 'current_hidden_size': 72, 'current_n_layers': 4, 'current_dropout': 0.1, 'noam_factor': 0.9, 'warmup_ratio': 0.2, 'batch_size': 3}. Best is trial 29 with value: 0.014544721913989634.


    Fold 5 best total loss: 0.349623
  📊 Trial 58 - Average CV Loss: 0.178009 (±0.144789)
  🔄 Trial 59, Fold 1/5
    Fold 1 best total loss: 0.317557
  🔄 Trial 59, Fold 2/5
    Fold 2 best total loss: 0.319782
  🔄 Trial 59, Fold 3/5
    Fold 3 best total loss: 0.037455
  🔄 Trial 59, Fold 4/5
    Fold 4 best total loss: 0.026245
  🔄 Trial 59, Fold 5/5


[I 2025-09-09 02:33:11,471] Trial 59 finished with value: 0.14721536273136734 and parameters: {'lstm_hidden_size': 32, 'lstm_n_layers': 3, 'lstm_dropout': 0.1, 'decoder_hidden_size': 96, 'decoder_n_layers': 5, 'decoder_dropout': 0.1, 'current_hidden_size': 16, 'current_n_layers': 2, 'current_dropout': 0.4, 'noam_factor': 1.6, 'warmup_ratio': 0.1, 'batch_size': 3}. Best is trial 29 with value: 0.014544721913989634.


    Fold 5 best total loss: 0.035037
  📊 Trial 59 - Average CV Loss: 0.147215 (±0.140043)
  🔄 Trial 60, Fold 1/5
    Fold 1 best total loss: 0.012472
  🔄 Trial 60, Fold 2/5
    Fold 2 best total loss: 0.023078
  🔄 Trial 60, Fold 3/5
    Fold 3 best total loss: 0.028660
  🔄 Trial 60, Fold 4/5
    Fold 4 best total loss: 0.019439
  🔄 Trial 60, Fold 5/5


[I 2025-09-09 02:35:43,509] Trial 60 finished with value: 0.028082722689335548 and parameters: {'lstm_hidden_size': 64, 'lstm_n_layers': 2, 'lstm_dropout': 0.2, 'decoder_hidden_size': 48, 'decoder_n_layers': 4, 'decoder_dropout': 0.30000000000000004, 'current_hidden_size': 32, 'current_n_layers': 6, 'current_dropout': 0.2, 'noam_factor': 1.1, 'warmup_ratio': 0.15000000000000002, 'batch_size': 5}. Best is trial 29 with value: 0.014544721913989634.


    Fold 5 best total loss: 0.056765
  📊 Trial 60 - Average CV Loss: 0.028083 (±0.015274)
  🔄 Trial 61, Fold 1/5
    Fold 1 best total loss: 0.008838
  🔄 Trial 61, Fold 2/5
    Fold 2 best total loss: 3.431604
  🔄 Trial 61, Fold 3/5
    Fold 3 best total loss: 0.025349
  🔄 Trial 61, Fold 4/5
    Fold 4 best total loss: 0.013844
  🔄 Trial 61, Fold 5/5


[I 2025-09-09 02:38:58,957] Trial 61 finished with value: 0.7652774012065493 and parameters: {'lstm_hidden_size': 48, 'lstm_n_layers': 2, 'lstm_dropout': 0.30000000000000004, 'decoder_hidden_size': 72, 'decoder_n_layers': 4, 'decoder_dropout': 0.2, 'current_hidden_size': 16, 'current_n_layers': 3, 'current_dropout': 0.1, 'noam_factor': 1.5, 'warmup_ratio': 0.15000000000000002, 'batch_size': 3}. Best is trial 29 with value: 0.014544721913989634.


    Fold 5 best total loss: 0.346751
  📊 Trial 61 - Average CV Loss: 0.765277 (±1.339314)
  🔄 Trial 62, Fold 1/5
    Fold 1 best total loss: 0.011778
  🔄 Trial 62, Fold 2/5
    Fold 2 best total loss: 0.022478
  🔄 Trial 62, Fold 3/5
    Fold 3 best total loss: 0.019859
  🔄 Trial 62, Fold 4/5
    Fold 4 best total loss: 0.015159
  🔄 Trial 62, Fold 5/5


[I 2025-09-09 02:42:09,466] Trial 62 finished with value: 0.01664476814912632 and parameters: {'lstm_hidden_size': 48, 'lstm_n_layers': 2, 'lstm_dropout': 0.30000000000000004, 'decoder_hidden_size': 72, 'decoder_n_layers': 4, 'decoder_dropout': 0.2, 'current_hidden_size': 16, 'current_n_layers': 3, 'current_dropout': 0.1, 'noam_factor': 1.5, 'warmup_ratio': 0.15000000000000002, 'batch_size': 3}. Best is trial 29 with value: 0.014544721913989634.


    Fold 5 best total loss: 0.013949
  📊 Trial 62 - Average CV Loss: 0.016645 (±0.003938)
  🔄 Trial 63, Fold 1/5
    Fold 1 best total loss: 0.009547
  🔄 Trial 63, Fold 2/5
    Fold 2 best total loss: 0.023330
  🔄 Trial 63, Fold 3/5
    Fold 3 best total loss: 0.021438
  🔄 Trial 63, Fold 4/5
    Fold 4 best total loss: 0.012238
  🔄 Trial 63, Fold 5/5


[I 2025-09-09 02:45:21,459] Trial 63 finished with value: 0.016526608949061484 and parameters: {'lstm_hidden_size': 48, 'lstm_n_layers': 2, 'lstm_dropout': 0.30000000000000004, 'decoder_hidden_size': 96, 'decoder_n_layers': 4, 'decoder_dropout': 0.2, 'current_hidden_size': 16, 'current_n_layers': 3, 'current_dropout': 0.1, 'noam_factor': 1.4, 'warmup_ratio': 0.15000000000000002, 'batch_size': 3}. Best is trial 29 with value: 0.014544721913989634.


    Fold 5 best total loss: 0.016080
  📊 Trial 63 - Average CV Loss: 0.016527 (±0.005248)
  🔄 Trial 64, Fold 1/5
    Fold 1 best total loss: 0.037022
  🔄 Trial 64, Fold 2/5
    Fold 2 best total loss: 0.111102
  🔄 Trial 64, Fold 3/5
    Fold 3 best total loss: 0.042885
  🔄 Trial 64, Fold 4/5
    Fold 4 best total loss: 0.037175
  🔄 Trial 64, Fold 5/5


[I 2025-09-09 02:52:51,001] Trial 64 finished with value: 0.05576459188014269 and parameters: {'lstm_hidden_size': 48, 'lstm_n_layers': 5, 'lstm_dropout': 0.30000000000000004, 'decoder_hidden_size': 72, 'decoder_n_layers': 4, 'decoder_dropout': 0.1, 'current_hidden_size': 16, 'current_n_layers': 3, 'current_dropout': 0.5, 'noam_factor': 1.6, 'warmup_ratio': 0.2, 'batch_size': 3}. Best is trial 29 with value: 0.014544721913989634.


    Fold 5 best total loss: 0.050638
  📊 Trial 64 - Average CV Loss: 0.055765 (±0.028111)
  🔄 Trial 65, Fold 1/5
    Fold 1 best total loss: 0.316736
  🔄 Trial 65, Fold 2/5
    Fold 2 best total loss: 0.318861
  🔄 Trial 65, Fold 3/5
    Fold 3 best total loss: 0.228676
  🔄 Trial 65, Fold 4/5
    Fold 4 best total loss: 4.616171
  🔄 Trial 65, Fold 5/5


[I 2025-09-09 02:56:05,136] Trial 65 finished with value: 1.1652799908071756 and parameters: {'lstm_hidden_size': 16, 'lstm_n_layers': 2, 'lstm_dropout': 0.1, 'decoder_hidden_size': 72, 'decoder_n_layers': 3, 'decoder_dropout': 0.4, 'current_hidden_size': 48, 'current_n_layers': 3, 'current_dropout': 0.1, 'noam_factor': 1.9000000000000001, 'warmup_ratio': 0.1, 'batch_size': 3}. Best is trial 29 with value: 0.014544721913989634.


    Fold 5 best total loss: 0.345955
  📊 Trial 65 - Average CV Loss: 1.165280 (±1.725898)
  🔄 Trial 66, Fold 1/5
    Fold 1 best total loss: 0.316021
  🔄 Trial 66, Fold 2/5
    Fold 2 best total loss: 0.320511
  🔄 Trial 66, Fold 3/5
    Fold 3 best total loss: 0.226116
  🔄 Trial 66, Fold 4/5
    Fold 4 best total loss: 0.012122
  🔄 Trial 66, Fold 5/5


[I 2025-09-09 02:59:18,350] Trial 66 finished with value: 0.2446511778049171 and parameters: {'lstm_hidden_size': 48, 'lstm_n_layers': 2, 'lstm_dropout': 0.2, 'decoder_hidden_size': 32, 'decoder_n_layers': 4, 'decoder_dropout': 0.2, 'current_hidden_size': 96, 'current_n_layers': 4, 'current_dropout': 0.2, 'noam_factor': 1.3, 'warmup_ratio': 0.3, 'batch_size': 3}. Best is trial 29 with value: 0.014544721913989634.


    Fold 5 best total loss: 0.348487
  📊 Trial 66 - Average CV Loss: 0.244651 (±0.123324)
  🔄 Trial 67, Fold 1/5
    Fold 1 best total loss: 0.006632
  🔄 Trial 67, Fold 2/5
    Fold 2 best total loss: 0.319142
  🔄 Trial 67, Fold 3/5
    Fold 3 best total loss: 0.017142
  🔄 Trial 67, Fold 4/5
    Fold 4 best total loss: 0.011263
  🔄 Trial 67, Fold 5/5


[I 2025-09-09 03:02:33,420] Trial 67 finished with value: 0.1404793354740832 and parameters: {'lstm_hidden_size': 128, 'lstm_n_layers': 2, 'lstm_dropout': 0.4, 'decoder_hidden_size': 96, 'decoder_n_layers': 3, 'decoder_dropout': 0.30000000000000004, 'current_hidden_size': 64, 'current_n_layers': 3, 'current_dropout': 0.1, 'noam_factor': 0.7, 'warmup_ratio': 0.15000000000000002, 'batch_size': 3}. Best is trial 29 with value: 0.014544721913989634.


    Fold 5 best total loss: 0.348218
  📊 Trial 67 - Average CV Loss: 0.140479 (±0.158050)
  🔄 Trial 68, Fold 1/5
    Fold 1 best total loss: 0.011583
  🔄 Trial 68, Fold 2/5
    Fold 2 best total loss: 0.018945
  🔄 Trial 68, Fold 3/5
    Fold 3 best total loss: 0.017878
  🔄 Trial 68, Fold 4/5
    Fold 4 best total loss: 0.013893
  🔄 Trial 68, Fold 5/5


[I 2025-09-09 03:05:44,832] Trial 68 finished with value: 0.015597015246748924 and parameters: {'lstm_hidden_size': 72, 'lstm_n_layers': 2, 'lstm_dropout': 0.30000000000000004, 'decoder_hidden_size': 64, 'decoder_n_layers': 4, 'decoder_dropout': 0.1, 'current_hidden_size': 32, 'current_n_layers': 2, 'current_dropout': 0.30000000000000004, 'noam_factor': 1.8, 'warmup_ratio': 0.2, 'batch_size': 3}. Best is trial 29 with value: 0.014544721913989634.


    Fold 5 best total loss: 0.015687
  📊 Trial 68 - Average CV Loss: 0.015597 (±0.002662)
  🔄 Trial 69, Fold 1/5
    Fold 1 best total loss: 0.010649
  🔄 Trial 69, Fold 2/5
    Fold 2 best total loss: 0.015521
  🔄 Trial 69, Fold 3/5
    Fold 3 best total loss: 0.019662
  🔄 Trial 69, Fold 4/5
    Fold 4 best total loss: 0.012871
  🔄 Trial 69, Fold 5/5


[I 2025-09-09 03:10:26,141] Trial 69 finished with value: 0.014644752640742808 and parameters: {'lstm_hidden_size': 72, 'lstm_n_layers': 3, 'lstm_dropout': 0.1, 'decoder_hidden_size': 64, 'decoder_n_layers': 5, 'decoder_dropout': 0.1, 'current_hidden_size': 32, 'current_n_layers': 2, 'current_dropout': 0.30000000000000004, 'noam_factor': 1.8, 'warmup_ratio': 0.25, 'batch_size': 3}. Best is trial 29 with value: 0.014544721913989634.


    Fold 5 best total loss: 0.014521
  📊 Trial 69 - Average CV Loss: 0.014645 (±0.003002)
  🔄 Trial 70, Fold 1/5
    Fold 1 best total loss: 0.011769
  🔄 Trial 70, Fold 2/5
    Fold 2 best total loss: 0.022132
  🔄 Trial 70, Fold 3/5
    Fold 3 best total loss: 0.020392
  🔄 Trial 70, Fold 4/5
    Fold 4 best total loss: 0.017399
  🔄 Trial 70, Fold 5/5


[I 2025-09-09 03:11:53,515] Trial 70 finished with value: 0.02004068745300174 and parameters: {'lstm_hidden_size': 72, 'lstm_n_layers': 3, 'lstm_dropout': 0.1, 'decoder_hidden_size': 64, 'decoder_n_layers': 6, 'decoder_dropout': 0.1, 'current_hidden_size': 32, 'current_n_layers': 2, 'current_dropout': 0.30000000000000004, 'noam_factor': 1.7000000000000002, 'warmup_ratio': 0.25, 'batch_size': 15}. Best is trial 29 with value: 0.014544721913989634.


    Fold 5 best total loss: 0.028512
  📊 Trial 70 - Average CV Loss: 0.020041 (±0.005507)
  🔄 Trial 71, Fold 1/5
    Fold 1 best total loss: 0.012444
  🔄 Trial 71, Fold 2/5
    Fold 2 best total loss: 0.020026
  🔄 Trial 71, Fold 3/5
    Fold 3 best total loss: 0.021450
  🔄 Trial 71, Fold 4/5
    Fold 4 best total loss: 0.017221
  🔄 Trial 71, Fold 5/5


[I 2025-09-09 03:17:57,572] Trial 71 finished with value: 0.018248679162934423 and parameters: {'lstm_hidden_size': 72, 'lstm_n_layers': 4, 'lstm_dropout': 0.1, 'decoder_hidden_size': 64, 'decoder_n_layers': 5, 'decoder_dropout': 0.1, 'current_hidden_size': 32, 'current_n_layers': 2, 'current_dropout': 0.4, 'noam_factor': 1.8, 'warmup_ratio': 0.25, 'batch_size': 3}. Best is trial 29 with value: 0.014544721913989634.


    Fold 5 best total loss: 0.020102
  📊 Trial 71 - Average CV Loss: 0.018249 (±0.003212)
  🔄 Trial 72, Fold 1/5
    Fold 1 best total loss: 0.010958
  🔄 Trial 72, Fold 2/5
    Fold 2 best total loss: 0.023348
  🔄 Trial 72, Fold 3/5
    Fold 3 best total loss: 0.021831
  🔄 Trial 72, Fold 4/5
    Fold 4 best total loss: 0.014574
  🔄 Trial 72, Fold 5/5


[I 2025-09-09 03:22:40,845] Trial 72 finished with value: 0.017296925699338318 and parameters: {'lstm_hidden_size': 72, 'lstm_n_layers': 3, 'lstm_dropout': 0.1, 'decoder_hidden_size': 64, 'decoder_n_layers': 4, 'decoder_dropout': 0.1, 'current_hidden_size': 32, 'current_n_layers': 2, 'current_dropout': 0.30000000000000004, 'noam_factor': 1.9000000000000001, 'warmup_ratio': 0.2, 'batch_size': 3}. Best is trial 29 with value: 0.014544721913989634.


    Fold 5 best total loss: 0.015774
  📊 Trial 72 - Average CV Loss: 0.017297 (±0.004628)
  🔄 Trial 73, Fold 1/5
    Fold 1 best total loss: 0.011613
  🔄 Trial 73, Fold 2/5
    Fold 2 best total loss: 0.024038
  🔄 Trial 73, Fold 3/5
    Fold 3 best total loss: 0.019527
  🔄 Trial 73, Fold 4/5
    Fold 4 best total loss: 0.016075
  🔄 Trial 73, Fold 5/5


[I 2025-09-09 03:25:52,034] Trial 73 finished with value: 0.017842931649647654 and parameters: {'lstm_hidden_size': 72, 'lstm_n_layers': 2, 'lstm_dropout': 0.5, 'decoder_hidden_size': 64, 'decoder_n_layers': 5, 'decoder_dropout': 0.1, 'current_hidden_size': 32, 'current_n_layers': 2, 'current_dropout': 0.30000000000000004, 'noam_factor': 2.0, 'warmup_ratio': 0.25, 'batch_size': 3}. Best is trial 29 with value: 0.014544721913989634.


    Fold 5 best total loss: 0.017962
  📊 Trial 73 - Average CV Loss: 0.017843 (±0.004079)
  🔄 Trial 74, Fold 1/5
    Fold 1 best total loss: 0.012090
  🔄 Trial 74, Fold 2/5
    Fold 2 best total loss: 0.020233
  🔄 Trial 74, Fold 3/5
    Fold 3 best total loss: 0.019006
  🔄 Trial 74, Fold 4/5
    Fold 4 best total loss: 0.019372
  🔄 Trial 74, Fold 5/5


[I 2025-09-09 03:29:04,338] Trial 74 finished with value: 0.01696825264953077 and parameters: {'lstm_hidden_size': 72, 'lstm_n_layers': 2, 'lstm_dropout': 0.1, 'decoder_hidden_size': 64, 'decoder_n_layers': 4, 'decoder_dropout': 0.1, 'current_hidden_size': 32, 'current_n_layers': 2, 'current_dropout': 0.30000000000000004, 'noam_factor': 1.8, 'warmup_ratio': 0.2, 'batch_size': 3}. Best is trial 29 with value: 0.014544721913989634.


    Fold 5 best total loss: 0.014142
  📊 Trial 74 - Average CV Loss: 0.016968 (±0.003236)
  🔄 Trial 75, Fold 1/5
    Fold 1 best total loss: 0.011813
  🔄 Trial 75, Fold 2/5
    Fold 2 best total loss: 0.018540
  🔄 Trial 75, Fold 3/5
    Fold 3 best total loss: 0.019044
  🔄 Trial 75, Fold 4/5
    Fold 4 best total loss: 0.013955
  🔄 Trial 75, Fold 5/5


[I 2025-09-09 03:38:11,514] Trial 75 finished with value: 0.0160360521171242 and parameters: {'lstm_hidden_size': 72, 'lstm_n_layers': 6, 'lstm_dropout': 0.1, 'decoder_hidden_size': 64, 'decoder_n_layers': 5, 'decoder_dropout': 0.1, 'current_hidden_size': 32, 'current_n_layers': 2, 'current_dropout': 0.30000000000000004, 'noam_factor': 1.6, 'warmup_ratio': 0.25, 'batch_size': 3}. Best is trial 29 with value: 0.014544721913989634.


    Fold 5 best total loss: 0.016829
  📊 Trial 75 - Average CV Loss: 0.016036 (±0.002761)
  🔄 Trial 76, Fold 1/5
    Fold 1 best total loss: 0.010088
  🔄 Trial 76, Fold 2/5
    Fold 2 best total loss: 0.021638
  🔄 Trial 76, Fold 3/5
    Fold 3 best total loss: 0.019308
  🔄 Trial 76, Fold 4/5
    Fold 4 best total loss: 0.013779
  🔄 Trial 76, Fold 5/5


[I 2025-09-09 03:45:07,091] Trial 76 finished with value: 0.016072901431471106 and parameters: {'lstm_hidden_size': 72, 'lstm_n_layers': 6, 'lstm_dropout': 0.1, 'decoder_hidden_size': 96, 'decoder_n_layers': 6, 'decoder_dropout': 0.1, 'current_hidden_size': 32, 'current_n_layers': 2, 'current_dropout': 0.30000000000000004, 'noam_factor': 1.6, 'warmup_ratio': 0.3, 'batch_size': 5}. Best is trial 29 with value: 0.014544721913989634.


    Fold 5 best total loss: 0.015553
  📊 Trial 76 - Average CV Loss: 0.016073 (±0.004069)
  🔄 Trial 77, Fold 1/5
    Fold 1 best total loss: 0.011837
  🔄 Trial 77, Fold 2/5
    Fold 2 best total loss: 0.020180
  🔄 Trial 77, Fold 3/5
    Fold 3 best total loss: 0.020484
  🔄 Trial 77, Fold 4/5
    Fold 4 best total loss: 0.017813
  🔄 Trial 77, Fold 5/5


[I 2025-09-09 03:54:08,149] Trial 77 finished with value: 0.018592908792197705 and parameters: {'lstm_hidden_size': 72, 'lstm_n_layers': 6, 'lstm_dropout': 0.1, 'decoder_hidden_size': 64, 'decoder_n_layers': 5, 'decoder_dropout': 0.1, 'current_hidden_size': 32, 'current_n_layers': 2, 'current_dropout': 0.4, 'noam_factor': 1.6, 'warmup_ratio': 0.25, 'batch_size': 3}. Best is trial 29 with value: 0.014544721913989634.


    Fold 5 best total loss: 0.022652
  📊 Trial 77 - Average CV Loss: 0.018593 (±0.003710)
  🔄 Trial 78, Fold 1/5
    Fold 1 best total loss: 0.010897
  🔄 Trial 78, Fold 2/5
    Fold 2 best total loss: 0.022702
  🔄 Trial 78, Fold 3/5
    Fold 3 best total loss: 0.019391
  🔄 Trial 78, Fold 4/5
    Fold 4 best total loss: 0.014349
  🔄 Trial 78, Fold 5/5


[I 2025-09-09 04:00:16,144] Trial 78 finished with value: 0.016150994366034864 and parameters: {'lstm_hidden_size': 72, 'lstm_n_layers': 4, 'lstm_dropout': 0.1, 'decoder_hidden_size': 96, 'decoder_n_layers': 5, 'decoder_dropout': 0.1, 'current_hidden_size': 32, 'current_n_layers': 2, 'current_dropout': 0.30000000000000004, 'noam_factor': 1.7000000000000002, 'warmup_ratio': 0.25, 'batch_size': 3}. Best is trial 29 with value: 0.014544721913989634.


    Fold 5 best total loss: 0.013416
  📊 Trial 78 - Average CV Loss: 0.016151 (±0.004284)
  🔄 Trial 79, Fold 1/5
    Fold 1 best total loss: 0.013100
  🔄 Trial 79, Fold 2/5
    Fold 2 best total loss: 0.022548
  🔄 Trial 79, Fold 3/5
    Fold 3 best total loss: 0.023022
  🔄 Trial 79, Fold 4/5
    Fold 4 best total loss: 0.016289
  🔄 Trial 79, Fold 5/5


[I 2025-09-09 04:07:54,082] Trial 79 finished with value: 0.019161436357535422 and parameters: {'lstm_hidden_size': 72, 'lstm_n_layers': 5, 'lstm_dropout': 0.1, 'decoder_hidden_size': 48, 'decoder_n_layers': 5, 'decoder_dropout': 0.1, 'current_hidden_size': 32, 'current_n_layers': 2, 'current_dropout': 0.4, 'noam_factor': 1.5, 'warmup_ratio': 0.25, 'batch_size': 3}. Best is trial 29 with value: 0.014544721913989634.


    Fold 5 best total loss: 0.020848
  📊 Trial 79 - Average CV Loss: 0.019161 (±0.003853)
  🔄 Trial 80, Fold 1/5
    Fold 1 best total loss: 0.010310
  🔄 Trial 80, Fold 2/5
    Fold 2 best total loss: 0.018577
  🔄 Trial 80, Fold 3/5
    Fold 3 best total loss: 0.016855
  🔄 Trial 80, Fold 4/5
    Fold 4 best total loss: 0.011445
  🔄 Trial 80, Fold 5/5


[I 2025-09-09 04:13:36,534] Trial 80 finished with value: 0.014023609009260932 and parameters: {'lstm_hidden_size': 64, 'lstm_n_layers': 5, 'lstm_dropout': 0.1, 'decoder_hidden_size': 16, 'decoder_n_layers': 2, 'decoder_dropout': 0.2, 'current_hidden_size': 48, 'current_n_layers': 2, 'current_dropout': 0.30000000000000004, 'noam_factor': 1.7000000000000002, 'warmup_ratio': 0.3, 'batch_size': 5}. Best is trial 80 with value: 0.014023609009260932.


    Fold 5 best total loss: 0.012931
  📊 Trial 80 - Average CV Loss: 0.014024 (±0.003175)
  🔄 Trial 81, Fold 1/5
    Fold 1 best total loss: 0.007457
  🔄 Trial 81, Fold 2/5
    Fold 2 best total loss: 0.018366
  🔄 Trial 81, Fold 3/5
    Fold 3 best total loss: 0.018377
  🔄 Trial 81, Fold 4/5
    Fold 4 best total loss: 0.014620
  🔄 Trial 81, Fold 5/5


[I 2025-09-09 04:19:24,329] Trial 81 finished with value: 0.014301489340141416 and parameters: {'lstm_hidden_size': 64, 'lstm_n_layers': 5, 'lstm_dropout': 0.1, 'decoder_hidden_size': 16, 'decoder_n_layers': 2, 'decoder_dropout': 0.2, 'current_hidden_size': 48, 'current_n_layers': 2, 'current_dropout': 0.30000000000000004, 'noam_factor': 1.7000000000000002, 'warmup_ratio': 0.3, 'batch_size': 5}. Best is trial 80 with value: 0.014023609009260932.


    Fold 5 best total loss: 0.012688
  📊 Trial 81 - Average CV Loss: 0.014301 (±0.004067)
  🔄 Trial 82, Fold 1/5
    Fold 1 best total loss: 0.009409
  🔄 Trial 82, Fold 2/5
    Fold 2 best total loss: 0.017745
  🔄 Trial 82, Fold 3/5
    Fold 3 best total loss: 0.015711
  🔄 Trial 82, Fold 4/5
    Fold 4 best total loss: 0.012144
  🔄 Trial 82, Fold 5/5


[I 2025-09-09 04:26:18,245] Trial 82 finished with value: 0.014344193010280528 and parameters: {'lstm_hidden_size': 64, 'lstm_n_layers': 6, 'lstm_dropout': 0.1, 'decoder_hidden_size': 16, 'decoder_n_layers': 2, 'decoder_dropout': 0.2, 'current_hidden_size': 48, 'current_n_layers': 2, 'current_dropout': 0.30000000000000004, 'noam_factor': 1.8, 'warmup_ratio': 0.3, 'batch_size': 5}. Best is trial 80 with value: 0.014023609009260932.


    Fold 5 best total loss: 0.016711
  📊 Trial 82 - Average CV Loss: 0.014344 (±0.003106)
  🔄 Trial 83, Fold 1/5
    Fold 1 best total loss: 0.007984
  🔄 Trial 83, Fold 2/5
    Fold 2 best total loss: 0.018726
  🔄 Trial 83, Fold 3/5
    Fold 3 best total loss: 0.020358
  🔄 Trial 83, Fold 4/5
    Fold 4 best total loss: 0.010762
  🔄 Trial 83, Fold 5/5


[I 2025-09-09 04:32:05,702] Trial 83 finished with value: 0.014171965249503652 and parameters: {'lstm_hidden_size': 64, 'lstm_n_layers': 5, 'lstm_dropout': 0.1, 'decoder_hidden_size': 16, 'decoder_n_layers': 2, 'decoder_dropout': 0.2, 'current_hidden_size': 48, 'current_n_layers': 2, 'current_dropout': 0.30000000000000004, 'noam_factor': 1.8, 'warmup_ratio': 0.3, 'batch_size': 5}. Best is trial 80 with value: 0.014023609009260932.


    Fold 5 best total loss: 0.013030
  📊 Trial 83 - Average CV Loss: 0.014172 (±0.004695)
  🔄 Trial 84, Fold 1/5
    Fold 1 best total loss: 0.009396
  🔄 Trial 84, Fold 2/5
    Fold 2 best total loss: 0.022538
  🔄 Trial 84, Fold 3/5
    Fold 3 best total loss: 0.018365
  🔄 Trial 84, Fold 4/5
    Fold 4 best total loss: 0.014815
  🔄 Trial 84, Fold 5/5


[I 2025-09-09 04:38:59,909] Trial 84 finished with value: 0.016108598994712037 and parameters: {'lstm_hidden_size': 64, 'lstm_n_layers': 6, 'lstm_dropout': 0.1, 'decoder_hidden_size': 16, 'decoder_n_layers': 2, 'decoder_dropout': 0.2, 'current_hidden_size': 48, 'current_n_layers': 2, 'current_dropout': 0.30000000000000004, 'noam_factor': 1.8, 'warmup_ratio': 0.3, 'batch_size': 5}. Best is trial 80 with value: 0.014023609009260932.


    Fold 5 best total loss: 0.015428
  📊 Trial 84 - Average CV Loss: 0.016109 (±0.004327)
  🔄 Trial 85, Fold 1/5
    Fold 1 best total loss: 0.011972
  🔄 Trial 85, Fold 2/5
    Fold 2 best total loss: 0.017106
  🔄 Trial 85, Fold 3/5
    Fold 3 best total loss: 0.018301
  🔄 Trial 85, Fold 4/5
    Fold 4 best total loss: 0.013052
  🔄 Trial 85, Fold 5/5


[I 2025-09-09 04:44:45,994] Trial 85 finished with value: 0.014239849631364149 and parameters: {'lstm_hidden_size': 64, 'lstm_n_layers': 5, 'lstm_dropout': 0.1, 'decoder_hidden_size': 16, 'decoder_n_layers': 2, 'decoder_dropout': 0.2, 'current_hidden_size': 48, 'current_n_layers': 2, 'current_dropout': 0.30000000000000004, 'noam_factor': 1.7000000000000002, 'warmup_ratio': 0.3, 'batch_size': 5}. Best is trial 80 with value: 0.014023609009260932.


    Fold 5 best total loss: 0.010768
  📊 Trial 85 - Average CV Loss: 0.014240 (±0.002943)
  🔄 Trial 86, Fold 1/5
    Fold 1 best total loss: 0.010580
  🔄 Trial 86, Fold 2/5
    Fold 2 best total loss: 0.021563
  🔄 Trial 86, Fold 3/5
    Fold 3 best total loss: 0.021057
  🔄 Trial 86, Fold 4/5
    Fold 4 best total loss: 0.012486
  🔄 Trial 86, Fold 5/5


[I 2025-09-09 04:50:33,279] Trial 86 finished with value: 0.016176906041800977 and parameters: {'lstm_hidden_size': 64, 'lstm_n_layers': 5, 'lstm_dropout': 0.1, 'decoder_hidden_size': 16, 'decoder_n_layers': 2, 'decoder_dropout': 0.2, 'current_hidden_size': 48, 'current_n_layers': 2, 'current_dropout': 0.30000000000000004, 'noam_factor': 1.7000000000000002, 'warmup_ratio': 0.3, 'batch_size': 5}. Best is trial 80 with value: 0.014023609009260932.


    Fold 5 best total loss: 0.015198
  📊 Trial 86 - Average CV Loss: 0.016177 (±0.004444)
  🔄 Trial 87, Fold 1/5
    Fold 1 best total loss: 0.009277
  🔄 Trial 87, Fold 2/5
    Fold 2 best total loss: 0.020438
  🔄 Trial 87, Fold 3/5
    Fold 3 best total loss: 0.017987
  🔄 Trial 87, Fold 4/5
    Fold 4 best total loss: 0.011742
  🔄 Trial 87, Fold 5/5


[I 2025-09-09 04:56:21,364] Trial 87 finished with value: 0.0142298663345476 and parameters: {'lstm_hidden_size': 64, 'lstm_n_layers': 5, 'lstm_dropout': 0.1, 'decoder_hidden_size': 16, 'decoder_n_layers': 2, 'decoder_dropout': 0.2, 'current_hidden_size': 48, 'current_n_layers': 2, 'current_dropout': 0.30000000000000004, 'noam_factor': 1.7000000000000002, 'warmup_ratio': 0.3, 'batch_size': 5}. Best is trial 80 with value: 0.014023609009260932.


    Fold 5 best total loss: 0.011705
  📊 Trial 87 - Average CV Loss: 0.014230 (±0.004237)
  🔄 Trial 88, Fold 1/5
    Fold 1 best total loss: 0.008783
  🔄 Trial 88, Fold 2/5
    Fold 2 best total loss: 0.019053
  🔄 Trial 88, Fold 3/5
    Fold 3 best total loss: 0.016970
  🔄 Trial 88, Fold 4/5
    Fold 4 best total loss: 0.016455
  🔄 Trial 88, Fold 5/5


[I 2025-09-09 05:02:07,915] Trial 88 finished with value: 0.01540397279895842 and parameters: {'lstm_hidden_size': 64, 'lstm_n_layers': 5, 'lstm_dropout': 0.1, 'decoder_hidden_size': 16, 'decoder_n_layers': 2, 'decoder_dropout': 0.2, 'current_hidden_size': 48, 'current_n_layers': 2, 'current_dropout': 0.30000000000000004, 'noam_factor': 1.7000000000000002, 'warmup_ratio': 0.3, 'batch_size': 5}. Best is trial 80 with value: 0.014023609009260932.


    Fold 5 best total loss: 0.015759
  📊 Trial 88 - Average CV Loss: 0.015404 (±0.003488)
  🔄 Trial 89, Fold 1/5
    Fold 1 best total loss: 0.007793
  🔄 Trial 89, Fold 2/5
    Fold 2 best total loss: 0.024521
  🔄 Trial 89, Fold 3/5
    Fold 3 best total loss: 0.021135
  🔄 Trial 89, Fold 4/5
    Fold 4 best total loss: 0.014275
  🔄 Trial 89, Fold 5/5


[I 2025-09-09 05:07:55,870] Trial 89 finished with value: 0.015661131000767152 and parameters: {'lstm_hidden_size': 64, 'lstm_n_layers': 5, 'lstm_dropout': 0.1, 'decoder_hidden_size': 16, 'decoder_n_layers': 2, 'decoder_dropout': 0.2, 'current_hidden_size': 48, 'current_n_layers': 2, 'current_dropout': 0.30000000000000004, 'noam_factor': 1.7000000000000002, 'warmup_ratio': 0.3, 'batch_size': 5}. Best is trial 80 with value: 0.014023609009260932.


    Fold 5 best total loss: 0.010582
  📊 Trial 89 - Average CV Loss: 0.015661 (±0.006294)
  🔄 Trial 90, Fold 1/5
    Fold 1 best total loss: 0.011988
  🔄 Trial 90, Fold 2/5
    Fold 2 best total loss: 0.020300
  🔄 Trial 90, Fold 3/5
    Fold 3 best total loss: 0.018954
  🔄 Trial 90, Fold 4/5
    Fold 4 best total loss: 0.016563
  🔄 Trial 90, Fold 5/5


[I 2025-09-09 05:13:39,677] Trial 90 finished with value: 0.017419634324808912 and parameters: {'lstm_hidden_size': 64, 'lstm_n_layers': 5, 'lstm_dropout': 0.1, 'decoder_hidden_size': 16, 'decoder_n_layers': 2, 'decoder_dropout': 0.2, 'current_hidden_size': 48, 'current_n_layers': 2, 'current_dropout': 0.4, 'noam_factor': 1.8, 'warmup_ratio': 0.3, 'batch_size': 5}. Best is trial 80 with value: 0.014023609009260932.


    Fold 5 best total loss: 0.019293
  📊 Trial 90 - Average CV Loss: 0.017420 (±0.002980)
  🔄 Trial 91, Fold 1/5
    Fold 1 best total loss: 0.010879
  🔄 Trial 91, Fold 2/5
    Fold 2 best total loss: 0.016934
  🔄 Trial 91, Fold 3/5
    Fold 3 best total loss: 0.019402
  🔄 Trial 91, Fold 4/5
    Fold 4 best total loss: 0.011500
  🔄 Trial 91, Fold 5/5


[I 2025-09-09 05:19:26,057] Trial 91 finished with value: 0.014285211606572073 and parameters: {'lstm_hidden_size': 64, 'lstm_n_layers': 5, 'lstm_dropout': 0.1, 'decoder_hidden_size': 16, 'decoder_n_layers': 2, 'decoder_dropout': 0.2, 'current_hidden_size': 48, 'current_n_layers': 2, 'current_dropout': 0.30000000000000004, 'noam_factor': 1.9000000000000001, 'warmup_ratio': 0.3, 'batch_size': 5}. Best is trial 80 with value: 0.014023609009260932.


    Fold 5 best total loss: 0.012711
  📊 Trial 91 - Average CV Loss: 0.014285 (±0.003318)
  🔄 Trial 92, Fold 1/5
    Fold 1 best total loss: 0.010963
  🔄 Trial 92, Fold 2/5
    Fold 2 best total loss: 0.017683
  🔄 Trial 92, Fold 3/5
    Fold 3 best total loss: 0.017490
  🔄 Trial 92, Fold 4/5
    Fold 4 best total loss: 0.013393
  🔄 Trial 92, Fold 5/5


[I 2025-09-09 05:25:08,163] Trial 92 finished with value: 0.015016302916531759 and parameters: {'lstm_hidden_size': 64, 'lstm_n_layers': 5, 'lstm_dropout': 0.1, 'decoder_hidden_size': 16, 'decoder_n_layers': 2, 'decoder_dropout': 0.2, 'current_hidden_size': 48, 'current_n_layers': 2, 'current_dropout': 0.30000000000000004, 'noam_factor': 1.9000000000000001, 'warmup_ratio': 0.3, 'batch_size': 5}. Best is trial 80 with value: 0.014023609009260932.


    Fold 5 best total loss: 0.015553
  📊 Trial 92 - Average CV Loss: 0.015016 (±0.002553)
  🔄 Trial 93, Fold 1/5
    Fold 1 best total loss: 0.009703
  🔄 Trial 93, Fold 2/5
    Fold 2 best total loss: 0.020439
  🔄 Trial 93, Fold 3/5
    Fold 3 best total loss: 0.198981
  🔄 Trial 93, Fold 4/5
    Fold 4 best total loss: 0.011872
  🔄 Trial 93, Fold 5/5


[I 2025-09-09 05:30:58,466] Trial 93 finished with value: 0.051088450476527214 and parameters: {'lstm_hidden_size': 64, 'lstm_n_layers': 5, 'lstm_dropout': 0.1, 'decoder_hidden_size': 16, 'decoder_n_layers': 2, 'decoder_dropout': 0.2, 'current_hidden_size': 48, 'current_n_layers': 2, 'current_dropout': 0.30000000000000004, 'noam_factor': 1.8, 'warmup_ratio': 0.3, 'batch_size': 5}. Best is trial 80 with value: 0.014023609009260932.


    Fold 5 best total loss: 0.014447
  📊 Trial 93 - Average CV Loss: 0.051088 (±0.074034)
  🔄 Trial 94, Fold 1/5
    Fold 1 best total loss: 0.009253
  🔄 Trial 94, Fold 2/5
    Fold 2 best total loss: 0.018843
  🔄 Trial 94, Fold 3/5
    Fold 3 best total loss: 0.019429
  🔄 Trial 94, Fold 4/5
    Fold 4 best total loss: 0.010406
  🔄 Trial 94, Fold 5/5


[I 2025-09-09 05:36:43,517] Trial 94 finished with value: 0.014625430029506484 and parameters: {'lstm_hidden_size': 64, 'lstm_n_layers': 5, 'lstm_dropout': 0.1, 'decoder_hidden_size': 16, 'decoder_n_layers': 2, 'decoder_dropout': 0.2, 'current_hidden_size': 48, 'current_n_layers': 2, 'current_dropout': 0.30000000000000004, 'noam_factor': 1.7000000000000002, 'warmup_ratio': 0.3, 'batch_size': 5}. Best is trial 80 with value: 0.014023609009260932.


    Fold 5 best total loss: 0.015196
  📊 Trial 94 - Average CV Loss: 0.014625 (±0.004192)
  🔄 Trial 95, Fold 1/5
    Fold 1 best total loss: 0.011035
  🔄 Trial 95, Fold 2/5
    Fold 2 best total loss: 0.018483
  🔄 Trial 95, Fold 3/5
    Fold 3 best total loss: 0.017997
  🔄 Trial 95, Fold 4/5
    Fold 4 best total loss: 0.018318
  🔄 Trial 95, Fold 5/5


[I 2025-09-09 05:42:30,803] Trial 95 finished with value: 0.016819270110378662 and parameters: {'lstm_hidden_size': 64, 'lstm_n_layers': 5, 'lstm_dropout': 0.1, 'decoder_hidden_size': 16, 'decoder_n_layers': 2, 'decoder_dropout': 0.30000000000000004, 'current_hidden_size': 48, 'current_n_layers': 2, 'current_dropout': 0.30000000000000004, 'noam_factor': 1.7000000000000002, 'warmup_ratio': 0.3, 'batch_size': 5}. Best is trial 80 with value: 0.014023609009260932.


    Fold 5 best total loss: 0.018264
  📊 Trial 95 - Average CV Loss: 0.016819 (±0.002897)
  🔄 Trial 96, Fold 1/5
    Fold 1 best total loss: 0.008101
  🔄 Trial 96, Fold 2/5
    Fold 2 best total loss: 0.020049
  🔄 Trial 96, Fold 3/5
    Fold 3 best total loss: 0.019603
  🔄 Trial 96, Fold 4/5
    Fold 4 best total loss: 0.012286
  🔄 Trial 96, Fold 5/5


[I 2025-09-09 05:48:16,366] Trial 96 finished with value: 0.014084919836993018 and parameters: {'lstm_hidden_size': 64, 'lstm_n_layers': 5, 'lstm_dropout': 0.1, 'decoder_hidden_size': 16, 'decoder_n_layers': 2, 'decoder_dropout': 0.2, 'current_hidden_size': 48, 'current_n_layers': 2, 'current_dropout': 0.30000000000000004, 'noam_factor': 1.8, 'warmup_ratio': 0.3, 'batch_size': 5}. Best is trial 80 with value: 0.014023609009260932.


    Fold 5 best total loss: 0.010385
  📊 Trial 96 - Average CV Loss: 0.014085 (±0.004873)
  🔄 Trial 97, Fold 1/5
    Fold 1 best total loss: 0.010911
  🔄 Trial 97, Fold 2/5
    Fold 2 best total loss: 0.016297
  🔄 Trial 97, Fold 3/5
    Fold 3 best total loss: 0.018637
  🔄 Trial 97, Fold 4/5
    Fold 4 best total loss: 4.363938
  🔄 Trial 97, Fold 5/5


[I 2025-09-09 05:54:00,597] Trial 97 finished with value: 0.8840188292476038 and parameters: {'lstm_hidden_size': 64, 'lstm_n_layers': 5, 'lstm_dropout': 0.1, 'decoder_hidden_size': 16, 'decoder_n_layers': 2, 'decoder_dropout': 0.2, 'current_hidden_size': 48, 'current_n_layers': 2, 'current_dropout': 0.30000000000000004, 'noam_factor': 2.0, 'warmup_ratio': 0.3, 'batch_size': 5}. Best is trial 80 with value: 0.014023609009260932.


    Fold 5 best total loss: 0.010311
  📊 Trial 97 - Average CV Loss: 0.884019 (±1.739963)
  🔄 Trial 98, Fold 1/5
    Fold 1 best total loss: 0.011798
  🔄 Trial 98, Fold 2/5
    Fold 2 best total loss: 0.019961
  🔄 Trial 98, Fold 3/5
    Fold 3 best total loss: 0.021190
  🔄 Trial 98, Fold 4/5
    Fold 4 best total loss: 0.019500
  🔄 Trial 98, Fold 5/5


[I 2025-09-09 05:59:47,227] Trial 98 finished with value: 0.0179111762282749 and parameters: {'lstm_hidden_size': 64, 'lstm_n_layers': 5, 'lstm_dropout': 0.1, 'decoder_hidden_size': 16, 'decoder_n_layers': 2, 'decoder_dropout': 0.2, 'current_hidden_size': 48, 'current_n_layers': 2, 'current_dropout': 0.4, 'noam_factor': 1.7000000000000002, 'warmup_ratio': 0.3, 'batch_size': 5}. Best is trial 80 with value: 0.014023609009260932.


    Fold 5 best total loss: 0.017108
  📊 Trial 98 - Average CV Loss: 0.017911 (±0.003332)
  🔄 Trial 99, Fold 1/5
    Fold 1 best total loss: 0.010135
  🔄 Trial 99, Fold 2/5
    Fold 2 best total loss: 0.299125
  🔄 Trial 99, Fold 3/5
    Fold 3 best total loss: 0.017179
  🔄 Trial 99, Fold 4/5
    Fold 4 best total loss: 0.012743
  🔄 Trial 99, Fold 5/5


[I 2025-09-09 06:05:30,154] Trial 99 finished with value: 0.07125218901783228 and parameters: {'lstm_hidden_size': 64, 'lstm_n_layers': 5, 'lstm_dropout': 0.1, 'decoder_hidden_size': 16, 'decoder_n_layers': 2, 'decoder_dropout': 0.2, 'current_hidden_size': 48, 'current_n_layers': 2, 'current_dropout': 0.30000000000000004, 'noam_factor': 1.9000000000000001, 'warmup_ratio': 0.3, 'batch_size': 5}. Best is trial 80 with value: 0.014023609009260932.


    Fold 5 best total loss: 0.017079
  📊 Trial 99 - Average CV Loss: 0.071252 (±0.113968)

📊 OPTIMIZATION RESULTS
✅ 완료된 trials: 100
🏆 최고 성능 trial: 80
💯 최고 성능 값: 0.014024

🎯 최적 하이퍼파라미터:
   lstm_hidden_size: 64
   lstm_n_layers: 5
   lstm_dropout: 0.1
   decoder_hidden_size: 16
   decoder_n_layers: 2
   decoder_dropout: 0.2
   current_hidden_size: 48
   current_n_layers: 2
   current_dropout: 0.30000000000000004
   noam_factor: 1.7000000000000002
   warmup_ratio: 0.3
   batch_size: 5

📈 상위 5개 Trials:
   1. Trial 80: 0.014024
   2. Trial 96: 0.014085
   3. Trial 83: 0.014172
   4. Trial 87: 0.014230
   5. Trial 85: 0.014240
💾 모든 trials 결과가 저장되었습니다: bmed_optuna_trials_20250909_060530.csv
💾 SQLite 데이터베이스에 실시간 저장됨: sqlite:///bmed_optuna_study_20250908_222546.db
   - 중단 후 재시작 시 자동으로 기존 결과를 불러옵니다
   - 다른 프로세스에서 진행상황 모니터링 가능합니다
🎉 하이퍼파라미터 최적화 완료!
