In [1]:
# import libraries
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import TensorDataset, DataLoader, Subset
from sklearn.model_selection import KFold
import optuna
from datetime import datetime
from optuna.trial import TrialState
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class LayerNormLSTM(nn.Module):
    def __init__(self, input_node, hidden_node):
        super().__init__()
        self.input_node = input_node
        self.hidden_node = hidden_node

        self.w_i = nn.Linear(input_node, 4*hidden_node, bias=False)
        self.w_h = nn.Linear(hidden_node, 4*hidden_node, bias=False)

        self.ln_i = nn.LayerNorm(hidden_node)
        self.ln_f = nn.LayerNorm(hidden_node)
        self.ln_w = nn.LayerNorm(hidden_node)
        self.ln_o = nn.LayerNorm(hidden_node)
        self.ln_c = nn.LayerNorm(hidden_node)

    def forward(self, input, hidden):
        h_prev, c_prev = hidden

        gi = self.w_i(input)
        gh = self.w_h(h_prev)
        i_i, i_f, i_w, i_o = gi.chunk(4, dim=-1)
        h_i, h_f, h_w, h_o = gh.chunk(4, dim=-1)

        i_g = torch.sigmoid(self.ln_i(i_i + h_i))
        f_g = torch.sigmoid(self.ln_f(i_f + h_f))
        w_g = torch.tanh(self.ln_w(i_w + h_w))
        o_g = torch.sigmoid(self.ln_o(i_o + h_o))
        

        c_new = f_g * c_prev + i_g * w_g
        c_new = self.ln_c(c_new)

        h_new = o_g * torch.tanh(c_new)

        return h_new, c_new

In [3]:
class StateExtr(nn.Module):
    def __init__(self, input_node, hidden_node, n_layer, dropout):
        super().__init__()
        self.hidden_node = hidden_node
        self.n_layer = n_layer
        self.input_node = input_node

        self.lstm_cells = nn.ModuleList()
        self.lstm_cells.append(LayerNormLSTM(input_node, hidden_node))
        for _ in range(n_layer - 1):
            self.lstm_cells.append(LayerNormLSTM(hidden_node, hidden_node))

        self.dropout = nn.Dropout(dropout)
        self.final_layer_norm = nn.LayerNorm(hidden_node)
        self.final_dropout = nn.Dropout(dropout)

    def forward(self, x, seq_len):
        batch_size, max_len, input_node = x.size()
        device = x.device

        h_states = []
        c_states = []
        for _ in range(self.n_layer):
            h_states.append(torch.zeros(batch_size, self.hidden_node, device=device))
            c_states.append(torch.zeros(batch_size, self.hidden_node, device=device))
        
        outputs = []
        for t in range(max_len):
            x_t = x[:, t, :]
            layer_input = x_t
            for layer_idx, lstm_cell in enumerate(self.lstm_cells):
                h_new, c_new = lstm_cell(layer_input, (h_states[layer_idx], c_states[layer_idx]))
                h_states[layer_idx] = h_new
                c_states[layer_idx] = c_new

                if layer_idx < len(self.lstm_cells) - 1:
                    layer_input = self.dropout(h_new)
                else:
                    layer_input = h_new
            outputs.append(layer_input)
        
        output_tensor = torch.stack(outputs, dim=1)
        seq_len_cpu = seq_len.detach().cpu().long()
        mask = torch.arange(max_len, device='cpu')[None, :] < seq_len_cpu[:, None]
        mask = mask.float().to(device).unsqueeze(-1)
        masked_output = output_tensor * mask
        normalized = self.final_layer_norm(masked_output)
        return self.final_dropout(normalized)

In [4]:
class PhysicalChangeDecoder(nn.Module):
    def __init__(self, input_node, output_node, n_layer, hidden_node, dropout):
        super().__init__()
        self.layers = nn.ModuleList()
        
        self.layers.append(nn.Linear(input_node, hidden_node))
        self.layers.append(nn.LayerNorm(hidden_node))
        self.layers.append(nn.ReLU())
        self.layers.append(nn.Dropout(dropout))

        for i in range(n_layer - 1):
            self.layers.append(nn.Linear(hidden_node, hidden_node))
            self.layers.append(nn.LayerNorm(hidden_node))
            self.layers.append(nn.ReLU())
            self.layers.append(nn.Dropout(dropout))

        self.layers.append(nn.Linear(hidden_node, output_node))
    
    def forward(self, hidden_states):
        x = hidden_states
        for layer in self.layers:
            x = layer(x)
        return x

In [5]:
class CurrentPredictor(nn.Module):
    def __init__(self, input_node, hidden_node, n_layer, dropout):
        super().__init__()
        self.layers = nn.ModuleList()
        
        self.layers.append(nn.Linear(input_node, hidden_node))
        self.layers.append(nn.LayerNorm(hidden_node))
        self.layers.append(nn.ReLU())
        self.layers.append(nn.Dropout(dropout))
        
        for i in range(n_layer - 1):
            self.layers.append(nn.Linear(hidden_node, hidden_node))
            self.layers.append(nn.LayerNorm(hidden_node))
            self.layers.append(nn.ReLU())
            self.layers.append(nn.Dropout(dropout))
        
        self.layers.append(nn.Linear(hidden_node, 1))
    
    def forward(self, new_state):
        x = new_state
        for layer in self.layers:
            x = layer(x)
        return x

In [6]:
class PhysicsConstraintLayer(nn.Module):
    def __init__(self, range_mm, current_predictor, eps=1e-2):
        super().__init__()
        self.sps = eps
        self.current_predictor = current_predictor
        self.register_buffer('range_mm_tensor', self._convert_range_to_tensor(range_mm))

    def _convert_range_to_tensor(self, range_mm):
        feature_names = ['V','E','VF','VA','VB','CFLA','CALA','CFK','CBK','I']
        ranges = torch.zeros(len(feature_names),2)
        for i, name in enumerate(feature_names):
            if name in range_mm:
                ranges[i, 0] = range_mm[name]['min']
                ranges[i, 1] = range_mm[name]['max']
        return ranges
    
    def normalize(self, data, feature_idx):
        min_val = self.range_mm_tensor[feature_idx, 0]
        max_val = self.range_mm_tensor[feature_idx, 1]
        return (data - min_val) / (max_val - min_val)

    def denormalize(self, data, feature_idx):
        min_val = self.range_mm_tensor[feature_idx, 0]
        max_val = self.range_mm_tensor[feature_idx, 1]
        return data * (max_val - min_val) + min_val

    def forward(self, physical_changes, current_state):
        V_idx, E_idx, VF_idx, VA_idx, VB_idx = 0, 1, 2, 3, 4
        CFLA_idx, CALA_idx, CFK_idx, CBK_idx, I_idx = 5, 6, 7, 8, 9

        VF = self.denormalize(current_state[..., 2:3], VF_idx)
        VA = self.denormalize(current_state[..., 3:4], VA_idx)
        VB = self.denormalize(current_state[..., 4:5], VB_idx)
        CFLA = self.denormalize(current_state[..., 5:6], CFLA_idx)
        CALA = self.denormalize(current_state[..., 6:7], CALA_idx)
        CFK = self.denormalize(current_state[..., 7:8], CFK_idx)
        CBK = self.denormalize(current_state[..., 8:9], CBK_idx)

        dVA = physical_changes[..., 0:1]
        dVB = physical_changes[..., 1:2]
        rratio = physical_changes[..., 2:3]
        dNBK = physical_changes[..., 3:4]

        ratio = torch.sigmoid(rratio)
        dNALA = ratio * dNBK

        NFLA = CFLA * VF
        NALA = CALA * VA
        NFK = CFK * VF
        NBK = CBK * VB

        # tensor 비교를 torch.where로 변경
        condition1 = VF < dVA + dVB
        dVA = torch.where(condition1, torch.zeros_like(dVA), dVA)
        dVB = torch.where(condition1, torch.zeros_like(dVB), dVB)
        
        condition2 = NFLA < dNALA
        dNALA = torch.where(condition2, torch.zeros_like(dNALA), dNALA)
        
        condition3 = NFK < dNBK
        dNBK = torch.where(condition3, torch.zeros_like(dNBK), dNBK)

        nVF = VF - dVA - dVB
        nVA = VA + dVA
        nVB = VB + dVB

        nVF = torch.clamp(nVF, min=self.sps)
        nVA = torch.clamp(nVA, min=self.sps)
        nVB = torch.clamp(nVB, min=self.sps)
        
        nNFLA = NFLA - dNALA
        nNALA = NALA + dNALA
        nNFK = NFK - dNBK
        nNBK = NBK + dNBK

        nCFLA = nNFLA / nVF
        nCALA = nNALA / nVA
        nCFK = nNFK / nVF
        nCBK = nNBK / nVB

        V = current_state[..., 0:1]
        E = current_state[..., 1:2]
        nVF_norm = self.normalize(nVF, VF_idx)
        nVA_norm = self.normalize(nVA, VA_idx)
        nVB_norm = self.normalize(nVB, VB_idx)
        nCFLA_norm = self.normalize(nCFLA, CFLA_idx)
        nCALA_norm = self.normalize(nCALA, CALA_idx)
        nCFK_norm = self.normalize(nCFK, CFK_idx)
        nCBK_norm = self.normalize(nCBK, CBK_idx)

        temp_state = torch.cat([
            V, E, nVF_norm, nVA_norm, nVB_norm, nCFLA_norm, nCALA_norm, nCFK_norm, nCBK_norm
        ], dim=-1)
        
        nI_pred_norm = self.current_predictor(temp_state)
        nI_real = self.denormalize(nI_pred_norm, I_idx)
        nI_real = torch.clamp(nI_real, min=0.0)
        nI_norm = self.normalize(nI_real, I_idx)

        next_state = torch.cat([
            V, E, nVF_norm, nVA_norm, nVB_norm, nCFLA_norm, nCALA_norm, nCFK_norm, nCBK_norm, nI_norm
        ], dim=-1)
        
        return next_state

In [7]:
class BMEDAutoregressiveModel(nn.Module):
    def __init__(self, state_extr_params, decoder_params, current_predictor_params, range_mm):
        super().__init__()
        self.state_extr = StateExtr(**state_extr_params)
        self.physical_decoder = PhysicalChangeDecoder(**decoder_params)
        self.current_predictor = CurrentPredictor(**current_predictor_params)
        self.physics_constraint = PhysicsConstraintLayer(range_mm, self.current_predictor)

    def forward(self, x, seq_len):
        hidden_states = self.state_extr(x, seq_len)
        physical_changes = self.physical_decoder(hidden_states)
        new_x = self.physics_constraint(physical_changes, x)
        return new_x

In [8]:
class NoamScheduler:
    def __init__(self, optimizer, model_size, warmup_epochs, factor=1.0):
        self.optimizer = optimizer
        self.model_size = model_size
        self.warmup_epochs = warmup_epochs
        self.factor = 1
        self.epoch_num = 0

    def step_epoch(self):
        self.epoch_num += 1
        lr = self.factor * (
            self.model_size ** (-0.5) *
            min(self.epoch_num ** (-0.5), self.epoch_num * self.warmup_epochs ** (-1.5))
        )
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = lr
        return lr

In [9]:
# 유틸리티 함수들
def df_treat(name):
    df = pd.read_csv(name)
    ndf = pd.DataFrame()
    range_mm={
        'V': {'min':df['V'].min()*0.8, 'max': df['V'].max()*1.2},
        'E': {'min':df['E'].min()*0.8, 'max': df['E'].max()*1.2},
        'VF': {'min':df['VF'].min()*0.8, 'max': df['VF'].max()*1.2},
        'VA': {'min':df['VA'].min()*0.8, 'max': df['VA'].max()*1.2},
        'VB': {'min':df['VB'].min()*0.8, 'max': df['VB'].max()*1.2},
        'CFLA': {'min':0, 'max': df['CFLA'].max()*1.2},
        'CALA': {'min':0, 'max': df['CALA'].max()*1.2},
        'CFK': {'min':0, 'max': df['CFK'].max()*1.2},
        'CBK': {'min':0, 'max': df['CBK'].max()*1.2},
        'I': {'min':0, 'max': df['I'].max()*1.2},
    }
    ndf['exp'] = df['exp']; ndf['t'] = df['t']

    for col in ['V', 'E', 'VF', 'VA', 'VB', 'CFLA', 'CALA', 'CFK', 'CBK', 'I']:
        if col in range_mm:
            ndf[col] = (df[col] - range_mm[col]['min'])/(range_mm[col]['max'] - range_mm[col]['min'])
        else:
            ndf[col] = df[col]

    exp_num_list = sorted(ndf['exp'].unique())
    return df, ndf, range_mm, exp_num_list

def seq_data(ndf, exp_num_list):
    seq = []
    feature_cols = ['V', 'E', 'VF', 'VA', 'VB', 'CFLA', 'CALA', 'CFK', 'CBK', 'I']
    for exp in exp_num_list:
        exp_df = ndf[ndf['exp'] == exp]
        seq.append(exp_df[feature_cols].values)
    return seq

def pad_seq(seq):
    max_len = max([len(s) for s in seq])
    seq_len = [len(s) for s in seq]
    pad_seq = pad_sequence([torch.tensor(s) for s in seq], batch_first=True, padding_value=-1)
    return pad_seq, seq_len, max_len

def gen_dataset(pad_seq, seq_len):
    input_tensor = pad_seq.float()
    seq_len_tensor = torch.tensor(seq_len)
    dataset = TensorDataset(input_tensor, seq_len_tensor)
    return dataset

def masked_mse_loss(pred, target, seq_len):
    batch_size, max_len, features = pred.shape
    seq_len_cpu = seq_len.detach().cpu().long()
    mask = torch.arange(max_len, device='cpu')[None, :] < seq_len_cpu[:, None]
    mask = mask.float().to(pred.device)
    loss = F.mse_loss(pred, target, reduction='none')
    masked_loss = loss * mask.unsqueeze(-1)
    total_loss = masked_loss.sum()
    total_elements = mask.sum()
    masked_loss = total_loss / total_elements
    return masked_loss

def tf_data(input_seq, seq_len):
    inputs = input_seq[:, :-1, :-1]
    targets = input_seq[:, 1:, :]
    target_seq_len = seq_len - 1
    return inputs, targets, target_seq_len

In [10]:
# Optuna 목적 함수
def objective(trial):
    """
    Optuna trial을 위한 목적 함수
    K-fold cross validation을 사용하여 하이퍼파라미터 최적화
    """
    
    # 1. 하이퍼파라미터 제안
    # LSTM StateExtractor 파라미터
    lstm_hidden_size = trial.suggest_categorical('lstm_hidden_size', [16, 32, 48, 64, 72, 96, 128])
    lstm_n_layers = trial.suggest_int('lstm_n_layers', 2, 6, step=1)
    lstm_dropout = trial.suggest_float('lstm_dropout', 0.1, 0.5, step=0.1)
    
    # PhysicalChangeDecoder 파라미터
    decoder_hidden_size = trial.suggest_categorical('decoder_hidden_size', [16, 32, 48, 64, 72, 96, 128])
    decoder_n_layers = trial.suggest_int('decoder_n_layers', 2, 6, step=1)
    decoder_dropout = trial.suggest_float('decoder_dropout', 0.1, 0.6, step=0.1)
    
    # CurrentPredictor 파라미터
    current_hidden_size = trial.suggest_categorical('current_hidden_size', [16, 32, 48, 64, 72, 96, 128])
    current_n_layers = trial.suggest_int('current_n_layers', 2, 6, step=1)
    current_dropout = trial.suggest_float('current_dropout', 0.1, 0.6, step=0.1)
    
    # Batch size 파라미터
    batch_size = trial.suggest_categorical('batch_size', [3, 5, 15])
    
    # 2. K-fold Cross Validation
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    n_splits = 5
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    fold_losses = []
    
    # 데이터 로드 (global 변수 사용)
    indices = list(range(len(dataset)))
    
    for fold, (train_idx, val_idx) in enumerate(kfold.split(indices)):
        print(f"  🔄 Trial {trial.number}, Fold {fold+1}/{n_splits}")
        
        # 폴드별 데이터셋 준비
        train_subset = Subset(dataset, train_idx)
        val_subset = Subset(dataset, val_idx)
        
        train_loader = DataLoader(train_subset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_subset, batch_size=batch_size, shuffle=False)
        
        # 3. 모델 파라미터 설정
        state_extr_params = {
            'input_node': 9,
            'hidden_node': lstm_hidden_size,
            'n_layer': lstm_n_layers,
            'dropout': lstm_dropout
        }
        
        decoder_params = {
            'input_node': lstm_hidden_size,
            'hidden_node': decoder_hidden_size,
            'n_layer': decoder_n_layers,
            'dropout': decoder_dropout,
            'output_node': 4
        }
        
        current_predictor_params = {
            'input_node': 9,
            'hidden_node': current_hidden_size,
            'n_layer': current_n_layers,
            'dropout': current_dropout
        }
        
        # 4. 모델 초기화
        model = BMEDAutoregressiveModel(state_extr_params, decoder_params, current_predictor_params, range_mm)
        model = model.to(device)
        
        # 5. 옵티마이저 및 스케줄러 설정
        optimizer = torch.optim.AdamW(model.parameters(), lr=1.0)
        
        # 총 에포크 수와 warmup 에포크 계산
        total_epochs = 100  # Optuna 최적화를 위해 에포크 수 감소
        warmup_epochs = int(total_epochs * 0.1)
        
        scheduler = NoamScheduler(
            optimizer, 
            model_size=lstm_hidden_size,
            warmup_epochs=warmup_epochs,
            factor=1
        )
        
        # 6. 훈련
        best_total_loss = float('inf')
        
        for epoch in range(total_epochs):
            # Learning rate 업데이트
            current_lr = scheduler.step_epoch()
            
            # 훈련
            model.train()
            train_loss = 0.0
            train_batches = 0
            
            for input_seq, seq_len in train_loader:
                try:
                    input_seq = input_seq.to(device)
                    seq_len = seq_len.to(device)
                    
                    inputs, targets, target_seq_len = tf_data(input_seq, seq_len)
                    
                    optimizer.zero_grad()
                    pred = model(inputs, target_seq_len)
                    loss = masked_mse_loss(pred, targets, target_seq_len)
                    
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                    optimizer.step()
                    
                    train_loss += loss.item()
                    train_batches += 1
                    
                except Exception as e:
                    print(f"❌ Error in training: {str(e)}")
                    continue
            
            if train_batches == 0:
                break
                
            train_loss = train_loss / train_batches
            
            # 검증
            model.eval()
            val_loss = 0.0
            val_batches = 0
            
            with torch.no_grad():
                for input_seq, seq_len in val_loader:
                    try:
                        input_seq = input_seq.to(device)
                        seq_len = seq_len.to(device)
                        
                        inputs, targets, target_seq_len = tf_data(input_seq, seq_len)
                        
                        pred = model(inputs, target_seq_len)
                        loss = masked_mse_loss(pred, targets, target_seq_len)
                        
                        val_loss += loss.item()
                        val_batches += 1
                        
                    except Exception as e:
                        continue
            
            if val_batches == 0:
                break
                
            val_loss = val_loss / val_batches
            
            # Calculate total loss
            total_loss = train_loss + val_loss
            
            # Early stopping
            if total_loss < best_total_loss:
                best_total_loss = total_loss
        
        fold_losses.append(best_total_loss)
        print(f"    Fold {fold+1} best total loss: {best_total_loss:.6f}")
        
        # 메모리 정리
        del model, optimizer, scheduler
        torch.cuda.empty_cache()
    
    # 7. K-fold 평균 손실 반환
    avg_loss = np.mean(fold_losses)
    std_loss = np.std(fold_losses)
    
    print(f"  📊 Trial {trial.number} - Average CV Loss: {avg_loss:.6f} (±{std_loss:.6f})")
    
    return avg_loss

In [None]:
# 메인 최적화 함수
def run_optuna_optimization():
    """Optuna를 사용한 하이퍼파라미터 최적화 실행"""
    
    print("🚀 BMED TF Model Hyperparameter Optimization with Optuna")
    print("="*80)
    
    # 전역 데이터 로드
    global dataset, range_mm
    
    print("📋 데이터 로드 중...")
    df, ndf, range_mm, exp_num_list = df_treat('BMED_DATA_AG.csv')
    seq = seq_data(ndf, exp_num_list)
    pad, seq_len, max_len = pad_seq(seq)
    dataset = gen_dataset(pad, seq_len)
    
    print(f"   - 총 실험 개수: {len(exp_num_list)}")
    print(f"   - 총 데이터 포인트: {len(dataset)}")
    print(f"   - 최대 시퀀스 길이: {max_len}")
    
    # SQLite 데이터베이스를 사용한 Optuna study 생성
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    db_url = f"sqlite:///bmed_optuna_study_{timestamp}.db"
    
    study = optuna.create_study(
        direction='minimize',
        study_name='bmed_tf_optimization',
        sampler=optuna.samplers.TPESampler(seed=42),
        storage=db_url,
        load_if_exists=True
    )
    
    # 최적화 실행
    n_trials = 100
    print(f"🔍 최적화 시작 (총 {n_trials} trials)")
    
    try:
        study.optimize(objective, n_trials=n_trials, timeout=None)
    except KeyboardInterrupt:
        print("\n⚠️ 최적화가 사용자에 의해 중단되었습니다.")
    
    # 결과 분석
    print("\n" + "="*80)
    print("📊 OPTIMIZATION RESULTS")
    print("="*80)
    
    print(f"✅ 완료된 trials: {len(study.trials)}")
    print(f"🏆 최고 성능 trial: {study.best_trial.number}")
    print(f"💯 최고 성능 값: {study.best_value:.6f}")
    
    print(f"\n🎯 최적 하이퍼파라미터:")
    for key, value in study.best_params.items():
        print(f"   {key}: {value}")
    
    # 상위 5개 trial 정보
    print(f"\n📈 상위 5개 Trials:")
    trials_df = study.trials_dataframe().sort_values('value').head(5)
    for idx, (_, trial) in enumerate(trials_df.iterrows()):
        print(f"   {idx+1}. Trial {int(trial['number'])}: {trial['value']:.6f}")
    
    # 결과 저장
    result_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    
    # Trials 결과 CSV로 저장
    trials_file = f"bmed_optuna_trials_{result_timestamp}.csv"
    trials_df = study.trials_dataframe()
    trials_df.to_csv(trials_file, index=False)
    print(f"💾 모든 trials 결과가 저장되었습니다: {trials_file}")
    
    # SQLite 데이터베이스 정보
    print(f"💾 SQLite 데이터베이스에 실시간 저장됨: {db_url}")
    print(f"   - 중단 후 재시작 시 자동으로 기존 결과를 불러옵니다")
    print(f"   - 다른 프로세스에서 진행상황 모니터링 가능합니다")
    
    print("="*80)
    print("🎉 하이퍼파라미터 최적화 완료!")
    
    return study

if __name__ == "__main__":
    study = run_optuna_optimization()

🚀 BMED TF Model Hyperparameter Optimization with Optuna
📋 데이터 로드 중...
   - 총 실험 개수: 15
   - 총 데이터 포인트: 15
   - 최대 시퀀스 길이: 37


[I 2025-09-15 15:49:02,930] A new study created in RDB with name: bmed_tf_optimization


🔍 최적화 시작 (총 100 trials)
  🔄 Trial 0, Fold 1/5
    Fold 1 best total loss: 0.011608
  🔄 Trial 0, Fold 2/5
    Fold 2 best total loss: 0.298200
  🔄 Trial 0, Fold 3/5
    Fold 3 best total loss: 0.196105
  🔄 Trial 0, Fold 4/5
    Fold 4 best total loss: 0.012230
  🔄 Trial 0, Fold 5/5


[I 2025-09-15 15:56:41,298] Trial 0 finished with value: 0.16931789900797106 and parameters: {'lstm_hidden_size': 32, 'lstm_n_layers': 6, 'lstm_dropout': 0.4, 'decoder_hidden_size': 48, 'decoder_n_layers': 3, 'decoder_dropout': 0.4, 'current_hidden_size': 48, 'current_n_layers': 5, 'current_dropout': 0.2, 'batch_size': 5}. Best is trial 0 with value: 0.16931789900797106.


    Fold 5 best total loss: 0.328448
  📊 Trial 0 - Average CV Loss: 0.169318 (±0.135794)
  🔄 Trial 1, Fold 1/5
    Fold 1 best total loss: 0.322963
  🔄 Trial 1, Fold 2/5
    Fold 2 best total loss: 0.325461
  🔄 Trial 1, Fold 3/5
    Fold 3 best total loss: 0.232099
  🔄 Trial 1, Fold 4/5
    Fold 4 best total loss: 0.212643
  🔄 Trial 1, Fold 5/5


[I 2025-09-15 15:57:48,208] Trial 1 finished with value: 0.28939867168664934 and parameters: {'lstm_hidden_size': 72, 'lstm_n_layers': 2, 'lstm_dropout': 0.4, 'decoder_hidden_size': 72, 'decoder_n_layers': 3, 'decoder_dropout': 0.4, 'current_hidden_size': 48, 'current_n_layers': 6, 'current_dropout': 0.1, 'batch_size': 15}. Best is trial 0 with value: 0.16931789900797106.


    Fold 5 best total loss: 0.353827
  📊 Trial 1 - Average CV Loss: 0.289399 (±0.056130)
  🔄 Trial 2, Fold 1/5
    Fold 1 best total loss: 0.017323
  🔄 Trial 2, Fold 2/5
    Fold 2 best total loss: 0.022412
  🔄 Trial 2, Fold 3/5
    Fold 3 best total loss: 0.029010
  🔄 Trial 2, Fold 4/5
    Fold 4 best total loss: 0.018293
  🔄 Trial 2, Fold 5/5


[I 2025-09-15 16:05:30,046] Trial 2 finished with value: 0.022032728915413226 and parameters: {'lstm_hidden_size': 48, 'lstm_n_layers': 6, 'lstm_dropout': 0.1, 'decoder_hidden_size': 16, 'decoder_n_layers': 5, 'decoder_dropout': 0.1, 'current_hidden_size': 48, 'current_n_layers': 3, 'current_dropout': 0.5, 'batch_size': 5}. Best is trial 2 with value: 0.022032728915413226.


    Fold 5 best total loss: 0.023125
  📊 Trial 2 - Average CV Loss: 0.022033 (±0.004152)
  🔄 Trial 3, Fold 1/5
    Fold 1 best total loss: 0.294786
  🔄 Trial 3, Fold 2/5
    Fold 2 best total loss: 0.297102
  🔄 Trial 3, Fold 3/5
    Fold 3 best total loss: 0.197776
  🔄 Trial 3, Fold 4/5
    Fold 4 best total loss: 0.171690
  🔄 Trial 3, Fold 5/5


[I 2025-09-15 16:10:42,014] Trial 3 finished with value: 0.25774471790840225 and parameters: {'lstm_hidden_size': 72, 'lstm_n_layers': 4, 'lstm_dropout': 0.1, 'decoder_hidden_size': 96, 'decoder_n_layers': 4, 'decoder_dropout': 0.5, 'current_hidden_size': 72, 'current_n_layers': 6, 'current_dropout': 0.5, 'batch_size': 5}. Best is trial 2 with value: 0.022032728915413226.


    Fold 5 best total loss: 0.327370
  📊 Trial 3 - Average CV Loss: 0.257745 (±0.061271)
  🔄 Trial 4, Fold 1/5
    Fold 1 best total loss: 0.013070
  🔄 Trial 4, Fold 2/5
    Fold 2 best total loss: 0.320548
  🔄 Trial 4, Fold 3/5
    Fold 3 best total loss: 0.018206
  🔄 Trial 4, Fold 4/5
    Fold 4 best total loss: 0.015274
  🔄 Trial 4, Fold 5/5


[I 2025-09-15 16:20:33,413] Trial 4 finished with value: 0.07820196094689891 and parameters: {'lstm_hidden_size': 32, 'lstm_n_layers': 6, 'lstm_dropout': 0.1, 'decoder_hidden_size': 96, 'decoder_n_layers': 4, 'decoder_dropout': 0.5, 'current_hidden_size': 32, 'current_n_layers': 2, 'current_dropout': 0.4, 'batch_size': 3}. Best is trial 2 with value: 0.022032728915413226.


    Fold 5 best total loss: 0.023911
  📊 Trial 4 - Average CV Loss: 0.078202 (±0.121228)
  🔄 Trial 5, Fold 1/5
    Fold 1 best total loss: 0.010976
  🔄 Trial 5, Fold 2/5
    Fold 2 best total loss: 0.018370
  🔄 Trial 5, Fold 3/5
    Fold 3 best total loss: 0.018296
  🔄 Trial 5, Fold 4/5
    Fold 4 best total loss: 0.171643
  🔄 Trial 5, Fold 5/5


[I 2025-09-15 16:26:54,716] Trial 5 finished with value: 0.047762057992319265 and parameters: {'lstm_hidden_size': 72, 'lstm_n_layers': 5, 'lstm_dropout': 0.2, 'decoder_hidden_size': 128, 'decoder_n_layers': 3, 'decoder_dropout': 0.2, 'current_hidden_size': 48, 'current_n_layers': 2, 'current_dropout': 0.5, 'batch_size': 5}. Best is trial 2 with value: 0.022032728915413226.


    Fold 5 best total loss: 0.019526
  📊 Trial 5 - Average CV Loss: 0.047762 (±0.062015)
  🔄 Trial 6, Fold 1/5
    Fold 1 best total loss: 0.314062
  🔄 Trial 6, Fold 2/5
    Fold 2 best total loss: 0.319134
  🔄 Trial 6, Fold 3/5
    Fold 3 best total loss: 0.223684
  🔄 Trial 6, Fold 4/5
    Fold 4 best total loss: 0.202942
  🔄 Trial 6, Fold 5/5


[I 2025-09-15 16:33:36,212] Trial 6 finished with value: 0.2810863287188113 and parameters: {'lstm_hidden_size': 48, 'lstm_n_layers': 4, 'lstm_dropout': 0.30000000000000004, 'decoder_hidden_size': 64, 'decoder_n_layers': 5, 'decoder_dropout': 0.6, 'current_hidden_size': 96, 'current_n_layers': 2, 'current_dropout': 0.1, 'batch_size': 3}. Best is trial 2 with value: 0.022032728915413226.


    Fold 5 best total loss: 0.345609
  📊 Trial 6 - Average CV Loss: 0.281086 (±0.056745)
  🔄 Trial 7, Fold 1/5
    Fold 1 best total loss: 0.029638
  🔄 Trial 7, Fold 2/5
    Fold 2 best total loss: 0.024815
  🔄 Trial 7, Fold 3/5
    Fold 3 best total loss: 0.037516
  🔄 Trial 7, Fold 4/5
    Fold 4 best total loss: 0.025037
  🔄 Trial 7, Fold 5/5


[I 2025-09-15 16:40:02,088] Trial 7 finished with value: 0.029486780830969412 and parameters: {'lstm_hidden_size': 72, 'lstm_n_layers': 5, 'lstm_dropout': 0.4, 'decoder_hidden_size': 16, 'decoder_n_layers': 6, 'decoder_dropout': 0.30000000000000004, 'current_hidden_size': 16, 'current_n_layers': 5, 'current_dropout': 0.2, 'batch_size': 5}. Best is trial 2 with value: 0.022032728915413226.


    Fold 5 best total loss: 0.030427
  📊 Trial 7 - Average CV Loss: 0.029487 (±0.004626)
  🔄 Trial 8, Fold 1/5
    Fold 1 best total loss: 0.313678
  🔄 Trial 8, Fold 2/5
    Fold 2 best total loss: 0.319133
  🔄 Trial 8, Fold 3/5
    Fold 3 best total loss: 0.076459
  🔄 Trial 8, Fold 4/5
    Fold 4 best total loss: 0.200142
  🔄 Trial 8, Fold 5/5


[I 2025-09-15 16:49:57,665] Trial 8 finished with value: 0.2512471362017095 and parameters: {'lstm_hidden_size': 32, 'lstm_n_layers': 6, 'lstm_dropout': 0.5, 'decoder_hidden_size': 16, 'decoder_n_layers': 6, 'decoder_dropout': 0.5, 'current_hidden_size': 64, 'current_n_layers': 5, 'current_dropout': 0.5, 'batch_size': 3}. Best is trial 2 with value: 0.022032728915413226.


    Fold 5 best total loss: 0.346824
  📊 Trial 8 - Average CV Loss: 0.251247 (±0.100800)
  🔄 Trial 9, Fold 1/5
    Fold 1 best total loss: 0.295447
  🔄 Trial 9, Fold 2/5
    Fold 2 best total loss: 0.301671
  🔄 Trial 9, Fold 3/5
    Fold 3 best total loss: 0.198960
  🔄 Trial 9, Fold 4/5
    Fold 4 best total loss: 0.020660
  🔄 Trial 9, Fold 5/5


[I 2025-09-15 16:56:20,255] Trial 9 finished with value: 0.22828113154197732 and parameters: {'lstm_hidden_size': 64, 'lstm_n_layers': 5, 'lstm_dropout': 0.4, 'decoder_hidden_size': 32, 'decoder_n_layers': 4, 'decoder_dropout': 0.4, 'current_hidden_size': 72, 'current_n_layers': 4, 'current_dropout': 0.5, 'batch_size': 5}. Best is trial 2 with value: 0.022032728915413226.


    Fold 5 best total loss: 0.324668
  📊 Trial 9 - Average CV Loss: 0.228281 (±0.112387)
  🔄 Trial 10, Fold 1/5
    Fold 1 best total loss: 0.322785
  🔄 Trial 10, Fold 2/5
    Fold 2 best total loss: 0.325342
  🔄 Trial 10, Fold 3/5
    Fold 3 best total loss: 0.232078
  🔄 Trial 10, Fold 4/5
    Fold 4 best total loss: 0.212531
  🔄 Trial 10, Fold 5/5


[I 2025-09-15 16:57:25,313] Trial 10 finished with value: 0.28897142559289934 and parameters: {'lstm_hidden_size': 48, 'lstm_n_layers': 2, 'lstm_dropout': 0.2, 'decoder_hidden_size': 16, 'decoder_n_layers': 2, 'decoder_dropout': 0.1, 'current_hidden_size': 128, 'current_n_layers': 3, 'current_dropout': 0.6, 'batch_size': 15}. Best is trial 2 with value: 0.022032728915413226.


    Fold 5 best total loss: 0.352122
  📊 Trial 10 - Average CV Loss: 0.288971 (±0.055739)
  🔄 Trial 11, Fold 1/5
    Fold 1 best total loss: 0.019625
  🔄 Trial 11, Fold 2/5
    Fold 2 best total loss: 0.028019
  🔄 Trial 11, Fold 3/5
    Fold 3 best total loss: 0.032124
  🔄 Trial 11, Fold 4/5
    Fold 4 best total loss: 0.029698
  🔄 Trial 11, Fold 5/5


[I 2025-09-15 17:03:48,948] Trial 11 finished with value: 0.02815317511558533 and parameters: {'lstm_hidden_size': 128, 'lstm_n_layers': 5, 'lstm_dropout': 0.30000000000000004, 'decoder_hidden_size': 16, 'decoder_n_layers': 6, 'decoder_dropout': 0.2, 'current_hidden_size': 16, 'current_n_layers': 4, 'current_dropout': 0.30000000000000004, 'batch_size': 5}. Best is trial 2 with value: 0.022032728915413226.


    Fold 5 best total loss: 0.031299
  📊 Trial 11 - Average CV Loss: 0.028153 (±0.004490)
  🔄 Trial 12, Fold 1/5
    Fold 1 best total loss: 0.014406
  🔄 Trial 12, Fold 2/5
    Fold 2 best total loss: 0.021917
  🔄 Trial 12, Fold 3/5
    Fold 3 best total loss: 0.028684
  🔄 Trial 12, Fold 4/5
    Fold 4 best total loss: 0.016722
  🔄 Trial 12, Fold 5/5


[I 2025-09-15 17:10:10,073] Trial 12 finished with value: 0.02147533755439023 and parameters: {'lstm_hidden_size': 128, 'lstm_n_layers': 5, 'lstm_dropout': 0.2, 'decoder_hidden_size': 16, 'decoder_n_layers': 5, 'decoder_dropout': 0.1, 'current_hidden_size': 16, 'current_n_layers': 3, 'current_dropout': 0.30000000000000004, 'batch_size': 5}. Best is trial 12 with value: 0.02147533755439023.


    Fold 5 best total loss: 0.025648
  📊 Trial 12 - Average CV Loss: 0.021475 (±0.005332)
  🔄 Trial 13, Fold 1/5
    Fold 1 best total loss: 0.015655
  🔄 Trial 13, Fold 2/5
    Fold 2 best total loss: 0.103033
  🔄 Trial 13, Fold 3/5
    Fold 3 best total loss: 0.026980
  🔄 Trial 13, Fold 4/5
    Fold 4 best total loss: 0.022916
  🔄 Trial 13, Fold 5/5


[I 2025-09-15 17:14:04,140] Trial 13 finished with value: 0.04005235596559942 and parameters: {'lstm_hidden_size': 16, 'lstm_n_layers': 3, 'lstm_dropout': 0.2, 'decoder_hidden_size': 16, 'decoder_n_layers': 5, 'decoder_dropout': 0.1, 'current_hidden_size': 16, 'current_n_layers': 3, 'current_dropout': 0.30000000000000004, 'batch_size': 5}. Best is trial 12 with value: 0.02147533755439023.


    Fold 5 best total loss: 0.031677
  📊 Trial 13 - Average CV Loss: 0.040052 (±0.031926)
  🔄 Trial 14, Fold 1/5
    Fold 1 best total loss: 0.293735
  🔄 Trial 14, Fold 2/5
    Fold 2 best total loss: 0.299836
  🔄 Trial 14, Fold 3/5
    Fold 3 best total loss: 0.195517
  🔄 Trial 14, Fold 4/5
    Fold 4 best total loss: 0.173985
  🔄 Trial 14, Fold 5/5


[I 2025-09-15 17:21:42,337] Trial 14 finished with value: 0.2583388180161516 and parameters: {'lstm_hidden_size': 128, 'lstm_n_layers': 6, 'lstm_dropout': 0.1, 'decoder_hidden_size': 48, 'decoder_n_layers': 5, 'decoder_dropout': 0.2, 'current_hidden_size': 128, 'current_n_layers': 3, 'current_dropout': 0.4, 'batch_size': 5}. Best is trial 12 with value: 0.02147533755439023.


    Fold 5 best total loss: 0.328621
  📊 Trial 14 - Average CV Loss: 0.258339 (±0.061606)
  🔄 Trial 15, Fold 1/5
    Fold 1 best total loss: 0.308021
  🔄 Trial 15, Fold 2/5
    Fold 2 best total loss: 0.072000
  🔄 Trial 15, Fold 3/5
    Fold 3 best total loss: 0.029893
  🔄 Trial 15, Fold 4/5
    Fold 4 best total loss: 0.021531
  🔄 Trial 15, Fold 5/5


[I 2025-09-15 17:24:15,521] Trial 15 finished with value: 0.15714634917676448 and parameters: {'lstm_hidden_size': 96, 'lstm_n_layers': 5, 'lstm_dropout': 0.2, 'decoder_hidden_size': 128, 'decoder_n_layers': 5, 'decoder_dropout': 0.1, 'current_hidden_size': 64, 'current_n_layers': 3, 'current_dropout': 0.6, 'batch_size': 15}. Best is trial 12 with value: 0.02147533755439023.


    Fold 5 best total loss: 0.354287
  📊 Trial 15 - Average CV Loss: 0.157146 (±0.143849)
  🔄 Trial 16, Fold 1/5
    Fold 1 best total loss: 0.295417
  🔄 Trial 16, Fold 2/5
    Fold 2 best total loss: 0.302019
  🔄 Trial 16, Fold 3/5
