In [136]:
# Import modules
import torch
import torch.nn as nn
import pandas as pd
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from torch.utils.data import Subset
import math

In [137]:
def set_device():
    """Select the compute device and report the choice on stdout.

    Returns:
        torch.device: ``cuda`` when ``torch.cuda.is_available()`` is true,
        otherwise ``cpu``.
    """
    has_cuda = torch.cuda.is_available()
    device = torch.device('cuda') if has_cuda else torch.device('cpu')

    # The device line is printed in both cases; the GPU name only with CUDA.
    print(f'Using device: {device}')
    if has_cuda:
        print(f'GPU: {torch.cuda.get_device_name(0)}')

    return device

In [138]:
def norm_data(name):
    """Load an experiment CSV and min-max scale its feature columns.

    Each feature column is mapped with ``(x - lower) / (upper - lower)`` where
    the bounds are the observed extremes widened by a 20 % margin:

    * ``V``, ``E``, ``VF``, ``VA``, ``VB``: ``lower`` comes from the column
      minimum and ``upper`` from the column maximum.
    * ``CFLA``, ``CALA``, ``CBLA``, ``CFK``, ``CAK``, ``CBK``, ``I``:
      ``lower`` is fixed at 0 and ``upper`` comes from the column maximum.

    The margin is always applied outward. For non-negative extremes this is
    ``min * 0.8`` / ``max * 1.2``; for negative extremes the factors are
    swapped so the range is still widened rather than shrunk.

    A column whose widened range has zero width (for example an all-zero
    column) is written as 0.0 instead of the NaN that 0/0 would produce.

    Args:
        name: Anything accepted by ``pandas.read_csv`` (path or file-like
            object). The file must contain ``exp``, ``t`` and the twelve
            feature columns listed above.

    Returns:
        pandas.DataFrame: ``exp`` and ``t`` copied unchanged, followed by the
        scaled feature columns in the order listed in ``feature_cols``.

    Raises:
        KeyError: If a required column is missing from the file.
    """
    feature_cols = ['V', 'E', 'VF', 'VA', 'VB', 'CFLA', 'CALA', 'CBLA', 'CFK', 'CAK', 'CBK', 'I']
    # Columns whose lower bound is derived from the data; the rest start at 0.
    data_floor_cols = {'V', 'E', 'VF', 'VA', 'VB'}

    df = pd.read_csv(name)
    ndf = pd.DataFrame()
    ndf['exp'] = df['exp']
    ndf['t'] = df['t']

    for col in feature_cols:
        col_min = df[col].min()
        col_max = df[col].max()

        if col in data_floor_cols:
            # Widen downward: scale toward zero for positives, away for negatives.
            lower = col_min * 0.8 if col_min >= 0 else col_min * 1.2
        else:
            lower = 0
        # Widen upward: scale away from zero for positives, toward for negatives.
        upper = col_max * 1.2 if col_max >= 0 else col_max * 0.8

        span = upper - lower
        if span == 0:
            # Degenerate range: avoid 0/0 -> NaN propagating into training.
            ndf[col] = 0.0
        else:
            ndf[col] = (df[col] - lower) / span
    return ndf

In [139]:
def seq_data_const(ndf):
    """Split a frame into one time-ordered feature array per experiment.

    Args:
        ndf: DataFrame containing ``exp``, ``t`` and the twelve feature
            columns listed in ``feature_cols``.

    Returns:
        list: One 2-D array per distinct ``exp`` value, in the order returned
        by ``ndf['exp'].unique()``. Rows are sorted by ``t`` and columns
        follow ``feature_cols``.
    """
    feature_cols = ['V', 'E', 'VF', 'VA', 'VB', 'CFLA', 'CALA', 'CBLA', 'CFK', 'CAK', 'CBK', 'I']

    def _ordered_features(exp_id):
        # Rows of a single experiment, ordered along the time axis.
        rows = ndf[ndf['exp'] == exp_id]
        return rows.sort_values(by='t')[feature_cols].values

    return [_ordered_features(exp_id) for exp_id in ndf['exp'].unique()]

In [140]:
def padded_sequences(sequences):
    """Stack variable-length sequences into one batch-first padded tensor.

    Args:
        sequences: Non-empty list of array-likes accepted by ``torch.tensor``.

    Returns:
        tuple: ``(padded, seq_len, max_seq_len)`` where ``padded`` is the
        tensor produced by ``pad_sequence`` with ``batch_first=True`` and
        padding value -1, ``seq_len`` is the list of original lengths and
        ``max_seq_len`` is the longest of them.
    """
    seq_len = list(map(len, sequences))
    max_seq_len = max(seq_len)

    tensors = [torch.tensor(seq) for seq in sequences]
    # -1 marks padded timesteps.
    padded = pad_sequence(tensors, batch_first=True, padding_value=-1)

    return padded, seq_len, max_seq_len

In [141]:
def gen_dataset(pad_seq, seq_len):
    """Wrap the padded batch and its lengths in a ``TensorDataset``.

    Args:
        pad_seq: Tensor of padded sequences; it is cast with ``.float()``.
        seq_len: Sequence of lengths, converted with ``torch.tensor``.

    Returns:
        TensorDataset: Pairs of ``(sequence, length)``, with both tensors
        moved to the device returned by ``set_device()``.
    """
    features = pad_seq.float()
    lengths = torch.tensor(seq_len)

    # Both tensors are placed on the selected device up front.
    target = set_device()
    return TensorDataset(features.to(target), lengths.to(target))

In [142]:
def kfold_dataloaders(dataset, k_folds=5, batch_size=None, random_state=42):
    """Build one ``(train_loader, val_loader)`` pair per K-fold split.

    Args:
        dataset: Indexable dataset with a length (e.g. a ``TensorDataset``).
        k_folds: Number of folds passed to ``KFold``.
        batch_size: Batch size for every loader. When ``None`` (the default)
            it is derived as ``ceil(len(dataset) / k_folds)``. Previously the
            argument was overwritten with that derived value regardless of
            what the caller passed; an explicit value is now respected.
        random_state: Seed for the shuffled ``KFold`` split.

    Returns:
        list: ``k_folds`` tuples of ``(train_loader, val_loader)``. The
        training loader shuffles its subset; the validation loader does not.
    """
    n_samples = len(dataset)
    if batch_size is None:
        # Historical behaviour: roughly one validation fold per batch.
        batch_size = math.ceil(n_samples / k_folds)

    kfold = KFold(n_splits=k_folds, shuffle=True, random_state=random_state)
    dataloaders = []

    for fold, (train_indices, val_indices) in enumerate(kfold.split(range(n_samples))):
        print(f"Fold {fold + 1}: Train size = {len(train_indices)}, Val size = {len(val_indices)}")

        # Index-based views over the shared dataset; no data is copied.
        train_subset = Subset(dataset, train_indices)
        val_subset = Subset(dataset, val_indices)

        train_loader = DataLoader(train_subset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_subset, batch_size=batch_size, shuffle=False)

        dataloaders.append((train_loader, val_loader))
    return dataloaders

In [145]:
class LSTMEncoder(nn.Module):
    """Encode a padded batch of sequences into one vector per sequence.

    A batch-first ``nn.LSTM`` is run over the input. The output at each
    sequence's last real timestep is then passed through ``LayerNorm`` and
    ``Dropout(0.3)``.

    Args:
        input_size: Number of features per timestep.
        hidden_size: LSTM hidden size and width of the returned encoding.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout between stacked LSTM layers. It is only passed to
            ``nn.LSTM`` when ``num_layers > 1``; with a single layer
            ``nn.LSTM`` does not apply it and would only emit a warning.
    """

    def __init__(self, input_size, hidden_size, num_layers,dropout=0.2):
        super().__init__()
        lstm_dropout = dropout if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=lstm_dropout)

        self.layer_norm = nn.LayerNorm(hidden_size)
        self.dropout = nn.Dropout(0.3)

    def forward(self, x, sel_len=None):
        """Return the normalized encoding of each sequence.

        Args:
            x: Tensor of shape ``(batch, seq, input_size)``.
            sel_len: Per-sample count of valid timesteps (tensor or
                sequence of ints, each between 1 and ``seq``). When ``None``
                the final timestep of the padded batch is used for every
                sample.

        Returns:
            Tensor of shape ``(batch, hidden_size)``.
        """
        lstm_out, _ = self.lstm(x)

        if sel_len is None:
            last_output = lstm_out[:, -1, :]
        else:
            # The LSTM is unidirectional, so the output at step len-1 depends
            # only on the real inputs and is unaffected by trailing padding.
            lengths = torch.as_tensor(sel_len, dtype=torch.long, device=lstm_out.device)
            batch_idx = torch.arange(lstm_out.size(0), device=lstm_out.device)
            last_output = lstm_out[batch_idx, lengths - 1]

        norm = self.layer_norm(last_output)
        return self.dropout(norm)

In [147]:
class MLPDecoder(nn.Module):
    """Feed-forward head applied to the encoder output.

    The network is a stack of ``Linear -> LayerNorm -> ReLU -> Dropout``
    blocks followed by a final ``Linear`` projection to ``output_size``.

    Args:
        hidden_size: Width of the incoming encoding.
        output_size: Width of the prediction.
        num_layers: Number of hidden blocks; at least one block is always
            built.
        num_nodes: Width of every hidden block; defaults to ``hidden_size``.
        dropout: Dropout probability used in every hidden block.
    """

    def __init__(self, hidden_size, output_size, num_layers=2, num_nodes=None, dropout = 0.3):
        super().__init__()

        width = hidden_size if num_nodes is None else num_nodes

        stack = []
        in_features = hidden_size
        # First block maps hidden_size -> width; any further blocks map width -> width.
        for _ in range(max(num_layers, 1)):
            stack += [
                nn.Linear(in_features, width),
                nn.LayerNorm(width),
                nn.ReLU(),
                nn.Dropout(dropout),
            ]
            in_features = width

        # Output projection: width -> output_size.
        stack.append(nn.Linear(width, output_size))

        self.layers = nn.ModuleList(stack)

    def forward(self, x):
        """Apply every module in ``self.layers`` in order and return the result."""
        out = x
        for module in self.layers:
            out = module(out)
        return out

In [None]:
class EncoderDecoderModel(nn.Module):
    """Chain an ``LSTMEncoder`` with an ``MLPDecoder``.

    Args:
        input_size: Features per timestep, forwarded to the encoder.
        hidden_size: Encoder hidden size and decoder input width.
        output_size: Width of the decoder output.
        lstm_layers: Number of stacked LSTM layers in the encoder.
        mlp_layers: Number of hidden blocks in the decoder.
        mlp_nodes: Hidden width of the decoder (``None`` uses the decoder's
            own default).
        dropout: Dropout probability forwarded to the decoder only; the
            encoder is constructed with its own default dropout.
    """

    def __init__(self, input_size, hidden_size, output_size, lstm_layers=2, mlp_layers=2, mlp_nodes=None, dropout=0.3):
        super().__init__()
        self.encoder = LSTMEncoder(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=lstm_layers,
        )
        self.decoder = MLPDecoder(
            hidden_size=hidden_size,
            output_size=output_size,
            num_layers=mlp_layers,
            num_nodes=mlp_nodes,
            dropout=dropout,
        )

    def forward(self, x, seq_lengths):
        """Encode ``x`` (with its sequence lengths) and decode the result."""
        return self.decoder(self.encoder(x, seq_lengths))

In [144]:
# Data pipeline: CSV -> normalized frame -> per-experiment sequences -> padded
# tensor -> TensorDataset -> K-fold DataLoaders.
# NOTE(review): `df` and `feature_cols` are not used by the calls in this cell;
# `norm_data` reads the same CSV again on its own. Confirm whether later cells
# rely on these two names before removing them.
df = pd.read_csv('BMED_DATA_AG.csv')
feature_cols = ['V', 'E', 'VF', 'VA', 'VB', 'CFLA', 'CALA', 'CBLA', 'CFK', 'CAK', 'CBK', 'I']
# Normalized copy of the CSV (exp, t and the scaled feature columns).
ndf = norm_data('BMED_DATA_AG.csv')
# One time-ordered feature array per experiment.
seq = seq_data_const(ndf)
# Pad to a common length; the original lengths are kept alongside.
pad_seq,seq_len,max_seq_len = padded_sequences(seq)
# Float tensors on the selected device, wrapped as (sequence, length) pairs.
dataset = gen_dataset(pad_seq, seq_len)
# One (train_loader, val_loader) tuple per fold.
dataloaders = kfold_dataloaders(dataset, k_folds=5, batch_size=8, random_state=42)
print(f"\nCreated {len(dataloaders)} fold dataloaders")
print(f"Each fold contains (train_loader, val_loader) tuple")

Using device: cuda
GPU: NVIDIA GeForce RTX 4050 Laptop GPU
Fold 1: Train size = 31, Val size = 8
Fold 2: Train size = 31, Val size = 8
Fold 3: Train size = 31, Val size = 8
Fold 4: Train size = 31, Val size = 8
Fold 5: Train size = 32, Val size = 7

Created 5 fold dataloaders
Each fold contains (train_loader, val_loader) tuple
