In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# rolling minmax scaling 함수 (window=24)
def rolling_minmax_scale(series, window=24):
    roll_min = series.rolling(window=window, min_periods=window).min()
    roll_max = series.rolling(window=window, min_periods=window).max()
    scaled = (series - roll_min) / ((roll_max - roll_min) + 1e-8)
    scaled = scaled.replace([np.inf, -np.inf], np.nan)
    scaled = scaled.fillna(1.0)
    return scaled.clip(upper=1.0)

# binning 및 one-hot 인코딩 함수 (결과를 정수 0,1로)
def bin_and_encode(data, features, bins=100, drop_original=True):
    for feature in features:
        data[f'{feature}_Bin'] = pd.cut(data[feature], bins=bins, labels=False)
        one_hot = pd.get_dummies(data[f'{feature}_Bin'], prefix=f'{feature}_Bin').astype(np.int32)
        expected_columns = [f'{feature}_Bin_{i}' for i in range(bins)]
        one_hot = one_hot.reindex(columns=expected_columns, fill_value=0)
        data = pd.concat([data, one_hot], axis=1)
        if drop_original:
            data.drop(columns=[f'{feature}_Bin'], inplace=True)
    numeric_cols = data.select_dtypes(include=[np.number]).columns.tolist()
    for col in numeric_cols:
        data[col] = data[col].astype(np.float32)
    return data

##############################################
# Diffusion 모델을 위한 Dataset 정의
##############################################
class DiffusionTimeSeriesDataset(Dataset):
    def __init__(self, input_data, target_data, lookback=24):
        self.input_data = input_data.values
        self.target_data = target_data.values
        self.lookback = lookback

    def __len__(self):
        return len(self.input_data) - self.lookback

    def __getitem__(self, idx):
        # 조건 데이터: lookback window
        x = self.input_data[idx: idx + self.lookback, :]
        # 타깃: close_target 값을 이용해 상승이면 1, 하락이면 0으로 설정 (float형)
        y = self.target_data[idx + self.lookback, 0]
        y_prev = self.target_data[idx + self.lookback - 1, 0]
        label = 1.0 if y > y_prev else 0.0
        return torch.tensor(x, dtype=torch.float32), torch.tensor([label], dtype=torch.float32)

##############################################
# Condition Encoder: 시계열 window를 벡터로 인코딩
##############################################
class ConditionEncoder(nn.Module):
    def __init__(self, input_dim, lookback, condition_dim):
        super(ConditionEncoder, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(input_dim * lookback, condition_dim),
            nn.ReLU(),
            nn.Linear(condition_dim, condition_dim)
        )
        
    def forward(self, x):
        # x: [batch, lookback, input_dim]
        batch_size = x.size(0)
        x = x.view(batch_size, -1)
        return self.fc(x)

##############################################
# DiffusionClassifier: diffusion process를 통한 노이즈 예측 모델
##############################################
class DiffusionClassifier(nn.Module):
    def __init__(self, input_dim, lookback, condition_dim=128, num_timesteps=100, hidden_dim=128):
        super(DiffusionClassifier, self).__init__()
        self.num_timesteps = num_timesteps
        # diffusion 스케줄 (선형 beta schedule)
        betas = torch.linspace(1e-4, 0.02, num_timesteps)
        alphas = 1 - betas
        alphas_cumprod = torch.cumprod(alphas, dim=0)
        self.register_buffer('betas', betas)
        self.register_buffer('alphas', alphas)
        self.register_buffer('alphas_cumprod', alphas_cumprod)
        
        # 조건 인코더: 시계열 데이터를 조건으로 임베딩
        self.condition_encoder = ConditionEncoder(input_dim, lookback, condition_dim)
        # timestep 임베딩
        self.time_embedding = nn.Embedding(num_timesteps, hidden_dim)
        # 노이즈 예측 네트워크: 입력은 [y_noisy, condition, timestep embedding]
        self.model = nn.Sequential(
            nn.Linear(1 + condition_dim + hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1)
        )
        
    def forward(self, x_condition, y_noisy, t):
        # x_condition: [batch, lookback, input_dim]
        # y_noisy: [batch, 1] - 노이즈가 추가된 타깃
        # t: [batch] - timestep 인덱스
        cond = self.condition_encoder(x_condition)           # [batch, condition_dim]
        t_emb = self.time_embedding(t)                         # [batch, hidden_dim]
        inp = torch.cat([y_noisy, cond, t_emb], dim=1)          # [batch, 1+condition_dim+hidden_dim]
        predicted_noise = self.model(inp)                      # [batch, 1]
        return predicted_noise
    
    def sample(self, x_condition, device):
        """
        reverse diffusion 과정을 통해 조건 x_condition에 대해 예측값을 샘플링
        최종 출력은 continuous 값으로, 임계값 0.5로 분류 가능함.
        """
        batch_size = x_condition.size(0)
        # 초기 y: 정규분포 노이즈
        y = torch.randn(batch_size, 1, device=device)
        for t in reversed(range(self.num_timesteps)):
            t_tensor = torch.full((batch_size,), t, device=device, dtype=torch.long)
            predicted_noise = self.forward(x_condition, y, t_tensor)
            alpha = self.alphas[t]
            alpha_cumprod = self.alphas_cumprod[t]
            beta = self.betas[t]
            # DDPM 업데이트: 간단화된 형태
            y = (1 / torch.sqrt(alpha)) * (y - (beta / torch.sqrt(1 - alpha_cumprod)) * predicted_noise)
            # t > 0이면 약간의 노이즈 추가
            if t > 0:
                noise = torch.randn_like(y)
                y = y + torch.sqrt(beta) * noise
        return y

##############################################
# Diffusion 모델 학습 및 평가 함수
##############################################
def train_diffusion_model(model, dataloader, num_epochs, device, lr=1e-4):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    mse_loss = nn.MSELoss()
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for x, y in dataloader:
            x = x.to(device)  # [batch, lookback, input_dim]
            y = y.to(device)  # [batch, 1] (0.0 or 1.0)
            batch_size = x.size(0)
            # 각 배치마다 timestep t를 균등 샘플링
            t = torch.randint(0, model.num_timesteps, (batch_size,), device=device).long()
            # 해당 timestep에 따른 누적 알파값
            alphas_cumprod_t = model.alphas_cumprod[t].view(batch_size, 1)
            # 노이즈 샘플링
            noise = torch.randn_like(y)
            # y_noisy = sqrt(alpha_cumprod)*y + sqrt(1-alpha_cumprod)*noise
            y_noisy = torch.sqrt(alphas_cumprod_t) * y + torch.sqrt(1 - alphas_cumprod_t) * noise
            optimizer.zero_grad()
            predicted_noise = model(x, y_noisy, t)
            loss = mse_loss(predicted_noise, noise)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(dataloader):.4f}")

def evaluate_diffusion_model(model, dataloader, device):
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for x, y in dataloader:
            x = x.to(device)
            y = y.to(device)
            # reverse diffusion 과정을 통해 예측값 샘플링
            y_sampled = model.sample(x, device)
            # 0.5 기준으로 분류
            y_pred = (y_sampled >= 0.5).float()
            correct += (y_pred == y).sum().item()
            total += y.size(0)
    acc = correct / total
    print(f"Evaluation Accuracy: {acc:.4f}")
    return acc

##############################################
# 데이터 로드 및 전처리 (OHLC 4개 데이터 사용)
##############################################
data = pd.read_csv("BTC_upbit_KRW_min60.csv", index_col=0)
data = data[['open', 'high', 'low', 'close']]
data.index = pd.to_datetime(data.index)

ohlc_features = ['open', 'high', 'low', 'close']
for feature in ohlc_features:
    data[feature] = rolling_minmax_scale(data[feature], window=24)

data = bin_and_encode(data, ohlc_features, bins=100, drop_original=True)
# 타깃은 원본 close 값 사용 (실험 목적)
data['close_target'] = data['close']
data = data.dropna()

# 최종 입력: _Bin_ 접미사가 있는 열들만 사용
final_input_columns = [col for col in data.columns if '_Bin_' in col]
final_target_column = ['close_target']

data_input = data[final_input_columns]
data_target = data[final_target_column]

##############################################
# 실험 실행: Diffusion Model 기반 주가 상승/하락 예측
##############################################
def train_and_evaluate_diffusion(data, num_experiments=4, lookback=24, num_epochs=10):
    final_input_columns = [col for col in data.columns if '_Bin_' in col]
    target_cols = ['close_target']
    
    data_input = data[final_input_columns]
    data_target = data[target_cols]
    
    data_input = data_input.apply(pd.to_numeric).astype(np.float32)
    data_target = data_target.apply(pd.to_numeric).astype(np.float32)
    
    step_size = 2500
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    
    val_acc_list = []
    test_acc_list = []
    
    for exp in range(num_experiments):
        train_start = exp * step_size
        train_end = train_start + step_size * 8
        val_end = train_end + step_size
        test_end = val_end + step_size
        if test_end > len(data_input):
            break
        print(f"\nExperiment {exp}: 데이터 구간 [{train_start}:{test_end}]")
        
        train_input = data_input.iloc[train_start:train_end]
        train_target = data_target.iloc[train_start:train_end]
        val_input = data_input.iloc[train_end:val_end]
        val_target = data_target.iloc[train_end:val_end]
        test_input = data_input.iloc[val_end:test_end]
        test_target = data_target.iloc[val_end:test_end]
        
        train_dataset = DiffusionTimeSeriesDataset(train_input, train_target, lookback=lookback)
        val_dataset = DiffusionTimeSeriesDataset(val_input, val_target, lookback=lookback)
        test_dataset = DiffusionTimeSeriesDataset(test_input, test_target, lookback=lookback)
        
        train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
        test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
        
        input_dim = train_input.shape[1]
        model = DiffusionClassifier(input_dim=input_dim, lookback=lookback, 
                                    condition_dim=128, num_timesteps=100, hidden_dim=128).to(device)
        model_path = f"diffusion_model_experiment_{exp}.pth"
        if exp > 0:
            try:
                model.load_state_dict(torch.load(f"diffusion_model_experiment_{exp - 1}.pth"))
                print(f"Loaded model from experiment {exp - 1} for fine-tuning.")
            except FileNotFoundError:
                print(f"Model file for experiment {exp - 1} not found. Starting fresh training.")
        
        print(f"Experiment {exp}: Training Diffusion Model")
        train_diffusion_model(model, train_loader, num_epochs, device, lr=1e-4)
        torch.save(model.state_dict(), model_path)
        print(f"Saved model for experiment {exp}.")
        
        print("Validation Evaluation:")
        val_acc = evaluate_diffusion_model(model, val_loader, device)
        val_acc_list.append(val_acc)
        
        print("Test Evaluation:")
        test_acc = evaluate_diffusion_model(model, test_loader, device)
        test_acc_list.append(test_acc)
    
    if val_acc_list:
        avg_val_acc = sum(val_acc_list) / len(val_acc_list)
        avg_test_acc = sum(test_acc_list) / len(test_acc_list)
        print(f"\nFinal Average Validation Accuracy: {avg_val_acc:.4f}")
        print(f"Final Average Test Accuracy: {avg_test_acc:.4f}")
    else:
        print("실험이 한 번도 실행되지 않았습니다.")

# 최종적으로 Diffusion Model 실험 실행
train_and_evaluate_diffusion(data, num_experiments=4, lookback=24, num_epochs=10)


  data['close_target'] = data['close']



Experiment 0: 데이터 구간 [0:25000]
Experiment 0: Training Diffusion Model
Epoch 1/10, Loss: 0.9116
Epoch 2/10, Loss: 0.6441
Epoch 3/10, Loss: 0.5454
Epoch 4/10, Loss: 0.5273
Epoch 5/10, Loss: 0.4875
Epoch 6/10, Loss: 0.4613
Epoch 7/10, Loss: 0.4215
Epoch 8/10, Loss: 0.4034
Epoch 9/10, Loss: 0.3565
Epoch 10/10, Loss: 0.3303
Saved model for experiment 0.
Validation Evaluation:
Evaluation Accuracy: 0.5513
Test Evaluation:
Evaluation Accuracy: 0.5828

Experiment 1: 데이터 구간 [2500:27500]
Loaded model from experiment 0 for fine-tuning.
Experiment 1: Training Diffusion Model
Epoch 1/10, Loss: 0.3608
Epoch 2/10, Loss: 0.3295
Epoch 3/10, Loss: 0.3193
Epoch 4/10, Loss: 0.2910
Epoch 5/10, Loss: 0.2821
Epoch 6/10, Loss: 0.2726
Epoch 7/10, Loss: 0.2537
Epoch 8/10, Loss: 0.2479
Epoch 9/10, Loss: 0.2477
Epoch 10/10, Loss: 0.2383
Saved model for experiment 1.
Validation Evaluation:
Evaluation Accuracy: 0.5557
Test Evaluation:
Evaluation Accuracy: 0.5388

Experiment 2: 데이터 구간 [5000:30000]
Loaded model from 

In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# rolling minmax scaling 함수 (window=24)
def rolling_minmax_scale(series, window=12):
    roll_min = series.rolling(window=window, min_periods=window).min()
    roll_max = series.rolling(window=window, min_periods=window).max()
    scaled = (series - roll_min) / ((roll_max - roll_min) + 1e-8)
    scaled = scaled.replace([np.inf, -np.inf], np.nan)
    scaled = scaled.fillna(1.0)
    return scaled.clip(upper=1.0)

# binning 및 one-hot 인코딩 함수 (결과를 정수 0,1로)
def bin_and_encode(data, features, bins=100, drop_original=True):
    for feature in features:
        data[f'{feature}_Bin'] = pd.cut(data[feature], bins=bins, labels=False)
        one_hot = pd.get_dummies(data[f'{feature}_Bin'], prefix=f'{feature}_Bin').astype(np.int32)
        expected_columns = [f'{feature}_Bin_{i}' for i in range(bins)]
        one_hot = one_hot.reindex(columns=expected_columns, fill_value=0)
        data = pd.concat([data, one_hot], axis=1)
        if drop_original:
            data.drop(columns=[f'{feature}_Bin'], inplace=True)
    numeric_cols = data.select_dtypes(include=[np.number]).columns.tolist()
    for col in numeric_cols:
        data[col] = data[col].astype(np.float32)
    return data

##############################################
# Diffusion 모델을 위한 Dataset 정의
##############################################
class DiffusionTimeSeriesDataset(Dataset):
    def __init__(self, input_data, target_data, lookback=12):
        self.input_data = input_data.values
        self.target_data = target_data.values
        self.lookback = lookback

    def __len__(self):
        return len(self.input_data) - self.lookback

    def __getitem__(self, idx):
        # 조건 데이터: lookback window
        x = self.input_data[idx: idx + self.lookback, :]
        # 타깃: close_target 값을 이용해 상승이면 1, 하락이면 0으로 설정 (float형)
        y = self.target_data[idx + self.lookback, 0]
        y_prev = self.target_data[idx + self.lookback - 1, 0]
        label = 1.0 if y > y_prev else 0.0
        return torch.tensor(x, dtype=torch.float32), torch.tensor([label], dtype=torch.float32)

##############################################
# Condition Encoder: 시계열 window를 벡터로 인코딩
##############################################
class ConditionEncoder(nn.Module):
    def __init__(self, input_dim, lookback, condition_dim):
        super(ConditionEncoder, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(input_dim * lookback, condition_dim),
            nn.ReLU(),
            nn.Linear(condition_dim, condition_dim)
        )
        
    def forward(self, x):
        # x: [batch, lookback, input_dim]
        batch_size = x.size(0)
        x = x.view(batch_size, -1)
        return self.fc(x)

##############################################
# DiffusionClassifier: diffusion process를 통한 노이즈 예측 모델
##############################################
class DiffusionClassifier(nn.Module):
    def __init__(self, input_dim, lookback, condition_dim=128, num_timesteps=100, hidden_dim=128):
        super(DiffusionClassifier, self).__init__()
        self.num_timesteps = num_timesteps
        # diffusion 스케줄 (선형 beta schedule)
        betas = torch.linspace(1e-4, 0.02, num_timesteps)
        alphas = 1 - betas
        alphas_cumprod = torch.cumprod(alphas, dim=0)
        self.register_buffer('betas', betas)
        self.register_buffer('alphas', alphas)
        self.register_buffer('alphas_cumprod', alphas_cumprod)
        
        # 조건 인코더: 시계열 데이터를 조건으로 임베딩
        self.condition_encoder = ConditionEncoder(input_dim, lookback, condition_dim)
        # timestep 임베딩
        self.time_embedding = nn.Embedding(num_timesteps, hidden_dim)
        # 노이즈 예측 네트워크: 입력은 [y_noisy, condition, timestep embedding]
        self.model = nn.Sequential(
            nn.Linear(1 + condition_dim + hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1)
        )
        
    def forward(self, x_condition, y_noisy, t):
        # x_condition: [batch, lookback, input_dim]
        # y_noisy: [batch, 1] - 노이즈가 추가된 타깃
        # t: [batch] - timestep 인덱스
        cond = self.condition_encoder(x_condition)           # [batch, condition_dim]
        t_emb = self.time_embedding(t)                         # [batch, hidden_dim]
        inp = torch.cat([y_noisy, cond, t_emb], dim=1)          # [batch, 1+condition_dim+hidden_dim]
        predicted_noise = self.model(inp)                      # [batch, 1]
        return predicted_noise
    
    def sample(self, x_condition, device):
        """
        reverse diffusion 과정을 통해 조건 x_condition에 대해 예측값을 샘플링
        최종 출력은 continuous 값으로, 임계값 0.5로 분류 가능함.
        """
        batch_size = x_condition.size(0)
        # 초기 y: 정규분포 노이즈
        y = torch.randn(batch_size, 1, device=device)
        for t in reversed(range(self.num_timesteps)):
            t_tensor = torch.full((batch_size,), t, device=device, dtype=torch.long)
            predicted_noise = self.forward(x_condition, y, t_tensor)
            alpha = self.alphas[t]
            alpha_cumprod = self.alphas_cumprod[t]
            beta = self.betas[t]
            # DDPM 업데이트: 간단화된 형태
            y = (1 / torch.sqrt(alpha)) * (y - (beta / torch.sqrt(1 - alpha_cumprod)) * predicted_noise)
            # t > 0이면 약간의 노이즈 추가
            if t > 0:
                noise = torch.randn_like(y)
                y = y + torch.sqrt(beta) * noise
        return y

##############################################
# Diffusion 모델 학습 및 평가 함수
##############################################
def train_diffusion_model(model, dataloader, num_epochs, device, lr=1e-4):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    mse_loss = nn.MSELoss()
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for x, y in dataloader:
            x = x.to(device)  # [batch, lookback, input_dim]
            y = y.to(device)  # [batch, 1] (0.0 or 1.0)
            batch_size = x.size(0)
            # 각 배치마다 timestep t를 균등 샘플링
            t = torch.randint(0, model.num_timesteps, (batch_size,), device=device).long()
            # 해당 timestep에 따른 누적 알파값
            alphas_cumprod_t = model.alphas_cumprod[t].view(batch_size, 1)
            # 노이즈 샘플링
            noise = torch.randn_like(y)
            # y_noisy = sqrt(alpha_cumprod)*y + sqrt(1-alpha_cumprod)*noise
            y_noisy = torch.sqrt(alphas_cumprod_t) * y + torch.sqrt(1 - alphas_cumprod_t) * noise
            optimizer.zero_grad()
            predicted_noise = model(x, y_noisy, t)
            loss = mse_loss(predicted_noise, noise)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(dataloader):.4f}")

def evaluate_diffusion_model(model, dataloader, device):
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for x, y in dataloader:
            x = x.to(device)
            y = y.to(device)
            # reverse diffusion 과정을 통해 예측값 샘플링
            y_sampled = model.sample(x, device)
            # 0.5 기준으로 분류
            y_pred = (y_sampled >= 0.5).float()
            correct += (y_pred == y).sum().item()
            total += y.size(0)
    acc = correct / total
    print(f"Evaluation Accuracy: {acc:.4f}")
    return acc

##############################################
# 데이터 로드 및 전처리 (OHLC 4개 데이터 사용)
##############################################
data = pd.read_csv("ETH_upbit_KRW_min5_0309.csv", index_col=0)
data = data[['open', 'high', 'low', 'close']]
data.index = pd.to_datetime(data.index)

ohlc_features = ['open', 'high', 'low', 'close']
for feature in ohlc_features:
    data[feature] = rolling_minmax_scale(data[feature], window=12)

data = bin_and_encode(data, ohlc_features, bins=100, drop_original=True)
# 타깃은 원본 close 값 사용 (실험 목적)
data['close_target'] = data['close']
data = data.dropna()

# 최종 입력: _Bin_ 접미사가 있는 열들만 사용
final_input_columns = [col for col in data.columns if '_Bin_' in col]
final_target_column = ['close_target']

data_input = data[final_input_columns]
data_target = data[final_target_column]

##############################################
# 실험 실행: Diffusion Model 기반 주가 상승/하락 예측
##############################################
def train_and_evaluate_diffusion(data, num_experiments=16, lookback=12, num_epochs=10):
    final_input_columns = [col for col in data.columns if '_Bin_' in col]
    target_cols = ['close_target']
    
    data_input = data[final_input_columns]
    data_target = data[target_cols]
    
    data_input = data_input.apply(pd.to_numeric).astype(np.float32)
    data_target = data_target.apply(pd.to_numeric).astype(np.float32)
    
    step_size = 32000
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    
    val_acc_list = []
    test_acc_list = []
    
    for exp in range(num_experiments):
        train_start = exp * step_size
        train_end = train_start + step_size * 8
        val_end = train_end + step_size
        test_end = val_end + step_size
        if test_end > len(data_input):
            break
        print(f"\nExperiment {exp}: 데이터 구간 [{train_start}:{test_end}]")
        
        train_input = data_input.iloc[train_start:train_end]
        train_target = data_target.iloc[train_start:train_end]
        val_input = data_input.iloc[train_end:val_end]
        val_target = data_target.iloc[train_end:val_end]
        test_input = data_input.iloc[val_end:test_end]
        test_target = data_target.iloc[val_end:test_end]
        
        train_dataset = DiffusionTimeSeriesDataset(train_input, train_target, lookback=lookback)
        val_dataset = DiffusionTimeSeriesDataset(val_input, val_target, lookback=lookback)
        test_dataset = DiffusionTimeSeriesDataset(test_input, test_target, lookback=lookback)
        
        train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
        test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
        
        input_dim = train_input.shape[1]
        model = DiffusionClassifier(input_dim=input_dim, lookback=lookback, 
                                    condition_dim=128, num_timesteps=100, hidden_dim=128).to(device)
        model_path = f"diffusion_model_experiment_{exp}.pth"
        if exp > 0:
            try:
                model.load_state_dict(torch.load(f"diffusion_model_experiment_{exp - 1}.pth"))
                print(f"Loaded model from experiment {exp - 1} for fine-tuning.")
            except FileNotFoundError:
                print(f"Model file for experiment {exp - 1} not found. Starting fresh training.")
        
        print(f"Experiment {exp}: Training Diffusion Model")
        train_diffusion_model(model, train_loader, num_epochs, device, lr=1e-4)
        torch.save(model.state_dict(), model_path)
        print(f"Saved model for experiment {exp}.")
        
        print("Validation Evaluation:")
        val_acc = evaluate_diffusion_model(model, val_loader, device)
        val_acc_list.append(val_acc)
        
        print("Test Evaluation:")
        test_acc = evaluate_diffusion_model(model, test_loader, device)
        test_acc_list.append(test_acc)
    
    if val_acc_list:
        avg_val_acc = sum(val_acc_list) / len(val_acc_list)
        avg_test_acc = sum(test_acc_list) / len(test_acc_list)
        print(f"\nFinal Average Validation Accuracy: {avg_val_acc:.4f}")
        print(f"Final Average Test Accuracy: {avg_test_acc:.4f}")
    else:
        print("실험이 한 번도 실행되지 않았습니다.")

# 최종적으로 Diffusion Model 실험 실행
train_and_evaluate_diffusion(data, num_experiments=15, lookback=12, num_epochs=10)


  data['close_target'] = data['close']



Experiment 0: 데이터 구간 [0:320000]
Experiment 0: Training Diffusion Model
Epoch 1/10, Loss: 0.5566
Epoch 2/10, Loss: 0.4920
Epoch 3/10, Loss: 0.4769
Epoch 4/10, Loss: 0.4562
Epoch 5/10, Loss: 0.4325
Epoch 6/10, Loss: 0.4058
Epoch 7/10, Loss: 0.3693
Epoch 8/10, Loss: 0.3273
Epoch 9/10, Loss: 0.2844
Epoch 10/10, Loss: 0.2462
Saved model for experiment 0.
Validation Evaluation:
Evaluation Accuracy: 0.6002
Test Evaluation:
Evaluation Accuracy: 0.5946

Experiment 1: 데이터 구간 [32000:352000]
Loaded model from experiment 0 for fine-tuning.
Experiment 1: Training Diffusion Model
Epoch 1/10, Loss: 0.2551
Epoch 2/10, Loss: 0.2178
Epoch 3/10, Loss: 0.1908
Epoch 4/10, Loss: 0.1644
Epoch 5/10, Loss: 0.1419
Epoch 6/10, Loss: 0.1251
Epoch 7/10, Loss: 0.1110
Epoch 8/10, Loss: 0.0991
Epoch 9/10, Loss: 0.0914
Epoch 10/10, Loss: 0.0831
Saved model for experiment 1.
Validation Evaluation:
Evaluation Accuracy: 0.5915
Test Evaluation:
Evaluation Accuracy: 0.5971

Experiment 2: 데이터 구간 [64000:384000]
Loaded model 

In [1]:
import time
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from math import sqrt

##############################################
# 데이터 전처리: rolling minmax scaling 및 binning
##############################################
def rolling_minmax_scale(series, window=24):
    roll_min = series.rolling(window=window, min_periods=window).min()
    roll_max = series.rolling(window=window, min_periods=window).max()
    scaled = (series - roll_min) / ((roll_max - roll_min) + 1e-8)
    scaled = scaled.replace([np.inf, -np.inf], np.nan)
    scaled = scaled.fillna(1.0)
    return scaled.clip(upper=1.0)

def bin_and_encode(data, features, bins=100, drop_original=True):
    for feature in features:
        data[f'{feature}_Bin'] = pd.cut(data[feature], bins=bins, labels=False)
        one_hot = pd.get_dummies(data[f'{feature}_Bin'], prefix=f'{feature}_Bin').astype(np.int32)
        expected_columns = [f'{feature}_Bin_{i}' for i in range(bins)]
        one_hot = one_hot.reindex(columns=expected_columns, fill_value=0)
        data = pd.concat([data, one_hot], axis=1)
        if drop_original:
            data.drop(columns=[f'{feature}_Bin'], inplace=True)
    numeric_cols = data.select_dtypes(include=[np.number]).columns.tolist()
    for col in numeric_cols:
        data[col] = data[col].astype(np.float32)
    return data

##############################################
# Diffusion 모델을 위한 Dataset 정의 (회귀 + 방향 평가용)
##############################################
class DiffusionTimeSeriesDataset(Dataset):
    def __init__(self, input_data, target_data, lookback=24):
        self.input_data = input_data.values
        self.target_data = target_data.values  # continuous target 값 (scaled close)
        self.lookback = lookback

    def __len__(self):
        return len(self.input_data) - self.lookback

    def __getitem__(self, idx):
        # 조건 데이터: lookback window
        x = self.input_data[idx: idx + self.lookback, :]
        # 타깃: 다음 시점의 target (continuous)
        target = self.target_data[idx + self.lookback, 0]
        # 이전 시점의 target (분류 평가에 사용)
        prev_target = self.target_data[idx + self.lookback - 1, 0]
        return (torch.tensor(x, dtype=torch.float32),
                torch.tensor([target], dtype=torch.float32),
                torch.tensor([prev_target], dtype=torch.float32))

##############################################
# Condition Encoder: 시계열 window를 벡터로 인코딩
##############################################
class ConditionEncoder(nn.Module):
    def __init__(self, input_dim, lookback, condition_dim):
        super(ConditionEncoder, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(input_dim * lookback, condition_dim),
            nn.ReLU(),
            nn.Linear(condition_dim, condition_dim)
        )
        
    def forward(self, x):
        batch_size = x.size(0)
        x = x.view(batch_size, -1)
        return self.fc(x)

##############################################
# DiffusionClassifier: 조건부 diffusion 모델 (회귀, 평가 시 방향 분류)
##############################################
class DiffusionClassifier(nn.Module):
    def __init__(self, input_dim, lookback, condition_dim=128, num_timesteps=100, hidden_dim=128):
        super(DiffusionClassifier, self).__init__()
        self.num_timesteps = num_timesteps
        betas = torch.linspace(1e-4, 0.02, num_timesteps)
        alphas = 1 - betas
        alphas_cumprod = torch.cumprod(alphas, dim=0)
        self.register_buffer('betas', betas)
        self.register_buffer('alphas', alphas)
        self.register_buffer('alphas_cumprod', alphas_cumprod)
        
        self.condition_encoder = ConditionEncoder(input_dim, lookback, condition_dim)
        self.time_embedding = nn.Embedding(num_timesteps, hidden_dim)
        self.model = nn.Sequential(
            nn.Linear(1 + condition_dim + hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1)
        )
        
    def forward(self, x_condition, y_noisy, t):
        cond = self.condition_encoder(x_condition)
        t_emb = self.time_embedding(t)
        inp = torch.cat([y_noisy, cond, t_emb], dim=1)
        predicted_noise = self.model(inp)
        return predicted_noise
    
    def sample(self, x_condition, device):
        batch_size = x_condition.size(0)
        y = torch.randn(batch_size, 1, device=device)
        for t in reversed(range(self.num_timesteps)):
            t_tensor = torch.full((batch_size,), t, device=device, dtype=torch.long)
            predicted_noise = self.forward(x_condition, y, t_tensor)
            alpha = self.alphas[t]
            alpha_cumprod = self.alphas_cumprod[t]
            beta = self.betas[t]
            y = (1 / torch.sqrt(alpha)) * (y - (beta / torch.sqrt(1 - alpha_cumprod)) * predicted_noise)
            if t > 0:
                noise = torch.randn_like(y)
                y = y + torch.sqrt(beta) * noise
        return y

##############################################
# Diffusion 모델 학습 및 평가 함수 (회귀 + 방향 평가)
##############################################
def train_diffusion_model(model, dataloader, num_epochs, device, lr=1e-4):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    mse_loss = nn.MSELoss()
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for x, y, _ in dataloader:
            x = x.to(device)
            y = y.to(device)
            batch_size = x.size(0)
            t = torch.randint(0, model.num_timesteps, (batch_size,), device=device).long()
            alphas_cumprod_t = model.alphas_cumprod[t].view(batch_size, 1)
            noise = torch.randn_like(y)
            y_noisy = torch.sqrt(alphas_cumprod_t) * y + torch.sqrt(1 - alphas_cumprod_t) * noise
            optimizer.zero_grad()
            predicted_noise = model(x, y_noisy, t)
            loss = mse_loss(predicted_noise, noise)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(dataloader):.6f}")

def evaluate_diffusion_model(model, dataloader, device):
    model.eval()
    mse_loss = nn.MSELoss()
    total_mse = 0
    total_samples = 0
    correct = 0
    with torch.no_grad():
        for x, y, y_prev in dataloader:
            x = x.to(device)
            y = y.to(device)
            y_prev = y_prev.to(device)
            y_sampled = model.sample(x, device)
            loss = mse_loss(y_sampled, y)
            total_mse += loss.item() * y.size(0)
            total_samples += y.size(0)
            # 예측된 방향: 1 if 예측값 > y_prev, 0 otherwise
            y_pred_class = (y_sampled > y_prev).float()
            # 실제 방향: 1 if y > y_prev, else 0
            y_true_class = (y > y_prev).float()
            correct += (y_pred_class == y_true_class).sum().item()
    avg_mse = total_mse / total_samples
    accuracy = correct / total_samples
    print(f"Evaluation MSE: {avg_mse:.6f}, Accuracy: {accuracy:.4f}")
    return avg_mse, accuracy

##############################################
# 데이터 로드 및 전처리 (OHLC 값 사용: 원본 값에 scaling 후 인코딩)
##############################################
data = pd.read_csv("ETH_upbit_KRW_min5_0309.csv", index_col=0)
data.index = pd.to_datetime(data.index)
data = data[['open', 'high', 'low', 'close']]

# 각 OHLC에 대해 rolling minmax scaling 적용 후 새 컬럼 생성 (scaled 값)
for feature in ['open', 'high', 'low', 'close']:
    data[feature + '_scaled'] = rolling_minmax_scale(data[feature], window=24)
data = data.dropna()

# one-hot 인코딩: _scaled 컬럼 대상 (각 100구간 → 총 400차원)
features_to_bin = ['open_scaled', 'high_scaled', 'low_scaled', 'close_scaled']
data = bin_and_encode(data, features_to_bin, bins=100, drop_original=True)

# 타깃: close_scaled 컬럼을 그대로 사용 (continuous regression target)
data['close_target'] = data['close_scaled']
data = data.dropna()

# 최종 입력: '_scaled_Bin_'가 포함된 열들만 선택
final_input_columns = [col for col in data.columns if '_scaled_Bin_' in col]
final_target_column = ['close_target']

data_input = data[final_input_columns]
data_target = data[final_target_column]

##############################################
# 실험 실행: Diffusion Model 기반 주가 예측 (회귀 + 방향 평가)
##############################################
def train_and_evaluate_diffusion(data, num_experiments=16, lookback=24, num_epochs=10):
    final_input_columns = [col for col in data.columns if '_scaled_Bin_' in col]
    target_cols = ['close_target']
    
    data_input = data[final_input_columns]
    data_target = data[target_cols]
    
    data_input = data_input.apply(pd.to_numeric).astype(np.float32)
    data_target = data_target.apply(pd.to_numeric).astype(np.float32)
    
    step_size = 31200
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    
    val_mse_list = []
    test_mse_list = []
    val_acc_list = []
    test_acc_list = []
    
    for exp in range(num_experiments):
        train_start = exp * step_size
        train_end = train_start + step_size * 8
        val_end = train_end + step_size
        test_end = val_end + step_size
        if test_end > len(data_input):
            break
        print(f"\nExperiment {exp}: 데이터 구간 [{train_start}:{test_end}]")
        
        train_input = data_input.iloc[train_start:train_end]
        train_target = data_target.iloc[train_start:train_end]
        val_input = data_input.iloc[train_end:val_end]
        val_target = data_target.iloc[train_end:val_end]
        test_input = data_input.iloc[val_end:test_end]
        test_target = data_target.iloc[val_end:test_end]
        
        train_dataset = DiffusionTimeSeriesDataset(train_input, train_target, lookback=lookback)
        val_dataset = DiffusionTimeSeriesDataset(val_input, val_target, lookback=lookback)
        test_dataset = DiffusionTimeSeriesDataset(test_input, test_target, lookback=lookback)
        
        train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
        test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
        
        input_dim = train_input.shape[1]
        model = DiffusionClassifier(input_dim=input_dim, lookback=lookback, 
                                    condition_dim=128, num_timesteps=100, hidden_dim=128).to(device)
        model_path = f"diffusion_model_experiment_{exp}.pth"
        if exp > 0:
            try:
                model.load_state_dict(torch.load(f"diffusion_model_experiment_{exp - 1}.pth"))
                print(f"Loaded model from experiment {exp - 1} for fine-tuning.")
            except FileNotFoundError:
                print(f"Model file for experiment {exp - 1} not found. Starting fresh training.")
        
        print(f"Experiment {exp}: Training Diffusion Model")
        train_diffusion_model(model, train_loader, num_epochs, device, lr=1e-4)
        torch.save(model.state_dict(), model_path)
        print(f"Saved model for experiment {exp}.")
        
        print("Validation Evaluation:")
        val_mse, val_acc = evaluate_diffusion_model(model, val_loader, device)
        val_mse_list.append(val_mse)
        val_acc_list.append(val_acc)
        
        print("Test Evaluation:")
        test_mse, test_acc = evaluate_diffusion_model(model, test_loader, device)
        test_mse_list.append(test_mse)
        test_acc_list.append(test_acc)
    
        print(f"Experiment {exp}: Validation MSE: {val_mse:.6f}, val_Accuracy: {val_acc:.4f}, test_Accuracy: {test_acc:.4f}")
    
    if val_mse_list:
        avg_val_mse = sum(val_mse_list) / len(val_mse_list)
        avg_test_mse = sum(test_mse_list) / len(test_mse_list)
        avg_val_acc = sum(val_acc_list) / len(val_acc_list)
        avg_test_acc = sum(test_acc_list) / len(test_acc_list)
        print(f"\nFinal Average Validation MSE: {avg_val_mse:.6f}")
        print(f"Final Average Test MSE: {avg_test_mse:.6f}")
        print(f"Final Average Val Accuracy: {avg_val_acc:.4f}")
        print(f"Final Average Test Accuracy: {avg_test_acc:.4f}")
    else:
        print("실험이 한 번도 실행되지 않았습니다.")

##############################################
# 전체 실행 시간 측정
##############################################
start_time = time.time()
train_and_evaluate_diffusion(data, num_experiments=16, lookback=24, num_epochs=10)
end_time = time.time()
elapsed = end_time - start_time
print(f"\n총 수행 시간: {elapsed:.2f}초")


  data['close_target'] = data['close_scaled']



Experiment 0: 데이터 구간 [0:312000]
Experiment 0: Training Diffusion Model
Epoch 1/10, Loss: 0.368460
Epoch 2/10, Loss: 0.251631
Epoch 3/10, Loss: 0.232721
Epoch 4/10, Loss: 0.221808
Epoch 5/10, Loss: 0.209884
Epoch 6/10, Loss: 0.197628
Epoch 7/10, Loss: 0.189020
Epoch 8/10, Loss: 0.180835
Epoch 9/10, Loss: 0.172085
Epoch 10/10, Loss: 0.165783
Saved model for experiment 0.
Validation Evaluation:
Evaluation MSE: 0.050323, Accuracy: 0.5410
Test Evaluation:
Evaluation MSE: 0.051397, Accuracy: 0.5469
Experiment 0: Validation MSE: 0.050323, val_Accuracy: 0.5410, test_Accuracy: 0.5469

Experiment 1: 데이터 구간 [31200:343200]
Loaded model from experiment 0 for fine-tuning.
Experiment 1: Training Diffusion Model
Epoch 1/10, Loss: 0.168132
Epoch 2/10, Loss: 0.158581
Epoch 3/10, Loss: 0.148966
Epoch 4/10, Loss: 0.142342
Epoch 5/10, Loss: 0.136289
Epoch 6/10, Loss: 0.132642
Epoch 7/10, Loss: 0.128058
Epoch 8/10, Loss: 0.121237
Epoch 9/10, Loss: 0.118595
Epoch 10/10, Loss: 0.113431
Saved model for experi