In [1]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["TORCH_USE_CUDA_DSA"] = '1'

import pandas as pd
import numpy as np
from torch_geometric.data import Data
from torch_geometric.utils import to_networkx

import networkx as nx
import numpy as np
import planarity
import networkx as nx
import torch

from utils import *
df_0, df_1, df_2, df_3, df_4, df_5, df_6 = load_data()

df_6.drop(columns=['A060000','A002860'],inplace=True)


train_start_day = '2017-01-03'
train_end_day = '2020-03-20'

valid_start_day = '2020-03-23'
valid_end_day = '2020-11-02'

test_start_day = '2020-11-03'
test_end_day = '2021-09-30'


#lstm에서는 이게 중요.
seq_length = 100
rebal_term = 20
often_freq = 1

# 정적인 자산 운용. 미리 stock을 정해놓고 들어간다.
# 모든 idx는 iloc으로 접근한다.

df_list = [df_0, df_1, df_2, df_3, df_4, df_5, df_6]

In [2]:
#select_stocks
#get the iloc of train_start_day from df_open
train_start_idx = df_0.index.get_loc(train_start_day)
test_start_idx = df_0.index.get_loc(test_start_day)

selected_stocks = select_stocks(df_list, train_start_idx, test_start_idx, 50, 30)

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np

from utils import *

class LSTMCustomDataset(Dataset):
    '''
    init 변수 설명.
    
    df_list : [df_open, df_high, df_low, df_close, df_vol, df_index, df_markets]
    stock_list: 미리 뽑아 둔 주식리스트를 활용한다. select_stocks 함수 이용.
    start_date : Train/val/test 처음에 자를 날짜. 이 때 인자로서는 train/val/test별로 다르게 들어가야함.
    end_date: Train/val/test 나중에 자를 날짜. 이 때 인자로서는 train/val/test별로 다르게 들어가야함.
    seq_length : LSTM에 들어갈 sequence length -> 20, 100 모두 진행할 것.
    rebal_term : 리밸런싱을 할 term -> 며칠 뒤의 라벨을 맞출 것인가. -> 이건 20 영업일 뒤로
    often_freq : 데이터를 뽑아낼 주기. 여기선 모두 1로 둔다.
    
    rebal_idx_list: 시퀀스를 만들어낼 리밸런싱 날짜들의 인덱스를 저장해둔 리스트. 그래프 시퀀스는 리밸런싱 날짜를 기준으로 하나씩 만들어짐.
        
    '''
    def __init__(self, df_list, stock_list, start_date, end_date, seq_length, rebal_term, often_freq, device):
        self.stock_list = stock_list
        self.start_date = start_date
        self.end_date = end_date
        self.seq_length = seq_length
        self.rebal_term = rebal_term
        self.often_freq = often_freq
        self.device = device
        # 스케일링을 적용할 데이터 프레임 복사 및 스케일링
        self.df_open = standard_scale_feature(df_list[0],pd.to_datetime(self.start_date), pd.to_datetime(self.end_date))
        self.df_high = standard_scale_feature(df_list[1],pd.to_datetime(self.start_date), pd.to_datetime(self.end_date))
        self.df_low = standard_scale_feature(df_list[2], pd.to_datetime(self.start_date), pd.to_datetime(self.end_date))
        self.df_close = standard_scale_feature(df_list[3], pd.to_datetime(self.start_date), pd.to_datetime(self.end_date))
        self.df_vol = standard_scale_feature(df_list[4], pd.to_datetime(self.start_date), pd.to_datetime(self.end_date))
        self.df_index = df_list[5]  # 인덱스는 스케일링하지 않음
        self.df_markets = df_list[6]  # 시장 데이터는 스케일링하지 않음
        self.df_close_notscaled = df_list[3].copy()  # 라벨 생성에 사용할 원본 종가 데이터

        self.rebal_idx_list = []
        self.stocks_list = []
    
        self.make_rebal_idx_list()

    def make_labels(self, selected_stocks_final, rebal_idx):

        # 코스피 인덱스의 로그 리턴 구하기
        future_index_prices = self.df_index.iloc[rebal_idx+self.rebal_term].values[0]
        current_index_prices = self.df_index.iloc[rebal_idx].values[0]
        
        index_log_return = np.log(((future_index_prices+1e-6)/(current_index_prices+1e-6))+1e-6)
        
        # 선택된 주식들의 로그 리턴 구하기
        future_close_prices = self.df_close_notscaled[selected_stocks_final].iloc[rebal_idx+self.rebal_term].values
        current_close_prices = self.df_close_notscaled[selected_stocks_final].iloc[rebal_idx].values
        
        close_log_returns = np.log((future_close_prices+1e-6)/(current_close_prices+1e-6)+1e-6)
        
        # 라벨 만들기
        labels = [1 if close_log_return > index_log_return else 0 for close_log_return in close_log_returns]  # 20일 뒤에 인덱스보다 먹으면 0      
        
        return labels



    def make_rebal_idx_list(self):
        '''
        데이터셋 만들 때, 시퀀스를 만들 날짜들을 만드는 함수
        '''
        
        if not isinstance(self.start_date, pd.Timestamp):
            self.start_date = pd.to_datetime(self.start_date)
        if not isinstance(self.end_date, pd.Timestamp):
            self.end_date = pd.to_datetime(self.end_date)
        
        start_idx = self.df_open.index.get_loc(self.start_date)
        end_idx = self.df_open.index.get_loc(self.end_date)
        self.rebal_idx_list = list(range(start_idx, end_idx, self.often_freq))

    def prepare_data(self):
        # 데이터 준비 로직 변경
        self.time_series_data = []
        self.labels = []
        
        for rebal_idx in self.rebal_idx_list:
            selected_stocks_final = self.stock_list
            
            # 시계열 데이터를 저장할 리스트 초기화
            sequence_data = []
            
            for past_days in range(self.seq_length, 0, -1):
                # 각 시점에 대한 특성 데이터 추출
                features = []
                for stock in selected_stocks_final:
                    features.append([
                        self.df_open[stock].iloc[rebal_idx - past_days],
                        self.df_open[stock].iloc[rebal_idx - past_days-self.rebal_term],
                        self.df_high[stock].iloc[rebal_idx - past_days],
                        self.df_high[stock].iloc[rebal_idx - past_days-self.rebal_term],
                        self.df_low[stock].iloc[rebal_idx - past_days],
                        self.df_low[stock].iloc[rebal_idx - past_days-self.rebal_term],
                        self.df_close[stock].iloc[rebal_idx - past_days],
                        self.df_close[stock].iloc[rebal_idx-past_days-self.rebal_term],
                        self.df_vol[stock].iloc[rebal_idx - past_days],
                        self.df_vol[stock].iloc[rebal_idx - past_days-self.rebal_term]
                    ])
                sequence_data.append(features)
            labels = self.make_labels(selected_stocks_final, rebal_idx)
            
            self.time_series_data.append(np.array(sequence_data))
            self.labels.append(labels)
    
    def __len__(self):
        return len(self.time_series_data)
    
    def __getitem__(self, idx):
        sequence_data = torch.tensor(self.time_series_data[idx], dtype=torch.float).to(self.device)
        labels = torch.tensor(self.labels[idx], dtype=torch.long).to(self.device)

        return sequence_data, labels

In [137]:
first_labels = test_lstm_dataset.labels[0]
second_labels = test_lstm_dataset.labels[1]

# 값이 다른 인덱스만 찾기
different_indices = [i for i, (first, second) in enumerate(zip(first_labels, second_labels)) if first != second]

print("Indices with different labels:", different_indices)


Indices with different labels: [3, 15]


In [10]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [11]:
%%capture
train_lstm_dataset = LSTMCustomDataset(df_list, selected_stocks, train_start_day, train_end_day, seq_length, rebal_term, often_freq,device)
train_lstm_dataset.prepare_data()

In [12]:
%%capture
valid_lstm_dataset = LSTMCustomDataset(df_list, selected_stocks, valid_start_day, valid_end_day, seq_length, rebal_term, often_freq,device)
valid_lstm_dataset.prepare_data()

In [13]:
%%capture
test_lstm_dataset = LSTMCustomDataset(df_list, selected_stocks, test_start_day, test_end_day, seq_length, rebal_term, often_freq,device)
test_lstm_dataset.prepare_data()

786개 rebal_idx 마다 100일의 sequence가 만들어지고, 이 떄는 각 stock마다 10개의 stock indicator을 담은 정보임.

In [14]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [81]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import torch.nn.functional as F

class MultiLSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size, num_stocks, dropout=0.2):
        super(MultiLSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.num_stocks = num_stocks
        
        # 각 주식별로 LSTM 모델을 만들기 위한 ModuleList
        self.lstm_list = nn.ModuleList([nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout) 
                                         for _ in range(num_stocks)])
        
        # 각 주식별로 다른 fully connected layer를 만들기 위한 ModuleList
        self.fc_list = nn.ModuleList([nn.Linear(hidden_size, output_size) for _ in range(num_stocks)])
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        outputs = []
        for i in range(self.num_stocks):
            h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
            c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
            out, _ = self.lstm_list[i](x[:, i, :, :].squeeze(1), (h0, c0))
            out = self.dropout(out[:, -1, :])
            out = self.fc_list[i](out)
            outputs.append(out.unsqueeze(1))  # 주식별 예측을 3차원 텐서로 만들기 위해 unsqueeze 사용
        outputs = torch.cat(outputs, dim=1)  # [배치 크기, 주식 수, 클래스 수] 형태로 변환
        return outputs


In [82]:
def train_lstm(train_dataloader, valid_dataloader, seed, num_epochs, hidden_size, num_classes, num_stocks, input_size, num_layers, dropout, device, seq_length, rebal_term, patience=10):
    
    model = MultiLSTMModel(input_size, hidden_size, num_layers, num_classes, num_stocks, dropout)
    model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-5, weight_decay=5e-6)

    best_val_loss = float('inf')
    best_model_weights = None
    patience_counter = 0
    
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        for i, (inputs, labels) in enumerate(train_dataloader):
            inputs = inputs.to(device)
            labels = labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)  # outputs 크기: [배치 크기, 주식 수, 클래스 수]

            loss = 0
            for j in range(outputs.size(1)):  # 주식 수만큼 반복
                loss += criterion(outputs[:, j, :], labels[:, j])
            loss /= outputs.size(1)  # 평균 손실 계산

            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        train_loss /= len(train_dataloader)
        
        # 검증 과정
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for i, (inputs, labels) in enumerate(valid_dataloader):
                inputs = inputs.to(device)
                labels = labels.to(device)
                outputs = model(inputs)

                loss = 0
                for j in range(outputs.size(1)):  # 주식 수만큼 반복
                    loss += criterion(outputs[:, j, :], labels[:, j])
                loss /= outputs.size(1)  # 평균 손실 계산

                val_loss += loss.item()
            val_loss /= len(valid_dataloader)

            if val_loss < best_val_loss:
                best_val_loss = val_loss
                best_model_weights = model.state_dict()
                patience_counter = 0
            else:
                patience_counter += 1
                if patience_counter > patience:
                    print(f"Early stopping triggered. Stopping training at epoch {epoch}")
                    break
    
    if best_model_weights is not None:
        model_path = f'./checkpoints/best_lstm_model_{seed}_{seq_length}_{rebal_term}.pth'
        torch.save(best_model_weights, model_path)
        print(f"Best LSTM model weights saved successfully at {model_path}.")
    
    best_lstm = MultiLSTMModel(input_size, hidden_size, num_layers, num_classes, num_stocks, dropout)
    best_lstm.load_state_dict(torch.load(model_path))
    best_lstm.to(device)
    best_lstm.eval()
    
    return best_lstm, outputs, labels


In [83]:
# 데이터로더 설정 및 모델 학습
num_stocks = len(selected_stocks)  # 주식 종목 수
input_size = 10  # 입력 특성 수
hidden_size = 64  # LSTM 은닉 상태 크기
num_layers = 2  # LSTM 층 수
num_classes = 2  # 이진 분류이므로 출력 크기는 2
dropout = 0.2  # 드롭아웃 비율
num_epochs = 100

train_dataloader = DataLoader(train_lstm_dataset, batch_size=32, shuffle=True)
valid_dataloader = DataLoader(valid_lstm_dataset, batch_size=32, shuffle=False)
test_dataloader = DataLoader(test_lstm_dataset, batch_size=32, shuffle=False)

# 장치 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# SEED 설정
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)
torch.deterministic = True
torch.backends.cudnn.benchmark = False

# 학습 함수 호출
best_lstm, outputs, labels = train_lstm(train_dataloader, valid_dataloader, seed=seed, num_epochs=num_epochs, hidden_size=hidden_size,
                       num_classes=num_classes, num_stocks=num_stocks, input_size=input_size, num_layers=num_layers,
                       dropout=dropout, device=device, seq_length=seq_length, rebal_term=rebal_term, patience=10)


Early stopping triggered. Stopping training at epoch 24
Best LSTM model weights saved successfully at ./checkpoints/best_lstm_model_42_100_20.pth.


In [None]:
def test_lstm(test_dataloader, input_size, hidden_size, num_layers, num_classes, num_stocks, dropout, device, model_path):
    # 모델 초기화
    model = MultiLSTMModel(input_size, hidden_size, num_layers, num_classes, num_stocks, dropout)
    model.to(device)

    # 저장된 state_dict 불러오기
    model.load_state_dict(torch.load(model_path))
    
    model.eval()  # 모델을 평가 모드로 설정
    all_predictions = []
    all_labels = []

    outputs_list = []
    
    with torch.no_grad():  # 그래디언트 계산을 비활성화
        for inputs, labels in test_dataloader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs)  # 모델을 통해 추론 실행
            _, predicted = torch.max(outputs, dim=2)  # 가장 높은 점수를 가진 클래스를 선택
            outputs_list.append(outputs)
            all_predictions.append(predicted.cpu().numpy())  # 예측 결과 저장
            all_labels.append(labels.cpu().numpy())  # 실제 라벨 저장

    # 결과를 numpy 배열로 변환
    all_predictions = np.concatenate(all_predictions, axis=0)
    all_labels = np.concatenate(all_labels, axis=0)

    return all_predictions, all_labels, outputs_list


model_path = f"./checkpoints/best_lstm_model_{seed}_{seq_length}_{rebal_term}.pth"
all_predictions, all_labels, outputs_list = test_lstm(test_dataloader, input_size, hidden_size, num_layers, num_classes, num_stocks, dropout, device, model_path)


In [None]:
model = MultiLSTMModel(input_size, hidden_size, num_layers, num_classes, num_stocks, dropout)
model.to(device)
model_path = f'./checkpoints/best_lstm_model_{seed}_{seq_length}_{rebal_term}.pth'
model.load_state_dict(torch.load(model_path))

def test_lstm(model, test_dataset):
    model.eval()
    test_preds, test_labels = [], []
    results = []
    
    with torch.no_grad():
        for inputs, labels in test_dataset:
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = model(inputs)
            test_preds.append(outputs)
            test_labels.append(labels)
            
            for i in range(outputs.size(0)):
                for j in range(outputs.size(1)):
                    result = torch.argmax(outputs[i, j]).item() == labels[i, j].item()
                    results.append(result)
    

In [101]:
def test_lstm(test_dataloader, input_size, hidden_size, num_layers, num_classes, num_stocks, dropout, device, model_path):
    # 모델 초기화
    

    # 저장된 state_dict 불러오기
    model.load_state_dict(torch.load(model_path))
    
    model.eval()  # 모델을 평가 모드로 설정
    all_predictions = []
    all_labels = []

    outputs_list = []
    
    with torch.no_grad():  # 그래디언트 계산을 비활성화
        for inputs, labels in test_dataloader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs)  # 모델을 통해 추론 실행
            softmax_output = torch.softmax(outputs, dim=1)  # 가장 높은 점수를 가진 클래스를 선택
            outputs_list.append(outputs)
            preds = torch.argmax(softmax_output, dim=1).cpu.numpy()
            
            one_preds = []
            zero_preds = []
            for idx, pred in enumerate(preds):
                if pred == 1:
                    one_preds.append(idxm softmax_output[idx])
                else:
                    zero_preds.append(idx)
            
            

    # 결과를 numpy 배열로 변환
    all_predictions = np.concatenate(all_predictions, axis=0)
    all_labels = np.concatenate(all_labels, axis=0)

    return all_predictions, all_labels, outputs_list


model_path = f"./checkpoints/best_lstm_model_{seed}_{seq_length}_{rebal_term}.pth"
all_predictions, all_labels, outputs_list = test_lstm(test_dataloader, input_size, hidden_size, num_layers, num_classes, num_stocks, dropout, device, model_path)


In [114]:
len(outputs_list[0][0][0]) # 배치개수 * 배치사이즈가 있고, 각 배치사이즈마다 30개의 주식에 대한 각 2가지 경우에 대한 예측이 있다.

2

In [132]:
all_labels[:20]

array([[0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
        0, 0, 0, 1, 0, 0, 0, 1],
       [0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1,
        0, 0, 0, 1, 0, 0, 0, 1],
       [1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1,
        0, 1, 0, 1, 0, 0, 0, 1],
       [1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0,
        0, 0, 0, 0, 1, 0, 0, 1],
       [0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0,
        0, 0, 0, 0, 1, 0, 0, 1],
       [0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 1, 0, 0, 1],
       [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
        0, 0, 0, 0, 1, 0, 0, 1],
       [0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
        0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 1, 1, 1, 0, 0, 0, 0, 0,

In [131]:
all_predictions[:20]

array([[0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 1],
       [0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 1],
       [0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 1],
       [0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 1],
       [0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 1],
       [0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 1],
       [0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 1],
       [0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 1],
       [0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 1],
       [0, 0, 1, 0, 0, 0, 0, 0, 1, 0,