In [None]:
# Mount Google Drive at /content/drive; DATA_PATH below points inside this mount.
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import torch
from tqdm.auto import tqdm
import random
import os

def reset_seeds(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and every CUDA device), then puts cuDNN into deterministic mode
    with auto-tuning disabled.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Exported for completeness; the value is read by Python at interpreter
    # start-up, so it matters for processes launched after this call.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all CUDA devices, not only the current one (no-op without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode may pick a different convolution algorithm per run.
    torch.backends.cudnn.benchmark = False

# Global settings: random seed and the Drive folder where checkpoints are written.
SEED = 42
DATA_PATH = "/content/drive/MyDrive/코랩파일들/파이널프로젝트/data/"

# Use the GPU when the runtime has one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

Mounted at /content/drive


'cpu'

In [None]:
import torch.nn as nn

In [None]:
# `datetime` is imported without the `dt` alias on purpose: a later cell binds
# the name `dt` to a dataset object, which would otherwise shadow the module.
import datetime

import pandas_datareader.data as web
import yfinance as yfin

# Let pandas_datareader's Yahoo reader be served by yfinance.
yfin.pdr_override()
start = datetime.datetime(2016, 7, 8)
end = datetime.datetime(2020, 5, 11)

# Daily quotes for the '^KS11' ticker between `start` and `end`.
train = web.get_data_yahoo('^KS11', start=start, end=end)

[*********************100%%**********************]  1 of 1 completed


In [None]:
# Column-wise minimum of the raw price table.
train.min(axis=0)

Open           1474.449951
High           1516.750000
Low            1439.430054
Close          1457.640015
Adj Close      1457.640015
Volume       184200.000000
dtype: float64

# Min_Max Scaling 사용한 이유
- 지수이기에 큰 변동성이 없다 > 이상치가 적다.

In [None]:
# Per-column offset (minimum) and range (maximum - minimum) for min-max scaling.
mins = train.min(axis=0)
sizes = train.max(axis=0) - mins

In [None]:
# Min-max scale every column: subtract the column minimum, divide by the column range.
train_ft = train.sub(mins).div(sizes)
train_ft

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-07-08,0.446100,0.424634,0.450719,0.443172,0.443172,0.095722
2016-07-11,0.452445,0.434906,0.468923,0.465477,0.465477,0.085333
2016-07-12,0.462992,0.441400,0.476448,0.467836,0.467836,0.138056
2016-07-13,0.477795,0.455679,0.484627,0.480391,0.480391,0.096333
2016-07-14,0.474632,0.452075,0.486456,0.483214,0.483214,0.086667
...,...,...,...,...,...,...
2020-04-29,0.414388,0.404237,0.431035,0.429547,0.429547,1.000000
2020-05-04,0.387084,0.368606,0.396178,0.383789,0.383789,0.534833
2020-05-06,0.398760,0.377869,0.418092,0.413064,0.413064,0.498889
2020-05-07,0.401045,0.386802,0.417326,0.412932,0.412932,0.328611


In [None]:
def transform_data(data,mins,sizes,seq_len=60,pred_len=1,target_col=3):
    """Cut a 2-D series into overlapping (input, target) windows.

    Window ``i`` covers rows ``i .. i + seq_len + pred_len - 1`` of ``data``.
    Its first ``seq_len`` rows (all columns) form the input and the
    ``target_col`` column of its last ``pred_len`` rows forms the target.

    Args:
        data: Array-like of shape ``(n_rows, n_columns)``.
        mins: Unused; kept so existing positional calls keep working.
        sizes: Unused; kept so existing positional calls keep working.
        seq_len: Number of rows in each input window.
        pred_len: Number of rows in each target window.
        target_col: Column index used for the targets (default 3).

    Returns:
        ``(x_arr, y_arr)`` with shapes ``(n_windows, seq_len, n_columns)`` and
        ``(n_windows, pred_len)``, where
        ``n_windows = n_rows - (seq_len + pred_len) + 1``. Both are empty when
        ``n_windows`` is 0.

    Raises:
        ValueError: If ``data`` is not 2-D, or has fewer than
            ``seq_len + pred_len - 1`` rows.
    """
    data = np.asarray(data)
    if data.ndim != 2:
        raise ValueError(f"data must be 2-D (rows, columns), got shape {data.shape}")

    window_size = seq_len + pred_len
    n_windows = len(data) - window_size + 1
    if n_windows < 0:
        raise ValueError(
            f"data has {len(data)} rows, too few for "
            f"seq_len + pred_len = {window_size}"
        )

    # Row index of every element of every window: start row + offset within the window.
    starts = np.arange(n_windows)[:, None]
    offsets = np.arange(window_size)[None, :]
    windows = data[starts + offsets]  # (n_windows, window_size, n_columns)

    x_arr = windows[:, :seq_len]
    y_arr = windows[:, seq_len:, target_col]

    return x_arr,y_arr

In [None]:
# Window lengths: 63 input steps, 7 steps to predict.
seq_len, pred_len = 63, 7
x_arr, y_arr = transform_data(train_ft.to_numpy(), mins, sizes, seq_len=seq_len, pred_len=pred_len)

In [None]:
x_arr.shape # (number of windows, seq_len, number of feature columns)

(868, 63, 6)

In [None]:
# Keep only column 3 (Close) as model input while preserving the feature axis.
# The guard makes this cell safe to re-run: slicing the already reduced array
# with 3:4 again would leave zero features.
CLOSE_COL = 3
if x_arr.shape[-1] > 1:
    x_arr = x_arr[:, :, CLOSE_COL:CLOSE_COL + 1]
x_arr.shape

(868, 63, 1)

In [None]:
# Targets: one row of pred_len values per window.
y_arr.shape

(868, 7)

In [None]:
class FinanceDataset(torch.utils.data.Dataset):
    """Dataset serving input windows and, when provided, their targets.

    Every item is a dict holding ``'x'`` and, if targets were passed to the
    constructor, ``'y'``. Both values are built with ``torch.Tensor``.
    """

    def __init__(self, x, y=None):
        self.x, self.y = x, y

    def __len__(self):
        # Number of samples is the length of the first axis of the inputs.
        return self.x.shape[0]

    def __getitem__(self, idx):
        sample = {'x': torch.Tensor(self.x[idx])}
        if self.y is None:
            return sample
        sample['y'] = torch.Tensor(self.y[idx])
        return sample

In [None]:
# Wrap all windows and their targets in a Dataset so a sample batch can be inspected.
dt = FinanceDataset(x_arr,y_arr)

In [None]:
# Batch size 1 without shuffling, so `batch` holds the first window of the series.
dl = torch.utils.data.DataLoader(dt,batch_size=1,shuffle=False)
batch = next(iter(dl))
batch

{'x': tensor([[[1963.1000],
          [1988.5400],
          [1991.2300],
          [2005.5500],
          [2008.7700],
          [2017.2600],
          [2021.1100],
          [2016.8900],
          [2015.4600],
          [2012.2200],
          [2010.3400],
          [2012.3199],
          [2027.3400],
          [2025.0500],
          [2021.1000],
          [2016.1899],
          [2029.6100],
          [2019.0300],
          [1994.7900],
          [2000.0300],
          [2017.9399],
          [2031.1200],
          [2043.7800],
          [2044.6400],
          [2048.8000],
          [2050.4700],
          [2047.7600],
          [2043.7500],
          [2055.4700],
          [2056.2400],
          [2042.1600],
          [2049.9299],
          [2043.7600],
          [2042.9200],
          [2037.5000],
          [2032.3500],
          [2039.7400],
          [2034.6500],
          [2032.7200],
          [2038.3101],
          [2060.0801],
          [2066.5300],
          [2061.8799],
      

In [None]:
class moving_avg(torch.nn.Module):
    """Moving average along the time axis with edge-replication padding.

    The first and last time steps are repeated so that, with ``stride=1``,
    the output has the same length as the input for any ``kernel_size``.

    Args:
        kernel_size: Number of time steps averaged per output value.
        stride: Stride of the underlying ``AvgPool1d``.
    """

    def __init__(self, kernel_size, stride):
        super(moving_avg, self).__init__()
        self.kernel_size = kernel_size
        self.avg = torch.nn.AvgPool1d(kernel_size=kernel_size, stride=stride, padding=0)

    def forward(self, x):
        """Smooth ``x`` of shape ``(batch, seq_len, channels)`` along ``seq_len``."""
        # Total padding must be kernel_size - 1 to keep the length. For odd
        # kernels both sides get (kernel_size - 1) // 2; for even kernels the
        # extra step goes to the end, since the window cannot be centred.
        front_pad = (self.kernel_size - 1) // 2
        end_pad = self.kernel_size // 2
        front = x[:, 0:1, :].repeat(1, front_pad, 1)
        end = x[:, -1:, :].repeat(1, end_pad, 1)
        padded = torch.cat([front, x, end], dim=1)
        # AvgPool1d pools over the last axis: move time there, then back.
        smoothed = self.avg(padded.permute(0, 2, 1))
        return smoothed.permute(0, 2, 1)

class series_decomp(torch.nn.Module):
    """Split a series into its moving average and what remains after removing it."""

    def __init__(self, kernel_size):
        super().__init__()
        # Stride 1: one averaged value per time step.
        self.moving_avg = moving_avg(kernel_size, stride=1)

    def forward(self, x):
        """Return ``(moving_mean, residual)`` where ``residual = x - moving_mean``."""
        trend = self.moving_avg(x)
        return trend, x - trend

class DLinear(torch.nn.Module):
    """DLinear forecaster: series decomposition plus two linear maps over time.

    The input is split by ``series_decomp`` into a moving-average (trend) part
    and a residual (seasonal) part. Each part is projected from
    ``window_size`` to ``forcast_size`` time steps by a linear layer and the
    two projections are added.

    Args:
        window_size: Number of input time steps.
        forcast_size: Number of time steps to predict.
        kernel_size: Moving-average window handed to ``series_decomp``.
        individual: If true, each channel gets its own pair of linear layers;
            otherwise one pair is shared by all channels.
        feature_size: Number of input channels. Used to build the per-channel
            layers when ``individual`` is true, and to size ``fc_layer``.
    """

    def __init__(self, window_size, forcast_size, kernel_size, individual, feature_size):
        super().__init__()
        self.window_size = window_size
        self.forcast_size = forcast_size
        self.decompsition = series_decomp(kernel_size)
        self.individual = individual
        self.channels = feature_size
        # Not used by forward(); kept so the parameter set (and therefore the
        # state_dict keys of saved checkpoints) stays the same.
        self.fc_layer = torch.nn.Linear(feature_size, 1)
        if self.individual:
            self.Linear_Seasonal = torch.nn.ModuleList()
            self.Linear_Trend = torch.nn.ModuleList()
            for _ in range(self.channels):
                self.Linear_Trend.append(self._make_projection())
                self.Linear_Seasonal.append(self._make_projection())
        else:
            self.Linear_Trend = self._make_projection()
            self.Linear_Seasonal = self._make_projection()

    def _make_projection(self):
        """Build a ``window_size -> forcast_size`` layer whose weights start as a uniform average."""
        layer = torch.nn.Linear(self.window_size, self.forcast_size)
        layer.weight = torch.nn.Parameter(
            (1 / self.window_size) * torch.ones([self.forcast_size, self.window_size])
        )
        return layer

    def forward(self, x):
        """Forecast ``forcast_size`` steps from ``x`` of shape ``(batch, window_size, channels)``.

        Returns:
            Tensor of shape ``(batch, forcast_size)`` when there is a single
            channel, otherwise ``(batch, forcast_size, channels)``.

        Raises:
            ValueError: In individual mode, if the number of input channels
                differs from ``feature_size``.
        """
        trend_init, seasonal_init = self.decompsition(x)
        # (batch, seq, channel) -> (batch, channel, seq): the linear layers act on the time axis.
        trend_init = trend_init.permute(0, 2, 1)
        seasonal_init = seasonal_init.permute(0, 2, 1)
        if self.individual:
            if trend_init.size(1) != self.channels:
                raise ValueError(
                    f"expected {self.channels} input channels, got {trend_init.size(1)}"
                )
            out_shape = [trend_init.size(0), trend_init.size(1), self.forcast_size]
            trend_output = torch.zeros(out_shape, dtype=trend_init.dtype, device=trend_init.device)
            seasonal_output = torch.zeros(out_shape, dtype=seasonal_init.dtype, device=seasonal_init.device)
            for idx in range(self.channels):
                trend_output[:, idx, :] = self.Linear_Trend[idx](trend_init[:, idx, :])
                seasonal_output[:, idx, :] = self.Linear_Seasonal[idx](seasonal_init[:, idx, :])
        else:
            trend_output = self.Linear_Trend(trend_init)
            seasonal_output = self.Linear_Seasonal(seasonal_init)
        out = (seasonal_output + trend_output).permute(0, 2, 1)  # (batch, forecast, channel)

        # Drop only a singleton channel axis. A bare squeeze() would also drop
        # the batch axis for a batch of one and the forecast axis when
        # forcast_size == 1, so predictions would no longer line up with targets.
        return out.squeeze(-1)


In [None]:
# Throw-away model used to try the forward pass on the sample batch.
# NOTE(review): feature_size=6 does not match the single-feature x_arr built above. With
# individual=False it only sizes fc_layer, but individual=True would loop over 6 channels - TODO confirm.
model = DLinear(seq_len,pred_len,kernel_size=25,individual=False,feature_size=6)

In [None]:
# Move the model to the same device as the input; the model was created on
# the CPU, so on a GPU runtime the forward pass would otherwise fail.
model = model.to(device)
tmp = model(batch['x'].to(device))
tmp

tensor([2033.0326, 2033.0782, 2032.9641, 2033.1910, 2033.0518, 2033.0779,
        2033.0900], grad_fn=<SqueezeBackward0>)

In [None]:
def train_loop(dl,model,loss_fn,optimizer,device):
    """Run one optimisation pass over ``dl`` and return the mean batch loss.

    Args:
        dl: Sized iterable of batches; each batch is a dict with ``'x'`` and ``'y'`` tensors.
        model: Module being trained; switched to training mode.
        loss_fn: Callable ``loss_fn(pred, target)`` returning a scalar tensor.
        optimizer: Optimizer holding the parameters of ``model``.
        device: Device the batch tensors are moved to.

    Returns:
        Sum of the per-batch losses divided by ``len(dl)``.
    """
    model.train()
    batch_losses = []

    for batch in dl:
        inputs = batch['x'].to(device)
        targets = batch['y'].to(device)
        loss = loss_fn(model(inputs), targets)

        optimizer.zero_grad()  # clear gradients left from the previous step
        loss.backward()        # back-propagate
        optimizer.step()       # update the weights

        batch_losses.append(loss.item())

    return sum(batch_losses) / len(dl)

In [None]:
@torch.inference_mode()
def test_loop(dl,model,loss_fn,device):

    epoch_loss = 0
    model.eval()
    pred_list = []
    for batch in dl:

        pred = model(batch['x'].to(device))

        if batch.get('y') is not None:
            loss = loss_fn(pred,batch['y'].to(device))
            epoch_loss += loss.item()
        pred = pred.to('cpu').numpy()
        pred_list.append(pred)

    pred = np.concatenate(pred_list) # 배치단위 예측값들 결합.
    epoch_loss /= len(dl)
    return epoch_loss,pred

# RMSE
- 오류가 커짐에 따라서 큰 패널티를 주고싶었다.
- 직관적이다.

In [None]:
class RMSELoss(torch.nn.Module):
    """Root-mean-square error: ``sqrt(MSE(x, y) + 1e-6)``."""

    def __init__(self):
        super().__init__()

    def forward(self,x,y):
        mse = torch.nn.functional.mse_loss(x, y)
        # The small constant keeps the square root away from exactly zero,
        # where its gradient is unbounded.
        return torch.sqrt(mse + 1e-6)

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

# Training schedule.
n_splits = 5
epochs = 1000
batch_size = 32

# Model hyper-parameters (window lengths reuse the values used to build the windows).
window_size, forcast_size = seq_len, pred_len
kernel_size = 25
individual = False
# NOTE(review): this counts all raw columns, while x_arr was reduced to a single
# feature; harmless with individual=False, but confirm before enabling individual=True.
feature_size = len(train.columns)

loss_fn = RMSELoss()

# NOTE(review): shuffle=True assigns windows to folds at random, and consecutive
# windows overlap in time, so validation windows share time steps with training
# windows - TODO confirm this is intended.
cv = KFold(n_splits=n_splits, shuffle=True, random_state=SEED)

In [None]:
is_holdout = True  # when True, only the first fold is trained (simple hold-out split)
reset_seeds(SEED)
best_score_list = []

CLOSE_COL = 3       # position of the Close column in mins / sizes
MAX_PATIENCE = 100  # stop after this many epochs without improvement

# Scaling statistics of the Close column, used to map values back to prices.
# Explicit positional access: plain integer keys on a label-indexed Series are deprecated.
close_size = sizes.iloc[CLOSE_COL]
close_min = mins.iloc[CLOSE_COL]

for i, (tri, vai) in enumerate(cv.split(x_arr)):
    model = DLinear(window_size, forcast_size, kernel_size, individual, feature_size).to(device)
    optimizer = torch.optim.Adam(model.parameters())

    # Training split
    train_dt = FinanceDataset(x_arr[tri], y_arr[tri])
    train_dl = torch.utils.data.DataLoader(train_dt, batch_size=batch_size, shuffle=True)

    # Validation split
    valid_dt = FinanceDataset(x_arr[vai], y_arr[vai])
    valid_dl = torch.utils.data.DataLoader(valid_dt, batch_size=batch_size, shuffle=False)

    # Validation targets in price units; constant for the whole fold.
    y_true = y_arr[vai] * close_size + close_min

    best_score = np.inf  # lower is better
    patience = 0
    for epoch in tqdm(range(epochs)):
        train_loss = train_loop(train_dl, model, loss_fn, optimizer, device)
        valid_loss, pred = test_loop(valid_dl, model, loss_fn, device)

        # Undo the min-max scaling so the score is in price units.
        pred = pred * close_size + close_min
        # RMSE per forecast step, averaged over steps. Same value as the former
        # mean_squared_error(..., squared=False), without the deprecated argument.
        score = np.sqrt(mean_squared_error(y_true, pred, multioutput='raw_values')).mean()
        patience += 1

        if best_score > score:
            patience = 0
            print(score)
            best_score = score
            torch.save(model.state_dict(), f'{DATA_PATH}model7d_{i}.pth')

        if patience == MAX_PATIENCE:
            break
    print(f'Fold[{i}] BEST RMSE : {best_score}')
    best_score_list.append(best_score)

    if is_holdout:
        break

  0%|          | 0/1000 [00:00<?, ?it/s]

96.2991652080909
83.98722016082809
78.2525395656518
72.58738976881286
70.36450555402065
62.78122025976854
59.705881759799404
59.092637520254925
58.086745956509176
51.85523850277338
47.46606590344339
47.42304281490708
45.65677377304651
45.369056453712254
45.144473699499926
43.97861674015532
42.8806150603737
42.6535311307232
42.01197390156161
42.006305113687006
42.00499460972544
41.8630766928273
Fold[0] BEST RMSE : 41.8630766928273


In [None]:
# Shape of the smoke-test prediction `tmp` computed earlier.
tmp.shape

torch.Size([7])

In [None]:
# Targets of the sample batch, for comparison with the prediction.
batch['y']

tensor([[2033.7300, 2015.4399, 2022.6600, 2027.6100, 2040.4301, 2040.9399,
         2040.6000]])

In [None]:
# Exploration: step through the DLinear building blocks by hand, starting with the moving average.
mov_avg = moving_avg(25,stride=1)

In [None]:
# NOTE(review): `X` is not defined in any cell visible here; the recorded output suggests a
# (32, 3, 5) tensor - TODO define it explicitly so the cell runs on a fresh kernel.
mov_avg(X).shape

torch.Size([32, 3, 5])

In [None]:
# Manual decomposition: the moving average is the trend, the remainder the seasonal part.
# NOTE(review): relies on `X`, which is not defined in any cell visible here - TODO confirm.
mov_avg = moving_avg(25,stride=1)
moving_mean = mov_avg(X)
print(X.size(), moving_mean.size())
res = X - moving_mean
print(res.size(), moving_mean.size())

torch.Size([32, 3, 5]) torch.Size([32, 3, 5])
torch.Size([32, 3, 5]) torch.Size([32, 3, 5])


In [None]:
# Name the two components the way DLinear.forward does.
seasonal_init = res
trend_init = moving_mean

In [None]:
# Swap the last two axes so the linear layers act on the time axis.
# Not idempotent: running this cell twice swaps the axes back.
seasonal_init, trend_init = seasonal_init.permute(0,2,1), trend_init.permute(0,2,1)

In [None]:
# Shape after the permute.
seasonal_init.shape

torch.Size([32, 5, 3])

In [None]:
# NOTE(review): `in_seq_len` is not defined in any cell visible here, and the recorded outputs
# imply pred_len was 2 when this ran, not the 7 set earlier - TODO confirm.
season_layer = nn.Linear(in_seq_len, pred_len)

In [None]:
# Project each component along its last axis from in_seq_len to pred_len values.
# NOTE(review): `in_seq_len` is not defined in any cell visible here - TODO confirm.
trend_layer = nn.Linear(in_seq_len, pred_len)
seasonal_output = season_layer(seasonal_init)
trend_output =  trend_layer(trend_init)

In [None]:
# Shape of the projected seasonal component.
seasonal_output.shape

torch.Size([32, 5, 2])

In [None]:
# Shape of the projected trend component.
trend_output.shape

torch.Size([32, 5, 2])

In [None]:
# Sizes of both projections and of their sum.
print(seasonal_output.size(),  trend_output.size(), (seasonal_output+trend_output).size())

torch.Size([32, 5, 2]) torch.Size([32, 5, 2]) torch.Size([32, 5, 2])


In [None]:
# Swap the last two axes of the summed output, as DLinear.forward does before returning.
(seasonal_output+trend_output).permute(0,2,1).size()

torch.Size([32, 2, 5])