In [60]:
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import torch
from tqdm.auto import tqdm
import random
import os

def reset_seeds(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

DATA_PATH = "/content/drive/MyDrive/코랩파일들/파이널프로젝트/data/"
SEED = 42

device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


'cpu'

In [61]:
import torch.nn as nn

In [152]:
import pandas_datareader.data as web
import datetime as dt
import yfinance as yfin

yfin.pdr_override()
start = dt.datetime(2016,1,1)
end = dt.datetime(2023,3,1)

train = web.get_data_yahoo('^KS200',start = start,end = end)

[*********************100%%**********************]  1 of 1 completed


In [153]:
train

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-01-04,239.300003,239.350006,234.630005,234.630005,234.630005,68200
2016-01-05,233.960007,237.250000,233.960007,236.130005,236.130005,67000
2016-01-06,236.479996,236.479996,233.270004,234.600006,234.600006,89000
2016-01-07,233.229996,234.539993,231.520004,231.929993,231.929993,88000
2016-01-08,230.399994,233.779999,229.910004,233.679993,233.679993,117200
...,...,...,...,...,...,...
2023-02-22,317.940002,318.739990,315.829987,315.940002,315.940002,131300
2023-02-23,317.769989,321.130005,317.089996,319.440002,319.440002,144600
2023-02-24,319.890015,321.269989,316.799988,316.940002,316.940002,120100
2023-02-27,314.290009,314.429993,311.630005,313.940002,313.940002,109300


In [154]:
train[train.columns].min(axis=0)

Open         200.800003
High         206.660004
Low          196.270004
Close        199.279999
Adj Close    199.279999
Volume         0.000000
dtype: float64

In [155]:
train['AVG'] = train['Close'].rolling(window=20).mean()
train

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,AVG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2016-01-04,239.300003,239.350006,234.630005,234.630005,234.630005,68200,
2016-01-05,233.960007,237.250000,233.960007,236.130005,236.130005,67000,
2016-01-06,236.479996,236.479996,233.270004,234.600006,234.600006,89000,
2016-01-07,233.229996,234.539993,231.520004,231.929993,231.929993,88000,
2016-01-08,230.399994,233.779999,229.910004,233.679993,233.679993,117200,
...,...,...,...,...,...,...,...
2023-02-22,317.940002,318.739990,315.829987,315.940002,315.940002,131300,322.272000
2023-02-23,317.769989,321.130005,317.089996,319.440002,319.440002,144600,321.965500
2023-02-24,319.890015,321.269989,316.799988,316.940002,316.940002,120100,321.413499
2023-02-27,314.290009,314.429993,311.630005,313.940002,313.940002,109300,320.976999


# Min_Max Scaling 사용한 이유
- 지수이기에 큰 변동성이 없다 > 이상치가 적다.

In [156]:
mins = train[train.columns].min(axis=0)
sizes = train[train.columns].max(axis=0)- mins

In [157]:
train_ft = (train[train.columns]-mins)/sizes
train_ft

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,AVG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2016-01-04,0.160430,0.134871,0.158199,0.146608,0.146608,0.000379,
2016-01-05,0.138178,0.126207,0.155436,0.152828,0.152828,0.000373,
2016-01-06,0.148679,0.123030,0.152590,0.146483,0.146483,0.000495,
2016-01-07,0.135136,0.115026,0.145373,0.135410,0.135410,0.000490,
2016-01-08,0.123344,0.111890,0.138733,0.142668,0.142668,0.000652,
...,...,...,...,...,...,...,...
2023-02-22,0.488124,0.462414,0.493072,0.483826,0.483826,0.000731,0.450207
2023-02-23,0.487416,0.472275,0.498268,0.498341,0.498341,0.000805,0.448727
2023-02-24,0.496250,0.472852,0.497072,0.487973,0.487973,0.000668,0.446063
2023-02-27,0.472914,0.444632,0.475751,0.475531,0.475531,0.000608,0.443956


In [158]:
def transform_data(data,mins,sizes,seq_len=60,pred_len=1):

    # data = (data-mins)/sizes

    col_len = data.shape[-1]
    window_size = seq_len+pred_len
    data_len = len(data)-window_size+1 # 0부터 시작하니까
    new_arr = data[:data_len]
    # print(new_arr)
    for i in range(1,window_size):

        data = np.delete(data,0,axis=0)
        new_arr = np.concatenate([new_arr,data[:data_len]],axis=1)
    new_arr = new_arr.reshape(data_len,window_size,col_len)

    x_arr = new_arr[:,:seq_len,:-1]
    y_arr = new_arr[:,seq_len:,3]

    return x_arr,y_arr

In [159]:
seq_len=120
pred_len=60
x_arr, y_arr = transform_data(train_ft.values,mins,sizes,seq_len,pred_len)

In [160]:
x_arr.shape # batch,seq,feature

(1573, 120, 6)

In [161]:
x_arr = x_arr[:,:,3:4]
x_arr.shape

(1573, 120, 1)

In [162]:
y_arr.shape

(1573, 60)

In [163]:
class FinanceDataset(torch.utils.data.Dataset):
    def __init__(self,x,y=None):
        self.x = x
        self.y = y

    def __len__(self):
        return self.x.shape[0]

    def __getitem__(self,idx):
        item={}
        item['x'] = torch.Tensor(self.x[idx])
        if self.y is not None:
            item['y'] = torch.Tensor(self.y[idx])

        return item

In [78]:
dt = FinanceDataset(x_arr,y_arr)

In [79]:
dl = torch.utils.data.DataLoader(dt,batch_size=1,shuffle=False)
batch = next(iter(dl))
batch

{'x': tensor([[[0.2496],
          [0.2560],
          [0.2532],
          [0.2418],
          [0.2490],
          [0.2366],
          [0.2345],
          [0.2482],
          [0.2394],
          [0.2280],
          [0.2278],
          [0.2338],
          [0.2099],
          [0.2072],
          [0.2283],
          [0.2359],
          [0.2241],
          [0.2383],
          [0.2432],
          [0.2460],
          [0.2529],
          [0.2430],
          [0.2344],
          [0.2482],
          [0.2491],
          [0.2186],
          [0.2044],
          [0.2190],
          [0.2331],
          [0.2307],
          [0.2442],
          [0.2482],
          [0.2483],
          [0.2471],
          [0.2462],
          [0.2495],
          [0.2503],
          [0.2484],
          [0.2651],
          [0.2709],
          [0.2695],
          [0.2708],
          [0.2644],
          [0.2681],
          [0.2770],
          [0.2781],
          [0.2785],
          [0.2773],
          [0.2800],
          [0.28

In [164]:
class moving_avg(torch.nn.Module):
    def __init__(self, kernel_size, stride):
        super(moving_avg, self).__init__()
        self.kernel_size = kernel_size
        self.avg = torch.nn.AvgPool1d(kernel_size=kernel_size, stride=stride, padding=0)

    def forward(self, x):
        front = x[:, 0:1, :].repeat(1, (self.kernel_size - 1) // 2, 1)
        end = x[:, -1:, :].repeat(1, (self.kernel_size - 1) // 2, 1)
        x = torch.cat([front, x, end], dim=1)
        x = self.avg(x.permute(0, 2, 1))
        x = x.permute(0, 2, 1)
        return x

class series_decomp(torch.nn.Module):
  def __init__(self, kernel_size):
      super(series_decomp, self).__init__()
      self.moving_avg = moving_avg(kernel_size, stride=1)

  def forward(self, x):
      moving_mean = self.moving_avg(x)
      residual = x - moving_mean
      return moving_mean, residual

class DLinear(torch.nn.Module):
  def __init__(self, window_size, forcast_size, kernel_size, individual, feature_size):
      super().__init__()
      self.window_size = window_size
      self.forcast_size = forcast_size
      self.decompsition = series_decomp(kernel_size)
      self.individual = individual
      self.channels = feature_size
      self.fc_layer = torch.nn.Linear(feature_size, 1)
      if self.individual:
          self.Linear_Seasonal = torch.nn.ModuleList()
          self.Linear_Trend = torch.nn.ModuleList()
          for i in range(self.channels):
              self.Linear_Trend.append(torch.nn.Linear(self.window_size, self.forcast_size))
              self.Linear_Trend[i].weight = torch.nn.Parameter((1/self.window_size)*torch.ones([self.forcast_size, self.window_size]))
              self.Linear_Seasonal.append(torch.nn.Linear(self.window_size, self.forcast_size))
              self.Linear_Seasonal[i].weight = torch.nn.Parameter((1/self.window_size)*torch.ones([self.forcast_size, self.window_size]))
      else:
          self.Linear_Trend = torch.nn.Linear(self.window_size, self.forcast_size)
          self.Linear_Trend.weight = torch.nn.Parameter((1/self.window_size)*torch.ones([self.forcast_size, self.window_size]))
          self.Linear_Seasonal = torch.nn.Linear(self.window_size,  self.forcast_size)
          self.Linear_Seasonal.weight = torch.nn.Parameter((1/self.window_size)*torch.ones([self.forcast_size, self.window_size]))

  def forward(self, x):
      trend_init, seasonal_init = self.decompsition(x)
      trend_init, seasonal_init = trend_init.permute(0,2,1), seasonal_init.permute(0,2,1) # batch,feature,seq
      if self.individual:
          trend_output = torch.zeros([trend_init.size(0), trend_init.size(1), self.forcast_size], dtype=trend_init.dtype).to(trend_init.device)
          seasonal_output = torch.zeros([seasonal_init.size(0), seasonal_init.size(1), self.forcast_size], dtype=seasonal_init.dtype).to(seasonal_init.device)
          for idx in range(self.channels):
              trend_output[:, idx, :] = self.Linear_Trend[idx](trend_init[:, idx, :])
              seasonal_output[:, idx, :] = self.Linear_Seasonal[idx](seasonal_init[:, idx, :])
      else:
          trend_output = self.Linear_Trend(trend_init)
          seasonal_output = self.Linear_Seasonal(seasonal_init)
      x = seasonal_output + trend_output

      return x.permute(0,2,1).squeeze()


In [81]:
model = DLinear(seq_len,pred_len,kernel_size=25,individual=False,feature_size=6)

In [82]:
tmp = model(batch['x'].to(device))
tmp

tensor([0.3171, 0.2248, 0.2213, 0.2052, 0.3377, 0.3360, 0.4278, 0.2100, 0.1377,
        0.1315, 0.2524, 0.3444, 0.2161, 0.2786, 0.2797, 0.2714, 0.1636, 0.2559,
        0.2396, 0.2351, 0.1947, 0.4095, 0.3465, 0.2191, 0.2712, 0.2821, 0.2730,
        0.3112, 0.3774, 0.2394, 0.2423, 0.2795, 0.3345, 0.1962, 0.2744, 0.2794,
        0.3088, 0.2141, 0.1462, 0.2455, 0.1871, 0.4133, 0.3808, 0.2766, 0.2199,
        0.2325, 0.2502, 0.1954, 0.2329, 0.3816, 0.2702, 0.2898, 0.1936, 0.1721,
        0.2661, 0.2488, 0.3875, 0.2242, 0.3850, 0.3248],
       grad_fn=<SqueezeBackward0>)

In [165]:
def train_loop(dl,model,loss_fn,optimizer,device):

    epoch_loss = 0
    model.train()

    for batch in dl:
        pred = model(batch['x'].to(device))
        loss = loss_fn(pred,batch['y'].to(device))

        optimizer.zero_grad() # 경사 초기화
        loss.backward()# 역전파 부분
        optimizer.step() # 가중치 업데이트

        epoch_loss += loss.item()

    epoch_loss /= len(dl)

    return epoch_loss

In [166]:
@torch.inference_mode()
def test_loop(dl,model,loss_fn,device):

    epoch_loss = 0
    model.eval()
    pred_list = []
    for batch in dl:

        pred = model(batch['x'].to(device))

        if batch.get('y') is not None:
            loss = loss_fn(pred,batch['y'].to(device))
            epoch_loss += loss.item()
        pred = pred.to('cpu').numpy()
        pred_list.append(pred)

    pred = np.concatenate(pred_list) # 배치단위 예측값들 결합.
    epoch_loss /= len(dl)
    return epoch_loss,pred

# RMSE
- 오류가 커짐에 따라서 큰 패널티를 주고싶었다.
- 직관적이다.

In [167]:
class RMSELoss(torch.nn.Module):
    def __init__(self):
        super(RMSELoss,self).__init__()

    def forward(self,x,y):
        criterion = torch.nn.MSELoss()
        eps = 1e-6
        loss = torch.sqrt(criterion(x, y) + eps)
        return loss

In [168]:
n_splits = 5
window_size = seq_len
forcast_size = pred_len
kernel_size=25
batch_size = 32
individual = False
feature_size = len(train.columns)
epochs = 500
loss_fn = RMSELoss()
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
cv = KFold(n_splits = n_splits,shuffle=True, random_state = SEED)

In [169]:
is_holdout = True
reset_seeds(SEED)
best_score_list = []
tmp = pd.DataFrame()
for i, (tri,vai) in enumerate(cv.split(x_arr)):
    model = DLinear(window_size,forcast_size,kernel_size,individual,feature_size).to(device)
    optimizer = torch.optim.Adam(model.parameters())

    # 학습용
    train_dt = FinanceDataset(x_arr[tri],y_arr[tri])
    train_dl = torch.utils.data.DataLoader(train_dt,batch_size=batch_size,shuffle=True)

    # 검증용
    valid_dt = FinanceDataset(x_arr[vai],y_arr[vai])
    valid_dl = torch.utils.data.DataLoader(valid_dt,batch_size=batch_size,shuffle=False)

    best_score = np.inf # 낮을수록 좋은 것이니까
    patience= 0
    t_lst = []
    v_lst = []
    for epoch in tqdm(range(epochs)):
        train_loss = train_loop(train_dl,model,loss_fn,optimizer,device)
        valid_loss,pred = test_loop(valid_dl,model,loss_fn,device)
        t_lst.append(train_loss)
        v_lst.append(valid_loss)

        pred = pred*sizes[-1]+mins[-1]# 원상복귀를 해줘야함. 3번째(종가만 가져오면 됨.)
        y_true = y_arr[vai]*sizes[-1]+mins[-1]
        score = mean_squared_error(y_true,pred,squared=False)
        patience+=1
        if best_score > score :

            patience = 0
            print(score)
            best_score = score
            torch.save(model.state_dict(),f'{DATA_PATH}model7d_{i}.pth')

        if patience ==30 :
            break
    print(f'Fold[{i}] BEST RMSE : {best_score}')
    best_score_list.append(best_score)

    if is_holdout :
        break
tmp['t_loss'] = t_lst
tmp['v_loss'] = v_lst

# y_arr*sizes[3]+mins[3]

  0%|          | 0/500 [00:00<?, ?it/s]

23.807332070188547
19.024785684695225
17.81494232210304
17.75770574820956
17.214318877568978
16.45377552339103
16.317976866936654
16.13358055022028
15.900968105996412
15.798800171247915
15.667186398079043
Fold[0] BEST RMSE : 15.667186398079043


In [170]:
import plotly.express as px
px.line(tmp,y = ['t_loss','v_loss'])

In [172]:
start = dt.datetime(2023,1,1)
pred_data = web.get_data_yahoo('^KS200',start = start)

[*********************100%%**********************]  1 of 1 completed


In [173]:
pred_data['AVG'] = pred_data['Close'].rolling(20).mean()
pred_data

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,AVG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2023-01-02,292.899994,294.019989,289.190002,289.790009,289.790009,92500,
2023-01-03,290.640015,290.950012,284.320007,289.579987,289.579987,106600,
2023-01-04,288.339996,296.700012,287.940002,295.980011,295.980011,109900,
2023-01-05,297.649994,299.690002,296.010010,297.869995,297.869995,137300,
2023-01-06,296.690002,303.100006,296.399994,301.529999,301.529999,121000,
...,...,...,...,...,...,...,...
2023-09-07,336.929993,338.019989,335.079987,336.739990,336.739990,88400,334.381001
2023-09-08,336.290009,336.660004,333.920013,336.399994,336.399994,79800,334.178001
2023-09-11,336.519989,337.619995,335.329987,337.579987,337.579987,112600,334.108000
2023-09-12,338.510010,338.809998,335.279999,335.540009,335.540009,94700,334.068501


In [174]:
pred_data = (pred_data-mins)/sizes

In [175]:
tgt_data = pred_data['Close'].iloc[-120:].to_numpy().reshape(1,-1,1)
tgt_data

array([[[0.48303751],
        [0.47702385],
        [0.47101033],
        [0.48556731],
        [0.48826316],
        [0.49257627],
        [0.50908262],
        [0.50223958],
        [0.50792138],
        [0.51542801],
        [0.49162247],
        [0.51459849],
        [0.52712346],
        [0.54574485],
        [0.55080458],
        [0.55495189],
        [0.55739879],
        [0.55839423],
        [0.55665231],
        [0.55777203],
        [0.55316858],
        [0.5460352 ],
        [0.53674526],
        [0.51924359],
        [0.51712842],
        [0.52177339],
        [0.52745519],
        [0.53836264],
        [0.52770402],
        [0.52625256],
        [0.53757464],
        [0.5347545 ],
        [0.52571339],
        [0.52256139],
        [0.51563532],
        [0.51916056],
        [0.52268581],
        [0.52807726],
        [0.54147313],
        [0.55967989],
        [0.56814032],
        [0.57154114],
        [0.56905273],
        [0.56432486],
        [0.57162404],
        [0

In [176]:
model = DLinear(window_size,forcast_size,kernel_size,individual,feature_size).to(device)
state_dict = torch.load(f'{DATA_PATH}model7d_{i}.pth')
model.load_state_dict(state_dict)

<All keys matched successfully>

In [177]:
pred = model(torch.Tensor(tgt_data))

In [178]:
pred = pred*sizes[-1]+mins[-1]

In [179]:
pred = pred.detach().numpy()

In [180]:
dates=pd.date_range(start=dt.datetime.today().date()+dt.timedelta(1), end=dt.datetime.today().date()+dt.timedelta(90), freq='D')

In [181]:
dates[dates.weekday<5][:60]

DatetimeIndex(['2023-09-14', '2023-09-15', '2023-09-18', '2023-09-19',
               '2023-09-20', '2023-09-21', '2023-09-22', '2023-09-25',
               '2023-09-26', '2023-09-27', '2023-09-28', '2023-09-29',
               '2023-10-02', '2023-10-03', '2023-10-04', '2023-10-05',
               '2023-10-06', '2023-10-09', '2023-10-10', '2023-10-11',
               '2023-10-12', '2023-10-13', '2023-10-16', '2023-10-17',
               '2023-10-18', '2023-10-19', '2023-10-20', '2023-10-23',
               '2023-10-24', '2023-10-25', '2023-10-26', '2023-10-27',
               '2023-10-30', '2023-10-31', '2023-11-01', '2023-11-02',
               '2023-11-03', '2023-11-06', '2023-11-07', '2023-11-08',
               '2023-11-09', '2023-11-10', '2023-11-13', '2023-11-14',
               '2023-11-15', '2023-11-16', '2023-11-17', '2023-11-20',
               '2023-11-21', '2023-11-22', '2023-11-23', '2023-11-24',
               '2023-11-27', '2023-11-28', '2023-11-29', '2023-11-30',
      

In [182]:
tmp2 = pd.DataFrame()

In [183]:
tmp2['date'] = dates[dates.weekday<5][:60]
tmp2['pred'] = pred

In [184]:
px.line(tmp2,x='date',y='pred')