In [1]:
import sys
sys.path.append('../..')

In [2]:
from Data.data75 import getData
from Data.dataByCode import getData as getDataByCode

import pandas as pd
import numpy as np
import os
import FinanceDataReader as fdr

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt

from tqdm import tqdm

In [3]:
path = '../../Data'
list_name = 'stock_list.csv'
start_date = '20140106'
end_date = '20220527'

In [4]:
stock_list = pd.read_csv(os.path.join(path,list_name))
stock_list['종목코드'] = stock_list['종목코드'].apply(lambda x : str(x).zfill(6))
stock_list = stock_list.sort_values(by=['종목코드'])
stock_list

Unnamed: 0,종목명,종목코드,상장시장
109,메리츠화재,000060,KOSPI
126,하이트진로,000080,KOSPI
67,유한양행,000100,KOSPI
69,CJ대한통운,000120,KOSPI
156,두산,000150,KOSPI
...,...,...,...
328,넥스틴,348210,KOSDAQ
31,하이브,352820,KOSPI
199,솔브레인,357780,KOSDAQ
176,티와이홀딩스,363280,KOSPI


In [5]:
sample_name = 'sample_submission.csv'
sample_submission = pd.read_csv(os.path.join(path, sample_name))
answer = sample_submission.copy()
answer

Unnamed: 0,Index,2022-05-23,2022-05-24,2022-05-25,2022-05-26,2022-05-27
0,60,0,0,0,0,0
1,80,0,0,0,0,0
2,100,0,0,0,0,0
3,120,0,0,0,0,0
4,150,0,0,0,0,0
...,...,...,...,...,...,...
365,348210,0,0,0,0,0
366,352820,0,0,0,0,0
367,357780,0,0,0,0,0
368,363280,0,0,0,0,0


In [6]:
common_data = getData(start_date, end_date)
common_data

Unnamed: 0,Date,weekday,weeknum,kospi_Close,kospi_Volume,kospi_Change,kosdaq_Close,kosdaq_Volume,kosdaq_Change,nasdaq_Close,jpy_Close,jpy_Change,acf_Close,acf_Change,btc_Close,btc_Volume,btc_Change
0,2014-01-06,0,02,1953.28,193630000.0,0.0037,500.62,2.887100e+05,0.0026,4113.68,10.2232,0.0156,0.8108,0.0010,3206000.0,21580.0,0.0329
1,2014-01-07,1,02,1959.44,193030000.0,0.0032,504.86,3.324500e+05,0.0085,4153.18,10.2227,-0.0000,0.8115,0.0009,3206000.0,21580.0,0.0329
2,2014-01-08,2,02,1958.96,217070000.0,-0.0002,509.78,3.687500e+05,0.0097,4165.61,10.1636,-0.0058,0.8110,-0.0006,3206000.0,21580.0,0.0329
3,2014-01-09,3,02,1946.11,276400000.0,-0.0066,511.60,3.639600e+05,0.0036,4156.19,10.1402,-0.0023,0.8072,-0.0047,3206000.0,21580.0,0.0329
4,2014-01-10,4,02,1938.54,262990000.0,-0.0039,510.19,3.996100e+05,-0.0028,4174.66,10.1946,0.0054,0.8119,0.0058,3206000.0,21580.0,0.0329
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2185,2022-05-23,0,21,2647.38,644200000.0,0.0031,883.59,1.080000e+09,0.0042,11535.27,9.8349,-0.0125,0.6862,0.0007,37336000.0,1470.0,-0.0406
2186,2022-05-24,1,21,2605.87,851240000.0,-0.0157,865.07,1.100000e+09,-0.0210,11264.45,9.9427,0.0110,0.6820,-0.0061,37739000.0,1450.0,0.0108
2187,2022-05-25,2,21,2617.22,712610000.0,0.0044,872.69,9.893000e+08,0.0088,11434.74,9.9451,0.0002,0.6820,0.0000,37863000.0,1080.0,0.0033
2188,2022-05-26,3,21,2612.45,595850000.0,-0.0018,871.43,9.474200e+08,-0.0014,11740.65,9.9082,-0.0037,0.6809,-0.0016,37595000.0,2170.0,-0.0071


In [7]:
d1 = getDataByCode('000060', start_date, end_date)
d2 = pd.merge(common_data, d1, how='left', left_on='Date', right_index=True)
d2.shape

(2190, 24)

In [8]:
data_dict = dict()
x_public_dict = dict()
x_dict = dict()
y_dict = dict()

for code in tqdm(stock_list['종목코드'].values):
    imported_data = getDataByCode(code, start_date, end_date)
    data = pd.merge(common_data, imported_data, how='left', left_on='Date', right_index=True)

    data = data.drop(columns=['Date', 'weeknum'])
        
    data = data.fillna(method='ffill')
    data = data.fillna(method='bfill')

    size = len(data) // 5 - 2
    col_size = data.shape[1]

    x = np.zeros((size, col_size))
    y = np.zeros((size, 5))

    for i in range(size):
        friday = 4 + 5 * i

        x[i] = data.iloc[friday]

        closes = []

        for j in range(5):
            next = 5 * (i + 1) + j
            closes.append(data.Close.iloc[next])

        friday_close = data.Close.iloc[friday]

        y[i] = (np.array(closes) - friday_close) / friday_close

    x_public = data.iloc[-6].to_numpy().reshape((1, -1))
    
    x_dict[code] = x
    y_dict[code] = y
    x_public_dict[code] = x_public
    data_dict[code] = data

100%|██████████| 370/370 [01:43<00:00,  3.58it/s]


In [9]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        
        self.dropout = nn.Dropout()
        
        self.bn = nn.BatchNorm1d(9)
        
        self.fc1 = nn.Linear(22, 9)  # 5*5 from image dimension
        self.fc2 = nn.Linear(9, 5)
        
        nn.init.xavier_uniform_(self.fc1.weight)
        nn.init.xavier_uniform_(self.fc2.weight)

    def forward(self, x):
        model = nn.Sequential(
            self.fc1,
            self.bn,
            nn.ReLU(),
            self.dropout,
            self.fc2
        )
        
        return model(x)

In [10]:
for i, code in enumerate(tqdm(stock_list['종목코드'].values)):
    model = Net()

    x = x_dict[code]
    y = y_dict[code]
    
    # scaler = StandardScaler()
    # scaler.fit(x)
    # print(x)
    # x = scaler.transform(x)
    # print(x)
    # break
    
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
    
    x_public = x_public_dict[code]
    # x_public = scaler.transform(x_public)
    
    data = data_dict[code]
    
    tensor_x_train = torch.Tensor(x_train) # transform to torch tensor
    tensor_y_train = torch.Tensor(y_train)
    
    tensor_x_test = torch.Tensor(x_test) # transform to torch tensor
    tensor_y_test = torch.Tensor(y_test)

    train_dataset = TensorDataset(tensor_x_train, tensor_y_train) # create your datset
    train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True) # create your dataloader

    criterion = nn.MSELoss()
    optimizer = optim.AdamW(model.parameters(), lr=0.1)
    scheduler = optim.lr_scheduler.LambdaLR(optimizer=optimizer,
                                        lr_lambda=lambda epoch: 0.95 ** epoch,
                                        last_epoch=-1,
                                        verbose=False)
    
    losses = []
    
    for epoch in range(100):
        min_loss = 9999999
        
        model.train()

        for j, batch_data in enumerate(train_dataloader):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = batch_data

            optimizer.zero_grad()   # zero the gradient buffers
            output = model(inputs)
            loss = criterion(output, labels)
            
            loss.backward()
            optimizer.step()    # Does the update
        
        scheduler.step()
        
        model.eval()
        
        with torch.no_grad():
            output = model(tensor_x_test)
            loss = criterion(output, tensor_y_test)
            l = loss.item()
            
            losses.append(l)
            
            if l < min_loss:
                min_loss = l
                torch.save(model.state_dict(), 'model.pt')
    
#     plt.plot(losses[10:])
#     plt.show()
#     print(losses)

#     if i == 10:
#         break
    
    model.eval()
    
    model.load_state_dict(torch.load('model.pt'))

    with torch.no_grad():
        prediction = model(torch.Tensor(x_public))
        prediction = prediction.numpy()
        
        #prediction = np.zeros((1, 5))
        
        friday_close = data.Close.iloc[-6]

        sample_submission.iloc[i,1:] = (friday_close * (1 + prediction))[0]
        answer.iloc[i,1:] = data.Close.iloc[-5:].to_numpy()

100%|██████████| 370/370 [04:34<00:00,  1.35it/s]


In [11]:
answer.iloc[answer['Index'].isin([31390, 36490]), 1:] = [0,0,0,0,0]
answer

Unnamed: 0,Index,2022-05-23,2022-05-24,2022-05-25,2022-05-26,2022-05-27
0,60,39750.0,39700.0,40800.0,39800.0,39800.0
1,80,35350.0,34950.0,35900.0,36150.0,36250.0
2,100,59600.0,59100.0,59700.0,59900.0,59800.0
3,120,123000.0,125500.0,125500.0,127500.0,129000.0
4,150,80400.0,79900.0,80400.0,80100.0,80400.0
...,...,...,...,...,...,...
365,348210,68400.0,66500.0,67000.0,66300.0,67000.0
366,352820,221000.0,215500.0,219500.0,214500.0,218500.0
367,357780,269900.0,260600.0,260600.0,270900.0,271800.0
368,363280,24500.0,24400.0,24550.0,24600.0,24400.0


In [12]:
sample_submission.iloc[sample_submission['Index'].isin([31390, 36490]), 1:] = [0,0,0,0,0]
sample_submission

Unnamed: 0,Index,2022-05-23,2022-05-24,2022-05-25,2022-05-26,2022-05-27
0,60,39051.019531,39143.753906,39207.828125,39281.000000,39273.183594
1,80,35596.164062,35655.742188,35683.968750,35654.519531,35656.613281
2,100,58595.679688,58666.500000,58680.292969,58547.582031,58637.632812
3,120,124493.414062,124496.570312,124666.640625,124481.156250,124612.500000
4,150,79681.914062,79682.273438,79695.398438,79679.984375,79765.679688
...,...,...,...,...,...,...
365,348210,68358.406250,68431.578125,68478.867188,68567.859375,68618.796875
366,352820,220816.484375,221168.062500,220994.593750,221217.671875,221030.531250
367,357780,262762.125000,262844.000000,262722.906250,262892.625000,262863.906250
368,363280,24694.646484,24727.578125,24729.218750,24711.736328,24703.125000


In [13]:
result_arr = sample_submission.iloc[:,1:].to_numpy()
answer_arr = answer.iloc[:,1:].to_numpy()

print(np.mean(np.abs(result_arr - answer_arr)))

1806.2522331608952


In [14]:
sample_submission.to_csv('submission.csv', index=False)