In [109]:
import sys

# Make the repository root (two levels up from this notebook) importable so
# that the project-local `Data` package can be found. The membership check
# keeps the cell idempotent: re-running it does not pile up duplicate entries.
if '../..' not in sys.path:
    sys.path.append('../..')

In [110]:
from Data.data import getData

import pandas as pd
import numpy as np
import os
import FinanceDataReader as fdr

from sklearn.linear_model import RidgeCV, MultiTaskLassoCV, MultiTaskElasticNetCV
from tqdm import tqdm

In [111]:
# Directory holding the CSV inputs, and the file listing the stocks to model.
path, list_name = '../../Data', 'stock_list.csv'

# Date window (YYYYMMDD strings) handed to getData() and fdr.DataReader().
end_date = '20220527'
start_date = '20200106'

In [112]:
# Load the stock universe, left-pad every code to six characters with zeros
# (str() + zfill), and order the rows by that padded code.
stock_list = (
    pd.read_csv(os.path.join(path, list_name))
    .assign(**{'종목코드': lambda frame: frame['종목코드'].map(lambda raw: str(raw).zfill(6))})
    .sort_values(by=['종목코드'])
)
stock_list

Unnamed: 0,종목명,종목코드,상장시장
109,메리츠화재,000060,KOSPI
126,하이트진로,000080,KOSPI
67,유한양행,000100,KOSPI
69,CJ대한통운,000120,KOSPI
156,두산,000150,KOSPI
...,...,...,...
328,넥스틴,348210,KOSDAQ
31,하이브,352820,KOSPI
199,솔브레인,357780,KOSDAQ
176,티와이홀딩스,363280,KOSPI


In [126]:
# Read the submission template. `answer` starts as an independent deep copy
# of it, so the two frames can be filled separately.
sample_name = 'sample_submission.csv'
sample_submission = pd.read_csv(
    os.path.join(path, sample_name)
)
answer = sample_submission.copy(deep=True)
answer

Unnamed: 0,Index,2022-05-23,2022-05-24,2022-05-25,2022-05-26,2022-05-27
0,60,0,0,0,0,0
1,80,0,0,0,0,0
2,100,0,0,0,0,0
3,120,0,0,0,0,0
4,150,0,0,0,0,0
...,...,...,...,...,...,...
365,348210,0,0,0,0,0
366,352820,0,0,0,0,0
367,357780,0,0,0,0,0
368,363280,0,0,0,0,0


In [114]:
# Fetch the feature table for the configured date window via the project's
# getData helper; it is later left-joined onto every stock's closing prices.
common_data = getData(start_date, end_date)
common_data

Unnamed: 0,Date,weekday,weeknum,kospi_Close,kospi_Volume,kospi_Change,kosdaq_Close,kosdaq_Volume,kosdaq_Change,nasdaq_Close,...,jpy_Change,acf_Close,acf_Change,ugb_Close,vix_Close,btc_Close,btc_Volume,btc_Change,kgb_Close,kgb_Change
0,2020-01-06,0,02,2155.07,592670000.0,-0.0098,655.31,8.842800e+08,-0.0218,9071.465,...,-0.0009,0.6716,-0.0055,1.81,13.85,8855000,1260.000000,0.0467,1.537000,-0.0147
1,2020-01-07,1,02,2175.54,568240000.0,0.0095,663.44,9.703100e+08,0.0124,9068.582,...,-0.0001,0.6663,-0.0079,1.83,13.79,9391000,2850.000000,0.0605,1.616000,0.0514
2,2020-01-08,2,02,2151.31,913830000.0,-0.0111,640.94,1.380000e+09,-0.0339,9129.242,...,-0.0109,0.6683,0.0030,1.87,13.45,9096000,4040.000000,-0.0314,1.628000,0.0074
3,2020-01-09,3,02,2186.45,592600000.0,0.0163,666.09,9.806200e+08,0.0392,9203.426,...,-0.0065,0.6674,-0.0013,1.85,12.54,8946000,1180.000000,-0.0165,1.688000,0.0369
4,2020-01-10,4,02,2206.39,594540000.0,0.0091,673.03,8.373900e+08,0.0104,9178.859,...,-0.0003,0.6710,0.0054,1.83,12.56,9175427,1027.142857,0.0100,1.708571,0.0010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
620,2022-05-23,0,21,2647.38,644200000.0,0.0031,883.59,1.080000e+09,0.0042,11535.270,...,-0.0125,0.6862,0.0007,2.86,28.48,37336000,1470.000000,-0.0406,3.257000,0.0052
621,2022-05-24,1,21,2605.87,851240000.0,-0.0157,865.07,1.100000e+09,-0.0210,11264.450,...,0.0110,0.6820,-0.0061,2.76,29.45,37739000,1450.000000,0.0108,3.216000,-0.0126
622,2022-05-25,2,21,2617.22,712610000.0,0.0044,872.69,9.893000e+08,0.0088,11434.740,...,0.0002,0.6820,0.0000,2.75,28.37,37863000,1080.000000,0.0033,3.173000,-0.0134
623,2022-05-26,3,21,2612.45,595850000.0,-0.0018,871.43,9.474200e+08,-0.0014,11740.650,...,-0.0037,0.6809,-0.0016,2.75,27.50,37595000,2170.000000,-0.0071,3.239000,0.0208


In [115]:
# Number of rows that make up one week in the merged frame. The positional
# indexing below treats row 4, 9, 14, ... as Fridays.
# NOTE(review): this assumes `common_data` has exactly five rows per week and
# starts on a Monday -- confirm against getData().
WEEK_LEN = 5


def build_weekly_samples(data, week_len=WEEK_LEN):
    """Turn a daily feature frame into weekly (features, returns) samples.

    Parameters
    ----------
    data : pandas.DataFrame
        Daily rows with numeric columns, including a ``Close`` column.
    week_len : int, default WEEK_LEN
        Number of rows per week.

    Returns
    -------
    x : numpy.ndarray, shape (size, n_columns)
        The last row of each week ("Friday"), one sample per week.
    y : numpy.ndarray, shape (size, week_len)
        Relative change of each ``Close`` in the following week with respect
        to that Friday's ``Close``.

    ``size`` is ``len(data) // week_len - 2``: one week is needed as the
    target of the last sample, and one further week is left out entirely
    (its preceding Friday is used as ``x_public`` by the caller).
    """
    size = len(data) // week_len - 2
    close = data['Close'].to_numpy(dtype=float)

    # Positions of the last row of every sampled week: 4, 9, 14, ...
    fridays = (week_len - 1) + week_len * np.arange(size)
    # Positions of the rows of the week that follows each Friday.
    following = (fridays + 1)[:, None] + np.arange(week_len)

    x = data.iloc[fridays].to_numpy(dtype=float)
    friday_close = close[fridays][:, None]
    y = (close[following] - friday_close) / friday_close
    return x, y


data_dict = dict()
x_public_dict = dict()
x_dict = dict()
y_dict = dict()

for code in tqdm(stock_list['종목코드'].values):
    # Daily closing prices of this stock, left-joined onto the common table so
    # that every row of `common_data` is kept.
    imported_data = fdr.DataReader(code, start=start_date, end=end_date)[['Close']]
    data = pd.merge(common_data, imported_data, how='left', left_on='Date', right_index=True)

    # Keep only the *Change columns plus the stock's own Close.
    feature_cols = [c for c in data.columns if c.endswith('Change') or c == 'Close']
    data = data[feature_cols]

    # Forward-fill gaps, then back-fill whatever is still missing at the start.
    # (`fillna(method=...)` is deprecated in recent pandas.)
    data = data.ffill().bfill()

    x, y = build_weekly_samples(data)

    # Features of the row just before the final week, kept out of training.
    x_public = data.iloc[-(WEEK_LEN + 1)].to_numpy().reshape((1, -1))

    x_dict[code] = x
    y_dict[code] = y
    x_public_dict[code] = x_public
    data_dict[code] = data

100%|██████████| 370/370 [00:50<00:00,  7.26it/s]


In [149]:
# The template's value columns are read from CSV as zeros; cast them to float
# up front so that writing fractional predictions does not rely on pandas
# silently upcasting integer columns (deprecated in recent pandas).
sample_submission = sample_submission.astype(
    {col: float for col in sample_submission.columns[1:]}
)
answer = answer.astype({col: float for col in answer.columns[1:]})

# NOTE(review): rows are matched by position -- row i of both frames is
# assumed to belong to the i-th code of the sorted `stock_list`; verify that
# the template uses the same ordering.
for i, code in enumerate(tqdm(stock_list['종목코드'].values)):
    x = x_dict[code]
    y = y_dict[code]
    x_public = x_public_dict[code]
    data = data_dict[code]

    # One independent ridge regression per stock, predicting five returns.
    model = RidgeCV()
    model.fit(x, y)

    # Shape (1, 5): predicted relative change for each day of the final week.
    prediction = model.predict(x_public)

    # Close of the row just before the final week; returns are relative to it.
    friday_close = data.Close.iloc[-6]

    # Convert predicted returns back to prices, and record the actual closes
    # of the final five rows for evaluation.
    sample_submission.iloc[i, 1:] = (friday_close * (1 + prediction))[0]
    answer.iloc[i, 1:] = data.Close.iloc[-5:].to_numpy()

100%|██████████| 370/370 [00:01<00:00, 348.37it/s]


In [136]:
# Inspect the actual closing prices stored in `answer`.
answer

Unnamed: 0,Index,2022-05-23,2022-05-24,2022-05-25,2022-05-26,2022-05-27
0,60,39750.0,39700.0,40800.0,39800.0,39800.0
1,80,35350.0,34950.0,35900.0,36150.0,36250.0
2,100,59600.0,59100.0,59700.0,59900.0,59800.0
3,120,123000.0,125500.0,125500.0,127500.0,129000.0
4,150,80400.0,79900.0,80400.0,80100.0,80400.0
...,...,...,...,...,...,...
365,348210,68400.0,66500.0,67000.0,66300.0,67000.0
366,352820,221000.0,215500.0,219500.0,214500.0,218500.0
367,357780,269900.0,260600.0,260600.0,270900.0,271800.0
368,363280,24500.0,24400.0,24550.0,24600.0,24400.0


In [150]:
# Inspect the predicted prices stored in `sample_submission`.
sample_submission

Unnamed: 0,Index,2022-05-23,2022-05-24,2022-05-25,2022-05-26,2022-05-27
0,60,39093.592084,39296.733175,39453.542915,39853.709171,39712.239719
1,80,35615.277705,35744.167008,35744.133052,35666.442575,35581.814554
2,100,58690.492403,58596.400961,58634.440725,58481.260231,58604.441665
3,120,124015.894278,123971.729543,125214.294263,125585.622650,125512.609719
4,150,79696.056313,79940.532975,80501.441262,80628.206658,80698.451563
...,...,...,...,...,...,...
365,348210,67988.844379,68149.250439,68378.891497,68573.256457,68677.211179
366,352820,220587.504668,222190.318002,222135.694940,223575.814016,223135.751855
367,357780,263181.014156,263996.927387,263714.892120,264248.010103,264140.794142
368,363280,24634.265699,24823.839326,24935.659733,25062.553548,25040.394363


In [153]:
# Mean absolute error between predicted and actual prices, averaged over
# every stock and every day (all columns after the first one).
answer_arr = answer.iloc[:, 1:].to_numpy()
result_arr = sample_submission.iloc[:, 1:].to_numpy()

abs_error = np.abs(result_arr - answer_arr)
print(abs_error.mean())

1903.1350612958515


In [148]:
# Write the predictions to submission.csv in the current working directory,
# without the DataFrame's row index.
sample_submission.to_csv('submission.csv', index=False)