In [36]:
import sys
# Put the directory two levels above this notebook on the import path;
# presumably this is what lets the `Data` package imported in the next cell resolve.
sys.path.append('../..')

In [37]:
from Data.data75 import getData

import pandas as pd
import numpy as np
import os
import FinanceDataReader as fdr

from sklearn.linear_model import LinearRegression, RidgeCV, MultiTaskLassoCV, MultiTaskElasticNetCV
from tqdm import tqdm

In [38]:
# Directory holding the input CSV files (relative to this notebook) and the
# name of the file that lists the stocks to predict.
path = '../../Data'
list_name = 'stock_list.csv'

# Date range (YYYYMMDD strings) handed to both getData and fdr.DataReader.
start_date, end_date = '20040105', '20220527'

In [39]:
# Load the stock universe, render every ticker as a zero-padded 6-character
# string, and order the rows by ticker.
stock_list = (
    pd.read_csv(os.path.join(path, list_name))
    # dict-unpacking keeps the column key an ordinary string literal
    .assign(**{'종목코드': lambda frame: frame['종목코드'].map(lambda raw: str(raw).zfill(6))})
    .sort_values(by=['종목코드'])
)
stock_list

Unnamed: 0,종목명,종목코드,상장시장
109,메리츠화재,000060,KOSPI
126,하이트진로,000080,KOSPI
67,유한양행,000100,KOSPI
69,CJ대한통운,000120,KOSPI
156,두산,000150,KOSPI
...,...,...,...
328,넥스틴,348210,KOSDAQ
31,하이브,352820,KOSPI
199,솔브레인,357780,KOSDAQ
176,티와이홀딩스,363280,KOSPI


In [40]:
# Read the submission template; `answer` starts as an independent copy of it
# and is later filled with the actual closing prices.
sample_name = 'sample_submission.csv'
sample_path = os.path.join(path, sample_name)
sample_submission = pd.read_csv(sample_path)
answer = sample_submission.copy()
answer

Unnamed: 0,Index,2022-05-23,2022-05-24,2022-05-25,2022-05-26,2022-05-27
0,60,0,0,0,0,0
1,80,0,0,0,0,0
2,100,0,0,0,0,0
3,120,0,0,0,0,0
4,150,0,0,0,0,0
...,...,...,...,...,...,...
365,348210,0,0,0,0,0
366,352820,0,0,0,0,0
367,357780,0,0,0,0,0
368,363280,0,0,0,0,0


In [41]:
# Frame of features shared by every stock; it is left-joined onto each stock's
# Close series on its 'Date' column further down.
# NOTE(review): getData comes from Data.data75 and its internals are not visible
# here — presumably it returns one row per trading day in the range; confirm.
common_data = getData(start_date, end_date)
common_data

Unnamed: 0,Date,weekday,weeknum,kospi_Close,kospi_Volume,kospi_Change,kosdaq_Close,kosdaq_Volume,kosdaq_Change,nasdaq_Close,jpy_Close,jpy_Change,acf_Close,acf_Change,btc_Close,btc_Volume,btc_Change
0,2004-01-05,0,02,824.10,408260000.0,0.0035,452.70,3.251500e+05,0.0029,2047.36,11.2177,0.0049,0.9481,0.0092,3206000.0,21580.0,0.0329
1,2004-01-06,1,02,823.43,484060000.0,-0.0008,447.30,3.532600e+05,-0.0119,2057.37,11.1624,-0.0049,0.9465,-0.0017,3206000.0,21580.0,0.0329
2,2004-01-07,2,02,827.07,404560000.0,0.0044,446.10,3.538300e+05,-0.0027,2077.68,11.1692,0.0006,0.9512,0.0050,3206000.0,21580.0,0.0329
3,2004-01-08,3,02,824.15,428760000.0,-0.0035,444.10,3.616900e+05,-0.0045,2100.25,11.1268,-0.0038,0.9511,-0.0001,3206000.0,21580.0,0.0329
4,2004-01-09,4,02,845.27,543290000.0,0.0256,453.00,3.508400e+05,0.0200,2086.92,11.0590,-0.0061,0.9485,-0.0027,3206000.0,21580.0,0.0329
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4795,2022-05-23,0,21,2647.38,644200000.0,0.0031,883.59,1.080000e+09,0.0042,11535.27,9.8349,-0.0125,0.6862,0.0007,37336000.0,1470.0,-0.0406
4796,2022-05-24,1,21,2605.87,851240000.0,-0.0157,865.07,1.100000e+09,-0.0210,11264.45,9.9427,0.0110,0.6820,-0.0061,37739000.0,1450.0,0.0108
4797,2022-05-25,2,21,2617.22,712610000.0,0.0044,872.69,9.893000e+08,0.0088,11434.74,9.9451,0.0002,0.6820,0.0000,37863000.0,1080.0,0.0033
4798,2022-05-26,3,21,2612.45,595850000.0,-0.0018,871.43,9.474200e+08,-0.0014,11740.65,9.9082,-0.0037,0.6809,-0.0016,37595000.0,2170.0,-0.0071


In [42]:
# Build, for every stock, the weekly training set and the feature row used for
# the final prediction.
#
#   data_dict[code]     cleaned frame: every "*Change" column plus the stock's Close
#   x_dict[code]        (size, n_features) features taken from the last row of each
#                       5-row block
#   y_dict[code]        (size, 5) relative change of each of the following 5 closes
#                       with respect to that row's close
#   x_public_dict[code] (1, n_features) features of the row just before the final
#                       5 rows of the frame
#
# NOTE(review): rows are grouped purely by position (5 rows == one week); this
# assumes the calendar in common_data has exactly five rows per week — confirm.
WEEK = 5

data_dict = dict()
x_public_dict = dict()
x_dict = dict()
y_dict = dict()

for code in tqdm(stock_list['종목코드'].values):
    imported_data = fdr.DataReader(code, start = start_date, end = end_date)[['Close']]
    data = pd.merge(common_data, imported_data, how='left', left_on='Date', right_index=True)

    # Keep only the change-rate columns and the stock's own closing price.
    feature_cols = [c for c in data.columns if c.endswith('Change') or c == 'Close']
    data = data[feature_cols]

    # Forward-fill gaps, then back-fill whatever is still missing at the start.
    # (.ffill()/.bfill() replace the deprecated fillna(method=...).)
    data = data.ffill().bfill()

    # The last complete block is held out: its 5 rows are the prediction target
    # of x_public, hence "- 2" rather than "- 1".
    size = len(data) // WEEK - 2
    if size < 0:
        raise ValueError(f'not enough rows ({len(data)}) to build samples for {code}')

    values = data.to_numpy(dtype=float)
    close = data['Close'].to_numpy(dtype=float)

    # Rows 4, 9, 14, ... are the feature rows; the 5 rows after each are its targets.
    x = values[WEEK - 1:WEEK * size:WEEK].copy()
    friday_close = close[WEEK - 1:WEEK * size:WEEK]
    next_closes = close[WEEK:WEEK * (size + 1)].reshape(size, WEEK)
    y = (next_closes - friday_close[:, None]) / friday_close[:, None]

    x_public = values[-(WEEK + 1)].reshape((1, -1)).copy()

    x_dict[code] = x
    y_dict[code] = y
    x_public_dict[code] = x_public
    data_dict[code] = data

100%|██████████| 370/370 [01:54<00:00,  3.23it/s]


In [43]:
# Fit one RidgeCV model per stock and fill row i of both tables:
#   sample_submission -> predicted closes (last known close * (1 + predicted change))
#   answer            -> the actual last 5 closes of the stock
#
# Rows are written by position, so this relies on sample_submission being in
# the same order as the sorted stock_list.

# The template's price columns are integers; cast them up front so that writing
# float prices does not depend on pandas silently upcasting the columns.
sample_submission = sample_submission.astype({c: float for c in sample_submission.columns[1:]})
answer = answer.astype({c: float for c in answer.columns[1:]})

for i, code in enumerate(tqdm(stock_list['종목코드'].values)):
    model = RidgeCV()

    x = x_dict[code]
    y = y_dict[code]
    x_public = x_public_dict[code]
    data = data_dict[code]

    model.fit(x, y)

    # Shape (1, 5): predicted relative change for each of the next 5 rows.
    prediction = model.predict(x_public)

    # Close of the row that x_public was taken from.
    friday_close = data.Close.iloc[-6]

    sample_submission.iloc[i, 1:] = (friday_close * (1 + prediction))[0]
    answer.iloc[i, 1:] = data.Close.iloc[-5:].to_numpy()

100%|██████████| 370/370 [00:01<00:00, 299.19it/s]


In [44]:
# Zero the price columns for the two listed codes.
# NOTE(review): presumably these tickers are excluded from evaluation — confirm.
# Columns are selected by label: a positional slice such as `1:` is not valid
# with .loc on string column labels.
answer.loc[answer['Index'].isin([31390, 36490]), answer.columns[1:]] = 0
answer

Unnamed: 0,Index,2022-05-23,2022-05-24,2022-05-25,2022-05-26,2022-05-27
0,60,39750.0,39700.0,40800.0,39800.0,39800.0
1,80,35350.0,34950.0,35900.0,36150.0,36250.0
2,100,59600.0,59100.0,59700.0,59900.0,59800.0
3,120,123000.0,125500.0,125500.0,127500.0,129000.0
4,150,80400.0,79900.0,80400.0,80100.0,80400.0
...,...,...,...,...,...,...
365,348210,68400.0,66500.0,67000.0,66300.0,67000.0
366,352820,221000.0,215500.0,219500.0,214500.0,218500.0
367,357780,269900.0,260600.0,260600.0,270900.0,271800.0
368,363280,24500.0,24400.0,24550.0,24600.0,24400.0


In [45]:
# Zero the price columns for the two listed codes so they match `answer`.
# NOTE(review): presumably these tickers are excluded from evaluation — confirm.
# Columns are selected by label: a positional slice such as `1:` is not valid
# with .loc on string column labels.
sample_submission.loc[sample_submission['Index'].isin([31390, 36490]), sample_submission.columns[1:]] = 0
sample_submission

Unnamed: 0,Index,2022-05-23,2022-05-24,2022-05-25,2022-05-26,2022-05-27
0,60,38936.176330,39058.043448,38962.684016,39026.838697,38862.846105
1,80,35520.695169,35609.970080,35647.368139,35661.752594,35638.492938
2,100,58620.214583,58498.245800,58520.651356,58338.662235,58382.226165
3,120,124534.523672,124512.955592,124577.978317,124589.089570,124794.948389
4,150,79682.108316,79758.900778,79818.056521,79992.840435,80070.705349
...,...,...,...,...,...,...
365,348210,67927.637235,68068.445580,68334.558614,68643.741282,68788.992403
366,352820,221056.531831,221752.148964,222191.641709,222866.244791,222729.932367
367,357780,262803.955757,262865.311694,262826.895068,262906.742946,262891.276471
368,363280,24639.138502,24829.229429,24951.281015,25087.960553,25067.487502


In [46]:
# Mean absolute error between predicted and actual prices, averaged over every
# cell of the price columns (all stocks, all 5 days).
answer_arr = answer.iloc[:, 1:].to_numpy()
result_arr = sample_submission.iloc[:, 1:].to_numpy()

abs_error = np.abs(result_arr - answer_arr)
print(abs_error.mean())

1793.7778805810929


In [47]:
# Write the predictions next to the notebook, without the DataFrame index column.
submission_file = 'submission.csv'
sample_submission.to_csv(submission_file, index=False)