In [30]:
# Add the directory two levels above this notebook to the import path so that the
# project-local `Data` package (imported in the next cell) can be resolved.
# NOTE(review): this is relative to the kernel's working directory — presumably the
# notebook's own folder; confirm if the notebook is ever launched from elsewhere.
import sys
sys.path.append('../..')

In [31]:
from Data.data75 import getData
from Data.dataByCode import getData as getDataByCode

import pandas as pd
import numpy as np
import os
import FinanceDataReader as fdr

from sklearn.linear_model import LinearRegression, RidgeCV, MultiTaskLassoCV, MultiTaskElasticNetCV
from tqdm import tqdm

In [32]:
# Directory that holds the CSV inputs read below (stock list and sample submission).
path = '../../Data'
# File name of the stock universe; joined with `path` before being read.
list_name = 'stock_list.csv'
# Date range handed to getData / getDataByCode, written as YYYYMMDD strings.
start_date = '20180108'
end_date = '20220527'

In [33]:
# Load the stock universe, turn every stock code into a zero-padded 6-character
# string (presumably the CSV stores codes as numbers, losing leading zeros), and
# order the rows by code. The original row index is kept as-is after sorting.
stock_list = (
    pd.read_csv(os.path.join(path, list_name))
    .assign(**{'종목코드': lambda frame: frame['종목코드'].map(lambda raw: str(raw).zfill(6))})
    .sort_values(by=['종목코드'])
)
stock_list

Unnamed: 0,종목명,종목코드,상장시장
109,메리츠화재,000060,KOSPI
126,하이트진로,000080,KOSPI
67,유한양행,000100,KOSPI
69,CJ대한통운,000120,KOSPI
156,두산,000150,KOSPI
...,...,...,...
328,넥스틴,348210,KOSDAQ
31,하이브,352820,KOSPI
199,솔브레인,357780,KOSDAQ
176,티와이홀딩스,363280,KOSPI


In [34]:
# Read the submission template. `sample_submission` will receive the model's
# predictions, while `answer` (an independent copy of the same template) will
# receive the actual closing prices for comparison.
sample_name = 'sample_submission.csv'
sample_path = os.path.join(path, sample_name)
sample_submission = pd.read_csv(sample_path)
answer = sample_submission.copy()
answer

Unnamed: 0,Index,2022-05-23,2022-05-24,2022-05-25,2022-05-26,2022-05-27
0,60,0,0,0,0,0
1,80,0,0,0,0,0
2,100,0,0,0,0,0
3,120,0,0,0,0,0
4,150,0,0,0,0,0
...,...,...,...,...,...,...
365,348210,0,0,0,0,0
366,352820,0,0,0,0,0
367,357780,0,0,0,0,0
368,363280,0,0,0,0,0


In [35]:
# Features shared by every stock (index / FX / crypto series, per the displayed
# columns), fetched once for the whole date range. The frame carries a 'Date'
# column, which the per-stock merge later uses as its join key.
common_data = getData(start_date, end_date)
common_data

Unnamed: 0,Date,weekday,weeknum,kospi_Close,kospi_Volume,kospi_Change,kosdaq_Close,kosdaq_Volume,kosdaq_Change,nasdaq_Close,jpy_Close,jpy_Change,acf_Close,acf_Change,btc_Close,btc_Volume,btc_Change
0,2018-01-08,0,02,2513.28,311430000.0,0.0063,839.51,1.230000e+09,0.0139,7157.39,9.4510,0.0062,0.7661,-0.0010,23737000.0,10470.000000,-0.0478
1,2018-01-09,1,02,2510.23,374320000.0,-0.0012,829.99,1.200000e+09,-0.0113,7163.58,9.5155,0.0068,0.7694,0.0043,22595000.0,11850.000000,-0.0481
2,2018-01-10,2,02,2499.75,444020000.0,-0.0042,834.91,1.100000e+09,0.0059,7153.57,9.6032,0.0092,0.7672,-0.0029,20500000.0,12370.000000,-0.0927
3,2018-01-11,3,02,2487.91,444340000.0,-0.0047,852.51,1.290000e+09,0.0211,7211.78,9.5777,-0.0027,0.7701,0.0038,19147000.0,13750.000000,-0.0660
4,2018-01-12,4,02,2496.42,438780000.0,0.0034,873.05,1.180000e+09,0.0241,7261.06,9.5461,-0.0033,0.7661,-0.0052,19757997.0,10121.428571,-0.0290
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1140,2022-05-23,0,21,2647.38,644200000.0,0.0031,883.59,1.080000e+09,0.0042,11535.27,9.8349,-0.0125,0.6862,0.0007,37336000.0,1470.000000,-0.0406
1141,2022-05-24,1,21,2605.87,851240000.0,-0.0157,865.07,1.100000e+09,-0.0210,11264.45,9.9427,0.0110,0.6820,-0.0061,37739000.0,1450.000000,0.0108
1142,2022-05-25,2,21,2617.22,712610000.0,0.0044,872.69,9.893000e+08,0.0088,11434.74,9.9451,0.0002,0.6820,0.0000,37863000.0,1080.000000,0.0033
1143,2022-05-26,3,21,2612.45,595850000.0,-0.0018,871.43,9.474200e+08,-0.0014,11740.65,9.9082,-0.0037,0.6809,-0.0016,37595000.0,2170.000000,-0.0071


In [36]:
# Build weekly training samples for every stock.
#
# For each stock code this cell produces:
#   x_dict[code]        - (n_weeks, n_features) feature rows taken on the last row of each week
#   y_dict[code]        - (n_weeks, 5) relative change of the next week's five closes
#                         versus that week's last close
#   x_public_dict[code] - (1, n_features) feature row used to predict the final five rows
#   data_dict[code]     - the merged, gap-filled frame the above were derived from
data_dict = dict()
x_public_dict = dict()
x_dict = dict()
y_dict = dict()

# Number of rows that make up one week. The slicing below treats the frame as
# consecutive blocks of this many rows.
# NOTE(review): assumes getData returns exactly five rows per week, starting on a
# Monday (the displayed frame starts at weekday 0) — TODO confirm for holiday weeks.
week_len = 5

for code in tqdm(stock_list['종목코드'].values):
    imported_data = getDataByCode(code, start_date, end_date)
    # Left join keeps every row of common_data; dates missing for this stock become NaN.
    data = pd.merge(common_data, imported_data, how='left', left_on='Date', right_index=True)

    data = data.drop(columns=['Date', 'weeknum'])

    # Fill gaps forward first, then backward for any leading NaNs.
    # (.ffill()/.bfill() replace fillna(method=...), which is deprecated in pandas 2.1.)
    data = data.ffill().bfill()

    # Drop two weeks from the sample count: the last week is the evaluation target
    # and the week before it only serves as the target of the last training sample.
    size = len(data) // week_len - 2

    values = data.to_numpy(dtype=float)
    close = data['Close'].to_numpy(dtype=float)

    # Row positions of the last day of each training week: 4, 9, 14, ...
    fridays = (week_len - 1) + week_len * np.arange(size)

    x = values[fridays]

    # Closes of the week following each training week, one row per sample.
    friday_close = close[fridays].reshape(-1, 1)
    next_week_close = close[week_len:week_len * (size + 1)].reshape(size, week_len)

    # Targets are relative changes against the preceding week's last close.
    y = (next_week_close - friday_close) / friday_close

    # Sixth row from the end: the last row before the final five (evaluation) rows.
    x_public = data.iloc[-6].to_numpy().reshape((1, -1))

    x_dict[code] = x
    y_dict[code] = y
    x_public_dict[code] = x_public
    data_dict[code] = data

100%|██████████| 370/370 [01:18<00:00,  4.74it/s]


In [50]:
# Fit one multi-output elastic-net model per stock, then record both the predicted
# and the actual closing prices for the final five rows.

# The price columns of the template are displayed as integer zeros. Cast them to
# float up front so the float writes below do not rely on pandas silently upcasting
# the column dtype (deprecated since pandas 2.1).
price_cols = list(sample_submission.columns[1:])
sample_submission = sample_submission.astype({col: float for col in price_cols})
answer = answer.astype({col: float for col in price_cols})

# NOTE(review): rows are written by position `i`, which assumes sample_submission
# lists the stocks in the same order as the code-sorted stock_list — TODO confirm.
for i, code in enumerate(tqdm(stock_list['종목코드'].values)):
    model = MultiTaskElasticNetCV()

    x = x_dict[code]
    y = y_dict[code]
    x_public = x_public_dict[code]
    data = data_dict[code]

    model.fit(x, y)

    # Shape (1, 5): predicted relative change for each of the five target days.
    prediction = model.predict(x_public)

    # Close on the row x_public was taken from; predictions are relative to it.
    friday_close = data.Close.iloc[-6]

    # Convert relative changes back to price levels.
    sample_submission.iloc[i, 1:] = (friday_close * (1 + prediction))[0]
    answer.iloc[i, 1:] = data.Close.iloc[-5:].to_numpy()

100%|██████████| 370/370 [00:19<00:00, 19.43it/s]


In [51]:
# Zero out the price columns for two specific stock codes in the ground truth.
# NOTE(review): presumably these codes are excluded from scoring — confirm the reason.
# A boolean Series is a label-aligned mask, so it is applied with .loc; .iloc is
# positional and does not support Series masks.
excluded_rows = answer['Index'].isin([31390, 36490])
answer.loc[excluded_rows, answer.columns[1:]] = 0
answer

Unnamed: 0,Index,2022-05-23,2022-05-24,2022-05-25,2022-05-26,2022-05-27
0,60,39750.0,39700.0,40800.0,39800.0,39800.0
1,80,35350.0,34950.0,35900.0,36150.0,36250.0
2,100,59600.0,59100.0,59700.0,59900.0,59800.0
3,120,123000.0,125500.0,125500.0,127500.0,129000.0
4,150,80400.0,79900.0,80400.0,80100.0,80400.0
...,...,...,...,...,...,...
365,348210,68400.0,66500.0,67000.0,66300.0,67000.0
366,352820,221000.0,215500.0,219500.0,214500.0,218500.0
367,357780,269900.0,260600.0,260600.0,270900.0,271800.0
368,363280,24500.0,24400.0,24550.0,24600.0,24400.0


In [52]:
# Zero out the price columns for two specific stock codes in the predictions.
# NOTE(review): presumably these codes are excluded from scoring — confirm the reason.
# A boolean Series is a label-aligned mask, so it is applied with .loc; .iloc is
# positional and does not support Series masks.
excluded_rows = sample_submission['Index'].isin([31390, 36490])
sample_submission.loc[excluded_rows, sample_submission.columns[1:]] = 0
sample_submission

Unnamed: 0,Index,2022-05-23,2022-05-24,2022-05-25,2022-05-26,2022-05-27
0,60,39043.654614,39093.438172,39122.808155,39186.690500,39141.026473
1,80,35579.718079,35658.570022,35735.736780,35640.800337,35642.472618
2,100,58677.898388,58672.904975,58649.951370,58514.084576,58661.851068
3,120,124465.634399,124207.415474,124374.860084,124384.307393,124567.465288
4,150,79777.704346,79692.005690,79506.378094,79275.112726,79541.668397
...,...,...,...,...,...,...
365,348210,68344.684501,68396.259659,68558.506350,68703.410755,68770.579525
366,352820,220722.915885,221199.848919,220876.752288,221270.002500,221120.426473
367,357780,262808.720484,262849.259012,262651.793884,262931.649271,262951.330188
368,363280,24672.074335,24711.691595,24707.203941,24707.180622,24697.414475


In [53]:
# Mean absolute error between predicted and actual prices, averaged over every
# stock/day cell (first column, the stock code, is skipped).
result_arr = sample_submission.iloc[:, 1:].to_numpy()
answer_arr = answer.iloc[:, 1:].to_numpy()

abs_error = np.abs(result_arr - answer_arr)
print(abs_error.mean())

1828.024932692871


In [41]:
# Write the predictions to submission.csv in the working directory, omitting the
# DataFrame index.
# NOTE(review): this cell's execution count (41) is older than the modelling cells
# above (50-53); re-run it after them so the saved file matches the shown predictions.
sample_submission.to_csv('submission.csv', index=False)