In [270]:
import pandas as pd
import numpy as np
import os
import FinanceDataReader as fdr

from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from tqdm import tqdm

In [271]:
# Where the competition files live, and the file listing the target tickers.
path = '../../Data'
list_name = 'stock_list.csv'

# Read the listing, left-pad every ticker code with zeros to six characters,
# then order the rows by that padded code.
raw_listing = pd.read_csv(os.path.join(path, list_name))
padded_codes = [str(raw_code).zfill(6) for raw_code in raw_listing['종목코드']]
stock_list = raw_listing.assign(**{'종목코드': padded_codes}).sort_values(by=['종목코드'])
stock_list

Unnamed: 0,종목명,종목코드,상장시장
109,메리츠화재,000060,KOSPI
126,하이트진로,000080,KOSPI
67,유한양행,000100,KOSPI
69,CJ대한통운,000120,KOSPI
156,두산,000150,KOSPI
...,...,...,...
328,넥스틴,348210,KOSDAQ
31,하이브,352820,KOSPI
199,솔브레인,357780,KOSDAQ
176,티와이홀딩스,363280,KOSPI


In [272]:
# Date window used for downloading prices and for modelling.
start_date = '20210104'
end_date = '20211203'   # an earlier cutoff, '20211105', was used previously

# Weekday index of the first day and the '%V' week number of the last day.
first_day = pd.Timestamp(start_date)
last_day = pd.Timestamp(end_date)
start_weekday = first_day.weekday()
max_weeknum = last_day.strftime('%V')

# One row per business day (freq='B') between the two dates, in a single 'Date' column.
Business_days = pd.date_range(start_date, end_date, freq='B').to_frame(index=False, name='Date')

print(f'WEEKDAY of "start_date" : {start_weekday}')
print(f'NUM of WEEKS to "end_date" : {max_weeknum}')
print(f'HOW MANY "Business_days" : {Business_days.shape}')
Business_days.head()

WEEKDAY of "start_date" : 0
NUM of WEEKS to "end_date" : 48
HOW MANY "Business_days" : (240, 1)


Unnamed: 0,Date
0,2021-01-04
1,2021-01-05
2,2021-01-06
3,2021-01-07
4,2021-01-08


In [273]:
# Load the submission template; `answer` is an independent copy with the same
# layout, which a later cell fills with the actual closing prices.
sample_name = 'sample_submission.csv'
sample_submission = pd.read_csv(os.path.join(path, sample_name))
answer = sample_submission.copy(deep=True)

In [274]:
# Build, for every ticker, a tiny regression data set:
#   x : QQQ open-to-close return on a Friday
#   y : the ticker's return from that Friday's close to the next Monday's open
# plus the single feature row (`x_public`) for the Friday that precedes the
# final five rows of the window.
#
# All frames are aligned row-for-row to `Business_days`, so weeks can be
# addressed purely by position. Left joins are used (rather than outer joins)
# so that a quote dated outside the business-day calendar can never add a row
# and silently shift every Friday/Monday position that follows it.

DAYS_PER_WEEK = 5    # rows per week in the business-day calendar
FRIDAY_OFFSET = 4    # position of the first Friday; holds when row 0 is a Monday
PUBLIC_FRIDAY = -6   # last row before the final five rows of the window

nasdaq = fdr.DataReader('QQQ', start=start_date, end=end_date)[['Close', 'Open']].reset_index()
nasdaq = pd.merge(Business_days, nasdaq, how='left', on='Date')
nasdaq['weekday'] = nasdaq.Date.dt.weekday
nasdaq['weeknum'] = nasdaq.Date.dt.strftime('%V')
# Days without a QQQ quote inherit the previous quote.
nasdaq.Close = nasdaq.Close.ffill()
nasdaq.Open = nasdaq.Open.ffill()
nasdaq = nasdaq.rename(columns={'Close': 'NasdaqClose', 'Open': 'NasdaqOpen'})

# The positional indexing below is only meaningful if these rows are Fridays.
assert nasdaq.Date.iloc[FRIDAY_OFFSET].weekday() == 4, 'start_date must be a Monday'
assert nasdaq.Date.iloc[PUBLIC_FRIDAY].weekday() == 4, 'end_date must be a Friday'

data_dict = dict()
x_public_dict = dict()
x_dict = dict()
y_dict = dict()

for code in tqdm(stock_list['종목코드'].values):
    data = fdr.DataReader(code, start=start_date, end=end_date)[['Close', 'Open']].reset_index()
    # `weekday` / `weeknum` come along from `nasdaq`; no need to recompute them.
    data = pd.merge(nasdaq, data, how='left', on='Date')
    # Forward-fill non-trading days. Rows before the ticker's first quote stay NaN.
    data.Close = data.Close.ffill()
    data.Open = data.Open.ffill()

    # Drop two weeks from the training pairs: the final week (its Friday has no
    # following Monday in the window) and the week before it (its Friday is the
    # `PUBLIC_FRIDAY` row that is predicted from, not trained on).
    size = len(data) // DAYS_PER_WEEK - 2
    fridays = FRIDAY_OFFSET + DAYS_PER_WEEK * np.arange(size)
    mondays = fridays + 1

    nasdaq_intraday = ((data.NasdaqClose - data.NasdaqOpen) / data.NasdaqOpen).to_numpy()
    stock_close = data.Close.to_numpy()
    stock_open = data.Open.to_numpy()

    # Feature matrix of shape (size, 1) and target vector of shape (size,).
    x = nasdaq_intraday[fridays].reshape((-1, 1))
    y = (stock_open[mondays] - stock_close[fridays]) / stock_close[fridays]

    # Single feature row, shape (1, 1), for the held-out Friday.
    x_public = nasdaq_intraday[[PUBLIC_FRIDAY]].reshape((1, 1))

    x_dict[code] = x
    y_dict[code] = y
    x_public_dict[code] = x_public
    data_dict[code] = data

100%|█████████████████████████████████████████| 370/370 [01:06<00:00,  5.53it/s]


In [285]:
# Fit one ElasticNetCV model per ticker on (Friday QQQ return -> weekend gap)
# and write a single predicted price into every row of that ticker's column.
# The actual closes of the last EVAL_DAYS rows go into `answer` for scoring.

EVAL_DAYS = 5                      # number of trailing rows used as ground truth
LAST_TRAIN_ROW = -(EVAL_DAYS + 1)  # row immediately before the evaluation rows

for code in tqdm(stock_list['종목코드'].values):
    x = x_dict[code]
    y = y_dict[code]
    x_public = x_public_dict[code]
    data = data_dict[code]

    model = ElasticNetCV()
    model.fit(x, y)

    # `predict` returns a length-1 array for the single feature row; take the
    # scalar so the column receives plain floats instead of nested arrays.
    predicted_gap = float(model.predict(x_public)[0])
    prediction = (1 + predicted_gap) * data.Close.iloc[LAST_TRAIN_ROW]

    # A scalar is broadcast to every row of the column.
    sample_submission[code] = prediction
    answer[code] = data.Close.iloc[-EVAL_DAYS:].to_numpy()

# Sanity check: number of missing cells left in the prediction frame.
sample_submission.isna().sum().sum()

100%|█████████████████████████████████████████| 370/370 [00:13<00:00, 27.09it/s]


0

In [286]:
# Relabel the prediction frame: the first column becomes 'Day' and every other
# label is rendered as a string left-padded with zeros to six characters.
ticker_labels = sample_submission.columns[1:]
columns = ['Day', *(str(label).zfill(6) for label in ticker_labels)]
sample_submission.columns = columns

In [287]:
# Display the frame of actual closing prices that the predictions are scored against.
answer

Unnamed: 0,Day,000060,000080,000100,000120,000150,000240,000250,000270,000660,...,330860,336260,336370,347860,348150,348210,352820,357780,363280,950130
0,2021-11-29,31200.0,30300.0,56369.0,129000.0,110500.0,15200.0,43250.0,79200.0,116000.0,...,43750.0,48750.0,96859.0,37100.0,19800.0,49100.0,369000.0,266300.0,24200.0,17650.0
1,2021-11-30,31300.0,29000.0,55317.0,124000.0,108500.0,15150.0,42150.0,77800.0,114000.0,...,43300.0,49250.0,94257.0,35550.0,18600.0,45500.0,364500.0,255800.0,23100.0,19500.0
2,2021-12-01,31700.0,29400.0,55221.0,125000.0,112000.0,15500.0,42750.0,81200.0,116500.0,...,48950.0,48700.0,94931.0,36050.0,18900.0,46200.0,352500.0,264200.0,23850.0,18650.0
3,2021-12-02,32150.0,29550.0,57518.0,129000.0,110000.0,16150.0,43100.0,81600.0,120000.0,...,51900.0,46250.0,91654.0,33800.0,18650.0,48550.0,330000.0,274700.0,25200.0,18050.0
4,2021-12-03,32700.0,30600.0,57709.0,131000.0,108500.0,16400.0,44900.0,82500.0,118000.0,...,51900.0,46800.0,91847.0,34500.0,19100.0,49000.0,354500.0,275900.0,25800.0,18150.0


In [288]:
# Display the prediction frame; each ticker column holds one predicted value repeated on every row.
sample_submission

Unnamed: 0,Day,000060,000080,000100,000120,000150,000240,000250,000270,000660,...,330860,336260,336370,347860,348150,348210,352820,357780,363280,950130
0,2021-11-29,31823.258493,31032.577854,56828.381667,129933.666695,120425.333325,15536.128539,44022.765097,79793.989419,115195.943774,...,44781.000494,48823.498374,97445.90793,39666.79168,20026.071332,50335.405243,372336.065947,276492.811925,25061.554461,16637.626293
1,2021-11-30,31823.258493,31032.577854,56828.381667,129933.666695,120425.333325,15536.128539,44022.765097,79793.989419,115195.943774,...,44781.000494,48823.498374,97445.90793,39666.79168,20026.071332,50335.405243,372336.065947,276492.811925,25061.554461,16637.626293
2,2021-12-01,31823.258493,31032.577854,56828.381667,129933.666695,120425.333325,15536.128539,44022.765097,79793.989419,115195.943774,...,44781.000494,48823.498374,97445.90793,39666.79168,20026.071332,50335.405243,372336.065947,276492.811925,25061.554461,16637.626293
3,2021-12-02,31823.258493,31032.577854,56828.381667,129933.666695,120425.333325,15536.128539,44022.765097,79793.989419,115195.943774,...,44781.000494,48823.498374,97445.90793,39666.79168,20026.071332,50335.405243,372336.065947,276492.811925,25061.554461,16637.626293
4,2021-12-03,31823.258493,31032.577854,56828.381667,129933.666695,120425.333325,15536.128539,44022.765097,79793.989419,115195.943774,...,44781.000494,48823.498374,97445.90793,39666.79168,20026.071332,50335.405243,372336.065947,276492.811925,25061.554461,16637.626293


In [289]:
# Score the predictions: total absolute error over every cell, divided by the
# total of the actual values. The first column of each frame is skipped.
result_arr = sample_submission.iloc[:, 1:].to_numpy()
answer_arr = answer.iloc[:, 1:].to_numpy()

total_abs_error = np.abs(result_arr - answer_arr).sum()
total_actual = answer_arr.sum()
print(total_abs_error / total_actual)

0.03511438587880671
