In [113]:
import pandas as pd
import numpy as np
import os
import FinanceDataReader as fdr

from sklearn.linear_model import MultiTaskElasticNetCV
from tqdm import tqdm

In [114]:
# Location of the stock universe file, relative to this notebook.
path = '../../Data'
list_name = 'stock_list.csv'

# Load the universe, left-pad every stock code with zeros to six characters
# (presumably the codes lose their leading zeros when parsed from the CSV;
# verify against the file) and order the rows by code.
stock_list = (
    pd.read_csv(os.path.join(path, list_name))
    .assign(**{'종목코드': lambda frame: frame['종목코드'].map(lambda raw: str(raw).zfill(6))})
    .sort_values(by=['종목코드'])
)
stock_list

Unnamed: 0,종목명,종목코드,상장시장
109,메리츠화재,000060,KOSPI
126,하이트진로,000080,KOSPI
67,유한양행,000100,KOSPI
69,CJ대한통운,000120,KOSPI
156,두산,000150,KOSPI
...,...,...,...
328,넥스틴,348210,KOSDAQ
31,하이브,352820,KOSPI
199,솔브레인,357780,KOSDAQ
176,티와이홀딩스,363280,KOSPI


In [115]:
# Evaluation window as YYYYMMDD strings.
# Alternative window kept for reference: '20200106' to '20201225'.
start_date = '20210104'
end_date = '20211231'

# One row per Monday-Friday date in the window (pandas 'B' frequency).
business_days = pd.DataFrame({'Date': pd.date_range(start_date, end_date, freq='B')})

print(f'HOW MANY "Business_days" : {business_days.shape}')
print(business_days.head())

HOW MANY "Business_days" : (260, 1)
        Date
0 2021-01-04
1 2021-01-05
2 2021-01-06
3 2021-01-07
4 2021-01-08


In [116]:
# Collect the dates that act as prediction targets: the calendar is cut into
# consecutive runs of 20 business days (a trailing partial run is dropped) and
# the last five days of each run (positions 15-19) are kept.
size = business_days.shape[0] // 20 * 20

# Pick the 'Date' column by label and then index it by position.
# `business_days.iloc[i][0]` used an integer key on a label-indexed row, which
# depends on the deprecated positional fallback of Series.__getitem__.
test_y_days = [
    business_days['Date'].iloc[i]
    for i in range(size)
    if i % 20 >= 15
]
print(test_y_days)

[Timestamp('2021-01-25 00:00:00'), Timestamp('2021-01-26 00:00:00'), Timestamp('2021-01-27 00:00:00'), Timestamp('2021-01-28 00:00:00'), Timestamp('2021-01-29 00:00:00'), Timestamp('2021-02-22 00:00:00'), Timestamp('2021-02-23 00:00:00'), Timestamp('2021-02-24 00:00:00'), Timestamp('2021-02-25 00:00:00'), Timestamp('2021-02-26 00:00:00'), Timestamp('2021-03-22 00:00:00'), Timestamp('2021-03-23 00:00:00'), Timestamp('2021-03-24 00:00:00'), Timestamp('2021-03-25 00:00:00'), Timestamp('2021-03-26 00:00:00'), Timestamp('2021-04-19 00:00:00'), Timestamp('2021-04-20 00:00:00'), Timestamp('2021-04-21 00:00:00'), Timestamp('2021-04-22 00:00:00'), Timestamp('2021-04-23 00:00:00'), Timestamp('2021-05-17 00:00:00'), Timestamp('2021-05-18 00:00:00'), Timestamp('2021-05-19 00:00:00'), Timestamp('2021-05-20 00:00:00'), Timestamp('2021-05-21 00:00:00'), Timestamp('2021-06-14 00:00:00'), Timestamp('2021-06-15 00:00:00'), Timestamp('2021-06-16 00:00:00'), Timestamp('2021-06-17 00:00:00'), Timestamp('20

In [117]:
# Zero-filled scaffold for the target prices: one row per target day, one
# column per stock code. The day index is moved into a regular 'Day' column
# and the leftover columns-axis name is cleared.
answer = (
    pd.DataFrame(0, index=test_y_days, columns=stock_list['종목코드'])
    .reset_index(level=0)
    .rename(columns={'index': 'Day'})
    .rename_axis(None, axis=1)
)
print(answer.head(20))

          Day  000060  000080  000100  000120  000150  000240  000250  000270  \
0  2021-01-25       0       0       0       0       0       0       0       0   
1  2021-01-26       0       0       0       0       0       0       0       0   
2  2021-01-27       0       0       0       0       0       0       0       0   
3  2021-01-28       0       0       0       0       0       0       0       0   
4  2021-01-29       0       0       0       0       0       0       0       0   
5  2021-02-22       0       0       0       0       0       0       0       0   
6  2021-02-23       0       0       0       0       0       0       0       0   
7  2021-02-24       0       0       0       0       0       0       0       0   
8  2021-02-25       0       0       0       0       0       0       0       0   
9  2021-02-26       0       0       0       0       0       0       0       0   
10 2021-03-22       0       0       0       0       0       0       0       0   
11 2021-03-23       0       

In [118]:
# Per-stock weekly price matrices keyed by stock code. Every value is a 2-D
# array with one row per week and one column per weekday.
train_x = dict()
train_y = dict()
test_x = dict()
test_y = dict()

for code in tqdm(stock_list['종목코드'].values):
    # Daily closes for this stock, joined onto the business-day calendar.
    daily = fdr.DataReader(code, start = start_date, end = end_date)[['Close']].reset_index()
    daily = pd.merge(business_days, daily, how = 'outer')
    # Calendar days without a quote take the previous close; a gap at the very
    # start of the window takes the next available close instead.
    daily['Close'] = daily['Close'].ffill().bfill()

    daily['weekday'] = daily['Date'].dt.weekday
    # %V is the ISO-8601 week number, so it has to be paired with the ISO year
    # (%G). Pairing it with the calendar year (%Y) gives days around New Year a
    # key from the wrong year and breaks the chronological order of the weeks.
    daily['weeknum'] = daily['Date'].dt.strftime('%G-%V')

    # Rows: weeks in chronological order; columns: weekday numbers.
    data = pd.pivot_table(data = daily, values = 'Close', columns = 'weekday', index = 'weeknum')

    # Keep only complete groups of four weeks.
    data = data.iloc[: data.shape[0] // 4 * 4]

    nan_count = int(data.isna().to_numpy().sum())
    if nan_count > 0:
        # Stop right here instead of leaving the dicts and `answer` half filled,
        # which would only surface later as an unrelated KeyError.
        print(nan_count)
        print(data)
        raise ValueError(f'{code}: weekly price table has {nan_count} missing values')

    # Inside each group of four weeks: week 1 -> train_x, week 2 -> train_y,
    # week 3 -> test_x, week 4 -> test_y.
    train_x[code] = data.iloc[::4, :].to_numpy()
    train_y[code] = data.iloc[1::4, :].to_numpy()
    test_x[code] = data.iloc[2::4, :].to_numpy()
    test_y[code] = data.iloc[3::4, :].to_numpy()

    # Row-major flattening lists the target weeks day by day, matching the row
    # order of `answer`.
    answer[code] = test_y[code].flatten()

100%|█████████████████████████████████████████| 370/370 [01:07<00:00,  5.45it/s]


In [119]:
# Show the first five rows of the answer table after the per-stock columns were filled.
print(answer.head(5))

         Day   000060   000080   000100    000120   000150   000240   000250  \
0 2021-01-25  15850.0  34600.0  69672.0  172000.0  57300.0  16950.0  67200.0   
1 2021-01-26  15950.0  33700.0  68332.0  169500.0  57500.0  16300.0  65500.0   
2 2021-01-27  16000.0  33600.0  67854.0  169000.0  56500.0  17000.0  61900.0   
3 2021-01-28  16200.0  32650.0  65270.0  173500.0  54400.0  16300.0  60700.0   
4 2021-01-29  15850.0  31900.0  62877.0  168000.0  51400.0  15850.0  60300.0   

    000270    000660  ...   330860   336260   336370   347860   348150  \
0  93300.0  135000.0  ...  56800.0  63200.0  54260.0  33750.0  45650.0   
1  89700.0  129000.0  ...  54300.0  60400.0  52429.0  34150.0  50600.0   
2  90000.0  128500.0  ...  54800.0  61200.0  50501.0  32750.0  48500.0   
3  88200.0  123000.0  ...  53100.0  59500.0  47803.0  30350.0  44950.0   
4  82500.0  122500.0  ...  50300.0  55700.0  45297.0  27500.0  42750.0   

    348210    352820    357780   363280   950130  
0  55500.0  201465.0  2

In [123]:
# Score a per-stock MultiTaskElasticNetCV: fit on (train_x, train_y), predict
# from test_x, and accumulate the absolute error against test_y.
diff = 0

for code in tqdm(stock_list['종목코드'].values):
    model = MultiTaskElasticNetCV(max_iter=10000)
    y_pred = model.fit(train_x[code], train_y[code]).predict(test_x[code])
    diff += np.abs(y_pred - test_y[code]).sum()

# Total absolute error divided by the total of all actual target prices.
total_actual = np.sum(list(test_y.values()))
print(diff / total_actual)

100%|█████████████████████████████████████████| 370/370 [00:20<00:00, 18.08it/s]

0.03840472573729934





In [121]:
# Baseline: predict every target week as an exact copy of the week before it
# (test_x is used unchanged as the prediction).
diff = 0

for code in tqdm(stock_list['종목코드'].values):
    y_pred = test_x[code]
    diff += np.abs(y_pred - test_y[code]).sum()

# Total absolute error divided by the total of all actual target prices.
total_actual = np.sum(list(test_y.values()))
print(diff / total_actual)

100%|██████████████████████████████████████| 370/370 [00:00<00:00, 54767.52it/s]

0.04396220419697606





In [122]:
# Baseline: predict all five days of the target week as the value in the last
# column (index 4) of the preceding week.
diff = 0

for code in tqdm(stock_list['종목코드'].values):
    size = len(test_x[code])
    # Broadcast the (size, 1) last-column values across five zero-initialised columns.
    y_pred = np.zeros((size, 5)) + test_x[code][:, 4, np.newaxis]
    diff += np.abs(y_pred - test_y[code]).sum()

# Total absolute error divided by the total of all actual target prices.
total_actual = np.sum(list(test_y.values()))
print(diff / total_actual)

100%|██████████████████████████████████████| 370/370 [00:00<00:00, 35981.74it/s]

0.03194013894335032



