In [135]:
import pandas as pd
import numpy as np
import os
import FinanceDataReader as fdr

from sklearn.linear_model import LinearRegression
from tqdm import tqdm

In [136]:
# Location of the CSV that lists the stocks to evaluate.
path = '../../Data'
list_name = 'stock_list.csv'

stock_list = pd.read_csv(os.path.join(path, list_name))

# Left-pad every stock code with zeros to six characters so that codes such as
# 60 become '000060' (presumably read_csv parsed them as integers - verify against the CSV).
codes_as_text = stock_list['종목코드'].map(str)
stock_list['종목코드'] = codes_as_text.str.zfill(6)

# Order the rows by the padded code.
stock_list = stock_list.sort_values(by='종목코드')
stock_list

Unnamed: 0,종목명,종목코드,상장시장
109,메리츠화재,000060,KOSPI
126,하이트진로,000080,KOSPI
67,유한양행,000100,KOSPI
69,CJ대한통운,000120,KOSPI
156,두산,000150,KOSPI
...,...,...,...
328,넥스틴,348210,KOSDAQ
31,하이브,352820,KOSPI
199,솔브레인,357780,KOSDAQ
176,티와이홀딩스,363280,KOSPI


In [137]:
# Inclusive evaluation window, written as YYYYMMDD strings.
start_date = '20200106'
end_date = '20201225'
# start_date = '20210104'
# end_date = '20211231'

# One row per Monday-Friday date in the window (freq='B' does not remove public holidays).
weekday_index = pd.date_range(start=start_date, end=end_date, freq='B')
business_days = pd.DataFrame({'Date': weekday_index})

print(f'HOW MANY "Business_days" : {business_days.shape}')
print(business_days.head())

HOW MANY "Business_days" : (255, 1)
        Date
0 2020-01-06
1 2020-01-07
2 2020-01-08
3 2020-01-09
4 2020-01-10


In [138]:
# Business days are processed in consecutive windows of 20 days (4 weeks x 5 days).
# The last 5 days of every window are collected as the test-target dates.
WINDOW_DAYS = 20
TARGET_START = 15

dates = business_days['Date']
# Ignore the trailing days that do not fill a complete window.
size = len(dates) // WINDOW_DAYS * WINDOW_DAYS

# Pick the column by name and the row by position. The previous
# `business_days.iloc[i][0]` relied on integer keys being treated as positions
# on a label-indexed Series, which is deprecated in pandas 2.x.
test_y_days = [dates.iloc[i] for i in range(size) if i % WINDOW_DAYS >= TARGET_START]
print(test_y_days)

[Timestamp('2020-01-27 00:00:00'), Timestamp('2020-01-28 00:00:00'), Timestamp('2020-01-29 00:00:00'), Timestamp('2020-01-30 00:00:00'), Timestamp('2020-01-31 00:00:00'), Timestamp('2020-02-24 00:00:00'), Timestamp('2020-02-25 00:00:00'), Timestamp('2020-02-26 00:00:00'), Timestamp('2020-02-27 00:00:00'), Timestamp('2020-02-28 00:00:00'), Timestamp('2020-03-23 00:00:00'), Timestamp('2020-03-24 00:00:00'), Timestamp('2020-03-25 00:00:00'), Timestamp('2020-03-26 00:00:00'), Timestamp('2020-03-27 00:00:00'), Timestamp('2020-04-20 00:00:00'), Timestamp('2020-04-21 00:00:00'), Timestamp('2020-04-22 00:00:00'), Timestamp('2020-04-23 00:00:00'), Timestamp('2020-04-24 00:00:00'), Timestamp('2020-05-18 00:00:00'), Timestamp('2020-05-19 00:00:00'), Timestamp('2020-05-20 00:00:00'), Timestamp('2020-05-21 00:00:00'), Timestamp('2020-05-22 00:00:00'), Timestamp('2020-06-15 00:00:00'), Timestamp('2020-06-16 00:00:00'), Timestamp('2020-06-17 00:00:00'), Timestamp('2020-06-18 00:00:00'), Timestamp('20

In [139]:
# Zero-filled table of true prices: one row per test-target date, one column per stock code.
# The date index is moved into a regular 'Day' column and the columns-axis name is cleared.
answer = (
    pd.DataFrame(0, index=test_y_days, columns=stock_list['종목코드'])
    .reset_index(level=0)
    .rename(columns={'index': 'Day'})
    .rename_axis(None, axis=1)
)
print(answer[:20])

          Day  000060  000080  000100  000120  000150  000240  000250  000270  \
0  2020-01-27       0       0       0       0       0       0       0       0   
1  2020-01-28       0       0       0       0       0       0       0       0   
2  2020-01-29       0       0       0       0       0       0       0       0   
3  2020-01-30       0       0       0       0       0       0       0       0   
4  2020-01-31       0       0       0       0       0       0       0       0   
5  2020-02-24       0       0       0       0       0       0       0       0   
6  2020-02-25       0       0       0       0       0       0       0       0   
7  2020-02-26       0       0       0       0       0       0       0       0   
8  2020-02-27       0       0       0       0       0       0       0       0   
9  2020-02-28       0       0       0       0       0       0       0       0   
10 2020-03-23       0       0       0       0       0       0       0       0   
11 2020-03-24       0       

In [140]:
# Per-stock weekly closing-price matrices, keyed by stock code.
# Every value is a 2-D array with one row per 4-week window and one column per weekday.
# Within a window: week 1 -> train_x, week 2 -> train_y, week 3 -> test_x, week 4 -> test_y.
train_x = dict()
train_y = dict()
test_x = dict()
test_y = dict()

for code in tqdm(stock_list['종목코드'].values):
    close = fdr.DataReader(code, start=start_date, end=end_date)[['Close']].reset_index()

    # Align the quotes to the business-day calendar. Dates without a quote take the
    # previous close, or the next available one at the very start of the series.
    close = pd.merge(business_days, close, how='outer')
    close['Close'] = close['Close'].ffill().bfill()

    close['weekday'] = close['Date'].dt.weekday
    # Label weeks with the ISO year (%G) that belongs to the ISO week number (%V).
    # Using the calendar year (%Y) mislabels the days around New Year, e.g. 2021-01-01
    # would become '2021-53' and sort after every other week of 2021.
    close['weeknum'] = close['Date'].dt.strftime('%G-%V')

    # One row per week, one column per weekday (0 = Monday).
    weekly = pd.pivot_table(data=close, values='Close', columns='weekday', index='weeknum')

    # Keep only complete 4-week windows.
    usable_weeks = len(weekly) // 4 * 4
    weekly = weekly.iloc[:usable_weeks]

    # A missing cell means a week lacks one of its weekdays. Stop with an explicit error
    # instead of silently leaving this and all remaining stocks unprocessed.
    nan_count = int(weekly.isna().to_numpy().sum())
    if nan_count > 0:
        print(nan_count)
        print(weekly)
        raise ValueError(f'{code}: {nan_count} missing weekly close value(s) after pivoting')

    train_x[code] = weekly.iloc[::4, :].to_numpy()
    train_y[code] = weekly.iloc[1::4, :].to_numpy()
    test_x[code] = weekly.iloc[2::4, :].to_numpy()
    test_y[code] = weekly.iloc[3::4, :].to_numpy()

    # Store the true target-week prices, flattened row by row, in this stock's column.
    answer[code] = test_y[code].flatten()

100%|█████████████████████████████████████████| 370/370 [00:59<00:00,  6.22it/s]


In [141]:
# Show the first five rows of the filled-in table of true prices.
print(answer.head(5))

         Day   000060   000080   000100    000120   000150   000240   000250  \
0 2020-01-27  15850.0  32650.0  41578.0  142000.0  67600.0  13750.0  31400.0   
1 2020-01-28  15350.0  31450.0  41669.0  136000.0  64900.0  13350.0  30400.0   
2 2020-01-29  15400.0  31850.0  41211.0  139000.0  66000.0  13250.0  30450.0   
3 2020-01-30  15450.0  31550.0  40754.0  143000.0  65000.0  13050.0  29000.0   
4 2020-01-31  15700.0  30000.0  39930.0  146000.0  62700.0  13250.0  28900.0   

    000270   000660  ...   330860  336260   336370   347860   348150   348210  \
0  43400.0  98700.0  ...  38700.0  8321.0  26407.0  26000.0  31300.0  23881.0   
1  42700.0  96300.0  ...  38700.0  7948.0  27612.0  26000.0  31300.0  23881.0   
2  42550.0  97900.0  ...  38700.0  7948.0  27178.0  26000.0  31300.0  23881.0   
3  41950.0  94000.0  ...  38700.0  7771.0  27708.0  26000.0  31300.0  23881.0   
4  40900.0  93500.0  ...  38700.0  7369.0  25154.0  26000.0  31300.0  23881.0   

     352820    357780   363280  

In [142]:
# Linear regression per stock: fit on (train_x, train_y), predict from test_x, and report
# the total absolute error divided by the total of the true test_y prices.
diff = 0

for code in tqdm(stock_list['종목코드'].values):
    model = LinearRegression().fit(train_x[code], train_y[code])
    y_pred = model.predict(test_x[code])
    diff += np.abs(y_pred - test_y[code]).sum()

# Sum of every true price across all stocks (the normalising denominator).
total_true = np.asarray(list(test_y.values())).sum()
print(diff / total_true)

100%|███████████████████████████████████████| 370/370 [00:00<00:00, 2622.29it/s]

0.062157975989307675





In [143]:
# Baseline: use the test_x week unchanged as the prediction for the test_y week
# (each weekday is predicted by the same weekday's price from test_x).
diff = 0

for code in tqdm(stock_list['종목코드'].values):
    y_pred = test_x[code]
    diff += np.abs(y_pred - test_y[code]).sum()

# Sum of every true price across all stocks (the normalising denominator).
total_true = np.asarray(list(test_y.values())).sum()
print(diff / total_true)

100%|██████████████████████████████████████| 370/370 [00:00<00:00, 93543.85it/s]

0.05234513386165297





In [144]:
# Baseline: predict all 5 days of the test_y week with the value in column 4 of the
# test_x week (the last weekday column).
diff = 0

for code in tqdm(stock_list['종목코드'].values):
    last_column = test_x[code][:, 4:5]       # kept 2-D: one value per row
    y_pred = np.tile(last_column, (1, 5))    # repeat that value across the 5 target days
    diff += np.abs(y_pred - test_y[code]).sum()

# Sum of every true price across all stocks (the normalising denominator).
total_true = np.asarray(list(test_y.values())).sum()
print(diff / total_true)

100%|██████████████████████████████████████| 370/370 [00:00<00:00, 53230.86it/s]

0.03956833427963391



