In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pykrx
from pykrx import stock
from datetime import datetime
import datetime as dt
import os
from scipy.cluster.vq import vq, kmeans, whiten
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
import random
import json
from collections import OrderedDict
import pickle

# 전체 종목 List 반환

In [2]:
# Current date as YYYYMMDD; not referenced again by the visible cells.
today = datetime.now().strftime("%Y%m%d")

# Ticker codes pykrx returns for market="KOSPI" on the hard-coded date 20201111.
code_list = stock.get_market_ticker_list("20201111", market="KOSPI")

# Display name for every ticker, in the same order as code_list.
name_list = np.array([stock.get_market_ticker_name(code) for code in code_list])

# Two-column lookup table (ticker code, display name) used by the cells below.
stock_list = pd.DataFrame({"Code": code_list, "Name": name_list})

1001 20201111 ST 0


In [3]:
# Print the ticker/name lookup table (pandas abbreviates long frames when printed).
print(stock_list)

       Code     Name
0    095570   AJ네트웍스
1    006840    AK홀딩스
2    027410      BGF
3    282330   BGF리테일
4    138930  BNK금융지주
..      ...      ...
792  079980      휴비스
793  005010      휴스틸
794  069260      휴켐스
795  000540     흥국화재
796  003280     흥아해운

[797 rows x 2 columns]


# 종목당 기간별 Price값 얻어오기

In [None]:
def get_price_by_name(name, start_date="20171101", end_date="20201101"):
    """Fetch daily OHLCV rows for the stock whose display name is ``name``.

    Args:
        name: Value to look up in ``stock_list['Name']``.
        start_date: First day of the range as ``YYYYMMDD``. Defaults to the
            date the original implementation hard-coded.
        end_date: Last day of the range as ``YYYYMMDD``. Defaults to the
            date the original implementation hard-coded.

    Returns:
        Whatever ``stock.get_market_ohlcv_by_date`` returns for the matching
        ticker.

    Raises:
        KeyError: If ``name`` does not occur in ``stock_list['Name']``.
    """
    matches = stock_list.loc[stock_list['Name'] == name, 'Code']
    if matches.empty:
        raise KeyError(f"unknown stock name: {name!r}")
    # pykrx is called with a single ticker everywhere else in this notebook;
    # the original passed the whole filtered Series here. If several rows
    # share the name, the first one wins.
    ticker = matches.iloc[0]
    return stock.get_market_ohlcv_by_date(start_date, end_date, ticker)

# 모든 상장종목의 차트를 아래 경로에 저장!!

In [None]:
# Output directory for the per-stock chart images.
Path = 'C:\\Chart\\'
# exist_ok=True replaces the separate isdir() check and also creates parents.
os.makedirs(Path, exist_ok=True)

for idx, row in stock_list.iterrows():
    price_data = stock.get_market_ohlcv_by_date("20191101", "20201101", row.Code)
    fig, ax = plt.subplots(figsize=(8, 4.5))
    ax.plot(price_data['종가'])
    ax.set_title(row.Name)
    fig.savefig(Path + row.Name + ".png")
    # One figure is created per stock; without closing it every figure stays
    # alive until the kernel exits. Closing also means the figures are only
    # written to disk, not rendered in the notebook.
    plt.close(fig)

# 정해진 기간만큼의 주가 데이터를 DataFrame형태로 저장

In [None]:
def get_price_data_in_period(start_date,end_date,result_period):
    """Collect a normalised closing-price chart and follow-up window per stock.

    For every row of ``stock_list`` the closing prices between ``start_date``
    and ``end_date`` are fetched. The first non-empty series fixes the
    reference length; stocks whose series has a different length are skipped
    so that all charts line up.

    Args:
        start_date: First day of the chart window, ``YYYYMMDD``.
        end_date: Last day of the chart window, ``YYYYMMDD``.
        result_period: Number of calendar days after ``end_date`` covered by
            the follow-up window.

    Returns:
        DataFrame with columns ``'종목명'`` (stock name), ``'차트'`` (closing
        prices divided by the chart window's maximum close) and ``'결과'``
        (closing prices from ``end_date`` to ``end_date + result_period``,
        divided by the same maximum).
    """
    chart_list = []
    rst_list = []
    chart_name_list = []
    # Set from the first non-empty series instead of from index label 0, so
    # the function also works when stock_list does not start at label 0.
    normal_len = None

    # The follow-up window is identical for every stock; compute it once.
    rst_date = datetime.strptime(end_date, '%Y%m%d') + dt.timedelta(days=result_period)
    rst_date = rst_date.strftime("%Y%m%d")

    for idx, row in stock_list.iterrows():
        price_data = stock.get_market_ohlcv_by_date(start_date, end_date, row.Code)
        closing = price_data['종가']
        if len(closing) == 0:
            # Nothing to normalise, and an empty series must not become the reference.
            continue
        if normal_len is None:
            normal_len = len(closing)
        if len(closing) != normal_len:
            continue

        # Normalise by the chart window's maximum close to help training.
        peak = np.max(closing)
        chart_name_list.append(row.Name)
        chart_list.append(np.array(closing / peak))

        # The follow-up request starts on end_date itself, as in the original.
        result_data = stock.get_market_ohlcv_by_date(end_date, rst_date, row.Code)
        rst_list.append(np.array(result_data['종가'] / peak))

    chart_df = pd.DataFrame()
    chart_df['종목명'] = np.array(chart_name_list)
    chart_df['차트'] = chart_list
    chart_df['결과'] = rst_list
    return chart_df

In [None]:
def get_price_data_with_period(end_date,ndays,result_days):
    """Build per-stock chart and outcome windows around ``end_date``.

    For each row of ``stock_list`` the last ``ndays`` closing prices up to
    ``end_date`` form the chart, and the first ``result_days`` rows after
    ``end_date`` form the outcome. Both data requests span twice the requested
    number of calendar days. Stocks with fewer than ``ndays`` rows before or
    fewer than ``result_days`` rows after are skipped. Each stock name is
    printed as it is processed.

    Args:
        end_date: Last day of the chart window, ``YYYYMMDD``.
        ndays: Number of rows kept for the chart.
        result_days: Number of rows kept for the outcome.

    Returns:
        DataFrame with columns ``'종목명'``, ``'차트'``, ``'종가'``, ``'고가'``
        (series divided by the chart's maximum close) and ``'차트(원본)'``,
        ``'종가(원본)'``, ``'고가(원본)'`` (the same series unscaled).
    """
    date_fmt = "%Y%m%d"
    anchor = datetime.strptime(end_date, '%Y%m%d')

    # Both request windows depend only on the arguments, not on the stock.
    history_start = (anchor - dt.timedelta(days=ndays * 2)).strftime(date_fmt)
    outcome_first_day = anchor + dt.timedelta(days=1)
    outcome_end = (outcome_first_day + dt.timedelta(days=result_days * 2)).strftime(date_fmt)
    outcome_start = outcome_first_day.strftime(date_fmt)

    names = []
    charts, closes, highs = [], [], []
    raw_charts, raw_closes, raw_highs = [], [], []

    for _, row in stock_list.iterrows():
        print(row.Name)

        history = stock.get_market_ohlcv_by_date(history_start, end_date, row.Code)
        if len(history) < ndays:
            continue

        raw_chart = np.array(history['종가'])[-ndays:]
        # Every normalised series of this stock is scaled by the chart's maximum close.
        peak = np.max(raw_chart)

        outcome = stock.get_market_ohlcv_by_date(outcome_start, outcome_end, row.Code)
        if len(outcome) < result_days:
            continue

        raw_close = np.array(outcome['종가'])[:result_days]
        raw_high = np.array(outcome['고가'])[:result_days]

        names.append(row.Name)
        charts.append(raw_chart / peak)
        closes.append(raw_close / peak)
        highs.append(raw_high / peak)
        raw_charts.append(raw_chart)
        raw_closes.append(raw_close)
        raw_highs.append(raw_high)

    chart_df = pd.DataFrame()
    chart_df['종목명'] = np.array(names)
    for column, values in (
        ('차트', charts),
        ('종가', closes),
        ('고가', highs),
        ('차트(원본)', raw_charts),
        ('종가(원본)', raw_closes),
        ('고가(원본)', raw_highs),
    ):
        chart_df[column] = values
    return chart_df

In [None]:
def get_price_data_with_period(start_data,end_date):
    """Build per-stock normalised close/high series for a fixed date range.

    NOTE(review): this function has the same name as the three-argument
    ``get_price_data_with_period(end_date, ndays, result_days)`` defined in an
    earlier cell. Once this cell runs, the earlier version is replaced and
    three-argument calls raise ``TypeError``. Consider renaming one of them.

    The original body read several names that were never assigned
    (``start_date``, ``norm_x``, ``y_closing_norm``, ``y_highest_norm``) and
    so always raised ``NameError``. The normalisation below, dividing by the
    maximum close of the range, is an assumption modelled on the sibling
    functions in this notebook; confirm that it matches the intent.

    Args:
        start_data: First day of the range, ``YYYYMMDD`` (parameter name kept
            as originally spelled).
        end_date: Last day of the range, ``YYYYMMDD``.

    Returns:
        DataFrame with columns ``'종목명'`` (stock name), ``'차트'`` and
        ``'종가'`` (closing prices divided by the range's maximum close) and
        ``'고가'`` (daily highs divided by the same maximum). Stocks with no
        rows in the range are skipped.
    """
    chart_list = []
    highest_list = []
    closing_list = []
    chart_name_list = []
    chart_df = pd.DataFrame()

    for idx, row in stock_list.iterrows():
        price_data = stock.get_market_ohlcv_by_date(start_data, end_date, row.Code)
        if len(price_data) == 0:
            # np.max would raise on an empty array.
            continue

        x_closing = np.array(price_data['종가'])
        x_highest = np.array(price_data['고가'])
        peak = np.max(x_closing)

        norm_x = x_closing / peak
        chart_name_list.append(row.Name)
        chart_list.append(norm_x)
        # Separate copy so the two columns do not share one array object.
        closing_list.append(norm_x.copy())
        highest_list.append(x_highest / peak)

    chart_df['종목명'] = np.array(chart_name_list)
    chart_df['차트'] = chart_list
    chart_df['종가'] = closing_list
    chart_df['고가'] = highest_list
    return chart_df

# Random한 학습 데이터 생성

In [None]:
def generate_train_data(n,left_date,right_date,train_len,result_len,code_list):
    """Draw ``n`` random (chart, outcome) samples from random stocks and dates.

    Each attempt picks a random start day in ``[left_date, right_date)`` and a
    random ticker, then requests ``(train_len + result_len) * 2`` calendar days
    of prices. Attempts that return fewer than ``train_len + result_len`` rows
    are discarded and retried. All series are divided by the maximum closing
    price of the first ``train_len`` rows.

    Sampling uses the global ``random`` module state; seed it beforehand for
    reproducible draws.

    Args:
        n: Number of samples to produce. Values <= 0 yield empty arrays.
        left_date: Earliest window start, ``YYYYMMDD``.
        right_date: Exclusive upper bound for the window start, ``YYYYMMDD``.
        train_len: Number of rows in the chart part of a sample.
        result_len: Number of rows in the outcome part of a sample.
        code_list: Sequence (list, array or Series) of ticker codes.

    Returns:
        Tuple ``(x_list, y_closing_list, y_highest_list)`` of numpy arrays:
        normalised closes of the chart rows, and normalised closes and highs
        of the outcome rows.

    Raises:
        ValueError: If ``right_date`` is not later than ``left_date`` or
            ``code_list`` is empty (when ``n`` > 0).
    """
    if n <= 0:
        # The original looped forever for negative n; return the same empty
        # arrays it produced for n == 0.
        return np.array([]), np.array([]), np.array([])

    left = datetime.strptime(left_date, '%Y%m%d')
    duration = (datetime.strptime(right_date, '%Y%m%d') - left).days
    if duration <= 0:
        raise ValueError("right_date must be later than left_date")

    # Materialise once so the pick is positional even when a pandas Series
    # with a non-default index is passed.
    codes = list(code_list)
    if not codes:
        raise ValueError("code_list must not be empty")

    sample_len = train_len + result_len
    window = dt.timedelta(days=sample_len * 2)

    x_list = []
    y_closing_list = []
    y_highest_list = []

    # NOTE: there is still no retry cap; if no ticker ever yields sample_len
    # rows, this loop does not terminate.
    while len(x_list) < n:
        rand_offset = random.randrange(0, duration)
        window_start = left + dt.timedelta(days=rand_offset)
        start_date = window_start.strftime("%Y%m%d")
        end_date = (window_start + window).strftime("%Y%m%d")

        rand_code = codes[random.randrange(0, len(codes))]
        price_data = stock.get_market_ohlcv_by_date(start_date, end_date, rand_code)

        if len(price_data) < sample_len:
            continue

        closing = np.asarray(price_data['종가'])
        highest = np.asarray(price_data['고가'])
        # Scale by the chart part's maximum close only, so no information
        # from the outcome rows enters the normalisation.
        peak = np.max(closing[:train_len])

        x_list.append(closing[:train_len] / peak)
        y_closing_list.append(closing[train_len:sample_len] / peak)
        y_highest_list.append(highest[train_len:sample_len] / peak)

    return np.array(x_list), np.array(y_closing_list), np.array(y_highest_list)

In [None]:
# Draw 10000 random samples (20 chart rows, 5 outcome rows each) whose windows
# start between 20180101 and 20201101.
x_list, y_closing_list, y_highest_list = generate_train_data(
    10000,
    "20180101",
    "20201101",
    20,
    5,
    stock_list["Code"],
)

# K-Means Clustering을 통해 차트를 군집화

In [None]:
# Cluster the normalised charts into 8 groups and inspect cluster 2.
# NOTE(review): `chart_df` is not assigned by any earlier cell shown in this
# notebook; confirm which cell is meant to build it before this one runs.
# NOTE(review): this rebinds `kmeans`, which the import cell also binds to
# scipy.cluster.vq.kmeans.
# A fixed random_state keeps cluster ids stable between runs; without it the
# contents of "cluster 2" change on every execution.
kmeans = KMeans(n_clusters=8, random_state=0)
kmeans.fit(list(chart_df['차트']))

in_cluster_2 = kmeans.labels_ == 2
filter_2 = chart_df['차트'][in_cluster_2]
filter_2_name = chart_df['종목명'][in_cluster_2]

print(filter_2_name)
for items in filter_2:
    fig, ax = plt.subplots()
    ax.plot(items)
    plt.show()
    # Release the figure once it has been rendered.
    plt.close(fig)

# 1D CNN을 통한 주가의 향방 Classification

In [None]:
# Shape of the chart array stored in the row labelled 0.
_first_chart = chart_df['차트'][0]
print(np.shape(_first_chart))

# Random Forest를 통한 주가의 향방 Classification

In [None]:
# Take-profit target: a sample is labelled 1 when the highest high of its
# outcome rows reaches (1 + goal) times the last close of its chart rows.
goal = 0.03

# generate_train_data takes no `goal` argument and returns the normalised
# series rather than labels, so the labels are derived here. The original
# seven-argument call raised TypeError.
x_list, _y_closing, y_raw_list = generate_train_data(
    2000, "20190101", "20201101", 20, 5, stock_list["Code"]
)
y_list = (np.max(y_raw_list, axis=1) >= (1 + goal) * x_list[:, -1]).astype(int)

rf = RandomForestClassifier(n_estimators=100, oob_score=True, random_state=123456)
rf.fit(x_list, y_list)

In [None]:
# Evaluation frame: 20 chart rows ending 20201101 and 5 outcome rows after it.
# The original passed `goal` as an extra first argument, which matches no
# definition in this notebook, and read a '결과' column that this function
# never produces.
chart_df = get_price_data_with_period('20201101', 20, 5)
test_x = np.array(chart_df['차트'].tolist())
test_y = rf.predict(test_x)

# True label: did the high of the outcome rows reach the take-profit level
# relative to the last chart close?
real_y = np.array([
    int(np.max(high) >= (1 + goal) * chart[-1])
    for chart, high in zip(chart_df['차트'], chart_df['고가'])
])

In [None]:
# Average gross multiple over all stocks the model would have bought.
t_idx = np.flatnonzero(test_y == 1)
purchase = real_y[t_idx]
expect = 0
for idx in t_idx:
    if real_y[idx] == 1:
        # Target reached: the position is closed at the take-profit level.
        expect += (1 + goal)
    else:
        # Target missed: the position is closed at the last outcome close.
        # Both series are scaled by the chart maximum, so divide by the last
        # chart close to get the return relative to the entry price.
        realised = chart_df['종가'][idx][-1] / chart_df['차트'][idx][-1]
        expect += realised
        print(realised)

# With no positive prediction there is nothing to average; the original
# divided by zero here.
expect = expect / len(t_idx) if len(t_idx) > 0 else float('nan')

print(expect)

In [None]:
# Share of evaluation rows whose true label is 1. The original divided by
# len(is_matched), a name that is only assigned in a later cell.
print(np.mean(real_y == 1))

In [None]:
# scikit-learn metrics take (y_true, y_pred). With the arguments swapped the
# accuracy is unchanged, but the report's precision and recall are exchanged.
print("Accuracy is: ", accuracy_score(real_y, test_y))
print(classification_report(real_y, test_y))

In [None]:
# Count how many training samples the forest reproduces.
# Note that this overwrites test_y / real_y with training-set values.
test_y = rf.predict(x_list)
real_y = y_list
is_matched = (test_y == real_y)
print(np.count_nonzero(is_matched))

In [None]:
# score() needs the true labels as well as the features. They are rebuilt from
# chart_df here because an earlier cell rebinds real_y to the training labels.
test_labels = np.array([
    int(np.max(high) >= (1 + goal) * chart[-1])
    for chart, high in zip(chart_df['차트'], chart_df['고가'])
])
rf.score(test_x, test_labels)

In [None]:
# Training pool for the backtest: 15000 random samples (20 chart rows, 5
# outcome rows each) whose windows start between 20160101 and 20171222.
x_list, y_closing_list, y_highest_list = generate_train_data(
    15000,
    "20160101",
    "20171222",
    20,
    5,
    stock_list["Code"],
)

In [None]:
# Evaluation frame for the window ending 20171229 (20 chart rows, 5 outcome
# rows). The original passed a leading 0.07 argument that matches no
# definition of get_price_data_with_period in this notebook.
chart_df = get_price_data_with_period('20171229', 20, 5)
print(chart_df)

# 백테스팅을 위한 주간 데이터 수집

In [None]:
# Weekly backtest over 40 consecutive weeks and 30 take-profit levels.
#
# The training labels depend only on the take-profit level, not on the week,
# and the forest is seeded. The original therefore retrained identical forests
# for every week (40 x 30 fits). Here the weekly frames are collected first
# and each level's forest is fitted once and applied to every week.
start_date = '20171229'
candidate_goals = [0.01 + (0.005 * i) for i in range(0, 30)]

# --- 1. One evaluation frame per week -------------------------------------
start_date_list = []
chart_df_list = []
for _ in range(0, 40):
    start_date = datetime.strptime(start_date, '%Y%m%d') + dt.timedelta(days=7)
    start_date = start_date.strftime("%Y%m%d")
    chart_df = get_price_data_with_period(start_date, 20, 5)
    if len(chart_df) == 0:
        # No usable stock this week. Skipping it keeps every week's result
        # the same length, so the matrix stays rectangular.
        continue
    start_date_list.append(start_date)
    chart_df_list.append(chart_df)

# Per-week quantities that do not depend on the take-profit level.
weekly_inputs = []
for chart_df in chart_df_list:
    test_x = np.array(chart_df['차트'].tolist())
    last_close = test_x[:, -1]
    peak_high = np.array([np.max(item) for item in chart_df['고가']])
    # Gross multiple when the target is missed: last outcome close relative
    # to the entry price (the last chart close).
    miss_return = np.array([item[-1] for item in chart_df['종가']]) / last_close
    weekly_inputs.append((test_x, last_close, peak_high, miss_return))

# Training-side quantities shared by all take-profit levels.
train_last_close = np.array([item[-1] for item in x_list])
train_peak_high = np.array([np.max(item) for item in y_highest_list])

# --- 2. One forest per take-profit level, applied to every week -----------
expect_by_goal = []
for goal in candidate_goals:
    y_list = (train_peak_high >= (1 + goal) * train_last_close).astype(int)

    rf = RandomForestClassifier(n_estimators=100, oob_score=True, random_state=123456)
    rf.fit(x_list, y_list)

    expect_per_week = []
    for test_x, last_close, peak_high, miss_return in weekly_inputs:
        test_y = rf.predict(test_x)
        real_y = (peak_high >= (1 + goal) * last_close).astype(int)

        bought = test_y == 1
        if not bought.any():
            # Nothing is bought, so the capital is unchanged for the week.
            # The original divided by zero in this case.
            expect = 1.0
        else:
            outcome = np.where(real_y[bought] == 1, 1 + goal, miss_return[bought])
            expect = float(np.mean(outcome))
        expect_per_week.append(expect)
    expect_by_goal.append(expect_per_week)

# --- 3. Per-week curves and the result matrix used by later cells ---------
goal_list = [goal * 100 for goal in candidate_goals]
expect_matrix = np.array(expect_by_goal)

result_list = []
for week, week_start in enumerate(start_date_list):
    expect_list = list(expect_matrix[:, week])
    plt.plot(goal_list, expect_list)

    result_list.append(np.array(expect_list))
    plt.title(str(week_start))
    plt.show()

# 백테스팅의 익절 라인별 수익률 계산

In [None]:
# Swap the axes of result_list so that each row becomes one column.
# Re-running this cell swaps them back, so run it exactly once.
result_list = np.asarray(result_list).T

In [None]:
# Cost subtracted from every gross multiple, per the original note:
# 0.3% tax + 0.015% commission (Kiwoom Securities MTS rate).
TRADING_COST = 0.00315

# Compound the net multiples along each row of result_list. The per-step
# debug print from the original is dropped because it emitted one line per
# matrix element.
s_list = [
    float(np.prod(np.asarray(items, dtype=float) - TRADING_COST))
    for items in result_list
]

plt.plot(goal_list, s_list)
plt.show()

In [None]:
# NOTE(review): `start_date_list` and `chart_df_list` are not assigned by any
# cell shown in this notebook; confirm which cell is meant to build them.
print(start_date_list)
# to_csv returns None when a path is given, so wrapping it in print() only
# printed "None".
chart_df_list[0].to_csv('C:\\Chart\\20171229.csv')