# 목표

#### 1. 기본적으로 주가를 예측하는 것을 목표로 한다.
#### 2. 주가를 예측하기 위해, 주가 데이터만 사용하는 것이 아니라 기업의 재무상태 또한 고려하려 한다.
#### 3. 여기서 문제점은 주가의 종가는 매일 업데이트 되지만 재무상태는 1년에 한 번 공시된다는 것이다. 
#### 4. 물론 분기별 재무보고서가 있지만 수정되는 경우도 많고, 정확하지 않은 경우가 많아 사업계획서를 기준으로 한다.

## Imports

#### 데이터 분석 라이브러리

In [None]:
import pandas as pd
import numpy as np

#### 시각화 라이브러리

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

#### 머신러닝용 라이브러리

In [3]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import accuracy_score

#### 기타 필요한 라이브러리

In [4]:
import warnings
warnings.filterwarnings('ignore')
from fqtoolkit import dgToDf as dtd
import statsmodels.api as sm
import riskfolio as rp

## Data 불러오기

O 데이터는 기본적으로 데이터 가이드에서 재공되는 주가정보와 재무 정보를 사용한다.    
   
- 기업 : 코스피 시총 상위 50개 기업 + 코스닥 시총 상위 50개 기업 
- 기간 : 2020.01 ~ 2022.05  
- 기준 : 월말, 1~12월 data same
- 단위 : KRW(원)


O 재무 정보는 재무재표를 기준으로 하고, Feature로 **Asset, Liability, Sale, Income, Price**를 사용한다.

- assets : 총자산 
- liability : 총부채
- sale : 매출액
- income : 영업이익
- price : 수정주가(배당반영)

In [5]:
assets, liability, sale, income, price = [item[1] for item in dtd("data2.xlsx", "Sheet1", 5).items()]

- 문제는 앞서 말한 주가 정보와 재무재표의 기간이 맞지 않는다는 것인데, 주가 데이터는 당월 종가를 기준으로 하고, 재무제표는 당년 6월에 발표된다고 가정하였다. 
- 2020년 1월에 투자를 한다고 가정하면, 2019년 6월에 발표된 재무제표를 기준으로 판단할 수 밖에 없기 때문에 price를 shift 해줘 주가 데이터와 재무 정보의 시작점을 같게 한다.

In [6]:
assets0 = assets.pct_change()
liability0 = liability.pct_change()

In [7]:
price.shift(12);

In [8]:
price = price.dropna(axis=1)
returns = price.shift(12+6).pct_change()

returns

Unnamed: 0_level_0,삼성전자,SK하이닉스,삼성SDI,현대차,카카오,기아,POSCO홀딩스,현대모비스,한국전력,SK텔레콤,...,기업은행,롯데케미칼,한국조선해양,한온시스템,HLB,CJ ENM,동진쎄미켐,동화기업,네이처셀,주성엔지니어링
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-01-31,,,,,,,,,,,...,,,,,,,,,,
2000-02-29,,,,,,,,,,,...,,,,,,,,,,
2000-03-31,,,,,,,,,,,...,,,,,,,,,,
2000-04-30,,,,,,,,,,,...,,,,,,,,,,
2000-05-31,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-01-31,0.096598,-0.027025,0.093534,0.294771,0.284103,0.260543,0.112072,0.070313,-0.020492,0.045026,...,0.003782,0.008978,0.020595,0.099343,-0.097196,0.043517,0.122576,0.483533,0.048673,0.248111
2022-02-28,-0.067365,-0.092993,0.138365,0.395263,0.184868,0.050754,-0.046515,0.090025,0.073279,0.124718,...,-0.002512,0.124630,-0.033632,0.295209,0.038276,0.031274,0.050619,-0.092836,0.123418,-0.064293
2022-03-31,0.084328,0.118508,-0.041989,0.011329,-0.104420,0.104813,0.070547,0.026787,-0.004848,-0.040314,...,-0.009931,0.036941,-0.040603,-0.037232,0.194703,0.198822,-0.106929,-0.083426,-0.075117,0.064243
2022-04-30,-0.027485,-0.048815,0.019608,-0.078429,-0.094652,0.076757,0.061227,-0.019565,-0.022049,-0.100839,...,0.020062,0.185751,-0.051995,-0.093148,-0.112834,-0.075896,-0.084297,-0.145631,-0.074112,-0.001448


### Clustering

In [12]:
X = pd.DataFrame()
for i in range(0,len(assets.index),12):
    X = X.append(assets.iloc[i])
X

Unnamed: 0,삼성전자,LG에너지솔루션,SK하이닉스,삼성바이오로직스,NAVER,삼성SDI,현대차,카카오,LG화학,기아,...,네이처셀,주성엔지니어링,SFA반도체,덕산네오룩스,메가스터디교육,NICE평가정보,피엔티,케어젠,와이지엔터테인먼트,에이비엘바이오
2000-01-31,46421500000.0,,21282870000.0,,36402020.0,5663565000.0,31987210000.0,91971350.0,,8828701000.0,...,20534862.0,200839000.0,22005400.0,,,55336160.0,,,,
2001-01-31,52114880000.0,,14413340000.0,,46006520.0,5630804000.0,39872300000.0,145093400.0,3636326000.0,9812333000.0,...,16418794.0,213811300.0,36239720.0,,,73681400.0,,,,
2002-01-31,64954990000.0,,11159630000.0,,125207900.0,5725090000.0,45945470000.0,162588400.0,4618144000.0,10815200000.0,...,15552957.0,140145100.0,42084610.0,,,77608070.0,,,,
2003-01-31,68041740000.0,,8603848000.0,,200686700.0,6409798000.0,53426820000.0,212188800.0,5835172000.0,13513320000.0,...,8762283.0,117904500.0,78085480.0,,,88313960.0,,,,
2004-01-31,69004620000.0,,8846875000.0,,371348300.0,6722372000.0,58023130000.0,405195700.0,6897243000.0,14399870000.0,...,7919411.0,160394200.0,97280730.0,,,88110360.0,,1268583.0,,
2005-01-31,74461800000.0,,10697290000.0,,405577000.0,6700613000.0,66078770000.0,435389800.0,7291334000.0,16034490000.0,...,9562844.0,186300200.0,149899500.0,,,102288200.0,,3352043.0,,
2006-01-31,81366210000.0,,15384910000.0,,582258800.0,6888625000.0,70709480000.0,346731000.0,7638435000.0,17342940000.0,...,8779267.0,206571200.0,165374100.0,,,104933500.0,8474074.0,3826036.0,,
2007-01-31,93375140000.0,,17718300000.0,,856184500.0,7114692000.0,83847530000.0,220874900.0,8550754000.0,19461000000.0,...,6456017.0,274483800.0,376674700.0,,,116580300.0,11298160.0,5878208.0,10914310.0,
2008-01-31,105300600000.0,,16576220000.0,,1219211000.0,6768821000.0,103205800000.0,297382400.0,9735573000.0,25583550000.0,...,40045643.0,308759500.0,388982100.0,,,131710300.0,19455290.0,7487283.0,17392750.0,
2009-01-31,112179800000.0,,16303530000.0,,1605435000.0,7147735000.0,102324900000.0,307879600.0,10530590000.0,25962880000.0,...,28582445.0,367542800.0,413814600.0,,,,26952060.0,14338150.0,20277730.0,


In [13]:
ax,an = plot_dendrogram(returns=X, codependence='spearman',
                        linkage='ward', k=None, max_k=5,
                        leaf_order=True, ax=None)
ax;

NameError: name 'plot_dendrogram' is not defined

In [14]:
color = list(set(an['leaves_color_list']))
new_df = pd.DataFrame()
new_df['CODE'] = an['leaves']
new_df['SORTED'] = an['leaves_color_list']
new_df = new_df.sort_values(by=['CODE'])
new_df['COMPANY'] = X.columns
li = []
for i in color:
    li.append(new_df['COMPANY'][new_df['SORTED']==i])

len(li[3])

NameError: name 'an' is not defined

### Rolling
- Rolling 기간은 122달로 설정하였다.

In [None]:
rolling_period = 122
results = {}

for col in returns.columns:
    reSeries = {}
    for n in range(1, returns.shape[0]-rolling_period):
        
        temp = pd.DataFrame({"asset":assets0[col].iloc[n:n+rolling_period-1],
                            "liability":liability0[col].iloc[n:n+rolling_period-1],
                            "sale":sale[col].iloc[n:n+rolling_period-1],
                            "income":income[col].iloc[n:n+rolling_period-1],
                             "re0": returns[col].iloc[n-1:n+rolling_period-2],
                            "re": returns[col].iloc[n:n+rolling_period-1]})
        print(temp)
        temp = temp.dropna()
        future = returns[col].iloc[n+120]
        
        if temp.shape[0] < 50:
            continue
            
        model = DecisionTreeRegressor(max_depth=6)#
        model.fit(temp.drop("re", axis=1).iloc[:-1,:], temp["re"].iloc[:-1])
        
        r = model.predict(temp.drop("re", axis=1).iloc[[-1], :])
        
        reSeries[temp.index[-1]] = temp["re"].iloc[-1]-r[0]
        
    if len(reSeries) > 0:
        results[col] = pd.Series(reSeries)

In [None]:
list(li[0])

In [None]:
r2 = []
acc = []

for n in range(12, 48):
    results = pd.DataFrame(results)
    signal = (results).mean(axis=1)

    signal = (signal-signal.shift(1).rolling(n).mean())/signal.shift(1).rolling(n).std() #변경 말것!
    temp = pd.DataFrame({"returns":returns.shift(-1).mean(axis=1), "result":signal}).dropna()

    test = temp.iloc[int(temp.shape[0]/4):2*int(temp.shape[0]/4), :]
    test0 = temp.iloc[2*int(temp.shape[0]/4):, :]
    
    training = temp.iloc[:int(temp.shape[0]/4), :]

    mu = training.result.mean()
    sd = training.result.std()

    test.result = (test.result-mu)/sd
    test0.result = (test0.result-mu)/sd
    training.result = (training.result - mu) /sd

    test["returns+"] = np.where(test["returns"] > 0, test["returns"], 0)
    test0["returns+"] = np.where(test0["returns"] > 0, test0["returns"], 0)

    test["port"] = test["returns"] * np.where(-test.result>2, 1, 0)
    test0["port"] = test0["returns"] *np.where(-test0.result>2, 1, 0)
    
    acc.append(accuracy_score(np.where(test.returns>0, 1, 0), np.where(-test.result>0, 1, 0)))

    from sklearn.metrics import accuracy_score

    print(f"{np.corrcoef(test.returns, -test.result)[1,0 ] ** 2 * 100:.4f}%")
    r2.append(np.corrcoef(test.returns, -test.result)[1,0 ] ** 2)

In [None]:
pd.Series(r2, index=range(12, 48)).argmax()+12

In [None]:
plt.plot(range(12, 48), r2)

In [None]:
n = 33

results = pd.DataFrame(results)

signal = (results).mean(axis=1)
#
signal = (signal-signal.shift(1).rolling(n).mean())/signal.shift(1).rolling(n).std() #변경 말것!
temp = pd.DataFrame({"returns":returns.shift(-1).mean(axis=1), "result":signal}).dropna()
test = temp.iloc[int(temp.shape[0]/4):2*int(temp.shape[0]/4), :]
test0 = temp.iloc[2*int(temp.shape[0]/4):, :]

mu = temp.result.iloc[:int(temp.shape[0]/4)].mean()
sd = temp.result.iloc[:int(temp.shape[0]/4)].std()

test.result = (test.result-mu)/sd

test0.result = (test0.result-mu)/sd


test["returns+"] = np.where(test["returns"] > 0, test["returns"], 0)
test0["returns+"] = np.where(test0["returns"] > 0, test0["returns"], 0)

test["port"] = test["returns"] * -1*test.result#np.where(-1*test.result >0,, 0)
test0["port"] = test0["returns"] * np.where( -1*test0.result > 0, 1, 0)

from sklearn.metrics import classification_report

print(f"{np.corrcoef(test.returns, -test.result)[1,0 ] ** 2 * 100:.4f}%")
    
print(classification_report(np.where(test.returns> 0, 1, 0), np.where(-test.result >0, 1, 0)))

In [None]:
print(f"{np.corrcoef(test0.returns, -test0.result)[1,0 ] ** 2 * 100:.4f}%")
    
print(classification_report(np.where(test0.returns> 0, 1, 0), np.where(-test0.result >0, 1, 0)))

In [None]:
plt.plot(test0.result,test0.returns, "o")

In [None]:
plt.plot((1+test0["port"]*1).cumprod())
plt.plot((1+test0.returns*1).cumprod())

In [None]:
plt.plot(test0.returns ,test0["port"], "o")

In [None]:
print(test0["port"].mean() * np.sqrt(12)/ test0["port"].std())
print(test0["returns"].mean() * np.sqrt(12)/ test0["returns"].std())

In [None]:
test0.corr()

In [10]:
sm.OLS(test0["port"], sm.add_constant(test0[["returns", "returns+"]])).fit().summary()

NameError: name 'test0' is not defined

In [11]:
plt.plot((1+test0["returns"]).cumprod())

NameError: name 'test0' is not defined

In [22]:
np.mean(np.where(test0["returns"]>0, 1, 0))

0.6491228070175439

In [23]:
from itertools import combinations as c

def estimation(bench, port):
    n0 = 0
    n = 0
    for t1, t2, t3 in c(range(len(bench)), 3):
        n0 += 1 
        rms = bench.iloc[[t1, t2, t3]].sort_values() 
        trueT1, trueT2, trueT3 = rms.index 
        betaH = (port.loc[trueT3] - port.loc[trueT2]) / (bench.loc[trueT3] - bench.loc[trueT2]) 
        betaL = (port.loc[trueT2] - port.loc[trueT1]) / (bench.loc[trueT2] - bench.loc[trueT1])
        
        if betaH > betaL:       
            n += 1          
    return 2*(n / n0) -1
          
def kernel(rms, ris):
    
    rm1, rm2, rm3 = rms
    ri1, ri2, ri3 = ris   
    premise = rm1 < rm2 and rm2 < rm3
    
    if not premise:       
        return 0
    
    betaH = (ri3 - ri2) / (rm3 - rm2)        
    betaL = (ri2 - ri1) / (rm2 - rm1)
    conclusion = betaH > betaL
       
    if  premise and not conclusion:       
        return 0   
    else:
      
        return 1

def stanardE(bench, port, esti):
    sum0 = 0
    for t1 in bench.index:   
        sum1 = 0     
        n0 = 0    
        for t2, t3 in c(bench.index, 2):            
            n0 += 1           
            sum1 += kernel(bench.loc[[t1, t2, t3]], port.loc[[t1, t2, t3]])
        sum1 = (sum1 / n0 - esti) ** 2       
        sum0 += sum1
    var = 9 / len(bench.index) * sum0
    return np.sqrt(var)
    
from scipy.stats import norm

def nonParaMT(bench, port):
    
    esti = estimation(bench, port)
    se = stanardE(bench, port, esti)
    statistic = esti * np.sqrt(len(bench)) /se
    pVal = 1 - norm.cdf(statistic, loc=0, scale=1)
    return esti, se , statistic, pVal

In [24]:
nonParaMT(test0["returns"], test0["port"])

(-0.25037593984962403,
 0.9963193242547572,
 -1.8972801655271554,
 0.9711045143312425)