<a href="https://colab.research.google.com/github/fininsight/text-mining-tutorial/blob/master/99_%EA%B8%B0%ED%83%80%EC%BD%94%EB%93%9C.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 국내 주식 수정주가 조회

## 1) 네이버 차트를 활용한 수정주가 수집

In [151]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import xml.etree.ElementTree as ET
import numpy as np

In [3]:
def getStockPrice(StockCode, Count = 252) :
    ret = pd.DataFrame(columns=['date', 'open', 'high', 'low', 'close','volumn'])
    
    url = "https://fchart.stock.naver.com/sise.nhn?symbol={}&timeframe=day&count={}&requestType=0"
    url = url.format(StockCode, Count)
    response = requests.get(url)
    
    e = ET.fromstring(response.text.replace('\n','').replace('\t',''))
    stock_history = []
    for item in e.find('chartdata').findall('item') :
        data = item.get('data').split('|')
        ret.loc[len(ret)] = data
        
    return ret

In [4]:
data = getStockPrice('005930')
data.head()

Unnamed: 0,date,open,high,low,close,volumn
0,20180627,47450,48500,47000,47950,15274752
1,20180628,46850,47150,46600,46800,12784800
2,20180629,46250,47150,46200,46650,14099635
3,20180702,46500,47150,45500,45550,13112253
4,20180703,45750,46450,45750,46150,10959655


In [None]:
# 로그 수익율  = ln(당일종가 / 전일종가)

In [41]:
# log 함수 사용하기 위해 math 임포트
import math
math.log(1)

0.0

# 로그 수익률 구하기

In [96]:
def money(df):
    df_ = pd.DataFrame(columns = ['ln_price'], index = data.index)
    df_['ln_price'][0] = 0
    c_price = pd.to_numeric(df['close'])
    for i in range(1, len(df)):
        df_['ln_price'][i] = math.log(c_price[i] / c_price[i-1])
    
    return df_

In [97]:
df_ln = money(data)
df_ln.head()

Unnamed: 0,ln_price
0,0.0
1,-0.0242756
2,-0.00321028
3,-0.0238623
4,0.0130863


In [98]:
df_sum = df_ln.sum(axis = 0)


# 평균 수익률 구하기

In [99]:
def avg_price(df):
    sum_p = df.sum(axis = 0)
    
    return (sum_p / len(df))[0]

In [100]:
avg_price(df_ln)

-0.00020812093401802339

# 표준편차 구하기

In [101]:
def std_price(df):
    avg = avg_price(df)
    c_price = pd.to_numeric(df['ln_price'])
    sum_a = 0
    for i in range(len(df)):
        sum_a += (c_price[i] - avg)**2
    
    return (sum_a / len(df))**(1/2)

In [102]:
std_price(df_ln)

0.01539639183499893

In [182]:
np.std(df_ln)

ln_price    0.015396
dtype: float64

# 변동성구하기

In [147]:
def std_price2(df):
    avg = avg_price(df)
    c_price = pd.to_numeric(df['ln_price'])
    sum_a = 0
    print(type(c_price))
    print(type(c_price[0]))
    print(type(avg))
    for i in range(len(df)):
        sum_a += (c_price[i] - avg)**2
    
    return (sum_a / len(df))**(1/2) * (252)**(1/2)

In [148]:
std_price2(df_ln)

<class 'pandas.core.series.Series'>
<class 'numpy.float64'>
<class 'numpy.float64'>


0.24441014329867547

# 종목별 변동성 구하기

In [105]:
def std_by_code(code):
    data = getStockPrice(code)
    df_ln = money(data)
    std_c = std_price2(df_ln)
    
    return std_c

In [106]:
std_by_code('005930')

0.2443644205461667

# 공분산 구하기

In [183]:
def twin_std(df1, df2):
    df_f = money(df1)
    df_s = money(df2)
    
    avg1 = avg_price(df_f)
    avg2 = avg_price(df_s)
    
    f_price = pd.to_numeric(df_f['ln_price'])
    s_price = pd.to_numeric(df_s['ln_price'])
    
    sum_a = 0
    for i in range(len(df_f)):
        sum_a += (f_price[i] - avg1)*(s_price[i] - avg2)
    
    return (sum_a / (len(df_f)-1) )

In [138]:
def kospi():
    base_url = 'https://finance.naver.com'
    src="/sise/sise_index_day.nhn?code=KPI200"
    url = base_url + src
    
    result_list = []
    for num in range(1, 100):
        params = {
            'page' : num
        }
        resp = requests.get(url, params)
        soup = BeautifulSoup(resp.content, 'html.parser')
        tr_list = soup.find('table').find_all('tr')
        tr_list2 = tr_list[2:5] + tr_list[9:]
        for tr in tr_list2:

            if not tr.text.strip():
                continue

            date = tr.find('td', class_='date').text
            date = pd.to_datetime(date).date()
            price = tr.find('td', class_='number_1').text

            new_dict = {
                'date' : date,
                'price' : price
            }
            result_list.append(new_dict)
        df1 = pd.DataFrame(result_list)
        
    return df1

In [139]:
df2 = kospi()

In [169]:
df2.columns = ['Date', 'close']
df2 = df2[:252]
df2['close'] = pd.to_numeric(df2['close'])
df2['close']

0      274.24
1      274.54
2      272.37
3      275.97
4      277.27
5      277.50
6      277.75
7      275.53
8      275.28
9      275.58
10     275.48
11     276.01
12     275.31
13     271.36
14     270.09
15     270.48
16     271.13
17     272.74
18     273.21
19     272.10
20     268.27
21     267.99
22     267.66
23     268.16
24     263.89
25     263.93
26     261.37
27     264.70
28     264.40
29     264.42
        ...  
222    297.18
223    295.24
224    295.61
225    293.55
226    298.69
227    297.45
228    297.29
229    297.78
230    296.95
231    294.85
232    295.49
233    294.56
234    296.95
235    295.90
236    296.47
237    296.88
238    297.37
239    298.65
240    294.41
241    294.43
242    296.19
243    295.20
244    293.11
245    291.63
246    292.26
247    293.35
248    292.93
249    299.66
250    298.26
251    302.25
Name: close, Length: 252, dtype: float64

In [173]:
df1 = getStockPrice('005930')
df1['close'] = pd.to_numeric(df1['close'])
df1['close']

0      47950
1      46800
2      46650
3      45550
4      46150
5      46250
6      45950
7      44900
8      45600
9      46300
10     46000
11     45500
12     46500
13     46050
14     45850
15     46550
16     46900
17     47450
18     46500
19     46150
20     46150
21     46900
22     46900
23     46500
24     46250
25     46550
26     45550
27     45750
28     45800
29     46700
       ...  
222    42700
223    42650
224    42550
225    41800
226    42550
227    42500
228    43800
229    43450
230    43900
231    44200
232    44800
233    44850
234    44600
235    43750
236    44000
237    43900
238    44350
239    45350
240    45500
241    45700
242    45500
243    45600
244    45700
245    46500
246    47000
247    46600
248    46250
249    45400
250    46000
251    45450
Name: close, Length: 252, dtype: int64

In [177]:
np.cov(df1['close'], df2['close'])

array([[5.46122937e+06, 7.88748748e+03],
       [7.88748748e+03, 1.38653247e+02]])

In [184]:
df_twin = twin_std(df1, df2)
df_twin

2.3581027000156343e-05

# 상관계수 구하기  
공분산 / 표준편차(a)* 표준편차(b)

In [None]:
# 표준편차
def std_price(df): 
    avg = avg_price(df)
    c_price = pd.to_numeric(df['ln_price'])
    sum_a = 0
    for i in range(len(df)):
        sum_a += (c_price[i] - avg)**2
    
    return (sum_a / len(df))**(1/2)

In [157]:
"""
twin_std(df1, df2): # 공분산
df_f = money(df1)
df_s = money(df2)
"""

'\ntwin_std(df1, df2): # 공분산\ndf_f = money(df1)\ndf_s = money(df2)\n'

In [180]:
def corr_price(df1, df2):
    twin = twin_std(df1, df2)
    std1 = std_price(money(df1))
    std2 = std_price(money(df2))
    
    return twin / (std1*std2)

In [181]:
corr_price(df1, df2)

<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>
<class 'numpy.float64'>


0.16890818306296737

In [174]:
np.corrcoef(df1['close'], df2['close'])

array([[1.        , 0.28663449],
       [0.28663449, 1.        ]])

In [162]:
help(np.corrcoef)

Help on function corrcoef in module numpy:

corrcoef(x, y=None, rowvar=True, bias=<no value>, ddof=<no value>)
    Return Pearson product-moment correlation coefficients.
    
    Please refer to the documentation for `cov` for more detail.  The
    relationship between the correlation coefficient matrix, `R`, and the
    covariance matrix, `C`, is
    
    .. math:: R_{ij} = \frac{ C_{ij} } { \sqrt{ C_{ii} * C_{jj} } }
    
    The values of `R` are between -1 and 1, inclusive.
    
    Parameters
    ----------
    x : array_like
        A 1-D or 2-D array containing multiple variables and observations.
        Each row of `x` represents a variable, and each column a single
        observation of all those variables. Also see `rowvar` below.
    y : array_like, optional
        An additional set of variables and observations. `y` has the same
        shape as `x`.
    rowvar : bool, optional
        If `rowvar` is True (default), then each row represents a
        variable, with obse

In [94]:
!pip install scrapy

Collecting scrapy
  Downloading https://files.pythonhosted.org/packages/3e/45/414e87ac8209d537c91575538c5307c20217a6943f555e0ee39f6db4bb0f/Scrapy-1.6.0-py2.py3-none-any.whl (231kB)
Collecting parsel>=1.5 (from scrapy)
  Downloading https://files.pythonhosted.org/packages/96/69/d1d5dba5e4fecd41ffd71345863ed36a45975812c06ba77798fc15db6a64/parsel-1.5.1-py2.py3-none-any.whl
Collecting Twisted>=13.1.0 (from scrapy)
  Downloading https://files.pythonhosted.org/packages/79/59/035de19362320e632301ed7bbde23e4c8cd6fc5e2f1cf8d354cdba857854/Twisted-19.2.1.tar.bz2 (3.1MB)
Collecting cssselect>=0.9 (from scrapy)
  Downloading https://files.pythonhosted.org/packages/7b/44/25b7283e50585f0b4156960691d951b05d061abf4a714078393e51929b30/cssselect-1.0.3-py2.py3-none-any.whl
Collecting PyDispatcher>=2.0.5 (from scrapy)
  Downloading https://files.pythonhosted.org/packages/cd/37/39aca520918ce1935bea9c356bcbb7ed7e52ad4e31bff9b943dfc8e7115b/PyDispatcher-2.0.5.tar.gz
Collecting queuelib (from scrapy)
  Download