# Python 網路爬蟲 PART 2 - 擷取 台灣證券交易所 (TWSE) 股票資料
> ### [ REFERENCE ]

> + "Python 網路爬蟲 PART 1 - 台灣證券交易所 (TWSE) 股票資料"
>
> + Vincent Tsai, "**`TWSE_Python_crawler`**" 
    + Github: "**`TWSE_Python_crawler`**" https://github.com/tsaisunghao/TWSE_Python_crawler
    + Youtube: "Python網路爬蟲 爬取台灣證券交易所歷史交易紀錄(加強版)" https://youtu.be/dvw749iveJA


##  [ Module ]: TWSE_Crawler.py


In [None]:
"""
#----------------------------------------------------
# [ Module ]: TWSE_Crawler.py
# < TWSE 個股資料下載模組 >： 2330 台積電 (TSMC) 2019/05/22
#----------------------------------------------------

def twse_crawler(year=2019, mm=5, dd=22, stockNo=2330):
    ''' Download one month of TWSE daily trading data for a stock and save it as CSV.

    Example: year=2019 mm=05 dd=02 stockNo=2330

    Args:
        year: Four-digit Western year used in the request date.
        mm: Month; zero-padded to two digits in the URL and in the file name.
        dd: Day; zero-padded to two digits in the URL.
        stockNo: TWSE stock number, e.g. 2330.

    Returns:
        pandas.DataFrame built from the response's 'data' rows. The column
        names come from 'fields'; the first column is renamed to the
        response's 'title' and used as the index.

    Raises:
        ValueError: if the response does not contain 'title', 'fields' and
            'data' (for example when TWSE reports no matching data).

    Side effects:
        Prints the request URL, writes ./stock<stockNo>_<year><mm>.csv
        (encoded as utf_8_sig) and prints the output path.
    '''
    import json
    import os.path
    import urllib.request

    import pandas as pd

    #----------------------------------------------------
    # 1. Connect to the Taiwan Stock Exchange (TWSE) and fetch the stock data
    #----------------------------------------------------
    url_twse = 'http://www.twse.com.tw/exchangeReport/STOCK_DAY?response=json&date='
    url_history = url_twse + str(year) + str(mm).zfill(2) + str(dd).zfill(2) + '&stockNo=' + str(stockNo)
    print(url_history)

    # The endpoint is asked for JSON (response=json), so decode the body
    # directly instead of routing it through an HTML parser; the context
    # manager closes the connection even if decoding fails.
    with urllib.request.urlopen(url_history) as webpage_history:
        stock = json.loads(webpage_history.read().decode('utf-8'))

    # Look the sections up by key rather than by position in
    # list(stock.values()), which breaks as soon as the key order changes.
    # NOTE(review): key names assumed from the TWSE STOCK_DAY payload - confirm.
    missing = [key for key in ('title', 'fields', 'data') if key not in stock]
    if missing:
        raise ValueError(
            'TWSE response is missing ' + ', '.join(missing)
            + ' (stat: ' + str(stock.get('stat')) + ')'
        )

    #----------------------------------------------------
    # 2. Convert the stock data into a data frame
    #----------------------------------------------------
    # Replace the first column name (the date field) with the stock title,
    # building a new list so the parsed response is left untouched.
    columns = [stock['title']] + list(stock['fields'][1:])
    stock_price = pd.DataFrame(stock['data'], columns=columns)

    # Use the title column as the index
    df_price = stock_price.set_index(columns[0])

    #----------------------------------------------------
    # < Write the stock data to a csv file >
    #----------------------------------------------------
    mydir = './'      #  output directory
    #  Build directory + file name: stock<stockNo>_<year><mm>.csv
    csv_file = os.path.join(mydir, "stock" + str(stockNo) + '_' + str(year) + str(mm).zfill(2) + ".csv")
    df_price.to_csv(csv_file, encoding='utf_8_sig')   #  write the csv file
    print(' 輸出 csv 檔案 : ', csv_file)
    return df_price  #  return the stock data frame
"""

In [1]:
# Import the local TWSE_Crawler module and list its attributes to see which names it defines.
import TWSE_Crawler
dir(TWSE_Crawler)

['__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 'twse_crawler']

In [2]:
# Import the one name that is used instead of `import *`, so it is clear
# where twse_crawler comes from and nothing else leaks into the namespace.
from TWSE_Crawler import twse_crawler

# Calling with the defaults is the same as
# twse_crawler(year=2019, mm=5, dd=22, stockNo=2330)
twse_crawler()

http://www.twse.com.tw/exchangeReport/STOCK_DAY?response=json&date=20190522&stockNo=2330
 輸出 csv 檔案 :  ./stock2330_201905.csv


Unnamed: 0_level_0,成交股數,成交金額,開盤價,最高價,最低價,收盤價,漲跌價差,成交筆數
108年05月 2330 台積電 各日成交資訊,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
108/05/02,27376394,7122135656,261.5,262.5,258.5,259.0,0.0,9110
108/05/03,30200246,7940572444,262.0,265.0,260.5,265.0,6.0,10068
108/05/06,33688187,8717053470,260.0,260.0,258.0,259.0,-6.0,12535
108/05/07,25686126,6719199455,259.5,263.0,259.0,262.5,3.5,8339
108/05/08,25902364,6750083504,260.0,261.5,259.5,260.0,-2.5,7905
108/05/09,34166574,8798140796,259.5,259.5,256.0,256.5,-3.5,10995
108/05/10,18868212,4844594484,257.0,259.0,255.0,256.0,-0.5,8009
108/05/13,29317535,7374249589,253.0,254.0,249.5,250.5,-5.5,11974
108/05/14,45620708,11345067455,247.5,251.0,245.0,248.5,-2.0,14283
108/05/15,37223479,9327051888,251.0,252.0,249.0,249.0,0.5,12041


## < TWSE 個股資料下載＆彙整 範例 >  2330 台積電 (TSMC) : 2019/01 ~ 2019/05

In [3]:
import time
import pandas as pd

# Download 2019/01 ~ 2019/05 one month at a time, collecting the monthly
# frames in a list and concatenating once at the end (instead of growing
# `stocks` with pd.concat on every iteration and special-casing the first month).
monthly_frames = []
for month in range(1, 6):  # 2019/01 ~ 2019/05
    monthly_frames.append(twse_crawler(year=2019, mm=month, dd=22, stockNo=2330))
    time.sleep(6)  # wait 6 seconds between requests (presumably to avoid being throttled by TWSE - confirm)

stocks = pd.concat(monthly_frames)

http://www.twse.com.tw/exchangeReport/STOCK_DAY?response=json&date=20190122&stockNo=2330
 輸出 csv 檔案 :  ./stock2330_201901.csv
http://www.twse.com.tw/exchangeReport/STOCK_DAY?response=json&date=20190222&stockNo=2330
 輸出 csv 檔案 :  ./stock2330_201902.csv
http://www.twse.com.tw/exchangeReport/STOCK_DAY?response=json&date=20190322&stockNo=2330
 輸出 csv 檔案 :  ./stock2330_201903.csv
http://www.twse.com.tw/exchangeReport/STOCK_DAY?response=json&date=20190422&stockNo=2330
 輸出 csv 檔案 :  ./stock2330_201904.csv
http://www.twse.com.tw/exchangeReport/STOCK_DAY?response=json&date=20190522&stockNo=2330
 輸出 csv 檔案 :  ./stock2330_201905.csv


In [4]:
# Preview the first rows of the combined frame (should start in 2019/01, i.e. ROC year 108).
stocks.head()

Unnamed: 0,成交股數,成交金額,開盤價,最高價,最低價,收盤價,漲跌價差,成交筆數
108/01/02,32900482,7276419230,226.5,226.5,219.0,219.5,-6.0,12329
108/01/03,34615620,7459051790,214.0,218.0,214.0,215.5,-4.0,14549
108/01/04,67043521,13987136785,211.5,211.5,206.5,208.0,-7.5,28786
108/01/07,35695176,7591116569,212.0,214.0,211.0,213.0,5.0,11224
108/01/08,23794481,5019703557,212.0,212.5,210.0,211.0,-2.0,9377


In [5]:
# Preview the last rows of the combined frame (should end in 2019/05).
stocks.tail()

Unnamed: 0,成交股數,成交金額,開盤價,最高價,最低價,收盤價,漲跌價差,成交筆數
108/05/27,37447033,8697538216,234.0,235.0,231.0,231.0,-2.0,13895
108/05/28,99322033,22910765567,232.0,232.0,230.5,230.5,-0.5,10122
108/05/29,32260236,7385029780,228.0,230.5,227.0,229.5,-1.0,10233
108/05/30,40375328,9292745636,230.0,231.5,229.0,231.0,1.5,8772
108/05/31,49163217,11559578381,232.0,237.5,231.0,235.5,4.5,14365


In [6]:
# (rows, columns) of the combined frame.
stocks.shape

(96, 8)

## < 輸出 TWSE 個股彙整資料 範例 >  2330 台積電 (TSMC) : 2019/01 ~ 2019/05

In [7]:
import os.path

# Export target: 2330 TSMC, combined 2019 data
stockNo, year = 2330, 2019

mydir = './'  # output directory
# File name pattern: stock<stockNo>_<year>.csv
csv_name = "stock" + str(stockNo) + '_' + str(year) + ".csv"
csv_file = os.path.join(mydir, csv_name)

# Keep the index (the trading dates) in the file; utf_8_sig is UTF-8 with a BOM.
stocks.to_csv(csv_file, encoding='utf_8_sig', index=True)
print(' 輸出 csv 檔案 : ', csv_file)

 輸出 csv 檔案 :  ./stock2330_2019.csv


##  [ Exercise 6-1 ]: 
> ### 請下載 中鋼、日月光、鴻海、廣達、富邦 2019年 個股資料，並輸出至 csv 檔案。