# 每日股價爬蟲！

今天我們要來建立一個股票爬蟲，最後的成品如下：

In [2]:
import requests
import pandas as pd
from io import StringIO

def crawl_price(date, *, timeout=30):
    """Download and tidy the TWSE daily quote table for a single day.

    Args:
        date: Object providing ``strftime`` (for example a
            ``datetime.datetime``) that identifies the day to fetch.
        timeout: Seconds ``requests`` waits for the server before raising.
            Without a timeout a stalled connection would block forever.

    Returns:
        A ``DataFrame`` indexed by ``(stock_id, date)`` whose remaining
        columns are numeric, or ``None`` when the response holds no line
        that looks like a quote row.
    """
    # Render the date the way the endpoint expects it, e.g. '20180525'.
    datestr = date.strftime('%Y%m%d')

    # Ask the exchange web site for that day's report in CSV form.
    r = requests.post(
        'http://www.twse.com.tw/exchangeReport/MI_INDEX?response=csv&date=' + datestr + '&type=ALLBUT0999',
        timeout=timeout,
    )

    # Remove every '=' from the downloaded text before parsing it.
    content = r.text.replace('=', '')

    # Keep only the lines that split into more than 10 pieces on '",';
    # lines with 10 or fewer columns are discarded.
    lines = [line for line in content.split('\n') if len(line.split('",')) > 10]

    # Nothing usable was downloaded: report that with None.
    if not lines:
        return None

    # Re-join the kept lines and parse them as CSV.  Every field is read as
    # text so that identifiers such as '0050' keep their leading zeros no
    # matter what type inference would have decided.
    df = pd.read_csv(StringIO('\n'.join(lines)), dtype=str)

    # Make every cell a string and strip the commas embedded in the values.
    df = df.astype(str)
    df = df.apply(lambda col: col.str.replace(',', ''))

    # Record which day these rows belong to.
    df['date'] = pd.to_datetime(date)

    # Rename the security-code column to 'stock_id' and index the table by
    # (stock_id, date).
    df = df.rename(columns={'證券代號': 'stock_id'})
    df = df.set_index(['stock_id', 'date'])

    # Convert every column to numbers; errors='coerce' turns anything that
    # cannot be parsed into NaN.
    df = df.apply(lambda col: pd.to_numeric(col, errors='coerce'))

    # Drop the columns that ended up entirely NaN.
    return df.dropna(axis=1, how='all')


import datetime

# Example run: fetch the quote table for 2 January 2018.
crawl_price(datetime.datetime(year=2018, month=1, day=2))

Unnamed: 0_level_0,Unnamed: 1_level_0,成交股數,成交筆數,成交金額,開盤價,最高價,最低價,收盤價,漲跌價差,最後揭示買價,最後揭示買量,最後揭示賣價,最後揭示賣量,本益比
stock_id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0050,2018-01-02,3613199,1795,298033251,82.15,82.60,82.15,82.60,0.45,82.55,212,82.60,123,0.00
0051,2018-01-02,9100,8,287540,31.70,31.70,31.31,31.67,0.00,31.60,100,31.67,9,0.00
0052,2018-01-02,3150,5,168550,53.55,53.55,53.50,53.50,0.65,53.50,1,53.65,10,0.00
0053,2018-01-02,11100,11,392110,35.29,35.36,35.25,35.36,0.03,35.39,100,35.71,8,0.00
0054,2018-01-02,2000,2,47940,23.97,23.97,23.97,23.97,0.02,23.93,100,23.97,2,0.00
0055,2018-01-02,65000,21,1059920,16.20,16.35,16.20,16.35,0.09,16.33,50,16.35,4,0.00
0056,2018-01-02,1868451,733,46856990,25.00,25.14,25.00,25.13,0.13,25.13,4,25.14,49,0.00
0057,2018-01-02,0,0,0,,,,,0.00,49.30,1,49.53,20,0.00
0058,2018-01-02,0,0,0,,,,,0.00,46.31,1,46.43,10,0.00
0059,2018-01-02,6000,6,241040,40.15,40.20,40.15,40.20,0.00,40.13,1,40.30,1,0.00


# 接下來就來一步步分析

首先呢，必須知道資料的網址在哪裡，再用 requests 把該網址上的 CSV 資料下載下來

In [3]:
import requests

# Download the exchange's report for 2018-03-09 in CSV form.  A timeout is
# passed because requests otherwise waits indefinitely on a stalled server.
response = requests.get(
    'http://www.twse.com.tw/exchangeReport/MI_INDEX?response=csv&date=20180309&type=ALLBUT0999&_=1520785530355',
    timeout=30,
)


# 試試看csv能不能直接存到 pandas 的 dataframe 中

發現有點小問題：由於每一列（row）的欄位數不一樣，pandas 無法直接讀取

In [4]:
# Break the downloaded text into separate lines on the newline character.
raw_text = response.text
lines = raw_text.split('\n')

# Look at one sample line.
lines[100]

'"電機機械類報酬指數","270.03","-","0.92","-0.34",\r'

# 用For 迴圈篩選每一行

In [54]:

# Keep only the lines that contain 17 fields.  Take this line as an example:
'  "你",   "好",   "嗎",'
# We want to cut it apart and count the pieces; by eye it holds 3 fields.
# Cutting on a bare "," is not reliable, though.  Consider:
'  "你",   ",好,",   "嗎",'
# This line should still yield three values (each one wrapped in '"' and
# followed by ","): 你 / ,好, / 嗎.
# Splitting on "," alone would also cut inside the quoted middle value and
# produce more pieces than there are fields.
# Splitting on the two-character sequence '",' avoids that, so every line is
# split on '",' and kept only when the result has exactly 17 pieces.

# Build the filtered list in one pass over `lines`.
newlines = [line for line in lines if len(line.split('",')) == 17]

# Report the line count before and after filtering.
print('原本的行數（lines）')
print(len(lines))
print('刪除不需要的行數後，變少了(newlines)')
print(len(newlines))

原本的行數（lines）
1208
刪除不需要的行數後，變少了(newlines)
1046


# 終於做出dataframe 了！

In [47]:
# Newline character used to glue the kept lines back together.
c = '\n'
# Rebuild a single text from the kept lines and remove every '=' from it.
s = c.join(newlines).replace('=', '')

# Wrap the text in a file-like StringIO object so pd.read_csv can parse it.
csv_buffer = StringIO(s)
df = pd.read_csv(csv_buffer)

# Show the first five rows.
df.head()

Unnamed: 0,證券代號,證券名稱,成交股數,成交筆數,成交金額,開盤價,最高價,最低價,收盤價,漲跌(+/-),漲跌價差,最後揭示買價,最後揭示買量,最後揭示賣價,最後揭示賣量,本益比,Unnamed: 16
0,50,元大台灣50,2832064,986,234740002,82.90,83.00,82.70,82.85,+,0.1,82.85,131,82.9,18,0.0,
1,51,元大中型100,18000,7,571930,31.79,31.79,31.70,31.70,+,0.11,31.72,50,31.8,1,0.0,
2,52,富邦科技,3000,2,171300,57.10,57.10,57.10,57.10,,0.0,57.0,1,57.3,1,0.0,
3,53,元大電子,38282,7,1399294,36.56,36.57,36.55,36.55,+,0.12,36.49,50,36.83,8,0.0,
4,54,元大台商50,0,0,0,--,--,--,--,,0.0,23.96,50,24.18,9,0.0,


# 用 pandas 中的好用 function，將資料作整理！

上面的資料有點怪怪的，例如：
1. 它們顯示起來像是數字，但其實還是字串！
2. 某些數字中間有','，很煩！
3. 有幾個欄位（column）是來亂的：Unnamed: 16，啥玩意兒？

In [48]:
# Turn every cell into a string, then strip the commas column by column.
# Series.str.replace is used instead of DataFrame.applymap, which pandas
# deprecated in 2.1 in favour of DataFrame.map.
df = df.astype(str)
df = df.apply(lambda col: col.str.replace(',', ''))

# Use the 證券代號 column as the index.
df = df.set_index('證券代號')

# Convert the cells from strings to numbers; values that cannot be parsed
# become NaN because of errors='coerce'.
df = df.apply(lambda col: pd.to_numeric(col, errors='coerce'))

# Drop the useless columns.
# axis=1 checks column by column; how='all' removes a column only when every
# value in it is NaN.
# (Earlier approach: df = df[df.columns[df.isnull().sum() != len(df)]])
df = df.dropna(axis=1, how='all')

df.head()

Unnamed: 0_level_0,成交股數,成交筆數,成交金額,開盤價,最高價,最低價,收盤價,漲跌價差,最後揭示買價,最後揭示買量,最後揭示賣價,最後揭示賣量,本益比
證券代號,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
50,2832064,986,234740002,82.9,83.0,82.7,82.85,0.1,82.85,131,82.9,18,0.0
51,18000,7,571930,31.79,31.79,31.7,31.7,0.11,31.72,50,31.8,1,0.0
52,3000,2,171300,57.1,57.1,57.1,57.1,0.0,57.0,1,57.3,1,0.0
53,38282,7,1399294,36.56,36.57,36.55,36.55,0.12,36.49,50,36.83,8,0.0
54,0,0,0,,,,,0.0,23.96,50,24.18,9,0.0


# 計算長紅棒

In [52]:
# Candle length expressed as close divided by open: 1 means unchanged, below 1
# means the close was lower (price fell), above 1 means the close was higher
# (price rose).
close_open = df['收盤價'].div(df['開盤價'])
close_open.head(5)

證券代號
0050    0.999397
0051    0.997169
0052    1.000000
0053    0.999726
0054         NaN
dtype: float64

In [50]:
# Select the stocks whose close is more than 5% above their open.
df[close_open.gt(1.05)]

Unnamed: 0_level_0,成交股數,成交筆數,成交金額,開盤價,最高價,最低價,收盤價,漲跌價差,最後揭示買價,最後揭示買量,最後揭示賣價,最後揭示賣量,本益比
證券代號,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1418,12875,19,122285,9.28,9.93,9.28,9.87,0.03,9.51,1,10.0,2,0.0
2351,8440209,5206,720426520,80.8,87.7,80.7,87.7,7.9,87.7,77,,0,21.29
2364,324265,114,846146,2.47,2.69,2.47,2.69,0.24,2.69,279,,0,0.0
2509,2107044,903,50794532,23.0,25.4,22.8,25.0,1.8,24.9,21,25.0,43,0.0
3024,1779477,465,12197815,6.5,7.0,6.5,6.9,0.4,6.9,7,6.92,10,2.35
3189,13233193,5936,717789315,52.0,55.0,52.0,55.0,5.0,55.0,4405,,0,50.0
6168,7982914,3320,170744825,20.7,21.95,20.7,21.85,1.15,21.75,5,21.85,9,20.42
8478,473003,386,30676398,61.5,66.7,60.9,65.6,4.8,65.5,2,65.6,40,19.07


# 存成CSV檔

In [56]:
# Save the table as a CSV file (it can be opened with Excel).
# encoding='utf_8_sig' is passed so the Chinese text does not come out garbled.
df.to_csv(path_or_buf='daily_price.csv', encoding='utf_8_sig')

# Read the file back, naming 證券代號 as the index column.
df = pd.read_csv(filepath_or_buffer='daily_price.csv', index_col=['證券代號'])

# Point at the index column in the table displayed below.
print('index為證券代號')
print('     v')
df.head()

index為證券代號
     v


Unnamed: 0_level_0,成交股數,成交筆數,成交金額,開盤價,最高價,最低價,收盤價,漲跌價差,最後揭示買價,最後揭示買量,最後揭示賣價,最後揭示賣量,本益比
證券代號,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
50,2832064,986,234740002,82.9,83.0,82.7,82.85,0.1,82.85,131,82.9,18,0.0
51,18000,7,571930,31.79,31.79,31.7,31.7,0.11,31.72,50,31.8,1,0.0
52,3000,2,171300,57.1,57.1,57.1,57.1,0.0,57.0,1,57.3,1,0.0
53,38282,7,1399294,36.56,36.57,36.55,36.55,0.12,36.49,50,36.83,8,0.0
54,0,0,0,,,,,0.0,23.96,50,24.18,9,0.0


# 存到 sqlite3 中

In [55]:
import sqlite3

# Open a connection to the SQLite database file.
conn = sqlite3.connect('test.sqlite3')

# Save: if_exists='replace' overwrites the daily_price table when the database
# already contains one.
df.to_sql(name='daily_price', con=conn, if_exists='replace')

# Load the table back, using 證券代號 as the index.
df = pd.read_sql(sql='select * from daily_price', con=conn, index_col=['證券代號'])
df.head()

Unnamed: 0_level_0,成交股數,成交筆數,成交金額,開盤價,最高價,最低價,收盤價,漲跌價差,最後揭示買價,最後揭示買量,最後揭示賣價,最後揭示賣量,本益比
證券代號,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
50,2832064,986,234740002,82.9,83.0,82.7,82.85,0.1,82.85,131,82.9,18,0.0
51,18000,7,571930,31.79,31.79,31.7,31.7,0.11,31.72,50,31.8,1,0.0
52,3000,2,171300,57.1,57.1,57.1,57.1,0.0,57.0,1,57.3,1,0.0
53,38282,7,1399294,36.56,36.57,36.55,36.55,0.12,36.49,50,36.83,8,0.0
55,63240,16,1051359,16.7,16.7,16.6,16.7,0.09,16.6,57,16.7,3,0.0


# 總結一下剛剛教的function：
1. pd.to_numeric(series) <--- 將series轉型成數字。
2. df.apply(func) <--- 將 dataframe 中的每一條 series 都用 func 處理一番。
3. lambda x: y <--- 一個無名氏function，讀入 x 吐出 y。
4. df.set_index(col_name) <--- 將某個column直接變成index
5. df[x] <--- 選取 df 中的 x，x 可以是 a (list or series) of (string or boolean)；
假如為 boolean，則長度得跟 df 的列數（rows）一樣長喔！


