In [3]:
# pd.read_html version

import os
import requests
import pandas as pd
import numpy as np
from time import sleep,time
from bs4 import BeautifulSoup
from urllib.parse import quote

from fake_useragent import UserAgent

START_TIME = '2010-07-01'
END_TIME = '2010-07-31'
ALL_STATION_PATH = '全台觀測站.csv'
FINAL_DF_COLS = ['站號', '日期', '觀測時間(hour)', '測站氣壓(hPa)', '海平面氣壓(hPa)', '氣溫(℃)', '露點溫度(℃)',
                 '相對溼度(%)', '風速(m/s)', '風向(360degree)', '最大陣風(m/s)', '最大陣風風向(360degree)',
                 '降水量(mm)', '降水時數(hr)', '日照時數(hr)', '全天空日射量(MJ/㎡)', '能見度(km)', '紫外線指數',
                 '總雲量(0~10)']
FINAL_DF_COLS_v2 = ['站號', '日期', '觀測時間', '測站氣壓', '海平面氣壓', '氣溫', '露點溫度',
                 '相對溼度', '風速', '風向', '最大陣風', '最大陣風風向',
                 '降水量', '降水時數', '日照時數', '全天空日射量', '能見度', '紫外線指數',
                 '總雲量']
FINAL_DF_PATH = '2010-07.csv'
NO_DATA_PATH = "nodata-2010-07.txt"
FAIL_PARSE_PATH = "fail-2010-07.txt"
RETRY_TIME = 3

ua = UserAgent()
#COOKIES_ENABLED = False

# 爬取主函式
def crawler(url,station,date):

    retry = 0
    while retry <= RETRY_TIME: 
        headers = {'User-Agent': ua.random}
        resp = requests.get(url, headers=headers)
        soup = BeautifulSoup(resp.text, features="html.parser")
        try:
            # find no data page
            error = soup.find("label", class_="imp").string
            if error == '本段時間區間內無觀測資料。':
                print(station+':'+date+' 無觀測資料')
                with open (NO_DATA_PATH,'a') as f:
                    f.write(url+'\n')
                return
            break
        except Exception as e:
            if retry == RETRY_TIME :
                err = "Error occurs at ({0}, {1}): {2}".format(station, date, str(e))
                print(err)
                with open (FAIL_PARSE_PATH, 'a') as f:
                    f.write(url+'\n')
                return
            retry += 1
            sleep(3)


    form =[]

    # title
    titles = soup.find_all("th")
    titles = titles[11:28]
    strtitle=[]
    for title in titles:
        title = title.contents
        title=title[0] #+title[2]+title[4]
        strtitle.append(title)

    # parameter
    soup = soup.tbody
    tmps = soup.find_all("tr")
    tmps = tmps[2:]
    for tmp in tmps:
        tmp = tmp.find_all("td")
        parameter =[]
        for strtmp in tmp:
            strtmp = ''.join(filter(lambda x: (x.isdigit() or x == '.'  or x == 'T'), strtmp.string))
            parameter.append(strtmp)
        form.append(parameter)

    form = pd.DataFrame(form[1:], columns=strtitle)
    #form.to_csv("./data/"+station+'/'+year+'/'+date+".csv", encoding ="utf-8")
    return form


def parseURL(station, stname, datepicker):
    # 將站名轉成網址內的編碼形式
    stname = quote(quote(stname))
    # 解析網址內的主要三個部分
    try:
        url = "https://e-service.cwb.gov.tw/HistoryDataQuery/DayDataController.do?command=viewMain&station={0}&stname={1}&datepicker={2}".format(station,stname,datepicker)
        df = pd.read_html(url,encoding='utf8')[1]
    except ValueError:
        url = "http://e-service.cwb.gov.tw/HistoryDataQuery/DayDataController.do?command=viewMain&station={0}&stname={1}&datepicker={2}".format(station,stname,datepicker)
        df = pd.read_html(url,encoding='utf8')[1]    
    except:
        pass
        
    url = "http://e-service.cwb.gov.tw/HistoryDataQuery/DayDataController.do?command=viewMain&station={0}&stname={1}&datepicker={2}".format(station,stname,datepicker)
    df = pd.read_html(url,encoding='utf8')[1]
    # Replace with Null
    df = df.replace(to_replace='/', value=np.nan)
    df = df.replace(to_replace='...', value=np.nan)
    df = df.replace(to_replace='X', value=np.nan)
    df = df.replace(to_replace='V', value=np.nan)
    df = df.replace(to_replace='T', value=np.nan)
    # 回傳df
    return df

def parseURL_v2(station, stname, datepicker):
    date = datepicker
    url="http://e-service.cwb.gov.tw/HistoryDataQuery/DayDataController.do?command=viewMain&station="+station+"&stname=&datepicker="+date    
    df = crawler(url,station,date)
    # Replace with Null
    df = df.replace(to_replace='/', value=np.nan)
    df = df.replace(to_replace='...', value=np.nan)
    df = df.replace(to_replace='X', value=np.nan)
    df = df.replace(to_replace='V', value=np.nan)
    df = df.replace(to_replace='T', value=np.nan)
    # 回傳df
    return df

def changeColName(df):
    # 原本爬下來欄位名稱是長度為3的tuple，在這把欄位轉成單個string（只取3個欄位名稱的第2種）
    column_names = df.columns
    new_column_names = [col[1] for col in column_names]
    df.columns = new_column_names
    return df

def addStationDate(station, datepicker, df):
    # 記錄舊欄位名稱順序
    col_list = list(df.columns)
    # 新增站號、日期欄位
    df['站號'] = [station] * len(df)
    df['日期'] = [datepicker] * len(df)
    # 重新排序欄位順序（將站號、日期加在最前面兩欄）
    new_col_list = ['站號'] + ['日期'] + col_list
    df = df[new_col_list]
    return df

def getAllStation(file_path):
    # 將全台觀測站中，站號對應站號名稱的資料存成dict
    df_2 = pd.read_csv(file_path)
    station_dict = dict()
    for i in range(len(df_2)):
        station_dict[df_2['站號'][i]] = df_2['站名'][i]  
    return station_dict

def getDateList(start, end):
    # 設定日期區間
    return pd.date_range(start, end).strftime('%Y-%m-%d').tolist()

def createTotalDF(total_df_cols):
    # 建立 total_df 以供將後續報表串接
    tmp_dict = dict()
    for col in total_df_cols:
        tmp_dict[col] = list()
    total_df = pd.DataFrame(tmp_dict)
    return total_df

def collectData(total_df, station_dict, date_list):
    init_time = time()
    count = 0
    for station in station_dict.keys():
        stname = station_dict[station]
        print(station, stname)
        for date in date_list:
            try:
                # Step 1: 爬網頁
                #df_raw = parseURL(station, stname, date)
                df_raw = parseURL_v2(station, stname, date)

                # Step 2: 改欄位名稱
                #df_raw = changeColName(df_raw)

                # Step 3: 新增站號和日期放在欄位最前面
                df_new = addStationDate(station, date, df_raw)

                # Step 4: 將所有df串接成同個表格
                total_df = pd.concat([total_df, df_new])
    
            except Exception as e:
                err = "Error occurs at ({0}, {1}, {2}): {3}".format(station, stname, date, str(e))
                print(err)

        print("Time trace: {0}".format(time() - init_time))
        count += 1
        if count % 50 == 0:
            print(count)
    end_time = time()
    print("Running time(s): {0}".format(end_time - init_time))
    return total_df
    
def toCsv(df, file_path):
    try:
        df.to_csv(file_path, encoding='utf_8_sig', index=False)
        print('Save to csv successfully')
    except Exception as e:
        print("Save to csv fail: {0}".format(str(e)))
        
def main():
    # 取得全台觀測站
    station_dict = getAllStation(file_path=ALL_STATION_PATH) 
    
    # 取得日期區間
    date_list = getDateList(start=START_TIME, end=END_TIME)

    # 取得 total_df 以供將後續報表串接
    total_tmp_df = createTotalDF(FINAL_DF_COLS_v2)

    # 執行爬蟲
    final_all_df = collectData(total_tmp_df, station_dict, date_list)
    final_all_df.columns = FINAL_DF_COLS

    # 存成csv
    toCsv(final_all_df, FINAL_DF_PATH)
    
if __name__ == 'main':
    main()

In [None]:
main()

466850 五分山雷達站
Time trace: 16.935624837875366
466880 板橋
Time trace: 45.84733700752258
466900 淡水
Time trace: 74.82391405105591
466910 鞍部
Time trace: 94.75236988067627
466920 臺北
Time trace: 119.99184370040894
466930 竹子湖
Time trace: 153.25356078147888
466940 基隆
Time trace: 169.97426891326904
466950 彭佳嶼
Time trace: 184.5372838973999
466990 花蓮
Time trace: 201.78898978233337
467050 新屋
467050:2010-07-01 無觀測資料
Error occurs at (467050, 新屋, 2010-07-01): 'NoneType' object has no attribute 'replace'
467050:2010-07-02 無觀測資料
Error occurs at (467050, 新屋, 2010-07-02): 'NoneType' object has no attribute 'replace'
467050:2010-07-03 無觀測資料
Error occurs at (467050, 新屋, 2010-07-03): 'NoneType' object has no attribute 'replace'
467050:2010-07-04 無觀測資料
Error occurs at (467050, 新屋, 2010-07-04): 'NoneType' object has no attribute 'replace'
467050:2010-07-05 無觀測資料
Error occurs at (467050, 新屋, 2010-07-05): 'NoneType' object has no attribute 'replace'
467050:2010-07-06 無觀測資料
Error occurs at (467050, 新屋, 2010-07-06)