In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import datetime
import os, gc

import requests, urllib
from bs4 import BeautifulSoup

# Crawl data
[觀測資料查詢系統](https://e-service.cwb.gov.tw/HistoryDataQuery/index.jsp)
## 用BeautifulSoup解析html爬取資料

In [None]:
# Day-data page for station 466920 on 2020-08-08 (both values are part of the query string).
url = 'https://e-service.cwb.gov.tw/HistoryDataQuery/DayDataController.do?command=viewMain&station=466920&stname=%25E8%2587%25BA%25E5%258C%2597&datepicker=2020-08-08'
# A timeout keeps the cell from hanging forever when the server never answers;
# raise_for_status() stops here on an HTTP error instead of parsing an error page later.
response = requests.get(url, timeout=30)
response.raise_for_status()

In [None]:
# Name the parser explicitly: without it bs4 picks whichever parser happens to be
# installed (and emits a warning), so the parsed tree can differ between machines.
soup = BeautifulSoup(response.text, 'html.parser')

In [None]:
soup

In [None]:
soup.select('table#MyTable tr.second_tr')[0].select('th')

In [None]:
# Header cells (<th>) of the row with class "second_tr"; later cells reuse `t_name`
# as the source of column names.
t_name = soup.select('table#MyTable tr.second_tr')[0].select('th')

# Cells of table row 3, printed next to the header cell at the same position.
row_cells = soup.select('table#MyTable')[0].select('tr')[3].select('td')
for position, cell in enumerate(row_cells):
    print(f"{t_name[position].text} : {cell.text}")

In [None]:
# Column names are the texts of the header cells collected in `t_name`.
column_names = [th.text for th in t_name]

# Query the table rows once instead of re-selecting them from the document per hour.
table_rows = soup.select('table#MyTable')[0].select('tr')

# Table rows 3..26 (24 rows) are read as the hourly records; explicit indexing keeps
# the IndexError if the page has fewer rows than expected.
hourly_rows = [
    [td.text for td in table_rows[hr].select('td')]
    for hr in range(3, 27)
]

# Build the frame in one step rather than growing it row by row with .loc,
# which re-allocates on every insertion. The index is 0..23 as before.
daily_data = pd.DataFrame(hourly_rows, columns=column_names)

In [None]:
daily_data

## 用pandas爬取html表格資料

In [None]:
# Query URL with an empty `datepicker=` value; the date string is appended to it below.
source_url = "https://e-service.cwb.gov.tw/HistoryDataQuery/DayDataController.do?command=viewMain&station=466920&stname=%25E8%2587%25BA%25E5%258C%2597&datepicker="
# `date` is a notebook-level global: convert_time(), defined further down, reads it as well.
date = '2020-08-08'
url = source_url + date

In [None]:
# Fetch with a timeout so the cell cannot hang indefinitely, and fail fast on an
# HTTP error instead of handing an error page to the table parser.
page = requests.get(url, timeout=30)
page.raise_for_status()
# read_html returns a list with one DataFrame per <table> found in the page.
daily_data = pd.read_html(page.text, encoding='utf-8')

In [None]:
daily_data[0]

In [None]:
daily_data = daily_data[1]

In [None]:
daily_data

In [None]:
# Keep only the last element of every column label.
# (Presumably the labels are tuples produced by a multi-row table header — confirm.)
# The multi-day loops later in this notebook instead take the names from row 2 of the frame.
daily_data.columns = [label[-1] for label in daily_data.columns.values]

In [None]:
daily_data

In [None]:
# Keep the rows from index label 3 onward (.loc slices by label).
# NOTE(review): the column-renaming cell above indexes each column label with [-1],
# which suggests the header rows were already lifted into the column labels; if so,
# this drops the first three data rows rather than header rows — confirm against
# the frame displayed above.
# NOTE(review): the cell reassigns `daily_data` from itself, so re-running it drops
# further rows.
daily_data = daily_data.loc[3:, :]

In [None]:
daily_data

In [None]:
def convert_time(x, day=None):
    """Convert an hour label into a datetime on a given day.

    Parameters
    ----------
    x : str or int
        Hour label; it is converted with ``int(x) - 1``, so ``'01'`` maps to
        hour 0 and ``'24'`` maps to hour 23.
    day : str, optional
        Date as ``'YYYY-MM-DD'``. When omitted, the notebook-level global
        ``date`` is used, which keeps ``Series.apply(convert_time)`` working
        exactly as before.

    Returns
    -------
    datetime.datetime
        ``day`` at hour ``int(x) - 1``.
    """
    if day is None:
        # Fallback to the global set by the calling cell / loop variable.
        day = date
    # Split once instead of three times; only the first three fields are used.
    year, month, day_of_month = (int(part) for part in day.split('-')[:3])
    return datetime.datetime(year, month, day_of_month, int(x) - 1)

date = '2020-08-08'
convert_time('01')

In [None]:
daily_data['ObsTime'] = daily_data['ObsTime'].apply(convert_time)

In [None]:
daily_data

In [None]:
daily_data.set_index('ObsTime')['Temperature'].astype(float).plot()

## 爬取多時間段的氣象觀測資料至pandas

In [None]:
def get_between_day(date_start, date_end):
    """Return every calendar day from date_start to date_end, both inclusive.

    Both arguments and every returned item are "%Y-%m-%d" strings. An end date
    earlier than the start date yields an empty list.
    """
    day_format = "%Y-%m-%d"
    first_day = datetime.datetime.strptime(date_start, day_format)
    last_day = datetime.datetime.strptime(date_end, day_format)
    day_count = (last_day - first_day).days + 1  # +1 makes the end date inclusive
    return [
        (first_day + datetime.timedelta(days=offset)).strftime(day_format)
        for offset in range(day_count)
    ]

In [None]:
source_url = "https://e-service.cwb.gov.tw/HistoryDataQuery/DayDataController.do?command=viewMain&station=466920&stname=%25E8%2587%25BA%25E5%258C%2597&datepicker="
date_list = get_between_day('2020-08-01', '2020-08-08')

# Collect one frame per day and concatenate once at the end; concatenating inside
# the loop copies the accumulated data on every iteration.
daily_frames = []
# The loop variable must stay named `date`: convert_time() reads the global `date`
# to know which day the hour labels belong to.
for date in date_list:
    url = source_url + date
    # Timeout so one unanswered request cannot stall the whole crawl; fail fast on
    # HTTP errors instead of parsing an error page.
    page = requests.get(url, timeout=30)
    page.raise_for_status()
    codis_df_temp = pd.read_html(page.text, encoding='utf-8')[1]
    if isinstance(codis_df_temp.columns, pd.MultiIndex):
        # Header rows were already parsed into multi-level column labels (the shape
        # the single-day cells above handle): keep the last level, drop no rows.
        codis_df_temp.columns = codis_df_temp.columns.get_level_values(-1)
    else:
        # Header rows arrived as data: row 2 holds the column names, records start at row 3.
        codis_df_temp.columns = codis_df_temp.loc[2].values
        # .copy() so the column assignment below writes to an independent frame,
        # not to a slice of the parsed table.
        codis_df_temp = codis_df_temp.loc[3:, :].copy()
    # Turn the original 1-24 hour column into datetimes (year, month, day, hour);
    # convert_time() subtracts one hour so that label 24 is still a valid hour.
    codis_df_temp['ObsTime'] = codis_df_temp['ObsTime'].apply(convert_time)
    daily_frames.append(codis_df_temp)

codis_df_all = pd.concat(daily_frames, axis=0)
# Undo the one-hour shift applied by convert_time(); hour label 24 thereby becomes
# 00:00 of the following day.
codis_df_all['ObsTime'] = codis_df_all['ObsTime'] + datetime.timedelta(hours=1)

In [None]:
codis_df_all.set_index('ObsTime')['Temperature'].astype(float).plot()

In [None]:
source_url = "https://e-service.cwb.gov.tw/HistoryDataQuery/DayDataController.do?command=viewMain&station=466920&stname=%25E8%2587%25BA%25E5%258C%2597&datepicker="
date_list = get_between_day('2015-01-01', '2015-12-31')

# Columns kept for the yearly export.
kept_columns = ['ObsTime', 'StnPres', 'Temperature', 'RH', 'WS', 'WD', 'Precp']

# Collect one frame per day and concatenate once at the end; concatenating inside
# the loop copies the accumulated data on every iteration (365 times here).
daily_frames = []
# The loop variable must stay named `date`: convert_time() reads the global `date`
# to know which day the hour labels belong to.
for date in date_list:
    url = source_url + date
    # Timeout so one unanswered request cannot stall the whole crawl; fail fast on
    # HTTP errors instead of parsing an error page.
    page = requests.get(url, timeout=30)
    page.raise_for_status()
    codis_df_temp = pd.read_html(page.text, encoding='utf-8')[1]
    if isinstance(codis_df_temp.columns, pd.MultiIndex):
        # Header rows were already parsed into multi-level column labels (the shape
        # the single-day cells above handle): keep the last level, drop no rows.
        codis_df_temp.columns = codis_df_temp.columns.get_level_values(-1)
    else:
        # Header rows arrived as data: row 2 holds the column names, records start at row 3.
        codis_df_temp.columns = codis_df_temp.loc[2].values
        # .copy() so the column assignment below writes to an independent frame,
        # not to a slice of the parsed table.
        codis_df_temp = codis_df_temp.loc[3:, :].copy()
    # Turn the original 1-24 hour column into datetimes (year, month, day, hour);
    # convert_time() subtracts one hour so that label 24 is still a valid hour.
    codis_df_temp['ObsTime'] = codis_df_temp['ObsTime'].apply(convert_time)
    daily_frames.append(codis_df_temp[kept_columns])

codis_df_all = pd.concat(daily_frames, axis=0)
# Undo the one-hour shift applied by convert_time(); hour label 24 thereby becomes
# 00:00 of the following day.
codis_df_all['ObsTime'] = codis_df_all['ObsTime'] + datetime.timedelta(hours=1)

# to_csv does not create missing directories, so make sure `data/` exists first.
os.makedirs('data', exist_ok=True)
# The file is named after the year of the last crawled day (same value the leaked
# loop variable held), taken from date_list so it does not depend on loop state.
export_year = date_list[-1][:4]
codis_df_all.to_csv(f'data/codis_{export_year}.csv', index=False)