In [1]:
import os
import urllib
import datetime
import numpy as np
import pandas as pd

## Const
Define the data directory paths, the download date range, and the URL of the station-list web page.

In [2]:
# The *string literal* '__file__' (not the __file__ variable) is made absolute and
# its last component is dropped, so this is the directory that os.path.abspath
# resolves relative paths against, i.e. the current working directory.
BATH_PATH = os.path.split(os.path.abspath('__file__'))[0]

# Directory that holds meta.json and the per-station CSV files.
ETC_PATH = os.path.join(BATH_PATH, "data/central-weather-bureau(observation)")

# Date range to download.
# NOTE(review): the download loop passes END to pd.date_range, which includes the
# end point, so 3 March 2022 is downloaded as well -- confirm that is intended.
START = datetime.date(year=2022, month=3, day=1)
END = datetime.date(year=2022, month=3, day=2 + 1)

# Page whose HTML tables list the weather stations.
WEBPAGE = "https://e-service.cwb.gov.tw/wdps/obs/state.htm"

## Meta
Read the station-list web page and extract the station metadata.

In [3]:
# Read every HTML table on the station-list page as DataFrames and keep the first
# (0: active stations, 1: decommissioned stations, 2: stations whose data supply is suspended)
response = pd.read_html(WEBPAGE)[0]

# Extract station number, station name, altitude (meter), city, longitude,
# latitude, station address and data start date, then give them English names
meta = response.loc[:, ["站號", "站名", "海拔高度(m)", "城市", "經度", "緯度", "地址", "資料起始日期"]]
meta = meta.rename(columns={
    "站號": "StationID", "站名": "StationName", "海拔高度(m)": "Altitude",
    "城市": "City", "經度": "Lon", "緯度": "Lat", "地址": "Address",
    "資料起始日期": "StartDate"
})

# to_json does not create missing parent directories, so make sure the output
# directory exists; otherwise a fresh checkout fails on the save below
os.makedirs(ETC_PATH, exist_ok=True)

# Save as meta data (force_ascii=False writes the Chinese text as-is instead of \u escapes)
meta.to_json(os.path.join(ETC_PATH, "meta.json"), orient='records', force_ascii=False)
meta.head(3)

Unnamed: 0,StationID,StationName,Altitude,City,Lon,Lat,Address,StartDate
0,466850,五分山雷達站,756.0,新北市,121.781205,25.071182,瑞芳區靜安路四段1巷1號,1988/07/01
1,466880,板橋,9.7,新北市,121.442017,24.997647,板橋區大觀路二段265巷62號,2002/01/01
2,466900,淡水,19.0,新北市,121.448906,25.164889,淡水區中正東路42巷6號,1942/10/01


## Downloader 

In [6]:
def data_downloader(stn, st_name, date, altitude):
    """Download one station's observation table for one day.

    Parameters
    ----------
    stn : station number, placed in the ``station`` query parameter.
    st_name : station name; it is percent-encoded twice before being placed in
        the ``stname`` query parameter.
    date : value placed in the ``datepicker`` query parameter and copied into
        the ``Date`` column of the result.
    altitude : value placed in the ``altitude`` query parameter.

    Returns
    -------
    pandas.DataFrame
        The second table of the response page, with its three-level header
        flattened to the third level and a leading ``Date`` column added.
    """
    # `import urllib` alone does not guarantee that the `urllib.parse` submodule
    # is loaded; import it explicitly instead of relying on another library
    # having imported it first.
    import urllib.parse

    # Percent-encode the name twice (the second pass encodes the '%' signs too).
    # NOTE(review): presumably the endpoint expects double encoding -- confirm.
    st_name = urllib.parse.quote(urllib.parse.quote(st_name))
    url = f"https://e-service.cwb.gov.tw/HistoryDataQuery/DayDataController.do?command=viewMain&station={stn}&stname={st_name}&datepicker={date}&altitude={altitude}"
    data = pd.read_html(url, encoding='utf-8')[1] # 0: Page Title, 1: Data Content
    # Header levels are 0: Short Name, 1: Long Name(tw), 2: Long Name(en); keep level 2
    data.columns = [levels[2] for levels in data.columns]
    data.insert(loc=0, column="Date", value=date)
    return data

In [7]:
# Read the meta data. StationID is forced to text: read_json infers dtypes by
# default, and a column of purely numeric-looking IDs would be turned into
# numbers, which breaks the `.str` filter below.
meta = pd.read_json(os.path.join(ETC_PATH, "meta.json"), dtype={"StationID": str})

# Only keep the ground weather stations (StationID starts with 46) ...
meta = meta[meta['StationID'].str.contains("^46")].reset_index(drop=True)
# ... whose data start date is on or before START
meta = meta[pd.to_datetime(meta['StartDate']) <= pd.to_datetime(START)].reset_index(drop=True)
meta.head(3)

Unnamed: 0,StationID,StationName,Altitude,City,Lon,Lat,Address,StartDate
0,466850,五分山雷達站,756.0,新北市,121.781205,25.071182,瑞芳區靜安路四段1巷1號,1988/07/01
1,466880,板橋,9.7,新北市,121.442017,24.997647,板橋區大觀路二段265巷62號,2002/01/01
2,466900,淡水,19.0,新北市,121.448906,25.164889,淡水區中正東路42巷6號,1942/10/01


In [None]:
# The requested days are the same for every station, so build the list once
# instead of once per station. pd.date_range includes both START and END.
delta = pd.date_range(start=START, end=END).tolist()

# For every station in the list: download each requested day, stack the daily
# tables, and write one CSV per station
for _index, row in meta.iterrows():
    daily_frames = [
        data_downloader(row["StationID"], row["StationName"], str(date.date()), row["Altitude"])
        for date in delta
    ]
    data = pd.concat(daily_frames) \
        .sort_values(['Date', 'ObsTime']) \
        .reset_index(drop=True)
    # Each insert at position 1 pushes the earlier ones to the right, so the
    # columns after "Date" end up as City, Lon, Lat, Station
    data.insert(loc=1, column="Station", value=row["StationID"])
    data.insert(loc=1, column="Lat", value=row["Lat"])
    data.insert(loc=1, column="Lon", value=row["Lon"])
    data.insert(loc=1, column="City", value=row["City"])
    data.to_csv(os.path.join(ETC_PATH, f'{row["StationID"]}.csv'), index=False)
