In [22]:
import csv
import os
import time
import zipfile
import glob
import requests
import xml.etree.ElementTree as ET
from function import *
from tqdm import tqdm

# Set path
DOWNLOAD_FOLDER = 'data/raw_data'
CSV_FILE = 'data/open_data_download_url.csv'

# Time Series data Preprocess / concat
- Tide
- Earthquake
- Taipei_border
- Monthly precipitation
- River Level
- Ground water level
- sinkhole data

# Spatial data preprocess
- Soil_liquefaction
- pipes
- Road Properties
- pavement
- Building volume and floor count
- Underground structure

# Time Series Data Preprocessing

## Tide

In [11]:
tide_path = os.path.join(DOWNLOAD_FOLDER, 'C-B0052-001.json')
tide_df = pd.read_json(tide_path, encoding="utf-8")
tide_df.head()

Unnamed: 0,cwaopendata
@xmlns,urn:cwa:gov:tw:cwacommon:0.1
Identifier,9f9d5ed5-c958-410f-8638-5cde5724aa21
Dataid,C-B0052-001
DatasetName,海象氣候統計
Sender,od@cwa.gov.tw


In [13]:
# Data Overview
station_data = tide_df['cwaopendata']['Resources']['Resource']['Data']['SeaSurfaceObs']['Location']

# Data Extraction
station_num = len(station_data)
tide_record = []

for station in range(1, station_num):
    data_year_list = station_data[station]['StationObsStatistics']['DataYear']
    month_num = len(station_data[station]['StationObsStatistics']['Monthly'])

    for month in range(12, month_num):
        year_index = (month // 12) - 1
        temp = {
            'StationID': station_data[station]['Station']['StationID'],
            'StationName': station_data[station]['Station']['StationName'],
            'StationNameEN': station_data[station]['Station']['StationNameEN'],
            'StationLatitude': station_data[station]['Station']['StationLatitude'],
            'StationLongitude': station_data[station]['Station']['StationLongitude'],
            'StationAttribute': station_data[station]['Station']['StationAttribute'],
            'Description': station_data[station]['Station']['Description'],
            'County': station_data[station]['Station']['County']['CountyName'],
            'Town': station_data[station]['Station']['Town']['TownName'],
            'DataYear': data_year_list[year_index],
            'DataMonth': station_data[station]['StationObsStatistics']['Monthly'][month]['DataMonth'],
            'HighestHighWaterLevel': station_data[station]['StationObsStatistics']['Monthly'][month]['HighestHighWaterLevel'],
            'HighestAstronomicalTide': station_data[station]['StationObsStatistics']['Monthly'][month]['HighestAstronomicalTide'],
            'MeanHighWaterLevel': station_data[station]['StationObsStatistics']['Monthly'][month]['MeanHighWaterLevel'],
            'MeanTideLevel': station_data[station]['StationObsStatistics']['Monthly'][month]['MeanTideLevel'],
            'MeanLowWaterLevel': station_data[station]['StationObsStatistics']['Monthly'][month]['MeanLowWaterLevel'],
            'LowestAstronomicalTide': station_data[station]['StationObsStatistics']['Monthly'][month]['LowestAstronomicalTide'],
            'LowestLowWaterLevel': station_data[station]['StationObsStatistics']['Monthly'][month]['LowestLowWaterLevel'],
            'MeanTidalRange': station_data[station]['StationObsStatistics']['Monthly'][month]['MeanTidalRange'],
            'MaxAstronomicalTidalRange': station_data[station]['StationObsStatistics']['Monthly'][month]['MaxAstronomicalTidalRange'],
            'MeanHighWaterOfSpringTide': station_data[station]['StationObsStatistics']['Monthly'][month]['MeanHighWaterOfSpringTide'],
            'MeanLowWaterOfSpringTide': station_data[station]['StationObsStatistics']['Monthly'][month]['MeanLowWaterOfSpringTide'],
        }
        tide_record.append(temp)

tide_record = pd.DataFrame(tide_record)
print(tide_record.shape)
tide_record.head()

(8016, 22)


Unnamed: 0,StationID,StationName,StationNameEN,StationLatitude,StationLongitude,StationAttribute,Description,County,Town,DataYear,...,HighestAstronomicalTide,MeanHighWaterLevel,MeanTideLevel,MeanLowWaterLevel,LowestAstronomicalTide,LowestLowWaterLevel,MeanTidalRange,MaxAstronomicalTidalRange,MeanHighWaterOfSpringTide,MeanLowWaterOfSpringTide
0,1102,淡水潮位站,Tamsui,25.18,121.42,潮位站,相對臺灣高程基準TWVD2001基隆海平面;Relative to the TWVD2001...,新北市,淡水區,2004,...,1.269,1.324,0.215,-0.796,-1.09,-1.144,2.12,2.359,0.97,-0.665
1,1102,淡水潮位站,Tamsui,25.18,121.42,潮位站,相對臺灣高程基準TWVD2001基隆海平面;Relative to the TWVD2001...,新北市,淡水區,2004,...,1.216,1.25,0.161,-0.78,-1.028,-1.156,2.03,2.244,0.96,-0.685
2,1102,淡水潮位站,Tamsui,25.18,121.42,潮位站,相對臺灣高程基準TWVD2001基隆海平面;Relative to the TWVD2001...,新北市,淡水區,2004,...,1.345,1.326,0.211,-0.878,-0.92,-1.198,2.204,2.265,0.982,-0.636
3,1102,淡水潮位站,Tamsui,25.18,121.42,潮位站,相對臺灣高程基準TWVD2001基隆海平面;Relative to the TWVD2001...,新北市,淡水區,2004,...,1.532,1.413,0.246,-0.869,-0.968,-1.235,2.282,2.5,1.087,-0.582
4,1102,淡水潮位站,Tamsui,25.18,121.42,潮位站,相對臺灣高程基準TWVD2001基隆海平面;Relative to the TWVD2001...,新北市,淡水區,2004,...,1.606,1.431,0.29,-0.744,-0.968,-1.076,2.175,2.574,1.168,-0.518


## road case

## Earthquake

In [20]:
quake_path = os.path.join(DOWNLOAD_FOLDER, 'E-A0073-002', '*')
[x.split("地震目錄-歷史地震目錄\\")[-1] for x in sorted(glob.glob(quake_path))]

['data/raw_data\\E-A0073-002\\CWA-EQ-Catalog-1990.xml',
 'data/raw_data\\E-A0073-002\\CWA-EQ-Catalog-1991.xml',
 'data/raw_data\\E-A0073-002\\CWA-EQ-Catalog-1992.xml',
 'data/raw_data\\E-A0073-002\\CWA-EQ-Catalog-1993.xml',
 'data/raw_data\\E-A0073-002\\CWA-EQ-Catalog-1994.xml',
 'data/raw_data\\E-A0073-002\\CWA-EQ-Catalog-1995.xml',
 'data/raw_data\\E-A0073-002\\CWA-EQ-Catalog-1996.xml',
 'data/raw_data\\E-A0073-002\\CWA-EQ-Catalog-1997.xml',
 'data/raw_data\\E-A0073-002\\CWA-EQ-Catalog-1998.xml',
 'data/raw_data\\E-A0073-002\\CWA-EQ-Catalog-1999.xml',
 'data/raw_data\\E-A0073-002\\CWA-EQ-Catalog-2000.xml',
 'data/raw_data\\E-A0073-002\\CWA-EQ-Catalog-2001.xml',
 'data/raw_data\\E-A0073-002\\CWA-EQ-Catalog-2002.xml',
 'data/raw_data\\E-A0073-002\\CWA-EQ-Catalog-2003.xml',
 'data/raw_data\\E-A0073-002\\CWA-EQ-Catalog-2004.xml',
 'data/raw_data\\E-A0073-002\\CWA-EQ-Catalog-2005.xml',
 'data/raw_data\\E-A0073-002\\CWA-EQ-Catalog-2006.xml',
 'data/raw_data\\E-A0073-002\\CWA-EQ-Catalog-200

In [24]:
# Path & Year Setting
year = [2018, 2019, 2020, 2021, 2022, 2023]
earthquake_df = pd.DataFrame()

for year in year:
    xml_path = os.path.join(DOWNLOAD_FOLDER, 'E-A0073-002', f'CWA-EQ-Catalog-{year}.xml')
    # Read XML
    tree = ET.parse(xml_path)
    root = tree.getroot()
    df_list = []
    for i in range(2, len(root[8][1])):
        data = root[8][1][i]
        if len(data) == 14:
            temp = {
                "發震時間(OriginTime)": data[0].text,
                "震央經度(EpicenterLng)": float(data[1].text), 
                "震央緯度(EpicenterLat)": float(data[2].text),
                "震源深度(FocalDepth)": data[3].text,
                "芮氏規模(LocalMagnitude)": data[4].text,
                "定位測站個數(StationNumber)": data[5].text,
                "定位相位個數(PhaseNumber)": data[6].text,
                "震央距(MinimumEpicenterDistance)": data[7].text,
                "最大空餘角度(gap)": data[8].text,
                "震波走時殘差(rms)": data[9].text, 
                "水平標準偏差(erh)": data[10].text,
                "垂直標準偏差(erz)": data[11].text if len(data) > 13 else None,
                "定位品質(Quality)": data[12].text if len(data) > 13 else data[11].text,
                "定位回顧狀態(ReviewStatus)": data[13].text if len(data) > 13 else data[12].text,
            }
            df_list.append(temp)
        elif len(data) == 13:
            temp = {
                "發震時間(OriginTime)": data[0].text,
                "震央經度(EpicenterLng)": float(data[1].text), 
                "震央緯度(EpicenterLat)": float(data[2].text),
                "震源深度(FocalDepth)": data[3].text,
                "芮氏規模(LocalMagnitude)": data[4].text,
                "定位測站個數(StationNumber)": data[5].text,
                "定位相位個數(PhaseNumber)": data[6].text,
                "震央距(MinimumEpicenterDistance)": data[7].text,
                "最大空餘角度(gap)": data[8].text,
                "震波走時殘差(rms)": data[9].text, 
                "水平標準偏差(erh)": data[10].text,
                "垂直標準偏差(erz)": None,
                "定位品質(Quality)": data[11].text,
                "定位回顧狀態(ReviewStatus)": data[12].text,
            }
            df_list.append(temp)
        elif len(data) == 12:
            temp = {
                "發震時間(OriginTime)": data[0].text,
                "震央經度(EpicenterLng)": float(data[1].text), 
                "震央緯度(EpicenterLat)": float(data[2].text),
                "震源深度(FocalDepth)": data[3].text,
                "芮氏規模(LocalMagnitude)": data[4].text,
                "定位測站個數(StationNumber)": data[5].text,
                "定位相位個數(PhaseNumber)": data[6].text,
                "震央距(MinimumEpicenterDistance)": data[7].text,
                "最大空餘角度(gap)": data[8].text,
                "震波走時殘差(rms)": data[9].text, 
                "水平標準偏差(erh)": None,
                "垂直標準偏差(erz)": None,
                "定位品質(Quality)": data[10].text,
                "定位回顧狀態(ReviewStatus)": data[11].text
            }
            df_list.append(temp)
        elif len(data) < 12:
            temp = {
                "發震時間(OriginTime)": data[0].text,
                "震央經度(EpicenterLng)": float(data[1].text), 
                "震央緯度(EpicenterLat)": float(data[2].text),
                "震源深度(FocalDepth)": data[3].text,
                "芮氏規模(LocalMagnitude)": data[4].text,
                "定位測站個數(StationNumber)": data[5].text,
                "定位相位個數(PhaseNumber)": data[6].text,
                "震央距(MinimumEpicenterDistance)": data[7].text,
                "最大空餘角度(gap)": data[8].text,
                "震波走時殘差(rms)": None, 
                "水平標準偏差(erh)": None,
                "垂直標準偏差(erz)": None,
                "定位品質(Quality)": data[9].text,
                "定位回顧狀態(ReviewStatus)": data[10].text
            }
            df_list.append(temp)
        df_year = pd.DataFrame(df_list)
    earthquake_df = pd.concat([earthquake_df, df_year], axis=0)
    
earthquake_df.shape    #35.8s

(13922, 14)

In [None]:
# Tide data
tide_df = ROOT_PATH + r"\03 Data\Processed\潮汐\臺灣各地歷史潮位觀測逐年月統計.csv"
tide_df = pd.read_csv(tide_df, encoding="utf-8")

# Sinkhole data
raw_road_case = ROOT_PATH + r"\03 Data\Processed\道管系統坑洞案件_108-112_Chu加案件標記_20240201.csv"
raw_road_case = pd.read_csv(raw_road_case, encoding="utf-8")

# Earthquake data
raw_earthquake = ROOT_PATH + r"\03 Data\Processed\地震資料\cwa_earthquake_catalog_2018-2023.csv"
raw_earthquake = pd.read_csv(raw_earthquake, encoding="utf-8")

# TP shapefile
raw_tp = ROOT_PATH + r"\03 Data\Raw\臺北市區界圖_20220915\G97_A_CADIST_P.shp"
raw_tp = gpd.read_file(raw_tp, encoding="utf-8")

# Monthly precipitation
raw_precipitation = ROOT_PATH + r"\03 Data\Processed\cwb_monthly_precipitation_2019-2023.csv"
raw_precipitation = pd.read_csv(raw_precipitation, encoding="utf-8")

# River-Level
raw_river_level = ROOT_PATH + r"\03 Data\Processed\河川水位\river_level.csv"
raw_river_level = pd.read_csv(raw_river_level, encoding='utf-8')

# Underground-water-level
raw_ugwater_level = ROOT_PATH + r"\03 Data\Processed\地下水\underground_water_level.csv"
raw_ugwater_level = pd.read_csv(raw_ugwater_level, encoding='utf-8')