In [398]:
import pandas as pd
import zipfile
%matplotlib inline
import matplotlib.pyplot as plt
from ydata_profiling import ProfileReport
import seaborn as sns
from pandas.plotting import scatter_matrix
import math
import os
import numpy as np

from sklearn.compose import make_column_selector, make_column_transformer, ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
from sklearn.feature_selection import SelectFromModel



import matplotlib.font_manager	
plt.rcParams['font.sans-serif'] = ['Taipei Sans TC Beta']
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 1000)
pd.set_option('display.expand_frame_repr', False)


In [399]:
def twd97_to_lonlat(x=174458.0,y=2525824.0):
    """
    Parameters
    ----------
    x : float
        TWD97 coord system. The default is 174458.0.
    y : float
        TWD97 coord system. The default is 2525824.0.
    Returns
    -------
    list
        [longitude, latitude]
    """
    
    a = 6378137
    b = 6356752.314245
    long_0 = 121 * math.pi / 180.0
    k0 = 0.9999
    dx = 250000
    dy = 0
    
    e = math.pow((1-math.pow(b, 2)/math.pow(a,2)), 0.5)
    
    x -= dx
    y -= dy
    
    M = y / k0
    
    mu = M / ( a*(1-math.pow(e, 2)/4 - 3*math.pow(e,4)/64 - 5 * math.pow(e, 6)/256))
    e1 = (1.0 - pow((1   - pow(e, 2)), 0.5)) / (1.0 +math.pow((1.0 -math.pow(e,2)), 0.5))
    
    j1 = 3*e1/2-27*math.pow(e1,3)/32
    j2 = 21 * math.pow(e1,2)/16 - 55 * math.pow(e1, 4)/32
    j3 = 151 * math.pow(e1, 3)/96
    j4 = 1097 * math.pow(e1, 4)/512
    
    fp = mu + j1 * math.sin(2*mu) + j2 * math.sin(4* mu) + j3 * math.sin(6*mu) + j4 * math.sin(8* mu)
    
    e2 = math.pow((e*a/b),2)
    c1 = math.pow(e2*math.cos(fp),2)
    t1 = math.pow(math.tan(fp),2)
    r1 = a * (1-math.pow(e,2)) / math.pow( (1-math.pow(e,2)* math.pow(math.sin(fp),2)), (3/2))
    n1 = a / math.pow((1-math.pow(e,2)*math.pow(math.sin(fp),2)),0.5)
    d = x / (n1*k0)
    
    q1 = n1* math.tan(fp) / r1
    q2 = math.pow(d,2)/2
    q3 = ( 5 + 3 * t1 + 10 * c1 - 4 * math.pow(c1,2) - 9 * e2 ) * math.pow(d,4)/24
    q4 = (61 + 90 * t1 + 298 * c1 + 45 * math.pow(t1,2) - 3 * math.pow(c1,2) - 252 * e2) * math.pow(d,6)/720
    lat = fp - q1 * (q2 - q3 + q4)
    
    
    q5 = d
    q6 = (1+2*t1+c1) * math.pow(d,3) / 6
    q7 = (5 - 2 * c1 + 28 * t1 - 3 * math.pow(c1,2) + 8 * e2 + 24 * math.pow(t1,2)) * math.pow(d,5) / 120
    lon = long_0 + (q5 - q6 + q7) / math.cos(fp)
    
    lat = (lat*180) / math.pi
    lon = (lon*180) / math.pi
    return [lon, lat]

In [400]:
zip_file_path = "30_Training Dataset_V2.zip"
target_csv_file = "training_data.csv"

with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    if target_csv_file in zip_ref.namelist():
        zip_ref.extract(target_csv_file, path="temp_folder")
        csv_path = f"temp_folder/{target_csv_file}"
        train = pd.read_csv(csv_path)

zip_file_path = "30_Public Dataset_Public Sumission Template_v2.zip"
target_csv_file = "public_dataset.csv"

with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    if target_csv_file in zip_ref.namelist():
        zip_ref.extract(target_csv_file, path="temp_folder")
        csv_path = f"temp_folder/{target_csv_file}"
        test = pd.read_csv(csv_path)

In [401]:
train['long'], train['lat'] = zip(*train.apply(lambda row: twd97_to_lonlat(row['橫坐標'], row['縱坐標']), axis=1))
test['long'], test['lat'] = zip(*test.apply(lambda row: twd97_to_lonlat(row['橫坐標'], row['縱坐標']), axis=1))


---

## Bus

In [402]:
columns_to_read = ['縣市', '站點UID', 'lat', 'lng']
bus_df = pd.read_csv('30_Training Dataset_V2\external_data\公車站點資料.csv',
                     usecols = columns_to_read)
bus_df['縣市'] = bus_df['縣市'].astype('category')
bus_df.rename(columns={"lng": "經度_車站", "lat" : "緯度_車站"}, inplace=True)

In [403]:
def haversine(lat1, lon1, lat2, lon2):
    # 將經緯度從度數轉換為弧度
    lat1 = math.radians(lat1)
    lon1 = math.radians(lon1)
    lat2 = math.radians(lat2)
    lon2 = math.radians(lon2)

    # 地球的半徑（平均值，單位：公里）
    earth_radius = 6371.0

    # Haversine公式
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = math.sin(dlat/2)**2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon/2)**2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1-a))
    distance = earth_radius * c

    return distance

In [404]:
def deal_bus_station(data: pd.DataFrame,
                     bus_data: pd.DataFrame):
    result_df = pd.DataFrame(columns=['ID', '站點數'])

    for city in data['縣市'].unique():
        sub_data = data.loc[(data['縣市'] == city), ['ID', '縣市', 'long', 'lat']]
        merge_bus = sub_data.merge(bus_data, on='縣市')
        merge_bus['distance'] = merge_bus.apply(lambda x: haversine(x['lat'], x['long'], x['緯度_車站'], x['經度_車站']), axis=1)
        
        merge_bus = merge_bus.loc[merge_bus['distance'] < 1.0]
        
        grouped = merge_bus.groupby('ID')['縣市'].count().reset_index()
        grouped.rename(columns={'縣市': '站點數'}, inplace=True)
        
        result_df = pd.concat([result_df, 
                            grouped],
                            ignore_index=True)

    
    return result_df



In [405]:
bus_df

Unnamed: 0,縣市,站點UID,緯度_車站,經度_車站
0,台北市,TPE10072,25.060670,121.563390
1,台北市,TPE10073,25.062714,121.522301
2,台北市,TPE10074,25.062830,121.563620
3,台北市,TPE10075,25.062669,121.525381
4,台北市,TPE10076,25.062286,121.566155
...,...,...,...,...
111756,高雄市,KHH9951,22.896550,120.519450
111757,高雄市,KHH996,22.569945,120.360106
111758,高雄市,KHH997,22.569945,120.360106
111759,高雄市,KHH998,22.569993,120.357784


In [406]:
#result_df_test = deal_bus_station(test, bus_df)

In [407]:
#result_df_test.to_csv("external_data_2/bus_station_test.csv", index=False)

In [408]:
#result_df_train= deal_bus_station(train, bus_df)

In [409]:
#result_df_train.to_csv("external_data_2/bus_station_train.csv", index=False)

---

In [410]:
path = "other data source/全域/2022臺灣地區地名資料_具有地標意義公共設施類_"
path2 = "other data source/全域/2023臺灣地區地名資料_具有地標意義公共設施類_"

In [411]:
公共設施_中彰投 = pd.read_csv(path + "中彰投.csv")
公共設施_東部及外島 = pd.read_csv(path + "東部及外島.csv")
公共設施_北北基 = pd.read_csv(path2 + "北北基.csv")
公共設施_桃竹苗 = pd.read_csv(path2 + "桃竹苗.csv")
公共設施_雲嘉南 = pd.read_csv(path2 + "雲喜南.csv")
公共設施_高屏 = pd.read_csv(path2 + "高屏.csv")

In [412]:
Facilities_Landmarks = pd.concat([公共設施_中彰投, 公共設施_東部及外島, 公共設施_北北基, 公共設施_桃竹苗, 公共設施_雲嘉南, 公共設施_高屏])


In [413]:
Facilities_Landmarks.dropna(subset=["place_name"], inplace=True)

In [414]:
Facilities_Landmarks = Facilities_Landmarks[["place_name", "county", "town", "longtitude", "latitude"]]

Facilities_Landmarks.rename(columns={"longtitude": "經度", "latitude" : "緯度", "county" : "縣市"}, inplace=True)

* BAD

In [415]:
Bad_Facilities_Landmarks = Facilities_Landmarks[Facilities_Landmarks["place_name"].str.contains('垃圾|廠|工場|焚|回收|動物之|流浪|儲|電所|墳|墓|殯|宮|廟|塚|工業區|骨塔|骨碑|精神|之家|監獄|夜市|市場|看守所|觀護所|派出所|寺|豬')]
Bad_Facilities_Landmarks.dropna(subset=["經度"], inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Bad_Facilities_Landmarks.dropna(subset=["經度"], inplace=True)


In [416]:
Bad_high_F_L = Bad_Facilities_Landmarks[Bad_Facilities_Landmarks["place_name"].str.contains('垃圾|廠|工場|焚|回收|動物之|流浪|儲|墳|墓|殯|塚|工業區|骨塔|骨碑|監獄|豬')]
Bad_mid_F_L = Bad_Facilities_Landmarks[Bad_Facilities_Landmarks["place_name"].str.contains('電所')]
Bad_low_F_L =Bad_Facilities_Landmarks[
    ~Bad_Facilities_Landmarks.index.isin(Bad_high_F_L.index) &
    ~Bad_Facilities_Landmarks.index.isin(Bad_mid_F_L.index)
]

In [417]:
#some cleaning
Bad_high_F_L = Bad_high_F_L[~Bad_high_F_L['place_name'].str.contains('小學|服務|郵局|游泳池')]
Bad_low_F_L = Bad_low_F_L[~Bad_low_F_L['place_name'].str.contains('小學|服務|郵局|游泳池')]

In [418]:
def deal_distance_and_count(data: pd.DataFrame,
                     extenal_data: pd.DataFrame, new_col: str, key: str):
    result_df = pd.DataFrame(columns=['ID', new_col])

    for city in data[key].unique():
        sub_data = data.loc[(data[key] == city), ['ID', key, 'long', 'lat']]
        merge_F_L = sub_data.merge(extenal_data, on=key)
        merge_F_L['distance'] = merge_F_L.apply(lambda x: haversine(x['lat'], x['long'], x['緯度'], x['經度']), axis=1)
        
        merge_F_L = merge_F_L.loc[merge_F_L['distance'] < 0.4]
        
        grouped = merge_F_L.groupby('ID')[key].count().reset_index()
        grouped.rename(columns={key: new_col}, inplace=True)
        
        result_df = pd.concat([result_df, 
                            grouped],
                            ignore_index=True)

    
    return result_df

In [433]:
def merged_bad_score(data : pd.DataFrame):
    bad_high = deal_distance_and_count(data, Bad_high_F_L, "高度壞地標數", key = "縣市")
    bad_mid = deal_distance_and_count(data, Bad_mid_F_L, "中度壞地標數", key = "縣市")
    bad_low = deal_distance_and_count(data, Bad_low_F_L, "輕度壞地標數", key = "縣市")
    
    merged_bad = pd.merge(bad_high, bad_mid, on="ID", how="outer")
    merged_bad = pd.merge(merged_bad, bad_low, on="ID", how="outer")
    merged_bad = merged_bad.fillna(0)
    merged_bad["bad_score"] = merged_bad["高度壞地標數"] * 1.2 + merged_bad["中度壞地標數"] * 1.15 + merged_bad["輕度壞地標數"] * 1.1
    return  merged_bad

In [434]:
merged_bad_train = merged_bad_score(train)

Unnamed: 0,ID,高度壞地標數,中度壞地標數,輕度壞地標數,bad_score
0,TR-11367,1,0,2,3.4
1,TR-11633,1,0,2,3.4
2,TR-3018,1,0,2,3.4
3,TR-3488,1,0,0,1.2
4,TR-4052,1,0,3,4.5
...,...,...,...,...,...
3486,TR-3390,0,0,2,2.2
3487,TR-3828,0,0,2,2.2
3488,TR-6190,0,0,1,1.1
3489,TR-7657,0,0,1,1.1


In [436]:
merged_bad_test = merged_bad_score(test)

In [438]:
merged_bad_train.to_csv("external_data_2/bad_facilities_landmarks_train.csv", index=False)
merged_bad_test.to_csv("external_data_2/bad_facilities_landmarks_test.csv", index=False)

* Good

圖書館

百貨、商圈



In [439]:
good_Facilities_Landmarks = Facilities_Landmarks[Facilities_Landmarks["place_name"].str.contains('圖書|公園|綠地|園區|遊憩')]

In [440]:
merged_good_train = deal_distance_and_count(train, good_Facilities_Landmarks, "公園圖書館", "縣市")
merged_good_test= deal_distance_and_count(test, good_Facilities_Landmarks, "公園圖書館", "縣市")

In [442]:
merged_good_train.to_csv("external_data_2/good_facilities_landmarks_train.csv", index=False)
merged_good_test.to_csv("external_data_2/good_facilities_landmarks_test.csv", index=False)

In [387]:
Facilities_Landmarks[Facilities_Landmarks["place_name"].str.contains('遊憩')]

Unnamed: 0,place_name,縣市,town,經度,緯度
119,猴探井遊憩區,南投縣,南投市,120.6321,23.9085
683,名間鄉921斜塔遊憩區,南投縣,名間鄉,120.711,23.8329
1224,昆陽遊憩區停車場,南投縣,仁愛鄉,121.2727,24.1222
2285,虎山岩古蹟遊憩區停車場,彰化縣,花壇鄉,120.5604,24.0435
6201,黎明兒童遊憩場,臺中市,南屯區,120.6303,24.1564
164,臺東大鳥遊憩區,臺東縣,大武鄉,120.912446,22.381453
165,金龍湖遊憩區,臺東縣,大武鄉,120.890106,22.350005
212,花東縱谷風景區紅葉溫泉遊憩區,臺東縣,延平鄉,121.068878,22.890836
394,花東縱谷風景區羅山遊憩區,花蓮縣,富里鄉,121.270861,23.202046
599,奎壁山遊憩區,澎湖縣,湖西鄉,119.671633,23.599468
