In [1]:
import shap # 0.46.0
import pandas as pd
import numpy as np
import requests
import time
import os

from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
import xgboost as xgb # 2.1.1
import matplotlib.pyplot as plt # 3.9.2
import holidays # 0.56
from geopy.geocoders import Nominatim
import pickle

train = pd.read_csv('../train_taxi_tims_U.csv', encoding='utf-8-sig', engine='python')
test = pd.read_csv('../test_taxi_tims_U.csv', encoding='utf-8-sig', engine='python')

train = train[(train['x_axis'] != 0) & (train['y_axis'] != 0) & (train['to_x_axis'] != 0) & (train['to_y_axis'] != 0) & (train['distance'] != 0)]
test = test[(test['x_axis'] != 0) & (test['y_axis'] != 0) & (test['to_x_axis'] != 0) & (test['to_y_axis'] != 0) & (test['distance'] != 0)]

train['datetime'] = pd.to_datetime(train['datetime'])
train['minute'] = train['datetime'].dt.minute
train['minute'] = train['minute'] 
train['hour'] = train['datetime'].dt.hour
train['weekday'] = train['datetime'].dt.weekday
train['month'] = train['datetime'].dt.month
train['day'] = train['datetime'].dt.day
train['year'] = train['datetime'].dt.year
train.drop(['datetime'], axis=1, inplace=True)

test['datetime'] = pd.to_datetime(test['datetime'])
test['minute'] = test['datetime'].dt.minute
test['minute'] = test['minute']
test['hour'] = test['datetime'].dt.hour
test['weekday'] = test['datetime'].dt.weekday
test['month'] = test['datetime'].dt.month
test['day'] = test['datetime'].dt.day
test['year'] = test['datetime'].dt.year
test.drop(['datetime'], axis=1, inplace=True)

API = "gGryFchORUuq8hXITjVLWQ"
def get_weather(year, month, day, hour, minute):
    url = f"https://apihub.kma.go.kr/api/typ01/url/kma_sfctm2.php?tm1={time_str}&stn=133&help=1&authKey={API}"

    response = requests.get(url)
    raw_data = [i for i in response.text.split("\n")[-3].split(' ')if i!='']
    WS, TEMP, HUMI, RN = raw_data[3], raw_data[11], raw_data[13], raw_data[15]
    return WS, TEMP, HUMI, RN

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
if 'weather_data.csv' not in os.listdir():
    data = []
    for month in range(3,12):
        url = f"https://apihub.kma.go.kr/api/typ01/url/kma_sfctm3.php?tm1=2023{str(month).zfill(2)}010000&tm2=2023{str(month+1).zfill(2)}010000&stn=133&help=0&authKey={API}"
        response = requests.get(url)
        raw_data = [[i for i in response.text.split("\n")[j].split(' ')if i!=''] for j in range(4,len(response.text.split("\n")))]
        data.append(raw_data)

    for month in range(1,5):
        url = f"https://apihub.kma.go.kr/api/typ01/url/kma_sfctm3.php?tm1=2024{str(month).zfill(2)}010000&tm2=2024{str(month+1).zfill(2)}010000&stn=133&help=0&authKey={API}"
        response = requests.get(url)
        raw_data = [[i for i in response.text.split("\n")[j].split(' ')if i!=''] for j in range(4,len(response.text.split("\n")))]
        data.append(raw_data)

    total_data = []
    for i in range(len(data)):
        for j in range(len(data[i])-1):
            try:
                new_data = [data[i][j][0], data[i][j][3], data[i][j][11], data[i][j][13], data[i][j][15]]
                total_data.append(new_data)
            except:
                continue

    #make the data into csv
    total_data = pd.DataFrame(total_data, columns=['datetime', 'WS', 'TEMP', 'HUMI', 'RN'])
    total_data.to_csv('weather_data.csv', index=False)

else:
    total_data = pd.read_csv('weather_data.csv', encoding='utf-8-sig', engine='python')

# change ['datetime'] to datetime type with format 'YYYYMMDDHHMM'
total_data['datetime'] = pd.to_datetime(total_data['datetime'], format='%Y%m%d%H%M')
total_data['hour'] = total_data['datetime'].dt.hour
total_data['month'] = total_data['datetime'].dt.month
total_data['day'] = total_data['datetime'].dt.day
total_data['year'] = total_data['datetime'].dt.year
total_data['holiday'] = total_data['datetime'].apply(lambda x: 1 if x in holidays.KR() else 0)
total_data.drop(['datetime'], axis=1, inplace=True)

train = pd.merge(train, total_data, on=['year', 'month', 'day', 'hour'], how='left')
test = pd.merge(test, total_data, on=['year', 'month', 'day', 'hour'], how='left')


In [3]:
cluster_num = 100
if 'kmeans_model.pkl' not in os.listdir():
    # cluster by x_axis and y_axis
    kmeans = KMeans(n_clusters=cluster_num, random_state=5).fit(train[['x_axis', 'y_axis']])
    train['cluster'] = kmeans.predict(train[['x_axis', 'y_axis']])
    test['cluster'] = kmeans.predict(test[['x_axis', 'y_axis']])

    # save the kmeans model
    import pickle
    with open('kmeans_model.pkl', 'wb') as f:
        pickle.dump(kmeans, f)

else:
    with open('kmeans_model.pkl', 'rb') as f:
        kmeans = pickle.load(f)
    train['cluster'] = kmeans.predict(train[['x_axis', 'y_axis']])
    test['cluster'] = kmeans.predict(test[['x_axis', 'y_axis']])

In [4]:
# exclude the cluster that has less than 900 data
cluster = train.groupby('cluster').size().reset_index()
cluster.columns = ['cluster', 'count']
# make the list of excluded cluster
excluded_cluster = cluster[cluster['count'] < 900]['cluster'].tolist()
remaining_cluster = cluster[cluster['count'] >= 900]['cluster'].tolist()

cluster = cluster[cluster['count'] >= 900]
train = train[train['cluster'].isin(cluster['cluster'])]
test = test[test['cluster'].isin(cluster['cluster'])]

In [7]:
excluded_cluster

[13, 15, 18, 27, 37, 41, 45, 47, 48, 50, 57, 60, 66, 70, 71, 85, 90, 97, 98]

In [3]:
ktv = pd.read_csv('/home/sb/taxi/data/Drinks/dj_ktv.csv', encoding='utf-8-sig', engine='python')
karaoke = pd.read_csv('/home/sb/taxi/data/Drinks/dj_karaoke.csv', encoding='utf-8-sig', engine='python')
hospital = pd.read_csv('/home/sb/taxi/data/Hospitals/dj_hospital.csv', encoding='utf-8-sig', engine='python')
small_hospital = pd.read_csv('/home/sb/taxi/data/Hospitals/dj_small_hospital.csv', encoding='utf-8-sig', engine='python')
camping = pd.read_csv('/home/sb/taxi/data/Hotels/dj_camping.csv', encoding='utf-8-sig', engine='python')
country_hotel = pd.read_csv('/home/sb/taxi/data/Hotels/dj_country_hotel.csv', encoding='utf-8-sig', engine='python')
foreign_hotel = pd.read_csv('/home/sb/taxi/data/Hotels/dj_foreign_hotel.csv', encoding='utf-8-sig', engine='python')
hotel = pd.read_csv('/home/sb/taxi/data/Hotels/dj_hotel.csv', encoding='utf-8-sig', engine='python')
tour_hotel = pd.read_csv('/home/sb/taxi/data/Hotels/dj_tour_hotel.csv', encoding='utf-8-sig', engine='python')

In [4]:
# rename 경도 to x_axis, 위도 to y_axis
ktv.rename(columns={'경도':'x_axis', '위도':'y_axis'}, inplace=True)
karaoke.rename(columns={'경도':'x_axis', '위도':'y_axis'}, inplace=True)
hospital.rename(columns={'경도':'x_axis', '위도':'y_axis'}, inplace=True)
small_hospital.rename(columns={'경도':'x_axis', '위도':'y_axis'}, inplace=True)
camping.rename(columns={'경도':'x_axis', '위도':'y_axis'}, inplace=True)
country_hotel.rename(columns={'경도':'x_axis', '위도':'y_axis'}, inplace=True)
foreign_hotel.rename(columns={'경도':'x_axis', '위도':'y_axis'}, inplace=True)
hotel.rename(columns={'경도':'x_axis', '위도':'y_axis'}, inplace=True)
tour_hotel.rename(columns={'경도':'x_axis', '위도':'y_axis'}, inplace=True)

In [7]:
ktv['cluster'] = kmeans.predict(ktv[['x_axis', 'y_axis']])
karaoke['cluster'] = kmeans.predict(karaoke[['x_axis', 'y_axis']])
hospital['cluster'] = kmeans.predict(hospital[['x_axis', 'y_axis']])
small_hospital['cluster'] = kmeans.predict(small_hospital[['x_axis', 'y_axis']])
camping['cluster'] = kmeans.predict(camping[['x_axis', 'y_axis']])
country_hotel['cluster'] = kmeans.predict(country_hotel[['x_axis', 'y_axis']])
foreign_hotel['cluster'] = kmeans.predict(foreign_hotel[['x_axis', 'y_axis']])
hotel['cluster'] = kmeans.predict(hotel[['x_axis', 'y_axis']])
tour_hotel['cluster'] = kmeans.predict(tour_hotel[['x_axis', 'y_axis']])

In [8]:
hospital['업태구분명'] = pd.Categorical(hospital['업태구분명']).codes
number_of_hospital_type = len(hospital['업태구분명'].unique())

small_hospital['업태구분명'] = pd.Categorical(small_hospital['업태구분명']).codes
number_of_small_hospital_type = len(small_hospital['업태구분명'].unique())

new_columns = ['sum_drinks', 'sum_hospitals', 'sum_hotels', 'sum_drinks_area', 
               'sum_hospital_rooms'] + [f'sum_hospital_type_{i}' for i in range(number_of_hospital_type)] + [f'sum_small_hospital_type_{i}' for i in range(number_of_small_hospital_type)]
train[new_columns] = 0

drink_dfs = [ktv, karaoke]
hospital_dfs = [hospital, small_hospital]
hotel_dfs = [hotel]


def sum_for_cluster(df, cluster, column=None):
    mask = df['cluster'] == cluster
    return len(df[mask]) if column is None else df.loc[mask, column].sum()

In [9]:
for i in range(cluster_num):
    if i in excluded_cluster:
        continue
    cluster_mask = train['cluster'] == i
    test_cluster_mask = test['cluster'] == i
    
    # Sum drinks and drink areas
    train.loc[cluster_mask, 'sum_drinks'] = sum(sum_for_cluster(df, i) for df in drink_dfs)
    train.loc[cluster_mask, 'sum_drinks_area'] = sum(sum_for_cluster(df, i, '시설총규모') for df in drink_dfs)
    
    # Sum hospitals and hospital rooms
    train.loc[cluster_mask, 'sum_hospitals'] = sum(sum_for_cluster(df, i) for df in hospital_dfs)
    train.loc[cluster_mask, 'sum_hospital_rooms'] = sum(sum_for_cluster(df, i, '병상수') for df in hospital_dfs)
    
    # Sum hospital types
    for j in range(number_of_hospital_type):
        train.loc[cluster_mask, f'sum_hospital_type_{j}'] = sum(
            sum_for_cluster(df[df['업태구분명'] == j], i) for df in hospital_dfs
        )
    
    for k in range(number_of_small_hospital_type):
        train.loc[cluster_mask, f'sum_small_hospital_type_{k}'] = sum(
            sum_for_cluster(df[df['업태구분명'] == k], i) for df in hospital_dfs
        )
    
    # Sum hotels
    train.loc[cluster_mask, 'sum_hotels'] = sum(sum_for_cluster(df, i) for df in hotel_dfs)

    # Test data
    test.loc[test_cluster_mask, 'sum_drinks'] = sum(sum_for_cluster(df, i) for df in drink_dfs)
    test.loc[test_cluster_mask, 'sum_drinks_area'] = sum(sum_for_cluster(df, i, '시설총규모') for df in drink_dfs)

    test.loc[test_cluster_mask, 'sum_hospitals'] = sum(sum_for_cluster(df, i) for df in hospital_dfs)
    test.loc[test_cluster_mask, 'sum_hospital_rooms'] = sum(sum_for_cluster(df, i, '병상수') for df in hospital_dfs)

    for j in range(number_of_hospital_type):
        test.loc[test_cluster_mask, f'sum_hospital_type_{j}'] = sum(
            sum_for_cluster(df[df['업태구분명'] == j], i) for df in hospital_dfs
        )

    for k in range(number_of_small_hospital_type):
        test.loc[test_cluster_mask, f'sum_small_hospital_type_{k}'] = sum(
            sum_for_cluster(df[df['업태구분명'] == k], i) for df in hospital_dfs
        )

    test.loc[test_cluster_mask, 'sum_hotels'] = sum(sum_for_cluster(df, i) for df in hotel_dfs)

  train.loc[cluster_mask, 'sum_drinks_area'] = sum(sum_for_cluster(df, i, '시설총규모') for df in drink_dfs)


In [10]:
# 동시간대에 cluster 별로 trip의 수를 count
train['count'] = 1
train_count = train.groupby(['cluster', 'hour', 'day', 'month', 'year']).count().reset_index()
train_count = train_count[['cluster', 'hour', 'day', 'month', 'year','count']]

train = pd.merge(train, train_count, on=['cluster', 'hour', 'day', 'month', 'year'], how='left')
train['count'] = train['count_y']
train.drop(['count_x', 'count_y'], axis=1, inplace=True)

test['count'] = 1
test_count = test.groupby(['cluster', 'hour', 'day', 'month', 'year']).count().reset_index()
test_count = test_count[['cluster', 'hour', 'day', 'month', 'year','count']]

test = pd.merge(test, test_count, on=['cluster', 'hour', 'day', 'month', 'year'], how='left')
test['count'] = test['count_y']
test.drop(['count_x', 'count_y'], axis=1, inplace=True)

In [12]:
# xgb regression
model = xgb.XGBRegressor(n_estimators=1000, max_depth=10, learning_rate=0.05, random_state=5)
train_columns = ['hour', 'weekday', 'month', 'day', 'WS', 'TEMP', 'HUMI', 'RN', 'cluster', 'holiday'] + new_columns

model.fit(train[train_columns], train['count'])
#save the model

with open('xgb_model.pkl', 'wb') as f:
    pickle.dump(model, f)

In [14]:
# test accuracy
pred = model.predict(test[train_columns])
print(np.mean(np.abs(pred - test['count'])))

1.7349732606222685


In [15]:
# shap
explainer = shap.Explainer(model, test[train_columns])
#save the explainer
with open('explainer.pkl', 'wb') as f:
    pickle.dump(explainer, f)

In [18]:
import folium
import pandas as pd

# cluster centers
data = {
    'name': ['cluster_' + str(i) for i in range(cluster_num) if i in remaining_cluster],
    'latitude': [kmeans.cluster_centers_[i][1] for i in range(cluster_num) if i in remaining_cluster],
    'longitude': [kmeans.cluster_centers_[i][0] for i in range(cluster_num) if i in remaining_cluster]
}
print(data)
# Create a DataFrame
df = pd.DataFrame(data)

# Calculate the center of the map
center_lat = df['latitude'].mean()
center_lon = df['longitude'].mean()

# Create a map
m = folium.Map(location=[center_lat, center_lon], zoom_start=12)

# Add markers to the map
for idx, row in df.iterrows():
    folium.Marker(
        location=[row['latitude'], row['longitude']],
        popup=row['name'],
        tooltip=row['name']
    ).add_to(m)

# Save the map
m.save("map_with_markers.html")

print("Map has been saved as 'map_with_markers.html'")

{'name': ['cluster_0', 'cluster_1', 'cluster_2', 'cluster_3', 'cluster_4', 'cluster_5', 'cluster_6', 'cluster_7', 'cluster_8', 'cluster_9', 'cluster_10', 'cluster_11', 'cluster_12', 'cluster_14', 'cluster_16', 'cluster_17', 'cluster_19', 'cluster_20', 'cluster_21', 'cluster_22', 'cluster_23', 'cluster_24', 'cluster_25', 'cluster_26', 'cluster_28', 'cluster_29', 'cluster_30', 'cluster_31', 'cluster_32', 'cluster_33', 'cluster_34', 'cluster_35', 'cluster_36', 'cluster_38', 'cluster_39', 'cluster_40', 'cluster_42', 'cluster_43', 'cluster_44', 'cluster_46', 'cluster_49', 'cluster_51', 'cluster_52', 'cluster_53', 'cluster_54', 'cluster_55', 'cluster_56', 'cluster_58', 'cluster_59', 'cluster_61', 'cluster_62', 'cluster_63', 'cluster_64', 'cluster_65', 'cluster_67', 'cluster_68', 'cluster_69', 'cluster_72', 'cluster_73', 'cluster_74', 'cluster_75', 'cluster_76', 'cluster_77', 'cluster_78', 'cluster_79', 'cluster_80', 'cluster_81', 'cluster_82', 'cluster_83', 'cluster_84', 'cluster_86', 'clust