In [1]:
import numpy as np
import pandas as pd
import folium
from folium import Choropleth, Circle, Marker
from folium.plugins import HeatMap, MarkerCluster


import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from datetime import datetime, timedelta
import math

In [2]:
# UTC_time 변경 함수

def date_convert(date_to_convert):
    pru = datetime.datetime.strptime(date_to_convert, '%a %b %d %H:%M:%S +0000 %Y')+ datetime.timedelta(hours=-240//60)
    return pru.strftime('%b %d %Y %H:%M:%S')

def tzo_convert(tzo_to_convert):
    return  timedelta(hours=tzo_to_convert/60)

In [6]:
tky_filepath = 'C:/Users/piai/Desktop/code_file/dataset_WWW2019/dataset_TSMC2014_TKY.csv'
custom_date_parser = lambda x: datetime.strptime(x, "%a %b %d %H:%M:%S +0000 %Y")

tky_data = pd.read_csv(tky_filepath,parse_dates=['utcTimestamp'],date_parser=custom_date_parser)

tky_data["utcTimestampOffset"] = tky_data.utcTimestamp + tky_data.timezoneOffset.apply(tzo_convert)

tky_data["hour"] = tky_data.utcTimestampOffset.dt.hour

# 야행성인지 체크하는 컬럼 생성. 저녁 6시부터 새벽 6시까지 활동한 POI는 야행성 Ture로 표시.
tky_data["nightlife"] = tky_data['hour'].apply(lambda x: 'True' if (18 <= x < 24) or (0 <= x < 6)else 'False')

# 요일이 무엇인지 0~6 순서로 설정. 월요일이 0, 일요일이 6.
tky_data["dayofweek"] = tky_data.utcTimestampOffset.dt.dayofweek

# 평일과 주말을 구분. 평일이면 True, 주말이면 False.
tky_data["weekday"] = tky_data['dayofweek'].apply(lambda x: 'True' if x <= 4 else 'False')

In [7]:
# 유저 아이디 별로 정렬 후
# 유저 아이디 행 수만큼만 len 돌려서 time delta 하고,
# 이대로 컬럼 하나 추가하기.
tky_data_sorted = tky_data.sort_values(by=['userId', 'utcTimestampOffset'], ascending=[True, True])
tky_data_sorted.reset_index(inplace= True, drop= True)

In [8]:
tky_data_sorted['time_diff'] = (tky_data_sorted.utcTimestampOffset.shift(-1) - tky_data_sorted.utcTimestampOffset)

In [9]:
tky_data_sorted['duplicate'] = False
tky_data_sorted['timedelta_min'] = tky_data_sorted['time_diff']/timedelta(minutes=1)

In [11]:
tky_data_sorted

Unnamed: 0,userId,venueId,venueCategoryId,venueCategory,latitude,longitude,timezoneOffset,utcTimestamp,utcTimestampOffset,hour,nightlife,dayofweek,weekday,time_diff,duplicate,timedelta_min
0,1,4b396b34f964a5204f5c25e3,4bf58dd8d48988d129951735,Train Station,35.749647,139.805145,540,2012-04-07 23:33:08,2012-04-08 08:33:08,8,False,6,False,0 days 00:30:27,False,30.450000
1,1,4b305a74f964a5201ef924e3,4bf58dd8d48988d1fd931735,Subway,35.669687,139.767254,540,2012-04-08 00:03:35,2012-04-08 09:03:35,9,False,6,False,0 days 00:07:13,False,7.216667
2,1,4b835f06f964a520330431e3,4bf58dd8d48988d1ff931735,Convention Center,35.668088,139.767315,540,2012-04-08 00:10:48,2012-04-08 09:10:48,9,False,6,False,0 days 01:07:12,False,67.200000
3,1,4bef4d2fb0b376b030d8dab3,4d954b0ea243a5684a65b473,Convenience Store,35.668338,139.766756,540,2012-04-08 01:18:00,2012-04-08 10:18:00,10,False,6,False,0 days 00:00:21,False,0.350000
4,1,4b835f06f964a520330431e3,4bf58dd8d48988d1ff931735,Convention Center,35.668088,139.767315,540,2012-04-08 01:18:21,2012-04-08 10:18:21,10,False,6,False,0 days 05:22:46,True,322.766667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
573698,2293,4c2e057d7d85a593408a53f3,4bf58dd8d48988d11b941735,Bar,35.640912,139.669393,540,2013-02-05 17:18:35,2013-02-06 02:18:35,2,True,2,True,0 days 22:09:00,False,1329.000000
573699,2293,4c2e057d7d85a593408a53f3,4bf58dd8d48988d11b941735,Bar,35.640912,139.669393,540,2013-02-06 15:27:35,2013-02-07 00:27:35,0,True,3,True,0 days 22:06:50,False,1326.833333
573700,2293,4b46f6bbf964a520042a26e3,4bf58dd8d48988d108941735,Dumpling Restaurant,35.667517,139.706108,540,2013-02-07 13:34:25,2013-02-07 22:34:25,22,True,3,True,3 days 22:48:04,False,5688.066667
573701,2293,4ec0d5979911a2ec5c6cdda6,4bf58dd8d48988d129951735,Train Station,35.628227,139.738712,540,2013-02-11 12:22:29,2013-02-11 21:22:29,21,True,0,True,1 days 00:26:01,False,1466.016667


In [12]:
for i in range(1, len(tky_data_sorted) -1):
    if i == len(tky_data_sorted) - 1:
        continue
    if tky_data_sorted.loc[i, "userId"] == tky_data_sorted.loc[i-1, "userId"]:
        if ((tky_data_sorted.loc[i, 'venueCategory'] == tky_data_sorted.loc[i-1, 'venueCategory']) & (tky_data_sorted.loc[i, 'timedelta_min'] < 60)) | (tky_data_sorted.loc[i-1, 'timedelta_min'] < 3):
            # 같은 아이디 내에서 카테고리가 연달아 중복되는데, POI 인증 시간차가 60min 미만인 경우 or 다음 POI까지 인증이 3min 미만인 경우(POI는 다르게 중복 인증하는 실수를 한 경우)는 중복이라고 체크.
            tky_data_sorted.loc[i, 'duplicate'] = True
    if (i > 0) & (tky_data_sorted.loc[i, "userId"] != tky_data_sorted.loc[i-1, "userId"]):
        tky_data_sorted.loc[i-1, 'time_diff'] = pd.to_timedelta(0, unit='h')

In [13]:
# 중복 제거
condition = tky_data_sorted[tky_data_sorted['duplicate'] == True].index
tky_data_sorted.drop(condition, inplace= True)

In [14]:
# POI 인증별 시간차 다시 update
tky_data_sorted['time_diff'] = (tky_data_sorted.utcTimestampOffset.shift(-1) - tky_data_sorted.utcTimestampOffset)
tky_data_sorted['timedelta_min'] = tky_data_sorted['time_diff']/timedelta(minutes=1)
tky_data_sorted.reset_index(inplace= True, drop= True)

In [15]:
poi_list = pd.read_csv('C:/Users/piai/Desktop/code_file/category_list_for_categorize_for_lstm_update.txt', header = None)
poi_list.columns = ['venueCategory','Category']
poi_list.head()

Unnamed: 0,venueCategory,Category
0,Post Office,Other
1,Jazz Club,Liquid
2,Gym,Exercise
3,Indian Restaurant,Meal
4,Sandwich Place,Meal


## 인증 텀을 120min까지로 늘리고, Category에서 Transportation, Work, School를 제외하자. 

In [16]:
tky_data_sorted = pd.merge(tky_data_sorted,poi_list, how='left', left_on='venueCategory', right_on='venueCategory')

In [17]:
tmp_df = tky_data_sorted[(tky_data_sorted['timedelta_min'] < 120) & (tky_data_sorted['Category'] != 'Transportation') & (tky_data_sorted['Category'] != 'Work') & (tky_data_sorted['Category'] != 'School')]
tmp_df2 = tky_data_sorted.loc[[(i + 1) for i in list(tmp_df.index)]]
tmp_df3 = pd.concat([tmp_df, tmp_df2])

tmp_df3 = tmp_df3.sort_values(['userId', 'utcTimestampOffset'])
tmp_df3.drop_duplicates(inplace=True)

# tmp_df3를 가지고 세션 데이터를 생성해보자.
tmp_df3['SessionId'] = None

In [18]:
tmp_df3['Category'].value_counts()

Shopping          40122
Meal              29506
Transportation    24356
Other             12837
Entertain          9201
Leisure            8150
Liquid             4754
Cafe               3690
Work               3065
Snack              1865
Sightseeing        1773
School             1417
Retail             1146
Hobby              1086
Exercise           1058
Residence          1032
Resort              709
Event               560
Name: Category, dtype: int64

In [19]:
tmp_df3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150248 entries, 6 to 442223
Data columns (total 18 columns):
 #   Column              Non-Null Count   Dtype          
---  ------              --------------   -----          
 0   userId              150248 non-null  int64          
 1   venueId             150248 non-null  object         
 2   venueCategoryId     150248 non-null  object         
 3   venueCategory       150248 non-null  object         
 4   latitude            150248 non-null  float64        
 5   longitude           150248 non-null  float64        
 6   timezoneOffset      150248 non-null  int64          
 7   utcTimestamp        150248 non-null  datetime64[ns] 
 8   utcTimestampOffset  150248 non-null  datetime64[ns] 
 9   hour                150248 non-null  int64          
 10  nightlife           150248 non-null  object         
 11  dayofweek           150248 non-null  int64          
 12  weekday             150248 non-null  object         
 13  time_diff     

In [20]:
tmp_df3.reset_index(inplace= True, drop= True)

In [21]:
index = 0
SessionId = 0
last = tky_data_sorted.loc[len(tky_data_sorted)-1, 'userId']
for i in range(len(tmp_df3)):
    tmp_df3.loc[i, 'SessionId'] = SessionId
    if tmp_df3.loc[i, 'timedelta_min'] >= 120:
        SessionId += 1

In [22]:
tmp_df3

Unnamed: 0,userId,venueId,venueCategoryId,venueCategory,latitude,longitude,timezoneOffset,utcTimestamp,utcTimestampOffset,hour,nightlife,dayofweek,weekday,time_diff,duplicate,timedelta_min,Category,SessionId
0,1,4b73cca1f964a5203fbc2de3,4bf58dd8d48988d111941735,Japanese Restaurant,35.700941,139.770555,540,2012-04-08 07:46:06,2012-04-08 16:46:06,16,False,6,False,0 days 01:11:56,False,71.933333,Meal,0
1,1,4b9dd2aaf964a52020bf36e3,4bf58dd8d48988d122951735,Electronics Store,35.701281,139.771293,540,2012-04-08 08:58:02,2012-04-08 17:58:02,17,False,6,False,0 days 01:07:53,False,67.883333,Shopping,0
2,1,4dd86e71e4cd37c893c5c0b2,4bf58dd8d48988d16d941735,Café,35.700932,139.770652,540,2012-04-08 10:05:55,2012-04-08 19:05:55,19,True,6,False,0 days 00:39:13,False,39.216667,,0
3,1,4b593c7ef964a520828228e3,4bf58dd8d48988d16e941735,Fast Food Restaurant,35.699262,139.773906,540,2012-04-08 10:45:08,2012-04-08 19:45:08,19,True,6,False,0 days 01:32:52,False,92.866667,Meal,0
4,1,4b19f917f964a520abe623e3,4bf58dd8d48988d129951735,Train Station,35.698596,139.773018,540,2012-04-08 12:18:00,2012-04-08 21:18:00,21,True,6,False,0 days 00:06:53,False,6.883333,Transportation,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150243,2293,4d398db4cc48224be878344f,4bf58dd8d48988d116941735,Bar,35.642875,139.669431,540,2012-07-18 13:29:20,2012-07-18 22:29:20,22,True,2,True,10 days 22:10:09,False,15730.150000,Liquid,45020
150244,2293,4c2e057d7d85a593408a53f3,4bf58dd8d48988d11b941735,Bar,35.640912,139.669393,540,2012-07-29 16:12:14,2012-07-30 01:12:14,1,True,0,True,0 days 01:38:42,False,98.700000,Liquid,45021
150245,2293,4c2e057d7d85a593408a53f3,4bf58dd8d48988d11b941735,Bar,35.640912,139.669393,540,2012-07-29 17:50:56,2012-07-30 02:50:56,2,True,0,True,0 days 13:02:20,False,782.333333,Liquid,45021
150246,2293,4c30356316adc928cbebbe9c,4bf58dd8d48988d103951735,Clothing Store,35.662496,139.698742,540,2012-10-25 11:12:48,2012-10-25 20:12:48,20,True,3,True,0 days 00:37:58,False,37.966667,Shopping,45022


In [23]:
import datetime as dt
from pathlib import Path
import os

import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [24]:
tmp_df3.loc[tmp_df3['venueCategory'].str.contains('Caf'), 'Category'] = 'Cafe'
tmp_df3.loc[tmp_df3['venueCategory'].str.contains('Gas Station'), 'Category'] = 'Other'
tmp_df3[tmp_df3['Category'].isnull()]

Unnamed: 0,userId,venueId,venueCategoryId,venueCategory,latitude,longitude,timezoneOffset,utcTimestamp,utcTimestampOffset,hour,nightlife,dayofweek,weekday,time_diff,duplicate,timedelta_min,Category,SessionId


In [25]:
tmp_df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150248 entries, 0 to 150247
Data columns (total 18 columns):
 #   Column              Non-Null Count   Dtype          
---  ------              --------------   -----          
 0   userId              150248 non-null  int64          
 1   venueId             150248 non-null  object         
 2   venueCategoryId     150248 non-null  object         
 3   venueCategory       150248 non-null  object         
 4   latitude            150248 non-null  float64        
 5   longitude           150248 non-null  float64        
 6   timezoneOffset      150248 non-null  int64          
 7   utcTimestamp        150248 non-null  datetime64[ns] 
 8   utcTimestampOffset  150248 non-null  datetime64[ns] 
 9   hour                150248 non-null  int64          
 10  nightlife           150248 non-null  object         
 11  dayofweek           150248 non-null  int64          
 12  weekday             150248 non-null  object         
 13  time_diff     

In [26]:
id2idx = {item_id : index for index, item_id in enumerate(tmp_df3['Category'].unique())}

In [27]:
def indexing(df, id2idx):
    df['item_idx'] = df['ItemId'].map(lambda x: id2idx.get(x, -1))
    # id2idx에 없는 아이템은 모르는 값(-1) 처리.
    return df

In [28]:
# short_session을 제거한 다음 unpopular item을 제거하면 다시 길이가 1인 session이 생길 수 있다.
# 이를 위해 반복문을 통해 지속적으로 제거한다.
def cleanse_recursive(data: pd.DataFrame, shortest, least_click) -> pd.DataFrame:
    while True:
        before_len = len(data)
        data = cleanse_short_session(data, shortest)
        data = cleanse_unpopular_item(data, least_click)
        after_len = len(data)
        if before_len == after_len:
            break
    return data


def cleanse_short_session(data: pd.DataFrame, shortest):
    session_len = data.groupby('SessionId').size()
    session_use = session_len[session_len >= shortest].index
    data = data[data['SessionId'].isin(session_use)]
    return data


def cleanse_unpopular_item(data: pd.DataFrame, least_click):
    item_popular = data.groupby('Category').size()
    item_use = item_popular[item_popular >= least_click].index
    data = data[data['Category'].isin(item_use)]
    return data

In [29]:
data = cleanse_recursive(tmp_df3, shortest=3, least_click=1)
data.head(10)

Unnamed: 0,userId,venueId,venueCategoryId,venueCategory,latitude,longitude,timezoneOffset,utcTimestamp,utcTimestampOffset,hour,nightlife,dayofweek,weekday,time_diff,duplicate,timedelta_min,Category,SessionId
0,1,4b73cca1f964a5203fbc2de3,4bf58dd8d48988d111941735,Japanese Restaurant,35.700941,139.770555,540,2012-04-08 07:46:06,2012-04-08 16:46:06,16,False,6,False,0 days 01:11:56,False,71.933333,Meal,0
1,1,4b9dd2aaf964a52020bf36e3,4bf58dd8d48988d122951735,Electronics Store,35.701281,139.771293,540,2012-04-08 08:58:02,2012-04-08 17:58:02,17,False,6,False,0 days 01:07:53,False,67.883333,Shopping,0
2,1,4dd86e71e4cd37c893c5c0b2,4bf58dd8d48988d16d941735,Café,35.700932,139.770652,540,2012-04-08 10:05:55,2012-04-08 19:05:55,19,True,6,False,0 days 00:39:13,False,39.216667,Cafe,0
3,1,4b593c7ef964a520828228e3,4bf58dd8d48988d16e941735,Fast Food Restaurant,35.699262,139.773906,540,2012-04-08 10:45:08,2012-04-08 19:45:08,19,True,6,False,0 days 01:32:52,False,92.866667,Meal,0
4,1,4b19f917f964a520abe623e3,4bf58dd8d48988d129951735,Train Station,35.698596,139.773018,540,2012-04-08 12:18:00,2012-04-08 21:18:00,21,True,6,False,0 days 00:06:53,False,6.883333,Transportation,0
5,1,4f6d5c5be4b04cb293be0d42,4d954b0ea243a5684a65b473,Convenience Store,35.633796,139.791518,540,2012-05-05 21:23:08,2012-05-06 06:23:08,6,False,6,False,0 days 00:13:13,False,13.216667,Shopping,0
6,1,4b6a4f2df964a520b0d12be3,4bf58dd8d48988d1ff931735,Convention Center,35.629867,139.794266,540,2012-05-05 21:36:21,2012-05-06 06:36:21,6,False,6,False,12 days 23:49:22,False,18709.366667,Work,0
7,1,4b569776f964a5201d1628e3,4bf58dd8d48988d121951735,Paper / Office Supplies Store,35.697973,139.78663,540,2012-05-19 05:28:49,2012-05-19 14:28:49,14,False,5,False,0 days 00:18:53,False,18.883333,Other,1
8,1,4b565cbff964a520380d28e3,4bf58dd8d48988d129951735,Train Station,35.697466,139.785976,540,2012-05-19 05:47:42,2012-05-19 14:47:42,14,False,5,False,0 days 00:23:42,False,23.7,Transportation,1
9,1,4b5982cbf964a520a58a28e3,4bf58dd8d48988d171941735,Event Space,35.699972,139.77081,540,2012-05-19 06:11:24,2012-05-19 15:11:24,15,False,5,False,0 days 00:03:13,False,3.216667,Entertain,1


In [30]:
data

Unnamed: 0,userId,venueId,venueCategoryId,venueCategory,latitude,longitude,timezoneOffset,utcTimestamp,utcTimestampOffset,hour,nightlife,dayofweek,weekday,time_diff,duplicate,timedelta_min,Category,SessionId
0,1,4b73cca1f964a5203fbc2de3,4bf58dd8d48988d111941735,Japanese Restaurant,35.700941,139.770555,540,2012-04-08 07:46:06,2012-04-08 16:46:06,16,False,6,False,0 days 01:11:56,False,71.933333,Meal,0
1,1,4b9dd2aaf964a52020bf36e3,4bf58dd8d48988d122951735,Electronics Store,35.701281,139.771293,540,2012-04-08 08:58:02,2012-04-08 17:58:02,17,False,6,False,0 days 01:07:53,False,67.883333,Shopping,0
2,1,4dd86e71e4cd37c893c5c0b2,4bf58dd8d48988d16d941735,Café,35.700932,139.770652,540,2012-04-08 10:05:55,2012-04-08 19:05:55,19,True,6,False,0 days 00:39:13,False,39.216667,Cafe,0
3,1,4b593c7ef964a520828228e3,4bf58dd8d48988d16e941735,Fast Food Restaurant,35.699262,139.773906,540,2012-04-08 10:45:08,2012-04-08 19:45:08,19,True,6,False,0 days 01:32:52,False,92.866667,Meal,0
4,1,4b19f917f964a520abe623e3,4bf58dd8d48988d129951735,Train Station,35.698596,139.773018,540,2012-04-08 12:18:00,2012-04-08 21:18:00,21,True,6,False,0 days 00:06:53,False,6.883333,Transportation,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150231,2292,4b7bd270f964a520786f2fe3,4bf58dd8d48988d172941735,Post Office,35.681496,139.634128,540,2013-01-26 04:40:43,2013-01-26 13:40:43,13,False,5,False,0 days 00:45:40,False,45.666667,Other,45015
150232,2292,4b5aacc7f964a52058d028e3,4bf58dd8d48988d118951735,Food & Drink Shop,35.659990,139.642751,540,2013-01-26 05:26:23,2013-01-26 14:26:23,14,False,5,False,0 days 06:14:24,False,374.400000,Meal,45015
150235,2292,4dabf77943a11281971b3ea9,4bf58dd8d48988d1f8941735,Furniture / Home Store,35.670478,139.491913,540,2013-02-11 03:08:28,2013-02-11 12:08:28,12,False,0,True,0 days 00:47:37,False,47.616667,Shopping,45017
150236,2292,4b933252f964a520913a34e3,4bf58dd8d48988d1f8941735,Furniture / Home Store,35.670607,139.501058,540,2013-02-11 03:56:05,2013-02-11 12:56:05,12,False,0,True,0 days 01:32:23,False,92.383333,Shopping,45017


In [31]:
id2idx = {item_id : index for index, item_id in enumerate(data['Category'].unique())}

In [32]:
def indexing(df, id2idx):
    df['item_idx'] = df['Category'].map(lambda x: id2idx.get(x, -1))
    # id2idx에 없는 아이템은 모르는 값(-1) 처리.
    return df

In [33]:
total_dataset = indexing(data, id2idx)
total_dataset

Unnamed: 0,userId,venueId,venueCategoryId,venueCategory,latitude,longitude,timezoneOffset,utcTimestamp,utcTimestampOffset,hour,nightlife,dayofweek,weekday,time_diff,duplicate,timedelta_min,Category,SessionId,item_idx
0,1,4b73cca1f964a5203fbc2de3,4bf58dd8d48988d111941735,Japanese Restaurant,35.700941,139.770555,540,2012-04-08 07:46:06,2012-04-08 16:46:06,16,False,6,False,0 days 01:11:56,False,71.933333,Meal,0,0
1,1,4b9dd2aaf964a52020bf36e3,4bf58dd8d48988d122951735,Electronics Store,35.701281,139.771293,540,2012-04-08 08:58:02,2012-04-08 17:58:02,17,False,6,False,0 days 01:07:53,False,67.883333,Shopping,0,1
2,1,4dd86e71e4cd37c893c5c0b2,4bf58dd8d48988d16d941735,Café,35.700932,139.770652,540,2012-04-08 10:05:55,2012-04-08 19:05:55,19,True,6,False,0 days 00:39:13,False,39.216667,Cafe,0,2
3,1,4b593c7ef964a520828228e3,4bf58dd8d48988d16e941735,Fast Food Restaurant,35.699262,139.773906,540,2012-04-08 10:45:08,2012-04-08 19:45:08,19,True,6,False,0 days 01:32:52,False,92.866667,Meal,0,0
4,1,4b19f917f964a520abe623e3,4bf58dd8d48988d129951735,Train Station,35.698596,139.773018,540,2012-04-08 12:18:00,2012-04-08 21:18:00,21,True,6,False,0 days 00:06:53,False,6.883333,Transportation,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150231,2292,4b7bd270f964a520786f2fe3,4bf58dd8d48988d172941735,Post Office,35.681496,139.634128,540,2013-01-26 04:40:43,2013-01-26 13:40:43,13,False,5,False,0 days 00:45:40,False,45.666667,Other,45015,5
150232,2292,4b5aacc7f964a52058d028e3,4bf58dd8d48988d118951735,Food & Drink Shop,35.659990,139.642751,540,2013-01-26 05:26:23,2013-01-26 14:26:23,14,False,5,False,0 days 06:14:24,False,374.400000,Meal,45015,0
150235,2292,4dabf77943a11281971b3ea9,4bf58dd8d48988d1f8941735,Furniture / Home Store,35.670478,139.491913,540,2013-02-11 03:08:28,2013-02-11 12:08:28,12,False,0,True,0 days 00:47:37,False,47.616667,Shopping,45017,1
150236,2292,4b933252f964a520913a34e3,4bf58dd8d48988d1f8941735,Furniture / Home Store,35.670607,139.501058,540,2013-02-11 03:56:05,2013-02-11 12:56:05,12,False,0,True,0 days 01:32:23,False,92.383333,Shopping,45017,1


In [34]:
tmp_total = total_dataset[:]

In [35]:
tmp_total

Unnamed: 0,userId,venueId,venueCategoryId,venueCategory,latitude,longitude,timezoneOffset,utcTimestamp,utcTimestampOffset,hour,nightlife,dayofweek,weekday,time_diff,duplicate,timedelta_min,Category,SessionId,item_idx
0,1,4b73cca1f964a5203fbc2de3,4bf58dd8d48988d111941735,Japanese Restaurant,35.700941,139.770555,540,2012-04-08 07:46:06,2012-04-08 16:46:06,16,False,6,False,0 days 01:11:56,False,71.933333,Meal,0,0
1,1,4b9dd2aaf964a52020bf36e3,4bf58dd8d48988d122951735,Electronics Store,35.701281,139.771293,540,2012-04-08 08:58:02,2012-04-08 17:58:02,17,False,6,False,0 days 01:07:53,False,67.883333,Shopping,0,1
2,1,4dd86e71e4cd37c893c5c0b2,4bf58dd8d48988d16d941735,Café,35.700932,139.770652,540,2012-04-08 10:05:55,2012-04-08 19:05:55,19,True,6,False,0 days 00:39:13,False,39.216667,Cafe,0,2
3,1,4b593c7ef964a520828228e3,4bf58dd8d48988d16e941735,Fast Food Restaurant,35.699262,139.773906,540,2012-04-08 10:45:08,2012-04-08 19:45:08,19,True,6,False,0 days 01:32:52,False,92.866667,Meal,0,0
4,1,4b19f917f964a520abe623e3,4bf58dd8d48988d129951735,Train Station,35.698596,139.773018,540,2012-04-08 12:18:00,2012-04-08 21:18:00,21,True,6,False,0 days 00:06:53,False,6.883333,Transportation,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150231,2292,4b7bd270f964a520786f2fe3,4bf58dd8d48988d172941735,Post Office,35.681496,139.634128,540,2013-01-26 04:40:43,2013-01-26 13:40:43,13,False,5,False,0 days 00:45:40,False,45.666667,Other,45015,5
150232,2292,4b5aacc7f964a52058d028e3,4bf58dd8d48988d118951735,Food & Drink Shop,35.659990,139.642751,540,2013-01-26 05:26:23,2013-01-26 14:26:23,14,False,5,False,0 days 06:14:24,False,374.400000,Meal,45015,0
150235,2292,4dabf77943a11281971b3ea9,4bf58dd8d48988d1f8941735,Furniture / Home Store,35.670478,139.491913,540,2013-02-11 03:08:28,2013-02-11 12:08:28,12,False,0,True,0 days 00:47:37,False,47.616667,Shopping,45017,1
150236,2292,4b933252f964a520913a34e3,4bf58dd8d48988d1f8941735,Furniture / Home Store,35.670607,139.501058,540,2013-02-11 03:56:05,2013-02-11 12:56:05,12,False,0,True,0 days 01:32:23,False,92.383333,Shopping,45017,1


In [36]:
# 데이터가 주어지면 세션이 시작되는 인덱스를 담는 값과 세션을 새로 인덱싱한 값을 갖는 클래스를 만든다.
class SessionDataset:
    
    def __init__(self, data):
        self.df = data
        self.click_offsets = self.get_click_offsets()     # 각 세션이 시작된 인덱스를 가진 변수
        self.session_idx = np.arange(self.df['SessionId'].nunique())  # indexing to SessionId
    
    def get_click_offsets(self):
        """
        Return the indexes of the first click of each session IDs,
        """
        offsets = np.zeros(self.df['SessionId'].nunique() + 1, dtype=np.int32)
        offsets[1:] = self.df.groupby('SessionId').size().cumsum()
        return offsets

In [37]:
tmp_total = SessionDataset(tmp_total).df

In [38]:
tmp_total

Unnamed: 0,userId,venueId,venueCategoryId,venueCategory,latitude,longitude,timezoneOffset,utcTimestamp,utcTimestampOffset,hour,nightlife,dayofweek,weekday,time_diff,duplicate,timedelta_min,Category,SessionId,item_idx
0,1,4b73cca1f964a5203fbc2de3,4bf58dd8d48988d111941735,Japanese Restaurant,35.700941,139.770555,540,2012-04-08 07:46:06,2012-04-08 16:46:06,16,False,6,False,0 days 01:11:56,False,71.933333,Meal,0,0
1,1,4b9dd2aaf964a52020bf36e3,4bf58dd8d48988d122951735,Electronics Store,35.701281,139.771293,540,2012-04-08 08:58:02,2012-04-08 17:58:02,17,False,6,False,0 days 01:07:53,False,67.883333,Shopping,0,1
2,1,4dd86e71e4cd37c893c5c0b2,4bf58dd8d48988d16d941735,Café,35.700932,139.770652,540,2012-04-08 10:05:55,2012-04-08 19:05:55,19,True,6,False,0 days 00:39:13,False,39.216667,Cafe,0,2
3,1,4b593c7ef964a520828228e3,4bf58dd8d48988d16e941735,Fast Food Restaurant,35.699262,139.773906,540,2012-04-08 10:45:08,2012-04-08 19:45:08,19,True,6,False,0 days 01:32:52,False,92.866667,Meal,0,0
4,1,4b19f917f964a520abe623e3,4bf58dd8d48988d129951735,Train Station,35.698596,139.773018,540,2012-04-08 12:18:00,2012-04-08 21:18:00,21,True,6,False,0 days 00:06:53,False,6.883333,Transportation,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150231,2292,4b7bd270f964a520786f2fe3,4bf58dd8d48988d172941735,Post Office,35.681496,139.634128,540,2013-01-26 04:40:43,2013-01-26 13:40:43,13,False,5,False,0 days 00:45:40,False,45.666667,Other,45015,5
150232,2292,4b5aacc7f964a52058d028e3,4bf58dd8d48988d118951735,Food & Drink Shop,35.659990,139.642751,540,2013-01-26 05:26:23,2013-01-26 14:26:23,14,False,5,False,0 days 06:14:24,False,374.400000,Meal,45015,0
150235,2292,4dabf77943a11281971b3ea9,4bf58dd8d48988d1f8941735,Furniture / Home Store,35.670478,139.491913,540,2013-02-11 03:08:28,2013-02-11 12:08:28,12,False,0,True,0 days 00:47:37,False,47.616667,Shopping,45017,1
150236,2292,4b933252f964a520913a34e3,4bf58dd8d48988d1f8941735,Furniture / Home Store,35.670607,139.501058,540,2013-02-11 03:56:05,2013-02-11 12:56:05,12,False,0,True,0 days 01:32:23,False,92.383333,Shopping,45017,1


In [39]:
tmp_total

Unnamed: 0,userId,venueId,venueCategoryId,venueCategory,latitude,longitude,timezoneOffset,utcTimestamp,utcTimestampOffset,hour,nightlife,dayofweek,weekday,time_diff,duplicate,timedelta_min,Category,SessionId,item_idx
0,1,4b73cca1f964a5203fbc2de3,4bf58dd8d48988d111941735,Japanese Restaurant,35.700941,139.770555,540,2012-04-08 07:46:06,2012-04-08 16:46:06,16,False,6,False,0 days 01:11:56,False,71.933333,Meal,0,0
1,1,4b9dd2aaf964a52020bf36e3,4bf58dd8d48988d122951735,Electronics Store,35.701281,139.771293,540,2012-04-08 08:58:02,2012-04-08 17:58:02,17,False,6,False,0 days 01:07:53,False,67.883333,Shopping,0,1
2,1,4dd86e71e4cd37c893c5c0b2,4bf58dd8d48988d16d941735,Café,35.700932,139.770652,540,2012-04-08 10:05:55,2012-04-08 19:05:55,19,True,6,False,0 days 00:39:13,False,39.216667,Cafe,0,2
3,1,4b593c7ef964a520828228e3,4bf58dd8d48988d16e941735,Fast Food Restaurant,35.699262,139.773906,540,2012-04-08 10:45:08,2012-04-08 19:45:08,19,True,6,False,0 days 01:32:52,False,92.866667,Meal,0,0
4,1,4b19f917f964a520abe623e3,4bf58dd8d48988d129951735,Train Station,35.698596,139.773018,540,2012-04-08 12:18:00,2012-04-08 21:18:00,21,True,6,False,0 days 00:06:53,False,6.883333,Transportation,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150231,2292,4b7bd270f964a520786f2fe3,4bf58dd8d48988d172941735,Post Office,35.681496,139.634128,540,2013-01-26 04:40:43,2013-01-26 13:40:43,13,False,5,False,0 days 00:45:40,False,45.666667,Other,45015,5
150232,2292,4b5aacc7f964a52058d028e3,4bf58dd8d48988d118951735,Food & Drink Shop,35.659990,139.642751,540,2013-01-26 05:26:23,2013-01-26 14:26:23,14,False,5,False,0 days 06:14:24,False,374.400000,Meal,45015,0
150235,2292,4dabf77943a11281971b3ea9,4bf58dd8d48988d1f8941735,Furniture / Home Store,35.670478,139.491913,540,2013-02-11 03:08:28,2013-02-11 12:08:28,12,False,0,True,0 days 00:47:37,False,47.616667,Shopping,45017,1
150236,2292,4b933252f964a520913a34e3,4bf58dd8d48988d1f8941735,Furniture / Home Store,35.670607,139.501058,540,2013-02-11 03:56:05,2013-02-11 12:56:05,12,False,0,True,0 days 01:32:23,False,92.383333,Shopping,45017,1


In [40]:
# arr = np.empty((0,3), int)

# for i in list(tmp_total['SessionId'].drop_duplicates()):
#     tmp = tmp_total[tmp_total['SessionId'] == i]
#     if len(tmp) >= 3:
#         # print('input')
#         input = tmp.iloc[:2]['item_idx']
#         # print(tmp.iloc[:2]['item_idx'])
#         # print('output')
#         output = tmp[2:3]['item_idx']
#         # print(tmp[2:3]['item_idx'])
#         in_and_out = tmp[:3]['item_idx']
#         array = np.array(in_and_out)
#         arr = np.append(arr, [np.array(in_and_out)], axis=0)

KeyboardInterrupt: 

In [41]:
arr = np.empty((0,3), int)

for i in list(tmp_total['SessionId'].drop_duplicates()):
    tmp = tmp_total[tmp_total['SessionId'] == i]
    if len(tmp) >= 3:
        for j in range(len(tmp)):
            try:
                in_and_out = tmp[j:j+3]['item_idx']
                arr = np.append(arr, [np.array(in_and_out)], axis=0)
            except:
                continue

In [42]:
len(arr)

60202

In [43]:
id2idx

{'Meal': 0,
 'Shopping': 1,
 'Cafe': 2,
 'Transportation': 3,
 'Work': 4,
 'Other': 5,
 'Entertain': 6,
 'Leisure': 7,
 'Exercise': 8,
 'Sightseeing': 9,
 'School': 10,
 'Retail': 11,
 'Snack': 12,
 'Liquid': 13,
 'Hobby': 14,
 'Event': 15,
 'Resort': 16,
 'Residence': 17}

In [47]:
pd.Series(arr[:,-1][int(len(arr)*0.7):]).drop_duplicates().sort_values()

0        0
6        1
8        2
38       3
12       4
5        5
1        6
11       7
2156     8
438      9
319     10
392     11
483     12
3       13
90      14
2       15
628     16
340     17
dtype: int64

In [48]:
print(arr[:,:-1])
print(arr[:,-1])
# input과 output으로 잘 쪼개졌다.

[[ 0  1]
 [ 1  2]
 [ 2  0]
 ...
 [12  0]
 [ 0  5]
 [ 1  1]]
[2 0 3 ... 5 0 1]


In [49]:
print('X shape : ', arr[:,:-1].shape)
print('Y shape : ', arr[:,-1].shape)

X shape :  (60202, 2)
Y shape :  (60202,)


In [50]:
# split
data_train = arr[:int(len(arr)*0.7)]
data_val = arr[int(len(arr)*0.7):int(len(arr)*0.9)]
data_test = arr[int(len(arr)*0.9):]

In [51]:
x_train = data_train[:,:-1]
y_train = data_train[:,-1]

x_val = data_val[:,:-1]
y_val = data_val[:,-1]

x_test = data_test[:,:-1]
y_test = data_test[:,-1]

In [52]:
x_train

array([[0, 1],
       [1, 2],
       [2, 0],
       ...,
       [4, 0],
       [0, 5],
       [5, 1]], dtype=int64)

In [53]:
import matplotlib.pyplot as pltfrom
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, GRU, Dropout
from keras.preprocessing import sequence
from keras.utils import np_utils

import numpy
import tensorflow as tf
import matplotlib.pyplot as plt

y_train = np_utils.to_categorical(y_train)
y_val = np_utils.to_categorical(y_val)
y_test = np_utils.to_categorical(y_test)

In [54]:
regression_GRU = Sequential()

regression_GRU.add(Embedding(1000,100))
regression_GRU.add(GRU(units=100, activation="relu", return_sequences=True, input_shape = (x_train.shape[1], 5)))
regression_GRU.add(Dropout(0.2))

regression_GRU.add(GRU(units=60, activation="relu", return_sequences=True))
regression_GRU.add(Dropout(0.3))

regression_GRU.add(GRU(units=80, activation="relu", return_sequences=True))
regression_GRU.add(Dropout(0.4))

regression_GRU.add(GRU(units=120, activation="relu"))
regression_GRU.add(Dropout(0.5))

regression_GRU.add(Dense(units = 18,activation='softmax'))

In [55]:
model = Sequential()
model.add(Embedding(1000,100))
model.add(GRU(100, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(18, activation='softmax'))

In [56]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy','top_k_categorical_accuracy'])

In [57]:
y_train.shape

(42141, 18)

In [58]:
history = model.fit(x_train, y_train, batch_size=100, epochs=20, validation_data=(x_val, y_val))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [63]:
regression_GRU.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy','top_k_categorical_accuracy'])

In [64]:
#history = regression_GRU.fit(x_train, y_train, batch_size=100, epochs=20, validation_data=(x_val, y_val))
GRU_history = regression_GRU.fit(x_train, y_train, batch_size=100, epochs=20, validation_data=(x_val, y_val))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [65]:
print("\n 정확도 : %.4f" % (model.evaluate(x_test, y_test)[1]))


 정확도 : 0.3121


In [66]:
print("\n 정확도 : %.4f" % (regression_GRU.evaluate(x_test, y_test)[1]))


 정확도 : 0.3124


In [67]:
y_pred = regression_GRU.predict(x_test)



In [68]:
# Top-5 accuracy일 경우,

(sum(tf.keras.metrics.top_k_categorical_accuracy(y_test, y_pred, k=5))
/len(tf.keras.metrics.top_k_categorical_accuracy(y_test, y_pred, k=5)))

<tf.Tensor: shape=(), dtype=float32, numpy=0.8156452>

In [69]:
# Top-3 accuracy일 경우,

(sum(tf.keras.metrics.top_k_categorical_accuracy(y_test, y_pred, k=3))
/len(tf.keras.metrics.top_k_categorical_accuracy(y_test, y_pred, k=3)))

<tf.Tensor: shape=(), dtype=float32, numpy=0.6651719>