In [1]:
import numpy as np
import pandas as pd
import folium
from folium import Choropleth, Circle, Marker
from folium.plugins import HeatMap, MarkerCluster


import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from datetime import datetime, timedelta
import math

In [2]:
# UTC_time 변경 함수

def date_convert(date_to_convert):
    pru = datetime.datetime.strptime(date_to_convert, '%a %b %d %H:%M:%S +0000 %Y')+ datetime.timedelta(hours=-240//60)
    return pru.strftime('%b %d %Y %H:%M:%S')

def tzo_convert(tzo_to_convert):
    return  timedelta(hours=tzo_to_convert/60)

In [3]:
nyc_filepath = 'C:/Users/piai/Desktop/code_file/dataset_WWW2019/dataset_TSMC2014_NYC.csv\dataset_TSMC2014_NYC.csv'
custom_date_parser = lambda x: datetime.strptime(x, "%a %b %d %H:%M:%S +0000 %Y")

nyc_data = pd.read_csv(nyc_filepath,parse_dates=['utcTimestamp'],date_parser=custom_date_parser)

nyc_data["utcTimestampOffset"] = nyc_data.utcTimestamp + nyc_data.timezoneOffset.apply(tzo_convert)

nyc_data["hour"] = nyc_data.utcTimestampOffset.dt.hour

# 야행성인지 체크하는 컬럼 생성. 저녁 6시부터 새벽 6시까지 활동한 POI는 야행성 Ture로 표시.
nyc_data["nightlife"] = nyc_data['hour'].apply(lambda x: 'True' if (18 <= x < 24) or (0 <= x < 6)else 'False')

# 요일이 무엇인지 0~6 순서로 설정. 월요일이 0, 일요일이 6.
nyc_data["dayofweek"] = nyc_data.utcTimestampOffset.dt.dayofweek

# 평일과 주말을 구분. 평일이면 True, 주말이면 False.
nyc_data["weekday"] = nyc_data['dayofweek'].apply(lambda x: 'True' if x <= 4 else 'False')

In [4]:
nyc_data

Unnamed: 0,userId,venueId,venueCategoryId,venueCategory,latitude,longitude,timezoneOffset,utcTimestamp,utcTimestampOffset,hour,nightlife,dayofweek,weekday
0,470,49bbd6c0f964a520f4531fe3,4bf58dd8d48988d127951735,Arts & Crafts Store,40.719810,-74.002581,-240,2012-04-03 18:00:09,2012-04-03 14:00:09,14,False,1,True
1,979,4a43c0aef964a520c6a61fe3,4bf58dd8d48988d1df941735,Bridge,40.606800,-74.044170,-240,2012-04-03 18:00:25,2012-04-03 14:00:25,14,False,1,True
2,69,4c5cc7b485a1e21e00d35711,4bf58dd8d48988d103941735,Home (private),40.716162,-73.883070,-240,2012-04-03 18:02:24,2012-04-03 14:02:24,14,False,1,True
3,395,4bc7086715a7ef3bef9878da,4bf58dd8d48988d104941735,Medical Center,40.745164,-73.982519,-240,2012-04-03 18:02:41,2012-04-03 14:02:41,14,False,1,True
4,87,4cf2c5321d18a143951b5cec,4bf58dd8d48988d1cb941735,Food Truck,40.740104,-73.989658,-240,2012-04-03 18:03:00,2012-04-03 14:03:00,14,False,1,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
227423,688,3fd66200f964a52000e71ee3,4bf58dd8d48988d1e7931735,Music Venue,40.733596,-74.003139,-300,2013-02-16 02:29:11,2013-02-15 21:29:11,21,True,4,True
227424,560,4bca32ff0687ef3be789dbcc,4bf58dd8d48988d16c941735,Burger Joint,40.745719,-73.993720,-300,2013-02-16 02:31:35,2013-02-15 21:31:35,21,True,4,True
227425,945,50a77716e4b0b5a9492f6f56,4bf58dd8d48988d103941735,Home (private),40.854365,-73.883070,-300,2013-02-16 02:33:16,2013-02-15 21:33:16,21,True,4,True
227426,671,4514efe0f964a520e7391fe3,4bf58dd8d48988d11d941735,Bar,40.735981,-74.029309,-300,2013-02-16 02:34:31,2013-02-15 21:34:31,21,True,4,True


In [4]:
# 유저 아이디 별로 정렬 후
# 유저 아이디 행 수만큼만 len 돌려서 time delta 하고,
# 이대로 컬럼 하나 추가하기.
nyc_data_sorted = nyc_data.sort_values(by=['userId', 'utcTimestampOffset'], ascending=[True, True])
nyc_data_sorted.reset_index(inplace= True, drop= True)

In [5]:
nyc_data_sorted['time_diff'] = (nyc_data_sorted.utcTimestampOffset.shift(-1) - nyc_data_sorted.utcTimestampOffset)

In [6]:
nyc_data_sorted['duplicate'] = False
nyc_data_sorted['timedelta_min'] = nyc_data_sorted['time_diff']/timedelta(minutes=1)

In [7]:
for i in range(1, len(nyc_data_sorted) -1):
    if i == len(nyc_data_sorted) - 1:
        continue
    if nyc_data_sorted.loc[i, "userId"] == nyc_data_sorted.loc[i-1, "userId"]:
        if ((nyc_data_sorted.loc[i, 'venueCategory'] == nyc_data_sorted.loc[i-1, 'venueCategory']) & (nyc_data_sorted.loc[i, 'timedelta_min'] < 60)) | (nyc_data_sorted.loc[i-1, 'timedelta_min'] < 3):
            # 같은 아이디 내에서 카테고리가 연달아 중복되는데, POI 인증 시간차가 60min 미만인 경우 or 다음 POI까지 인증이 3min 미만인 경우(POI는 다르게 중복 인증하는 실수를 한 경우)는 중복이라고 체크.
            nyc_data_sorted.loc[i, 'duplicate'] = True
    if (i > 0) & (nyc_data_sorted.loc[i, "userId"] != nyc_data_sorted.loc[i-1, "userId"]):
        nyc_data_sorted.loc[i-1, 'time_diff'] = pd.to_timedelta(0, unit='h')

In [8]:
# 중복 제거
condition = nyc_data_sorted[nyc_data_sorted['duplicate'] == True].index
nyc_data_sorted.drop(condition, inplace= True)

In [9]:
# POI 인증별 시간차 다시 update
nyc_data_sorted['time_diff'] = (nyc_data_sorted.utcTimestampOffset.shift(-1) - nyc_data_sorted.utcTimestampOffset)
nyc_data_sorted['timedelta_min'] = nyc_data_sorted['time_diff']/timedelta(minutes=1)
nyc_data_sorted.reset_index(inplace= True, drop= True)

In [10]:
poi_list = pd.read_csv('C:/Users/piai/Desktop/code_file/category_list_for_categorize_for_lstm_update.txt', header = None)
poi_list.columns = ['venueCategory','Category']
poi_list.head()

Unnamed: 0,venueCategory,Category
0,Post Office,Other
1,Jazz Club,Liquid
2,Gym,Exercise
3,Indian Restaurant,Meal
4,Sandwich Place,Meal


## 인증 텀을 120min까지로 늘리고, Category에서 Transportation, Work, School를 제외하자. 

In [11]:
nyc_data_sorted = pd.merge(nyc_data_sorted,poi_list, how='left', left_on='venueCategory', right_on='venueCategory')

In [12]:
tmp_df = nyc_data_sorted[(nyc_data_sorted['timedelta_min'] < 120) & (nyc_data_sorted['Category'] != 'Transportation') & (nyc_data_sorted['Category'] != 'Work') & (nyc_data_sorted['Category'] != 'School')]
tmp_df2 = nyc_data_sorted.loc[[(i + 1) for i in list(tmp_df.index)]]
tmp_df3 = pd.concat([tmp_df, tmp_df2])

tmp_df3 = tmp_df3.sort_values(['userId', 'utcTimestampOffset'])
tmp_df3.drop_duplicates(inplace=True)

# tmp_df3를 가지고 세션 데이터를 생성해보자.
tmp_df3['SessionId'] = None

In [13]:
tmp_df3['Category'].value_counts()

Meal              12777
Shopping           9102
Other              6905
Liquid             5592
Residence          5422
Entertain          3684
Leisure            3665
Work               3606
Transportation     2996
Cafe               2829
Snack              2335
Exercise           1982
Sightseeing        1749
Retail             1373
School             1106
Resort              930
Event               561
Hobby               556
Name: Category, dtype: int64

In [14]:
tmp_df3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 68314 entries, 2 to 192125
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype          
---  ------              --------------  -----          
 0   userId              68314 non-null  int64          
 1   venueId             68314 non-null  object         
 2   venueCategoryId     68314 non-null  object         
 3   venueCategory       68314 non-null  object         
 4   latitude            68314 non-null  float64        
 5   longitude           68314 non-null  float64        
 6   timezoneOffset      68314 non-null  int64          
 7   utcTimestamp        68314 non-null  datetime64[ns] 
 8   utcTimestampOffset  68314 non-null  datetime64[ns] 
 9   hour                68314 non-null  int64          
 10  nightlife           68314 non-null  object         
 11  dayofweek           68314 non-null  int64          
 12  weekday             68314 non-null  object         
 13  time_diff           68314 non-

In [15]:
tmp_df3.reset_index(inplace= True, drop= True)

In [16]:
index = 0
SessionId = 0
last = nyc_data_sorted.loc[len(nyc_data_sorted)-1, 'userId']
for i in range(len(tmp_df3)):
    tmp_df3.loc[i, 'SessionId'] = SessionId
    if tmp_df3.loc[i, 'timedelta_min'] >= 120:
        SessionId += 1

In [20]:
tmp_df3

Unnamed: 0,userId,venueId,venueCategoryId,venueCategory,latitude,longitude,timezoneOffset,utcTimestamp,utcTimestampOffset,hour,nightlife,dayofweek,weekday,time_diff,duplicate,timedelta_min,Category,SessionId
0,1,4db44994cda1c57c82583709,4bf58dd8d48988d1f1931735,General Entertainment,40.739398,-73.993210,-240,2012-04-08 18:20:29,2012-04-08 14:20:29,14,False,6,False,0 days 01:41:41,False,101.683333,Entertain,0
1,1,4a541923f964a52008b31fe3,4bf58dd8d48988d14e941735,American Restaurant,40.785677,-73.976498,-240,2012-04-08 20:02:10,2012-04-08 16:02:10,16,False,6,False,0 days 20:18:42,False,1218.700000,Meal,0
2,1,46ea2358f964a520cf4a1fe3,4bf58dd8d48988d11d941735,Bar,40.760667,-73.994948,-240,2012-04-14 01:11:20,2012-04-13 21:11:20,21,True,4,True,0 days 01:56:36,False,116.600000,Liquid,1
3,1,4d081fb700e6b1f7d4060cd7,4bf58dd8d48988d113941735,Korean Restaurant,40.764104,-73.986725,-240,2012-04-14 03:07:56,2012-04-13 23:07:56,23,True,4,True,0 days 01:37:17,False,97.283333,Meal,1
4,1,40fb0f00f964a520d90a1fe3,4bf58dd8d48988d11b941735,Bar,40.760645,-73.986065,-240,2012-04-14 04:45:13,2012-04-14 00:45:13,0,True,5,False,0 days 11:12:07,False,672.116667,Liquid,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68309,1083,49edce76f964a520f9671fe3,4bf58dd8d48988d124941735,Office,40.755764,-73.985812,-300,2013-01-04 17:58:37,2013-01-04 12:58:37,12,False,4,True,0 days 04:16:32,False,256.533333,Work,24958
68310,1083,4b102c19f964a5208f6a23e3,4bf58dd8d48988d103951735,Clothing Store,40.749347,-73.986782,-300,2013-01-08 00:32:35,2013-01-07 19:32:35,19,True,0,True,0 days 00:39:42,False,39.700000,Shopping,24959
68311,1083,4eda64ced5fb8f213a5d740e,4bf58dd8d48988d176941735,Gym / Fitness Center,40.746119,-73.993070,-300,2013-01-08 01:12:17,2013-01-07 20:12:17,20,True,0,True,0 days 14:14:11,False,854.183333,Exercise,24959
68312,1083,49f4dca6f964a520626b1fe3,4bf58dd8d48988d1c1941735,Mexican Restaurant,40.735174,-73.979597,-300,2013-02-02 04:19:34,2013-02-01 23:19:34,23,True,4,True,0 days 00:29:02,False,29.033333,Meal,24960


In [17]:
import datetime as dt
from pathlib import Path
import os

import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [18]:
tmp_df3.loc[tmp_df3['venueCategory'].str.contains('Caf'), 'Category'] = 'Cafe'
tmp_df3.loc[tmp_df3['venueCategory'].str.contains('Gas Station'), 'Category'] = 'Other'
tmp_df3[tmp_df3['Category'].isnull()]

Unnamed: 0,userId,venueId,venueCategoryId,venueCategory,latitude,longitude,timezoneOffset,utcTimestamp,utcTimestampOffset,hour,nightlife,dayofweek,weekday,time_diff,duplicate,timedelta_min,Category,SessionId


In [19]:
tmp_df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68314 entries, 0 to 68313
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype          
---  ------              --------------  -----          
 0   userId              68314 non-null  int64          
 1   venueId             68314 non-null  object         
 2   venueCategoryId     68314 non-null  object         
 3   venueCategory       68314 non-null  object         
 4   latitude            68314 non-null  float64        
 5   longitude           68314 non-null  float64        
 6   timezoneOffset      68314 non-null  int64          
 7   utcTimestamp        68314 non-null  datetime64[ns] 
 8   utcTimestampOffset  68314 non-null  datetime64[ns] 
 9   hour                68314 non-null  int64          
 10  nightlife           68314 non-null  object         
 11  dayofweek           68314 non-null  int64          
 12  weekday             68314 non-null  object         
 13  time_diff           68314 non-n

In [20]:
id2idx = {item_id : index for index, item_id in enumerate(tmp_df3['Category'].unique())}

In [21]:
def indexing(df, id2idx):
    df['item_idx'] = df['ItemId'].map(lambda x: id2idx.get(x, -1))
    # id2idx에 없는 아이템은 모르는 값(-1) 처리.
    return df

In [22]:
# short_session을 제거한 다음 unpopular item을 제거하면 다시 길이가 1인 session이 생길 수 있다.
# 이를 위해 반복문을 통해 지속적으로 제거한다.
def cleanse_recursive(data: pd.DataFrame, shortest, least_click) -> pd.DataFrame:
    while True:
        before_len = len(data)
        data = cleanse_short_session(data, shortest)
        data = cleanse_unpopular_item(data, least_click)
        after_len = len(data)
        if before_len == after_len:
            break
    return data


def cleanse_short_session(data: pd.DataFrame, shortest):
    session_len = data.groupby('SessionId').size()
    session_use = session_len[session_len >= shortest].index
    data = data[data['SessionId'].isin(session_use)]
    return data


def cleanse_unpopular_item(data: pd.DataFrame, least_click):
    item_popular = data.groupby('Category').size()
    item_use = item_popular[item_popular >= least_click].index
    data = data[data['Category'].isin(item_use)]
    return data

In [23]:
data = cleanse_recursive(tmp_df3, shortest=3, least_click=1)
data.head(10)

Unnamed: 0,userId,venueId,venueCategoryId,venueCategory,latitude,longitude,timezoneOffset,utcTimestamp,utcTimestampOffset,hour,nightlife,dayofweek,weekday,time_diff,duplicate,timedelta_min,Category,SessionId
2,1,46ea2358f964a520cf4a1fe3,4bf58dd8d48988d11d941735,Bar,40.760667,-73.994948,-240,2012-04-14 01:11:20,2012-04-13 21:11:20,21,True,4,True,0 days 01:56:36,False,116.6,Liquid,1
3,1,4d081fb700e6b1f7d4060cd7,4bf58dd8d48988d113941735,Korean Restaurant,40.764104,-73.986725,-240,2012-04-14 03:07:56,2012-04-13 23:07:56,23,True,4,True,0 days 01:37:17,False,97.283333,Meal,1
4,1,40fb0f00f964a520d90a1fe3,4bf58dd8d48988d11b941735,Bar,40.760645,-73.986065,-240,2012-04-14 04:45:13,2012-04-14 00:45:13,0,True,5,False,0 days 11:12:07,False,672.116667,Liquid,1
7,1,4b8afdb0f964a520038d32e3,4bf58dd8d48988d1d5941735,Hotel,40.756247,-73.972692,-240,2012-04-14 20:03:42,2012-04-14 16:03:42,16,False,5,False,0 days 01:20:26,False,80.433333,Resort,3
8,1,4afca7e0f964a520d02422e3,4bf58dd8d48988d1d1941735,Ramen / Noodle House,40.757197,-73.96878,-240,2012-04-14 21:24:08,2012-04-14 17:24:08,17,False,5,False,0 days 00:43:35,False,43.583333,Meal,3
9,1,4e742aabc65bb91db3cadb79,4bf58dd8d48988d116941735,Bar,40.757305,-73.96864,-240,2012-04-14 22:07:43,2012-04-14 18:07:43,18,True,5,False,0 days 01:07:24,False,67.4,Liquid,3
10,1,3fd66200f964a520fbe71ee3,4bf58dd8d48988d16c941735,Burger Joint,40.758819,-73.968657,-240,2012-04-14 23:15:07,2012-04-14 19:15:07,19,True,5,False,1 days 16:38:53,False,2438.883333,Meal,3
11,1,4e691e82b9930387355c629d,4bf58dd8d48988d14e941735,American Restaurant,40.725163,-73.99216,-240,2012-04-21 16:28:11,2012-04-21 12:28:11,12,False,5,False,0 days 01:54:10,False,114.166667,Meal,4
12,1,3fd66200f964a520f0e51ee3,4bf58dd8d48988d10d951735,Record Shop,40.728912,-73.9993,-240,2012-04-21 18:22:21,2012-04-21 14:22:21,14,False,5,False,0 days 00:26:37,False,26.616667,Hobby,4
13,1,49dc03d9f964a520445f1fe3,4bf58dd8d48988d116941735,Bar,40.729658,-73.998515,-240,2012-04-21 18:48:58,2012-04-21 14:48:58,14,False,5,False,0 days 00:29:51,False,29.85,Liquid,4


In [24]:
data

Unnamed: 0,userId,venueId,venueCategoryId,venueCategory,latitude,longitude,timezoneOffset,utcTimestamp,utcTimestampOffset,hour,nightlife,dayofweek,weekday,time_diff,duplicate,timedelta_min,Category,SessionId
2,1,46ea2358f964a520cf4a1fe3,4bf58dd8d48988d11d941735,Bar,40.760667,-73.994948,-240,2012-04-14 01:11:20,2012-04-13 21:11:20,21,True,4,True,0 days 01:56:36,False,116.600000,Liquid,1
3,1,4d081fb700e6b1f7d4060cd7,4bf58dd8d48988d113941735,Korean Restaurant,40.764104,-73.986725,-240,2012-04-14 03:07:56,2012-04-13 23:07:56,23,True,4,True,0 days 01:37:17,False,97.283333,Meal,1
4,1,40fb0f00f964a520d90a1fe3,4bf58dd8d48988d11b941735,Bar,40.760645,-73.986065,-240,2012-04-14 04:45:13,2012-04-14 00:45:13,0,True,5,False,0 days 11:12:07,False,672.116667,Liquid,1
7,1,4b8afdb0f964a520038d32e3,4bf58dd8d48988d1d5941735,Hotel,40.756247,-73.972692,-240,2012-04-14 20:03:42,2012-04-14 16:03:42,16,False,5,False,0 days 01:20:26,False,80.433333,Resort,3
8,1,4afca7e0f964a520d02422e3,4bf58dd8d48988d1d1941735,Ramen / Noodle House,40.757197,-73.968780,-240,2012-04-14 21:24:08,2012-04-14 17:24:08,17,False,5,False,0 days 00:43:35,False,43.583333,Meal,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68297,1083,460d4b66f964a52005451fe3,4bf58dd8d48988d118951735,Food & Drink Shop,40.744818,-73.995152,-240,2012-07-16 16:41:15,2012-07-16 12:41:15,12,False,0,True,0 days 01:57:51,False,117.850000,Meal,24953
68298,1083,4a53d9a7f964a520c7b21fe3,4bf58dd8d48988d124941735,Office,40.745518,-73.992351,-240,2012-07-16 18:39:06,2012-07-16 14:39:06,14,False,0,True,0 days 03:53:46,False,233.766667,Work,24953
68301,1083,4ade3c4df964a5202a7421e3,4bf58dd8d48988d10c951735,Cosmetics Shop,40.739084,-73.991384,-240,2012-09-09 20:01:48,2012-09-09 16:01:48,16,False,6,False,0 days 00:57:09,False,57.150000,Shopping,24955
68302,1083,4a7dcf5cf964a520b5ef1fe3,4bf58dd8d48988d1f8941735,Furniture / Home Store,40.740019,-73.994696,-240,2012-09-09 20:58:57,2012-09-09 16:58:57,16,False,6,False,0 days 01:11:30,False,71.500000,Shopping,24955


In [25]:
id2idx = {item_id : index for index, item_id in enumerate(data['Category'].unique())}

In [26]:
def indexing(df, id2idx):
    df['item_idx'] = df['Category'].map(lambda x: id2idx.get(x, -1))
    # id2idx에 없는 아이템은 모르는 값(-1) 처리.
    return df

In [27]:
total_dataset = indexing(data, id2idx)
total_dataset

Unnamed: 0,userId,venueId,venueCategoryId,venueCategory,latitude,longitude,timezoneOffset,utcTimestamp,utcTimestampOffset,hour,nightlife,dayofweek,weekday,time_diff,duplicate,timedelta_min,Category,SessionId,item_idx
2,1,46ea2358f964a520cf4a1fe3,4bf58dd8d48988d11d941735,Bar,40.760667,-73.994948,-240,2012-04-14 01:11:20,2012-04-13 21:11:20,21,True,4,True,0 days 01:56:36,False,116.600000,Liquid,1,0
3,1,4d081fb700e6b1f7d4060cd7,4bf58dd8d48988d113941735,Korean Restaurant,40.764104,-73.986725,-240,2012-04-14 03:07:56,2012-04-13 23:07:56,23,True,4,True,0 days 01:37:17,False,97.283333,Meal,1,1
4,1,40fb0f00f964a520d90a1fe3,4bf58dd8d48988d11b941735,Bar,40.760645,-73.986065,-240,2012-04-14 04:45:13,2012-04-14 00:45:13,0,True,5,False,0 days 11:12:07,False,672.116667,Liquid,1,0
7,1,4b8afdb0f964a520038d32e3,4bf58dd8d48988d1d5941735,Hotel,40.756247,-73.972692,-240,2012-04-14 20:03:42,2012-04-14 16:03:42,16,False,5,False,0 days 01:20:26,False,80.433333,Resort,3,2
8,1,4afca7e0f964a520d02422e3,4bf58dd8d48988d1d1941735,Ramen / Noodle House,40.757197,-73.968780,-240,2012-04-14 21:24:08,2012-04-14 17:24:08,17,False,5,False,0 days 00:43:35,False,43.583333,Meal,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68297,1083,460d4b66f964a52005451fe3,4bf58dd8d48988d118951735,Food & Drink Shop,40.744818,-73.995152,-240,2012-07-16 16:41:15,2012-07-16 12:41:15,12,False,0,True,0 days 01:57:51,False,117.850000,Meal,24953,1
68298,1083,4a53d9a7f964a520c7b21fe3,4bf58dd8d48988d124941735,Office,40.745518,-73.992351,-240,2012-07-16 18:39:06,2012-07-16 14:39:06,14,False,0,True,0 days 03:53:46,False,233.766667,Work,24953,10
68301,1083,4ade3c4df964a5202a7421e3,4bf58dd8d48988d10c951735,Cosmetics Shop,40.739084,-73.991384,-240,2012-09-09 20:01:48,2012-09-09 16:01:48,16,False,6,False,0 days 00:57:09,False,57.150000,Shopping,24955,4
68302,1083,4a7dcf5cf964a520b5ef1fe3,4bf58dd8d48988d1f8941735,Furniture / Home Store,40.740019,-73.994696,-240,2012-09-09 20:58:57,2012-09-09 16:58:57,16,False,6,False,0 days 01:11:30,False,71.500000,Shopping,24955,4


In [28]:
tmp_total = total_dataset[:]

In [29]:
tmp_total

Unnamed: 0,userId,venueId,venueCategoryId,venueCategory,latitude,longitude,timezoneOffset,utcTimestamp,utcTimestampOffset,hour,nightlife,dayofweek,weekday,time_diff,duplicate,timedelta_min,Category,SessionId,item_idx
2,1,46ea2358f964a520cf4a1fe3,4bf58dd8d48988d11d941735,Bar,40.760667,-73.994948,-240,2012-04-14 01:11:20,2012-04-13 21:11:20,21,True,4,True,0 days 01:56:36,False,116.600000,Liquid,1,0
3,1,4d081fb700e6b1f7d4060cd7,4bf58dd8d48988d113941735,Korean Restaurant,40.764104,-73.986725,-240,2012-04-14 03:07:56,2012-04-13 23:07:56,23,True,4,True,0 days 01:37:17,False,97.283333,Meal,1,1
4,1,40fb0f00f964a520d90a1fe3,4bf58dd8d48988d11b941735,Bar,40.760645,-73.986065,-240,2012-04-14 04:45:13,2012-04-14 00:45:13,0,True,5,False,0 days 11:12:07,False,672.116667,Liquid,1,0
7,1,4b8afdb0f964a520038d32e3,4bf58dd8d48988d1d5941735,Hotel,40.756247,-73.972692,-240,2012-04-14 20:03:42,2012-04-14 16:03:42,16,False,5,False,0 days 01:20:26,False,80.433333,Resort,3,2
8,1,4afca7e0f964a520d02422e3,4bf58dd8d48988d1d1941735,Ramen / Noodle House,40.757197,-73.968780,-240,2012-04-14 21:24:08,2012-04-14 17:24:08,17,False,5,False,0 days 00:43:35,False,43.583333,Meal,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68297,1083,460d4b66f964a52005451fe3,4bf58dd8d48988d118951735,Food & Drink Shop,40.744818,-73.995152,-240,2012-07-16 16:41:15,2012-07-16 12:41:15,12,False,0,True,0 days 01:57:51,False,117.850000,Meal,24953,1
68298,1083,4a53d9a7f964a520c7b21fe3,4bf58dd8d48988d124941735,Office,40.745518,-73.992351,-240,2012-07-16 18:39:06,2012-07-16 14:39:06,14,False,0,True,0 days 03:53:46,False,233.766667,Work,24953,10
68301,1083,4ade3c4df964a5202a7421e3,4bf58dd8d48988d10c951735,Cosmetics Shop,40.739084,-73.991384,-240,2012-09-09 20:01:48,2012-09-09 16:01:48,16,False,6,False,0 days 00:57:09,False,57.150000,Shopping,24955,4
68302,1083,4a7dcf5cf964a520b5ef1fe3,4bf58dd8d48988d1f8941735,Furniture / Home Store,40.740019,-73.994696,-240,2012-09-09 20:58:57,2012-09-09 16:58:57,16,False,6,False,0 days 01:11:30,False,71.500000,Shopping,24955,4


In [37]:
# 데이터가 주어지면 세션이 시작되는 인덱스를 담는 값과 세션을 새로 인덱싱한 값을 갖는 클래스를 만든다.
class SessionDataset:
    
    def __init__(self, data):
        self.df = data
        self.click_offsets = self.get_click_offsets()     # 각 세션이 시작된 인덱스를 가진 변수
        self.session_idx = np.arange(self.df['SessionId'].nunique())  # indexing to SessionId
    
    def get_click_offsets(self):
        """
        Return the indexes of the first click of each session IDs,
        """
        offsets = np.zeros(self.df['SessionId'].nunique() + 1, dtype=np.int32)
        offsets[1:] = self.df.groupby('SessionId').size().cumsum()
        return offsets

In [38]:
tmp_total = SessionDataset(tmp_total).df

In [39]:
tmp_total

Unnamed: 0,userId,venueId,venueCategoryId,venueCategory,latitude,longitude,timezoneOffset,utcTimestamp,utcTimestampOffset,hour,nightlife,dayofweek,weekday,time_diff,duplicate,timedelta_min,Category,SessionId,item_idx
2,1,46ea2358f964a520cf4a1fe3,4bf58dd8d48988d11d941735,Bar,40.760667,-73.994948,-240,2012-04-14 01:11:20,2012-04-13 21:11:20,21,True,4,True,0 days 01:56:36,False,116.600000,Liquid,1,0
3,1,4d081fb700e6b1f7d4060cd7,4bf58dd8d48988d113941735,Korean Restaurant,40.764104,-73.986725,-240,2012-04-14 03:07:56,2012-04-13 23:07:56,23,True,4,True,0 days 01:37:17,False,97.283333,Meal,1,1
4,1,40fb0f00f964a520d90a1fe3,4bf58dd8d48988d11b941735,Bar,40.760645,-73.986065,-240,2012-04-14 04:45:13,2012-04-14 00:45:13,0,True,5,False,0 days 11:12:07,False,672.116667,Liquid,1,0
7,1,4b8afdb0f964a520038d32e3,4bf58dd8d48988d1d5941735,Hotel,40.756247,-73.972692,-240,2012-04-14 20:03:42,2012-04-14 16:03:42,16,False,5,False,0 days 01:20:26,False,80.433333,Resort,3,2
8,1,4afca7e0f964a520d02422e3,4bf58dd8d48988d1d1941735,Ramen / Noodle House,40.757197,-73.968780,-240,2012-04-14 21:24:08,2012-04-14 17:24:08,17,False,5,False,0 days 00:43:35,False,43.583333,Meal,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68297,1083,460d4b66f964a52005451fe3,4bf58dd8d48988d118951735,Food & Drink Shop,40.744818,-73.995152,-240,2012-07-16 16:41:15,2012-07-16 12:41:15,12,False,0,True,0 days 01:57:51,False,117.850000,Meal,24953,1
68298,1083,4a53d9a7f964a520c7b21fe3,4bf58dd8d48988d124941735,Office,40.745518,-73.992351,-240,2012-07-16 18:39:06,2012-07-16 14:39:06,14,False,0,True,0 days 03:53:46,False,233.766667,Work,24953,10
68301,1083,4ade3c4df964a5202a7421e3,4bf58dd8d48988d10c951735,Cosmetics Shop,40.739084,-73.991384,-240,2012-09-09 20:01:48,2012-09-09 16:01:48,16,False,6,False,0 days 00:57:09,False,57.150000,Shopping,24955,4
68302,1083,4a7dcf5cf964a520b5ef1fe3,4bf58dd8d48988d1f8941735,Furniture / Home Store,40.740019,-73.994696,-240,2012-09-09 20:58:57,2012-09-09 16:58:57,16,False,6,False,0 days 01:11:30,False,71.500000,Shopping,24955,4


In [122]:
tmp_total

Unnamed: 0,userId,venueId,venueCategoryId,venueCategory,latitude,longitude,timezoneOffset,utcTimestamp,utcTimestampOffset,hour,nightlife,dayofweek,weekday,time_diff,duplicate,timedelta_min,Category,SessionId,item_idx
2,1,46ea2358f964a520cf4a1fe3,4bf58dd8d48988d11d941735,Bar,40.760667,-73.994948,-240,2012-04-14 01:11:20,2012-04-13 21:11:20,21,True,4,True,0 days 01:56:36,False,116.600000,Liquid,1,0
3,1,4d081fb700e6b1f7d4060cd7,4bf58dd8d48988d113941735,Korean Restaurant,40.764104,-73.986725,-240,2012-04-14 03:07:56,2012-04-13 23:07:56,23,True,4,True,0 days 01:37:17,False,97.283333,Meal,1,1
4,1,40fb0f00f964a520d90a1fe3,4bf58dd8d48988d11b941735,Bar,40.760645,-73.986065,-240,2012-04-14 04:45:13,2012-04-14 00:45:13,0,True,5,False,0 days 11:12:07,False,672.116667,Liquid,1,0
7,1,4b8afdb0f964a520038d32e3,4bf58dd8d48988d1d5941735,Hotel,40.756247,-73.972692,-240,2012-04-14 20:03:42,2012-04-14 16:03:42,16,False,5,False,0 days 01:20:26,False,80.433333,Resort,3,2
8,1,4afca7e0f964a520d02422e3,4bf58dd8d48988d1d1941735,Ramen / Noodle House,40.757197,-73.968780,-240,2012-04-14 21:24:08,2012-04-14 17:24:08,17,False,5,False,0 days 00:43:35,False,43.583333,Meal,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68297,1083,460d4b66f964a52005451fe3,4bf58dd8d48988d118951735,Food & Drink Shop,40.744818,-73.995152,-240,2012-07-16 16:41:15,2012-07-16 12:41:15,12,False,0,True,0 days 01:57:51,False,117.850000,Meal,24953,1
68298,1083,4a53d9a7f964a520c7b21fe3,4bf58dd8d48988d124941735,Office,40.745518,-73.992351,-240,2012-07-16 18:39:06,2012-07-16 14:39:06,14,False,0,True,0 days 03:53:46,False,233.766667,Work,24953,10
68301,1083,4ade3c4df964a5202a7421e3,4bf58dd8d48988d10c951735,Cosmetics Shop,40.739084,-73.991384,-240,2012-09-09 20:01:48,2012-09-09 16:01:48,16,False,6,False,0 days 00:57:09,False,57.150000,Shopping,24955,4
68302,1083,4a7dcf5cf964a520b5ef1fe3,4bf58dd8d48988d1f8941735,Furniture / Home Store,40.740019,-73.994696,-240,2012-09-09 20:58:57,2012-09-09 16:58:57,16,False,6,False,0 days 01:11:30,False,71.500000,Shopping,24955,4


In [40]:
# arr = np.empty((0,3), int)

# for i in list(tmp_total['SessionId'].drop_duplicates()):
#     tmp = tmp_total[tmp_total['SessionId'] == i]
#     if len(tmp) >= 3:
#         # print('input')
#         input = tmp.iloc[:2]['item_idx']
#         # print(tmp.iloc[:2]['item_idx'])
#         # print('output')
#         output = tmp[2:3]['item_idx']
#         # print(tmp[2:3]['item_idx'])
#         in_and_out = tmp[:3]['item_idx']
#         array = np.array(in_and_out)
#         arr = np.append(arr, [np.array(in_and_out)], axis=0)

KeyboardInterrupt: 

In [30]:
arr = np.empty((0,3), int)

for i in list(tmp_total['SessionId'].drop_duplicates()):
    tmp = tmp_total[tmp_total['SessionId'] == i]
    if len(tmp) >= 3:
        for j in range(len(tmp)):
            try:
                in_and_out = tmp[j:j+3]['item_idx']
                arr = np.append(arr, [np.array(in_and_out)], axis=0)
            except:
                continue

In [31]:
len(arr)

18392

In [32]:
id2idx

{'Liquid': 0,
 'Meal': 1,
 'Resort': 2,
 'Hobby': 3,
 'Shopping': 4,
 'Retail': 5,
 'Snack': 6,
 'Cafe': 7,
 'Transportation': 8,
 'Sightseeing': 9,
 'Work': 10,
 'Leisure': 11,
 'Residence': 12,
 'Entertain': 13,
 'Other': 14,
 'Event': 15,
 'Exercise': 16,
 'School': 17}

In [194]:
pd.Series(arr[:,-1][int(len(arr)*0.7):]).drop_duplicates().sort_values()

0       0
8       1
336     2
19      3
1       4
94      5
2       6
40      7
14      8
18      9
4      10
49     11
10     12
41     13
70     14
5      15
39     16
133    17
dtype: int64

In [33]:
print(arr[:,:-1])
print(arr[:,-1])
# input과 output으로 잘 쪼개졌다.

[[ 0  1]
 [ 2  1]
 [ 1  0]
 ...
 [16  8]
 [ 8  1]
 [ 4  4]]
[ 0  0  1 ...  1 10  4]


In [34]:
print('X shape : ', arr[:,:-1].shape)
print('Y shape : ', arr[:,-1].shape)

X shape :  (18392, 2)
Y shape :  (18392,)


In [35]:
# split
data_train = arr[:int(len(arr)*0.7)]
data_val = arr[int(len(arr)*0.7):int(len(arr)*0.9)]
data_test = arr[int(len(arr)*0.9):]

In [47]:
x_train = data_train[:,:-1]
y_train = data_train[:,-1]

x_val = data_val[:,:-1]
y_val = data_val[:,-1]

x_test = data_test[:,:-1]
y_test = data_test[:,-1]

In [46]:
x_train

array([[ 0,  1],
       [ 2,  1],
       [ 1,  0],
       ...,
       [10, 11],
       [ 0,  0],
       [ 4,  1]], dtype=int64)

In [48]:
import matplotlib.pyplot as pltfrom
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, GRU, Dropout
from keras.preprocessing import sequence
from keras.utils import np_utils

import numpy
import tensorflow as tf
import matplotlib.pyplot as plt

y_train = np_utils.to_categorical(y_train)
y_val = np_utils.to_categorical(y_val)
y_test = np_utils.to_categorical(y_test)

In [49]:
regression_GRU = Sequential()

regression_GRU.add(Embedding(1000,100))
regression_GRU.add(GRU(units=100, activation="relu", return_sequences=True, input_shape = (x_train.shape[1], 5)))
regression_GRU.add(Dropout(0.2))

regression_GRU.add(GRU(units=60, activation="relu", return_sequences=True))
regression_GRU.add(Dropout(0.3))

regression_GRU.add(GRU(units=80, activation="relu", return_sequences=True))
regression_GRU.add(Dropout(0.4))

regression_GRU.add(GRU(units=120, activation="relu"))
regression_GRU.add(Dropout(0.5))

regression_GRU.add(Dense(units = 18,activation='softmax'))

In [50]:
model = Sequential()
model.add(Embedding(1000,100))
model.add(GRU(100, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(18, activation='softmax'))

In [51]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy','top_k_categorical_accuracy'])

In [52]:
y_train.shape

(12874, 18)

In [53]:
history = model.fit(x_train, y_train, batch_size=100, epochs=20, validation_data=(x_val, y_val))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [54]:
regression_GRU.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [55]:
#history = regression_GRU.fit(x_train, y_train, batch_size=100, epochs=20, validation_data=(x_val, y_val))
GRU_history = regression_GRU.fit(x_train, y_train, batch_size=100, epochs=20, validation_data=(x_val, y_val))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [56]:
print("\n 정확도 : %.4f" % (regression_GRU.evaluate(x_test, y_test)[1]))


 정확도 : 0.1788


In [57]:
regression_GRU.compile('adam', 'categorical_crossentropy', ['accuracy', 'top_k_categorical_accuracy'])

In [58]:
GRU_history = regression_GRU.fit(x_train, y_train, epochs=10, batch_size=32, validation_data=(x_val, y_val))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [59]:
print("\n 정확도 : %.4f" % (regression_GRU.evaluate(x_test, y_test)[1]))


 정확도 : 0.1918


In [60]:
y_pred = regression_GRU.predict(x_test)



In [61]:
# Top-5 accuracy일 경우,

(sum(tf.keras.metrics.top_k_categorical_accuracy(y_test, y_pred, k=5))
/len(tf.keras.metrics.top_k_categorical_accuracy(y_test, y_pred, k=5)))

<tf.Tensor: shape=(), dtype=float32, numpy=0.61576086>

In [62]:
# Top-3 accuracy일 경우,

(sum(tf.keras.metrics.top_k_categorical_accuracy(y_test, y_pred, k=3))
/len(tf.keras.metrics.top_k_categorical_accuracy(y_test, y_pred, k=3)))

<tf.Tensor: shape=(), dtype=float32, numpy=0.4478261>