In [1]:
import numpy as np
import pandas as pd
import folium
from folium import Choropleth, Circle, Marker
from folium.plugins import HeatMap, MarkerCluster


import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from datetime import datetime, timedelta
import math

In [2]:
# UTC_time 변경 함수

def date_convert(date_to_convert):
    pru = datetime.datetime.strptime(date_to_convert, '%a %b %d %H:%M:%S +0000 %Y')+ datetime.timedelta(hours=-240//60)
    return pru.strftime('%b %d %Y %H:%M:%S')

def tzo_convert(tzo_to_convert):
    return  timedelta(hours=tzo_to_convert/60)

In [3]:
nyc_filepath = 'C:/Users/piai/Desktop/code_file/dataset_WWW2019/dataset_TSMC2014_NYC.csv\dataset_TSMC2014_NYC.csv'
custom_date_parser = lambda x: datetime.strptime(x, "%a %b %d %H:%M:%S +0000 %Y")

nyc_data = pd.read_csv(nyc_filepath,parse_dates=['utcTimestamp'],date_parser=custom_date_parser)

nyc_data["utcTimestampOffset"] = nyc_data.utcTimestamp + nyc_data.timezoneOffset.apply(tzo_convert)

nyc_data["hour"] = nyc_data.utcTimestampOffset.dt.hour

# 야행성인지 체크하는 컬럼 생성. 저녁 6시부터 새벽 6시까지 활동한 POI는 야행성 Ture로 표시.
nyc_data["nightlife"] = nyc_data['hour'].apply(lambda x: 'True' if (18 <= x < 24) or (0 <= x < 6)else 'False')

# 요일이 무엇인지 0~6 순서로 설정. 월요일이 0, 일요일이 6.
nyc_data["dayofweek"] = nyc_data.utcTimestampOffset.dt.dayofweek

# 평일과 주말을 구분. 평일이면 True, 주말이면 False.
nyc_data["weekday"] = nyc_data['dayofweek'].apply(lambda x: 'True' if x <= 4 else 'False')

In [4]:
# 유저 아이디 별로 정렬 후
# 유저 아이디 행 수만큼만 len 돌려서 time delta 하고,
# 이대로 컬럼 하나 추가하기.
nyc_data_sorted = nyc_data.sort_values(by=['userId', 'utcTimestampOffset'], ascending=[True, True])
nyc_data_sorted.reset_index(inplace= True, drop= True)

In [5]:
nyc_data_sorted['time_diff'] = (nyc_data_sorted.utcTimestampOffset.shift(-1) - nyc_data_sorted.utcTimestampOffset)

In [6]:
nyc_data_sorted['duplicate'] = False
nyc_data_sorted['timedelta_min'] = nyc_data_sorted['time_diff']/timedelta(minutes=1)

In [7]:
for i in range(1, len(nyc_data_sorted) -1):
    if i == len(nyc_data_sorted) - 1:
        continue
    if nyc_data_sorted.loc[i, "userId"] == nyc_data_sorted.loc[i-1, "userId"]:
        if ((nyc_data_sorted.loc[i, 'venueCategory'] == nyc_data_sorted.loc[i-1, 'venueCategory']) & (nyc_data_sorted.loc[i, 'timedelta_min'] < 60)) | (nyc_data_sorted.loc[i-1, 'timedelta_min'] < 3):
            # 같은 아이디 내에서 카테고리가 연달아 중복되는데, POI 인증 시간차가 60min 미만인 경우 or 다음 POI까지 인증이 3min 미만인 경우(POI는 다르게 중복 인증하는 실수를 한 경우)는 중복이라고 체크.
            nyc_data_sorted.loc[i, 'duplicate'] = True
    if (i > 0) & (nyc_data_sorted.loc[i, "userId"] != nyc_data_sorted.loc[i-1, "userId"]):
        nyc_data_sorted.loc[i-1, 'time_diff'] = pd.to_timedelta(0, unit='h')

In [8]:
# 중복 제거
condition = nyc_data_sorted[nyc_data_sorted['duplicate'] == True].index
nyc_data_sorted.drop(condition, inplace= True)

In [9]:
# POI 인증별 시간차 다시 update
nyc_data_sorted['time_diff'] = (nyc_data_sorted.utcTimestampOffset.shift(-1) - nyc_data_sorted.utcTimestampOffset)
nyc_data_sorted['timedelta_min'] = nyc_data_sorted['time_diff']/timedelta(minutes=1)
nyc_data_sorted.reset_index(inplace= True, drop= True)

In [10]:
nyc_data_sorted[nyc_data_sorted['timedelta_min'] < 0]

Unnamed: 0,userId,venueId,venueCategoryId,venueCategory,latitude,longitude,timezoneOffset,utcTimestamp,utcTimestampOffset,hour,nightlife,dayofweek,weekday,time_diff,duplicate,timedelta_min
104,1,4ace6c89f964a52078d020e3,4bf58dd8d48988d1ed931735,Airport,40.773839,-73.871220,-300,2012-12-15 00:13:02,2012-12-14 19:13:02,19,True,4,True,-255 days +14:30:44,False,-366329.266667
252,2,42ec1480f964a5209e261fe3,4bf58dd8d48988d1e2941735,Beach,40.567089,-73.865461,-300,2013-02-13 19:16:40,2013-02-13 14:16:40,14,False,2,True,-316 days +05:33:00,False,-454707.000000
369,3,50f0d0bfe4b07d6a11535185,4bf58dd8d48988d1c4941735,Restaurant,40.746390,-74.000923,-300,2013-02-06 00:49:03,2013-02-05 19:49:03,19,True,1,True,-309 days +22:00:51,False,-443639.150000
542,4,4b376908f964a520ef4025e3,4bf58dd8d48988d132941735,Church,40.816816,-73.941393,-300,2013-02-14 00:37:40,2013-02-13 19:37:40,19,True,2,True,-250 days +17:28:59,False,-358951.016667
634,5,4e0123ec62843b639cfab90d,4bf58dd8d48988d118951735,Food & Drink Shop,40.728917,-74.041350,-300,2012-11-20 00:44:50,2012-11-19 19:44:50,19,True,0,True,-8 days +09:32:28,False,-10947.533333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191233,1077,3fd66200f964a52036e31ee3,4bf58dd8d48988d16c941735,Burger Joint,40.728533,-73.987809,-300,2013-02-14 02:56:37,2013-02-13 21:56:37,21,True,2,True,-317 days +18:45:00,False,-455355.000000
191387,1078,50780a9be4b00277f24ad107,4bf58dd8d48988d103941735,Home (private),40.782438,-74.141613,-300,2013-02-14 02:45:52,2013-02-13 21:45:52,21,True,2,True,-317 days +23:31:07,False,-455068.883333
191517,1079,43a52546f964a520532c1fe3,4bf58dd8d48988d1ed931735,Airport,40.645089,-73.784523,-300,2012-12-17 18:26:18,2012-12-17 13:26:18,13,False,0,True,-256 days +21:14:34,False,-367365.433333
191743,1081,4e6f6d07d22de801639a8ba2,4bf58dd8d48988d1a1941735,College Academic Building,40.721924,-73.796819,-300,2013-02-14 01:16:34,2013-02-13 20:16:34,20,True,2,True,-317 days +23:29:16,False,-455070.733333


In [11]:
poi_list = pd.read_csv('C:/Users/piai/Desktop/code_file/category_list_for_categorize.txt', header = None)
poi_list.columns = ['venueCategory','Category']
poi_list.head()

Unnamed: 0,venueCategory,Category
0,Post Office,Other
1,Jazz Club,Liquid
2,Gym,Exercise
3,Indian Restaurant,Meal
4,Sandwich Place,Meal


In [24]:
tmp_df = nyc_data_sorted[nyc_data_sorted['timedelta_min'] < 60]
tmp_df2 = nyc_data_sorted.loc[[(i + 1) for i in list(tmp_df.index)]]
tmp_df3 = pd.concat([tmp_df, tmp_df2])

tmp_df3 = tmp_df3.sort_values(['userId', 'utcTimestampOffset'])
tmp_df3.drop_duplicates(inplace=True)

# tmp_df3를 가지고 세션 데이터를 생성해보자.
tmp_df3['SessionId'] = None

In [25]:
tmp_df3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 60057 entries, 16 to 192125
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype          
---  ------              --------------  -----          
 0   userId              60057 non-null  int64          
 1   venueId             60057 non-null  object         
 2   venueCategoryId     60057 non-null  object         
 3   venueCategory       60057 non-null  object         
 4   latitude            60057 non-null  float64        
 5   longitude           60057 non-null  float64        
 6   timezoneOffset      60057 non-null  int64          
 7   utcTimestamp        60057 non-null  datetime64[ns] 
 8   utcTimestampOffset  60057 non-null  datetime64[ns] 
 9   hour                60057 non-null  int64          
 10  nightlife           60057 non-null  object         
 11  dayofweek           60057 non-null  int64          
 12  weekday             60057 non-null  object         
 13  time_diff           60057 non

In [26]:
index = 0
SessionId = 0
last = nyc_data_sorted.loc[len(nyc_data_sorted)-1, 'userId']
for i in range(len(tmp_df3)):
    tmp_df3.iloc[i, -1] = SessionId
    if tmp_df3.iloc[i, -2] >= 60:
        SessionId += 1

In [27]:
tmp_df3.head(20)

Unnamed: 0,userId,venueId,venueCategoryId,venueCategory,latitude,longitude,timezoneOffset,utcTimestamp,utcTimestampOffset,hour,nightlife,dayofweek,weekday,time_diff,duplicate,timedelta_min,SessionId
16,1,4afca7e0f964a520d02422e3,4bf58dd8d48988d1d1941735,Ramen / Noodle House,40.757197,-73.96878,-240,2012-04-14 21:24:08,2012-04-14 17:24:08,17,False,5,False,0 days 00:43:35,False,43.583333,0
17,1,4e742aabc65bb91db3cadb79,4bf58dd8d48988d116941735,Bar,40.757305,-73.96864,-240,2012-04-14 22:07:43,2012-04-14 18:07:43,18,True,5,False,0 days 01:07:24,False,67.4,0
28,1,3fd66200f964a520f0e51ee3,4bf58dd8d48988d10d951735,Record Shop,40.728912,-73.9993,-240,2012-04-21 18:22:21,2012-04-21 14:22:21,14,False,5,False,0 days 00:26:37,False,26.616667,1
29,1,49dc03d9f964a520445f1fe3,4bf58dd8d48988d116941735,Bar,40.729658,-73.998515,-240,2012-04-21 18:48:58,2012-04-21 14:48:58,14,False,5,False,0 days 00:29:51,False,29.85,1
30,1,4a58cd1af964a520e8b71fe3,4bf58dd8d48988d122951735,Electronics Store,40.726034,-73.996309,-240,2012-04-21 19:18:49,2012-04-21 15:18:49,15,False,5,False,0 days 00:41:54,False,41.9,1
31,1,4ab53dabf964a520857320e3,4bf58dd8d48988d1fe941735,Music Store,40.726483,-73.984286,-240,2012-04-21 20:00:43,2012-04-21 16:00:43,16,False,5,False,0 days 00:17:36,False,17.6,1
32,1,4c78103edf08a1cd411fd65d,4bf58dd8d48988d14c941735,Wings Joint,40.725454,-73.987955,-240,2012-04-21 20:18:19,2012-04-21 16:18:19,16,False,5,False,0 days 06:03:06,False,363.1,1
38,1,4af33b2af964a520edeb21e3,4bf58dd8d48988d128951735,Gift Shop,40.775911,-73.980051,-240,2012-05-01 22:08:18,2012-05-01 18:08:18,18,True,1,True,0 days 00:07:05,False,7.083333,2
39,1,42586c80f964a520db201fe3,4bf58dd8d48988d121941735,Bar,40.775986,-73.979528,-240,2012-05-01 22:15:23,2012-05-01 18:15:23,18,True,1,True,3 days 16:44:32,False,5324.533333,2
60,1,4de3e4effa7651589f21983d,4bf58dd8d48988d11e941735,Bar,40.721488,-73.995029,-240,2012-05-29 23:15:10,2012-05-29 19:15:10,19,True,1,True,0 days 00:35:05,False,35.083333,3


In [28]:
import datetime as dt
from pathlib import Path
import os

import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [29]:
tmp_df3 = pd.merge(tmp_df3,poi_list, how='left', left_on='venueCategory', right_on='venueCategory')

In [31]:
tmp_df3.loc[tmp_df3['venueCategory'].str.contains('Caf'), 'Category'] = 'Cafe'
tmp_df3.loc[tmp_df3['venueCategory'].str.contains('Gas Station'), 'Category'] = 'Other'
tmp_df3[tmp_df3['Category'].isnull()]

Unnamed: 0,userId,venueId,venueCategoryId,venueCategory,latitude,longitude,timezoneOffset,utcTimestamp,utcTimestampOffset,hour,nightlife,dayofweek,weekday,time_diff,duplicate,timedelta_min,SessionId,Category


In [32]:
tmp_df3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 60057 entries, 0 to 60056
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype          
---  ------              --------------  -----          
 0   userId              60057 non-null  int64          
 1   venueId             60057 non-null  object         
 2   venueCategoryId     60057 non-null  object         
 3   venueCategory       60057 non-null  object         
 4   latitude            60057 non-null  float64        
 5   longitude           60057 non-null  float64        
 6   timezoneOffset      60057 non-null  int64          
 7   utcTimestamp        60057 non-null  datetime64[ns] 
 8   utcTimestampOffset  60057 non-null  datetime64[ns] 
 9   hour                60057 non-null  int64          
 10  nightlife           60057 non-null  object         
 11  dayofweek           60057 non-null  int64          
 12  weekday             60057 non-null  object         
 13  time_diff           60057 non-n

In [34]:
id2idx = {item_id : index for index, item_id in enumerate(tmp_df3['Category'].unique())}

In [35]:
def indexing(df, id2idx):
    df['item_idx'] = df['ItemId'].map(lambda x: id2idx.get(x, -1))
    # id2idx에 없는 아이템은 모르는 값(-1) 처리.
    return df

In [36]:
# short_session을 제거한 다음 unpopular item을 제거하면 다시 길이가 1인 session이 생길 수 있다.
# 이를 위해 반복문을 통해 지속적으로 제거한다.
def cleanse_recursive(data: pd.DataFrame, shortest, least_click) -> pd.DataFrame:
    while True:
        before_len = len(data)
        data = cleanse_short_session(data, shortest)
        data = cleanse_unpopular_item(data, least_click)
        after_len = len(data)
        if before_len == after_len:
            break
    return data


def cleanse_short_session(data: pd.DataFrame, shortest):
    session_len = data.groupby('SessionId').size()
    session_use = session_len[session_len >= shortest].index
    data = data[data['SessionId'].isin(session_use)]
    return data


def cleanse_unpopular_item(data: pd.DataFrame, least_click):
    item_popular = data.groupby('Category').size()
    item_use = item_popular[item_popular >= least_click].index
    data = data[data['Category'].isin(item_use)]
    return data

In [37]:
data = cleanse_recursive(tmp_df3, shortest=2, least_click=5)
data

Unnamed: 0,userId,venueId,venueCategoryId,venueCategory,latitude,longitude,timezoneOffset,utcTimestamp,utcTimestampOffset,hour,nightlife,dayofweek,weekday,time_diff,duplicate,timedelta_min,SessionId,Category
0,1,4afca7e0f964a520d02422e3,4bf58dd8d48988d1d1941735,Ramen / Noodle House,40.757197,-73.968780,-240,2012-04-14 21:24:08,2012-04-14 17:24:08,17,False,5,False,0 days 00:43:35,False,43.583333,0,Meal
1,1,4e742aabc65bb91db3cadb79,4bf58dd8d48988d116941735,Bar,40.757305,-73.968640,-240,2012-04-14 22:07:43,2012-04-14 18:07:43,18,True,5,False,0 days 01:07:24,False,67.400000,0,Liquid
2,1,3fd66200f964a520f0e51ee3,4bf58dd8d48988d10d951735,Record Shop,40.728912,-73.999300,-240,2012-04-21 18:22:21,2012-04-21 14:22:21,14,False,5,False,0 days 00:26:37,False,26.616667,1,Hobby
3,1,49dc03d9f964a520445f1fe3,4bf58dd8d48988d116941735,Bar,40.729658,-73.998515,-240,2012-04-21 18:48:58,2012-04-21 14:48:58,14,False,5,False,0 days 00:29:51,False,29.850000,1,Liquid
4,1,4a58cd1af964a520e8b71fe3,4bf58dd8d48988d122951735,Electronics Store,40.726034,-73.996309,-240,2012-04-21 19:18:49,2012-04-21 15:18:49,15,False,5,False,0 days 00:41:54,False,41.900000,1,Shopping
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60052,1083,4eda64ced5fb8f213a5d740e,4bf58dd8d48988d176941735,Gym / Fitness Center,40.746119,-73.993070,-300,2013-01-08 01:12:17,2013-01-07 20:12:17,20,True,0,True,0 days 14:14:11,False,854.183333,24675,Exercise
60053,1083,4c347eeb213c2d7fae85385d,4bf58dd8d48988d1fe931735,Bus Station,40.734500,-73.978673,-300,2013-01-30 14:00:55,2013-01-30 09:00:55,9,False,2,True,0 days 00:32:20,False,32.333333,24676,Transportation
60054,1083,4a53d9a7f964a520c7b21fe3,4bf58dd8d48988d124941735,Office,40.745518,-73.992351,-300,2013-01-30 14:33:15,2013-01-30 09:33:15,9,False,2,True,2 days 13:46:19,False,3706.316667,24676,Work
60055,1083,49f4dca6f964a520626b1fe3,4bf58dd8d48988d1c1941735,Mexican Restaurant,40.735174,-73.979597,-300,2013-02-02 04:19:34,2013-02-01 23:19:34,23,True,4,True,0 days 00:29:02,False,29.033333,24677,Meal


In [62]:
data = data[['SessionId', 'utcTimestampOffset', 'Category']]

In [66]:
data.columns = ['SessionId', 'Time', 'ItemId']

In [68]:
data.sort_values(by=['SessionId', 'Time'], inplace=True) # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.sort_values.html
data.shape

(60057, 3)

In [103]:
data

Unnamed: 0,SessionId,Time,ItemId,SessionId_plus
0,1,2012-04-14 17:24:08,Meal,1
1,1,2012-04-14 18:07:43,Liquid,1
2,2,2012-04-21 14:22:21,Hobby,2
3,2,2012-04-21 14:48:58,Liquid,2
4,2,2012-04-21 15:18:49,Shopping,2
...,...,...,...,...
60052,24676,2013-01-07 20:12:17,Exercise,24676
60053,24677,2013-01-30 09:00:55,Transportation,24677
60054,24677,2013-01-30 09:33:15,Work,24677
60055,24678,2013-02-01 23:19:34,Meal,24678


In [107]:
data.drop(['SessionId_plus'], axis= 1, inplace= True)

In [108]:
def split_by_date(data: pd.DataFrame, n_days: int):
    final_time = data['Time'].max()
    session_last_time = data.groupby('SessionId')['Time'].max()
    session_in_train = session_last_time[session_last_time < final_time - dt.timedelta(n_days)].index
    session_in_test = session_last_time[session_last_time >= final_time - dt.timedelta(n_days)].index

    before_date = data[data['SessionId'].isin(session_in_train)]
    after_date = data[data['SessionId'].isin(session_in_test)]
    after_date = after_date[after_date['ItemId'].isin(before_date['ItemId'])]
    return before_date, after_date

In [131]:
tr, test = split_by_date(data, n_days=15)
tr, val = split_by_date(tr, n_days=15)

In [132]:
# split
data_train = tmp_df3.iloc[:40019]
data_val = tmp_df3.iloc[40019:49998]
data_test = tmp_df3[49998:]

In [133]:
# data에 대한 정보를 살펴보는 함수
def stats_info(data: pd.DataFrame, status: str):
    print(f'* {status} Set Stats Info\n'
          f'\t Events: {len(data)}\n'
          f'\t Sessions: {data["SessionId"].nunique()}\n'
          f'\t Items: {data["ItemId"].nunique()}\n'
          f'\t First Time : {data["Time"].min()}\n'
          f'\t Last Time : {data["Time"].max()}\n')

In [134]:
stats_info(tr, 'train')
stats_info(val, 'valid')
stats_info(test, 'test')

* train Set Stats Info
	 Events: 55659
	 Sessions: 22803
	 Items: 20
	 First Time : 2012-04-03 14:02:24
	 Last Time : 2013-01-16 21:21:33

* valid Set Stats Info
	 Events: 1330
	 Sessions: 571
	 Items: 20
	 First Time : 2012-04-03 14:04:38
	 Last Time : 2013-01-31 21:27:59

* test Set Stats Info
	 Events: 3068
	 Sessions: 1304
	 Items: 20
	 First Time : 2012-04-03 14:00:09
	 Last Time : 2013-02-15 21:34:31



In [113]:
# train set에 없는 아이템이 val, test기간에 생길 수 있으므로 train data를 기준으로 인덱싱함.
id2idx = {item_id : index for index, item_id in enumerate(tr['ItemId'].unique())}

def indexing(df, id2idx):
    df['item_idx'] = df['ItemId'].map(lambda x: id2idx.get(x, -1))  # id2idx에 없는 아이템은 모르는 값(-1) 처리.
    return df

tr = indexing(tr, id2idx)
val = indexing(val, id2idx)
test = indexing(test, id2idx)

In [114]:
# 데이터가 주어지면 세션이 시작되는 인덱스를 담는 값과 세션을 새로 인덱싱한 값을 갖는 클래스를 만든다.
class SessionDataset:
    """Credit to yhs-968/pyGRU4REC."""
    
    def __init__(self, data):
        self.df = data
        self.click_offsets = self.get_click_offsets()     # 각 세션이 시작된 인덱스를 가진 변수
        self.session_idx = np.arange(self.df['SessionId'].nunique())  # indexing to SessionId
    
    def get_click_offsets(self):
        """
        Return the indexes of the first click of each session IDs,
        """
        offsets = np.zeros(self.df['SessionId'].nunique() + 1, dtype=np.int32)
        offsets[1:] = self.df.groupby('SessionId').size().cumsum()
        return offsets

In [137]:
tr_dataset = SessionDataset(tr)
tr_dataset.df.tail(20)

Unnamed: 0,SessionId,Time,ItemId
60033,24667,2012-05-01 23:43:58,Transportation
60034,24668,2012-05-25 14:27:51,Transportation
60035,24668,2012-05-25 14:38:58,Meal
60036,24669,2012-05-31 11:12:51,Work
60037,24669,2012-05-31 11:39:24,Shopping
60038,24670,2012-06-17 00:11:11,Meal
60039,24670,2012-06-17 01:07:12,Liquid
60040,24671,2012-09-05 17:17:39,Transportation
60041,24671,2012-09-05 17:38:25,Transportation
60042,24672,2012-09-09 16:01:48,Shopping


In [116]:
tr_dataset.df['item_idx'].nunique()

20

In [143]:
# Session DataLoader

class SessionDataLoader:
    """Credit to yhs-968/pyGRU4REC."""

    def __init__(self, dataset: SessionDataset, batch_size=50):
        self.dataset = dataset
        self.batch_size = batch_size

    def __iter__(self):
        """ Returns the iterator for producing session-parallel training mini-batches.
        Yields:
            input (B,):  Item indices that will be encoded as one-hot vectors later.
            target (B,): a Variable that stores the target item indices
            masks: Numpy array indicating the positions of the sessions to be terminated
        """

        start, end, mask, last_session, finished = self.initialize()  # initialize 메소드에서 확인해주세요.
        """
        start : Index Where Session Start
        end : Index Where Session End
        mask : indicator for the sessions to be terminated
        """

        while not finished:
            min_len = (end - start).min() - 1  # Shortest Length Among Sessions
            for i in range(min_len):
                # Build inputs & targets
                inp = self.dataset.df['item_idx'].values[start + i]
                target = self.dataset.df['item_idx'].values[start + i + 1]
                yield inp, target, mask

            start, end, mask, last_session, finished = self.update_status(start, end, min_len, last_session, finished)

    def initialize(self):
        first_iters = np.arange(self.batch_size)    # 첫 배치에 사용할 세션 Index를 가져옵니다.
        last_session = self.batch_size - 1    # 마지막으로 다루고 있는 세션 Index를 저장해둡니다.
        start = self.dataset.click_offsets[self.dataset.session_idx[first_iters]]       # data 상에서 session이 시작된 위치를 가져옵니다.
        end = self.dataset.click_offsets[self.dataset.session_idx[first_iters] + 1]  # session이 끝난 위치 바로 다음 위치를 가져옵니다.
        mask = np.array([])   # session의 모든 아이템을 다 돈 경우 mask에 추가해줄 것입니다.
        finished = False         # data를 전부 돌았는지 기록하기 위한 변수입니다.
        return start, end, mask, last_session, finished

    def update_status(self, start: np.ndarray, end: np.ndarray, min_len: int, last_session: int, finished: bool):  
        # 다음 배치 데이터를 생성하기 위해 상태를 update합니다.
        
        start += min_len   # __iter__에서 min_len 만큼 for문을 돌았으므로 start를 min_len 만큼 더해줍니다.
        mask = np.arange(self.batch_size)[(end - start) == 1]  
        # end는 다음 세션이 시작되는 위치인데 start와 한 칸 차이난다는 것은 session이 끝났다는 뜻입니다. mask에 기록해줍니다.

        for i, idx in enumerate(mask, start=1):  # mask에 추가된 세션 개수만큼 새로운 세션을 돌것입니다.
            new_session = last_session + i  
            if new_session > self.dataset.session_idx[-1]:  # 만약 새로운 세션이 마지막 세션 index보다 크다면 모든 학습데이터를 돈 것입니다.
                finished = True
                break
            # update the next starting/ending point
            start[idx] = self.dataset.click_offsets[self.dataset.session_idx[new_session]]     # 종료된 세션 대신 새로운 세션의 시작점을 기록합니다.
            end[idx] = self.dataset.click_offsets[self.dataset.session_idx[new_session] + 1]

        last_session += len(mask)  # 마지막 세션의 위치를 기록해둡니다.
        return start, end, mask, last_session, finished

In [161]:
tr_data_loader = SessionDataLoader(tr_dataset, batch_size=4)
len(tr_dataset.df)

55659

In [119]:
iter_ex = iter(tr_data_loader)

In [120]:
def mrr_k(pred, truth: int, k: int):
    indexing = np.where(pred[:k] == truth)[0]
    if len(indexing) > 0:
        return 1 / (indexing[0] + 1)
    else:
        return 0


def recall_k(pred, truth: int, k: int) -> int:
    answer = truth in pred[:k]
    return int(answer)

In [121]:
# Model Architecture

import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Dropout, GRU
from tensorflow.keras.losses import categorical_crossentropy
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tqdm import tqdm

In [122]:
def create_model(args):
    inputs = Input(batch_shape=(args.batch_size, 1, args.num_items))
    gru, _ = GRU(args.hsz, stateful=True, return_state=True, name='GRU')(inputs)
    dropout = Dropout(args.drop_rate)(gru)
    predictions = Dense(args.num_items, activation='softmax')(dropout)
    model = Model(inputs=inputs, outputs=[predictions])
    model.compile(loss=categorical_crossentropy, optimizer=Adam(args.lr), metrics=['accuracy'])
    model.summary()
    return model

In [123]:
class Args:
    def __init__(self, tr, val, test, batch_size, hsz, drop_rate, lr, epochs, k):
        self.tr = tr
        self.val = val
        self.test = test
        self.num_items = tr['ItemId'].nunique()
        self.num_sessions = tr['SessionId'].nunique()
        self.batch_size = batch_size
        self.hsz = hsz
        self.drop_rate = drop_rate
        self.lr = lr
        self.epochs = epochs
        self.k = k

args = Args(tr, val, test, batch_size=2048, hsz=50, drop_rate=0.1, lr=0.001, epochs=3, k=20)

In [124]:
model = create_model(args)

Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(2048, 1, 20)]           0         
                                                                 
 GRU (GRU)                   [(2048, 50),              10800     
                              (2048, 50)]                        
                                                                 
 dropout_3 (Dropout)         (2048, 50)                0         
                                                                 
 dense_3 (Dense)             (2048, 20)                1020      
                                                                 
Total params: 11,820
Trainable params: 11,820
Non-trainable params: 0
_________________________________________________________________


In [144]:
# Model Training

# train 셋으로 학습하면서 valid 셋으로 검증합니다.
def train_model(model, args):
    train_dataset = SessionDataset(args.tr)
    train_loader = SessionDataLoader(train_dataset, batch_size=args.batch_size)

    for epoch in range(1, args.epochs + 1):
        total_step = len(args.tr) - args.tr['SessionId'].nunique()
        tr_loader = tqdm(train_loader, total=total_step // args.batch_size, desc='Train', mininterval=1)
        for feat, target, mask in tr_loader:
            reset_hidden_states(model, mask)  # 종료된 session은 hidden_state를 초기화합니다. 아래 메서드에서 확인해주세요.

            input_ohe = to_categorical(feat, num_classes=args.num_items)
            input_ohe = np.expand_dims(input_ohe, axis=1)
            target_ohe = to_categorical(target, num_classes=args.num_items)

            result = model.train_on_batch(input_ohe, target_ohe)
            tr_loader.set_postfix(train_loss=result[0], accuracy = result[1])

        val_recall, val_mrr = get_metrics(args.val, model, args, args.k)  # valid set에 대해 검증합니다.

        print(f"\t - Recall@{args.k} epoch {epoch}: {val_recall:3f}")
        print(f"\t - MRR@{args.k}    epoch {epoch}: {val_mrr:3f}\n")


def reset_hidden_states(model, mask):
    gru_layer = model.get_layer(name='GRU')  # model에서 gru layer를 가져옵니다.
    hidden_states = gru_layer.states[0].numpy()  # gru_layer의 parameter를 가져옵니다.
    for elt in mask:  # mask된 인덱스 즉, 종료된 세션의 인덱스를 돌면서
        hidden_states[elt, :] = 0  # parameter를 초기화 합니다.
    gru_layer.reset_states(states=hidden_states)


def get_metrics(data, model, args, k: int):  # valid셋과 test셋을 평가하는 코드입니다. 
                                             # train과 거의 같지만 mrr, recall을 구하는 라인이 있습니다.
    dataset = SessionDataset(data)
    loader = SessionDataLoader(dataset, batch_size=args.batch_size)
    recall_list, mrr_list = [], []

    total_step = len(data) - data['SessionId'].nunique()
    for inputs, label, mask in tqdm(loader, total=total_step // args.batch_size, desc='Evaluation', mininterval=1):
        reset_hidden_states(model, mask)
        input_ohe = to_categorical(inputs, num_classes=args.num_items)
        input_ohe = np.expand_dims(input_ohe, axis=1)

        pred = model.predict(input_ohe, batch_size=args.batch_size)
        pred_arg = tf.argsort(pred, direction='DESCENDING')  # softmax 값이 큰 순서대로 sorting 합니다.

        length = len(inputs)
        recall_list.extend([recall_k(pred_arg[i], label[i], k) for i in range(length)])
        mrr_list.extend([mrr_k(pred_arg[i], label[i], k) for i in range(length)])

    recall, mrr = np.mean(recall_list), np.mean(mrr_list)
    return recall, mrr

In [145]:
# 학습 시간이 다소 오래 소요됩니다.
train_model(model, args)

Train: 100%|██████████| 16/16 [00:00<00:00, 35.02it/s, accuracy=0.199, train_loss=2.75]
Evaluation: 0it [00:00, ?it/s]


IndexError: index 320 is out of bounds for axis 0 with size 320

In [146]:
iter_ex = iter(tr_data_loader)

In [158]:
inputs, labels, mask =  next(iter_ex)
print(f'Model Input Item Idx are : {inputs}')
print(f'Label Item Idx are : {"":5} {labels}')
print(f'Previous Masked Input Idx are {mask}')

Model Input Item Idx are : [ 0  4  4 15]
Label Item Idx are :       [ 2  1  6 10]
Previous Masked Input Idx are [0 1 2 3]


In [159]:
class SessionDataLoader:
    """Credit to yhs-968/pyGRU4REC."""

    def __init__(self, dataset: SessionDataset, batch_size=50):
        self.dataset = dataset
        self.batch_size = batch_size

    def __iter__(self):
        """ Returns the iterator for producing session-parallel training mini-batches.
        Yields:
            input (B,):  Item indices that will be encoded as one-hot vectors later.
            target (B,): a Variable that stores the target item indices
            masks: Numpy array indicating the positions of the sessions to be terminated
        """

        start, end, mask, last_session, finished = self.initialize()  # initialize 메소드에서 확인해주세요.
        """
        start : Index Where Session Start
        end : Index Where Session End
        mask : indicator for the sessions to be terminated
        """

        while not finished:
            min_len = (end - start).min() - 1  # Shortest Length Among Sessions
            for i in range(min_len):
                # Build inputs & targets
                inp = self.dataset.df['item_idx'].values[start + i]
                target = self.dataset.df['item_idx'].values[start + i + 1]
                yield inp, target, mask

            start, end, mask, last_session, finished = self.update_status(start, end, min_len, last_session, finished)

    def initialize(self):
        first_iters = np.arange(self.batch_size)    # 첫 배치에 사용할 세션 Index를 가져옵니다.
        last_session = self.batch_size - 1    # 마지막으로 다루고 있는 세션 Index를 저장해둡니다.
        start = self.dataset.click_offsets[self.dataset.session_idx[first_iters]]       # data 상에서 session이 시작된 위치를 가져옵니다.
        end = self.dataset.click_offsets[self.dataset.session_idx[first_iters] + 1]  # session이 끝난 위치 바로 다음 위치를 가져옵니다.
        mask = np.array([])   # session의 모든 아이템을 다 돌은 경우 mask에 추가해줄 것입니다.
        finished = False         # data를 전부 돌았는지 기록하기 위한 변수입니다.
        return start, end, mask, last_session, finished

    def update_status(self, start: np.ndarray, end: np.ndarray, min_len: int, last_session: int, finished: bool):  
        # 다음 배치 데이터를 생성하기 위해 상태를 update합니다.
        
        start += min_len   # __iter__에서 min_len 만큼 for문을 돌았으므로 start를 min_len 만큼 더해줍니다.
        mask = np.arange(self.batch_size)[(end - start) == 1]  
        # end는 다음 세션이 시작되는 위치인데 start와 한 칸 차이난다는 것은 session이 끝났다는 뜻입니다. mask에 기록해줍니다.

        for i, idx in enumerate(mask, start=1):  # mask에 추가된 세션 개수만큼 새로운 세션을 돌것입니다.
            new_session = last_session + i  
            if new_session > self.dataset.session_idx[-1]:  # 만약 새로운 세션이 마지막 세션 index보다 크다면 모든 학습데이터를 돈 것입니다.
                finished = True
                break
            # update the next starting/ending point
            start[idx] = self.dataset.click_offsets[self.dataset.session_idx[new_session]]     # 종료된 세션 대신 새로운 세션의 시작점을 기록합니다.
            end[idx] = self.dataset.click_offsets[self.dataset.session_idx[new_session] + 1]

        last_session += len(mask)  # 마지막 세션의 위치를 기록해둡니다.
        return start, end, mask, last_session, finished

In [160]:
tr_data_loader = SessionDataLoader(tr_dataset, batch_size=4)
tr_dataset.df.head(15)

Unnamed: 0,SessionId,Time,ItemId
0,1,2012-04-14 17:24:08,Meal
1,1,2012-04-14 18:07:43,Liquid
2,2,2012-04-21 14:22:21,Hobby
3,2,2012-04-21 14:48:58,Liquid
4,2,2012-04-21 15:18:49,Shopping
5,2,2012-04-21 16:00:43,Shopping
6,2,2012-04-21 16:18:19,Meal
7,3,2012-05-01 18:08:18,Shopping
8,3,2012-05-01 18:15:23,Liquid
9,4,2012-05-29 19:15:10,Liquid
