# Session-based Recs with Transformers4Rec: Preliminary Preprocessing

This notebook follows the step-by-step tutorial:
https://nvidia-merlin.github.io/Transformers4Rec/main/examples/tutorial/index.html

## Imports

In [1]:
import os
import numpy as np
import gc
import shutil
import glob
import pandas as pd

## Read and Process E-Commerce Data

- publicly available eCommerce dataset
- each row represents an event
- events are related to products and users
- each event represents a many-to-many relation between products and users
https://www.kaggle.com/mkechinov/ecommerce-behavior-data-from-multi-category-store \
**ONLY 2019-Oct.csv USED FOR TRAINING**

In [2]:
# Directory that holds the raw CSV; the INPUT_DATA_DIR environment variable
# overrides the '../data/' default.
INPUT_DATA_DIR = os.getenv("INPUT_DATA_DIR", '../data/')

In [3]:
%%time
raw_df = pd.read_csv(os.path.join(INPUT_DATA_DIR, '2019-Oct.csv'))

CPU times: user 42 s, sys: 9.72 s, total: 51.7 s
Wall time: 54.2 s


In [4]:
# Preview the first rows of the freshly loaded event log
raw_df.head()

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
0,2019-10-01 00:00:00 UTC,view,44600062,2103807459595387724,,shiseido,35.79,541312140,72d76fde-8bb3-4e00-8c23-a032dfed738c
1,2019-10-01 00:00:00 UTC,view,3900821,2053013552326770905,appliances.environment.water_heater,aqua,33.2,554748717,9333dfbd-b87a-4708-9857-6336556b0fcc
2,2019-10-01 00:00:01 UTC,view,17200506,2053013559792632471,furniture.living_room.sofa,,543.1,519107250,566511c2-e2e3-422b-b695-cf8e6e792ca8
3,2019-10-01 00:00:01 UTC,view,1307067,2053013558920217191,computers.notebook,lenovo,251.74,550050854,7c90fc70-0e80-4590-96f3-13c02c18c713
4,2019-10-01 00:00:04 UTC,view,1004237,2053013555631882655,electronics.smartphone,apple,1081.98,535871217,c6bd7419-2748-4c56-95b4-8cec9ff8b80d


In [5]:
# (number of rows, number of columns) of the raw event log
raw_df.shape

(42448764, 9)

In [6]:
# Column dtypes as inferred by read_csv; event_time is still a plain string (object)
raw_df.dtypes

event_time        object
event_type        object
product_id         int64
category_id        int64
category_code     object
brand             object
price            float64
user_id            int64
user_session      object
dtype: object

### Convert timestamp from datetime

The parsed datetime has nanosecond resolution; convert nanoseconds $\rightarrow$ seconds, as the nanosecond units are off with the ETL.

In [7]:
# Parse the "... UTC" strings explicitly instead of relying on
# astype('datetime64[s]'), whose handling of timezone-suffixed strings and of
# the requested unit differs between pandas versions.
#   * utc=True          -> the trailing "UTC" is honoured while parsing
#   * tz_localize(None) -> drop the timezone so the column stays tz-naive; the
#                          later first-week filter compares against a naive
#                          np.datetime64
#   * datetime64[ns]    -> pin nanosecond resolution, which the following cells
#                          assume when they divide the integer value by 1e9
raw_df['event_time_dt'] = (
    pd.to_datetime(raw_df['event_time'], utc=True)
    .dt.tz_localize(None)
    .astype('datetime64[ns]')
)

In [8]:
# Epoch time as integer nanoseconds.
# The resolution is pinned to nanoseconds first, so the value does not depend
# on whichever unit the datetime column happens to be stored in, and the width
# is pinned to int64 because plain 'int' is platform dependent and too narrow
# for nanosecond timestamps where it maps to 32 bits.
raw_df['event_time_ts'] = (
    raw_df['event_time_dt'].astype('datetime64[ns]').astype('int64')
)

  raw_df['event_time_ts'] = raw_df['event_time_dt'].astype('int')


In [9]:
# Scale epoch nanoseconds down to (float) epoch seconds.
# Caution: this cell is not idempotent - running it again divides the column again.
raw_df['event_time_ts'] = raw_df['event_time_ts'].div(1e09)

In [10]:
# Inspect the two new timestamp columns (event_time_dt, event_time_ts)
raw_df.head()

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,event_time_dt,event_time_ts
0,2019-10-01 00:00:00 UTC,view,44600062,2103807459595387724,,shiseido,35.79,541312140,72d76fde-8bb3-4e00-8c23-a032dfed738c,2019-10-01 00:00:00,1569888000.0
1,2019-10-01 00:00:00 UTC,view,3900821,2053013552326770905,appliances.environment.water_heater,aqua,33.2,554748717,9333dfbd-b87a-4708-9857-6336556b0fcc,2019-10-01 00:00:00,1569888000.0
2,2019-10-01 00:00:01 UTC,view,17200506,2053013559792632471,furniture.living_room.sofa,,543.1,519107250,566511c2-e2e3-422b-b695-cf8e6e792ca8,2019-10-01 00:00:01,1569888000.0
3,2019-10-01 00:00:01 UTC,view,1307067,2053013558920217191,computers.notebook,lenovo,251.74,550050854,7c90fc70-0e80-4590-96f3-13c02c18c713,2019-10-01 00:00:01,1569888000.0
4,2019-10-01 00:00:04 UTC,view,1004237,2053013555631882655,electronics.smartphone,apple,1081.98,535871217,c6bd7419-2748-4c56-95b4-8cec9ff8b80d,2019-10-01 00:00:04,1569888000.0


In [11]:
# Flag every column that contains at least one missing value
raw_df.isna().any()

event_time       False
event_type       False
product_id       False
category_id      False
category_code     True
brand             True
price            False
user_id          False
user_session      True
event_time_dt    False
event_time_ts    False
dtype: bool

In [12]:
# Keep only the events that carry a session id, then report how many rows remain
raw_df = raw_df.loc[raw_df['user_session'].notna()]
raw_df.shape[0]

42448762

In [13]:
# The raw string timestamp is superseded by event_time_dt / event_time_ts
raw_df = raw_df.drop(columns=['event_time'])

In [14]:
# Confirm that the event_time column is gone
raw_df.head()

Unnamed: 0,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,event_time_dt,event_time_ts
0,view,44600062,2103807459595387724,,shiseido,35.79,541312140,72d76fde-8bb3-4e00-8c23-a032dfed738c,2019-10-01 00:00:00,1569888000.0
1,view,3900821,2053013552326770905,appliances.environment.water_heater,aqua,33.2,554748717,9333dfbd-b87a-4708-9857-6336556b0fcc,2019-10-01 00:00:00,1569888000.0
2,view,17200506,2053013559792632471,furniture.living_room.sofa,,543.1,519107250,566511c2-e2e3-422b-b695-cf8e6e792ca8,2019-10-01 00:00:01,1569888000.0
3,view,1307067,2053013558920217191,computers.notebook,lenovo,251.74,550050854,7c90fc70-0e80-4590-96f3-13c02c18c713,2019-10-01 00:00:01,1569888000.0
4,view,1004237,2053013555631882655,electronics.smartphone,apple,1081.98,535871217,c6bd7419-2748-4c56-95b4-8cec9ff8b80d,2019-10-01 00:00:04,1569888000.0


### Categorify user_session column
- user_session not used as input feature for the model
- raw long strings $\rightarrow$ int values to avoid potential failures

In [15]:
# Every column name except the session id
cols = raw_df.columns.tolist()
cols.remove('user_session')
cols

['event_type',
 'product_id',
 'category_id',
 'category_code',
 'brand',
 'price',
 'user_id',
 'event_time_dt',
 'event_time_ts']

***CONTINUE TO USE PANDAS AS VM DOES NOT SUPPORT GPU NVTABULAR***\
however, tutorial uses NVTabular

In [16]:
# Bind a second name to the same DataFrame (no copy is made), so the in-place
# column edits applied to df in the following cells are visible through raw_df too.
df = raw_df

In [17]:
# Label-encode user_session, step 1: cast the session-id strings to pandas'
# 'category' dtype so that each distinct value gets an integer code.
df['user_session'] = df['user_session'].astype('category')

In [18]:
# Label-encode user_session, step 2: store the integer category code of each
# session in a new column.
df['user_session_numeric'] = df['user_session'].cat.codes

In [19]:
# Remove the categorical session column; its integer codes live in user_session_numeric
del df['user_session']

In [20]:
# Give the integer-coded column the original 'user_session' name
df = df.rename({'user_session_numeric': 'user_session'}, axis='columns')

In [21]:
# user_session now holds integer codes instead of long id strings
df.head()

Unnamed: 0,event_type,product_id,category_id,category_code,brand,price,user_id,event_time_dt,event_time_ts,user_session
0,view,44600062,2103807459595387724,,shiseido,35.79,541312140,2019-10-01 00:00:00,1569888000.0,4147850
1,view,3900821,2053013552326770905,appliances.environment.water_heater,aqua,33.2,554748717,2019-10-01 00:00:00,1569888000.0,5316338
2,view,17200506,2053013559792632471,furniture.living_room.sofa,,543.1,519107250,2019-10-01 00:00:01,1569888000.0,3120153
3,view,1307067,2053013558920217191,computers.notebook,lenovo,251.74,550050854,2019-10-01 00:00:01,1569888000.0,4499321
4,view,1004237,2053013555631882655,electronics.smartphone,apple,1081.98,535871217,2019-10-01 00:00:04,1569888000.0,7176696


In [22]:
# Remove the raw_df name so the frame it referenced can be garbage-collected
del raw_df

In [23]:
# Ask the garbage collector to run a full collection now
gc.collect()

0

### Remove consecutive repeated (user,item) interactions
- keep repeated interactions with the same item within a session when they are not consecutive
- remove only consecutive repeats of the same item (caused by browser refreshes or by different interaction types, e.g. click, add-to-cart, purchase)

In [24]:
# Order events chronologically within each session and renumber the rows
df = df.sort_values(by=['user_session', 'event_time_ts'], ignore_index=True)

In [25]:
rows_before_dedup = len(df)
print("Count with in-session repeated interactions: {}".format(rows_before_dedup))

Count with in-session repeated interactions: 42448762


In [26]:
# The frame is already sorted by session and timestamp, so shifting by one row
# gives, for every event, the product and session of the event just before it.
# The first row has no predecessor: fill it with -1, a value that can never be
# a real category code, instead of NaN -> 0 (0 IS a valid session code).
# Using fill_value also keeps both helper columns as integers rather than
# upcasting them to float.
df['product_id_past'] = df['product_id'].shift(1, fill_value=-1)
df['session_id_past'] = df['user_session'].shift(1, fill_value=-1)

In [27]:
# Keep a row when either its session or its product differs from the previous
# row, i.e. drop rows that repeat the previous (session, product) pair.
df = df[(df['user_session'] != df['session_id_past']) | (df['product_id'] != df['product_id_past'])]
remaining_rows = len(df)
print("Count after removed in-session repeated interactions: {}".format(remaining_rows))

Count after removed in-session repeated interactions: 30733301


In [28]:
# The previous-row helper columns were only needed for the de-duplication filter
df = df.drop(columns=['product_id_past', 'session_id_past'])

In [29]:
# Ask the garbage collector to run a full collection now
gc.collect()

0

### Include the item first time seen feature (for recency calculation)
- prod_first_event_time_ts $\rightarrow$ timestamp that an item was seen first time

In [30]:
# Renumber the rows 0..n-1 and discard the old index, which has gaps after the row filtering
df = df.reset_index(drop=True)

In [31]:
# One row per product: the earliest timestamp at which it appears in the data
item_first_interaction_df = (
    df.groupby('product_id', as_index=False)['event_time_ts']
    .min()
    .rename(columns={'event_time_ts': 'prod_first_event_time_ts'})
)

In [32]:
# NOTE(review): the head() result below is discarded - only a cell's last
# expression is displayed, and here that is the return value of gc.collect().
item_first_interaction_df.head()
gc.collect()

0

In [33]:
# State of the event frame before the first-seen timestamp is merged in
df.head()

Unnamed: 0,event_type,product_id,category_id,category_code,brand,price,user_id,event_time_dt,event_time_ts,user_session
0,view,54900011,2146660887203676486,apparel.costume,,64.35,515483062,2019-10-18 10:54:45,1571396000.0,0
1,view,1005115,2053013555631882655,electronics.smartphone,apple,955.84,513782162,2019-10-31 06:23:12,1572503000.0,1
2,view,1005105,2053013555631882655,electronics.smartphone,apple,1349.46,513782162,2019-10-31 06:23:52,1572503000.0,1
3,view,5100816,2053013553375346967,,xiaomi,29.6,513782162,2019-10-31 06:25:52,1572503000.0,1
4,view,1004858,2053013555631882655,electronics.smartphone,samsung,131.53,513782162,2019-10-31 06:26:58,1572503000.0,1


In [34]:
# Attach each product's first-seen timestamp to every event of that product
df = pd.merge(df, item_first_interaction_df, how='left', on='product_id').reset_index(drop=True)

In [35]:
# Verify that prod_first_event_time_ts was added
df.head()

Unnamed: 0,event_type,product_id,category_id,category_code,brand,price,user_id,event_time_dt,event_time_ts,user_session,prod_first_event_time_ts
0,view,54900011,2146660887203676486,apparel.costume,,64.35,515483062,2019-10-18 10:54:45,1571396000.0,0,1570238000.0
1,view,1005115,2053013555631882655,electronics.smartphone,apple,955.84,513782162,2019-10-31 06:23:12,1572503000.0,1,1569888000.0
2,view,1005105,2053013555631882655,electronics.smartphone,apple,1349.46,513782162,2019-10-31 06:23:52,1572503000.0,1,1569888000.0
3,view,5100816,2053013553375346967,,xiaomi,29.6,513782162,2019-10-31 06:25:52,1572503000.0,1,1569888000.0
4,view,1004858,2053013555631882655,electronics.smartphone,samsung,131.53,513782162,2019-10-31 06:26:58,1572503000.0,1,1569888000.0


In [36]:
# Drop the reference to the lookup table (the name stays bound to None) and
# trigger a garbage collection.
item_first_interaction_df = None
gc.collect()

0

### Only use the first week of data from Oct 2019 dataset

In [37]:
# Check the earliest event timestamp present in the data
df['event_time_dt'].min()

Timestamp('2019-10-01 00:00:00')

In [38]:
# Keep events strictly before 2019-10-08 and renumber the rows
df = df.loc[df['event_time_dt'].lt(np.datetime64('2019-10-08'))].reset_index(drop=True)

In [39]:
# Confirm the latest remaining event is earlier than the 2019-10-08 cutoff
df['event_time_dt'].max()

Timestamp('2019-10-07 23:59:59')

In [40]:
# The datetime column was only needed for the date filter; event_time_ts remains
df = df.drop(columns=['event_time_dt'])

In [41]:
# Final look at the processed frame before it is written to disk
df.head()

Unnamed: 0,event_type,product_id,category_id,category_code,brand,price,user_id,event_time_ts,user_session,prod_first_event_time_ts
0,view,1004768,2053013555631882655,electronics.smartphone,samsung,251.47,546521725,1570361000.0,2,1569891000.0
1,view,1005098,2053013555631882655,electronics.smartphone,samsung,152.58,546521725,1570361000.0,2,1569897000.0
2,view,1005073,2053013555631882655,electronics.smartphone,samsung,1153.03,546521725,1570361000.0,2,1569888000.0
3,view,1004871,2053013555631882655,electronics.smartphone,samsung,286.6,546521725,1570361000.0,2,1569896000.0
4,view,1004751,2053013555631882655,electronics.smartphone,samsung,197.15,546521725,1570361000.0,2,1569897000.0


In [42]:
# Save the processed frame as CSV in the same data directory.
# NOTE(review): to_csv is called without index=False, so the row index is
# written as an extra unnamed first column - confirm whether the consumer of
# this file expects that column before changing it.
df.to_csv(os.path.join(INPUT_DATA_DIR, '2019-Oct-Processed.csv'))

In [43]:
# Shut the kernel down, passing True as the restart flag
import IPython
kernel = IPython.Application.instance().kernel
kernel.do_shutdown(True)

{'status': 'ok', 'restart': True}