1. pre-process the sessions (you can execute these tasks in the order you prefer, but make sure to specify which task is being solved in which block of code):
    1. sessionise <font color='red'>(1pt)</font>
    2. select sessions with at least one add-to-cart <font color='red'>(1pt)</font>
    3. add class labels: treat purchase as the positive class <font color='red'>(1pt)</font>
    4. cut purchase sessions to the last event before the first purchase <font color='red'>(1pt)</font>
    5. remove sessions shorter than 5 and longer than 155 clicks <font color='red'>(1pt)</font>
    6. symbolise actions <font color='red'>(1pt)</font>

In [1]:
# imports
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# dataset
# Read only the first 1000 rows of the event log so the notebook stays quick to run.
# NOTE(review): cutting by row count can leave the last session incomplete —
# confirm this is acceptable before drawing conclusions from the sample.
df = pd.read_csv(
    'training_data.csv',
    sep=',',
    header=0,
    nrows=1000,
)
df.head()

In [3]:
# A. sessionise
# Collapse the event-level table into one row per session, where each row holds
# the list of actions recorded for that session.
# Events with a missing product_action are labelled 'view'.
# NOTE(review): the actions keep the row order of the input file; this assumes
# the file is already in chronological order within each session — TODO confirm.
df = (
    df.assign(product_action=df['product_action'].fillna('view'))
      .groupby('session_id_hash')['product_action']
      .agg(list)
      .reset_index()
)
df.head()

Unnamed: 0,session_id_hash,product_action
0,01d7d940ce504a0dd72b1d58db5151cfdf8db4dec8cc78...,[view]
1,0281ec4ac20cf43d403a4446087f6905bf663dcc96b8b6...,"[view, view, view, view, view, detail]"
2,04710ddefd3cb44bd3c709c4cc71e97ccbf5a39554f0fa...,"[detail, view, view]"
3,064c9669e8bcd2313430b37b4b3de4a09dc82996fd8562...,"[view, view, view, view, detail, view, view, v..."
4,0e6305b2c2e6dddde5b05c54e8f3381c6f6f0f47dcfa51...,"[view, view, view, detail, view, view, view, d..."


In [4]:
# B. select sessions with at least one add-to-cart
# Build a per-session flag that is True when the action list contains 'add';
# the cast keeps the flag boolean even when there are no sessions at all.
has_add = df['product_action'].map(lambda actions: 'add' in actions).astype(bool)
# Keep only the flagged sessions, as an independent frame.
df = df[has_add].copy()
df.head()

Unnamed: 0,session_id_hash,product_action
79,8256e7aaf7ade5e68787f3118077de97ae8eec18f47f97...,"[detail, view, view, view, detail, add, view, ..."
123,cedfb82a26c27b1f22fdc678892a426276ce299362cf0f...,"[view, detail, add, view, view, purchase, view..."
150,ef888972c0f85f01b207596ad54fc95dd6b291557af34e...,"[detail, view, view, detail, view, view, view,..."
153,f0682c73c8f2fb5584144e4f152bb5e58570cc1c69cd1f...,"[detail, view, view, view, detail, add, detail..."


In [5]:
# C. add class labels: treat purchase as the positive class
# Label every session: 1 (BUY) when its action list contains 'purchase',
# 0 (NO-BUY) otherwise.
contains_purchase = df['product_action'].map(lambda actions: 'purchase' in actions)
df['purchase'] = np.where(contains_purchase, 1, 0)
df.head()

Unnamed: 0,session_id_hash,product_action,purchase
79,8256e7aaf7ade5e68787f3118077de97ae8eec18f47f97...,"[detail, view, view, view, detail, add, view, ...",1
123,cedfb82a26c27b1f22fdc678892a426276ce299362cf0f...,"[view, detail, add, view, view, purchase, view...",1
150,ef888972c0f85f01b207596ad54fc95dd6b291557af34e...,"[detail, view, view, detail, view, view, view,...",0
153,f0682c73c8f2fb5584144e4f152bb5e58570cc1c69cd1f...,"[detail, view, view, view, detail, add, detail...",0


In [6]:
# D. cut purchase sessions to the last event before the first purchase
# BUY sessions keep only the actions that precede their first 'purchase'
# (list.index returns the first occurrence); NO-BUY sessions are left untouched.
sequence_action = [
    actions[:actions.index('purchase')] if 'purchase' in actions else actions
    for actions in df['product_action']
]
# Store the trimmed sequences and keep only the columns used from here on.
df = df.assign(session=sequence_action)[['session_id_hash', 'session', 'purchase']]
df.head()

Unnamed: 0,session_id_hash,session,purchase
79,8256e7aaf7ade5e68787f3118077de97ae8eec18f47f97...,"[detail, view, view, view, detail, add, view, ...",1
123,cedfb82a26c27b1f22fdc678892a426276ce299362cf0f...,"[view, detail, add, view, view]",1
150,ef888972c0f85f01b207596ad54fc95dd6b291557af34e...,"[detail, view, view, detail, view, view, view,...",0
153,f0682c73c8f2fb5584144e4f152bb5e58570cc1c69cd1f...,"[detail, view, view, view, detail, add, detail...",0


In [7]:
# E. remove sessions shorter than 5 and longer than 155 clicks
MIN_SESSION_LEN = 5    # sessions with fewer clicks than this are removed
MAX_SESSION_LEN = 155  # sessions with more clicks than this are removed

# Number of clicks left in each (possibly trimmed) session.
df = df.assign(len=df['session'].map(len))
# removing very short and very long sessions
# One boolean mask replaces the two in-place drops, so the frame is never
# mutated in place and the cell builds a fresh, independent result.
within_bounds = (df['len'] >= MIN_SESSION_LEN) & (df['len'] <= MAX_SESSION_LEN)
df = df[within_bounds].copy()
# Series.min()/max() return NaN when no session survives the filters, whereas
# the builtin min()/max() raise ValueError on an empty sequence.
print(df['len'].min(), df['len'].max())
df.head()

5 16


Unnamed: 0,session_id_hash,session,purchase,len
79,8256e7aaf7ade5e68787f3118077de97ae8eec18f47f97...,"[detail, view, view, view, detail, add, view, ...",1,9
123,cedfb82a26c27b1f22fdc678892a426276ce299362cf0f...,"[view, detail, add, view, view]",1,5
150,ef888972c0f85f01b207596ad54fc95dd6b291557af34e...,"[detail, view, view, detail, view, view, view,...",0,16
153,f0682c73c8f2fb5584144e4f152bb5e58570cc1c69cd1f...,"[detail, view, view, view, detail, add, detail...",0,8


In [8]:
# F. symbolise actions
from collections import Counter

sessions = df['session'].to_list()
labels = df['purchase'].to_list()

# Count how often each action occurs across all sessions.
counts = Counter(action for actions in sessions for action in actions)
# Actions are numbered from 1 in order of decreasing frequency; 0 is left
# unassigned so it stays available for padding (padding is not used here).
symbol2idx = {symbol: rank for rank, (symbol, _) in enumerate(counts.most_common(), 1)}
print(symbol2idx)

# Replace every action in every session by its integer symbol.
symbolised_sessions = [[symbol2idx[action] for action in actions] for actions in sessions]
print(symbolised_sessions[:5])

{'view': 1, 'detail': 2, 'add': 3}
[[2, 1, 1, 1, 2, 3, 1, 1, 1], [1, 2, 3, 1, 1], [2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 3, 1], [2, 1, 1, 1, 2, 3, 2, 1]]
