# Sanity Check
Here we test, on a small dataframe (< 1000 rows), the methods that convert the purchase and cart events into features for the view events.

In [28]:
import pandas as pd
import sys
import os
sys.path.append(os.path.abspath("../.."))
from utils import helper_functions as hf
# display all rows; the full option name is spelled out because abbreviated
# patterns can become ambiguous when pandas adds new options
pd.set_option('display.max_rows', None)

In [29]:
# read the small sanity-check sample and preview its first five rows
filepath = "test_df.csv"
df = pd.read_csv(filepath_or_buffer=filepath)
df.head(n=5)

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id
0,2021-01-14 22:09:05+00:00,view,5913,2144415922167742561,computers.components.hdd,Unknown,49.57,1515915625591659523
1,2021-01-18 15:47:17+00:00,view,5913,2144415922167742561,computers.components.hdd,Unknown,49.57,1515915625591659523
2,2021-01-18 15:47:22+00:00,cart,5913,2144415922167742561,computers.components.hdd,Unknown,49.57,1515915625591659523
3,2021-01-18 16:15:55+00:00,purchase,5913,2144415922167742561,computers.components.hdd,Unknown,49.57,1515915625591659523
4,2021-01-29 13:56:16+00:00,view,5913,2144415922167742561,computers.components.hdd,Unknown,49.57,1515915625591659523


It is very important that the data is sorted by user_id, product_id and event_time, so that the rows of each user/product pair can be processed sequentially in chronological order.

In [30]:
# Put every (user_id, product_id) history in chronological order, then renumber the
# index so that its labels follow the sorted row order instead of the CSV's original order.
# NOTE(review): event_time is read as text, so this is a string sort — fine as long as
# all timestamps share the same ISO format and UTC offset; confirm for the full dataset.
df = (
    df
    .sort_values(by=['user_id', 'product_id', 'event_time'])
    .reset_index(drop=True)
)

We create a new column df['last_view_before_purchase'] and set it to 1 only on the last view event before a purchase event for the same product_id and user_id. As users can purchase the same product_id multiple times, it is important that the marking happens sequentially: for every purchase, we mark the closest preceding view of the same user_id and product_id. That view may sit directly above the purchase, or other events (like cart) may lie in between.

In [31]:
# initialise the flag to 0 on every row; the per-group function below
# switches it to 1 on the qualifying view rows
df['last_view_before_purchase'] = 0

# applied once per (user_id, product_id) group
def mark_last_view_before_purchase(group):
    """Set ``last_view_before_purchase`` to 1 on the latest view preceding each purchase.

    Rows are processed in the order in which they appear in ``group``; index
    labels are never compared, so the result does not depend on how the index
    is numbered. Other events (e.g. ``cart``) may sit between the view and the
    purchase. Consecutive purchases with no view in between all point at the
    same view. A purchase with no earlier view marks nothing.

    Parameters
    ----------
    group : pandas.DataFrame
        Events of one user_id/product_id pair; must have an ``event_type`` column.

    Returns
    -------
    pandas.DataFrame
        A copy of ``group`` with the flag column updated. The input frame is
        left untouched.
    """
    flag_column = 'last_view_before_purchase'

    # groupby.apply does not support functions that mutate the group they
    # receive, so all writes go to a private copy
    result = group.copy()
    if flag_column not in result.columns:
        result[flag_column] = 0
    flag_pos = result.columns.get_loc(flag_column)

    last_view_pos = None  # row position of the most recent view seen so far
    for pos, event_type in enumerate(result['event_type'].to_numpy()):
        if event_type == 'view':
            last_view_pos = pos
        elif event_type == 'purchase' and last_view_pos is not None:
            result.iloc[last_view_pos, flag_pos] = 1

    return result

# run the purchase marker once per (user_id, product_id) history
df = (
    df
    .groupby(by=['user_id', 'product_id'])
    .apply(mark_last_view_before_purchase)
)

In [32]:
# inspect which index levels the groupby().apply() step left behind
print(
    "Current index levels:",
    df.index.names,
)

Current index levels: ['user_id', 'product_id', None]


In [33]:
# flatten the (user_id, product_id, None) index left by groupby().apply()
# back to a plain 0..n-1 row numbering
df = df.reset_index(drop=True)

We create a new column df['last_view_before_cart'] and set it to 1 only on the last view event before a cart event for the same product_id and user_id.

In [34]:
# initialise the flag to 0 on every row; the per-group function below
# switches it to 1 on the qualifying view rows
df['last_view_before_cart'] = 0

# function to mark the last view before a cart event
def mark_last_view_before_cart(group):
    """Set ``last_view_before_cart`` to 1 on the latest view preceding each cart event.

    The group is walked top to bottom by row position; index labels are not
    compared, so any index numbering is acceptable. Several cart events with
    no view in between all point at the same view. A cart event with no
    earlier view marks nothing.

    Parameters
    ----------
    group : pandas.DataFrame
        Events of one user_id/product_id pair; must have an ``event_type`` column.

    Returns
    -------
    pandas.DataFrame
        A copy of ``group`` with the flag column updated. The input frame is
        left untouched.
    """
    flag_column = 'last_view_before_cart'

    # work on a copy: mutating the object handed in by groupby.apply is unsupported
    marked = group.copy()
    if flag_column not in marked.columns:
        marked[flag_column] = 0
    flag_pos = marked.columns.get_loc(flag_column)

    latest_view = None  # row position of the most recent view, if any
    for row_pos, event in enumerate(marked['event_type'].to_numpy()):
        if event == 'view':
            latest_view = row_pos
        elif event == 'cart' and latest_view is not None:
            marked.iloc[latest_view, flag_pos] = 1

    return marked

# run the cart marker once per (user_id, product_id) history
df = (
    df
    .groupby(by=['user_id', 'product_id'])
    .apply(mark_last_view_before_cart)
)

In [35]:
# discard the index produced by the groupby().apply() above and renumber the rows 0..n-1
df = df.reset_index(drop=True)

We create a new column 'already_in_cart' and set it to 1 on the first view event that follows a cart event for the same user_id and product_id, provided no purchase event happened in between.

In [36]:
# initialise the flag to 0 on every row; the per-group function below
# switches it to 1 on the qualifying view rows
df['already_in_cart'] = 0

# function to mark views that follow a cart event without a purchase in between
def mark_view_after_cart(group):
    """Set ``already_in_cart`` to 1 on the first view that follows a cart event.

    For every cart event, the first later view is flagged unless a purchase
    lies between that cart event and the view. Rows are processed by position,
    so no assumption is made about the index (it need not be a consecutive
    integer range).

    NOTE(review): only the *first* view after a cart event is flagged; later
    views while the product is still unpurchased stay 0. This mirrors the
    previous implementation — confirm that it is the intended feature.

    Parameters
    ----------
    group : pandas.DataFrame
        Events of one user_id/product_id pair; must have an ``event_type`` column.

    Returns
    -------
    pandas.DataFrame
        A copy of ``group`` with the flag column updated. The input frame is
        left untouched.
    """
    flag_column = 'already_in_cart'

    # work on a copy: mutating the object handed in by groupby.apply is unsupported
    result = group.copy()
    if flag_column not in result.columns:
        result[flag_column] = 0
    flag_pos = result.columns.get_loc(flag_column)

    # True while a cart event has been seen that is followed by neither a
    # purchase nor a view yet
    cart_pending = False
    for pos, event_type in enumerate(result['event_type'].to_numpy()):
        if event_type == 'cart':
            cart_pending = True
        elif event_type == 'purchase':
            # the purchase separates every earlier cart event from later views
            cart_pending = False
        elif event_type == 'view':
            if cart_pending:
                result.iloc[pos, flag_pos] = 1
            # later views are no longer the first view after those cart events
            cart_pending = False

    return result

# run the already-in-cart marker once per (user_id, product_id) history
df = (
    df
    .groupby(by=['user_id', 'product_id'])
    .apply(mark_view_after_cart)
)

In [37]:
# write the flagged frame to disk; index=False keeps the index out of the CSV
df.to_csv(path_or_buf='test_df_done.csv', index=False)

In [38]:
# discard the index produced by the last groupby().apply() and renumber the rows 0..n-1,
# so the row numbers quoted in the next cell refer to this plain numbering
df = df.reset_index(drop=True)

In [39]:
# Rows to inspect in the table below:
#   row 1  -> example of last_view_before_purchase and last_view_before_cart
#   row 65 -> example of already_in_cart
hf.create_subset(
    df,
    [
        'event_time',
        'event_type',
        'user_id',
        'product_id',
        'last_view_before_purchase',
        'last_view_before_cart',
        'already_in_cart',
    ],
).head(67)

Unnamed: 0,event_time,event_type,user_id,product_id,last_view_before_purchase,last_view_before_cart,already_in_cart
0,2021-01-14 22:09:05+00:00,view,1515915625591659523,5913,0,0,0
1,2021-01-18 15:47:17+00:00,view,1515915625591659523,5913,1,1,0
2,2021-01-18 15:47:22+00:00,cart,1515915625591659523,5913,0,0,0
3,2021-01-18 16:15:55+00:00,purchase,1515915625591659523,5913,0,0,0
4,2021-01-29 13:56:16+00:00,view,1515915625591659523,5913,1,1,0
5,2021-01-29 13:56:23+00:00,cart,1515915625591659523,5913,0,0,0
6,2021-01-29 13:59:28+00:00,purchase,1515915625591659523,5913,0,0,0
7,2021-01-29 14:09:10+00:00,purchase,1515915625591659523,5913,0,0,0
8,2021-01-29 14:29:08+00:00,purchase,1515915625591659523,5913,0,0,0
9,2021-01-28 21:02:59+00:00,view,1515915625591659523,124077,0,0,0
