# Data Extraction & Cleaning

In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt

In [2]:
# Load the raw event log from the CSV file (path is relative to the working directory).
df = pd.read_csv("ecommerce_clickstream_transactions.csv")

In [5]:
# Preview the first rows to check the columns and the kind of values they hold.
df.head()

Unnamed: 0,UserID,SessionID,Timestamp,EventType,ProductID,Amount,Outcome
0,1,1,2024-07-07 18:00:26.959902,page_view,,,
1,1,1,2024-03-05 22:01:00.072000,page_view,,,
2,1,1,2024-03-23 22:08:10.568453,product_view,prod_8199,,
3,1,1,2024-03-12 00:32:05.495638,add_to_cart,prod_4112,,
4,1,1,2024-02-25 22:43:01.318876,add_to_cart,prod_3354,,


In [3]:
# Show each column's dtype and non-null count to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74817 entries, 0 to 74816
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   UserID     74817 non-null  int64  
 1   SessionID  74817 non-null  int64  
 2   Timestamp  74817 non-null  object 
 3   EventType  74817 non-null  object 
 4   ProductID  32113 non-null  object 
 5   Amount     10682 non-null  float64
 6   Outcome    10682 non-null  object 
dtypes: float64(1), int64(2), object(4)
memory usage: 4.0+ MB


In [24]:
# Summary statistics for the numeric columns.
df.describe()

Unnamed: 0,UserID,SessionID,Amount
count,74817.0,74817.0,10682.0
mean,500.740741,5.509777,253.189758
std,288.945183,2.869337,143.043974
min,1.0,1.0,5.131843
25%,251.0,3.0,130.933965
50%,501.0,6.0,253.112517
75%,751.0,8.0,378.832078
max,1000.0,10.0,499.981538


In [33]:
# Count how many rows belong to each event type.
df['EventType'].value_counts()

EventType
page_view       10819
add_to_cart     10735
product_view    10696
logout          10685
purchase        10682
click           10632
login           10568
Name: count, dtype: int64

In [34]:
# Number of missing values in each column.
df.isna().sum()

UserID           0
SessionID        0
Timestamp        0
EventType        0
ProductID    42704
Amount       64135
Outcome      64135
dtype: int64

We are only interested in events that are directly tied to a product and could signal that the user is interested in it.

In [None]:
# Event types that reference a specific product and therefore signal interest in it.
events_to_keep = ['product_view', 'add_to_cart', 'purchase']

# Build the cleaned frame in a single chain so the cell can be re-run safely.
df = (
    df
    # Keep only the product-related events.
    .loc[lambda events: events['EventType'].isin(events_to_keep)]
    # An interaction without a product cannot be attributed to any item.
    .dropna(subset=['ProductID'])
    # errors='ignore': on a second run these columns are already gone, and
    # without it the cell would raise KeyError instead of being a no-op.
    .drop(columns=['SessionID', 'Amount', 'Outcome'], errors='ignore')
    # Remove exact repeats of the same event for the same user and product.
    .drop_duplicates(subset=['UserID', 'ProductID', 'EventType', 'Timestamp'])
)

# Data Visualization

We would like a metric that tracks each user's interest in the different products. To build it, we give each event type ('product_view', 'add_to_cart', 'purchase') a weight representing how strongly it signals that the user is interested in the product. Each interaction can be repeated multiple times, and every repetition increases the interaction strength. This way, if a user performs the same kinds of events on different items, the item with more repeated events ends up with a higher interaction strength. However, adding an item to the cart is a much better signal of interest than just viewing a product. So we also give each event a cap on how many times it can affect the interaction strength.

In [5]:
# Score contributed by a single occurrence of each event type.
weights = dict(
    product_view=3,
    add_to_cart=5,
    purchase=8,
)

# Maximum number of occurrences of each event type that may count towards
# the interaction strength; None means the event type is not capped.
caps = dict(
    product_view=2,
    add_to_cart=None,
    purchase=None,
)

In [6]:
# Tally how many times each (user, product, event type) combination occurs.
counts = (
    df
    .groupby(['UserID', 'ProductID', 'EventType'])
    .size()
    .reset_index(name='count')
)

def cap_count(row, cap_table=None):
    """Return the row's event count, limited by the cap for its event type.

    Parameters
    ----------
    row : mapping
        Must provide ``row['EventType']`` and ``row['count']`` (for example a
        row passed by ``DataFrame.apply(..., axis=1)``).
    cap_table : dict, optional
        Maps event type to its cap, with ``None`` meaning "no cap". Defaults
        to the module-level ``caps`` dictionary.

    Returns
    -------
    The original count when the event type has no cap, otherwise the smaller
    of the count and the cap.

    Raises
    ------
    KeyError
        If the row's event type is not present in the cap table.
    """
    if cap_table is None:
        cap_table = caps
    cap = cap_table[row['EventType']]
    # Compare against None explicitly: a cap of 0 is falsy but is still a real
    # limit (the event should contribute nothing), not "uncapped".
    if cap is None:
        return row['count']
    return min(row['count'], cap)

# Apply the per-event cap to every (user, product, event type) tally.
counts['count_capped'] = counts.apply(cap_count, axis=1)

# Strength of a row = capped number of occurrences times the weight of its event type.
counts['interaction_strength'] = counts['count_capped'].mul(
    counts['EventType'].map(weights)
)

# Total interaction strength of each (user, product) pair, summed over its event types.
user_item_df = counts.groupby(
    ['UserID', 'ProductID'], as_index=False
)['interaction_strength'].sum()


## User-Item Matrix (Interaction strength of each user for each product)

In [10]:
# One row per user, one column per product; pairs with no recorded
# interaction are filled with a strength of 0.
user_item_matrix = (
    user_item_df
    .set_index(['UserID', 'ProductID'])['interaction_strength']
    .unstack('ProductID')
    .fillna(0)
)

## Item-Item Matrix (How closely related are two items)

In [14]:
from sklearn.metrics.pairwise import cosine_similarity

In [15]:
# Flip the matrix so each row is a product described by its per-user strengths.
item_user_matrix = user_item_matrix.transpose()

# Pairwise cosine similarity between product rows, labelled by product on both axes.
item_similarity = pd.DataFrame(
    data=cosine_similarity(item_user_matrix),
    index=item_user_matrix.index,
    columns=item_user_matrix.index,
)