In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np

In [2]:
# Load the dataset
file_path = "cleaned_events.csv"  # Update with your file path
df = pd.read_csv(file_path)

In [3]:
# Ensure 'event_time' is in datetime format
df['event_time'] = pd.to_datetime(df['event_time'])

In [4]:
# ------------------------------------------------------------------------
# Dataset Preprocessing
# ------------------------------------------------------------------------
# Strengths:
# - Missing values in category_code, brand, and user_session were handled appropriately by replacing them with "Unknown".
# - Duplicates were identified and removed.
# - event_time was correctly converted to a datetime format for further analysis.


In [5]:
# Improvements:
# 1. Data Validation:
# Check if price contains invalid or extreme outliers, e.g., very high or negative values.
df_invalid_prices = df[df['price'] < 0]  # Identify negative prices
if not df_invalid_prices.empty:
    print("Warning: Found negative prices in the dataset!")
    print(df_invalid_prices)

In [6]:
# 2. Column Standardization:
# Ensure all categorical columns are consistently lowercased to avoid mismatches.
df['category_code'] = df['category_code'].str.lower()
df['brand'] = df['brand'].str.lower()

In [7]:
# 3. Enhance Documentation:
# Comments added to explain why missing values are handled in a specific way (e.g., "Unknown" is used to retain these rows in analysis).

# ------------------------------------------------------------------------
# Exploratory Data Analysis (EDA)
# ------------------------------------------------------------------------
# Strengths:
# - Distribution analysis for event_type, brand, and price was well done with clear visualizations.
# - Time-based insights (daily and hourly event distribution) add depth to understanding user activity.

In [8]:
# Improvements:
# 1. User-Level Insights:
# Analyze user behavior over time.
user_activity_over_time = df.groupby(['user_id', 'event_date'])['event_type'].count().reset_index()

In [9]:
# 2. Event Transitions:
# Enhance transition analysis with Sankey diagrams to visualize view -> cart -> purchase flows.
# Use pandas.crosstab for a summary.
transition_summary = pd.crosstab(df['event_type'], df['event_type'].shift(-1))

# ------------------------------------------------------------------------
# Feature Engineering
# ------------------------------------------------------------------------


In [10]:
# Group data by user
user_group = df.groupby('user_id')

In [11]:
# 1. Recency: Days since the user's last activity
def calculate_recency(group):
    max_date = group['event_time'].max()
    return (df['event_time'].max() - max_date).days

recency = user_group.apply(calculate_recency)

  recency = user_group.apply(calculate_recency)


In [12]:
# 2. Frequency: Number of events by user
frequency = user_group['event_type'].count()

In [13]:
# 3. Monetary: Total spending by user (sum of price for purchase events)
monetary = user_group.apply(lambda group: group[group['event_type'] == 'purchase']['price'].sum())

  monetary = user_group.apply(lambda group: group[group['event_type'] == 'purchase']['price'].sum())


In [14]:
# 4. Session-based features
# Calculate number of unique sessions per user
sessions_per_user = user_group['user_session'].nunique()

In [15]:
# 5. View-to-Cart Ratio
view_to_cart_ratio = user_group.apply(
    lambda group: group[group['event_type'] == 'cart']['event_type'].count() / max(group[group['event_type'] == 'view']['event_type'].count(), 1)
)

  view_to_cart_ratio = user_group.apply(


In [16]:
# 6. Cart-to-Purchase Ratio
cart_to_purchase_ratio = user_group.apply(
    lambda group: group[group['event_type'] == 'purchase']['event_type'].count() / max(group[group['event_type'] == 'cart']['event_type'].count(), 1)
)

  cart_to_purchase_ratio = user_group.apply(


In [17]:
# 7. Average Session Duration
average_session_duration = user_group.apply(
    lambda group: (group['event_time'].max() - group['event_time'].min()).total_seconds() / max(group['user_session'].nunique(), 1)
)

  average_session_duration = user_group.apply(


In [18]:
# 8. Favorite Brand and Category
favorite_brand = user_group['brand'].agg(lambda x: x.value_counts().idxmax() if not x.isnull().all() else "Unknown")
favorite_category = user_group['category_code'].agg(lambda x: x.value_counts().idxmax() if not x.isnull().all() else "Unknown")

In [19]:
# Combine all features into a single DataFrame
features = pd.DataFrame({
    'user_id': frequency.index,
    'recency': recency.values,
    'frequency': frequency.values,
    'monetary': monetary.values,
    'sessions_per_user': sessions_per_user.values,
    'view_to_cart_ratio': view_to_cart_ratio.values,
    'cart_to_purchase_ratio': cart_to_purchase_ratio.values,
    'average_session_duration': average_session_duration.values,
    'favorite_brand': favorite_brand.values,
    'favorite_category': favorite_category.values
})

In [20]:
# Replace infinite values with NaN (caused by divisions)
features.replace([np.inf, -np.inf], np.nan, inplace=True)

# Fill NaN values with 0 (or other appropriate defaults based on your analysis)
features.fillna(0, inplace=True)

In [21]:
# 3. Deep Dive into Top Users:
# Highlight users with high activity or spending to identify patterns.

# ------------------------------------------------------------------------
# Feature Engineering
# ------------------------------------------------------------------------
# Strengths:
# - Includes essential features such as recency, frequency, and monetary.
# - Behavioral metrics like view_to_cart_ratio and cart_to_purchase_ratio are thoughtful additions.

In [22]:
# Improvements:
# 1. Seasonality Features:
# Extract features like month and weekday to capture seasonal trends.
df['event_month'] = df['event_time'].dt.month
df['event_weekday'] = df['event_time'].dt.weekday

In [23]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
features[['recency', 'frequency', 'monetary']] = scaler.fit_transform(features[['recency', 'frequency', 'monetary']])
print("Normalization complete. Here's a preview of normalized features:")
print(features[['recency', 'frequency', 'monetary']].head())

Normalization complete. Here's a preview of normalized features:
    recency  frequency  monetary
0  0.777070   0.000000       0.0
1  0.923567   0.000000       0.0
2  0.496815   0.021016       0.0
3  0.949045   0.000000       0.0
4  0.063694   0.061296       0.0


In [24]:
# 3. Interactions:
# Add interaction terms between features (e.g., sessions_per_user * view_to_cart_ratio) to capture complex patterns.
features['interaction'] = features['sessions_per_user'] * features['view_to_cart_ratio']

In [25]:
# Display the first few rows of the feature DataFrame
print("Feature Engineering Complete. Here's a preview of the features:")
print(features.head())

Feature Engineering Complete. Here's a preview of the features:
               user_id   recency  frequency  monetary  sessions_per_user  \
0  1515915625353226922  0.777070   0.000000       0.0                  1   
1  1515915625353230067  0.923567   0.000000       0.0                  1   
2  1515915625353230683  0.496815   0.021016       0.0                  4   
3  1515915625353230922  0.949045   0.000000       0.0                  1   
4  1515915625353234047  0.063694   0.061296       0.0                  1   

   view_to_cart_ratio  cart_to_purchase_ratio  average_session_duration  \
0                 0.0                     0.0                       0.0   
1                 0.0                     0.0                       0.0   
2                 0.0                     0.0                  714304.5   
3                 0.0                     0.0                       0.0   
4                 0.0                     0.0                12244190.0   

  favorite_brand            

In [26]:
# Save the features to a new CSV file
features.to_csv("user_features.csv", index=False)
print("Features saved to 'user_features.csv'.")

Features saved to 'user_features.csv'.
