In [None]:
import os
import pandas as pd
import re
import math
import copy
import sys
# Add the root directory /workspaces/llm_etl to sys.path
sys.path.append(os.path.abspath(os.path.join('..', '..')))
# Now import your module
from spider2_utils import load_csv_database

-setup-

In [None]:
import pandas as pd

# Load the "bank_sales_trading" CSV database through the project helper.
# NOTE(review): rows_limit=-1 presumably means "no row cap" — confirm in spider2_utils.
_database = load_csv_database("bank_sales_trading", rows_limit=-1)

# Bind the two tables this analysis reads to their own module-level names.
shopping_cart_page_hierarchy, shopping_cart_events = (
    _database[table_name]
    for table_name in ("shopping_cart_page_hierarchy", "shopping_cart_events")
)

# Question
Can you provide a breakdown of how many times each product was viewed, how many times they were added to the shopping cart, and how many times they were left in the cart without being purchased? Also, give me the count of actual purchases for each product. Ensure that products with a page id in (1, 2, 12, 13) are filtered out.

### User Intent 1: Filter `shopping_cart_page_hierarchy` to include only rows where `product_id` is not null.

In [None]:
# Keep only the hierarchy rows whose product_id is present (non-null).
filtered_hierarchy = shopping_cart_page_hierarchy.loc[
    lambda pages: pages['product_id'].notna()
]

### User Intent 2: Merge filtered hierarchy with `shopping_cart_events` on `page_id`.

In [None]:
# Attach every event to its product page; the inner join keeps only events
# whose page_id appears in the filtered hierarchy.
merged_data = pd.merge(
    filtered_hierarchy,
    shopping_cart_events,
    how='inner',
    on='page_id',
)

### User Intent 3: Group by `page_id` and count `event_type == 1` rows as page views and `event_type == 2` rows as add-to-cart events.

In [None]:
# Per-page tallies over the merged product events: rows with event_type 1 are
# counted as page views and rows with event_type 2 as add-to-cart events.
product_viewed = (
    merged_data
    .groupby('page_id')
    .agg(
        n_page_views=('event_type', lambda codes: codes.eq(1).sum()),
        n_added_to_cart=('event_type', lambda codes: codes.eq(2).sum()),
    )
    .reset_index()
)

### User Intent 4: Find the visits that contain a purchase (event_type == 3), keep their add-to-cart events (event_type == 2) while excluding page_ids 1, 2, 12 and 13, then count those events per `page_id`.

In [None]:
# Distinct visit_ids that contain at least one purchase event (event_type == 3).
purchases = (
    shopping_cart_events
    .loc[shopping_cart_events['event_type'] == 3, ['visit_id']]
    .drop_duplicates()
)

# Merged rows that belong to a purchasing visit and are not on pages 1, 2, 12 or 13.
filtered_merged = merged_data.loc[
    lambda rows: rows['visit_id'].isin(purchases['visit_id'])
    & ~rows['page_id'].isin([1, 2, 12, 13])
]

# Count the add-to-cart events (event_type == 2) of those visits per page.
product_purchased = (
    filtered_merged
    .loc[filtered_merged['event_type'] == 2]
    .groupby('page_id')
    .agg(purchased_from_cart=pd.NamedAgg(column='event_type', aggfunc='count'))
    .reset_index()
)

### User Intent 5: Count abandoned cart additions per `page_id`: add-to-cart events (event_type == 2) from visits that contain no purchase event (event_type == 3), excluding page_ids 1, 2, 12 and 13.

In [None]:
# NOTE: despite its name, `non_purchases` holds the distinct visit_ids that DO
# contain a purchase event (event_type == 3); the negated membership test below
# is what selects the visits without a purchase.
non_purchases = (
    shopping_cart_events
    .loc[shopping_cart_events['event_type'] == 3, ['visit_id']]
    .drop_duplicates()
)

# Keep merged rows that are neither part of a purchasing visit nor on pages
# 1, 2, 12 or 13 (i.e. NOT (purchasing visit OR excluded page)).
filtered_merged_abandoned = merged_data.loc[
    lambda rows: ~(
        rows['visit_id'].isin(non_purchases['visit_id'])
        | rows['page_id'].isin([1, 2, 12, 13])
    )
]

# Count the add-to-cart events (event_type == 2) left in those visits per page.
added_without_purchase = filtered_merged_abandoned['event_type'] == 2
product_abandoned = (
    filtered_merged_abandoned[added_without_purchase]
    .groupby('page_id')
    .agg(abandoned_in_cart=pd.NamedAgg(column='event_type', aggfunc='count'))
    .reset_index()
)

### User Intent 6: Join all result tables together with the page name.

In [None]:
# Assemble the per-product report: one row per page with its name and the four
# counters computed in the earlier cells.
#
# Why not plain inner joins for every table: `product_purchased` and
# `product_abandoned` only contain pages that have at least one matching
# add-to-cart event, so inner-joining them silently drops any product whose
# count is zero. Those two tables are left-joined instead and a missing count
# is reported as 0. Because the inner joins were also what kept pages
# 1, 2, 12 and 13 out of the result, that exclusion is now applied explicitly.
final = (
    shopping_cart_page_hierarchy
    .loc[
        ~shopping_cart_page_hierarchy['page_id'].isin([1, 2, 12, 13]),
        ['page_id', 'page_name'],
    ]
    .drop_duplicates()
    # Inner join: restricts the report to the pages present in product_viewed.
    .merge(product_viewed, on='page_id', how='inner')
    # Left joins: keep products that have no purchased / abandoned cart additions.
    .merge(product_purchased, on='page_id', how='left')
    .merge(product_abandoned, on='page_id', how='left')
    # A page missing from a count table has a count of zero; the cast restores
    # the integer dtype that the NaNs introduced by a left join would turn into float.
    .fillna({'purchased_from_cart': 0, 'abandoned_in_cart': 0})
    .astype({'purchased_from_cart': 'int64', 'abandoned_in_cart': 'int64'})
    .rename(columns={
        'n_page_views': 'number of product being viewed',
        'n_added_to_cart': 'number added to the cart',
        'abandoned_in_cart': 'without being purchased in cart',
        'purchased_from_cart': 'count of actual purchases'
    })
)
final