In [None]:
#Code to get the load_csv_database function.
#We pass this function as context so there's no need to include it when we parse the notebook
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..", "..")))
from spider2_utils import load_csv_database


-setup-

In [None]:
import pandas as pd

# Load the "E_commerce" dataset through the project helper.
# NOTE(review): rows_limit=-1 presumably means "load every row" -- confirm
# against spider2_utils.load_csv_database.
_database = load_csv_database("E_commerce", rows_limit=-1)

# Expose each table under its own top-level name for the cells below.
# (The original cell assigned `products` twice; the redundant copy is removed.)
customers = _database["customers"]
orders = _database["orders"]
order_items = _database["order_items"]
products = _database["products"]
geolocation = _database["geolocation"]
leads_closed = _database["leads_closed"]
leads_qualified = _database["leads_qualified"]
order_payments = _database["order_payments"]
order_reviews = _database["order_reviews"]
product_category_name_translation = _database["product_category_name_translation"]
sellers = _database["sellers"]

# Question
According to the RFM definition document, calculate the average sales per order for each customer within distinct RFM segments, considering only 'delivered' orders. Use the customer unique identifier. Clearly define how to calculate Recency based on the latest purchase timestamp and specify the criteria for classifying RFM segments. The average sales should be computed as the total spend divided by the total number of orders. Please analyze and report the differences in average sales across the RFM segments

# Step 1: Calculate Recency Score
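Group customers into five roughly equal-sized groups (quintiles) based on the timestamp of their most recent delivered order. A recency score of 1 marks the customers who purchased most recently, and a score of 5 marks those whose last purchase is the oldest.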

In [None]:
import pandas as pd
import numpy as np

# Recency: one row per customer_unique_id holding the timestamp of the
# customer's latest delivered order. `orders` and `customers` are expected to
# be loaded already by an earlier cell.
recency_score = (
    orders.loc[orders['order_status'] == 'delivered']
    .merge(customers, on='customer_id')
    .groupby('customer_unique_id')
    .agg(last_purchase=('order_purchase_timestamp', 'max'))
    .reset_index()
)

# Descending rank gives rank 1 to the latest purchase; method='first' breaks
# ties by row order so every rank is unique and qcut can always cut 5 bins.
# qcut labels run 0-4, so adding 1 yields scores 1 (most recent) to 5.
recency_score['recency'] = 1 + pd.qcut(
    recency_score['last_purchase'].rank(ascending=False, method='first'),
    q=5,
    labels=False,
)

# Step 2: Calculate Frequency Score
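Group customers into quintiles based on the number of delivered orders they have placed. A frequency score of 1 marks the customers with the most orders, and a score of 5 marks those with the fewest; customers with the same order count are split between adjacent groups by row order.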

In [None]:
# Frequency: number of delivered orders per customer_unique_id.
frequency_score = (
    orders.loc[orders['order_status'] == 'delivered']
    .merge(customers, on='customer_id')
    .groupby('customer_unique_id')
    .agg(total_orders=('order_id', 'count'))
    .reset_index()
)

# Descending rank gives rank 1 to the highest order count; method='first'
# breaks the many ties by row order so qcut can always form 5 bins.
# qcut labels run 0-4, so adding 1 yields scores 1 (most orders) to 5.
frequency_score['frequency'] = 1 + pd.qcut(
    frequency_score['total_orders'].rank(ascending=False, method='first'),
    q=5,
    labels=False,
)

# Step 3: Calculate Monetary Score
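Group customers into quintiles based on the total amount they have spent, measured as the sum of item prices across their delivered orders. A monetary score of 1 marks the highest spenders, and a score of 5 marks the lowest.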

In [None]:
# Monetary: total item price across each customer's delivered orders.
# Joining order_items yields one row per purchased item before summing.
monetary_score = (
    orders.loc[orders['order_status'] == 'delivered']
    .merge(order_items, on='order_id')
    .merge(customers, on='customer_id')
    .groupby('customer_unique_id')
    .agg(total_spent=('price', 'sum'))
    .reset_index()
)

# Descending rank gives rank 1 to the largest spend; method='first' breaks
# ties by row order so qcut can always form 5 bins.
# qcut labels run 0-4, so adding 1 yields scores 1 (highest spend) to 5.
monetary_score['monetary'] = 1 + pd.qcut(
    monetary_score['total_spent'].rank(ascending=False, method='first'),
    q=5,
    labels=False,
)

# Step 4: Assign RFM Buckets
Classify each customer into a named RFM bucket based on their recency score and the sum of their frequency and monetary scores.

In [None]:
# Combine the three per-customer score tables into one frame. merge() defaults
# to an inner join, so only customers present in all three tables are kept.
rfm = (
    recency_score
    .merge(frequency_score, on='customer_unique_id')
    .merge(monetary_score, on='customer_unique_id')
)

def assign_rfm_bucket(row):
    """Map one customer's R, F and M scores to a named RFM segment.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row or a dict)
        Must provide the keys 'recency', 'frequency' and 'monetary'. In this
        notebook each score is a quintile from 1 to 5, where 1 is the best
        group (most recent / most orders / highest spend).

    Returns
    -------
    str or None
        The segment name, chosen from the recency score and the combined
        frequency + monetary score. None is returned when no rule matches,
        which cannot happen for scores in the 1-5 range.
    """
    recency = row['recency']
    # Compute the combined F+M score once instead of once per rule.
    fm_score = row['frequency'] + row['monetary']

    if recency == 1:
        if fm_score in (1, 2, 3, 4):
            return 'Champions'
        if fm_score in (5, 6):
            return 'Potential Loyalists'
        if fm_score in (7, 8):
            return 'Recent Users'
        if fm_score in (9, 10):
            return 'Price Sensitive'
    elif recency == 2:
        if fm_score in (1, 2, 3, 4):
            return 'Loyal Customers'
        if fm_score in (5, 6, 7, 8):
            return 'Potential Loyalists'
        if fm_score in (9, 10):
            return 'Promising'
    elif recency == 3:
        if fm_score in (1, 2, 3, 4):
            return 'Loyal Customers'
        if fm_score in (5, 6):
            return 'Needs Attention'
        if fm_score in (7, 8, 9, 10):
            return 'About to Sleep'
    elif recency in (4, 5):
        if fm_score in (1, 2):
            return "Can't Lose Them"
        if fm_score in (3, 4, 5, 6):
            return 'Hibernating'
        if fm_score in (7, 8, 9, 10):
            return 'Lost'

    # No rule matched (scores outside the expected 1-5 range).
    return None

# Label every customer row with its segment; axis='columns' hands each row to
# assign_rfm_bucket one at a time.
rfm['RFM_Bucket'] = rfm.apply(assign_rfm_bucket, axis='columns')

# Step 5: Calculate Average Sales per Customer
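For each customer, compute the average sales per order (total spend divided by total number of delivered orders), then average that value across the customers in each RFM bucket.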

In [None]:
# Average order value per customer (total spend / number of delivered orders),
# then the mean of that value across the customers in each RFM bucket.
# The built-in grouped mean replaces groupby().apply(lambda ...): it is
# vectorized and avoids the pandas 2.2+ deprecation warning raised when
# apply() operates on the grouping column.
avg_sales = (
    rfm.assign(avg_sales_per_customer=rfm['total_spent'] / rfm['total_orders'])
    .groupby('RFM_Bucket')['avg_sales_per_customer']
    .mean()
    .reset_index()
)

In [None]:
# avg_sales

In [None]:
# spider 2 gold exec results
# RFM_Bucket              avg_sales_per_customer
# About to Sleep	        57.68495912447257
# Can't Lose Them	        350.8868165989553
# Champions	            250.8568210435466
# Hibernating	            182.8458159996057
# Lost	                57.39320983627944
# Loyal Customers	        237.88125736097265
# Needs Attention	        145.90492498719917
# Potentital Loyalists	130.37477273563726
# Price Sensitive	        34.90935135135135
# Promising	            35.08535857461025
# Recent Users	        67.64212875853163