## Import Data 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
def load_data(file_path):
    """Read the CSV at ``file_path`` and return its contents as a DataFrame."""
    frame = pd.read_csv(file_path)
    return frame

# Load the three raw inputs; paths are relative to the notebook's working directory.
impressions = load_data('impressions.csv')
products = load_data('product_catalog.csv')
transactions = load_data('transactions.csv')

In [None]:
# Preview the first rows of the impressions table.
impressions.head()

In [None]:
# Preview the first rows of the product catalog.
products.head()

In [None]:
# Preview the first rows of the transactions table.
transactions.head()

In [None]:
# Lower-case every column label so later cells can rely on a single spelling.
for frame in (impressions, products, transactions):
    frame.columns = frame.columns.str.lower()

# Report the number of missing values per column, one table after another.
for frame in (impressions, products, transactions):
    print(frame.isnull().sum())

In [None]:
# Observation from the null counts: transactions.revenue has 10 missing values.

# Attach the catalog attributes to every transaction line. A left join keeps
# transactions whose product_id is absent from the catalog.
txns_to_products = pd.merge(transactions, products, how="left", on="product_id")

txns_to_products.head()


In [None]:
# lets first create synthetic data of product margin since profitability derives from margins not just revenue

txns_to_products['merch_subcategory'].unique()

# Synthetic margin ranges, as (low %, high %), for each merchandising subcategory.
margin_rules = {
    "Fizzy Drinks": (10, 20),
    "Nuts & Dried Fruit": (20, 35),
    "Biscuits & Cookies": (25, 40), # Brand-driven, impulse buys
    "Chocolate": (35, 55),
    "Crisps": (25, 40), # Brand-driven, impulse buys
    "Healthier Crisps": (20, 35),
    "Sweets": (35, 55),
    "Still Water": (5, 12), # Commodity, low margin
    "Crackers & Crispbreads": (20, 30),
    "Popcorn & Pretzels": (25, 40),
    "Protein & Snack Bars": (30, 50), # Niche, higher margin
    "Energy & Sports Drinks": (25, 40),
    "Squashes & Cordials": (15, 25), # Low-cost, low-margin
    "Juice & Smoothies": (20, 35),
    "Mixers": (15, 25),
    "Fruit Drinks": (15, 25),
    "Gum & Mints": (30, 50), # Impulse buys, higher margin
    "Iced Tea & Coffee": (20, 35),
    "Sparkling Water": (8, 15), # Commodity, low margin
    "Wellness & Protein Drinks": (30, 50),
    "Meat Snacking": (25, 40) # Niche, higher margin
}

def assign_margin(subcategory, rng=None):
    """Draw a synthetic margin percentage for ``subcategory``.

    Parameters
    ----------
    subcategory : hashable
        Key looked up in ``margin_rules``; unknown keys fall back to (10, 25).
    rng : object with a ``uniform(low, high)`` method, optional
        Random source, e.g. ``np.random.default_rng(seed)``. When omitted the
        global NumPy random state is used, exactly as before.

    Returns
    -------
    float
        A value drawn uniformly from the subcategory's (low, high) range.
    """
    low, high = margin_rules.get(subcategory, (10, 25))  # default if unknown
    if rng is None:
        return np.random.uniform(low, high)
    return rng.uniform(low, high)

# Draw ONE synthetic margin per product and broadcast it to that product's
# transaction lines. Drawing per line gave the same product a different
# margin on every transaction.
np.random.seed(42)  # fixed seed so Restart & Run All reproduces the same margins
product_margin = (
    txns_to_products.drop_duplicates("product_id")
    .set_index("product_id")["merch_subcategory"]
    .apply(assign_margin)
)
txns_to_products["margin"] = txns_to_products["product_id"].map(product_margin)
txns_to_products.head()


In [None]:
# List the columns available at this point.
txns_to_products.columns

In [None]:
# Fill missing revenue with quantity * average product price * (1 + margin%).

# Pre-discount price per unit. The parentheses matter: without them only the
# discount was divided by quantity and the result was not a unit price.
txns_to_products['unit_price'] = (
    (txns_to_products['revenue'] + txns_to_products['total_discount'])
    / txns_to_products['quantity']
)

# Mean unit price per product. Despite its name, this frame is keyed by product_id.
subcat_price = (
    txns_to_products.groupby('product_id')['unit_price']
    .mean()
    .reset_index()
    .rename(columns={'unit_price': 'avg_product_price'})
)

# Drop any earlier copy of the column so re-running the cell does not create
# avg_product_price_x / avg_product_price_y.
txns_to_products = txns_to_products.drop(columns='avg_product_price', errors='ignore').merge(
    subcat_price,
    on='product_id',
    how='left'
)

# Vectorised equivalent of the row-wise apply: only missing revenue is replaced.
# NOTE(review): avg_product_price is a pre-discount price and is scaled by
# (1 + margin%) on top, so imputed revenue may be overstated - confirm the intended formula.
imputed_revenue = (
    txns_to_products['quantity']
    * txns_to_products['avg_product_price']
    * (1 + txns_to_products['margin'] / 100)
)
txns_to_products['revenue'] = txns_to_products['revenue'].fillna(imputed_revenue)

txns_to_products.head(50)

In [None]:
# Count revenue values that are still missing after the imputation above.
txns_to_products.revenue.isnull().sum()

In [None]:
# Missing-value count for every column.
txns_to_products.isnull().sum()

In [None]:
# Column inventory before aggregating.
txns_to_products.columns

In [None]:
# Collapse transaction lines to one row per (user, product), then attach the
# impression counts for that user-product pair.
# NOTE(review): "order_id" is aggregated with nunique, so in the resulting frame
# it holds the NUMBER of distinct orders, not an order identifier.
user_product_keys = ['user_id', 'product_id']

descriptive_cols = [
    'created_at_local', 'location_id', 'is_newbie_order', 'is_fam',
    'is_fam_exclusive_pricing_applied', 'is_on_promo', 'product_name',
    'merch_category', 'merch_subcategory', 'brand',
]
aggregations = {col: 'first' for col in descriptive_cols}
aggregations.update({
    "quantity": "sum",
    "revenue": "sum",
    "total_discount": "sum",
    "order_id": "nunique",
    "margin": "mean",
    "unit_price": "mean",
    "avg_product_price": "mean",
})

txns_imprs_products = (
    txns_to_products.groupby(user_product_keys).agg(aggregations).reset_index()
)

txns_imprs_products = txns_imprs_products.merge(impressions, how="left", on=user_product_keys)

# User-product pairs without an impressions row get zero impressions.
txns_imprs_products = txns_imprs_products.fillna({'n_impressions': 0, 'n_impressions_with_atc': 0})
txns_imprs_products


In [None]:
# Where a row has no unit_price, fall back to the product's average price.
has_unit_price = txns_imprs_products["unit_price"].notna()
txns_imprs_products["unit_price"] = txns_imprs_products["unit_price"].where(
    has_unit_price, txns_imprs_products["avg_product_price"]
)
txns_imprs_products["avg_product_price"].isnull().sum()

### User and Product Analysis

#### Single Product Volume

In [None]:
# Total quantity per product name, keeping the 50 largest.
most_bought_volume = txns_imprs_products.groupby('product_name')['quantity'].sum().nlargest(50)
most_bought_volume


In [None]:
# Rank products by total quantity and chart the 20 best sellers.
quantity_by_product = txns_imprs_products.groupby('product_name')['quantity'].sum()
top_products = quantity_by_product.sort_values(ascending=False).head(20)

fig, ax = plt.subplots(figsize=(12,6))
sns.barplot(x=top_products.values, y=top_products.index, ax=ax)
ax.set_title("Top 20 Products by Quantity Sold")
ax.set_xlabel("Total Quantity")
ax.set_ylabel("Product Name")
plt.show()


### Single Product by Revenue

In [None]:
# Rank products by total revenue and chart the top 20.
revenue_by_product = txns_imprs_products.groupby('product_name')['revenue'].sum()
top_products = revenue_by_product.sort_values(ascending=False).head(20)

fig, ax = plt.subplots(figsize=(12,6))
sns.barplot(x=top_products.values, y=top_products.index, ax=ax)
ax.set_title("Top 20 Products by Revenue")
ax.set_xlabel("Total Revenue")
ax.set_ylabel("Product Name")
plt.show()

### Single product by total net margin

In [None]:
# "margin" holds a percentage, so summing it directly does not give an
# absolute margin. Convert each row to currency (revenue * margin / 100)
# before aggregating.
top_products_margin = (
    txns_imprs_products
    .assign(margin_value=lambda d: d["revenue"] * d["margin"] / 100)
    .groupby("product_name")
    .agg(total_margin=("margin_value", "sum"),
         total_revenue=("revenue", "sum"),
         total_quantity=("quantity", "sum"))
    .reset_index()
    .sort_values("total_margin", ascending=False)
)

print("🔝 Top 20 Products by Absolute Margin:")
display(top_products_margin.head(20))


In [None]:
# Keep the 20 highest-margin products and draw them as a horizontal bar chart.
top_products_margin = top_products_margin.head(20)
palette = sns.color_palette("rocket", n_colors=top_products_margin.shape[0])

fig, ax = plt.subplots(figsize=(12,8))
sns.barplot(
    data=top_products_margin,
    x="total_margin",
    y="product_name",
    palette=palette,
    ax=ax
)
ax.set_title("Top 20 Products by Absolute Margin", fontsize=16)
ax.set_xlabel("Total Margin ($)", fontsize=12)
ax.set_ylabel("Product", fontsize=12)
fig.tight_layout()
plt.show()

#### Product pair co-occurrence – pairs most often sold together

In [None]:
from itertools import combinations
from collections import Counter

# Baskets are built from the transaction-level frame. In txns_imprs_products
# "order_id" was aggregated with nunique and is an order COUNT, so grouping by
# it there would mix unrelated users' products into one "basket".
purchased = txns_to_products[txns_to_products['revenue'] > 0].dropna(subset=['product_name'])

# Distinct product names per order, so a product that appears on several lines
# of one order cannot be paired with itself.
order_products = purchased.groupby('order_id')['product_name'].unique()

pair_counter = Counter()

# The loop variable is deliberately not called `products`: that name holds the
# catalog DataFrame loaded at the top of the notebook.
for basket in order_products:
    # Only consider baskets with 2+ products
    if len(basket) > 1:
        pair_counter.update(combinations(sorted(basket), 2))

pair_counts = pd.DataFrame(
    [(p1, p2, c) for (p1, p2), c in pair_counter.items()],
    columns=['product_1','product_2','count']
)

pair_counts['product_pair'] = pair_counts['product_1'] + " & " + pair_counts['product_2']

# Top 10
top_pairs = pair_counts.sort_values(by='count', ascending=False).head(10)
top_pairs


In [None]:
# Horizontal bar chart of the most frequent product pairs, one colour per bar.
palette = sns.color_palette("tab10", len(top_pairs))

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=top_pairs, x='count', y='product_pair', palette=palette, ax=ax)
ax.set_title("Top 10 Product Pairs Bought Together")
ax.set_xlabel("Number of Orders Buying Pair")
ax.set_ylabel("Product Pair")
plt.show()

In [None]:
# Recency, Frequency, Monetary (RFM) analysis
# Recency: How recently a customer made a purchase
# Frequency: How often they make a purchase
# Monetary: How much money they spend

# Kept on the user-product frame because later cells select the 'date' column.
txns_imprs_products['date'] = pd.to_datetime(txns_imprs_products['created_at_local'], dayfirst=True)

# RFM is computed from transaction lines. The user-product frame keeps only the
# FIRST created_at_local per pair and has no order identifier, so it cannot give
# the last purchase date or a count of orders.
txn_dates = pd.to_datetime(txns_to_products['created_at_local'], dayfirst=True)
snapshot_date = txn_dates.max() + pd.Timedelta(days=1)
rfm = (
    txns_to_products.assign(date=txn_dates)
    .groupby('user_id')
    .agg(
        recency=('date', lambda x: (snapshot_date - x.max()).days),
        frequency=('order_id', 'nunique'),  # distinct orders, not product rows
        monetary=('revenue', 'sum'),
    )
    .reset_index()
)

# User-product matrix
user_product = txns_imprs_products.pivot_table(index='user_id', columns='product_name', values='revenue', aggfunc='sum', fill_value=0)

# Lift calculation for product pairs
def compute_lift(df):
    """Compute the lift of every unordered pair of columns in ``df``.

    A row counts as a buyer of a column when its value is greater than zero.
    For columns ``a`` and ``b``, lift is ``P(a and b) / (P(a) * P(b))`` with
    probabilities taken over the rows of ``df``. Pairs where either column has
    no buyers get a lift of 0.0.

    The earlier implementation added 1e-6 to the denominator, which noticeably
    shrinks the lift of rarely bought products, and it rescanned the frame for
    every pair. Here the co-purchase counts come from one matrix product.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per user, one numeric column per product.

    Returns
    -------
    dict
        ``{(col_i, col_j): lift}`` for every ``i < j`` in column order.
    """
    n_users = df.shape[0]
    product_names = list(df.columns)

    bought = (df > 0).to_numpy(dtype=float)   # rows x products indicator matrix
    buyers = bought.sum(axis=0)               # buyers per product
    together = bought.T @ bought              # co-buyers per product pair
    # Co-buyers expected if the two purchases were independent.
    expected = np.outer(buyers, buyers) / n_users if n_users else np.zeros_like(together)

    lifts = {}
    for i, p1 in enumerate(product_names):
        for j in range(i + 1, len(product_names)):
            denominator = expected[i, j]
            lifts[(p1, product_names[j])] = (
                float(together[i, j] / denominator) if denominator > 0 else 0.0
            )
    return lifts

# Score every product pair in the user-product matrix.
lift_scores = compute_lift(user_product)

In [None]:
# Histograms of the per-user RFM metrics (one figure per metric).
rfm_plots = [
    ('recency', "Distribution of Recency (days since last purchase)", "Days"),
    ('frequency', "Distribution of Purchase Frequency", "Number of Orders"),
    ('monetary', "Distribution of Monetary Value", "Total Revenue per User"),
]

for column, title, xlabel in rfm_plots:
    plt.figure(figsize=(8,4))
    sns.histplot(rfm[column], bins=50, kde=True)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel("Number of Users")
    plt.show()

In [None]:
lift_df = pd.DataFrame(
    [(p1, p2, lift) for (p1, p2), lift in lift_scores.items()],
    columns=['product_1','product_2','lift']
)

# Filter top 20 products by revenue. The previous code used value_counts(),
# which ranks by number of rows rather than by revenue.
# The variable holds product NAMES; its name is kept for later cells.
top_product_ids = (
    txns_imprs_products.groupby('product_name')['revenue'].sum().nlargest(20).index
)
lift_top = lift_df[(lift_df['product_1'].isin(top_product_ids)) & (lift_df['product_2'].isin(top_product_ids))]

# Pivot table. Each pair appears once, so only one triangle of the matrix is filled.
lift_matrix = lift_top.pivot(index='product_1', columns='product_2', values='lift')

plt.figure(figsize=(12,10))
sns.heatmap(lift_matrix, annot=True, fmt=".2f", cmap='coolwarm')
plt.title("Lift Between Top 20 Products")
plt.show()

### Product pairs that yield the highest margin when sold together at a discount

In [None]:

from collections import defaultdict
from itertools import combinations

# Baskets come from the transaction-level frame: in txns_imprs_products
# "order_id" is an order count (aggregated with nunique), not an identifier.
order_lines = txns_to_products.loc[
    txns_to_products['revenue'] > 0,
    ['order_id', 'product_name', 'revenue', 'margin', 'total_discount'],
].dropna(subset=['product_name'])

# One row per (order, product). "margin" is converted from a percentage to
# currency (revenue * margin / 100) so that summing it gives a money amount.
purchased_small = (
    order_lines
    .assign(margin=order_lines['revenue'] * order_lines['margin'] / 100)
    .groupby(['order_id', 'product_name'], as_index=False)[['margin', 'total_discount']]
    .sum()
)

# Per order: list of (product, margin, discount) tuples
order_products = [
    list(zip(basket['product_name'], basket['margin'], basket['total_discount']))
    for _, basket in purchased_small.groupby('order_id')
]

pair_margin_totals = defaultdict(float)
pair_discount_totals = defaultdict(float)
pair_counts = defaultdict(int)

# The loop variable is not called `products`, which holds the catalog DataFrame.
for basket_items in order_products:
    if len(basket_items) > 1:
        for (p1, m1, d1), (p2, m2, d2) in combinations(sorted(basket_items), 2):
            # The section is about pairs sold at a discount: skip co-purchases
            # where neither item carried one.
            if d1 + d2 <= 0:
                continue
            key = (p1, p2)
            pair_margin_totals[key] += m1 + m2
            pair_discount_totals[key] += d1 + d2
            pair_counts[key] += 1

bundle_analysis = pd.DataFrame([
    (p1, p2, pair_margin_totals[(p1,p2)], pair_discount_totals[(p1,p2)], pair_counts[(p1,p2)])
    for p1,p2 in pair_margin_totals.keys()
], columns=['product_1','product_2','total_margin','total_discount','count'])

bundle_analysis['product_pair'] = bundle_analysis['product_1'] + " & " + bundle_analysis['product_2']

# Top 10 by total margin
top_margin_pairs = bundle_analysis.sort_values('total_margin', ascending=False).head(10)
top_margin_pairs

In [None]:
# Horizontal bar chart of the ten pairs with the largest total margin.
palette = sns.color_palette("Set2", n_colors=top_margin_pairs.shape[0])

fig, ax = plt.subplots(figsize=(12,6))
sns.barplot(
    x='total_margin',
    y='product_pair',
    data=top_margin_pairs,
    palette=palette,
    ax=ax
)
ax.set_title("Top 10 Discounted Product Pairs by Total Margin", fontsize=16)
ax.set_xlabel("Total Margin ($)", fontsize=12)
ax.set_ylabel("Product Pair", fontsize=12)
fig.tight_layout()
plt.show()

### Show top products by impression volume

In [None]:

# Sum both impression counters per product once, then rank each separately.
impressions_by_product = txns_imprs_products.groupby("product_name")[
    ["n_impressions", "n_impressions_with_atc"]
].sum()

# Top 10 products by impressions
top_impressions = impressions_by_product["n_impressions"].sort_values(ascending=False).head(10)

# Top 10 products by impressions that led to an add-to-cart
top_impressions_atc = impressions_by_product["n_impressions_with_atc"].sort_values(ascending=False).head(10)
# Function to plot
def plot_top10(series, title, color="skyblue"):
    """Show ``series`` as a horizontal bar chart (values on x, index on y).

    ``title`` becomes the figure title and ``color`` the bar colour.
    """
    fig, ax = plt.subplots(figsize=(10,6))
    sns.barplot(x=series.values, y=series.index, color=color, ax=ax)
    ax.set_title(title)
    ax.set_xlabel("Value")
    ax.set_ylabel("Product Name")
    fig.tight_layout()
    plt.show()


# Chart both rankings with the shared helper.
plot_top10(top_impressions_atc, "Top 10 Products by Impressions with ATC", "salmon")
plot_top10(top_impressions, "Top 10 Products by Impressions", "plum")


### Q1. Create initial features (that will maximise AOV) - Product Level and Marketing Level

In [None]:
# "margin" is a percentage, so divide by 100 before multiplying by the price.
# Without that, margin_per_unit came out 100x too large.
txns_imprs_products["margin_per_unit"] = txns_imprs_products["margin"] / 100 * txns_imprs_products["unit_price"]
txns_imprs_products["discount_pct"] = txns_imprs_products["total_discount"] / (txns_imprs_products["revenue"] + txns_imprs_products["total_discount"] + 1e-6)

# n_impressions was filled with 0 for pairs without impression data. Dividing
# by it directly gave NaN or inf; these two ratios are now NaN whenever there
# were no impressions.
impressions_nonzero = txns_imprs_products["n_impressions"].replace(0, np.nan)
txns_imprs_products["atc_rate"] = txns_imprs_products["n_impressions_with_atc"] / impressions_nonzero
txns_imprs_products["conversion_rate"] = txns_imprs_products["quantity"] / (txns_imprs_products["n_impressions_with_atc"] + 1e-6)
txns_imprs_products["revenue_per_impression"] = txns_imprs_products["revenue"] / impressions_nonzero
# NOTE(review): the 1e-6 guard on the two ratios below yields very large values
# when the denominator is 0 - confirm that is acceptable downstream.
txns_imprs_products["margin_per_impression"] = txns_imprs_products["margin_per_unit"] * txns_imprs_products["quantity"] / (txns_imprs_products["n_impressions"] + 1e-6)
txns_imprs_products

In [None]:
# Persist the feature table; the next cell reloads it from this file.
txns_imprs_products.to_csv("txns_imprs_products.csv", index=False)

In [None]:
import pandas as pd

# Reload the feature table from disk.
# NOTE(review): read_csv is called without parse_dates, so 'date' presumably comes back as text - confirm.
txns_imprs_products = pd.read_csv("txns_imprs_products.csv")
txns_imprs_products.columns

In [None]:
# Score "orders", keep the best ones, and build product pairs within them.
# NOTE(review): txns_imprs_products is one row per (user, product) and its
# 'order_id' column was aggregated with nunique, i.e. it is the number of
# distinct orders. Grouping and self-merging on it therefore groups rows by
# order COUNT rather than by a real order - confirm this is intended.
from itertools import combinations
from sklearn.preprocessing import MinMaxScaler

purchased = txns_imprs_products[txns_imprs_products['revenue'] > 0]

# Step 1: Aggregate metrics per order_id value
order_metrics = purchased.groupby('order_id').agg({
    'margin': 'sum',
    'revenue': 'sum',
    'quantity': 'sum'
}).reset_index()

# Step 2: Normalize metrics (0-1) using MinMaxScaler
scaler = MinMaxScaler()
order_metrics[['margin_norm','revenue_norm','quantity_norm']] = scaler.fit_transform(
    order_metrics[['margin','revenue','quantity']]
)

# Step 3: Compute composite score (equal weighting)
order_metrics['composite_score'] = (
    order_metrics['margin_norm'] + order_metrics['revenue_norm'] + order_metrics['quantity_norm']
)

# Step 4: Keep the top 2000 order_id values by composite score, and at most
# 2000 rows for each of them
top_orders = order_metrics.nlargest(2000, 'composite_score')['order_id']
df_top = purchased[purchased['order_id'].isin(top_orders)]
df_top = df_top.groupby('order_id').head(2000).reset_index(drop=True)

print(df_top.shape)

# Step 5: Keep only needed columns in df_top
cols = ['created_at_local', 'order_id', 'location_id', 'user_id', 'is_newbie_order',
        'is_fam', 'product_id', 'quantity', 'revenue', 'is_fam_exclusive_pricing_applied',
        'is_on_promo', 'total_discount', 'product_name', 'merch_category',
        'merch_subcategory', 'brand', 'margin', 'unit_price', 'avg_product_price',
        'n_impressions', 'n_impressions_with_atc', 'date', 'margin_per_unit',
        'discount_pct', 'atc_rate', 'conversion_rate', 'revenue_per_impression',
        'margin_per_impression']

df_top = df_top[cols]

# Step 6: Self-merge on order_id to create product pairs; every non-key
# column gets a _1 / _2 suffix
df_pairs = df_top.merge(df_top, on='order_id', suffixes=('_1', '_2'))

# Step 7: Drop any columns whose label is duplicated
df_pairs = df_pairs.loc[:, ~df_pairs.columns.duplicated()]

# Step 8: Keep each unordered pair once (name_1 < name_2 also removes self-pairs)
df_pairs = df_pairs[df_pairs['product_name_1'] < df_pairs['product_name_2']]

# Step 9: Compute bundle-level metrics (sums of the two sides, then ratios;
# 1e-6 guards against a zero denominator)
df_pairs['bundle_total_margin'] = df_pairs['margin_1'] + df_pairs['margin_2']
df_pairs['bundle_total_revenue'] = df_pairs['revenue_1'] + df_pairs['revenue_2']
df_pairs['bundle_total_quantity'] = df_pairs['quantity_1'] + df_pairs['quantity_2']
df_pairs['bundle_total_discount'] = df_pairs['total_discount_1'] + df_pairs['total_discount_2']
df_pairs['bundle_total_impressions'] = df_pairs['n_impressions_1'] + df_pairs['n_impressions_2']
df_pairs['bundle_total_add_to_cart'] = df_pairs['n_impressions_with_atc_1'] + df_pairs['n_impressions_with_atc_2']
# Add-to-cart impressions divided by all impressions
df_pairs['bundle_ctr'] = df_pairs['bundle_total_add_to_cart'] / (df_pairs['bundle_total_impressions'] + 1e-6)
df_pairs['product_pair'] = df_pairs['product_name_1'] + " & " + df_pairs['product_name_2']
# Quantity divided by add-to-cart impressions
df_pairs['bundle_conversion_rate'] = df_pairs['bundle_total_quantity'] / (df_pairs['bundle_total_add_to_cart'] + 1e-6)
df_pairs['bundle_margin_per_unit'] = df_pairs['bundle_total_margin'] / (df_pairs['bundle_total_quantity'] + 1e-6)
df_pairs['bundle_revenue_per_impression'] = df_pairs['bundle_total_revenue'] / (df_pairs['bundle_total_impressions'] + 1e-6)
# "Bundle AOV" here is total revenue / total quantity, i.e. revenue per unit
df_pairs['bundle_aov'] = df_pairs['bundle_total_revenue'] / (df_pairs['bundle_total_quantity'] + 1e-6)


df_pairs.head()


In [None]:
# Distinct subcategories on the first side of the pairs.
df_pairs.merch_subcategory_1.unique()

In [None]:
# Example: classify products into broader segments
def classify_segment(row):
    """Map ``row['merch_subcategory']`` to a broad shopper segment.

    Returns 'Health-conscious' or 'Impulsive' for the listed subcategories and
    'Other' for anything else.
    """
    health_conscious = (
        'Still Water', 'Sparkling Water', 'Juice & Smoothies',
        'Nuts & Dried Fruit', 'Squashes & Cordials',
        'Crackers & Crispbreads', 'Iced Tea & Coffee', 'Wellness & Protein Drinks',
        'Fruit Drinks', 'Protein & Snack Bars',
        'Mixers', 'Healthier Crisps',
    )
    impulsive = (
        'Biscuits & Cookies', 'Fizzy Drinks', 'Crisps',
        'Chocolate', 'Meat Snacking',
        'Popcorn & Pretzels', 'Energy & Sports Drinks',
        'Sweets', 'Gum & Mints',
    )

    subcategory = row['merch_subcategory']
    if subcategory in health_conscious:
        return 'Health-conscious'
    if subcategory in impulsive:
        return 'Impulsive'
    return 'Other'

def classify_bundle_segment(row):
    """Return the segment of a product pair.

    Each side is classified with ``classify_segment``. The pair takes the
    highest-priority segment present on either side, in the order
    Health-conscious, Impulsive, Other.
    """
    segments = {
        classify_segment({'merch_subcategory': row[side]})
        for side in ('merch_subcategory_1', 'merch_subcategory_2')
    }
    for preferred in ('Health-conscious', 'Impulsive'):
        if preferred in segments:
            return preferred
    return 'Other'

# Label every pair row with its segment.
df_pairs['bundle_segment'] = df_pairs.apply(classify_bundle_segment, axis=1)

df_pairs.head()

## Q2. Creating a bundle scoring/ranking system to identify high-performing bundles that increase AOV

In [None]:
# Revenue per unit for each row (named product_aov in this notebook).
df_top['product_aov'] = df_top['revenue'] / (df_top['quantity'] + 1e-6)

# Totals, and mean product_aov, per subcategory.
subcat_stats = df_top.groupby('merch_subcategory').agg({
    'revenue': 'sum',
    'margin': 'sum',
    'n_impressions': 'sum',
    'quantity': 'sum',
    'product_aov': 'mean'
}).reset_index()

# Shares of the grand total; product_aov is scaled by its maximum instead.
subcat_stats['revenue_norm'] = subcat_stats['revenue'] / subcat_stats['revenue'].sum()
subcat_stats['margin_norm'] = subcat_stats['margin'] / subcat_stats['margin'].sum()
subcat_stats['impression_norm'] = subcat_stats['n_impressions'] / subcat_stats['n_impressions'].sum()
subcat_stats['product_aov_norm'] = subcat_stats['product_aov'] / subcat_stats['product_aov'].max()

# Attach the subcategory statistics to each side of the pair (_1, then _2).
norm_cols = ['revenue_norm', 'margin_norm', 'impression_norm', 'product_aov_norm']
for side in ('1', '2'):
    side_stats = subcat_stats[['merch_subcategory'] + norm_cols].rename(
        columns={col: f"{col}_{side}" for col in norm_cols}
    )
    df_pairs = df_pairs.merge(
        side_stats,
        left_on=f'merch_subcategory_{side}',
        right_on='merch_subcategory',
        how='left'
    ).drop(columns=['merch_subcategory'])


In [None]:
# Column inventory after attaching the subcategory statistics.
df_pairs.columns

In [None]:
# Min-max scale the bundle totals into 0-1 score inputs. This reuses the
# `scaler` object created in an earlier cell.
bundle_total_cols = ['bundle_total_revenue','bundle_total_quantity','bundle_total_margin', 'bundle_total_impressions']
bundle_norm_cols = ['revenue_norm','quantity_norm','margin_norm', 'impression_norm']
df_pairs[bundle_norm_cols] = scaler.fit_transform(df_pairs[bundle_total_cols])

# bundle_aov gets its own scaler.
aov_scaler = MinMaxScaler()
df_pairs['bundle_aov_norm'] = aov_scaler.fit_transform(df_pairs[['bundle_aov']])


In [None]:
# Give weights
df_pairs['bundle_score'] = (
    0.3 * df_pairs['revenue_norm'] +
    0.3 * df_pairs['margin_norm'] +
    0.2 * df_pairs['quantity_norm'] +
    0.2 * df_pairs['bundle_aov_norm']
)


In [None]:
# Rank all pair rows by score and show the five best.
top_bundles = df_pairs.sort_values('bundle_score', ascending=False)
top_bundles[['product_pair', 'bundle_segment', 'bundle_total_revenue', 'bundle_total_margin', 'bundle_total_quantity', 'bundle_total_impressions', 'bundle_score', 'bundle_aov']].head(5)



In [None]:
# df_pairs.to_csv("product_pair_construction.csv", index=False)

In [None]:
import pandas as pd
from pathlib import Path

# The export in the previous cell is commented out, so on a fresh checkout the
# file does not exist and read_csv would fail. Write it from the in-memory
# frame when it is missing, then reload as before.
# NOTE(review): an existing file is still loaded as-is and may be stale
# relative to the cells above - delete it to force a refresh.
PAIRS_CACHE = Path("product_pair_construction.csv")
if not PAIRS_CACHE.exists():
    df_pairs.to_csv(PAIRS_CACHE, index=False)

df_pairs = pd.read_csv(PAIRS_CACHE)
df_pairs.shape

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Rank the pair rows by composite score and chart the first 40, coloured by segment.
top_bundles = df_pairs.sort_values('bundle_score', ascending=False)
top_bundles_plot = top_bundles.head(40)

fig, ax = plt.subplots(figsize=(10,6))
sns.barplot(
    data=top_bundles_plot,
    x='bundle_score',
    y='product_pair',
    hue='bundle_segment',
    dodge=False,
    ax=ax
)
ax.set_title('Top 40 Bundles Combos by Score')
ax.set_xlabel('Composite Bundle Score')
ax.set_ylabel('Bundle')
plt.show()

In [None]:
# Stacked bars of the normalised revenue, margin and quantity for the plotted bundles.
# NOTE(review): bundle_aov_norm also feeds the score elsewhere in this notebook but is not shown here.
top_bundles_plot[['product_pair','revenue_norm','margin_norm','quantity_norm']].set_index('product_pair').plot(kind='bar', stacked=True, figsize=(12,6))
plt.title('Contribution of Revenue, Margin, Quantity to Bundle Score')
plt.ylabel('Normalized Value')
plt.show()

In [None]:
# Top 20 rows by score within each bundle segment.
top_bundles_per_segment = df_pairs.groupby('bundle_segment').apply(
    lambda x: x.sort_values('bundle_score', ascending=False).head(20)
).reset_index(drop=True)

# One panel per segment; sharex=False gives each panel its own score axis.
g = sns.FacetGrid(top_bundles_per_segment, col='bundle_segment', height=5, sharex=False)
g.map_dataframe(sns.barplot, x='bundle_score', y='product_pair', palette='Set3', dodge=False)
g.set_titles(col_template="{col_name} Segment")
plt.show()


In [None]:
# Chart the 40 pair rows with the highest bundle conversion rate.
top_conv_bundles = df_pairs.sort_values('bundle_conversion_rate', ascending=False).head(40)

plt.figure(figsize=(10,6))
sns.barplot(x='bundle_conversion_rate', y='product_pair', data=top_conv_bundles, palette='coolwarm')
# bundle_conversion_rate is built earlier in the notebook as quantity divided by
# add-to-cart impressions, so the label now states that ratio.
plt.xlabel('Bundle Conversion Rate (Quantity / Add-to-Cart)')
plt.ylabel('Bundle')
# Title matches the 40 rows selected above (it previously said "Top 20").
plt.title('Top 40 Bundles by Conversion Rate')
plt.show()


## Find bundles that perform well on revenue and margin but are not currently discounted, and put them forward for promotion

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Display name for each pair
df_pairs['bundle_name'] = (
    df_pairs['product_name_1'] + " + " + df_pairs['product_name_2']
)

# Keep pairs where neither product is flagged as on promo
neither_on_promo = (df_pairs['is_on_promo_1'] == False) & (df_pairs['is_on_promo_2'] == False)
df_no_promo = df_pairs[neither_on_promo].copy()

# Min-max scale the totals within the no-promo subset
scaler = MinMaxScaler()
total_cols = ['bundle_total_revenue','bundle_total_margin', 'bundle_total_quantity']
scaled_cols = ['rev_scaled','margin_scaled', 'quantity_scaled']
df_no_promo[scaled_cols] = scaler.fit_transform(df_no_promo[total_cols])

aov_scaler = MinMaxScaler()
df_no_promo['bundle_aov_scaled'] = aov_scaler.fit_transform(df_no_promo[['bundle_aov']])

# Promotion potential: 30% revenue, 30% margin, 20% quantity, 20% AOV
promo_weights = {
    'rev_scaled': 0.3,
    'margin_scaled': 0.3,
    'quantity_scaled': 0.2,
    'bundle_aov_scaled': 0.2,
}
df_no_promo['promo_potential'] = sum(
    weight * df_no_promo[column] for column, weight in promo_weights.items()
)

# Rank bundles by promo potential
top_promo_candidates = df_no_promo.sort_values('promo_potential', ascending=False)
summary_cols = [
    'bundle_name',
    'bundle_total_revenue',
    'bundle_total_margin',
    'bundle_conversion_rate',
    'bundle_segment',
    'bundle_aov',
    'promo_potential',
]
top_promo_candidates[summary_cols]


## Q3a. Calculate potential impact the promo candidates will bring to Gopuff

### Steps to Estimate AOV Uplift

- Current AOV (baseline):

We already have `bundle_aov` (it can also be recomputed as `bundle_total_revenue / bundle_total_quantity`).

- Promotion effect assumption:

A discount (say 10–20%) increases conversion rate by some lift factor (e.g., +20–30%).

More conversions → higher revenue & margin overall (despite lower unit price).

In [None]:
# Largest discount amount seen on each side of every product pair.
max_discount_per_bundle = (
    df_pairs.groupby('product_pair')[['total_discount_1', 'total_discount_2']]
    .max()
    .reset_index()
    .rename(columns={'total_discount_1': 'max_discount_1', 'total_discount_2': 'max_discount_2'})
)

In [None]:
# Attach the per-pair maximum discounts to the promo candidates.
discount_lookup = max_discount_per_bundle[['product_pair', 'max_discount_1', 'max_discount_2']]
top_promo_candidates = pd.merge(
    top_promo_candidates,
    discount_lookup,
    how='left',
    on='product_pair'
)

In [None]:


# Keep candidates whose pair has a positive maximum discount on at least one side.
has_discount_history = (
    (top_promo_candidates['max_discount_1'] > 0.0)
    | (top_promo_candidates['max_discount_2'] > 0.0)
)
top_promo_candidates = top_promo_candidates[has_discount_history]
top_promo_candidates

In [None]:
# Work on an explicit copy: top_promo_candidates was produced by boolean
# filtering, and assigning columns to such a slice triggers SettingWithCopyWarning.
top_promo_candidates = top_promo_candidates.copy()

# Make sure these columns are numeric
df_pairs['bundle_conversion_rate'] = pd.to_numeric(df_pairs['bundle_conversion_rate'], errors='coerce')
df_pairs['discount_pct_1'] = pd.to_numeric(df_pairs['discount_pct_1'], errors='coerce')
df_pairs['discount_pct_2'] = pd.to_numeric(df_pairs['discount_pct_2'], errors='coerce')

# Assumed promo depth = deepest discount FRACTION seen for the pair on either
# side. The previous version used max_discount_1/2, which are discount amounts
# in currency, so (1 - discount_pct_assumed) could go negative downstream.
max_discount_pct = (
    df_pairs.groupby('product_pair')[['discount_pct_1', 'discount_pct_2']]
    .max()
    .max(axis=1)
)
top_promo_candidates['discount_pct_assumed'] = (
    top_promo_candidates['product_pair'].map(max_discount_pct).fillna(0).clip(0, 1)
)

df_pairs['is_promo'] = (
    (df_pairs['discount_pct_1'] > 0) | 
    (df_pairs['discount_pct_2'] > 0) |
    (df_pairs['is_on_promo_1'] == 1) | 
    (df_pairs['is_on_promo_2'] == 1)
)

# Baseline (no promo)
baseline = df_pairs[df_pairs['is_promo'] == False].groupby('bundle_segment')['bundle_conversion_rate'].mean()

# Promo (with discount or promo flag)
promo = df_pairs[df_pairs['is_promo'] == True].groupby('bundle_segment')['bundle_conversion_rate'].mean()

# Relative conversion uplift per segment; undefined ratios (missing segment or
# zero baseline) count as no uplift.
conv_rate_uplift = ((promo / baseline) - 1).replace([np.inf, -np.inf], np.nan).fillna(0)

# Use the values just computed instead of numbers pasted from an earlier run.
# The pasted map also had no entry for the 'Other' segment, which left NaN in
# every projection for those bundles.
conv_rate_uplift_map = conv_rate_uplift.to_dict()

top_promo_candidates['conv_rate_uplift'] = (
    top_promo_candidates['bundle_segment'].map(conv_rate_uplift_map).fillna(0)
)

top_promo_candidates['expected_bundle_quantity'] = (
    top_promo_candidates['bundle_total_quantity'] * (1 + top_promo_candidates['conv_rate_uplift']))

top_promo_candidates['expected_bundle_conversion_rate'] = (
    top_promo_candidates['bundle_conversion_rate'] * (1 + top_promo_candidates['conv_rate_uplift']))



In [None]:
# Revenue impact: lower price per unit but more units sold
top_promo_candidates['expected_bundle_revenue'] = (
    top_promo_candidates['bundle_total_revenue'] 
    * (1 - top_promo_candidates['discount_pct_assumed']) 
    * (1 + top_promo_candidates['conv_rate_uplift'])
)

# Margin impact: lower margin per unit but more units sold
top_promo_candidates['expected_bundle_margin'] = (
    top_promo_candidates['bundle_total_margin'] 
    * (1 - top_promo_candidates['discount_pct_assumed']) 
    * (1 + top_promo_candidates['conv_rate_uplift'])
)

# Revenue and margin uplift, in percent
top_promo_candidates['revenue_uplift_pct'] = (
    (top_promo_candidates['expected_bundle_revenue'] - top_promo_candidates['bundle_total_revenue'])
    / top_promo_candidates['bundle_total_revenue'] * 100
)

top_promo_candidates['margin_uplift_pct'] = (
    (top_promo_candidates['expected_bundle_margin'] - top_promo_candidates['bundle_total_margin'])
    / top_promo_candidates['bundle_total_margin'] * 100
)

# Expected AOV
top_promo_candidates['expected_bundle_aov'] = (
    top_promo_candidates['expected_bundle_revenue'] / (top_promo_candidates['expected_bundle_quantity'])
)

# Absolute AOV change. This column used to repeat expected_bundle_aov.
top_promo_candidates['expected_aov_uplift'] = (
    top_promo_candidates['expected_bundle_aov'] - top_promo_candidates['bundle_aov']
)

# Expected AOV uplift, in percent (x100 to match the other *_uplift_pct columns)
top_promo_candidates['aov_uplift_pct'] = (
    (top_promo_candidates['expected_bundle_aov'] - top_promo_candidates['bundle_aov']) 
    / (top_promo_candidates['bundle_aov'] + 1e-6) * 100)  # 1e-6 avoids division by zero

promo_projection = top_promo_candidates.sort_values('revenue_uplift_pct', ascending=False)
promo_projection = promo_projection[['bundle_name', 
                             'bundle_total_quantity','expected_bundle_quantity',
                             'bundle_total_revenue','expected_bundle_revenue','revenue_uplift_pct',
                             'bundle_total_margin','expected_bundle_margin','margin_uplift_pct',
                             'bundle_aov','expected_bundle_aov', 'aov_uplift_pct']]
promo_projection.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Chart the 50 projections with the highest expected revenue.
top_expected_bundles = promo_projection.sort_values('expected_bundle_revenue', ascending=False).head(50)

fig, ax = plt.subplots(figsize=(20, 15))
sns.barplot(
    data=top_expected_bundles,
    x='expected_bundle_revenue',
    y='bundle_name',
    palette='flare',
    ax=ax
)
ax.set_xlabel("Expected Revenue (£)")
ax.set_ylabel("Bundle")
ax.set_title("Top 50 Bundles for Promotion by Expected Revenue")
plt.show()

## Q3b. Build a framework to assess if the proposed promo bundles will be profitable for Gopuff

#### Objective: Increase Average Order Value (AOV), revenue, and margin through strategic product bundle promotions (which are not in promo right now but have lots of potential) in high-frequency categories (Snacks, Drinks, Confectionery).

#### Hypothesis: Promoting top-performing bundles with minimal current discount will encourage higher order quantities and incremental revenue.

In [None]:
# Impact quantification
# The *_percent and margin_pct_* variables hold FRACTIONS (0.12 means 12%).
# They are printed with the "%" format spec, which multiplies by 100;
# previously the raw fraction was printed followed by a literal "%".

total_revenue_before = top_promo_candidates['bundle_total_revenue'].sum()
total_revenue_after = top_promo_candidates['expected_bundle_revenue'].sum()
revenue_uplift_percent = (total_revenue_after / total_revenue_before - 1) 

total_margin_before = top_promo_candidates['bundle_total_margin'].sum()
total_margin_after = top_promo_candidates['expected_bundle_margin'].sum()
margin_uplift_percent = (total_margin_after / total_margin_before - 1) 

margin_pct_before = total_margin_before / total_revenue_before
margin_pct_after = total_margin_after / total_revenue_after

print(f"Total Revenue Before Promotion: £{total_revenue_before:,.2f}")
print(f"Total Revenue After Promotion: £{total_revenue_after:,.2f}")
print(f"Total Margin Before Promotion: £{total_margin_before:,.2f}")
print(f"Total Margin After Promotion: £{total_margin_after:,.2f}")
print(f"Projected Revenue Uplift: {revenue_uplift_percent:.2%}")
print(f"Projected Margin Uplift: {margin_uplift_percent:.2%}")
print(f"Margin Percentage Before Promotion: {margin_pct_before:.2%}")
print(f"Margin Percentage After Promotion: {margin_pct_after:.2%}")


### A/B Testing Framework to measure if the projected assumptions will have a quantifiable impact at Gopuff

In [None]:
# Compare the mean of each outcome metric across the levels of every candidate
# feature, to decide which features to include in the A/B test.
metrics = ['bundle_aov', 'bundle_total_revenue', 'bundle_total_margin', 'bundle_conversion_rate']

# Features to test
features = ['location_id_1', 'is_fam_1', 'is_newbie_order_1', 'bundle_segment']

separator = "-"*50
for feature in features:
    print(f"Feature: {feature}")
    print(df_pairs.groupby(feature)[metrics].mean())
    print(separator)


In [None]:
from scipy.stats import ttest_ind, f_oneway

# Screen candidate features: does mean bundle AOV differ between the groups
# that each feature defines within the promo candidates?
boolean_cols = ['is_fam_1', 'is_fam_2', 'is_newbie_order_1', 'is_newbie_order_2', 'is_on_promo_1', 'is_on_promo_2']
categorical_cols = ['location_id_1', 'location_id_2', 'bundle_segment']
aov_metric = 'bundle_aov'

results = []

# Boolean columns: Welch's t-test (unequal variances) between the 0 and 1 groups
for col in boolean_cols:
    if col not in top_promo_candidates.columns:
        continue
    # Missing AOV values are dropped; otherwise they propagate into a NaN statistic.
    group0 = top_promo_candidates.loc[top_promo_candidates[col] == 0, aov_metric].dropna()
    group1 = top_promo_candidates.loc[top_promo_candidates[col] == 1, aov_metric].dropna()
    if len(group0) >= 2 and len(group1) >= 2:
        t_stat, p_val = ttest_ind(group0, group1, equal_var=False)
    else:
        # A flag that is (almost) constant in this subset cannot be tested;
        # record NaN explicitly instead of triggering runtime warnings.
        t_stat, p_val = np.nan, np.nan
    results.append({
        'feature': col,
        'type': 'boolean',
        't_stat': t_stat,
        'p_value': p_val
    })

# Categorical columns: one-way ANOVA across the observed categories
for col in categorical_cols:
    if col not in top_promo_candidates.columns:
        continue
    # groupby skips rows whose key is NaN, matching dropna().unique()
    groups = [
        values.dropna()
        for _, values in top_promo_candidates.groupby(col)[aov_metric]
    ]
    groups = [values for values in groups if len(values) > 0]
    if len(groups) >= 2:
        f_stat, p_val = f_oneway(*groups)
    else:
        # f_oneway raises when given fewer than two groups
        f_stat, p_val = np.nan, np.nan
    results.append({
        'feature': col,
        'type': 'categorical',
        'f_stat': f_stat,
        'p_value': p_val
    })

# Fixing the columns up front keeps the sort safe even when no feature matched.
results_df = pd.DataFrame(
    results, columns=['feature', 'type', 't_stat', 'p_value', 'f_stat']
).sort_values('p_value')
print(results_df)


In [None]:
# Persist the promo candidates to CSV (index column omitted) so they can be reloaded in the next cell.
top_promo_candidates.to_csv("top_promo_candidates.csv", index=False)

In [None]:
# Reload the promo candidates from the CSV file; the cell output is the (rows, columns) shape.
# NOTE(review): a CSV round-trip may not preserve the original dtypes -- confirm downstream cells do not rely on them.
top_promo_candidates = pd.read_csv("top_promo_candidates.csv")
top_promo_candidates.shape

In [None]:

# =============================
# 0️⃣ Select top 20 test bundles based on performance
# =============================
# Keep a single row per product pair (the first occurrence wins), then take
# the 20 candidates with the highest projected AOV.
test_candidates = top_promo_candidates.drop_duplicates(subset='product_pair')
test_top20 = test_candidates.nlargest(n=20, columns='expected_bundle_aov')

print("Selected Test Bundles:")
summary_cols = ['bundle_name', 'expected_bundle_aov', 'expected_bundle_revenue', 'expected_bundle_margin']
test_top20[summary_cols]



In [None]:
# =============================
# 1️⃣ Build the pool of control candidates (product pairs not used in the test arm)
# =============================
# NOTE(review): no location filter is applied in this cell, and the pool is not
# de-duplicated by product pair -- confirm whether either is still required.

cols_needed = [
    'product_pair', 'bundle_segment', 'location_id_1', 'is_fam_2',
    'bundle_total_revenue', 'bundle_total_margin', 'bundle_total_quantity',
    'bundle_aov', 'bundle_margin_per_unit', 'bundle_conversion_rate', 'bundle_ctr'
]

# Remove product pairs already in test. The explicit .copy() yields an
# independent frame, so columns can be added to it later without pandas
# raising SettingWithCopyWarning.
in_test_arm = df_pairs['product_pair'].isin(test_top20['product_pair'])
control_candidates = df_pairs.loc[~in_test_arm, cols_needed].copy()


In [None]:
# Composite performance score = sum of the per-metric ranks.
# Ranks are ascending, so the best bundle on a metric receives the highest rank
# and a higher score means a better all-round performer. (With descending
# ranks the best bundles had the lowest score, and nlargest() picked the worst.)
score_metrics = [
    'bundle_total_revenue',
    'bundle_total_margin',
    'bundle_total_quantity',
    'bundle_conversion_rate',
    'bundle_aov',
]

# assign() returns a new frame instead of writing into a possible slice.
# skipna=False keeps the score NaN whenever any metric is missing.
control_candidates = control_candidates.assign(
    performance_score=control_candidates[score_metrics]
    .rank(ascending=True)
    .sum(axis=1, skipna=False)
)

control_top20 = control_candidates.nlargest(20, 'performance_score')

print("Selected Control Bundles:")
control_top20

In [None]:
# Grouping keys and aggregations shared by both arms.
group_keys = ['product_pair', 'bundle_segment', 'location_id_1', 'is_fam_2']

# Observed (historical) metrics, aggregated identically for test and control.
observed_aggs = {
    'total_revenue': ('bundle_total_revenue', 'sum'),
    'total_margin': ('bundle_total_margin', 'sum'),
    'total_quantity': ('bundle_total_quantity', 'sum'),
    'avg_bundle_aov': ('bundle_aov', 'mean'),
    'avg_bundle_margin_per_unit': ('bundle_margin_per_unit', 'mean'),
    'conversion_rate': ('bundle_conversion_rate', 'mean'),
    'ctr': ('bundle_ctr', 'mean'),
}

# Projected metrics, aggregated for the test arm only.
projected_aggs = {
    'expected_bundle_aov': ('expected_bundle_aov', 'mean'),
    'expected_bundle_margin': ('expected_bundle_margin', 'sum'),
    'expected_bundle_revenue': ('expected_bundle_revenue', 'sum'),
    'expected_bundle_quantity': ('expected_bundle_quantity', 'sum'),
    'expected_aov_uplift': ('expected_aov_uplift', 'mean'),
}

# =============================
# 2️⃣ Prepare aggregated long table for test
# =============================
test_long = (
    test_top20
    .groupby(group_keys)
    .agg(**observed_aggs, **projected_aggs)
    .reset_index()
    .assign(arm='Test')
)

# =============================
# 3️⃣ Prepare aggregated long table for control
# =============================
control_long = (
    control_top20
    .groupby(group_keys)
    .agg(**observed_aggs)
    .reset_index()
    .assign(arm='Control')
)

# =============================
# 4️⃣ Combine test and control
# =============================
# Control rows come first; columns that exist only for the test arm are NaN for control.
ab_long_table = pd.concat([control_long, test_long], axis=0, ignore_index=True)



In [None]:
# Render the combined control/test table as a Markdown table (index omitted) and print it.
markdown_table = ab_long_table.to_markdown(index=False)
print(markdown_table)


### The control vs test product pairs are different for the following reasons: 

### - Avoid contamination

If the same product pair appears in both test and control:

You can’t tell whether changes in sales or conversion are due to the promotion or just natural product performance.

Overlapping products “contaminate” the control, making it harder to isolate the true impact.

### - Preserve the “never-promoted” property

The test bundles are products that have never been promoted.

If we allowed the same product pairs in control, we’d risk including products that have historical promotion data, which breaks the test assumption that you’re seeing the first-time effect.

### - Avoid cross-influence

If a product in the test group has previously appeared in control bundles, past promotion effects could carry over.

Different pairs ensure the uplift is due to the new promotion, not historical effects or overlapping exposure.

### - Maximize learning from the test

Using novel bundles in test means we can discover which products actually perform well when promoted.

If test and control overlap, we lose the ability to learn which untested bundles have potential.

### - Cleaner metrics

Revenue, margin, and conversion improvements are easier to attribute to the promotion.

Non-overlapping pairs reduce noise from products that might already have optimized pricing, bundling, or marketing exposure.



In [None]:
# Show which columns are available on the de-duplicated test candidates.
test_candidates.columns

## Q4. Roll out plan for Gopuff's success and next steps

In [None]:
# Impact Quantification
# NOTE(review): the averages below are taken over all control/test candidate
# rows, not over the 20 bundles selected for each arm -- confirm this is intended.

# ✅ Baseline (control) averages use observed values
baseline_revenue, baseline_margin = (
    control_candidates[column].mean()
    for column in ('bundle_total_revenue', 'bundle_total_margin')
)

# ✅ Test averages use the projected (expected) values
test_revenue, test_margin = (
    test_candidates[column].mean()
    for column in ('expected_bundle_revenue', 'expected_bundle_margin')
)

# Relative uplift of test over baseline, in percent
revenue_uplift_percent = (test_revenue / baseline_revenue - 1) * 100
margin_uplift_percent = (test_margin / baseline_margin - 1) * 100

for label, baseline_value, test_value, uplift_value in (
    ("Revenue", baseline_revenue, test_revenue, revenue_uplift_percent),
    ("Margin", baseline_margin, test_margin, margin_uplift_percent),
):
    print(f"Baseline {label} per Bundle: £{baseline_value:.2f}")
    print(f"Test {label} per Bundle: £{test_value:.2f}")
    print(f"{label} Uplift: {uplift_value:.2f}%")
    if label == "Revenue":
        # blank line between the revenue and margin sections
        print(end="")



# Rollout Plan for Bundle Promotions

## 1. Test & Control Design
- **Test Bundles (20)**  
  - Product pairs never promoted before.  
  - Selected based on high potential (co-purchase frequency, high margin, strong revenue baseline).  
- **Control Bundles (20)**  
  - Different product pairs from test set.  
  - Include products that may have been promoted in the past.  
  - Provide a realistic business-as-usual benchmark.  

- **Markets & Segments**  
  - All selected markets (e.g., 12 pilot markets).  
  - Run across **2 customer segments**:  
    - Segment A: Price-sensitive (e.g., students).  
    - Segment B: Value-driven (e.g., families).  

---

## 2. Quantified Impact (Illustrative Example)
Using test/control A/B methodology, impact is quantified as:  

$$
\text{Revenue Uplift \%} = \left(\frac{\text{Revenue(Test)}}{\text{Revenue(Control)}} - 1\right) \times 100
$$

$$
\text{Margin Uplift \%} = \left(\frac{\text{Margin(Test)}}{\text{Margin(Control)}} - 1\right) \times 100
$$

- **Baseline (Control):**  
  - Avg Revenue per Bundle = **£8.43**  
  - Avg Margin per Bundle = **£58.30**  

- **Test (Never Promoted Bundles):**  
  - Avg Revenue per Bundle = **£22.76**  
  - Avg Margin per Bundle = **£85.31**  

- **Impact and Strategic Benefits:**  
  - **+169.78% Revenue Uplift** 🚀  
  - **+46.32% Margin Uplift** 💰  
  - **Higher AOV:** customers spend significantly more per basket when new bundles are introduced.  
  - **Competitive Advantage:** Unlocks incremental revenue streams by targeting segments that were previously under-monetized.  
  - **Customer Retention and higher LTV:** New bundles attract repeat purchases and improve stickiness.  
  - **Incremental Customers:** attract new customers who see value in fresh bundles.  
  - **Reduced Churn:** higher perceived value keeps customers loyal.  
  - **Operational Learnings:** framework for identifying future profitable bundles.  

---

## 3. Rollout & Scaling Plan

### Phase 1 – Pilot
- Run A/B test in **12 markets × 2 segments**.  
- Collect results over **6 weeks** to capture seasonality.  
- Metrics: Revenue uplift, Margin uplift, AOV increase, customer retention.  

### Phase 2 – Quantified Expansion
- Scale winning bundles to **top 50 markets and more segments and audiences**.  
- Integrate bundles into seasonal promotions calendar.  
- Monitor KPIs weekly with real-time dashboards.  

### Phase 3 – ML-Driven Optimization
- Build **bundle scoring algorithm**:  
  - Inputs: margin, revenue, co-purchase lift, discount elasticity, promo history, conversion rate, segments, location, recency, frequency, monetary value, demographics, etc.  
  - Outputs: predicted uplift (AOV, margin). --> Regressor Algorithms
- Use **multi-armed bandits** to dynamically allocate bundle discounts.  
- Continuously refresh bundle portfolio every 2–4 weeks.  

### Phase 4 – Full Rollout & Automation
- Nationwide rollout of **validated bundles**.  
- Automated monitoring of:  
  - Net Revenue Impact  
  - Net Margin Impact  
  - Promo ROI  
  - Retention / New Customers Gained  
- Sunset underperforming bundles quickly.  

---

## 4. Executive Summary
By launching **20 new test bundles** against **20 controls across all markets and 2 customer segments**, we can:  
- Deliver a projected **~170% revenue uplift and ~46% margin uplift per bundle** (the illustrative figures from Section 2, to be validated by the pilot).  
- Drive **higher AOV and customer loyalty**.  
- Build a **repeatable ML-driven framework** to identify and scale bundles profitably.  
- Secure **competitive advantage** through differentiated promotions strategy.  

This phased rollout ensures we **quantify impact, scale efficiently, and automate with ML** for long-term growth.  


## Next steps: a small ML script with the Kedro framework as a pipeline wrapper

## 🔮 Next Steps: Full-Stack ML for Bundle Optimization

To demonstrate **ML capabilities**, I created a **small prototype script** that predicts **bundle AOV** using a `GradientBoostingRegressor` trained on bundle-level features (total revenue, total margin, total quantity, conversion rate, CTR, and the `is_fam_1` / `is_newbie_order_1` flags).  

This quick script is a proof-of-concept, but to **deploy a production-ready ML system**, we need to implement the **full machine learning workflow**, which includes:  

### 1. 🔧 Data Preparation & Feature Engineering
- Create new engineered features (e.g., **discount elasticity, margin per impression, revenue per unit, uplift signals**).  
- Aggregate user and location-level signals (e.g., **seasonality, user cohort, demand shifts**).  
- Normalize / scale features where appropriate.  

### 2. 📊 Exploratory Data Analysis (EDA) & Statistics
- Detect and handle **outliers** using IQR or robust z-scores.  
- Conduct **distribution analysis** to ensure stable feature ranges.  
- Use **correlation matrices & VIF (Variance Inflation Factor)** to check for **multicollinearity** between features.  

### 3. 🧮 Feature Selection & Encoding
- Apply **feature importance analysis** (e.g., SHAP, permutation importance) to prioritize predictive drivers.  
- Use **One-Hot Encoding or Label Encoding** for categorical features like `bundle_segment` or `brand` or `merch_subcategory`.  
- Drop redundant or low-signal features to avoid noise.  

### 4. 🏗️ Model Development
- Train multiple models (Gradient Boosting, XGBoost, Random Forests, Regularized Linear Models).  
- Apply **hyperparameter tuning** (Grid Search / Random Search / Bayesian Optimization).  
- Validate performance using **cross-validation** across markets & segments.  

### 5. 📈 Evaluation Metrics
- **R² / RMSE** for predictive accuracy.  
- **Business KPIs**: uplift in AOV, revenue, and margin under simulated promos.  
- Bias/variance trade-off assessment to avoid overfitting.  

### 6. 🚀 Deployment Pipeline
- Wrap the model in a **FastAPI service**.  
- Deploy with **Kedro pipelines** orchestrated via **GCP (Cloud Run, Cloud Scheduler, GCS Buckets)**.  
- Enable **CI/CD** for automated retraining and logging with MLflow or similar tools.  

---

✅ The quick script shows feasibility,  
but the **full-stack ML approach** ensures robustness, interpretability, and scalability to production.

In [None]:
# ======================================================
# 📌 Kedro Pipeline for ML-Driven Bundle AOV Prediction
# ======================================================

from kedro.pipeline import Pipeline, node
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib

import matplotlib.pyplot as plt
import seaborn as sns

# ===============================
# 1️⃣ ML Training Node
# ===============================
def train_model(df_pairs: pd.DataFrame) -> GradientBoostingRegressor:
    """Fit a gradient-boosting regressor that predicts ``bundle_aov``.

    Args:
        df_pairs: Bundle-level frame containing the feature columns listed
            below and the ``bundle_aov`` target column.

    Returns:
        The fitted ``GradientBoostingRegressor``.

    Side effects:
        Writes the fitted model to ``bundle_aov_model.pkl`` in the current
        working directory via ``joblib.dump``.
    """
    feature_cols = [
        "bundle_total_revenue", "bundle_total_margin",
        "bundle_total_quantity",
        "bundle_conversion_rate", "bundle_ctr",
        "is_fam_1", "is_newbie_order_1"
    ]
    target_col = "bundle_aov"

    # A fixed random_state makes repeated runs produce the same model.
    model = GradientBoostingRegressor(
        n_estimators=100,
        learning_rate=0.01,
        max_depth=5,
        random_state=42,
    )
    model.fit(df_pairs[feature_cols], df_pairs[target_col])

    # Save locally for deployment
    joblib.dump(model, "bundle_aov_model.pkl")

    return model

# ===============================
# 2️⃣ Evaluation Node
# ===============================
def evaluate_model(model: GradientBoostingRegressor, df_pairs: pd.DataFrame) -> tuple:
    """Score a fitted model on the rows of ``df_pairs``.

    Args:
        model: Fitted regressor exposing ``predict``.
        df_pairs: Frame containing the model's feature columns and the
            ``bundle_aov`` target column.

    Returns:
        A ``(mse, r2)`` tuple: mean squared error and R² of the predictions
        against ``bundle_aov``. Both values are also printed.
    """
    feature_cols = [
        "bundle_total_revenue", "bundle_total_margin",
        "bundle_total_quantity",
        "bundle_conversion_rate", "bundle_ctr",
        "is_fam_1", "is_newbie_order_1"
    ]
    target_col = "bundle_aov"

    y_true = df_pairs[target_col]
    y_pred = model.predict(df_pairs[feature_cols])

    mse = mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    print(f"Validation MSE: {mse:.2f}")
    print(f"Validation R² score: {r2:.4f}")

    return mse, r2

# ===============================
# 3️⃣ Prediction Node
# ===============================
def predict_top_bundles(model: GradientBoostingRegressor, top_promo_candidates: pd.DataFrame) -> pd.DataFrame:
    """Return the 20 promo candidates with the highest predicted AOV.

    Args:
        model: Fitted regressor exposing ``predict``.
        top_promo_candidates: Candidate bundles containing the model's
            feature columns. The frame is not modified.

    Returns:
        A new frame holding the (up to) 20 rows with the largest
        ``predicted_aov``, including that added column.
    """
    features = [
        "bundle_total_revenue", "bundle_total_margin",
        "bundle_total_quantity",
        "bundle_conversion_rate", "bundle_ctr",
        "is_fam_1", "is_newbie_order_1"
    ]

    # assign() works on a copy, so the caller's frame does not silently gain a column.
    scored = top_promo_candidates.assign(
        predicted_aov=model.predict(top_promo_candidates[features])
    )

    return scored.nlargest(20, 'predicted_aov')

# ===============================
# 4️⃣ Kedro Pipeline Definition
# ===============================
def create_pipeline(**kwargs) -> Pipeline:
    """Assemble the train -> evaluate -> predict Kedro pipeline.

    Dataset names (``df_pairs``, ``top_promo_candidates``, ``trained_model``,
    ...) are resolved by Kedro at run time.

    Returns:
        A ``Pipeline`` with three nodes: model training, model evaluation and
        top-bundle prediction.
    """
    return Pipeline(
        [
            node(
                func=train_model,
                inputs="df_pairs",
                outputs="trained_model",
                name="train_bundle_model"
            ),
            node(
                func=evaluate_model,
                inputs=["trained_model", "df_pairs"],
                # evaluate_model returns (mse, r2); one output name per value
                # keeps "model_mse" from holding the whole tuple.
                # NOTE(review): this scores the model on the same dataset it
                # was trained on -- consider a dedicated hold-out dataset.
                outputs=["model_mse", "model_r2"],
                name="evaluate_bundle_model"
            ),
            node(
                func=predict_top_bundles,
                inputs=["trained_model", "top_promo_candidates"],
                outputs="top_20_predicted_bundles",
                name="predict_top_bundles_node"
            )
        ]
    )

# Train / evaluation split.
# Two samples drawn with the same n and the same seed are the same rows, which
# would make the "validation" metrics training metrics. Instead, shuffle the
# row positions once and cut two disjoint slices, each capped at MAX_ROWS so
# the split also works when df_pairs holds fewer rows than the cap.
MAX_ROWS = 300_000
row_order = np.random.default_rng(42).permutation(len(df_pairs))
n_train = min(MAX_ROWS, int(len(df_pairs) * 0.8))
n_eval = min(MAX_ROWS, len(df_pairs) - n_train)

df_pairs_sample = df_pairs.iloc[row_order[:n_train]]
df_pairs_eval = df_pairs.iloc[row_order[n_train:n_train + n_eval]]

# 1️⃣ Train the model on historical bundles
model = train_model(df_pairs_sample)

# 2️⃣ Evaluate on rows the model has not seen
mse, r2 = evaluate_model(model, df_pairs_eval)

# 3️⃣ Predict AOV for top promo candidates
top_20_bundles = predict_top_bundles(model, top_promo_candidates)

print(top_20_bundles[['bundle_name', 'predicted_aov', 'bundle_total_revenue', 'bundle_total_margin']])


## 🚀 Rollout with ML + GCP Deployment  

**Step 1: Train with Kedro Pipelines**  
- Define modular nodes for data prep, model training, evaluation, and saving artifacts.  
- Run locally or in CI/CD (GitLab CI, GitHub Actions).  

**Step 2: Store Artifacts in GCP**  
- Save trained models to **Google Cloud Storage (GCS)** bucket:  
  `gs://gopuff-model-artifacts/bundle_aov_model.pkl`  

**Step 3: Deploy API via Cloud Run**  
- Wrap model in a **FastAPI service**.  
- Containerize with Docker.  
- Deploy container to **Cloud Run**, autoscaling based on traffic.  

**Step 4: Automate Retraining with Cloud Scheduler**  
- Cloud Scheduler triggers a **Pub/Sub message** to kick off retraining.  
- Retraining pipeline runs on **Cloud Run Job** or **Cloud Build**.  
- New model replaces old in GCS bucket.  

**Step 5: Integration into Gopuff Systems**  
- Frontend (app/website) queries Cloud Run API to fetch recommended bundles.  
- Backend logs results (CTR, margin, AOV) for monitoring & retraining feedback loop.  

