# 04 — Product Recommendation Engine

> Personalising the shopping experience — a core Boots data science use case.

Three approaches built:
1. **Collaborative Filtering** — 'customers like you also bought'
2. **Content-Based Filtering** — 'similar products to what you've bought'
3. **Hybrid Model** — weighted combination for best performance

This mirrors real-world recommender systems used by Boots, Amazon and Netflix.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys; sys.path.insert(0, '.')
from src.data.loader import load_transactions, load_products
from src.data.preprocessor import clean_transactions, add_time_features
from src.models.recommender import CollaborativeFilteringRecommender, ContentBasedRecommender, HybridRecommender
import warnings; warnings.filterwarnings('ignore')

# Notebook-wide plot styling: 120 DPI figures without top/right axis spines.
plt.rcParams.update({'figure.dpi': 120, 'axes.spines.top': False, 'axes.spines.right': False})

# Load and prepare the data with the project's loader / preprocessor helpers.
transactions = load_transactions()
transactions = clean_transactions(transactions)
transactions = add_time_features(transactions)
products     = load_products()

n_customers = transactions['customer_id'].nunique()
n_products = transactions['product_id'].nunique()

# A cell of the user-item matrix is filled once per distinct (customer, product)
# pair. Counting raw transaction rows would count every repeat purchase again,
# understating sparsity (and potentially driving it below zero).
n_interactions = len(
    transactions[['customer_id', 'product_id']].dropna().drop_duplicates()
)
n_cells = n_customers * n_products
# Guard the empty-dataset case instead of raising ZeroDivisionError.
sparsity = 1 - n_interactions / n_cells if n_cells else float('nan')

print(f"Transactions: {len(transactions):,}")
print(f"Unique customers: {n_customers:,}")
print(f"Unique products:  {n_products:,}")
print(f"Sparsity of user-item matrix: {sparsity:.2%}")


## Train-Test Split

We hold out the final 30 days of the dataset as a test set, using a single global cutoff date applied to all customers — this simulates predicting future purchases from historical behaviour.

In [None]:
# Temporal hold-out: one global cutoff, 30 days before the latest transaction date.
# Rows on or before the cutoff form the training set; later rows form the test set.
cutoff = transactions['date'].max() - pd.Timedelta(days=30)

in_train_window = transactions['date'].le(cutoff)
in_test_window = transactions['date'].gt(cutoff)
train = transactions.loc[in_train_window].copy()
test = transactions.loc[in_test_window].copy()

cutoff_day = cutoff.date()
print(f"Train: {len(train):,} transactions up to {cutoff_day}")
print(f"Test : {len(test):,} transactions after {cutoff_day}")
print(f"Test customers: {test['customer_id'].nunique():,}")


## Collaborative Filtering

Builds a user-item matrix and finds customers with similar purchase histories using cosine similarity.

In [None]:
# Fit the collaborative-filtering recommender on the training data only.
cf_model = CollaborativeFilteringRecommender(min_interactions=3, n_recommendations=10)
cf_model.fit(train)

# Demo customer: the one with the most rows in the training set.
customer_activity = train['customer_id'].value_counts()
example_customer = customer_activity.index[0]

# Fetch the top-10 recommendations, then attach catalogue details for display.
catalogue_columns = ['product_id', 'product_name', 'category', 'price']
recs_cf = cf_model.recommend(example_customer, n=10)
recs_cf = recs_cf.merge(products[catalogue_columns], on='product_id', how='left')

print(f"\nCollaborative Filtering recommendations for customer: {example_customer}")
display_columns = ['product_name', 'category', 'price', 'score']
print(recs_cf[display_columns].to_string(index=False))


## Content-Based Filtering

Recommends similar products based on category and price features — useful for new customers with limited history (cold start).

In [None]:
# Fit the content-based recommender on the product catalogue.
cb_model = ContentBasedRecommender(n_recommendations=10)
cb_model.fit(products)

# Seed product: the one with the highest total quantity sold across all transactions.
units_sold = transactions.groupby('product_id')['quantity'].sum()
popular_product = units_sold.idxmax()

# Look up the seed product's name and category once via a shared mask.
is_seed = products['product_id'] == popular_product
product_name = products.loc[is_seed, 'product_name'].values[0]
product_cat = products.loc[is_seed, 'category'].values[0]

# Ten most similar products, enriched with catalogue details for display.
catalogue_columns = ['product_id', 'product_name', 'category', 'price']
recs_cb = cb_model.recommend_similar(popular_product, n=10)
recs_cb = recs_cb.merge(products[catalogue_columns], on='product_id', how='left')

print(f"\nContent-based recommendations similar to: {product_name} (Category: {product_cat})")
print(recs_cb[['product_name', 'category', 'price']].to_string(index=False))


## Hybrid Recommender

Combines both signals: 70% collaborative filtering + 30% content-based. This is the production model.

In [None]:
# Fit the hybrid model: 70% collaborative-filtering weight, 30% content-based weight.
hybrid = HybridRecommender(cf_weight=0.7, cb_weight=0.3, n_recommendations=10)
hybrid.fit(train, products)

# Seed the content-based side with the product this customer actually bought most
# recently in the training window. The argument name `last_purchased_product`
# indicates it expects the customer's own latest purchase, not the globally
# best-selling product (which the customer may never have bought).
# NOTE(review): confirm against HybridRecommender.recommend's contract.
customer_history = (
    train.loc[train['customer_id'] == example_customer]
    .sort_values('date', kind='stable')
)
last_purchased = customer_history['product_id'].iloc[-1]

recs_hybrid = hybrid.recommend(example_customer, last_purchased_product=last_purchased)
recs_hybrid = recs_hybrid.merge(products[['product_id','product_name','category','price']], on='product_id', how='left')

print(f"Hybrid recommendations for: {example_customer}")
print(recs_hybrid[['product_name','category','price','score','method']].to_string(index=False))


## Recommendation Coverage by Category

Are recommendations diverse, or do they over-represent one category?

In [None]:
from pathlib import Path

# Sample hybrid recommendations for the 50 customers with the most training rows.
sample_customers = train['customer_id'].value_counts().head(50).index.tolist()
all_recs = []
skipped = []  # (customer_id, exception) for customers the model could not serve
for cid in sample_customers:
    try:
        # assign() returns a new frame, so the model's returned object is not mutated.
        recs = hybrid.recommend(cid).assign(customer_id=cid)
    except Exception as exc:
        # Best-effort sampling: keep going, but record the failure instead of
        # silently discarding it (and do not swallow KeyboardInterrupt).
        skipped.append((cid, exc))
        continue
    all_recs.append(recs)

if skipped:
    print(f"Skipped {len(skipped)} of {len(sample_customers)} customers; "
          f"first error: {skipped[0][1]!r}")
if not all_recs:
    # pd.concat([]) would raise an opaque "No objects to concatenate" error.
    raise ValueError("No recommendations were produced for any sampled customer")

all_recs_df = pd.concat(all_recs).merge(products[['product_id','category']], on='product_id', how='left')

fig, ax = plt.subplots(figsize=(10, 5))
all_recs_df['category'].value_counts().plot(kind='bar', ax=ax, color='steelblue', alpha=0.8, edgecolor='white')
# Report the number of customers actually represented, which may be below 50.
ax.set_title(
    f'Category Distribution of Recommendations\n(across {len(all_recs)} sampled customers)',
    fontweight='bold',
)
ax.set_xlabel('Product Category')
ax.set_ylabel('Number of Recommendations')
plt.setp(ax.get_xticklabels(), rotation=30, ha='right')
plt.tight_layout()

# Make sure the output directory exists so savefig does not fail on a fresh checkout.
figure_path = Path('reports/figures/04_recommendation_coverage.png')
figure_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(figure_path, bbox_inches='tight')
plt.show()


**Next:** → `05_Model_Evaluation_and_Insights.ipynb`