# üõçÔ∏è SmartCart Starter Notebook
This notebook will guide you through your group project on collaborative filtering and association rule mining for an e-commerce recommender system.

## üì• Part 1: Data Preprocessing
Steps:
- Load `ecommerce_user_data.csv` and `product_details.csv`
- Merge data if necessary
- Create user-item matrix
- Fill missing ratings with 0
- Group user behavior by category

In [10]:
# Load data
import pandas as pd
import numpy as np

user_data = pd.read_csv('ecommerce_user_data.csv')
product_data = pd.read_csv('product_details.csv')

print(user_data.head())
print(product_data.head())

  UserID ProductID  Rating   Timestamp  Category
0   U000     P0009       5  2024-09-08     Books
1   U000     P0020       1  2024-09-02      Home
2   U000     P0012       4  2024-10-18     Books
3   U000     P0013       1  2024-09-18  Clothing
4   U000     P0070       4  2024-09-16      Toys
  ProductID      ProductName     Category
0     P0000      Toys Item 0     Clothing
1     P0001  Clothing Item 1  Electronics
2     P0002     Books Item 2  Electronics
3     P0003  Clothing Item 3  Electronics
4     P0004  Clothing Item 4  Electronics


In [3]:
# Create user-item matrix
user_item_matrix = user_data.pivot_table(index='UserID', columns='ProductID', values='Rating')
user_item_matrix_filled = user_item_matrix.fillna(0)
user_item_matrix_filled.head()

ProductID,P0000,P0001,P0002,P0003,P0004,P0005,P0006,P0007,P0008,P0009,...,P0090,P0091,P0092,P0093,P0094,P0095,P0096,P0097,P0098,P0099
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
U000,0.0,0.0,0.0,3.0,0.0,5.0,0.0,3.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U001,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,5.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0
U002,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
U004,0.0,3.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,4.0,3.0


In [4]:
# Aggregate user behavior by category
user_category_agg = user_data.groupby(['UserID', 'Category']).agg({'Rating': ['count', 'mean']}).reset_index()
user_category_agg.columns = ['UserID', 'Category', 'TotalInteractions', 'AverageRating']
user_category_agg.head()

Unnamed: 0,UserID,Category,TotalInteractions,AverageRating
0,U000,Books,6,3.666667
1,U000,Clothing,3,1.666667
2,U000,Electronics,3,3.666667
3,U000,Home,2,1.0
4,U000,Toys,6,3.5


## ü§ù Part 2: User-Based Collaborative Filtering
Steps:
- Use cosine similarity to compare users
- Recommend top-N products based on similar users
- Evaluate with Precision@K and Coverage

In [5]:
# Compute cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

similarity_matrix = cosine_similarity(user_item_matrix_filled)
similarity_df = pd.DataFrame(similarity_matrix, index=user_item_matrix_filled.index, columns=user_item_matrix_filled.index)
similarity_df.head()

UserID,U000,U001,U002,U003,U004,U005,U006,U007,U008,U009,...,U040,U041,U042,U043,U044,U045,U046,U047,U048,U049
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
U000,1.0,0.063071,0.195522,0.023466,0.065412,0.161251,0.160096,0.092083,0.238263,0.274844,...,0.241693,0.129483,0.15679,0.1322,0.161478,0.100346,0.126917,0.150727,0.0,0.104294
U001,0.063071,1.0,0.190861,0.0,0.111332,0.00954,0.0,0.172286,0.16746,0.017593,...,0.12154,0.024075,0.097953,0.007821,0.0,0.0,0.19367,0.247805,0.0,0.0
U002,0.195522,0.190861,1.0,0.065094,0.111662,0.05083,0.027756,0.055877,0.0,0.181229,...,0.144756,0.0,0.217465,0.0,0.055992,0.034794,0.194243,0.11003,0.177165,0.0
U003,0.023466,0.0,0.065094,1.0,0.035737,0.104116,0.02665,0.0,0.025384,0.288009,...,0.243836,0.0,0.0,0.074688,0.197121,0.0,0.054396,0.126773,0.374228,0.25
U004,0.065412,0.111332,0.111662,0.035737,1.0,0.159064,0.057144,0.026294,0.195942,0.247023,...,0.062741,0.116202,0.078797,0.048044,0.103747,0.133716,0.124969,0.217464,0.255318,0.172729


In [42]:
# Create recommendation function
# Find most similar user, recommend products they rated highly that target user hasn‚Äôt rated
def recommend_products(user_id, top_n=100, top_similar_users=10):
    similar_users = similarity_df[user_id].drop(index=user_id).sort_values(ascending=False)[:top_similar_users]
    
    recommended_items = pd.Series(dtype="float64")

    for similar_user in similar_users.index:
        similar_user_ratings = user_item_matrix_filled.loc[similar_user]
        target_user_ratings = user_item_matrix_filled.loc[user_id]

        # Get items rated by similar user but not by the target user
        unrated_products = target_user_ratings[target_user_ratings == 0].index
        recommendations = similar_user_ratings[unrated_products]

        # Aggregate ratings from multiple similar users
        recommended_items = recommended_items.add(recommendations, fill_value=0)

    return recommended_items.sort_values(ascending=False).index[:top_n]


# Example: Get top 5 recommendations for a specific user
example_user = user_item_matrix_filled.index[0]  # First user in the dataset
top_recommendations = recommend_products(example_user)
print(f"Top Recommendations for {example_user}: {top_recommendations}")



Top Recommendations for U000: Index(['P0083', 'P0052', 'P0064', 'P0088', 'P0055', 'P0051', 'P0087', 'P0040',
       'P0098', 'P0054', 'P0041', 'P0062', 'P0030', 'P0077', 'P0004', 'P0058',
       'P0032', 'P0078', 'P0027', 'P0099', 'P0001', 'P0060', 'P0002', 'P0049',
       'P0029', 'P0086', 'P0023', 'P0036', 'P0053', 'P0094', 'P0011', 'P0066',
       'P0038', 'P0080', 'P0025', 'P0008', 'P0010', 'P0091', 'P0018', 'P0085',
       'P0082', 'P0056', 'P0059', 'P0063', 'P0065', 'P0006', 'P0068', 'P0072',
       'P0089', 'P0090', 'P0081', 'P0073', 'P0075', 'P0074', 'P0000', 'P0015',
       'P0061', 'P0045', 'P0026', 'P0034', 'P0095', 'P0016', 'P0022', 'P0024',
       'P0067', 'P0057', 'P0035', 'P0037', 'P0039', 'P0031', 'P0084', 'P0019',
       'P0017', 'P0092', 'P0093', 'P0043', 'P0096', 'P0097', 'P0076', 'P0069'],
      dtype='object', name='ProductID')


In [51]:
# Implement evaluation metrics like Precision@K and Coverage
# Example: compare recommended vs actual rated items

from sklearn.metrics import precision_score

def precision_at_k(user_id, k=1):
    """Compute Precision@K for recommendations."""
    
    # Get top K recommendations
    recommended_products = recommend_products(user_id, k)
    
    # Get actual products the user rated highly (e.g., rating >= 4)
    relevant_products = set(user_item_matrix_filled.loc[user_id][user_item_matrix_filled.loc[user_id] >= 1].index)
    
    # Compute precision
    hits = len(set(recommended_products) & relevant_products)
    print(hits)
    precision = hits / k
    
    return precision

# Compute average Precision@K for all users
precision_scores = [precision_at_k(user, k=5) for user in user_item_matrix_filled.index]
average_precision = np.mean(precision_scores)

print(f"Average Precision@5: {average_precision:.2f}")



0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
Average Precision@5: 0.00


In [46]:
from sklearn.model_selection import train_test_split

# Split the user data into train and test sets (e.g., 80/20 split)
train_data, test_data = train_test_split(user_data, test_size=0.2, random_state=42)

# Create the training user-item matrix
train_matrix = train_data.pivot_table(index='UserID', columns='ProductID', values='Rating').fillna(0)

# Recompute cosine similarity on the training matrix
train_similarity = cosine_similarity(train_matrix)
train_similarity_df = pd.DataFrame(train_similarity, index=train_matrix.index, columns=train_matrix.index)

def recommend_products_train(user_id, top_n=5, top_similar_users=5):
    if user_id not in train_matrix.index:
        return []
    
    similar_users = train_similarity_df[user_id].drop(index=user_id).sort_values(ascending=False)[:top_similar_users]
    recommended_items = pd.Series(dtype="float64")
    
    for similar_user in similar_users.index:
        similar_user_ratings = train_matrix.loc[similar_user]
        target_user_ratings = train_matrix.loc[user_id]
        
        # Recommend items that the target user hasn't seen in training
        unrated_products = target_user_ratings[target_user_ratings == 0].index
        recommendations = similar_user_ratings[unrated_products]
        recommended_items = recommended_items.add(recommendations, fill_value=0)
    
    return recommended_items.sort_values(ascending=False).index[:top_n]

def evaluate_user(user_id, k=5):
    recommended_products = set(recommend_products_train(user_id, top_n=k))
    # Use the test set to get actual interactions for the user
    actual_products = set(test_data[test_data['UserID'] == user_id]['ProductID'])
    hits = len(recommended_products & actual_products)
    precision = hits / k if k > 0 else 0
    return precision

# Evaluate a specific user
user_to_evaluate = train_matrix.index[0]
print(f"Precision@5 for {user_to_evaluate}: {evaluate_user(user_to_evaluate, 5):.2f}")


Precision@5 for U000: 0.00


In [27]:
def recommendation_coverage():
    """Compute the percentage of unique products recommended across all users."""
    
    recommended_products = set()
    
    for user in user_item_matrix_filled.index:
        recommended_products.update(recommend_products(user, top_n=5))
    
    total_products = len(user_item_matrix_filled.columns)  # Total unique products in dataset
    coverage = len(recommended_products) / total_products
    
    return coverage

# Compute coverage
coverage_score = recommendation_coverage()
print(f"Coverage of the recommender system: {coverage_score:.2%}")


Coverage of the recommender system: 81.00%


In [47]:
def compare_recommendations_vs_actual(user_id, top_n=5):
    """Compare recommended products with actual highly rated products for a user."""
    
    # Get top-N recommendations
    recommended_products = recommend_products(user_id, top_n)
    
    # Get actual products the user rated highly (e.g., rating >= 4)
    actual_rated_products = set(user_item_matrix_filled.loc[user_id][user_item_matrix_filled.loc[user_id] >= 4].index)
    
    # Print comparison
    print(f"User: {user_id}")
    print(f"Recommended Products: {recommended_products}")
    print(f"Actual Highly Rated Products: {list(actual_rated_products)}")
    
    # Compute intersection (true positives)
    matched_items = set(recommended_products) & actual_rated_products
    print(f"Matched Products: {list(matched_items)}")
    print(f"Precision: {len(matched_items) / top_n:.2f}\n")

# Run comparison for a few example users
for user in user_item_matrix_filled.index[:3]:  # First 3 users
    compare_recommendations_vs_actual(user)


User: U000
Recommended Products: Index(['P0083', 'P0052', 'P0064', 'P0088', 'P0055'], dtype='object', name='ProductID')
Actual Highly Rated Products: ['P0009', 'P0033', 'P0079', 'P0021', 'P0012', 'P0005', 'P0048', 'P0070']
Matched Products: []
Precision: 0.00

User: U001
Recommended Products: Index(['P0070', 'P0039', 'P0093', 'P0064', 'P0016'], dtype='object', name='ProductID')
Actual Highly Rated Products: ['P0037', 'P0051', 'P0054', 'P0091', 'P0030']
Matched Products: []
Precision: 0.00

User: U002
Recommended Products: Index(['P0070', 'P0033', 'P0003', 'P0091', 'P0048'], dtype='object', name='ProductID')
Actual Highly Rated Products: ['P0085', 'P0054', 'P0049', 'P0005', 'P0083']
Matched Products: []
Precision: 0.00



In [49]:
# Count the number of rated items (non-zero entries) per user
num_rated_items = (user_item_matrix_filled >= 1).sum(axis=1)

# Print the number of rated items for each user
print(num_rated_items)


UserID
U000    20
U001    13
U002    14
U003    11
U004    17
U005    10
U006    14
U007    11
U008    20
U009    19
U010    19
U011    16
U012    18
U013    18
U014    10
U015    14
U016    12
U017    18
U018    15
U019    15
U020    13
U021    10
U022    11
U023    11
U024    11
U025    10
U026    16
U027    15
U028    19
U029    18
U030    11
U031    15
U032    18
U033    14
U034    10
U035    20
U036    19
U037    15
U038    12
U039    13
U040    17
U041    15
U042    10
U043    11
U044    18
U045    11
U046    18
U047    14
U048    12
U049    13
dtype: int64


In [52]:
from sklearn.model_selection import train_test_split

# Split the data into training and test sets (e.g., 80/20 split)
train_data, test_data = train_test_split(user_data, test_size=0.2, random_state=42)


In [78]:
# Create training user-item matrix
train_matrix = train_data.pivot_table(index='UserID', columns='ProductID', values='Rating').fillna(0)

# Compute cosine similarity on training data
from sklearn.metrics.pairwise import cosine_similarity
train_similarity = cosine_similarity(train_matrix)
train_similarity_df = pd.DataFrame(train_similarity, index=train_matrix.index, columns=train_matrix.index)

def recommend_products_train(user_id, top_n=15, top_similar_users=15):
    if user_id not in train_matrix.index:
        return []
    
    similar_users = train_similarity_df[user_id].drop(index=user_id).sort_values(ascending=False)[:top_similar_users]
    recommended_items = pd.Series(dtype="float64")
    
    for similar_user in similar_users.index:
        similar_user_ratings = train_matrix.loc[similar_user]
        target_user_ratings = train_matrix.loc[user_id]
        
        # Recommend items that the target user hasn't interacted with in training
        unrated_products = target_user_ratings[target_user_ratings >= 0].index
        recommendations = similar_user_ratings[unrated_products]
        recommended_items = recommended_items.add(recommendations, fill_value=0)
    
    return recommended_items.sort_values(ascending=False).index[:top_n]



In [79]:
def evaluate_user(user_id, k=5):
    # Generate recommendations from training data
    recommended_products = set(recommend_products_train(user_id, top_n=k))
    # Get actual items the user interacted with in the test set
    actual_products = set(test_data[test_data['UserID'] == user_id]['ProductID'])
    print(test_data[test_data['UserID'] == user_id]['ProductID'])
    print("Sss")
    print(train_data[train_data['UserID'] == user_id]['ProductID'])
    print(recommend_products_train(user_id, top_n=k))
                    
                
    hits = len(recommended_products & actual_products)
    #print(hits)
    precision = hits / k if k > 0 else 0
    return precision

# Evaluate Precision@5 for a specific user
user_to_evaluate = train_matrix.index[43]
print(f"Precision@5 for {user_to_evaluate}: {evaluate_user(user_to_evaluate, 10):.2f}")




627    P0079
631    P0022
Name: ProductID, dtype: object
Sss
632    P0058
635    P0060
633    P0066
630    P0094
628    P0075
637    P0097
629    P0073
634    P0044
636    P0078
Name: ProductID, dtype: object
Index(['P0088', 'P0060', 'P0044', 'P0064', 'P0073', 'P0070', 'P0030', 'P0094',
       'P0066', 'P0065'],
      dtype='object', name='ProductID')
Precision@5 for U043: 0.00


In [None]:
# Compute average Precision@5 for all users in training set (that are also present in test set)
precision_scores = [evaluate_user(user, k=10) for user in train_matrix.index if user in test_data['UserID'].unique()]
average_precision = np.mean(precision_scores)
print(f"Average Precision@5: {average_precision:.2f}")

## üîç Part 3: Association Rule Mining (Apriori)
Steps:
- Convert user-product interactions to transaction format
- Apply Apriori algorithm to find frequent itemsets
- Generate association rules (support, confidence, lift)

In [None]:
# Convert to transaction format
from mlxtend.preprocessing import TransactionEncoder
transactions = user_data.groupby('UserID')['ProductID'].apply(list).tolist()
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
df_trans = pd.DataFrame(te_ary, columns=te.columns_)

In [None]:
# Apply Apriori and generate rules
from mlxtend.frequent_patterns import apriori, association_rules
frequent_itemsets = apriori(df_trans, min_support=0.05, use_colnames=True)
rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.5)
rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']].head()

## üìä Part 4: Visualization
Steps:
- Plot user similarity heatmap
- Plot top frequent itemsets
- Visualize top recommendations

In [None]:
# Heatmap of user similarity
import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 8))
sns.heatmap(similarity_df, cmap='YlGnBu')
plt.title('User Similarity Heatmap')
plt.show()

In [None]:
# Frequent itemsets bar chart
frequent_itemsets.nlargest(10, 'support').plot(kind='bar', x='itemsets', y='support', legend=False)
plt.title('Top 10 Frequent Itemsets')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## üß† Part 5: Conceptual Questions
Answer the following questions in your report:
1. How does data sparsity affect performance?
2. What kinds of product bundles were discovered?
3. What improvements would you suggest for real-world deployment?