Popularity-Based Recommender

In [2]:
# Step 1: Import libraries
import pandas as pd

# Step 2: Load datasets
orders_merged = pd.read_csv("orders_merged.csv")
products_cleaned = pd.read_csv("products_cleaned.csv")

# Step 3: Merge orders with products to get product names
order_details = pd.merge(orders_merged, products_cleaned, on='product_id', how='left')

# Step 4: Calculate product popularity
product_popularity = order_details.groupby('product_name').size().reset_index(name='purchase_count')

# Step 5: Sort by popularity
top_products = product_popularity.sort_values(by='purchase_count', ascending=False)

# Step 6: Display top 10 products
print("Top 10 Popular Products:")
print(top_products.head(10))


Top 10 Popular Products:
                 product_name  purchase_count
3677                   Banana          491291
3472   Bag of Organic Bananas          394930
31923    Organic Strawberries          275577
28843    Organic Baby Spinach          251705
30300    Organic Hass Avocado          220877
28807         Organic Avocado          184224
22415             Large Lemon          160792
42908            Strawberries          149445
23422                   Limes          146660
32481      Organic Whole Milk          142813


ALS stands for Alternating Least Squares — it’s a popular algorithm for collaborative filtering in recommendation systems.

Here’s the simple idea:

    You have a big table (matrix) of users × items (e.g., customers × products).
    
    Most of the cells are empty because users only rate/buy a few items.
    
    ALS tries to fill in the missing cells by finding patterns in what similar users like.

Why it’s used:

    Handles large sparse datasets well (like e-commerce purchase history).
    
    Works even without explicit ratings (can use purchase counts or implicit feedback).
    
    Scales well for big data with frameworks like Spark or implicit library in Python.

In [3]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares
from tabulate import tabulate  # pip install tabulate

# -------------------------------
# 1. Load your data
# -------------------------------
# Example dummy data (replace with Instacart dataset)
interactions = pd.DataFrame({
    'user_id': [1, 1, 2, 2, 3, 4],
    'product_id': [101, 102, 101, 103, 104, 105],
    'purchase_count': [3, 1, 2, 5, 4, 2]
})

products = pd.DataFrame({
    'product_id': [101, 102, 103, 104, 105],
    'product_name': ['Bananas', 'Apples', 'Milk', 'Bread', 'Eggs']
})

# -------------------------------
# 2. Build index mappings
# -------------------------------
user_index = interactions['user_id'].unique()
item_index = interactions['product_id'].unique()

user_index_map = {u: i for i, u in enumerate(user_index)}
item_index_map = {p: i for i, p in enumerate(item_index)}

# -------------------------------
# 3. Build user-item CSR matrix
# -------------------------------
row = interactions['user_id'].map(user_index_map)
col = interactions['product_id'].map(item_index_map)
data = interactions['purchase_count']

user_item_csr = csr_matrix((data, (row, col)), shape=(len(user_index), len(item_index)))

# -------------------------------
# 4. Train ALS model
# -------------------------------
model = AlternatingLeastSquares(factors=20, regularization=0.1, iterations=20)
model.fit(user_item_csr)

# -------------------------------
# 5. Precompute popularity fallback
# -------------------------------
product_popularity = (
    interactions.groupby('product_id')['purchase_count']
    .sum()
    .sort_values(ascending=False)
    .index[:50]  # top 50 popular products
)

item_index = np.array(item_index)  # ensure numpy for position indexing

# -------------------------------
# 6. Safe recommendation function
# -------------------------------
def recommend_als_safe(raw_user_id, N=10):
    # Cold-start: user not in training set
    if raw_user_id not in user_index:
        recs = [
            (pid, products.loc[products['product_id'] == pid, 'product_name'].values[0], None)
            for pid in product_popularity[:N]
        ]
        print(tabulate(
            [(i+1, pid, name, score) for i, (pid, name, score) in enumerate(recs)],
            headers=["Rank", "Product ID", "Product Name", "Score"],
            tablefmt="grid"
        ))
        return recs

    # Map raw user ID to ALS index
    uidx = np.where(user_index == raw_user_id)[0][0]

    # Get ALS recommendations
    ids, scores = model.recommend(uidx, user_item_csr[uidx], N=N, filter_already_liked_items=True)

    # Map back to product IDs
    prod_ids = [item_index[i] for i in ids]

    # Match with product names
    prod_df = products.set_index('product_id')
    recs = [
        (pid,
         prod_df.loc[pid, 'product_name'] if pid in prod_df.index else None,
         score)
        for pid, score in zip(prod_ids, scores)
    ]

    print(tabulate(
        [(i+1, pid, name, score) for i, (pid, name, score) in enumerate(recs)],
        headers=["Rank", "Product ID", "Product Name", "Score"],
        tablefmt="grid"
    ))
    return recs

# -------------------------------
# 7. Test the function
# -------------------------------
print("Existing user (user_id=1):")
recommend_als_safe(1, N=5)

print("\nCold start user (user_id=999):")
recommend_als_safe(999, N=5)


  check_blas_config()


  0%|          | 0/20 [00:00<?, ?it/s]

Existing user (user_id=1):
+--------+--------------+----------------+--------------+
|   Rank |   Product ID | Product Name   |        Score |
|      1 |          103 | Milk           |  0.0196556   |
+--------+--------------+----------------+--------------+
|      2 |          104 | Bread          |  6.60792e-05 |
+--------+--------------+----------------+--------------+
|      3 |          105 | Eggs           |  2.54475e-05 |
+--------+--------------+----------------+--------------+
|      4 |          102 | Apples         | -3.40282e+38 |
+--------+--------------+----------------+--------------+
|      5 |          101 | Bananas        | -3.40282e+38 |
+--------+--------------+----------------+--------------+

Cold start user (user_id=999):
+--------+--------------+----------------+---------+
|   Rank |   Product ID | Product Name   | Score   |
|      1 |          101 | Bananas        |         |
+--------+--------------+----------------+---------+
|      2 |          103 | Milk   

[(101, 'Bananas', None),
 (103, 'Milk', None),
 (104, 'Bread', None),
 (105, 'Eggs', None),
 (102, 'Apples', None)]

Collaborative Filtering 


    1.User-Based Collaborative Filtering (UBCF) – finds users similar to the target user and recommends items those similar users liked.

    2.Item-Based Collaborative Filtering (IBCF) – finds items similar to the ones the target user liked and recommends them.

In [4]:
# Item-Based Collaborative Filtering (Memory-Safe Version)

import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

# ===== 1. Load your cleaned & merged data =====
# Replace with your actual file path
df = pd.read_csv("user_product_interactions.csv")

# Ensure correct column names
# Must contain: user_id, product_id
df = df[['user_id', 'product_id']]

# ===== 2. Create User-Item Sparse Matrix =====
# Map IDs to index positions
user_ids = df['user_id'].astype("category").cat.codes
item_ids = df['product_id'].astype("category").cat.codes

# Sparse matrix: rows = users, cols = items
user_item_matrix = csr_matrix(
    ( [1]*len(df), (user_ids, item_ids) )
)

# Reverse mapping for later
item_id_map = dict(enumerate(df['product_id'].astype("category").cat.categories))

# ===== 3. Function: Get Top-N Similar Items =====
def get_top_n_similar_items(target_item_id, top_n=5):
    target_idx = list(item_id_map.keys())[list(item_id_map.values()).index(target_item_id)]
    
    # Get column vector for target item
    target_vector = user_item_matrix[:, target_idx]
    
    # Compute cosine similarity with all other items
    sims = cosine_similarity(target_vector.T, user_item_matrix.T).flatten()
    
    # Get top N excluding self
    similar_indices = sims.argsort()[::-1][1:top_n+1]
    similar_scores = sims[similar_indices]
    
    # Map indices back to product IDs
    similar_items = [(item_id_map[i], score) for i, score in zip(similar_indices, similar_scores)]
    
    return similar_items

# ===== 4. Example Usage =====
target_product = df['product_id'].iloc[0]
print(f"Top 5 items similar to Product {target_product}:")
print(get_top_n_similar_items(target_product, top_n=5))


Top 5 items similar to Product 196:
[(46149, np.float64(0.2870940256440365)), (37710, np.float64(0.2502117928605355)), (6184, np.float64(0.22958867591648727)), (41400, np.float64(0.22715170546482377)), (38928, np.float64(0.20335099832315626))]


In [8]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

# -----------------------
# 1. Load & Prepare Data
# -----------------------
data = pd.read_csv("orders_merged.csv")

# 🔹 Optional: work only with first 5000 users to debug
subset_users = data['user_id'].drop_duplicates().head(5000)
data = data[data['user_id'].isin(subset_users)]

# Encode user_id and product_id as category codes
user_cat = data['user_id'].astype("category")
item_cat = data['product_id'].astype("category")

user_ids = user_cat.cat.codes
item_ids = item_cat.cat.codes

# Keep mapping dictionaries for decoding later
user_id_map = dict(enumerate(user_cat.cat.categories))     # code → real user_id
product_id_map = dict(enumerate(item_cat.cat.categories))  # code → real product_id

# If you have product metadata (names), load it
products = pd.read_csv("products_cleaned.csv")  # must have product_id, product_name
product_lookup = products.set_index("product_id")["product_name"].to_dict()

# Sparse matrix
values = [1] * len(data)
interaction_matrix = csr_matrix(
    (values, (user_ids, item_ids)),
    shape=(len(user_id_map), len(product_id_map))
)

# -----------------------
# 2. Recommend for One User
# -----------------------
def recommend_for_user(user_index, top_n=5, top_k_users=50):
    """Recommend items for a given user without building full similarity matrix."""

    # Target user's vector
    user_vector = interaction_matrix[user_index]

    # Cosine similarity with all users
    similarities = cosine_similarity(user_vector, interaction_matrix).ravel()

    # Exclude self
    similarities[user_index] = 0

    # Get top-k similar users
    top_users = similarities.argsort()[::-1][:top_k_users]

    # Aggregate items from similar users
    similar_items = interaction_matrix[top_users].sum(axis=0)
    similar_items = np.array(similar_items).ravel()

    # Remove already purchased items
    purchased_items = user_vector.toarray().ravel()
    similar_items[purchased_items > 0] = 0

    # Recommend top-N (item indices)
    top_items = similar_items.argsort()[::-1][:top_n]

    # Convert to real product IDs + names
    recs = []
    for idx in top_items:
        pid = product_id_map[idx]
        pname = product_lookup.get(pid, "Unknown Product")
        recs.append((pid, pname))

    return recs

# Example: Recommend for first user in subset
print("Recommendations for user:", user_id_map[0])
print(recommend_for_user(0, top_n=5))


Recommendations for user: 66
[(24852, 'Banana'), (26209, 'Limes'), (47626, 'Large Lemon'), (20114, 'Jalapeno Peppers'), (29487, 'Roma Tomato')]


In [10]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# 1. Load product metadata
products = pd.read_csv("products_cleaned.csv")

# 2. Create combined text field
products['products_cleaned'] = (
    products['product_name'].astype(str) + " " +
    products['aisle'].astype(str) + " " +
    products['department'].astype(str)
)

# 3. Drop duplicates
product_metadata = products[['product_id', 'products_cleaned']].drop_duplicates().reset_index(drop=True)

# 4. TF-IDF
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(product_metadata['products_cleaned'])  # stays sparse

# Map product_id ↔ row index
id_to_index = pd.Series(product_metadata.index, index=product_metadata['product_id'])
index_to_id = pd.Series(product_metadata['product_id'].values, index=product_metadata.index)

# 5. Recommend function (no full similarity matrix!)
def recommend_similar_products(product_id, top_n=5):
    if product_id not in id_to_index:
        return []
    
    idx = id_to_index[product_id]
    sim_scores = cosine_similarity(tfidf_matrix[idx], tfidf_matrix).ravel()
    top_indices = sim_scores.argsort()[::-1][1:top_n+1]
    recommended_ids = index_to_id[top_indices].tolist()
    
    # Map back to names
    results = products[products['product_id'].isin(recommended_ids)][['product_id', 'product_name', 'aisle', 'department']]
    return results

# Example
print("Content-based recommendations for 24852:")
print(recommend_similar_products(24852, top_n=5))



Content-based recommendations for 24852:
       product_id    product_name         aisle department
7888         7889      Red Banana  fresh fruits    produce
14926       14927       Blueberry  fresh fruits    produce
28554       28555         Coconut  fresh fruits    produce
30556       30557  Manzano Banana  fresh fruits    produce
37066       37067  Organic Banana  fresh fruits    produce
