## Load the Data

In [1]:
import pandas as pd
import numpy as np

orders = pd.read_csv("orders.csv")
order_products = pd.read_csv("order_products__prior.csv")
products = pd.read_csv("products.csv")

## Merged orders with products

In [2]:
df = (
    order_products
    .merge(orders[["order_id", "user_id"]], on="order_id", how="left")
    .merge(products[["product_id", "product_name"]], on="product_id", how="left")
)

In [3]:
df

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,product_name
0,2,33120.0,1.0,1.0,202279,Organic Egg Whites
1,2,28985.0,2.0,1.0,202279,Michigan Organic Kale
2,2,9327.0,3.0,0.0,202279,Garlic Powder
3,2,45918.0,4.0,1.0,202279,Coconut Butter
4,2,30035.0,5.0,0.0,202279,Natural Sweetener
...,...,...,...,...,...,...
16310880,1720834,30662.0,3.0,0.0,120245,Cranberry Watermelon Probiotic Juice Drink
16310881,1720834,26751.0,4.0,0.0,120245,Organic Tomato Basil Sauce
16310882,1720835,33000.0,1.0,1.0,178580,Pure Irish Butter
16310883,1720835,2855.0,2.0,1.0,178580,Organic Good Seed Bread


## Filter the Data

## Keeping only active users

In [4]:
user_counts = df["user_id"].value_counts()
active_users = user_counts[user_counts >= 20].index

df = df[df["user_id"].isin(active_users)]

## Keeping popular products

In [7]:
product_counts = df["product_id"].value_counts()
popular_products = product_counts[product_counts >= 50].index

df = df[df["product_id"].isin(popular_products)]

## This improves:

## speed

## recommendation quality

## interpretability

## Encoding users and products as integers

In [16]:
from sklearn.preprocessing import LabelEncoder

user_encoder = LabelEncoder()
product_encoder = LabelEncoder()

df["user_idx"] = user_encoder.fit_transform(df["user_id"])
df["product_idx"] = product_encoder.fit_transform(df["product_id"])

## Building a sparse interaction matrix

In [17]:
from scipy.sparse import csr_matrix

rows = df["user_idx"]
cols = df["product_idx"]
data = np.ones(len(df))   # implicit feedback = 1

user_item_sparse = csr_matrix(
    (data, (rows, cols)),
    shape=(df["user_idx"].nunique(), df["product_idx"].nunique())
)

## Computing item–item similarity

In [19]:
from sklearn.metrics.pairwise import cosine_similarity

item_similarity = cosine_similarity(user_item_sparse.T, dense_output=False)

## Simple recommendation function

In [23]:
def recommend_products(user_id, n=5):
    # 1) Validating user
    if user_id not in set(user_encoder.classes_):
        return pd.DataFrame(columns=["product_id", "product_name"])

    # 2) Getting user's sparse row
    user_idx = user_encoder.transform([user_id])[0]
    user_vector = user_item_sparse[user_idx]  # 1 x num_items sparse row

    # 3) Computing item scores (sparse -> dense 1D array)
    scores_sparse = user_vector.dot(item_similarity)   # 1 x num_items (sparse)
    scores = scores_sparse.toarray().ravel()           # numeric 1D array

    # 4) Remove already purchased items
    purchased = user_vector.nonzero()[1]
    scores[purchased] = 0

    # 5) Top-N indices
    top_idx = scores.argsort()[-n:][::-1]
    top_product_ids = product_encoder.inverse_transform(top_idx)

    # 6) Returning product names
    return products.loc[products["product_id"].isin(top_product_ids), ["product_id", "product_name"]]

## Testing

In [27]:
sample_user = df["user_id"].iloc[0]
recommend_products(sample_user, n=5)

Unnamed: 0,product_id,product_name
21136,21137,Organic Strawberries
21902,21903,Organic Baby Spinach
22934,22935,Organic Yellow Onion
24963,24964,Organic Garlic
47208,47209,Organic Hass Avocado


## Splitting interactions

In [28]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(
    df[["user_id", "product_id"]],
    test_size=0.2,
    random_state=42
)

## Precision K

In [29]:
def precision_at_k(user_id, k=5):
    true_products = test_df[test_df["user_id"] == user_id]["product_id"].unique()
    if len(true_products) == 0:
        return None

    recs = recommend_products(user_id, n=k)
    if len(recs) == 0:
        return None

    recommended_products = recs["product_id"].values
    hits = len(set(recommended_products) & set(true_products))

    return hits / k

## Evaluation on sample users

In [30]:
scores = []
for user in test_df["user_id"].unique()[:100]:
    p = precision_at_k(user, k=5)
    if p is not None:
        scores.append(p)

np.mean(scores)

np.float64(0.0)

## Exporting Results

In [31]:
df.to_csv("instacart_transactions.csv", index=False)