In [3]:
from pathlib import Path

from google.colab import files

# Only prompt for an upload when one of the expected split files is missing.
# Re-uploading a file that already exists makes Colab store it under a
# suffixed name (e.g. "Office_Products.test.csv (2).gz"), so the loading cell,
# which opens the unsuffixed names, would keep reading the older copy anyway.
# Skipping the prompt also lets the notebook run top to bottom unattended.
_required_files = [
    'Office_Products.train.csv.gz',
    'Office_Products.valid.csv.gz',
    'Office_Products.test.csv.gz',
]
_missing_files = [name for name in _required_files if not Path(name).exists()]

# `uploaded` keeps its original meaning (the mapping returned by files.upload());
# it is an empty dict when nothing had to be uploaded.
uploaded = files.upload() if _missing_files else {}


Saving Office_Products.test.csv.gz to Office_Products.test.csv (2).gz
Saving Office_Products.valid.csv.gz to Office_Products.valid.csv (1).gz
Saving Office_Products.train.csv.gz to Office_Products.train.csv (1).gz


In [4]:
import pandas as pd

# Step 2: Read the three gzip-compressed splits from the working directory.
_split_names = ('train', 'valid', 'test')
_loaded = {
    split: pd.read_csv(f'Office_Products.{split}.csv.gz', compression='gzip')
    for split in _split_names
}
train_df, valid_df, test_df = (_loaded[split] for split in _split_names)

# Step 3: Parse the epoch-millisecond 'timestamp' column in place.
# errors='coerce' turns unparseable values into NaT instead of raising.
for _frame in (train_df, valid_df, test_df):
    _frame['timestamp'] = pd.to_datetime(_frame['timestamp'], unit='ms', errors='coerce')

_labelled_frames = (('Train', train_df), ('Valid', valid_df), ('Test', test_df))

# Optional sanity check: earliest and latest timestamp per split.
for _label, _frame in _labelled_frames:
    _stamps = _frame['timestamp']
    print(f"{_label} timestamp range:", _stamps.min(), "to", _stamps.max())

# Step 4: Keep only rows dated 2020 or later (NaT rows compare False and drop out).
filtered_train_df, filtered_valid_df, filtered_test_df = (
    _frame[_frame['timestamp'].dt.year >= 2020] for _, _frame in _labelled_frames
)
_filtered_frames = (filtered_train_df, filtered_valid_df, filtered_test_df)

# Step 5: Row counts before and after the year filter.
# The first line of each report section starts with a blank line.
for _position, ((_label, _frame), _kept) in enumerate(zip(_labelled_frames, _filtered_frames)):
    _lead = "\n" if _position == 0 else ""
    print(f"{_lead}{_label} records before: {len(_frame)}, after (2020+): {len(_kept)}")

# Step 6: Distinct users and products remaining in each filtered split.
for _position, ((_label, _), _kept) in enumerate(zip(_labelled_frames, _filtered_frames)):
    _lead = "\n" if _position == 0 else ""
    print(f"{_lead}[{_label} Set] Unique users: {_kept['user_id'].nunique()}, products: {_kept['parent_asin'].nunique()}")


Train timestamp range: 1999-08-02 15:44:22 to 2023-09-01 14:00:00.316000
Valid timestamp range: 2002-04-08 18:24:33 to 2023-09-01 14:00:33.644000
Test timestamp range: 2002-11-20 07:14:04 to 2023-09-09 15:15:13.298000

Train records before: 1354262, after (2020+): 420333
Valid records before: 223308, after (2020+): 119969
Test records before: 223308, after (2020+): 154000

[Train Set] Unique users: 95062, products: 55137
[Valid Set] Unique users: 119969, products: 38164
[Test Set] Unique users: 154000, products: 40356


In [None]:
# Disabled cell: everything below sits inside a triple-quoted string, so none
# of it runs. The text holds the commands to pin NumPy to 1.24.4 and then kill
# the current process (os.kill with signal 9) to force a runtime restart.
'''
# Downgrade NumPy to version 1.x
!pip install numpy==1.24.4 --quiet

# Restart the Colab runtime to apply changes
import os
os.kill(os.getpid(), 9)
'''


Build a user-based collaborative filtering recommender using K-Nearest Neighbors (KNN) on explicit ratings.

In [1]:
# Disabled cell: the scikit-surprise install command is wrapped in a string
# literal, so nothing is installed when this cell runs.
'''
!pip install scikit-surprise
'''




In [None]:
# Disabled cell: the whole experiment below is a string literal and never runs.
# It sketches a user-based KNN recommender (Surprise KNNBasic, cosine
# similarity, k=20) fitted on the full filtered training split, followed by a
# single example prediction.
# NOTE(review): if this is re-enabled, `Dataset` here is surprise's class, while
# a later cell imports `Dataset` from lightfm.data under the same name; the
# two would shadow each other depending on execution order.
'''
from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split, cross_validate

# Step 1: Define the reader with your actual rating scale
reader = Reader(rating_scale=(1, 5))

# Step 2: Load data into Surprise format
data = Dataset.load_from_df(filtered_train_df[['user_id', 'parent_asin', 'rating']], reader)

# Step 3: Build the full trainset
trainset = data.build_full_trainset()

# Step 4: Define the user-based collaborative filtering model
sim_options = {
    'name': 'cosine',      # similarity measure: cosine or pearson
    'user_based': True     # this means user-user filtering
}
model = KNNBasic(k=20, sim_options=sim_options)
model.fit(trainset)

# Step 5: Try a prediction
uid = filtered_train_df['user_id'].iloc[0]
iid = filtered_train_df['parent_asin'].iloc[0]
pred = model.predict(uid, iid)
print(f"Predicted rating for user {uid} on item {iid}: {pred.est:.2f}")
'''


In [5]:
!pip install lightfm --quiet


In [10]:
# Step 2: Prepare and Train LightFM model
import pandas as pd
from lightfm import LightFM
from lightfm.data import Dataset

# Register every user id and item id seen in the 2020+ training split so that
# LightFM can map the raw string ids to its own contiguous internal indices.
dataset = Dataset()
dataset.fit(filtered_train_df['user_id'], filtered_train_df['parent_asin'])

# Build the interaction matrices from (user, item, rating) triples.
# build_interactions returns two sparse matrices: `interactions` records which
# (user, item) pairs were observed, and `weights` carries the value supplied as
# the third tuple element (here the star rating).
# The columns are zipped directly instead of using DataFrame.iterrows(), which
# materialises a Series per row and is needlessly slow on a frame of this size.
(interactions, weights) = dataset.build_interactions(
    zip(
        filtered_train_df['user_id'],
        filtered_train_df['parent_asin'],
        filtered_train_df['rating'],
    )
)

# Train with logistic loss, using the ratings as per-interaction sample weights.
# NOTE(review): LightFM documents the logistic loss as intended for data with
# both positive (1) and negative (-1) interactions. Here every observed pair
# appears to be a positive and the rating only scales its weight, so the
# model's scores should not be read as values on the 1-5 rating scale.
# random_state makes initialisation repeatable across runs; with
# num_threads > 1 the updates may still differ slightly between runs.
model = LightFM(loss='logistic', random_state=42)
model.fit(interactions, sample_weight=weights, epochs=10, num_threads=2)



<lightfm.lightfm.LightFM at 0x78f97e45ddd0>

<lightfm.lightfm.LightFM at 0x78f9769954d0>

In [12]:
# Step 3: Score one (user, item) pair with the trained model.

# dataset.mapping() yields the four raw-id -> internal-index dictionaries.
user_id_map, user_feature_map, item_id_map, item_feature_map = dataset.mapping()

# Use the first user and the first item registered in the mappings as the example pair.
uid = list(user_id_map)[0]
iid = list(item_id_map)[0]

# Translate the raw ids into the internal indices the model works with.
user_index, item_index = user_id_map[uid], item_id_map[iid]

# predict() takes parallel sequences of user and item indices, hence the one-element lists.
score = model.predict([user_index], [item_index])

# NOTE(review): the label below says "(SVD)" and "rating", but `model` is a
# LightFM model trained with logistic loss, so this value is presumably an
# unbounded relevance score rather than a 1-5 rating. Confirm the wording.
print(f"(SVD)Predicted rating (score) for user '{uid}' and item '{iid}': {score[0]:.2f}")



(SVD)Predicted rating (score) for user 'AFKZENTNBQ7A7V7UXW5JJI6UGRYQ' and item 'B098K24779': 2.74


In [13]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# === Step 1: Extract user/item embeddings ===
# Learned latent vectors, one row per internal user / item index.
user_embeddings = model.user_embeddings
item_embeddings = model.item_embeddings

# === Step 2: Pick a target user or item ===
# The first user registered in user_id_map serves as the example target.
uid = list(user_id_map.keys())[0]
user_index = user_id_map[uid]

# === Step 3: Compute cosine similarity between this user and all other users ===
# cosine_similarity expects 2-D inputs, so the target vector becomes one row.
user_vec = user_embeddings[user_index].reshape(1, -1)
similarities = cosine_similarity(user_vec, user_embeddings)[0]  # shape: (num_users,)

# === Step 4: Get top K similar users (excluding the user itself) ===
K = 5
# Rank every user from most to least similar, then drop the target user by
# index. Simply skipping the first ranked entry is unsafe: ties or rounding
# can place another user ahead of the target, which would leave the target in
# its own neighbour list and drop a genuine neighbour instead.
ranked_user_indices = np.argsort(similarities)[::-1]
similar_user_indices = ranked_user_indices[ranked_user_indices != user_index][:K]

# === Step 5: Map internal indices back to user_ids and print ===
inv_user_id_map = {v: k for k, v in user_id_map.items()}

print(f"\nTop {K} similar users to '{uid}':")
for neighbor_idx in similar_user_indices:
    neighbor_uid = inv_user_id_map[neighbor_idx]
    sim_score = similarities[neighbor_idx]
    print(f"User '{neighbor_uid}' | Cosine Similarity: {sim_score:.4f}")



Top 5 similar users to 'AFKZENTNBQ7A7V7UXW5JJI6UGRYQ':
User 'AEMMD7M4TPVOKOG4WTVGFQMFAI6A' | Cosine Similarity: 0.9392
User 'AFXV5R4UZHUHQBDWCFUMPZVER2ZA' | Cosine Similarity: 0.9389
User 'AFZ4D3N3EZQSAPZVHYAKBSMI2EPQ' | Cosine Similarity: 0.9262
User 'AFZNJEXXZJSBG3VCOOAMQZGMBPZA' | Cosine Similarity: 0.9210
User 'AHGBHWTHNCKNCA6AGTTN32YY7RFQ' | Cosine Similarity: 0.9145


In [14]:
# Score the target user (`user_index`, set in an earlier cell) against a single
# item, addressed by its internal item index rather than by its ASIN.
# NOTE(review): the stated goal was to score items liked by the nearest
# neighbour, but no neighbour lookup happens here; item index 0 is a fixed
# placeholder.
item_index = 0  # for example
pred_score = model.predict([user_index], [item_index])[0]
print(f"Predicted score for user '{uid}' on item index {item_index}: {pred_score:.2f}")


Predicted score for user 'AFKZENTNBQ7A7V7UXW5JJI6UGRYQ' on item index 0: 2.74
