In [2]:
import pandas as pd

# Read the raw event file and keep only the rows recorded as purchases
df = pd.read_csv('../data/raw/2019-Nov.csv')
df = df.loc[df['event_type'] == 'purchase']

# Build the interaction table from the two id columns; every purchase row
# carries the same implicit (binary) rating of 1
ratings_df = df[['user_id', 'product_id']].assign(rating=1)


In [3]:
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import SVD
from surprise.accuracy import rmse


In [4]:
# Tell Surprise the ratings live on a 0-1 scale, then wrap the
# (user, item, rating) columns as a Surprise dataset
reader = Reader(rating_scale=(0, 1))
data = Dataset.load_from_df(
    ratings_df.loc[:, ['user_id', 'product_id', 'rating']],
    reader,
)

# Hold out 20% of the interactions for evaluation (fixed seed)
trainset, testset = train_test_split(data, random_state=42, test_size=0.2)


In [5]:
# Seed the model so its random initialisation -- and therefore the fit, the
# RMSE below and every downstream recommendation -- is the same on each run.
# The seed matches the one already used for the train/test split.
algo = SVD(random_state=42)
algo.fit(trainset)

# Evaluate on the held-out split.
# NOTE(review): every row of ratings_df has rating == 1, so a near-zero RMSE
# only shows the model reproduces that constant; it is not evidence of
# ranking quality. Consider a ranking metric before trusting the output.
predictions = algo.test(testset)
rmse(predictions)


RMSE: 0.0303


0.030327912134463723

In [6]:
# Pick a user (seeded, so the same user is chosen on every Restart & Run All)
user_id = ratings_df['user_id'].sample(1, random_state=42).values[0]

# Get all unique products
all_products = ratings_df['product_id'].unique()

# Get products this user already purchased
purchased = ratings_df[ratings_df['user_id'] == user_id]['product_id'].values

# Predict scores for all unseen products.
# Membership is tested against a set: `pid not in <ndarray>` compares against
# the whole array for every candidate product, while a set lookup is O(1).
purchased_set = set(purchased)
unseen = [pid for pid in all_products if pid not in purchased_set]
preds = [algo.predict(user_id, pid) for pid in unseen]

# Sort predictions by estimated rating (stable sort: ties keep the order in
# which the products appear in all_products)
top_preds = sorted(preds, key=lambda x: x.est, reverse=True)[:5]

# Output recommendations as (product_id, estimated rating) pairs
recommendations = [(p.iid, p.est) for p in top_preds]
recommendations


[(26401669, 1), (1801881, 1), (5800823, 1), (30000218, 1), (12703106, 1)]

In [7]:
# The 100 users with the most purchase rows
top_users = ratings_df['user_id'].value_counts().head(100).index.tolist()

final_recs = []

for uid in top_users:
    # Products this user has already purchased
    seen = ratings_df[ratings_df['user_id'] == uid]['product_id'].values
    # Use a set for the membership test: `pid not in <ndarray>` compares
    # against the whole array for every candidate product, which adds up
    # across 100 users x all products.
    seen_set = set(seen)
    unseen = [pid for pid in all_products if pid not in seen_set]
    preds = [algo.predict(uid, pid) for pid in unseen]
    # Five highest estimated ratings (stable sort: ties keep catalogue order)
    top5 = sorted(preds, key=lambda x: x.est, reverse=True)[:5]
    for p in top5:
        final_recs.append({'user_id': p.uid, 'product_id': p.iid, 'score': p.est})

# Create DataFrame (one row per user/product recommendation) and persist it
recs_df = pd.DataFrame(final_recs)
recs_df.to_csv('../data/processed/recommendations.csv', index=False)
