In [2]:
# Retailrocket Model Training

# 1. Import Libraries
import os
import pickle

import faiss
import lightgbm as lgb
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import ndcg_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Make sure the models directory exists
os.makedirs('../models', exist_ok=True)

# 2. Load Filtered Events
events = pd.read_csv('../data/filtered_events.csv')
print(events.head())

# 3. Prepare Interaction Matrix (for Collaborative Filtering)
# LabelEncoder maps the raw visitor/item ids to dense 0..n-1 integers so they
# can be used directly as row/column positions of the sparse matrix.
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

events['user_id_enc'] = user_encoder.fit_transform(events['visitorid'])
events['item_id_enc'] = item_encoder.fit_transform(events['itemid'])

n_users = len(user_encoder.classes_)
n_items = len(item_encoder.classes_)

# One unit per event row; csr_matrix sums duplicate (user, item) coordinates,
# so each cell ends up holding the number of events for that pair.
interaction_matrix = csr_matrix(
    (np.ones(events.shape[0]), (events['user_id_enc'], events['item_id_enc'])),
    shape=(n_users, n_items)
)

print(f"Interaction matrix: {interaction_matrix.shape}")

# 4. Train Simple Collaborative Filtering (Matrix Factorization Approx.)
svd = TruncatedSVD(n_components=50, random_state=42)
user_factors = svd.fit_transform(interaction_matrix)  # shape: (n_users, 50)
item_factors = svd.components_.T                      # shape: (n_items, 50)

# Save Encoders and Factors
# Context managers guarantee the pickle files are flushed and closed even if
# serialization fails part-way (the bare open() calls leaked the handles).
with open('../models/user_encoder.pkl', 'wb') as user_encoder_file:
    pickle.dump(user_encoder, user_encoder_file)
with open('../models/item_encoder.pkl', 'wb') as item_encoder_file:
    pickle.dump(item_encoder, item_encoder_file)
np.save('../models/user_factors.npy', user_factors)
np.save('../models/item_factors.npy', item_factors)

print("Collaborative Filtering model trained and saved.")



       timestamp  visitorid event  itemid  transactionid
0  1433224214164     992329  view  248676            NaN
1  1433223203944     125625  view   17655            NaN
2  1433222147345    1076270  view  262799            NaN
3  1433221377547    1153198  view  388242            NaN
4  1433223176926     629333  view  128394            NaN
Interaction matrix: (57734, 42725)
Collaborative Filtering model trained and saved.


In [1]:
# 5. Content-Based Filtering (using item metadata)
# NOTE: this section relies on the imports and on `item_encoder` from the
# first cell; running it on a fresh kernel fails with
# "NameError: name 'pd' is not defined". Run the notebook top to bottom.

# Properties that are one-hot encoded into the content vectors. Kept to
# categoryid (as in the original example) so the dense matrix handed to FAISS
# stays small; extend the list to add more properties.
CONTENT_PROPERTIES = ['categoryid']

item_properties = pd.read_csv('../data/item_properties_part1.csv')  # Merge with part2 if needed
item_properties = item_properties.dropna()

# Keep only rows we can actually use: selected properties, and items that
# occur in `events` (item_encoder.transform raises on unseen item ids).
usable_rows = (
    item_properties['property'].isin(CONTENT_PROPERTIES)
    & item_properties['itemid'].isin(item_encoder.classes_)
)
item_properties = item_properties[usable_rows]

# Take the latest value of each property for each item. De-duplicating on
# itemid alone would keep a single property row per item.
item_properties_latest = (
    item_properties
    .sort_values('timestamp')
    .drop_duplicates(['itemid', 'property'], keep='last')
)

# One row per item, one column per property
item_metadata = item_properties_latest[['itemid', 'property', 'value']].pivot_table(
    index='itemid', columns='property', values='value', aggfunc='first'
)

item_metadata = item_metadata.fillna('unknown')  # Fill missing

# One-hot encode the property values element-wise. astype(str) keeps the
# itemid index intact; float32 is the dtype FAISS expects.
item_metadata_encoded = pd.get_dummies(item_metadata.astype(str), dtype=np.float32)

# Match items with encoded ids and order rows by encoded id
item_metadata_encoded = item_metadata_encoded.reset_index()
item_metadata_encoded['item_id_enc'] = item_encoder.transform(item_metadata_encoded['itemid'])
item_metadata_encoded = item_metadata_encoded.sort_values('item_id_enc').reset_index(drop=True)

item_feature_matrix = csr_matrix(
    item_metadata_encoded.drop(['itemid', 'item_id_enc'], axis=1).to_numpy(dtype=np.float32)
)

if item_feature_matrix.shape[0] == 0 or item_feature_matrix.shape[1] == 0:
    raise ValueError(
        "No item metadata available for the items in `events`; "
        "check CONTENT_PROPERTIES and the item properties file."
    )

# Save FAISS index for fast similarity search
d = item_feature_matrix.shape[1]
index = faiss.IndexFlatL2(d)
# FAISS requires a C-contiguous float32 array
index.add(np.ascontiguousarray(item_feature_matrix.toarray(), dtype=np.float32))

faiss.write_index(index, "../models/item_content_index.faiss")
# Row i of the index corresponds to item_content_item_ids[i]; not every item
# has metadata, so index positions are NOT the same as item_id_enc.
np.save('../models/item_content_item_ids.npy', item_metadata_encoded['item_id_enc'].to_numpy())
print("Content-Based model (FAISS index) trained and saved.")

# 6. Hybrid Recommender (Weighted Fusion)
def hybrid_score(user_vector, item_vector_cf, item_vector_content, alpha=0.5):
    """Blend a collaborative-filtering score with a content-based score.

    Both scores are dot products against the same ``user_vector``, so it must
    be dot-compatible with ``item_vector_cf`` and ``item_vector_content``.

    Args:
        user_vector: Vector representing the user.
        item_vector_cf: Collaborative-filtering vector of the item.
        item_vector_content: Content-based vector of the item.
        alpha: Weight of the CF score; the content score gets ``1 - alpha``.

    Returns:
        ``alpha * cf_score + (1 - alpha) * content_score``.
    """
    cf_score = np.dot(user_vector, item_vector_cf)
    content_score = np.dot(user_vector, item_vector_content)
    return alpha * cf_score + (1 - alpha) * content_score

# 7. Prepare Data for Learning-to-Rank (LTR)
# Graded relevance derived from the event type:
# transaction (3) > addtocart (2) > view (1); any other event maps to 0.
# lambdarank needs integer labels, hence the explicit cast after fillna.
events['event_weight'] = events['event'].map({
    'view': 1,
    'addtocart': 2,
    'transaction': 3
}).fillna(0).astype(int)

X = events[['user_id_enc', 'item_id_enc']]
y = events['event_weight']

# Train/test split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


def _sort_by_user(features, labels):
    """Return features and labels reordered so each user's rows are contiguous."""
    order = np.argsort(features['user_id_enc'].to_numpy(), kind='stable')
    return features.iloc[order], labels.iloc[order]


# LightGBM interprets `group` as sizes of consecutive row blocks, one per query.
# train_test_split shuffles the rows, so they must be sorted by user first;
# otherwise the sizes below would not line up with the actual rows.
X_train, y_train = _sort_by_user(X_train, y_train)
X_val, y_val = _sort_by_user(X_val, y_val)

# Group sizes in ascending user order, matching the row order above
train_group = X_train.groupby('user_id_enc', sort=True).size().to_numpy()
val_group = X_val.groupby('user_id_enc', sort=True).size().to_numpy()

lgb_train = lgb.Dataset(X_train, label=y_train, group=train_group)
lgb_val = lgb.Dataset(X_val, label=y_val, group=val_group, reference=lgb_train)

params = {
    'objective': 'lambdarank',
    'metric': 'ndcg',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'verbose': -1
}

# Early stopping is passed as a callback: the `early_stopping_rounds` keyword
# of lgb.train was removed in LightGBM 4.0.
ltr_model = lgb.train(
    params,
    lgb_train,
    valid_sets=[lgb_val],
    num_boost_round=100,
    callbacks=[lgb.early_stopping(stopping_rounds=10)],
)

# Save LTR model
ltr_model.save_model('../models/ltr_model.txt')

print("LTR model trained and saved.")

# 8. Quick Evaluation
# Score every validation row with the trained ranker
y_pred_val = ltr_model.predict(X_val)

# Attach labels (aligned on index) and predictions (positional) to the features
grouped = X_val.assign(y_true=y_val, y_pred=y_pred_val)

# Per-user NDCG; users with a single validation row are skipped because a
# one-element ranking cannot be scored meaningfully.
ndcg_per_user = [
    ndcg_score(
        user_rows['y_true'].to_numpy().reshape(1, -1),
        user_rows['y_pred'].to_numpy().reshape(1, -1),
    )
    for _, user_rows in grouped.groupby('user_id_enc')
    if len(user_rows) > 1
]

mean_ndcg = np.mean(ndcg_per_user)
print(f"Validation NDCG: {mean_ndcg:.4f}")

NameError: name 'pd' is not defined