In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:

# File paths
events_path = r"C:\Users\mbs-p\Desktop\E-commerce-Recommendation-System\Notebook\events_clean.csv"
item_props_path = r"C:\Users\mbs-p\Desktop\E-commerce-Recommendation-System\Notebook\item_properties_clean.csv"
category_path = r"C:\Users\mbs-p\Desktop\E-commerce-Recommendation-System\Raw Data\category_tree.csv"

# Load CSVs
events = pd.read_csv(events_path)
item_props = pd.read_csv(item_props_path)
category_tree = pd.read_csv(category_path)

print(events.head())
print(item_props.head())
print(category_tree.head())


                 timestamp  visitorid event  itemid  transactionid  hour
0  2015-06-02 05:02:12.117     257597  view  355908            NaN     5
1  2015-06-02 05:50:14.164     992329  view  248676            NaN     5
2  2015-06-02 05:13:19.827     111016  view  318965            NaN     5
3  2015-06-02 05:12:35.914     483717  view  253185            NaN     5
4  2015-06-02 05:02:17.106     951259  view  367447            NaN     5
   itemid  property  num_value
0  395014       400      552.0
1  395014       400   639502.0
2  395014       400        NaN
3  395014       400   424566.0
4   59481       790    15360.0
   categoryid  parentid
0        1016     213.0
1         809     169.0
2         570       9.0
3        1691     885.0
4         536    1691.0


In [3]:
# Assign weights to events (stronger signal for transactions)
weights = {"view": 1, "addtocart": 3, "transaction": 5}
events["event_strength"] = events["event"].map(weights)

# Drop NA itemid or visitorid
events = events.dropna(subset=["visitorid", "itemid"])

# Aggregate user-item interaction strength
user_item_strength = (
    events.groupby(["visitorid", "itemid"])["event_strength"]
    .sum()
    .reset_index()
)
print(user_item_strength.head())


   visitorid  itemid  event_strength
0          0   67045               1
1          0  285930               1
2          0  357564               1
3          1   72028               1
4          2  216305               2


In [4]:
# Merge item_properties into item_features
item_features = item_props.groupby("itemid")["num_value"] \
    .apply(lambda x: " ".join(x.dropna().astype(str))) \
    .reset_index()

# Prepare categories (convert to string for consistency)
item_categories = category_tree.rename(columns={"categoryid": "num_value", "parentid": "parent_category"})
item_categories["num_value"] = item_categories["num_value"].astype(str)

# Merge (now dtypes match)
item_features = item_features.merge(item_categories, on="num_value", how="left")

print(item_features.head())



   itemid                                          num_value  parent_category
0       0                                    3.168 1144008.0              NaN
1       1  5760.0 5760.0 5760.0 5760.0 6120.0 5760.0 6240...              NaN
2       2  192.0 145688.0 72.0 6000.0 1144008.0 41040.0 2...              NaN
3       3                                             1560.0              NaN
4       4                                               24.0              NaN


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

# Vectorize item features
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(item_features["num_value"].astype(str))

# Use Nearest Neighbors instead of full cosine_similarity
knn = NearestNeighbors(metric="cosine", algorithm="brute", n_neighbors=20, n_jobs=-1)
knn.fit(tfidf_matrix)

# Function to get top-k similar items
def get_similar_items(item_idx, top_k=10):
    distances, indices = knn.kneighbors(tfidf_matrix[item_idx], n_neighbors=top_k+1)
    return list(zip(indices.flatten()[1:], distances.flatten()[1:]))  # skip self


In [6]:
from scipy.sparse import coo_matrix

# Ensure no duplicates (aggregate first)
user_item_strength = (
    events.groupby(["visitorid", "itemid"])["event_strength"]
    .sum()
    .reset_index()
)

# Map users and items to indices
user_mapping = {u: i for i, u in enumerate(user_item_strength["visitorid"].unique())}
item_mapping = {i: j for j, i in enumerate(user_item_strength["itemid"].unique())}

# Convert ids to row/col indices
rows = user_item_strength["visitorid"].map(user_mapping)
cols = user_item_strength["itemid"].map(item_mapping)
data = user_item_strength["event_strength"]

# Build sparse matrix (users × items)
X = coo_matrix((data, (rows, cols)), 
               shape=(len(user_mapping), len(item_mapping)))

print("Sparse matrix shape:", X.shape)
print("Non-zero interactions:", X.nnz)


Sparse matrix shape: (1407580, 235061)
Non-zero interactions: 2145179


In [7]:
from sklearn.decomposition import TruncatedSVD

# Apply SVD directly on sparse matrix
svd = TruncatedSVD(n_components=50, random_state=42)
user_factors = svd.fit_transform(X)
item_factors = svd.components_.T

print("User factors shape:", user_factors.shape)
print("Item factors shape:", item_factors.shape)


User factors shape: (1407580, 50)
Item factors shape: (235061, 50)


In [8]:
from sklearn.neighbors import NearestNeighbors

# Already trained
knn = NearestNeighbors(metric="cosine", algorithm="brute", n_neighbors=50, n_jobs=-1)
knn.fit(tfidf_matrix)

def get_cb_scores(interacted_items_idx, n_items):
    cb_scores = np.zeros(n_items)
    for item_idx in interacted_items_idx:
        distances, indices = knn.kneighbors(tfidf_matrix[item_idx], n_neighbors=50)
        # Convert distances to similarity (1 - distance)
        sims = 1 - distances.flatten()
        for idx, sim in zip(indices.flatten(), sims):
            cb_scores[idx] += sim
    return cb_scores


In [9]:
def hybrid_recommend(user_id, top_n=10, alpha=0.5):
    if user_id not in user_mapping:
        return []  # cold-start user
    
    user_idx = user_mapping[user_id]

    # --- CF part ---
    cf_scores = np.dot(user_factors[user_idx], item_factors.T)

    # --- CB part ---
    user_row = X.getrow(user_idx).toarray().ravel()
    interacted_items_idx = np.where(user_row > 0)[0]
    if len(interacted_items_idx) > 0:
        cb_scores = get_cb_scores(interacted_items_idx, item_factors.shape[0])
    else:
        cb_scores = np.zeros(item_factors.shape[0])

    # --- Hybrid ---
    hybrid_scores = alpha * cf_scores + (1 - alpha) * cb_scores

    # Remove items already interacted with
    hybrid_scores[interacted_items_idx] = -np.inf

    # Top-N
    recommended_idx = np.argsort(-hybrid_scores)[:top_n]
    inv_item_mapping = {v: k for k, v in item_mapping.items()}
    return [inv_item_mapping[i] for i in recommended_idx]


In [10]:
sample_users = list(user_mapping.keys())[:5]
for u in sample_users:
    print(f"\nUser {u} recommendations:", hybrid_recommend(u, top_n=5, alpha=0.7))


IndexError: index 284244 is out of bounds for axis 0 with size 235061