In [1]:
!pip install pandas numpy scikit-learn matplotlib seaborn joblib scipy tqdm



In [34]:
import pandas as pd

# Set the correct path
file_path = r"C:\Users\Alby Anil\Desktop\Internship\E-Com Recommendation system\data\2019-Nov.csv"

# Load the first 1 million rows
print("Loading data...")
df = pd.read_csv(file_path, nrows=1_000_000)

print("Data loaded successfully.")
print("Shape of loaded data:", df.shape)

Loading data...
Data loaded successfully.
Shape of loaded data: (1000000, 9)


In [35]:
print("Columns in the dataset:")
print(df.columns.tolist())

print("\nFirst few rows:")
df.head()

Columns in the dataset:
['event_time', 'event_type', 'product_id', 'category_id', 'category_code', 'brand', 'price', 'user_id', 'user_session']

First few rows:


Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
0,2019-11-01 00:00:00 UTC,view,1003461,2053013555631882655,electronics.smartphone,xiaomi,489.07,520088904,4d3b30da-a5e4-49df-b1a8-ba5943f1dd33
1,2019-11-01 00:00:00 UTC,view,5000088,2053013566100866035,appliances.sewing_machine,janome,293.65,530496790,8e5f4f83-366c-4f70-860e-ca7417414283
2,2019-11-01 00:00:01 UTC,view,17302664,2053013553853497655,,creed,28.31,561587266,755422e7-9040-477b-9bd2-6a6e8fd97387
3,2019-11-01 00:00:01 UTC,view,3601530,2053013563810775923,appliances.kitchen.washer,lg,712.87,518085591,3bfb58cd-7892-48cc-8020-2f17e6de6e7f
4,2019-11-01 00:00:01 UTC,view,1004775,2053013555631882655,electronics.smartphone,xiaomi,183.27,558856683,313628f1-68b8-460d-84f6-cec7a8796ef2


In [36]:
# Select relevant columns
cols = ['event_type', 'product_id', 'category_id', 'user_id', 'event_time','brand']
df = df[cols]
# Drop missing values
df = df.dropna()
# Only keep items with valid category/brand
df = df[df['category_id'].notna() & df['brand'].notna()]
# Convert IDs to string for encoding
df['product_id'] = df['product_id'].astype(str)
df['user_id'] = df['user_id'].astype(str)

print("Cleaned data shape:", df.shape)

Cleaned data shape: (852803, 6)


In [30]:
from sklearn.preprocessing import LabelEncoder

print("Encoding user and item IDs...")
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

df['user_idx'] = user_encoder.fit_transform(df['user_id'])
df['item_idx'] = item_encoder.fit_transform(df['product_id'])

n_users = df['user_idx'].nunique()
n_items = df['item_idx'].nunique()

print(f"Unique users: {n_users}, Unique items: {n_items}")

Encoding user and item IDs...
Unique users: 170337, Unique items: 66001


In [10]:
# Map event types to weights
event_weight_map = {
    'view': 1,
    'cart': 3,
    'purchase': 5
}
df['weight'] = df['event_type'].map(event_weight_map)

print("Weighted events added.")

Weighted events added.


In [11]:
from scipy.sparse import csr_matrix

print("Building weighted user-item matrix...")
user_item_matrix = csr_matrix(
    (df['weight'], (df['user_idx'], df['item_idx']))
)

print("User-item matrix created.")

Building weighted user-item matrix...
User-item matrix created.


In [12]:
from sklearn.metrics.pairwise import cosine_similarity

print("Calculating item similarities...")
item_co_occurrence = user_item_matrix.T @ user_item_matrix
item_similarities = cosine_similarity(item_co_occurrence, dense_output=False)

# Avoid recommending the same item
import numpy as np
np.fill_diagonal(item_similarities.toarray(), 0)

Calculating item similarities...


In [13]:
def recommend_items(item_idx, top_n=10):
    scores = item_similarities[item_idx].toarray().flatten()
    top_items = np.argsort(-scores)[:top_n]
    return [(item_encoder.inverse_transform([i])[0], scores[i]) for i in top_items]

def recommend_for_user(user_idx, top_n=10):
    _, items = user_item_matrix[user_idx].nonzero()
    if len(items) == 0:
        return []

    scores = np.zeros(n_items)
    for item in items:
        scores += item_similarities[item].toarray().flatten()

    scores /= len(items)
    top_items = np.argsort(-scores)[:top_n]
    return [(item_encoder.inverse_transform([i])[0], scores[i]) for i in top_items]

In [14]:
test_user_idx = 0
print(f"Recommendations for user index {test_user_idx}:")
recs = recommend_for_user(test_user_idx)

for item_id, score in recs:
    print(f"Product ID: {item_id} | Score: {score:.4f}")

Recommendations for user index 0:
Product ID: 1003746 | Score: 1.0000
Product ID: 1003752 | Score: 0.3464
Product ID: 14400029 | Score: 0.3004
Product ID: 1005105 | Score: 0.2685
Product ID: 2702028 | Score: 0.2303
Product ID: 25300471 | Score: 0.2303
Product ID: 47000023 | Score: 0.2228
Product ID: 7101191 | Score: 0.2214
Product ID: 7101667 | Score: 0.2198
Product ID: 26205392 | Score: 0.2189


In [15]:
# Reload data with 'category_code' and 'brand'
cols = ['event_type', 'product_id', 'category_code', 'brand', 'price', 'user_id']
df = pd.read_csv(file_path, nrows=1_000_000, usecols=cols)

# Clean and encode
df = df.dropna()
df['product_id'] = df['product_id'].astype(str)
df['user_id'] = df['user_id'].astype(str)

# Deduplicate and clean
product_catalog = df.groupby('product_id').agg({
    'category_code': 'first',
    'brand': 'first',
    'price': 'mean'
}).reset_index()

# Create a human-readable product name
def make_product_name(row):
    brand = row['brand'].title() if isinstance(row['brand'], str) else ''
    category = row['category_code'].split('.')[-1].replace('_', ' ') if isinstance(row['category_code'], str) else 'item'
    return f"{brand} {category}"

product_catalog['product_name'] = product_catalog.apply(make_product_name, axis=1)

# Fill missing values
product_catalog['product_name'] = product_catalog['product_name'].fillna('Unknown Item')
product_catalog['category_code'] = product_catalog['category_code'].fillna('unknown')
product_catalog['brand'] = product_catalog['brand'].fillna('Unknown Brand')
product_catalog['price'] = product_catalog['price'].replace(0, np.nan).fillna(product_catalog['price'].mean())

# Set index
product_catalog.set_index('product_id', inplace=True, drop=False)

# Convert to dictionary
product_info = product_catalog.to_dict(orient='index')

print("✅ Improved product catalog built with product names.")

✅ Improved product catalog built with product names.


In [16]:
def recommend_for_user(user_idx, top_n=10):
    _, items = user_item_matrix[user_idx].nonzero()
    if len(items) == 0:
        return []

    scores = np.zeros(n_items)
    for item in items:
        scores += item_similarities[item].toarray().flatten()

    scores /= len(items)
    top_items = np.argsort(-scores)[:top_n]

    # Get names of items user has interacted with
    user_interacted_ids = [item_encoder.inverse_transform([i])[0] for i in items]
    user_interacted_products = [product_info.get(pid, {}) for pid in user_interacted_ids]
    user_interacted_names = [
        p.get('product_name', 'Unknown Item') for p in user_interacted_products
    ]

    explanation_base = " because you viewed similar items like "
    recommendations = []
    for i in top_items:
        product_id = item_encoder.inverse_transform([i])[0]
        info = product_info.get(product_id, {})
        rec = {
            'product_id': product_id,
            'product_name': info.get('product_name', 'Unknown Item'),
            'score': float(scores[i]),
            'category': info.get('category_code', 'unknown'),
            'brand': info.get('brand', 'Unknown Brand'),
            'price': round(float(info.get('price', 0)), 2),
            'explanation': f"Recommended{explanation_base + ', '.join(user_interacted_names[:2])}"
        }
        recommendations.append(rec)

    return recommendations

In [17]:
test_user_idx = 0
recs = recommend_for_user(test_user_idx)

for rec in recs:
    print(f"""
Product: {rec['product_name']} ({rec['product_id']})
Category: {rec['category']}
Brand: {rec['brand']}
Price: ${rec['price']:.2f}
Score: {rec['score']:.4f}
Explanation: {rec['explanation']}
-------------------------------
""")


Product: Sony smartphone (1003746)
Category: electronics.smartphone
Brand: sony
Price: $304.75
Score: 1.0000
Explanation: Recommended because you viewed similar items like Sony smartphone
-------------------------------


Product: Sony smartphone (1003752)
Category: electronics.smartphone
Brand: sony
Price: $257.38
Score: 0.3464
Explanation: Recommended because you viewed similar items like Sony smartphone
-------------------------------


Product: Unknown Item (14400029)
Category: unknown
Brand: Unknown Brand
Price: $0.00
Score: 0.3004
Explanation: Recommended because you viewed similar items like Sony smartphone
-------------------------------


Product: Apple smartphone (1005105)
Category: electronics.smartphone
Brand: apple
Price: $1348.55
Score: 0.2685
Explanation: Recommended because you viewed similar items like Sony smartphone
-------------------------------


Product: Samsung refrigerators (2702028)
Category: appliances.kitchen.refrigerators
Brand: samsung
Price: $1003.86
Sco

In [41]:
test_user_idx = 100  # try different values like 0, 1, 5, 10, etc.

# Get recommendations using collaborative filtering
recs = recommend_for_user(test_user_idx)

# Get what the user interacted with
_, interacted_items = user_item_matrix[test_user_idx].nonzero()
interacted_product_ids = [item_encoder.inverse_transform([i])[0] for i in interacted_items]

# Optional: get names of interacted products
interacted_names = []
for pid in interacted_product_ids:
    info = product_info.get(pid, {})
    name = info.get('product_name', 'Unknown Item')
    interacted_names.append(f"{name} ({pid})")

# Show top interacted item
if interacted_names:
    print(f"🎯 User viewed: {interacted_names[0]}")
    print("-------------------------------")

# Show recommendations
for rec in recs:
    print(f"""
Product: {rec['product_name']} ({rec['product_id']})
Category: {rec['category']}
Brand: {rec['brand']}
Price: ${rec['price']:.2f}
Score: {rec['score']:.4f}
Explanation: {rec['explanation']}
-------------------------------
""")

🎯 User viewed: Samsung smartphone (1004767)
-------------------------------

Product: Lg acoustic (18200086)
Category: electronics.audio.acoustic
Brand: lg
Price: $334.60
Score: 0.5887
Explanation: Recommended because you viewed similar items like Samsung smartphone, Sony acoustic
-------------------------------


Product: Sony acoustic (18200035)
Category: electronics.audio.acoustic
Brand: sony
Price: $257.15
Score: 0.5697
Explanation: Recommended because you viewed similar items like Samsung smartphone, Sony acoustic
-------------------------------


Product: Sony acoustic (18200034)
Category: electronics.audio.acoustic
Brand: sony
Price: $479.16
Score: 0.5275
Explanation: Recommended because you viewed similar items like Samsung smartphone, Sony acoustic
-------------------------------


Product: Sony acoustic (18200001)
Category: electronics.audio.acoustic
Brand: sony
Price: $308.60
Score: 0.4662
Explanation: Recommended because you viewed similar items like Samsung smartphone, Son

In [53]:
import os
import joblib

# Ensure models directory exists
os.makedirs('models', exist_ok=True)

# Save the model
joblib.dump({
    'user_item_matrix': user_item_matrix,
    'item_similarities': item_similarities,
    'user_encoder': user_encoder,
    'item_encoder': item_encoder,
    'product_info': product_info,
}, 'models/recommender_model.pkl')

print("✅ Model saved successfully to models/recommender_model.pkl")

✅ Model saved successfully to models/recommender_model.pkl
