In [40]:
# 📌 Step 1: Import Libraries
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder
from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import warnings
warnings.filterwarnings('ignore')

In [41]:
# 📌 Step 2: Load the Dataset
# Read the ratings table from the CSV in the working directory into `df`,
# then show the first five rows as the cell output.
df = pd.read_csv(filepath_or_buffer='electronics_dataset.csv')
df.head(n=5)

Unnamed: 0,item_id,user_id,rating,timestamp,model_attr,category,brand,year,user_attr,split
0,2984,316609,5,06/10/2014,Male,Television & Video,Toshiba,2013,,0
1,4334,316610,5,06/10/2014,Male,Headphones,,2014,,0
2,4334,316611,5,06/10/2014,Male,Headphones,,2014,,0
3,5041,316612,5,06/10/2014,Male,Headphones,Bose,2014,Female,0
4,4434,316613,5,06/10/2014,Male,Computers & Accessories,,2014,,0


In [42]:
# Convert the timestamp column from text to datetime values.
# Values are read as day/month/year, so "06/10/2014" becomes 2014-10-06.
# Anything that does not match the format becomes NaT instead of raising.
# NOTE(review): confirm the source file really is day-first and not month-first.
df['timestamp'] = df['timestamp'].pipe(
    pd.to_datetime, format='%d/%m/%Y', errors='coerce'
)

# df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')
# df['timestamp'] = df['timestamp'].dt.strftime('%Y-%m-%d')
# df.head()

In [43]:
# Number of missing values in each column, largest count first.
df.isna().sum().sort_values(ascending=False)

user_attr     605400
brand         547006
item_id            0
user_id            0
rating             0
timestamp          0
model_attr         0
category           0
year               0
split              0
dtype: int64

In [44]:
# Clean the model_attr field: combined values such as "Female&Male" are
# rewritten with "/" as the separator, and missing entries are labelled 'Unknown'.
# regex=False is passed explicitly because the default of `regex` differs
# between pandas 1.x and 2.x; '&' must always be matched as a literal character.
df['model_attr'] = (
    df['model_attr']
    .str.replace('&', '/', regex=False)
    .fillna('Unknown')
)

In [45]:
def _fill_missing_with_mode(values, fallback='Unknown'):
    """Fill the missing entries of one group with that group's most frequent value.

    Parameters
    ----------
    values : pandas.Series
        The ``brand`` values of a single group, as handed over by
        ``groupby(...).transform``.
    fallback : str
        Label used when the group contains no non-missing value, so there is
        no mode to take. Matches the 'Unknown' label used for model_attr
        (the previous fallback was ' Unknown' with a stray leading space).

    Returns
    -------
    pandas.Series
        ``values`` with missing entries replaced; the index is unchanged.
    """
    modes = values.mode()  # computed once per group instead of twice
    fill_value = modes.iloc[0] if not modes.empty else fallback
    return values.fillna(fill_value)


# Impute missing brands with the most common brand among rows that share the
# same category, model_attr and year.
df['brand'] = (
    df.groupby(['category', 'model_attr', 'year'])['brand']
    .transform(_fill_missing_with_mode)
)

In [46]:
# Keep user_attr where it is present; otherwise copy the same row's model_attr.
# NOTE(review): this uses the product's model attribute as a stand-in for the
# user's attribute — confirm that substitution is intended.
df['user_attr'] = df['user_attr'].where(df['user_attr'].notna(), df['model_attr'])

In [47]:
# Remove rows that duplicate an earlier row in every column (the first
# occurrence is kept), modifying `df` itself, then preview the first five rows.
df.drop_duplicates(keep='first', inplace=True)
df.head(n=5)

Unnamed: 0,item_id,user_id,rating,timestamp,model_attr,category,brand,year,user_attr,split
0,2984,316609,5,2014-10-06,Male,Television & Video,Toshiba,2013,Male,0
1,4334,316610,5,2014-10-06,Male,Headphones,Bose,2014,Male,0
2,4334,316611,5,2014-10-06,Male,Headphones,Bose,2014,Male,0
3,5041,316612,5,2014-10-06,Male,Headphones,Bose,2014,Female,0
4,4434,316613,5,2014-10-06,Male,Computers & Accessories,HP,2014,Male,0


In [48]:
# Re-count missing values per column (largest first) to check the cleaning steps.
df.isna().sum().sort_values(ascending=False)

item_id       0
user_id       0
rating        0
timestamp     0
model_attr    0
category      0
brand         0
year          0
user_attr     0
split         0
dtype: int64

In [49]:
# Show the first ten rows of the cleaned frame.
df.head(n=10)

Unnamed: 0,item_id,user_id,rating,timestamp,model_attr,category,brand,year,user_attr,split
0,2984,316609,5,2014-10-06,Male,Television & Video,Toshiba,2013,Male,0
1,4334,316610,5,2014-10-06,Male,Headphones,Bose,2014,Male,0
2,4334,316611,5,2014-10-06,Male,Headphones,Bose,2014,Male,0
3,5041,316612,5,2014-10-06,Male,Headphones,Bose,2014,Female,0
4,4434,316613,5,2014-10-06,Male,Computers & Accessories,HP,2014,Male,0
5,3556,316614,4,2014-10-06,Female/Male,Computers & Accessories,Plemo,2013,Female/Male,0
6,4564,316615,3,2014-10-06,Female,Camera & Photo,Fujifilm,2014,Female,0
7,3401,257302,4,2014-10-06,Female,Computers & Accessories,Bose,2013,Female,2
8,4334,93971,5,2014-10-06,Male,Headphones,Bose,2014,Male,0
9,3189,136390,5,2014-10-06,Female/Male,Headphones,Etre Jeune,2013,Female,2


In [50]:
#df.to_csv('electronics_dataset1.csv', index=False)

In [None]:
# NOTE(review): this whole cell is commented out, so nothing here runs on
# Restart & Run All. The "Top Item Pairs" / "Top Item Triplets" output saved
# under it is left over from an earlier run, and both tables are empty.
# Presumably no multi-item itemset reached min_support=0.01 — TODO confirm,
# then either lower the threshold and re-enable the cell or delete it.
# # Filter training split only
# train_data = df[df['split'] == 0]

# # Step 1: Group item_ids per user
# user_transactions = train_data.groupby('user_id')['item_id'].apply(list).values.tolist()

# # Step 2: Convert to one-hot encoded format
# te = TransactionEncoder()
# te_data = te.fit(user_transactions).transform(user_transactions)
# df_te = pd.DataFrame(te_data, columns=te.columns_)

# # Step 3: Apply Apriori algorithm
# frequent_itemsets = apriori(df_te, min_support=0.01, use_colnames=True)

# # Step 4: Generate association rules
# rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.0)

# # Step 5: Sort rules by lift
# rules_sorted = rules.sort_values(by="lift", ascending=False)

# # Step 6: Show top pairs and triplets
# print("Top Item Pairs:")
# print(rules_sorted[(rules_sorted['antecedents'].apply(lambda x: len(x) == 1)) &
#                    (rules_sorted['consequents'].apply(lambda x: len(x) == 1))][['antecedents', 'consequents', 'support', 'confidence', 'lift']].head())

# print("\nTop Item Triplets:")
# print(rules_sorted[((rules_sorted['antecedents'].apply(lambda x: len(x) == 2)) &
#                     (rules_sorted['consequents'].apply(lambda x: len(x) == 1))) |
#                    ((rules_sorted['antecedents'].apply(lambda x: len(x) == 1)) &
#                     (rules_sorted['consequents'].apply(lambda x: len(x) == 2)))][['antecedents', 'consequents', 'support', 'confidence', 'lift']].head())

Top Item Pairs:
Empty DataFrame
Columns: [antecedents, consequents, support, confidence, lift]
Index: []

Top Item Triplets:
Empty DataFrame
Columns: [antecedents, consequents, support, confidence, lift]
Index: []


In [None]:
# NOTE(review): disabled cell — the SVD object repr saved beneath it comes
# from an earlier run. Points to settle before re-enabling:
#   - SVD is imported here instead of in the import cell at the top of the notebook.
#   - train_test_split is imported but not used below (the full trainset is built instead).
#   - SVD() is created without a seed; presumably results vary between runs —
#     TODO confirm and pass a fixed random_state if so.
# from surprise import Dataset, Reader, SVD
# from surprise.model_selection import train_test_split

# # Step 1: Filter training data
# train_data = df[df['split'] == 0][['user_id', 'item_id', 'rating']]

# # Step 2: Prepare for surprise
# reader = Reader(rating_scale=(1, 5))
# data = Dataset.load_from_df(train_data, reader)
# trainset = data.build_full_trainset()

# # Step 3: Train SVD model (fast & accurate)
# model = SVD()
# model.fit(trainset)


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1dca014c610>

In [None]:
# NOTE(review): disabled helper. As written it returns up to `n`
# (item_id, estimated rating) tuples, highest estimate first, for the items
# the user has no split == 0 row for.
# Review points before re-enabling:
#   - `trainset` is accepted but never used in the body.
#   - the bare `except:` silently skips any item whose prediction raises.
#   - `item not in rated_items` scans an array once per item; a set would avoid that.
# def get_top_n_recommendations(model, user_id, df_all, trainset, n=5):
#     # All items
#     all_items = df_all['item_id'].unique()

#     # Items already rated by this user in training set
#     rated_items = df_all[(df_all['user_id'] == user_id) & (df_all['split'] == 0)]['item_id'].values

#     # Predict ratings for all *unseen* items
#     predictions = []
#     for item in all_items:
#         if item not in rated_items:
#             try:
#                 pred = model.predict(user_id, item)
#                 predictions.append((item, pred.est))
#             except:
#                 continue

#     # Sort by predicted rating
#     predictions.sort(key=lambda x: x[1], reverse=True)
#     return predictions[:n]

In [None]:
# NOTE(review): disabled cell. It needs `model` and `trainset` from the SVD
# training cell and `get_top_n_recommendations` from the helper cell, both of
# which are also commented out. The recommendations saved under this cell are
# from an earlier run.
# The code treats split == 2 as the test set — TODO confirm the split encoding.
# # Pick a user from test set
# test_users = df[df['split'] == 2]['user_id'].unique()
# top_5 = get_top_n_recommendations(model, test_users[0], df, trainset, n=5)

# print(f"Top 5 recommended items for user {test_users[0]}:")
# for item, score in top_5:
#     print(f"Item ID: {item}, Predicted Rating: {score:.2f}")

Top 5 recommended items for user 257302:
Item ID: 1647, Predicted Rating: 4.96
Item ID: 2513, Predicted Rating: 4.93
Item ID: 1416, Predicted Rating: 4.90
Item ID: 7570, Predicted Rating: 4.89
Item ID: 8946, Predicted Rating: 4.86
