In [3]:
import pandas as pd
import numpy as np
from surprise import Dataset, Reader
from surprise import SVD, KNNBasic, accuracy
from surprise.model_selection import train_test_split
import collections
from collections import defaultdict
import math

In [4]:
book = pd.read_csv('Amazon_Books_Scraping/Books_df.csv')
book.insert(0, 'book_id', range(len(book)))
# genre = pd.read_csv('Amazon_Books_Scraping/Genre_df.csv')
# sub_genre = pd.read_csv('Amazon_Books_Scraping/Sub_Genre_df.csv')

In [5]:
book.head()

Unnamed: 0.1,book_id,Unnamed: 0,Title,Author,Main Genre,Sub Genre,Type,Price,Rating,No. of People rated,URLs
0,0,0,The Complete Novel of Sherlock Holmes,Arthur Conan Doyle,"Arts, Film & Photography",Cinema & Broadcast,Paperback,₹169.00,4.4,19923.0,https://www.amazon.in/Complete-Novels-Sherlock...
1,1,1,Black Holes (L) : The Reith Lectures [Paperbac...,Stephen Hawking,"Arts, Film & Photography",Cinema & Broadcast,Paperback,₹99.00,4.5,7686.0,https://www.amazon.in/Black-Holes-Lectures-Ste...
2,2,2,The Kite Runner,Khaled Hosseini,"Arts, Film & Photography",Cinema & Broadcast,Kindle Edition,₹175.75,4.6,50016.0,https://www.amazon.in/Kite-Runner-Khaled-Hosse...
3,3,3,Greenlights: Raucous stories and outlaw wisdom...,Matthew McConaughey,"Arts, Film & Photography",Cinema & Broadcast,Paperback,₹389.00,4.6,32040.0,https://www.amazon.in/Greenlights-Raucous-stor...
4,4,4,The Science of Storytelling: Why Stories Make ...,Will Storr,"Arts, Film & Photography",Cinema & Broadcast,Paperback,₹348.16,4.5,1707.0,https://www.amazon.in/Science-Storytelling-Wil...


In [6]:
df_book = book[['book_id', 'Main Genre', 'Rating']].copy()
reader = Reader(rating_scale=(1, 5))
dataset = Dataset.load_from_df(df_book, reader)

train_set, test_set = train_test_split(dataset, test_size=0.2, random_state=21)

In [7]:
sim_options = {
    'name': 'cosine',
    'user_based': False
}

model_knn = KNNBasic(sim_options=sim_options)
model_knn.fit(train_set)
pred_knn = model_knn.test(test_set)
rmse_knn = accuracy.rmse(pred_knn)
print('KNN RMSE:', rmse_knn)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.8982
KNN RMSE: 0.8981812842972066


In [8]:
model_svd = SVD()
model_svd.fit(train_set)
pred_svd = model_svd.test(test_set)
rmse_svd = accuracy.rmse(pred_svd)
print('SVD RMSE:', rmse_svd)

RMSE: 0.8527
SVD RMSE: 0.8526609552315656


In [9]:
def precision_recall(predictions, k=10, threshold=3.5):
    user_est_true = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        user_est_true[uid].append((iid, est, true_r))
    precisions = []
    recalls = []
    
    for uid, user_ratings in user_est_true.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        n_rel = sum((true_r >= threshold) for (_, _, true_r) in user_ratings)
        n_rec_k = sum((true_r >= threshold) for (_, _, true_r) in user_ratings[:k])
        
        if k > 0:
            precision = n_rec_k / k
        else:
            precision = 1
        if n_rel != 0:
            recall = n_rec_k / n_rel
        else:
            recall = 1
        precisions.append(precision)
        recalls.append(recall)
    mean_precision = np.mean(precisions)
    mean_recall = np.mean(recalls)
    return mean_precision, mean_recall

### Data Sparsity

In [10]:
fractions = [0.1, 0.3, 0.5, 0.7]
results_rmse = {}
results_prec = {}
results_recall = {}

for f in fractions:
    print(f'==========Data fraction: {f}==========')
    df_sample = df_book.sample(frac=f, random_state=21)
    dataset_sample = Dataset.load_from_df(df_sample, reader)
    train_set_sample, test_set_sample = train_test_split(dataset_sample, test_size=0.2, random_state=21)
    
    model_knn = KNNBasic(sim_options=sim_options)
    model_knn.fit(train_set_sample)
    pred_knn = model_knn.test(test_set_sample)
    
    rmse_knn = accuracy.rmse(pred_knn)
    results_rmse[f] = rmse_knn
    
    p_val, r_val = precision_recall(pred_knn, k=10, threshold=3.5)
    results_prec[f] = p_val
    results_recall[f] = r_val

print("Sparsity Analysis")
for f in fractions:
    print(f"Fraction: {f}, RMSE: {results_rmse[f]}, Precision: {results_prec[f]}, Recall: {results_recall[f]}")

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.8191
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.8278
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9249
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9391
Sparsity Analysis
Fraction: 0.1, RMSE: 0.8190706644723327, Precision: 0.0962264150943396, Recall: 1.0
Fraction: 0.3, RMSE: 0.827847621451401, Precision: 0.09579831932773111, Recall: 1.0
Fraction: 0.5, RMSE: 0.9249435575855631, Precision: 0.09432534678436318, Recall: 1.0
Fraction: 0.7, RMSE: 0.9391025585491617, Precision: 0.09450450450450448, Recall: 1.0


### Item Popularity

In [11]:
df_itempop = pd.read_csv('Amazon_Books_Scraping/Books_df.csv')
df_itempop.insert(0, 'book_id', range(len(book)))

df_itempop = df_itempop[['book_id', 'Main Genre', 'Rating']]
data_itempop = Dataset.load_from_df(df_itempop, reader)
train_set_ip, test_set_ip = train_test_split(data_itempop, test_size=0.2, random_state=42)

In [12]:
train_items = collections.defaultdict(int)

for uid, iid, rating in train_set.all_ratings():
    raw_iid = train_set.to_raw_iid(int(iid))
    train_items[raw_iid] += 1

In [13]:
sorted_items = sorted(train_items.items(), key=lambda x: x[1], reverse=True)
num_popular_items = math.ceil(0.2 * len(sorted_items))
popular_item_ids = set([item[0] for item in sorted_items[:num_popular_items]])

print(f"Number of popular items: {num_popular_items}")

Number of popular items: 6


In [14]:
model_knn_pop = KNNBasic(sim_options=sim_options)
model_knn_pop.fit(train_set)
pred_knn_pop = model_knn_pop.test(test_set_ip)

rmse_all = accuracy.rmse(pred_knn_pop)
p_all, r_all = precision_recall(pred_knn_pop, k=10, threshold=3.5)

preds_popular = []
preds_longtail = []

for p in pred_knn_pop:
    if p.iid in popular_item_ids:
        preds_popular.append(p)
    else:
        preds_longtail.append(p)
        
rmse_popular = accuracy.rmse(preds_popular)
p_pop, r_pop = precision_recall(preds_popular, k=10, threshold=3.5)
rmse_longtail = accuracy.rmse(preds_longtail)
p_long, r_long = precision_recall(preds_longtail, k=10, threshold=3.5)

print("Item Popularity Bias Analysis")
print(f"Overall RMSE: {rmse_all}, Precision: {p_all}, Recall: {r_all}")
print(f"Popular Items RMSE: {rmse_popular}, Precision: {p_pop}, Recall: {r_pop}")
print(f"Long-tail Items RMSE: {rmse_longtail}, Precision: {p_long}, Recall: {r_long}")

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.4443
RMSE: 0.4283
RMSE: 0.4639
Item Popularity Bias Analysis
Overall RMSE: 0.4442858597187124, Precision: 0.09407313997477931, Recall: 1.0
Popular Items RMSE: 0.42825024831070246, Precision: 0.09436936936936938, Recall: 1.0
Long-tail Items RMSE: 0.46388596705932467, Precision: 0.09369627507163322, Recall: 1.0


### Cold start

In [15]:
df_cold = pd.read_csv('Amazon_Books_Scraping/Books_df.csv')
df_cold.insert(0, 'book_id', range(len(book)))

df_cold = df_cold[['book_id', 'Main Genre', 'Rating']]

In [16]:
data_cold = Dataset.load_from_df(df_cold, reader)
train_set_cold, test_set_cold = train_test_split(data_cold, test_size=0.2, random_state=21)

In [17]:
test_df_cold = pd.DataFrame(test_set_cold, columns=['book_id', 'Main Genre', 'Rating'])
new_book = [
    (7928,'Action & Adventure',4)
    ]

new_users_df = pd.DataFrame(new_book, columns=['book_id', 'Main Genre', 'Rating'])
test_df_cold = pd.concat([test_df_cold, new_users_df], ignore_index=True)

In [18]:
test_set_new = list(test_df_cold[['book_id', 'Main Genre', 'Rating']].itertuples(index=False, name=None))

model_knn_cold = KNNBasic(sim_options=sim_options)
model_knn_cold.fit(train_set_cold)
pred_knn_cold = model_knn_cold.test(test_set_new)
rmse_knn_cold = accuracy.rmse(pred_knn_cold)

new_user_ids = {7928}
#filter out predictions for new users
preds_new = [p for p in pred_knn_cold if p.uid in new_user_ids]
preds_old = [p for p in pred_knn_cold if p.uid not in new_user_ids]

rmse_new = accuracy.rmse(preds_new)
rmse_old = accuracy.rmse(preds_old)

p_all_cold, r_all_cold = precision_recall(pred_knn_cold, k=10, threshold=3.5)
p_new, r_new = precision_recall(preds_new, k=10, threshold=3.5)
p_old, r_old = precision_recall(preds_old, k=10, threshold=3.5)
print("Cold Start Analysis")
print(f"Overall RMSE: {rmse_knn_cold}, Precision: {p_all_cold}, Recall: {r_all_cold}")
print(f"New Users RMSE: {rmse_new}, Precision: {p_new}, Recall: {r_new}")
print(f"Old Users RMSE: {rmse_old}, Precision: {p_old}, Recall: {r_old}")

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.8979
RMSE: 0.2603
RMSE: 0.8982
Cold Start Analysis
Overall RMSE: 0.8979220296467953, Precision: 0.09439193446754883, Recall: 1.0
New Users RMSE: 0.2602806685588144, Precision: 0.1, Recall: 1.0
Old Users RMSE: 0.8981812842972066, Precision: 0.09438839848675913, Recall: 1.0
