# Book Recommender

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from plotly.offline import init_notebook_mode, plot, iplot
import plotly.graph_objs as go
from collections import defaultdict
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise import SVD, SlopeOne, NMF, KNNBaseline, KNNBasic, KNNWithMeans, \
                     KNNWithZScore, BaselineOnly, CoClustering
from surprise.model_selection import cross_validate, train_test_split

## 1. Evaluating the provided dataset

In [2]:
original_df = pd.read_csv("./original_df.csv", encoding='unicode_escape', sep=';')
original_df.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [3]:
original_df = original_df.drop_duplicates()
original_df = original_df.sort_values(by = 'User-ID', ignore_index=True)
original_df.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,2,195153448,0
1,7,34542252,0
2,8,771025661,0
3,8,1881320189,7
4,8,1575663937,6


In [4]:
original_df.describe().iloc[:, 1:2]

Unnamed: 0,Book-Rating
count,1149780.0
mean,2.86695
std,3.854184
min,0.0
25%,0.0
50%,0.0
75%,7.0
max,10.0


In [5]:
init_notebook_mode(connected=True)

data = original_df['Book-Rating'].value_counts().sort_index(ascending=False)
trace = go.Bar(x = data.index,
               text = ['{:.1f} %'.format(val) for val in (data.values / original_df.shape[0] * 100)],
               textposition = 'auto',
               textfont = dict(color = '#000000'),
               y = data.values,
               )

layout = dict(title = 'Book ratings distribution'.format(original_df.shape[0]),
              xaxis = dict(title = 'Rating'),
              yaxis = dict(title = 'Count'))

fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

In [12]:
data = original_df.groupby('ISBN')['Book-Rating'].count().clip(upper=25)

trace = go.Histogram(x = data.values,
                     name = 'Ratings',
                     xbins = dict(start = 0,
                                  end = 25,
                                  size = 2))

layout = go.Layout(title = 'Number of ratings per book',
                   xaxis = dict(title = 'Number of ratings per book'),
                   yaxis = dict(title = 'Count'),
                   bargap = 0.2)


fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

In [52]:
data = original_df.groupby('User-ID')['Book-Rating'].count().clip(upper=25)

trace = go.Histogram(x = data.values,
                     name = 'Ratings',
                     xbins = dict(start = 0,
                                  end = 25,
                                  size = 2))

layout = go.Layout(title = 'Number of ratings per user',
                   xaxis = dict(title = 'Number of ratings per user'),
                   yaxis = dict(title = 'Count'),
                   bargap = 0.2)


fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

In [14]:
reader = Reader(rating_scale=(0, 10))
data = Dataset.load_from_df(original_df[['User-ID', 'ISBN', 'Book-Rating']], reader)

In [17]:
trainset, testset = train_test_split(data, test_size=0.25, random_state=1)
svd = SVD(verbose=True)
predictions = svd.fit(trainset).test(testset)
accuracy.rmse(predictions)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
RMSE: 3.5038


3.5037876554475766

In [17]:
def get_positives_negatives(predictions, threshold):

    true_positive = 0
    true_negative = 0
    false_positive = 0
    false_negative = 0

    for _, _, true_r, est, _ in predictions:
        if true_r >= threshold:
            if est >= threshold:
                true_positive += 1
            else:
                false_negative += 1
        else:
            if est >= threshold:
                false_positive += 1
            else:
                true_negative += 1  

    return true_positive, true_negative, false_positive, false_negative

In [18]:
# Taken from https://surprise.readthedocs.io/en/stable/FAQ.html

def precision_recall_at_k(predictions, k=10, threshold=3.5):
    """Return precision and recall at k metrics for each user"""

    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():

        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[0], reverse=True)

        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in user_ratings[:k])

        # Precision@K: Proportion of recommended items that are relevant
        # When n_rec_k is 0, Precision is undefined. We here set it to 0.

        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0

        # Recall@K: Proportion of relevant items that are recommended
        # When n_rel is 0, Recall is undefined. We here set it to 0.

        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

    return precisions, recalls

In [20]:
true_positive, true_negative, false_positive, false_negative = get_positives_negatives(predictions, threshold=9)
precisions, recalls = precision_recall_at_k(predictions, k=10, threshold=9)
print("True Positives:", true_positive)
print("True Negatives:", true_negative)
print("False Positives:", false_positive)
print("False Negatives:", false_negative)
print("Precision:", sum(prec for prec in precisions.values()) / len(precisions))
print("Recall:", sum(rec for rec in recalls.values()) / len(recalls))

True Positives: 619
True Negatives: 250305
False Positives: 661
False Negatives: 35860
Precision: 0.004321501270744203
Recall: 0.0009832119214625227


In [25]:
df = pd.DataFrame(predictions, columns=['uid', 'iid', 'rui', 'est', 'details'])
df = df[df['details'] == {'was_impossible': False}]
df['err'] = abs(df.est - df.rui)
best_predictions = df.sort_values(by='err')[:10]
worst_predictions = df.sort_values(by='err')[-10:]

In [26]:
best_predictions

Unnamed: 0,uid,iid,rui,est,details,err
287444,31315,671867172,0.0,0.0,{'was_impossible': False},0.0
248560,210792,1558175636,0.0,0.0,{'was_impossible': False},0.0
146449,78783,671027123,0.0,0.0,{'was_impossible': False},0.0
203949,210792,60971401,0.0,0.0,{'was_impossible': False},0.0
248549,98391,1551667223,10.0,10.0,{'was_impossible': False},0.0
248546,76151,671744208,10.0,10.0,{'was_impossible': False},0.0
146446,102647,440241537,0.0,0.0,{'was_impossible': False},0.0
203966,84024,312195516,0.0,0.0,{'was_impossible': False},0.0
248527,211426,441478123,0.0,0.0,{'was_impossible': False},0.0
60011,198711,553154214,0.0,0.0,{'was_impossible': False},0.0


In [27]:
worst_predictions

Unnamed: 0,uid,iid,rui,est,details,err
52680,29526,0743411323,0.0,10.0,{'was_impossible': False},10.0
253489,151608,0345361792,10.0,0.0,{'was_impossible': False},10.0
29209,225087,0452278155,10.0,0.0,{'was_impossible': False},10.0
138579,172061,034541389X,10.0,0.0,{'was_impossible': False},10.0
230612,242106,0553802453,10.0,0.0,{'was_impossible': False},10.0
70448,136313,0553234811,10.0,0.0,{'was_impossible': False},10.0
188676,125736,0385335482,10.0,0.0,{'was_impossible': False},10.0
230567,143968,0316666343,0.0,10.0,{'was_impossible': False},10.0
24889,136205,0446310786,10.0,0.0,{'was_impossible': False},10.0
190980,122793,0312979096,10.0,0.0,{'was_impossible': False},10.0


In [44]:
print("Percentage of predictions with rating error one or less: " +
      f"{round(df[df['err'] <= 1].shape[0] / df['err'].shape[0] * 100)}%") 

Percentage of predictions with rating error one or less: 23%


## 2. Evaluating the polished dataset

In [12]:
polished_df = pd.read_csv("./polished_df.csv").drop(columns = "Unnamed: 0")
polished_df.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,1,971880107,1
1,1,553278223,5
2,2,446605484,1
3,2,446606383,3
4,2,449911004,1


In [46]:
init_notebook_mode(connected=True)

data = polished_df['Book-Rating'].value_counts().sort_index(ascending=False)
trace = go.Bar(x = data.index,
               text = ['{:.1f} %'.format(val) for val in (data.values / polished_df.shape[0] * 100)],
               textposition = 'auto',
               textfont = dict(color = '#000000'),
               y = data.values,
               )

layout = dict(title = 'Book ratings distribution'.format(polished_df.shape[0]),
              xaxis = dict(title = 'Rating'),
              yaxis = dict(title = 'Count'))

fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

In [48]:
data = polished_df.groupby('ISBN')['Book-Rating'].count().clip(upper=50)

trace = go.Histogram(x = data.values,
                     name = 'Ratings',
                     xbins = dict(start = 0,
                                  end = 50,
                                  size = 2))

layout = go.Layout(title = 'Number of ratings per book',
                   xaxis = dict(title = 'Number of ratings per book'),
                   yaxis = dict(title = 'Count'),
                   bargap = 0.2)


fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

In [53]:
data = polished_df.groupby('User-ID')['Book-Rating'].count().clip(upper=50)

trace = go.Histogram(x = data.values,
                     name = 'Ratings',
                     xbins = dict(start = 0,
                                  end = 50,
                                  size = 2))

layout = go.Layout(title = 'Number of ratings per user',
                   xaxis = dict(title = 'Number of ratings per user'),
                   yaxis = dict(title = 'Count'),
                   bargap = 0.2)


fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

In [13]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(polished_df[['User-ID', 'ISBN', 'Book-Rating']], reader)

In [26]:
benchmarks = []

for algorithm in tqdm([SVD(), SlopeOne(), NMF(), KNNBaseline(), CoClustering(), BaselineOnly()]):
    results = cross_validate(algorithm, data, measures=['RMSE', 'MAE'], cv=3, verbose=False)
    
    results = pd.DataFrame.from_dict(results).mean(axis=0)
    results = results.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmarks.append(results)
    
pd.DataFrame(benchmarks).set_index('Algorithm').sort_values('test_rmse')


The series.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The series.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The series.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

 50%|██████████████████████▌                      | 3/6 [02:08<02:07, 42.43s/it]

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.



The series.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The series.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

 83%|█████████████████████████████████████▌       | 5/6 [03:09<00:33, 33.98s/it]

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...



The series.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

100%|█████████████████████████████████████████████| 6/6 [03:18<00:00, 33.11s/it]


Unnamed: 0_level_0,test_rmse,test_mae,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
BaselineOnly,1.246345,1.001907,0.691285,1.608823
SVD,1.259384,0.981293,10.611673,3.305171
KNNBaseline,1.27655,0.986894,2.907772,8.9403
CoClustering,1.28901,0.96129,4.462898,0.921184
SlopeOne,1.308618,0.983627,3.198455,10.91672
NMF,1.361328,0.969387,11.856221,1.156084


In [14]:
bsl_options = {'method': 'als'}

trainset, testset = train_test_split(data, test_size=0.25, random_state=1)
baseline = BaselineOnly(bsl_options=bsl_options, verbose=True)
predictions = baseline.fit(trainset).test(testset)
accuracy.rmse(predictions)

Estimating biases using als...
RMSE: 1.2440


1.2440142583213243

In [27]:
true_positive, true_negative, false_positive, false_negative = get_positives_negatives(predictions, threshold=4)
precisions, recalls = precision_recall_at_k(predictions, k=10, threshold=4)
print("True Positives:", true_positive)
print("True Negatives:", true_negative)
print("False Positives:", false_positive)
print("False Negatives:", false_negative)
print("Precision:", sum(prec for prec in precisions.values()) / len(precisions))
print("Recall:", sum(rec for rec in recalls.values()) / len(recalls))

True Positives: 90
True Negatives: 48636
False Positives: 14
False Negatives: 12133
Precision: 0.0042117245304496296
Recall: 0.000849797074834812


In [20]:
df = pd.DataFrame(predictions, columns=['uid', 'iid', 'rui', 'est', 'details'])
df = df[df['details'] == {'was_impossible': False}]
df['err'] = abs(df.est - df.rui)
best_predictions = df.sort_values(by='err')[:10]
worst_predictions = df.sort_values(by='err')[-10:]

In [21]:
best_predictions

Unnamed: 0,uid,iid,rui,est,details,err
18240,736,805056645,1.0,1.0,{'was_impossible': False},0.0
31015,1054,446604275,1.0,1.0,{'was_impossible': False},0.0
19212,5494,449134482,1.0,1.0,{'was_impossible': False},0.0
45843,1537,553252828,1.0,1.0,{'was_impossible': False},0.0
44455,3918,425137457,1.0,1.0,{'was_impossible': False},0.0
28978,736,385425074,1.0,1.0,{'was_impossible': False},0.0
6384,2357,440237262,1.0,1.0,{'was_impossible': False},0.0
34699,1119,553282204,1.0,1.0,{'was_impossible': False},0.0
19206,2025,61097101,1.0,1.0,{'was_impossible': False},0.0
13927,2025,452281784,1.0,1.0,{'was_impossible': False},0.0


In [22]:
worst_predictions

Unnamed: 0,uid,iid,rui,est,details,err
35107,1121,345347951,5.0,1.04902,{'was_impossible': False},3.95098
32125,3887,151446474,5.0,1.016486,{'was_impossible': False},3.983514
11713,3752,679433740,5.0,1.0,{'was_impossible': False},4.0
39739,4638,515132268,5.0,1.0,{'was_impossible': False},4.0
21354,1776,440351626,5.0,1.0,{'was_impossible': False},4.0
4133,198,375703063,5.0,1.0,{'was_impossible': False},4.0
32556,4876,743406184,5.0,1.0,{'was_impossible': False},4.0
25840,3003,553584014,5.0,1.0,{'was_impossible': False},4.0
26913,4093,451160533,5.0,1.0,{'was_impossible': False},4.0
33089,637,439064864,1.0,5.0,{'was_impossible': False},4.0


In [23]:
print("Percentage of predictions with rating error one or less: " +
      f"{round(df[df['err'] <= 1].shape[0] / df['err'].shape[0] * 100)}%") 

Percentage of predictions with rating error one or less: 59%


## 3. Content boosted collaborative filtering

### 3.1 Getting the content boosted collaborative filtering dataset
#### You can skip this section and head over to the section 3.2 to load the already prepared cbcf dataset

In [12]:
sim_options = {'name': 'cosine',
                'min_support': 5,
                'user_based': False
              }

bsl_options = {'method': 'als'}

trainset = data.build_full_trainset()
knn = KNNBaseline(sim_options=sim_options, bsl_options=bsl_options, verbose=True)
knn.fit(trainset)

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x7f9b91baa6d0>

In [13]:
results = {}
for idx in range(len(set(polished_df['ISBN'].tolist()))):
    similar_indices = knn.sim[idx].argsort()[::-1] 
    similar_items = [(knn.sim[idx][i], trainset.to_raw_iid(i)) for i in similar_indices] 
    results[idx] = similar_items[1:]

In [18]:
def get_idx(id):
    return trainset.to_inner_iid(id)

In [19]:
final = {}

for user_id in tqdm(range(len(set(polished_df['User-ID'].tolist())))):
    final[user_id + 1] = {}
    user_ratings = polished_df[polished_df['User-ID'] == user_id + 1]
    
    if len(user_ratings) > 25:
        similarity_scores = {}
        for idx, row in user_ratings.iterrows():
            final[user_id + 1][row['ISBN']] = row['Book-Rating']

        for idx, row in user_ratings[user_ratings['Book-Rating'] >= 4].iterrows():
            get_results = results[get_idx(row['ISBN'])]
            for i in get_results:
                if i[1] not in final[user_id + 1].keys():
                    if i[1] not in similarity_scores.keys():
                        if round(i[0]*5) == 0:
                            similarity_scores[i[1]] = [1]
                        else:
                            similarity_scores[i[1]] = [round(i[0]*5)]
                    else:
                        if round(i[0]*5) == 0:
                            similarity_scores[i[1]].append(1)
                        else:
                            similarity_scores[i[1]].append(round(i[0]*5))

        for key, value in similarity_scores.items():
            final[user_id + 1][key] = max(value)
            
    else:
        for idx, row in user_ratings.iterrows():
            final[user_id + 1][row['ISBN']] = row['Book-Rating']

100%|███████████████████████████████████████| 5758/5758 [11:01<00:00,  8.70it/s]


In [20]:
content_filter = {'User-ID': [], 'ISBN': [], 'Book-Rating': []}

for i in tqdm(final):
    for key, value in final[i].items():
        content_filter['User-ID'] += [i]
        content_filter['ISBN'] += [key]
        content_filter['Book-Rating'] += [value]
        
final_df = pd.DataFrame.from_dict(content_filter)
final_df.to_csv('cbcf.csv')

100%|██████████████████████████████████████| 5758/5758 [00:08<00:00, 650.44it/s]


### 3.2 Evaluating the content boosted collaborative filtering dataset

In [27]:
cbcf_df = pd.read_csv("./cbcf.csv").drop(columns = "Unnamed: 0")
cbcf_df.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,1,971880107,1
1,1,553278223,5
2,2,446605484,1
3,2,446606383,3
4,2,449911004,1


In [73]:
init_notebook_mode(connected=True)

data = cbcf_df['Book-Rating'].value_counts().sort_index(ascending=False)
trace = go.Bar(x = data.index,
               text = ['{:.1f} %'.format(val) for val in (data.values / cbcf_df.shape[0] * 100)],
               textposition = 'auto',
               textfont = dict(color = '#000000'),
               y = data.values,
               )

layout = dict(title = 'Book ratings distribution'.format(cbcf_df.shape[0]),
              xaxis = dict(title = 'Rating'),
              yaxis = dict(title = 'Count'))

fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

In [28]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(cbcf_df[['User-ID', 'ISBN', 'Book-Rating']], reader)

In [154]:
trainset, testset = train_test_split(data, test_size=0.25, random_state=1)
svd = SVD(verbose=True)
predictions = svd.fit(trainset).test(testset)
accuracy.rmse(predictions)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
RMSE: 0.8367


0.836707721300556

In [155]:
true_positive, true_negative, false_positive, false_negative = get_positives_negatives(predictions, threshold=4)
precisions, recalls = precision_recall_at_k(predictions, k=10, threshold=4)
print("True Positives:", true_positive)
print("True Negatives:", true_negative)
print("False Positives:", false_positive)
print("False Negatives:", false_negative)
print("Precision:", sum(prec for prec in precisions.values()) / len(precisions))
print("Recall:", sum(rec for rec in recalls.values()) / len(recalls))

True Positives: 1156608
True Negatives: 1047750
False Positives: 34819
False Negatives: 595363
Precision: 0.40736894299179477
Recall: 0.024399534662445686


In [156]:
df = pd.DataFrame(predictions, columns=['uid', 'iid', 'rui', 'est', 'details'])
df = df[df['details'] == {'was_impossible': False}]
df['err'] = abs(df.est - df.rui)
best_predictions = df.sort_values(by='err')[:10]
worst_predictions = df.sort_values(by='err')[-10:]

In [157]:
best_predictions

Unnamed: 0,uid,iid,rui,est,details,err
1582180,5257,553580930,5.0,5.0,{'was_impossible': False},0.0
1505764,3023,8806142100,1.0,1.0,{'was_impossible': False},0.0
1804417,5079,590494481,1.0,1.0,{'was_impossible': False},0.0
1505760,3722,375411550,1.0,1.0,{'was_impossible': False},0.0
1062172,5551,812580346,1.0,1.0,{'was_impossible': False},0.0
2478482,5472,380711877,5.0,5.0,{'was_impossible': False},0.0
284797,4450,671534734,5.0,5.0,{'was_impossible': False},0.0
1062174,5038,1573225126,1.0,1.0,{'was_impossible': False},0.0
2478491,2184,449219372,5.0,5.0,{'was_impossible': False},0.0
2043890,757,446600415,5.0,5.0,{'was_impossible': False},0.0


In [158]:
worst_predictions

Unnamed: 0,uid,iid,rui,est,details,err
2566381,1065,385319568,1.0,5.0,{'was_impossible': False},4.0
178110,4616,340682272,5.0,1.0,{'was_impossible': False},4.0
1005035,471,743437640,1.0,5.0,{'was_impossible': False},4.0
1921666,1184,671023209,1.0,5.0,{'was_impossible': False},4.0
30508,1395,345419995,5.0,1.0,{'was_impossible': False},4.0
2649654,5738,425104273,1.0,5.0,{'was_impossible': False},4.0
2251638,3470,446605239,1.0,5.0,{'was_impossible': False},4.0
1706060,1799,1551664275,1.0,5.0,{'was_impossible': False},4.0
2054172,881,399142282,1.0,5.0,{'was_impossible': False},4.0
2593344,242,553349481,5.0,1.0,{'was_impossible': False},4.0


In [159]:
print("Percentage of predictions with rating error one or less: " +
      f"{round(df[df['err'] <= 1].shape[0] / df['err'].shape[0] * 100)}%")

Percentage of predictions with rating error one or less: 83%
