### Collaborative Filtering

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [3]:
# books_data = pd.read_csv('../data/books_data.csv')
books_data = pd.read_csv('../../data_preprocessing/cleaned_books_data.csv')
ratings_data = pd.read_csv('../../data/books_ratings_data.csv')
book_tags_data = pd.read_csv('../../data/book_tags_data.csv')
tags_data = pd.read_csv('../../data/tags_data.csv')

In [4]:
# Added to remove unnamed column from preprocessed dataset

books_data.dropna(inplace=True)

In [5]:
# Cleaning 'original_publication_year'
books_data['original_publication_year'] = books_data['original_publication_year'].fillna(-1).apply(lambda x: int(x) if x != -1 else -1)

In [6]:
# Removing duplicate ratings
unique_ratings = ratings_data.drop_duplicates()

# removing users with less than 4 ratings
unwanted_users = unique_ratings.groupby('user_id')['user_id'].count()
unwanted_users = unwanted_users[unwanted_users < 4]
unwanted_ratings = unique_ratings[unique_ratings.user_id.isin(unwanted_users.index)]
filtered_ratings = unique_ratings.drop(unwanted_ratings.index)

To get an overview of the books data:

In [7]:
books_data.head(2)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,id,book_id,best_book_id,work_id,books_count,isbn13,original_publication_year,title,...,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url,authors,summary,genre
0,0,0,1,2767052,2767052,2792775,272,9780439023480,2008,"The Hunger Games (The Hunger Games, #1)",...,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,Suzanne Collins,"In a future North America, where the rulers of...",Juvenile Fiction
1,3,2,3,41865,41865,3212258,226,9780316015840,2005,"Twilight (Twilight, #1)",...,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...,Stephenie Meyer,With 160 million copies of the Twilight Saga s...,Young Adult Fiction


To get an overview of the ratings data:

In [8]:
filtered_ratings.head()

Unnamed: 0,book_id,user_id,rating
0,1,314,5
1,1,439,3
2,1,588,5
3,1,1169,4
4,1,1185,4


To merge the two tables.

In [9]:
filtered_ratings = pd.merge(filtered_ratings, books_data[['id','title']], how='inner', left_on='book_id', right_on='id')

To check if the two tables are merged properly or not.

In [10]:
filtered_ratings.head()

Unnamed: 0,book_id,user_id,rating,id,title
0,1,314,5,1,"The Hunger Games (The Hunger Games, #1)"
1,1,439,3,1,"The Hunger Games (The Hunger Games, #1)"
2,1,588,5,1,"The Hunger Games (The Hunger Games, #1)"
3,1,1169,4,1,"The Hunger Games (The Hunger Games, #1)"
4,1,1185,4,1,"The Hunger Games (The Hunger Games, #1)"


##### User based approach

In [11]:
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

In [12]:
# laod dataset to surprise from pandas dataframe - filtered_ratings
reader = Reader()
data = Dataset.load_from_df(filtered_ratings[['user_id', 'book_id', 'rating']], reader)

Training the model to predict user rating

In [29]:
# perform a 5 fold cross validation on SVD model
algo = SVD()
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv =5,verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.7417  0.7396  0.7428  0.7398  0.7415  0.7411  0.0012  
MAE (testset)     0.5548  0.5545  0.5561  0.5539  0.5546  0.5548  0.0007  
Fit time          43.68   41.80   44.03   41.11   45.65   43.25   1.63    
Test time         4.27    2.98    2.36    3.04    2.58    3.05    0.66    


{'test_rmse': array([0.74174826, 0.73961383, 0.74280832, 0.73979428, 0.74148421]),
 'test_mae': array([0.5547604 , 0.5544912 , 0.5560817 , 0.5539294 , 0.55464041]),
 'fit_time': (43.67715263366699,
  41.80285286903381,
  44.02550196647644,
  41.11349534988403,
  45.653207540512085),
 'test_time': (4.265209436416626,
  2.9832866191864014,
  2.355137586593628,
  3.044433832168579,
  2.579904079437256)}

In [14]:
# Build and fit model on trainset
trainset = data.build_full_trainset()
algo.fit(trainset);

In [15]:
# check data with user_id = 40
filtered_ratings[filtered_ratings['user_id'] == 40]

Unnamed: 0,book_id,user_id,rating,id,title
277799,2707,40,4,2707,Boomerang: Travels in the New Third World
321632,3123,40,3,3123,Confessions of an Economic Hit Man
339341,3324,40,5,3324,One More Thing: Stories and Other Stories
382585,3722,40,4,3722,The Story of the Lost Child (The Neapolitan No...
382586,3722,40,4,3722,The Story of the Lost Child (The Neapolitan No...
407692,3961,40,4,3961,Dept. of Speculation
410180,3985,40,4,3985,Love Is a Mix Tape
413356,4020,40,3,4020,A Hologram for the King
424791,4140,40,4,4140,Manhunt: The 12-Day Chase for Lincoln's Killer
522290,5122,40,5,5122,Fear and Loathing on the Campaign Trail '72


Function to predict rating for a book based on users past ratings for similar books.

In [None]:
def predict_rating(user_id,book_id):
    return algo.predict(user_id, book_id,verbose=True)

In [16]:
# predict how the user would rate a particular book
user_id = 50
book_id = 1000
predict_rating(user_id, book_id)

Prediction(uid=50, iid=1000, r_ui=None, est=3.6377653378063295, details={'was_impossible': False})

We get an estimated user rating of 3.6 for this book.

##### Iten based approach

In [18]:
# filtered_ratings[filtered_ratings['title'].str.contains('^Twi.*')==True]

Unnamed: 0,book_id,user_id,rating,id,title
100,3,314,3,3,"Twilight (Twilight, #1)"
101,3,588,1,3,"Twilight (Twilight, #1)"
102,3,2077,2,3,"Twilight (Twilight, #1)"
103,3,2487,3,3,"Twilight (Twilight, #1)"
104,3,2900,3,3,"Twilight (Twilight, #1)"
...,...,...,...,...,...
773173,7837,47609,4,7837,Twilight and History
773174,7837,48206,4,7837,Twilight and History
773175,7837,48606,5,7837,Twilight and History
773176,7837,49916,4,7837,Twilight and History


Creating an Item-Item matrix based on ratings given by every user for every item.

In [19]:
bookmat = filtered_ratings.pivot_table(index='user_id', columns='title', values='rating')
bookmat.head()

title,"Angels (Walsh Family, #3)",#GIRLBOSS,'Salem's Lot,"'Tis (Frank McCourt, #2)","1,000 Places to See Before You Die",1/4 جرام,"10% Happier: How I Tamed the Voice in My Head, Reduced Stress Without Losing My Edge, and Found Self-Help That Actually Works","100 Bullets, Vol. 1: First Shot, Last Call",100 Love Sonnets,100 Selected Poems,...,أقوم قيلا,السنجة,الطنطورية,الفيل الأزرق,المانيفستو,ساق البامبو,صانع الظلام,فوضى الحواس,في قلبي أنثى عبرية,كخه يا بابا
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,
11,,,,,,,,,,,...,,,,,,,,,,


In [20]:
def get_similar(title, mat):
    title_user_ratings = mat[title]
    similar_to_title = mat.corrwith(title_user_ratings)
    corr_title = pd.DataFrame(similar_to_title, columns=['correlation'])
    corr_title.dropna(inplace=True)
    corr_title.sort_values('correlation', ascending=False, inplace=True)
    return corr_title

In [21]:
bookmat.head()

title,"Angels (Walsh Family, #3)",#GIRLBOSS,'Salem's Lot,"'Tis (Frank McCourt, #2)","1,000 Places to See Before You Die",1/4 جرام,"10% Happier: How I Tamed the Voice in My Head, Reduced Stress Without Losing My Edge, and Found Self-Help That Actually Works","100 Bullets, Vol. 1: First Shot, Last Call",100 Love Sonnets,100 Selected Poems,...,أقوم قيلا,السنجة,الطنطورية,الفيل الأزرق,المانيفستو,ساق البامبو,صانع الظلام,فوضى الحواس,في قلبي أنثى عبرية,كخه يا بابا
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,
11,,,,,,,,,,,...,,,,,,,,,,


In [23]:
title = "Twilight (Twilight, #1)"
smlr = get_similar(title, bookmat)

In [24]:
smlr.head(10)

Unnamed: 0_level_0,correlation
title,Unnamed: 1_level_1
god is Not Great: How Religion Poisons Everything,1.0
Swamplandia!,1.0
Bad Feminist,1.0
"Persepolis, Volume 1",1.0
Ash,1.0
Run,1.0
"Anna and the French Kiss (Anna and the French Kiss, #1)",1.0
Children of Dune (Dune Chronicles #3),1.0
Skipping Christmas,1.0
"Splintered (Splintered, #1)",1.0


In [25]:
# filter by rating count
smlr = smlr.join(books_data.set_index('title')['ratings_count'])
smlr.head()

Unnamed: 0_level_0,correlation,ratings_count
title,Unnamed: 1_level_1,Unnamed: 2_level_1
'Salem's Lot,0.25,72797
'Salem's Lot,0.25,72797
11/22/63,0.431331,258464
"13 Little Blue Envelopes (Little Blue Envelope, #1)",-0.5,66950
1776,0.301511,130293


Filtering out the books with more than 500000 ratings.

In [26]:
smlr[smlr.ratings_count > 5e5].sort_values('correlation', ascending=False).head(10)

Unnamed: 0_level_0,correlation,ratings_count
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"Twilight (Twilight, #1)",1.0,3866839
"The Selection (The Selection, #1)",0.866025,505340
"Eclipse (Twilight, #3)",0.857845,1134511
"Me Before You (Me Before You, #1)",0.771845,587647
"Matched (Matched, #1)",0.707029,511815
"Breaking Dawn (Twilight, #4)",0.689029,1070245
"City of Bones (The Mortal Instruments, #1)",0.654081,1154031
The Perks of Being a Wallflower,0.574701,888806
"Fifty Shades of Grey (Fifty Shades, #1)",0.568323,1338493
"Eragon (The Inheritance Cycle, #1)",0.53995,1104021


In [27]:
!jupyter nbconvert collaborative_Lavesh_Jain*.ipynb --to python

[NbConvertApp] Converting notebook collaborative_Lavesh_Jain.ipynb to python
[NbConvertApp] Writing 3470 bytes to collaborative_Lavesh_Jain.py
