#### Collaborative Filtering

In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [2]:
# books_data = pd.read_csv('../data/books_data.csv')
books_data = pd.read_csv('../../data_preprocessing/books_data.csv')
ratings_data = pd.read_csv('../../data/books_ratings_data.csv')
book_tags_data = pd.read_csv('../../data/book_tags_data.csv')
tags_data = pd.read_csv('../../data/tags_data.csv')

In [3]:
# Added to remove unnamed column from preprocessed dataset

books_data.dropna(inplace=True)

In [4]:
# Cleaning 'original_publication_year' -> move to preprocessing later
books_data['original_publication_year'] = books_data['original_publication_year'].fillna(-1).apply(lambda x: int(x) if x != -1 else -1)

In [5]:
# Removing duplicate ratings
unique_ratings = ratings_data.drop_duplicates()

# removing users with less than 4 ratings
unwanted_users = unique_ratings.groupby('user_id')['user_id'].count()
unwanted_users = unwanted_users[unwanted_users < 4]
unwanted_ratings = unique_ratings[unique_ratings.user_id.isin(unwanted_users.index)]
filtered_ratings = unique_ratings.drop(unwanted_ratings.index)

In [6]:
books_data.head(2)

Unnamed: 0.1,Unnamed: 0,id,best_book_id,work_id,books_count,isbn13,original_publication_year,title,language_code,average_rating,...,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url,authors,summary
0,0,1,2767052,2792775,272,9780439023480,2008,the hunger games the hunger games,eng,4.34,...,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,Suzanne Collins,"In a future North America, where the rulers of..."
3,2,3,41865,3212258,226,9780316015840,2005,twilight twilight,en-US,3.57,...,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...,Stephenie Meyer,With 160 million copies of the Twilight Saga s...


In [7]:
filtered_ratings.head()

Unnamed: 0,book_id,user_id,rating
0,1,314,5
1,1,439,3
2,1,588,5
3,1,1169,4
4,1,1185,4


In [15]:
filtered_ratings = pd.merge(filtered_ratings, books_data[['id','title']], how='inner', left_on='book_id', right_on='id')

In [16]:
filtered_ratings.head()

Unnamed: 0,book_id,user_id,rating,id,title
0,1,314,5,1,the hunger games the hunger games
1,1,439,3,1,the hunger games the hunger games
2,1,588,5,1,the hunger games the hunger games
3,1,1169,4,1,the hunger games the hunger games
4,1,1185,4,1,the hunger games the hunger games


##### User based approach

In [17]:
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

In [18]:
# laod dataset to surprise from pandas dataframe - filtered_ratings
reader = Reader()
data = Dataset.load_from_df(filtered_ratings[['user_id', 'book_id', 'rating']], reader)

In [19]:
# perform a 5 fold cross validation
algo = SVD()
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv =5)

{'test_rmse': array([0.73908068, 0.7398235 , 0.7398254 , 0.74180835, 0.738603  ]),
 'test_mae': array([0.55332681, 0.55373871, 0.55423842, 0.55531341, 0.55279923]),
 'fit_time': (53.35231685638428,
  53.92302703857422,
  54.86458230018616,
  52.98295974731445,
  58.633641719818115),
 'test_time': (3.274871349334717,
  2.5828487873077393,
  3.4083523750305176,
  3.2898521423339844,
  3.5534040927886963)}

In [20]:
# Build and fit model on trainset
trainset = data.build_full_trainset()
algo.fit(trainset);

In [32]:
filtered_ratings[filtered_ratings['user_id'] == 40]

Unnamed: 0,book_id,user_id,rating,id,title
283780,2707,40,4,2707,boomerang travels in the new third world
328477,3123,40,3,3123,confessions of an economic hit man
346798,3324,40,5,3324,one more thing stories and other stories
391022,3722,40,4,3722,the story of the lost child the neapolitan novels
391023,3722,40,4,3722,the story of the lost child the neapolitan novels
416716,3961,40,4,3961,dept of speculation
419204,3985,40,4,3985,love is a mix tape
422578,4020,40,3,4020,a hologram for the king
434210,4140,40,4,4140,manhunt the day chase for lincolns killer
535456,5122,40,5,5122,fear and loathing on the campaign trail


In [22]:
algo.predict(50, 1000)

Prediction(uid=50, iid=1000, r_ui=None, est=4.209903103983639, details={'was_impossible': False})

##### Iten based approach

In [23]:
filtered_ratings[filtered_ratings['title'].str.contains('^twiligh.*')==True]

Unnamed: 0,book_id,user_id,rating,id,title
100,3,314,3,3,twilight twilight
101,3,588,1,3,twilight twilight
102,3,2077,2,3,twilight twilight
103,3,2487,3,3,twilight twilight
104,3,2900,3,3,twilight twilight
...,...,...,...,...,...
798180,7837,47609,4,7837,twilight and history
798181,7837,48206,4,7837,twilight and history
798182,7837,48606,5,7837,twilight and history
798183,7837,49916,4,7837,twilight and history


In [24]:
bookmat = filtered_ratings.pivot_table(index='user_id', columns='title', values='rating')
bookmat.head()

title,a bad case of stripes,a bargain for frances,a bear called paddington paddington,a beautiful dark a beautiful dark,a beautiful mind,a beautiful wedding beautiful,a bend in the river,a bite to remember argeneau,a book of five rings the classic guide to strategy,a breath of snow and ashes outlander,...,zen mind beginners mind informal talks on zen meditation and practice,zero day john puller,zero history blue ant,zero to one notes on startups or how to build the future,zita the spacegirl zita the spacegirl,zodiac,zoes tale old mans war,zone one,zoo,zorba the greek
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,4.0,,,,
9,,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,
11,,,,,,,,,,,...,,,,,,,,,,


In [25]:
def get_similar(title, mat):
    title_user_ratings = mat[title]
    similar_to_title = mat.corrwith(title_user_ratings)
    corr_title = pd.DataFrame(similar_to_title, columns=['correlation'])
    corr_title.dropna(inplace=True)
    corr_title.sort_values('correlation', ascending=False, inplace=True)
    return corr_title

In [26]:
bookmat.head()

title,a bad case of stripes,a bargain for frances,a bear called paddington paddington,a beautiful dark a beautiful dark,a beautiful mind,a beautiful wedding beautiful,a bend in the river,a bite to remember argeneau,a book of five rings the classic guide to strategy,a breath of snow and ashes outlander,...,zen mind beginners mind informal talks on zen meditation and practice,zero day john puller,zero history blue ant,zero to one notes on startups or how to build the future,zita the spacegirl zita the spacegirl,zodiac,zoes tale old mans war,zone one,zoo,zorba the greek
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,4.0,,,,
9,,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,
11,,,,,,,,,,,...,,,,,,,,,,


In [27]:
bookmat['twilight twilight']

user_id
5       NaN
7       NaN
9       NaN
10      NaN
11      NaN
         ..
53419   NaN
53420   NaN
53421   NaN
53422   NaN
53424   NaN
Name: twilight twilight, Length: 39125, dtype: float64

In [28]:
title = "twilight twilight"
smlr = get_similar(title, bookmat)

In [29]:
smlr.head(10)

Unnamed: 0_level_0,correlation
title,Unnamed: 1_level_1
lean in women work and the will to lead,1.0
the world is flat a brief history of the twenty first century,1.0
ash,1.0
the autobiography of malcolm x,1.0
the secret diary of adrian mole aged adrian mole,1.0
everything i never told you,1.0
swamplandia,1.0
fablehaven fablehaven,1.0
fables vol legends in exile,1.0
stolen songbird the malediction trilogy,1.0


In [30]:
# filter by rating count
smlr = smlr.join(books_data.set_index('title')['ratings_count'])
smlr.head()

Unnamed: 0_level_0,correlation,ratings_count
title,Unnamed: 1_level_1,Unnamed: 2_level_1
a bad case of stripes,0.5,51764
a bear called paddington paddington,-1.0,62590
a bear called paddington paddington,-1.0,62590
a breath of snow and ashes outlander,0.080064,87098
a brief history of time,0.302148,165628


In [31]:
smlr[smlr.ratings_count > 5e5].sort_values('correlation', ascending=False).head(10)

Unnamed: 0_level_0,correlation,ratings_count
title,Unnamed: 1_level_1,Unnamed: 2_level_1
twilight twilight,1.0,3866839
the selection the selection,0.866025,505340
eclipse twilight,0.857845,1134511
me before you me before you,0.771845,587647
matched matched,0.707029,511815
breaking dawn twilight,0.689029,1070245
city of bones the mortal instruments,0.654081,1154031
the perks of being a wallflower,0.574701,888806
fifty shades of grey fifty shades,0.568323,1338493
eragon the inheritance cycle,0.53995,1104021
