In [1]:
import pandas as pd
import numpy as np
from copy import copy
from surprise import Dataset
from surprise import Reader
from surprise import KNNWithMeans, SVD
from surprise.model_selection import GridSearchCV
from surprise.model_selection import train_test_split
from surprise import accuracy
from collections import defaultdict

In [2]:
%run import_data_.py

Continuing with existing version of data folder
Goodreads dataset loaded successfully as books_goodreads
Pandas dataframes (books_goodreads, books_big, book, users, ratings) loaded successfully
Columns in DataFrames 'users' and 'ratings' renamed
You can use the DataFrames 'books' or 'books_big' - they are exactly the same (big) dataset
loading books_ratings and books_users_ratings
Ready to go!


#### We want to find users with similar taste in books and try to predict what we will like based on what they like ####

##### step 1: what do I like? #####

In [3]:
# !jupyter nbconvert --to notebook --execute --inplace --no-input test_content_rec_Susanne_Lay.ipynb

In [5]:
data = pd.read_csv('../data/books_ratings.csv')
data.head()

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher,genre,user_id,book_rating
0,068160204X,The Royals,Kitty Kelley,2020,Bausch & Lombard,,16634,0
1,068160204X,The Royals,Kitty Kelley,2020,Bausch & Lombard,,87141,0
2,068160204X,The Royals,Kitty Kelley,2020,Bausch & Lombard,,169736,7
3,068160204X,The Royals,Kitty Kelley,2020,Bausch & Lombard,,208406,0
4,068160204X,The Royals,Kitty Kelley,2020,Bausch & Lombard,,230496,0


In [6]:
average_rating = pd.read_csv('../data/averageRatingdf.csv')
average_rating.head()

Unnamed: 0,book_title,book_author,isbn,rating_count,average_rating,mod_titles
0,!%@ (A Nutshell handbook),Donnalyn Frey,1565920317,1,6.0,A Nutshell handbook
1,!%@ (A Nutshell handbook),Donnalyn Frey,1565920465,1,0.0,A Nutshell handbook
2,$30 Film School,Michael W. Dean,1592000673,1,8.0,30 Film School
3,$oft Money: The True Power in Our Nation's Cap...,E. L. Burton,1588204030,5,5.4,oft Money The True Power in Our Nation s Cap...
4,' Mein verwundetes Herz'. Das Leben der Lilli ...,Martin Doerry,342105634X,1,9.0,Mein verwundetes Herz Das Leben der Lilli ...


In [7]:
data['user_id'] = data['user_id'].astype('str') 

In [8]:
# filtering user_id who read and rated more than 15 books
user_counts = data['user_id'].value_counts()
data_users = data[data['user_id'].isin(user_counts[user_counts >= 15].index)]
data_users.sample(10)

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher,genre,user_id,book_rating
72116,042517400X,"Night Moves (Tom Clancy's Net Force, No. 3)",Tom Clancy,2000,Berkley Publishing Group,"Computer crimes, Fiction, Investigation, Compu...",76352,0
17674,184023492X,Tank Girl 2,Jamie Hewlett,2003,Titan Books (UK),"Comic books, strips, Deserts, Punk culture, Ta...",76626,0
114172,1573225789,The Color of Water: A Black Man's Tribute to H...,James McBride,1997,Riverhead Books,"Mulattoes, Race identity, Mothers, Racially mi...",35859,0
102385,155166304X,Here And Then,Linda Lael Miller,1997,Mira,"Fiction, Romance, Contemporary, Time Travel, F...",226545,0
45603,8420443107,Con Animo De Ofender,Arturo Perez-Reverte,2001,"Alfaguara Ediciones, S.A. (Spain)","Literature, Humanities, Editors, Felietony his...",39467,8
141421,1560769319,Tales of Ravenloft (Ravenloft),Brian Thomsen,1994,Wizards of the Coast (Mm),,212898,0
23367,1551669234,Stonebrook Cottage,Carla Neggers,2002,Mira,"Detective and mystery stories, Fiction, romanc...",246655,0
10890,031298328X,Full Tilt (Janet Evanovich's Full Series),Janet Evanovich,2003,St. Martin's Paperbacks,"Max Holt (Fictitious character), Detective and...",238781,0
87457,1568957602,Eye of the Beholder (Wheeler Large Print Book ...,Jayne Ann Krentz,1999,Wheeler Publishing,"Fiction, New Age movement, Women art dealers, ...",197775,0
15599,184068108X,Bad Wisdom,Bill Drummond,2003,Creation Books,"Polar regions, description and travel, North pole",170513,0


In [10]:
# creating list of books for one reference user

books_read = data[data['user_id'] == '114368']
books_read 

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher,genre,user_id,book_rating
221,042517140X,Loyalty in Death,J. D. Robb,2004,Berkley Publishing Group,"Fiction, Eve Dallas (Fictitious character), Po...",114368,5
2034,037379133X,Forbidden (Harlequin Blaze),Tori Carrington,2004,Harlequin,"Fiction, Passion, Romance, Large type books, C...",114368,0
2161,037383599X,Simply Sensual (Harlequin Single Title),Carly Phillips,2004,Harlequin,"Fiction, Bodyguards, Children of the rich in f...",114368,5
4208,1551667401,Out Of The Dark,Sharon Sala,2003,Mira,"Romance, Cults, Missing children, Ex-cultists,...",114368,5
4273,1551667436,Strange Bedpersons,Jennifer Crusie,2003,Mira,"Social action, Feminists, Conservatism, Ambiti...",114368,0
...,...,...,...,...,...,...,...,...
166713,044011585X,Crossings,DANIELLE STEEL,1987,Dell,"Fiction, World War, 1939-1945, Ocean travel, L...",114368,10
167136,037308529X,"Temptation (Silhouette Romance, No 529)",Nora Roberts,1987,Silhouette,"Open Library Staff Picks, Fiction, Camps for g...",114368,5
168482,082171838X,September Moon,Constance O'Banyon,1986,Zebra Books,,114368,5
170307,037309230X,Sarah's Child (Silhouette Special Edition # 230),Linda Howard,1985,Silhouette,"Romance, Contemporary Romance, Fiction, Infant...",114368,5


In [11]:
# find similar users

In [12]:
# create a set of all the unique books that user has read
# create a set of all the unique books that user has read, including user_id

unique_books = set(books_read['isbn'])

In [13]:
len(unique_books)

100

In [14]:
# store any user that read the same book as us in the overlap_users dictionary
overlap_users = {}

# overlap_users is a dictionary with key = user_id and value is count of times that reader read a book that we also read  
for book in unique_books:
    for user in data[data['isbn'] == book]['user_id']:
        if user != '114368':
            if user in overlap_users:
                overlap_users[user] += 1
            else:
                overlap_users[user] = 1


In [15]:
len(overlap_users)  

1567

In [16]:
overlap_users

{'31742': 1,
 '69042': 4,
 '76352': 19,
 '98104': 1,
 '98547': 4,
 '118275': 4,
 '118627': 1,
 '175003': 14,
 '184532': 3,
 '204591': 7,
 '212898': 8,
 '213998': 1,
 '227250': 6,
 '227520': 10,
 '228764': 13,
 '242006': 13,
 '242781': 1,
 '246671': 5,
 '247447': 12,
 '253821': 3,
 '264634': 2,
 '265889': 13,
 '269386': 1,
 '275481': 2,
 '278144': 1,
 '8245': 1,
 '14374': 1,
 '17282': 4,
 '17724': 1,
 '30972': 3,
 '31315': 4,
 '39616': 2,
 '40889': 8,
 '43006': 1,
 '43842': 5,
 '51350': 3,
 '60168': 1,
 '69697': 34,
 '77940': 16,
 '78783': 16,
 '79724': 1,
 '95991': 1,
 '107021': 9,
 '108480': 4,
 '111261': 2,
 '114414': 4,
 '115002': 2,
 '116599': 10,
 '119575': 3,
 '123388': 1,
 '123883': 15,
 '125063': 1,
 '126693': 3,
 '128738': 1,
 '128835': 4,
 '129084': 3,
 '130474': 10,
 '140293': 1,
 '145451': 16,
 '145619': 9,
 '151098': 3,
 '151537': 1,
 '151806': 1,
 '153718': 4,
 '156688': 1,
 '161903': 2,
 '168387': 2,
 '170575': 12,
 '177374': 9,
 '180917': 2,
 '185233': 12,
 '185254': 3,

In [17]:
print(overlap_users)

{'31742': 1, '69042': 4, '76352': 19, '98104': 1, '98547': 4, '118275': 4, '118627': 1, '175003': 14, '184532': 3, '204591': 7, '212898': 8, '213998': 1, '227250': 6, '227520': 10, '228764': 13, '242006': 13, '242781': 1, '246671': 5, '247447': 12, '253821': 3, '264634': 2, '265889': 13, '269386': 1, '275481': 2, '278144': 1, '8245': 1, '14374': 1, '17282': 4, '17724': 1, '30972': 3, '31315': 4, '39616': 2, '40889': 8, '43006': 1, '43842': 5, '51350': 3, '60168': 1, '69697': 34, '77940': 16, '78783': 16, '79724': 1, '95991': 1, '107021': 9, '108480': 4, '111261': 2, '114414': 4, '115002': 2, '116599': 10, '119575': 3, '123388': 1, '123883': 15, '125063': 1, '126693': 3, '128738': 1, '128835': 4, '129084': 3, '130474': 10, '140293': 1, '145451': 16, '145619': 9, '151098': 3, '151537': 1, '151806': 1, '153718': 4, '156688': 1, '161903': 2, '168387': 2, '170575': 12, '177374': 9, '180917': 2, '185233': 12, '185254': 3, '194646': 1, '199494': 1, '200226': 9, '203805': 2, '206074': 4, '2129

In [18]:
unique_books

{'006104122X',
 '006108445X',
 '006440031X',
 '034538475X',
 '037307445X',
 '037307607X',
 '037307817X',
 '037308529X',
 '037309230X',
 '037322365X',
 '037325928X',
 '037348240X',
 '037348285X',
 '037348397X',
 '037348464X',
 '037351011X',
 '037369105X',
 '037379018X',
 '037379021X',
 '037379035X',
 '037379049X',
 '037379097X',
 '037379102X',
 '037379116X',
 '037379133X',
 '037383568X',
 '037383599X',
 '038076654X',
 '038079456X',
 '038531292X',
 '038531437X',
 '038531695X',
 '038542471X',
 '038550120X',
 '042517140X',
 '042518630X',
 '044011585X',
 '044652767X',
 '044900256X',
 '050552354X',
 '050552421X',
 '051511779X',
 '051512317X',
 '051512608X',
 '051512947X',
 '051513287X',
 '051513628X',
 '055328990X',
 '055329783X',
 '055344557X',
 '055356045X',
 '055358457X',
 '067164257X',
 '067168289X',
 '067172858X',
 '067179938X',
 '067942573X',
 '073943053X',
 '073943232X',
 '080411952X',
 '082171838X',
 '082171922X',
 '082177512X',
 '084394563X',
 '084394952X',
 '084395048X',
 '15516600

In [19]:
import pandas as pd

# Ensure books_read is a DataFrame
if not isinstance(unique_books, pd.DataFrame):
    unique_books = pd.DataFrame(unique_books, columns=["isbn"])

# Print overlap_users dictionary
print("overlap_users:", overlap_users)

# Print books_read shape
print("books_read.shape:", books_read.shape)

# Print intermediate results
for k in overlap_users:
    print(f"User: {k}, Count: {overlap_users[k]}, Threshold: {unique_books.shape[0] / 5}")

# Filter users who have read at least 20% of the same books
filtered_overlap_users = {k for k in overlap_users if overlap_users[k] > unique_books.shape[0] / 5}

# Print filtered_overlap_users
print("filtered_overlap_users:", filtered_overlap_users)

overlap_users: {'31742': 1, '69042': 4, '76352': 19, '98104': 1, '98547': 4, '118275': 4, '118627': 1, '175003': 14, '184532': 3, '204591': 7, '212898': 8, '213998': 1, '227250': 6, '227520': 10, '228764': 13, '242006': 13, '242781': 1, '246671': 5, '247447': 12, '253821': 3, '264634': 2, '265889': 13, '269386': 1, '275481': 2, '278144': 1, '8245': 1, '14374': 1, '17282': 4, '17724': 1, '30972': 3, '31315': 4, '39616': 2, '40889': 8, '43006': 1, '43842': 5, '51350': 3, '60168': 1, '69697': 34, '77940': 16, '78783': 16, '79724': 1, '95991': 1, '107021': 9, '108480': 4, '111261': 2, '114414': 4, '115002': 2, '116599': 10, '119575': 3, '123388': 1, '123883': 15, '125063': 1, '126693': 3, '128738': 1, '128835': 4, '129084': 3, '130474': 10, '140293': 1, '145451': 16, '145619': 9, '151098': 3, '151537': 1, '151806': 1, '153718': 4, '156688': 1, '161903': 2, '168387': 2, '170575': 12, '177374': 9, '180917': 2, '185233': 12, '185254': 3, '194646': 1, '199494': 1, '200226': 9, '203805': 2, '20

In [20]:
filtered_overlap_users  

{'11676', '129358', '153662', '69697'}

In [21]:
# get the isbn and ratings of the books that these users have read

# Create a dataframe that contains all user_ids that are in filtered_overlap_users and add isbn and book_rating
filtered_overlap_data = data[data['user_id'].isin(filtered_overlap_users)][['user_id', 'isbn', 'book_rating']]
filtered_overlap_data

Unnamed: 0,user_id,isbn,book_rating
9,153662,1593100175,0
10,11676,1881273156,8
123,153662,1583145397,0
141,129358,084395289X,0
208,11676,042517140X,0
...,...,...,...
183298,11676,039480001X,10
183451,153662,030702153X,9
183454,11676,006440031X,7
183566,11676,044077456X,10


### For collaborative filtering: create a user / book matrix ###

In [22]:
unique_books

Unnamed: 0,isbn
0,067179938X
1,051511779X
2,1551660504
3,051512947X
4,084394952X
...,...
95,037379097X
96,1551662469
97,037379035X
98,082171838X


In [23]:
# every row if the matrix will be a different user and every column of the matrix will be a different book containing teh rating of that user for that book

# first we need to concatenate the ratings of the picked user (id = 114368) with the filtered_overlap_data

# Create a DataFrame with the ratings of the picked user

# Filter the original DataFrame by user_id
filtered_data = data[data['user_id'] == '114368']

# Merge unique_books with the filtered data to add 'book_rating' and 'user_id' columns
unique_books = unique_books.merge(filtered_data[['isbn','user_id', 'book_rating', 'book_title']], on='isbn', how='left')

unique_books

Unnamed: 0,isbn,user_id,book_rating,book_title
0,067179938X,114368,5,Son of the Morning
1,051511779X,114368,10,Born in Shame
2,1551660504,114368,10,Skin Deep
3,051512947X,114368,10,Once upon a Dream
4,084394952X,114368,5,Dark Legend
...,...,...,...,...
95,037379097X,114368,0,"Friendly Persuasion (Blaze, Book 93)"
96,1551662469,114368,5,Mackenzies
97,037379035X,114368,0,"BODY CONTACT (Blaze, 31)"
98,082171838X,114368,5,September Moon


In [24]:
# concat the unique_books and filtered_overlap_data
filtered_overlap_data = pd.concat([unique_books, filtered_overlap_data])
filtered_overlap_data

Unnamed: 0,isbn,user_id,book_rating,book_title
0,067179938X,114368,5,Son of the Morning
1,051511779X,114368,10,Born in Shame
2,1551660504,114368,10,Skin Deep
3,051512947X,114368,10,Once upon a Dream
4,084394952X,114368,5,Dark Legend
...,...,...,...,...
183298,039480001X,11676,10,
183451,030702153X,153662,9,
183454,006440031X,11676,7,
183566,044077456X,11676,10,


In [25]:
filtered_overlap_data['book_rating'] = pd.to_numeric(filtered_overlap_data['book_rating'])

In [26]:
filtered_overlap_data['user_id'].unique()

array(['114368', '153662', '11676', '129358', '69697'], dtype=object)

In [27]:
# create a user_index for each user_id
# category function creates for each user_id a category and then we can use cat.codes to get the index of the user

filtered_overlap_data['user_index'] = filtered_overlap_data['user_id'].astype('category').cat.codes
filtered_overlap_data.iloc[0]

isbn                   067179938X
user_id                    114368
book_rating                     5
book_title     Son of the Morning
user_index                      0
Name: 0, dtype: object

In [28]:
len(filtered_overlap_data['user_index'].unique())

5

In [29]:
# do the same thing with the isbn
filtered_overlap_data['isbn_index'] = filtered_overlap_data['isbn'].astype('category').cat.codes
filtered_overlap_data.iloc[0]

isbn                   067179938X
user_id                    114368
book_rating                     5
book_title     Son of the Morning
user_index                      0
isbn_index                   1008
Name: 0, dtype: object

In [30]:
len(filtered_overlap_data['isbn_index'].unique())

3802

In [31]:
# creating a sparse matrix (doesn't take any memory if there is no rating for a book)

from scipy.sparse import coo_matrix

ratings_coo_mat = coo_matrix((filtered_overlap_data['book_rating'], (filtered_overlap_data['user_index'], filtered_overlap_data['isbn_index'])))

In [32]:
ratings_coo_mat

<5x3802 sparse matrix of type '<class 'numpy.int64'>'
	with 4220 stored elements in COOrdinate format>

In [33]:
print(ratings_coo_mat.row)      # Array of row indices of non-zero values
print(ratings_coo_mat.col)      # Array of column indices of non-zero values
print(ratings_coo_mat.data)     # Array of non-zero values

[0 0 0 ... 1 1 1]
[1008  774 1399 ...   77  694 2422]
[ 5 10 10 ...  7 10 10]


In [34]:
ratings_mat = ratings_coo_mat.tocsr()

In [35]:
filtered_overlap_data[filtered_overlap_data['user_id'] == '114368']

Unnamed: 0,isbn,user_id,book_rating,book_title,user_index,isbn_index
0,067179938X,114368,5,Son of the Morning,0,1008
1,051511779X,114368,10,Born in Shame,0,774
2,1551660504,114368,10,Skin Deep,0,1399
3,051512947X,114368,10,Once upon a Dream,0,790
4,084394952X,114368,5,Dark Legend,0,1256
...,...,...,...,...,...,...
95,037379097X,114368,0,"Friendly Persuasion (Blaze, Book 93)",0,424
96,1551662469,114368,5,Mackenzies,0,1422
97,037379035X,114368,0,"BODY CONTACT (Blaze, 31)",0,420
98,082171838X,114368,5,September Moon,0,1179


In [36]:
# setting the chosen user_index to 'my_index'
my_index = 0

In [37]:
# use cosine similarity to see how similar each user is to us

from sklearn.metrics.pairwise import cosine_similarity

similarity = cosine_similarity(ratings_mat[my_index, :], ratings_mat).flatten()

In [38]:
# how similar are we to ourself?

similarity[my_index]

0.9999999999999981

In [39]:
#how similar are we to the other users?
similarity[2]

0.05096051011181266

In [40]:
# find indices (= position) for 4 users most similar to us
import numpy as np

indices = np.argpartition(similarity, -4)[-4:]

In [41]:
indices

array([1, 0, 3, 4])

In [42]:
# find the user_id for the 4 most similar users

similar_users = filtered_overlap_data[filtered_overlap_data['user_index'].isin(indices)].copy()

In [43]:
#takes ourself out of the list
similar_users = similar_users[similar_users['user_id'] != '114368']


In [44]:
book_recs = similar_users.groupby('isbn').book_rating.agg(['count', 'mean'])

In [45]:
book_recs  

Unnamed: 0_level_0,count,mean
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1
000225929X,1,10.0
000617891X,1,0.0
000648302X,1,8.0
000649840X,1,8.0
002411510X,1,8.0
...,...,...
9879397274,1,7.0
987960170X,1,5.0
9974643058,1,7.0
9997511417,1,0.0


In [46]:
# include book_titles in book_recs
# Drop duplicates in the data DataFrame based on the 'isbn' column
unique_isbn = average_rating[['isbn', 'book_title', 'rating_count', 'mod_titles']].drop_duplicates(subset='isbn')

# Merge book_recs with the unique_data to add 'book_title' column
book_recs = book_recs.merge(unique_isbn, on='isbn', how='inner')

In [47]:
book_recs

Unnamed: 0,isbn,count,mean,book_title,rating_count,mod_titles
0,000225929X,1,10.0,MARBLE HEART,4,MARBLE HEART
1,000617891X,1,0.0,At the Stroke of Twelve,2,At the Stroke of Twelve
2,000648302X,1,8.0,Before and After,14,Before and After
3,000649840X,1,8.0,Angelas Ashes,89,Angelas Ashes
4,002411510X,1,8.0,Walden Two (Trade Book),13,Walden Two Trade Book
...,...,...,...,...,...,...
3592,9879397274,1,7.0,Desde Mi Cielo,6,Desde Mi Cielo
3593,987960170X,1,5.0,Kolon,2,Kolon
3594,9974643058,1,7.0,La piel dura,2,La piel dura
3595,9997511417,1,0.0,A Bundle for the Toff,2,A Bundle for the Toff


### Ranking book rocemmendations ###

#### create adjusted book count, e.g. normalized for the count books appeared among people like us vs the rest of people ####

In [48]:
# Find books that are specifically tailored to our taste
book_recs['adjusted_count'] = book_recs['count'] * (book_recs['count'] / book_recs['rating_count'])

In [49]:
# how many times was the book recommended by users like us?
book_recs['score'] = book_recs['mean'] * book_recs['adjusted_count']

In [50]:
book_recs.head()

Unnamed: 0,isbn,count,mean,book_title,rating_count,mod_titles,adjusted_count,score
0,000225929X,1,10.0,MARBLE HEART,4,MARBLE HEART,0.25,2.5
1,000617891X,1,0.0,At the Stroke of Twelve,2,At the Stroke of Twelve,0.5,0.0
2,000648302X,1,8.0,Before and After,14,Before and After,0.071429,0.571429
3,000649840X,1,8.0,Angelas Ashes,89,Angelas Ashes,0.011236,0.089888
4,002411510X,1,8.0,Walden Two (Trade Book),13,Walden Two Trade Book,0.076923,0.615385


In [51]:
# take out books that we have already read
book_recs = book_recs[~book_recs['isbn'].isin(unique_books['isbn'])]    

In [52]:
# removing duplicate titles
book_recs = book_recs[~book_recs['book_title'].isin(unique_books['book_title'])]

### Adjusting the amount of similar users and mean in the following 2 cells affects how many recommendations we find:

In [53]:
# at least 1 user similar to us have to have read the book
book_recs = book_recs[book_recs['count'] > 1]

In [54]:
book_recs = book_recs[book_recs['mean'] > 1]

In [55]:
top_10_recs = book_recs.sort_values('score', ascending=False).head(10)  
top_10_recs

Unnamed: 0,isbn,count,mean,book_title,rating_count,mod_titles,adjusted_count,score
938,067981485X,2,5.5,Teenage Mutant Ninja Turtles: Don't do drugs :...,2,Teenage Mutant Ninja Turtles Don t do drugs ...,2.0,11.0
1120,084232044X,2,4.0,Kerry (Grace Livingston Hill),2,Kerry Grace Livingston Hill,2.0,8.0
253,037307977X,2,8.5,Heart Of Midnight (Silhouette Intimate Moments...,5,Heart Of Midnight Silhouette Intimate Moments...,0.8,6.8
1981,1578565731,2,8.5,Boo,5,Boo,0.8,6.8
1952,1576737659,2,4.0,It Had to Be You,3,It Had to Be You,1.333333,5.333333
1708,1562470310,2,5.0,"Happy Birthday, Felicity! A Springtime Story, ...",4,Happy Birthday Felicity A Springtime Story ...,1.0,5.0
1983,1578566436,2,4.5,Three Roads Home: Stories of First Love & Seco...,4,Three Roads Home Stories of First Love Seco...,1.0,4.5
1607,1558530908,2,5.0,New England Ghosts (American Ghosts Series),5,New England Ghosts American Ghosts Series,0.8,4.0
1395,1551667045,2,5.5,Down By The River,6,Down By The River,0.666667,3.666667
1825,1569870225,2,2.5,The Tale of Mr. Jeremy Fisher (Peter Rabbit an...,3,The Tale of Mr Jeremy Fisher Peter Rabbit an...,1.333333,3.333333
