In [None]:
# run on colab
!pip install surprise

In [None]:
import pandas as pd
import numpy as np
from copy import copy
from surprise import Dataset
from surprise import Reader
from surprise import KNNWithMeans, SVD
from surprise.model_selection import GridSearchCV
from surprise.model_selection import train_test_split
from surprise import accuracy
from collections import defaultdict

#### We want to find users with similar taste in books and try to predict what we will like based on what they like ####

##### step 1: what do I like? #####

In [None]:
# Load the ratings table. Later cells read its 'user_id', 'isbn', 'book_rating'
# and 'book_title' columns.
data = pd.read_csv('data/books_ratings.csv')
data.head()

In [None]:
# Load the average-rating table. The recommendation step further down reads its
# 'isbn', 'book_title', 'rating_count' and 'mod_titles' columns.
average_rating = pd.read_csv('data/AverageRatingdf.csv')
average_rating.head()

In [None]:
# Store user_id as str: every later cell compares it against the string literal '114368'
data['user_id'] = data['user_id'].astype('str') 

In [None]:
# Keep only the ratings of users who rated at least 15 books (the filter is >= 15)
# NOTE(review): data_users is not referenced by the later cells visible here, which
# filter `data` directly - confirm whether they were meant to use data_users instead.
user_counts = data['user_id'].value_counts()
data_users = data[data['user_id'].isin(user_counts[user_counts >= 15].index)]
data_users.sample(10)

In [None]:
# Select every rating row that belongs to the reference user (id '114368')

is_reference_user = data['user_id'].eq('114368')
books_read = data.loc[is_reference_user]
books_read

In [None]:
# find similar users

In [None]:
# Collect the distinct ISBNs rated by the reference user into a set

unique_books = set(books_read['isbn'])

In [None]:
len(unique_books)

In [None]:
# overlap_users maps user_id -> number of rating rows that user has for a book we also rated
# (the reference user itself is excluded).
# A single vectorised filter + value_counts replaces the nested Python loop, which re-scanned
# the whole ratings table once for every book we read.
# The ISBNs are taken from books_read rather than from unique_books so that this cell still
# works when it is re-run after unique_books has been turned into a DataFrame further down.
my_isbns = books_read['isbn'].dropna()
shares_a_book = data['isbn'].isin(my_isbns) & (data['user_id'] != '114368')
overlap_users = data.loc[shares_a_book, 'user_id'].value_counts().to_dict()


In [None]:
len(overlap_users)  

In [None]:
overlap_users

In [None]:
print(overlap_users)

In [None]:
unique_books

In [None]:
# unique_books starts out as a set; later cells merge on it, so turn it into a one-column
# DataFrame. The isinstance guard keeps the cell safe to run more than once.
if not isinstance(unique_books, pd.DataFrame):
    unique_books = pd.DataFrame(list(unique_books), columns=["isbn"])

# Print books_read shape
print("books_read.shape:", books_read.shape)

# A user qualifies when their shared-book count is strictly greater than 20% of the number
# of books we rated. The threshold is computed once instead of once per user.
overlap_threshold = unique_books.shape[0] / 5
print("overlap threshold:", overlap_threshold)

filtered_overlap_users = {
    user for user, shared_count in overlap_users.items() if shared_count > overlap_threshold
}

# Print filtered_overlap_users
print("filtered_overlap_users:", filtered_overlap_users)

In [None]:
filtered_overlap_users  

In [None]:
# Pull the ratings of every user in filtered_overlap_users,
# restricted to the user_id, isbn and book_rating columns
from_overlap_user = data['user_id'].isin(filtered_overlap_users)
filtered_overlap_data = data.loc[from_overlap_user, ['user_id', 'isbn', 'book_rating']]
filtered_overlap_data








### For collaborative filtering: create a user / book matrix ###

In [None]:
unique_books

In [None]:
# Every row of the matrix will be a different user and every column a different book,
# holding the rating of that user for that book.

# First the ratings of the picked user (id = 114368) have to be put next to
# filtered_overlap_data, so build a DataFrame with that user's ratings.

# Filter the original DataFrame by user_id
filtered_data = data[data['user_id'] == '114368']

# Attach 'user_id', 'book_rating' and 'book_title' to the picked user's ISBNs.
# Merging from the bare, de-duplicated 'isbn' column (instead of from unique_books as-is)
# keeps this cell idempotent: on a re-run unique_books already carries those columns, and
# merging them a second time would create suffixed duplicates (user_id_x, user_id_y, ...)
# that break the cells below.
unique_books = (
    unique_books[['isbn']]
    .drop_duplicates()
    .merge(filtered_data[['isbn', 'user_id', 'book_rating', 'book_title']], on='isbn', how='left')
)

unique_books








In [None]:
# Stack the picked user's ratings (unique_books) on top of the overlapping users' ratings.
# Rows of user 114368 are removed from filtered_overlap_data first: on a fresh run there are
# none (that user is excluded from overlap_users), but when this cell is re-run they are
# already in there and would be appended a second time.
# ignore_index=True gives the combined frame a clean 0..n-1 index instead of repeated labels.
other_users_data = filtered_overlap_data[filtered_overlap_data['user_id'] != '114368']
filtered_overlap_data = pd.concat([unique_books, other_users_data], ignore_index=True)
filtered_overlap_data

In [None]:
# Make sure book_rating is numeric before it is used as the values of the ratings matrix
filtered_overlap_data['book_rating'] = pd.to_numeric(filtered_overlap_data['book_rating'])

In [None]:
filtered_overlap_data['user_id'].unique()

In [None]:
# Create a user_index for each user_id.
# Converting to 'category' gives every distinct user_id an integer code (cat.codes);
# that code is used as the row position of the user in the ratings matrix below.
# NOTE(review): pandas hands out the codes following the sorted order of the categories, so
# the reference user is presumably not guaranteed to get code 0 - confirm against my_index.

filtered_overlap_data['user_index'] = filtered_overlap_data['user_id'].astype('category').cat.codes
filtered_overlap_data.iloc[0]

In [None]:
len(filtered_overlap_data['user_index'].unique())

In [None]:
# Do the same thing with the isbn: each distinct ISBN gets an integer code that is used
# as the column position of the book in the ratings matrix below
# NOTE(review): a missing isbn would presumably get code -1 - confirm there are none.
filtered_overlap_data['isbn_index'] = filtered_overlap_data['isbn'].astype('category').cat.codes
filtered_overlap_data.iloc[0]

In [None]:
len(filtered_overlap_data['isbn_index'].unique())

In [None]:
# Creating a sparse matrix (only the ratings that exist are stored)

from scipy.sparse import coo_matrix

# Collapse the data to one value per (user, book) first. A COO matrix keeps duplicate
# coordinates, and the conversion to CSR in the next cell *sums* them, so a user with two
# rows for the same ISBN would otherwise end up with an inflated rating.
# Duplicates are averaged; when there are none the matrix is the same as before.
cell_ratings = (
    filtered_overlap_data
    .groupby(['user_index', 'isbn_index'], as_index=False)['book_rating']
    .mean()
)

# Spell the shape out instead of letting scipy infer it from the largest index seen
n_users = int(filtered_overlap_data['user_index'].max()) + 1
n_books = int(filtered_overlap_data['isbn_index'].max()) + 1

ratings_coo_mat = coo_matrix(
    (cell_ratings['book_rating'], (cell_ratings['user_index'], cell_ratings['isbn_index'])),
    shape=(n_users, n_books),
)

In [None]:
ratings_coo_mat

In [None]:
# Convert to CSR so that a single user's row can be sliced out (ratings_mat[my_index, :] below)
ratings_mat = ratings_coo_mat.tocsr()

In [None]:
filtered_overlap_data[filtered_overlap_data['user_id'] == '114368']

In [None]:
# Set 'my_index' to the matrix row of the chosen user.
# The row is looked up instead of being hard-coded to 0: user_index comes from category codes,
# which follow the sorted order of the user_id strings, so user '114368' only gets code 0 when
# no other user_id in the frame sorts before it. A wrong row here would silently compute every
# similarity against somebody else's ratings.
# .iloc[0] raises IndexError if the chosen user is missing from filtered_overlap_data.

my_index = int(
    filtered_overlap_data.loc[filtered_overlap_data['user_id'] == '114368', 'user_index'].iloc[0]
)

In [None]:
# Use cosine similarity to see how similar each user is to us:
# similarity[i] compares our row of the ratings matrix (row my_index) with row i.
# Books a user has no rating for are simply absent from the sparse matrix, i.e. count as 0.

from sklearn.metrics.pairwise import cosine_similarity

similarity = cosine_similarity(ratings_mat[my_index, :], ratings_mat).flatten()

In [None]:
# how similar are we to ourself?

similarity[my_index]

In [None]:
#how similar are we to the other users?
similarity[2]

In [None]:
# Find indices (= matrix rows) of the 4 users most similar to us
N_SIMILAR_USERS = 4

# Mask out our own row first: it would otherwise occupy one of the top slots, and since we
# are removed from the list afterwards only 3 real neighbours would be left.
neighbour_similarity = similarity.copy()
neighbour_similarity[my_index] = -np.inf

# np.argpartition raises when asked for more elements than exist, so cap the number of
# neighbours at the number of other users available.
n_neighbours = min(N_SIMILAR_USERS, len(neighbour_similarity) - 1)
if n_neighbours > 0:
    indices = np.argpartition(neighbour_similarity, -n_neighbours)[-n_neighbours:]
else:
    # nobody but ourselves in the matrix
    indices = np.array([], dtype=int)

In [None]:
indices

In [None]:
# Collect all rating rows that belong to the users picked in `indices`
# (.copy() so that later changes do not touch filtered_overlap_data)

is_similar_user = filtered_overlap_data['user_index'].isin(indices)
similar_users = filtered_overlap_data.loc[is_similar_user].copy()

In [None]:
# Take ourselves out of the list
not_me = similar_users['user_id'].ne('114368')
similar_users = similar_users.loc[not_me]


In [None]:
# Per ISBN: number of ratings from the similar users ('count') and their average ('mean')
ratings_by_isbn = similar_users.groupby('isbn')['book_rating']
book_recs = ratings_by_isbn.agg(['count', 'mean'])

In [None]:
book_recs  

In [None]:
# Attach book metadata (book_title, rating_count, mod_titles) to the recommendations.
# average_rating is reduced to its first row per isbn so the merge cannot multiply rows.
unique_isbn = average_rating.drop_duplicates(subset='isbn')[['isbn', 'book_title', 'rating_count', 'mod_titles']]

# Inner merge: ISBNs that are missing from average_rating drop out of book_recs
book_recs = pd.merge(book_recs, unique_isbn, on='isbn', how='inner')

In [None]:
book_recs

### Ranking book recommendations ###

#### Create an adjusted book count, i.e. the count normalized by how often a book appears among people like us versus among all readers ####

In [None]:
# Favour books that are specific to our taste: the count among similar users is scaled by
# the share which that count represents of the book's overall rating_count
share_of_all_ratings = book_recs['count'] / book_recs['rating_count']
book_recs = book_recs.assign(adjusted_count=book_recs['count'] * share_of_all_ratings)

In [None]:
# Ranking score: mean rating given by the similar users, weighted by adjusted_count
book_recs = book_recs.assign(score=book_recs['mean'] * book_recs['adjusted_count'])

In [None]:
# Drop every book whose ISBN is among the ones we have already read

already_read = book_recs['isbn'].isin(unique_books['isbn'])
book_recs = book_recs.loc[~already_read]

In [None]:
# Also drop books whose title equals the title of a book we already read
# (presumably the same book listed under a different ISBN)
known_title = book_recs['book_title'].isin(unique_books['book_title'])
book_recs = book_recs.loc[~known_title]

In [None]:
# At least 2 ratings from users similar to us are required for a book to be recommended.
# The comparison is inclusive (>= 2); the previous `> 2` silently required 3 or more.
book_recs = book_recs[book_recs['count'] >= 2]

In [None]:
# Keep only books whose mean rating among the similar users is above 4
well_rated = book_recs['mean'].gt(4)
book_recs = book_recs.loc[well_rated]

In [None]:
# Rank by score, best first, and keep the ten highest
ranked_recs = book_recs.sort_values(by='score', ascending=False)
top_10_recs = ranked_recs.head(10)
top_10_recs