In [33]:
# Load my own Goodreads shelf from a reformatted Goodreads export.
# Optional extension: write a function that trims a raw export down to just
# the user_id, book_id, rating and title columns.
import pandas as pd

# index_col=0: the first CSV column becomes the row index.
my_books = pd.read_csv(
    "my_liked_books.csv",
    index_col=0,
)

In [34]:
# Display the frame to check the columns and values that were loaded.
my_books

Unnamed: 0,user_id,book_id,rating,title
0,-1,127455,0,"The Lies of Locke Lamora (Gentleman Bastard, #1)"
1,-1,17800,0,Dance Dance Dance (The Rat #4)
2,-1,13538873,0,Mr. Penumbra's 24-Hour Bookstore (Mr. Penumbra...
3,-1,641604,0,Purple Cow: Transform Your Business by Being R...
4,-1,122,0,"The Power of One (The Power of One, #1)"
...,...,...,...,...
80,-1,1215032,5,"The Wise Man's Fear (The Kingkiller Chronicle,..."
81,-1,186074,5,The Name of the Wind (The Kingkiller Chronicle...
82,-1,2213661,5,The Graveyard Book
83,-1,15783514,4,The Ocean at the End of the Lane


In [36]:
# Store book_id as text: the ids parsed out of the CSV text files below are
# strings, and the two have to compare equal.
my_books = my_books.astype({"book_id": str})

In [4]:
# Finding similar users
# book_id_map.csv holds two comma-separated columns per line: the id used in
# the interactions file, then the book_id. Read it into a dictionary so ids
# from one file can be translated into ids from the other.
csv_book_mapping = {}

with open("book_id_map.csv", "r") as f:
    # Iterating the file object yields one line at a time, so the file is
    # never held in memory as a whole.
    for line in f:
        line = line.strip()  # remove the newline character at the end of the line
        if not line:
            # A blank line has nothing to unpack; skip it instead of failing.
            continue
        csv_id, book_id = line.split(",")
        csv_book_mapping[csv_id] = book_id

In [5]:
# Distinct book ids from my shelf; a set also makes membership tests cheap.
book_set = {book for book in my_books["book_id"]}

In [6]:
# Inspect the distinct book ids.
book_set

{'113576',
 '12125412',
 '1215032',
 '128029',
 '139069',
 '1685995',
 '17662739',
 '18949861',
 '1898',
 '228221',
 '228665',
 '2517439',
 '25659450',
 '28187',
 '2913377',
 '35100',
 '356824',
 '437143',
 '5096865',
 '5439',
 '5578108',
 '6448772',
 '76680',
 '77203',
 '8161140',
 '82599',
 '883438'}

In [7]:
# For every user in the interactions file, count how many of their rows refer
# to a book that is in book_set.
overlap_users = {}

with open("goodreads_interactions.csv", 'r') as f:
    # Stream the file one line at a time.
    for line in f:
        # Each line must have exactly five comma-separated fields; only the
        # first two are needed for counting.
        user_id, csv_id, _, _, _ = line.split(",")

        # Translate the interactions-file id into a book_id. Ids without a
        # mapping come back as None, which will not match any string id.
        book_id = csv_book_mapping.get(csv_id)

        if book_id in book_set:
            overlap_users[user_id] = overlap_users.get(user_id, 0) + 1

In [8]:
# Keep only the users whose overlap count is more than one fifth (20%) of the
# number of rows in my_books.
# NOTE(review): the threshold uses the row count of my_books, not len(book_set);
# the two differ if my_books contains duplicate book_ids - confirm which is intended.
filtered_overlap_users = {
    user
    for user, shared_books in overlap_users.items()
    if shared_books > my_books.shape[0]/5
}

In [9]:
# Gather [user_id, book_id, rating] rows for every user that passed the
# overlap filter; these rows are what the recommendations are built from.

interactions_list = []

with open("goodreads_interactions.csv", 'r') as f:
    # Stream the file one line at a time.
    for line in f:
        user_id, csv_id, _, rating, _ = line.split(",")

        # Only keep the reading history of users whose shelves overlap with ours
        if user_id not in filtered_overlap_users:
            continue

        book_id = csv_book_mapping.get(csv_id)
        if book_id is None:
            # No mapping for this id. Skip the row (the overlap-counting pass
            # ignores unmapped ids too) rather than abort with a KeyError.
            continue

        # rating is still the raw text from the file at this point.
        interactions_list.append([user_id, book_id, rating])

In [10]:
# Build a DataFrame from the collected rows, one named column per field.
interactions = pd.DataFrame(
    data=interactions_list,
    columns=["user_id", "book_id", "rating"],
)

In [11]:
# Add your own ratings into this data.
# Any rows already carrying our user id ("-1") are dropped first, so running
# this cell a second time does not stack another copy of our ratings on top.
# The id is compared as text because it may be an int or a string here.
interactions = pd.concat([
    my_books[["user_id", "book_id", "rating"]],
    interactions[interactions["user_id"].astype(str) != "-1"],
])

In [12]:
# Normalise dtypes after the concat: both id columns become strings and the
# rating column becomes numeric.
interactions = interactions.astype({"book_id": str, "user_id": str})
interactions["rating"] = pd.to_numeric(interactions["rating"])

In [13]:
# Give every distinct user_id an integer code; it is used as the row position
# when the ratings matrix is built.
interactions["user_index"] = (
    interactions["user_id"]
    .astype("category")
    .cat.codes
)


In [15]:
# Give every distinct book_id an integer code; it is used as the column
# position when the ratings matrix is built.
interactions["book_index"] = (
    interactions["book_id"]
    .astype("category")
    .cat.codes
)


In [16]:
# Store the ratings sparsely to save memory.
from scipy.sparse import coo_matrix

# coo_matrix takes (cell values, (row positions, column positions)).
ratings_mat_coo = coo_matrix(
    (
        interactions["rating"],          # value stored in each cell
        (
            interactions["user_index"],  # row position: the user
            interactions["book_index"],  # column position: the book
        ),
    )
)

In [17]:
# Matrix dimensions: rows are user_index values, columns are book_index values.
ratings_mat_coo.shape

(1259, 802870)

In [18]:
# Convert COO matrix to CSR matrix format.
# The similarity cell further down takes a single row
# (ratings_mat[my_index, :]) from this CSR form.
ratings_mat = ratings_mat_coo.tocsr()

In [19]:
# Show our own rows (user_id "-1") to see which user_index they were given.
interactions[interactions["user_id"] == "-1"]


Unnamed: 0,user_id,book_id,rating,user_index,book_index
0,-1,2517439,5,0,414880
1,-1,113576,5,0,38971
2,-1,35100,5,0,575858
3,-1,228221,5,0,356004
5,-1,17662739,5,0,214285
6,-1,356824,5,0,581743
7,-1,12125412,5,0,59763
8,-1,139069,5,0,124430
10,-1,76680,5,0,722098
11,-1,1898,5,0,276178


In [20]:
# Row of the ratings matrix that holds our own ratings. It is looked up from
# our "-1" rows rather than hard-coded, so it stays correct if the set of
# user ids (and therefore the assigned codes) changes.
# Raises IndexError if our rows are missing from interactions.
my_index = int(
    interactions.loc[interactions["user_id"] == "-1", "user_index"].iloc[0]
)


In [21]:
# Cosine similarity compares two rows of the matrix.
from sklearn.metrics.pairwise import cosine_similarity

# Compare our own row against every row, then flatten the result to one
# similarity value per user.
similarity = cosine_similarity(
    ratings_mat[my_index,:],
    ratings_mat,
).flatten()

In [22]:
import numpy as np

# Matrix rows of the users most similar to us (up to 15). argpartition only
# guarantees that the last top_n positions hold the largest values; it does
# not sort them. The count is capped at the number of users available, because
# asking for more than exist makes argpartition raise.
top_n = min(15, len(similarity))
indices = np.argpartition(similarity, -top_n)[-top_n:]

In [23]:
# All interaction rows that belong to the most similar users (copied so later
# edits do not touch interactions).
similar_users = interactions.loc[
    interactions["user_index"].isin(indices)
].copy()

In [24]:
# Our own rows (user_id "-1") are not recommendations; drop them.
similar_users = similar_users.loc[similar_users["user_id"] != "-1"]

In [25]:
# Per book: how many of the similar users have it, and their mean rating.
book_recs = similar_users.groupby("book_id")["rating"].agg(['count', 'mean'])

In [26]:
# Inspect the per-book count and mean rating.
book_recs

Unnamed: 0_level_0,count,mean
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,6,3.833333
100322,1,0.000000
100365,1,0.000000
10046142,1,0.000000
1005,3,0.000000
...,...,...
99561,2,2.500000
99610,1,3.000000
99664,1,4.000000
9969571,3,2.333333


In [28]:
# Load the book titles data set and store its book_id as a string so it can be
# matched against the string ids used in the recommendations.
books_titles = pd.read_json("books_titles.json").astype({"book_id": str})

In [29]:
# Attach the title data to each recommendation; the inner join keeps only
# books present on both sides.
book_recs = pd.merge(book_recs, books_titles, how="inner", on="book_id")

In [30]:
# Scale each book's count by the ratio of that count to the book's `ratings`
# value.
# NOTE(review): `ratings` is not created in this notebook; presumably it comes
# from books_titles and is the book's overall number of ratings - confirm.
book_recs["adjusted_count"] = (
    book_recs["count"]
    * (book_recs["count"] / book_recs["ratings"])
)

NameError: name 'book_rec' is not defined

In [None]:
# Score: the mean rating multiplied by the adjusted count.
book_recs["score"] = (
    book_recs["mean"]
    * book_recs["adjusted_count"]
)


In [None]:
# Drop recommendations whose book_id is already in my_books.
book_recs = book_recs.loc[~book_recs["book_id"].isin(my_books["book_id"])]


In [None]:
# Normalised title: remove every character that is not a letter, a digit or a
# space, then lower-case what is left.
my_books["mod_title"] = (
    my_books["title"]
    .str.replace("[^a-zA-Z0-9 ]", "", regex=True)
    .str.lower()
)


In [None]:

# Collapse each run of whitespace in the normalised title to a single space.
# The pattern is a raw string: "\s" in an ordinary string literal is an
# invalid escape sequence and triggers a warning on current Python versions.
my_books["mod_title"] = my_books["mod_title"].str.replace(r"\s+", " ", regex=True)


In [None]:
# Remove books we have already liked from the recommendations, this time
# matching on the normalised title (~ negates the membership mask).
# NOTE(review): book_recs["mod_title"] is not created in this notebook;
# presumably it comes from books_titles - confirm.
book_recs = book_recs.loc[~book_recs["mod_title"].isin(my_books["mod_title"])]


In [None]:
# Keep only books whose mean rating is at least 4.
book_recs = book_recs.loc[book_recs["mean"] >= 4]


In [None]:
# Keep only books with a count greater than two.
book_recs = book_recs.loc[book_recs["count"] > 2]


In [None]:
# Rank the remaining books, highest mean rating first.
# NOTE(review): the "score" column computed earlier is not used for the
# ranking - confirm that sorting by "mean" is intended.
top_recs = book_recs.sort_values(by="mean", ascending=False)


In [None]:
# Provide clickable links to Goodreads
def make_clickable(val):
    """Wrap a URL in an HTML anchor that opens in a new tab.

    Args:
        val: The link target; it is inserted as the anchor's href.

    Returns:
        An HTML string for a link whose visible text is "Goodreads".
    """
    # The template has a single placeholder, so a single argument is passed.
    return '<a target="_blank" href="{}">Goodreads</a>'.format(val)

# Render val as a small image that links to itself
def show_image(val):
    """Build HTML for a 50-pixel-wide image wrapped in a link.

    Args:
        val: The image URL; used both as the link target and the image source.

    Returns:
        An HTML string containing the anchor and the image tag.
    """
    template = '<a href="{}"><img src="{}" width=50></img></a>'
    link_target = image_source = val
    return template.format(link_target, image_source)

# Display the recommendations: the url column is rendered through
# make_clickable and the cover_image column through show_image.
top_recs.style.format(
    formatter={
        'url': make_clickable,
        'cover_image': show_image,
    }
)