In [1]:
# Download these files

# books_titles.json
# https://drive.google.com/file/d/1Iqv9TROqNgYbUDijSaDegv4EPpxO97t3/view?usp=sharing

# goodreads_interactions.csv
# https://drive.google.com/open?id=1zmylV7XW2dfQVCLeg1LbllfQtHD2KUon

# book_id_map.csv
# https://drive.google.com/uc?id=1CHTAaNwyzvbi1TR08MJrJ03BxA266Yxr

# liked_books.csv
# https://drive.google.com/file/d/1dhPhfD5hAOJjrdf8JhvbOPxDpF4qWYnb/view?usp=sharing

# Full code is at https://github.com/dataquestio/project-walkthroughs/tree/master/books

import pandas as pd

my_books = pd.read_csv("liked_books_full.csv", index_col=0)
my_books["book_id"] = my_books["book_id"].astype(str)

In [None]:
%cd /content/drive/MyDrive/bookRecommendation

/content/drive/MyDrive/bookRecommendation


In [None]:
ls

book_id_map.csv  books_titles.json  goodreads_interactions.csv  liked_books.csv


In [None]:
import pandas as pd
my_books = pd.read_csv("liked_books.csv", index_col=0)
my_books["book_id"] = my_books["book_id"].astype(str)

In [None]:
my_books

Unnamed: 0,user_id,book_id,rating,title
0,-1,2517439,5,"The Forever War (The Forever War, #1)"
1,-1,113576,5,The Smartest Guys in the Room: The Amazing Ris...
2,-1,35100,5,Battle Cry of Freedom
3,-1,228221,5,The Mask of Command
5,-1,17662739,5,"2001: A Space Odyssey (Space Odyssey, #1)"
6,-1,356824,5,India After Gandhi: The History of the World's...
7,-1,12125412,5,The Lady or the Tiger?: and Other Logic Puzzles
8,-1,139069,5,Endurance: Shackleton's Incredible Voyage
10,-1,76680,5,"Foundation (Foundation, #1)"
11,-1,1898,5,Into Thin Air: A Personal Account of the Mount...


Mapping id of the interaction file and title file

In [None]:
#Reads line by line so that we don't have to strore everything in a pandas dataframe
csv_book_mapping = {}

with open("book_id_map.csv", "r") as f:
    while True:
        line = f.readline()
        if not line:
            break
        csv_id, book_id = line.strip().split(",")
        csv_book_mapping[csv_id] = book_id

In [None]:
# Look for users who has read the same books as us. set gives us unique values
book_set = set(my_books["book_id"])

In [None]:
# Lets check what is in interactions file
!head goodreads_interactions.csv

user_id,book_id,is_read,rating,is_reviewed
0,948,1,5,0
0,947,1,5,1
0,946,1,5,0
0,945,1,5,0
0,944,1,5,0
0,943,1,5,0
0,942,1,5,0
0,941,1,5,0
0,940,1,5,0


In [None]:
#check no. of lines in interactions file
!wc -l goodreads_interactions.csv

228648343 goodreads_interactions.csv


In [None]:
# Find the no. of overlap readers
overlap_users = {}

with open("goodreads_interactions.csv", 'r') as f:
    while True:
        line = f.readline()
        if not line:
            break
        user_id, csv_id, _, rating, _ = line.split(",")
        
        book_id = csv_book_mapping.get(csv_id)
        
        # if this is one of the books we have read, then add it to the list of overlap user
        if book_id in book_set:
            if user_id not in overlap_users:
                overlap_users[user_id] = 1
            else:
                overlap_users[user_id] += 1

In [None]:
len(overlap_users)

316341

In [None]:
# The number of books the users have read that overlap with us is atleast 20%(random number)
filtered_overlap_users = set([k for k in overlap_users if overlap_users[k] > my_books.shape[0]/5])

In [None]:
# For every user that has read same books as us, look through the books they have read. 
#This will help in building recommendation
interactions_list = []

with open("goodreads_interactions.csv", 'r') as f:
    while True:
        line = f.readline()
        if not line:
            break
        user_id, csv_id, _, rating, _ = line.split(",")
        
        if user_id in filtered_overlap_users:
            book_id = csv_book_mapping[csv_id]
            interactions_list.append([user_id, book_id, rating])

In [None]:
len(interactions_list)

5638701

In [None]:
interactions_list[0]

['282', '627206', '4']

Collaborative filtering(Generate matrix- user item)

In [None]:
# convert to dataframe
interactions = pd.DataFrame(interactions_list, columns=["user_id", "book_id", "rating"])

In [None]:
# concatenate mybooks(list of liked books) and interactions
interactions = pd.concat([my_books[["user_id", "book_id", "rating"]], interactions])

In [None]:
interactions

Unnamed: 0,user_id,book_id,rating
0,-1,2517439,5
1,-1,113576,5
2,-1,35100,5
3,-1,228221,5
5,-1,17662739,5
...,...,...,...
5638696,804100,475178,0
5638697,804100,186074,0
5638698,804100,153008,0
5638699,804100,45107,0


In [None]:
# convert user_id and book_id to string and rating to number
interactions["book_id"] = interactions["book_id"].astype(str)
interactions["user_id"] = interactions["user_id"].astype(str)
interactions["rating"] = pd.to_numeric(interactions["rating"])

In [None]:
interactions['user_id'].unique()

array(['-1', '282', '874', ..., '442043', '712588', '804100'],
      dtype=object)

In [None]:
len(interactions['user_id'].unique())

1259

In [None]:
# create a column called user index which categorizes each unique id into a category
# and gives a codes starting from 0(Basically map a userid to a position)
interactions["user_index"] = interactions["user_id"].astype("category").cat.codes

In [None]:
interactions.iloc[0]

user_id            -1
book_id       2517439
rating              5
user_index          0
book_index     414880
Name: 0, dtype: object

In [None]:
interactions['user_index'].unique()

array([   0,  555, 1216, ..., 1054, 1143, 1183], dtype=int16)

In [None]:
interactions["book_index"] = interactions["book_id"].astype("category").cat.codes

In [None]:
len(interactions['book_index'].unique())

802870

In [None]:
#no. of unique matrix cells
1259 * 802870 

1010813330

In [None]:
# create sparse matrix to save memory using scipy
from scipy.sparse import coo_matrix

ratings_mat_coo = coo_matrix((interactions["rating"], (interactions["user_index"], interactions["book_index"])))

In [None]:
ratings_mat_coo

<1259x802870 sparse matrix of type '<class 'numpy.int64'>'
	with 5638755 stored elements in COOrdinate format>

In [None]:
# convert matrix from coo format to csr
ratings_mat = ratings_mat_coo.tocsr()

Find users similar to us

In [None]:
# find row position of our specific user and userid was -1 in predferences file
interactions[interactions["user_id"] == "-1"]

Unnamed: 0,user_id,book_id,rating,user_index,book_index
0,-1,2517439,5,0,414880
1,-1,113576,5,0,38971
2,-1,35100,5,0,575858
3,-1,228221,5,0,356004
5,-1,17662739,5,0,214285
6,-1,356824,5,0,581743
7,-1,12125412,5,0,59763
8,-1,139069,5,0,124430
10,-1,76680,5,0,722098
11,-1,1898,5,0,276178


In [None]:
# this matches to user_index 0
my_index = 0

In [None]:
# find users who are similar to us(similar taste as us). Use cosine similarity 
from sklearn.metrics.pairwise import cosine_similarity

similarity = cosine_similarity(ratings_mat[my_index,:], ratings_mat).flatten()

In [None]:
#how similar are we to us
similarity[0]

0.9999999999999999

In [None]:
similarity[2]

0.06143442518998915

In [None]:
# 15 users who are most similar to us(find positions)
import numpy as np

indices = np.argpartition(similarity, -15)[-15:]

In [None]:
indices

array([1188,  942,  218,  129,  496,  435, 1208,  795, 1213, 1210, 1143,
        321,  294,  862,    0])

In [None]:
# find users in interactions which are in the indices
similar_users = interactions[interactions["user_index"].isin(indices)].copy()

In [None]:
# remove our user so that we don't get our own recommendation
similar_users = similar_users[similar_users["user_id"]!="-1"]

In [None]:
similar_users

Unnamed: 0,user_id,book_id,rating,user_index,book_index
45312,4133,5359,3,942,632143
45313,4133,10464963,4,942,13492
45314,4133,3858,3,942,593622
45315,4133,11827808,4,942,51904
45316,4133,7913305,4,942,732465
...,...,...,...,...,...
5638521,712588,32388712,3,1143,543119
5638522,712588,16322,5,1143,183365
5638523,712588,860543,0,1143,759827
5638524,712588,853510,5,1143,756768


Create book recommendations

In [None]:
book_recs = similar_users.groupby("book_id").rating.agg(['count', 'mean'])

In [None]:
book_recs

Unnamed: 0_level_0,count,mean
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,6,3.833333
100322,1,0.000000
100365,1,0.000000
10046142,1,0.000000
1005,3,0.000000
...,...,...
99561,2,2.500000
99610,1,3.000000
99664,1,4.000000
9969571,3,2.333333


In [None]:
# Now to identify book titles from book_id we need the titles file
books_titles = pd.read_json("books_titles.json")
books_titles["book_id"] = books_titles["book_id"].astype(str)

In [None]:
# merge with titles on book_id
book_recs = book_recs.merge(books_titles, how="inner", on="book_id")

In [None]:
book_recs

Unnamed: 0,book_id,count,mean,title,ratings,url,cover_image,mod_title
0,1,6,3.833333,Harry Potter and the Half-Blood Prince (Harry ...,1713866,https://www.goodreads.com/book/show/1.Harry_Po...,https://images.gr-assets.com/books/1361039191m...,harry potter and the halfblood prince harry po...
1,100322,1,0.000000,Assata: An Autobiography,11057,https://www.goodreads.com/book/show/100322.Assata,https://images.gr-assets.com/books/1328857268m...,assata an autobiography
2,100365,1,0.000000,The Mote in God's Eye,48736,https://www.goodreads.com/book/show/100365.The...,https://images.gr-assets.com/books/1399490037m...,the mote in gods eye
3,10046142,1,0.000000,Dancing in the Glory of Monsters: The Collapse...,2391,https://www.goodreads.com/book/show/10046142-d...,https://images.gr-assets.com/books/1328757755m...,dancing in the glory of monsters the collapse ...
4,1005,3,0.000000,Think and Grow Rich,87634,https://www.goodreads.com/book/show/1005.Think...,https://s.gr-assets.com/assets/nophoto/book/11...,think and grow rich
...,...,...,...,...,...,...,...,...
2849,99561,2,2.500000,Looking for Alaska,804587,https://www.goodreads.com/book/show/99561.Look...,https://images.gr-assets.com/books/1394798630m...,looking for alaska
2850,99610,1,3.000000,The Best Laid Plans,17434,https://www.goodreads.com/book/show/99610.The_...,https://images.gr-assets.com/books/1353374848m...,the best laid plans
2851,99664,1,4.000000,The Painted Veil,24606,https://www.goodreads.com/book/show/99664.The_...,https://images.gr-assets.com/books/1320421719m...,the painted veil
2852,9969571,3,2.333333,Ready Player One,376328,https://www.goodreads.com/book/show/9969571-re...,https://images.gr-assets.com/books/1500930947m...,ready player one


In [None]:
# books that are really popular among users like us(specifically tailored to our taste)
book_recs["adjusted_count"] = book_recs["count"] * (book_recs["count"] / book_recs["ratings"])

In [None]:
#ho wmuch we might like the book
book_recs["score"] = book_recs["mean"] * book_recs["adjusted_count"]

In [None]:
book_recs = book_recs[~book_recs["book_id"].isin(my_books["book_id"])]

In [None]:
my_books["mod_title"] = my_books["title"].str.replace("[^a-zA-Z0-9 ]", "", regex=True).str.lower()

In [None]:
my_books["mod_title"] = my_books["mod_title"].str.replace("\s+", " ", regex=True)

In [None]:
# remove books that we have already read
book_recs = book_recs[~book_recs["mod_title"].isin(my_books["mod_title"])]

In [None]:
book_recs = book_recs[book_recs["mean"] >=4]

In [None]:
book_recs = book_recs[book_recs["count"]>2]

In [None]:
top_recs = book_recs.sort_values("mean", ascending=False)

In [None]:
def make_clickable(val):
    return '<a target="_blank" href="{}">Goodreads</a>'.format(val, val)

def show_image(val):
    return '<a href="{}"><img src="{}" width=50></img></a>'.format(val, val)

top_recs.style.format({'url': make_clickable, 'cover_image': show_image})

Unnamed: 0,book_id,count,mean,title,ratings,url,cover_image,mod_title,adjusted_count,score
2265,62291,5,4.8,"A Storm of Swords (A Song of Ice and Fire, #3)",477834,Goodreads,,a storm of swords a song of ice and fire 3,5.2e-05,0.000251
600,157993,3,4.333333,The Little Prince,763309,Goodreads,,the little prince,1.2e-05,5.1e-05
1103,22034,3,4.333333,The Godfather,259150,Goodreads,,the godfather,3.5e-05,0.00015
1176,2318271,3,4.333333,The Last Lecture,245804,Goodreads,,the last lecture,3.7e-05,0.000159
1909,4381,3,4.333333,Fahrenheit 451,591506,Goodreads,,fahrenheit 451,1.5e-05,6.6e-05
243,119322,4,4.25,"The Golden Compass (His Dark Materials, #1)",973154,Goodreads,,the golden compass his dark materials 1,1.6e-05,7e-05
1444,2767793,4,4.25,"The Hero of Ages (Mistborn, #3)",149260,Goodreads,,the hero of ages mistborn 3,0.000107,0.000456
2563,78983,4,4.25,"Kane and Abel (Kane and Abel, #1)",75215,Goodreads,,kane and abel kane and abel 1,0.000213,0.000904
244,119324,3,4.0,"The Subtle Knife (His Dark Materials, #2)",246697,Goodreads,,the subtle knife his dark materials 2,3.6e-05,0.000146
398,13497,4,4.0,"A Feast for Crows (A Song of Ice and Fire, #4)",437398,Goodreads,,a feast for crows a song of ice and fire 4,3.7e-05,0.000146
