In [1]:
import pandas as pd

my_books = pd.read_csv("liked_books.csv", index_col=0)

In [2]:
my_books

Unnamed: 0,user_id,book_id,rating,title
0,-1,2517439,5,"The Forever War (The Forever War, #1)"
1,-1,113576,5,The Smartest Guys in the Room: The Amazing Ris...
2,-1,35100,5,Battle Cry of Freedom
3,-1,228221,5,The Mask of Command
5,-1,17662739,5,"2001: A Space Odyssey (Space Odyssey, #1)"
6,-1,356824,5,India After Gandhi: The History of the World's...
7,-1,12125412,5,The Lady or the Tiger?: and Other Logic Puzzles
8,-1,139069,5,Endurance: Shackleton's Incredible Voyage
10,-1,76680,5,"Foundation (Foundation, #1)"
11,-1,1898,5,Into Thin Air: A Personal Account of the Mount...


In [3]:
my_books["book_id"] = my_books["book_id"].astype(str)

In [4]:
csv_book_mapping = {}

file_path = "Dataset/book_id_map.csv"

with open(file_path, "r") as f:
  while True:
    line = f.readline()
    if not line:
      break
    csv_id, book_id = line.strip().split(",")
    csv_book_mapping[csv_id] = book_id

In [5]:
book_set = set(my_books["book_id"])

In [8]:
overlap_users = {}

file_path_interaction = "Dataset/goodreads_interactions.csv"

with open(file_path_interaction, 'r') as f:
  while True:
    line = f.readline()
    if not line:
      break

    user_id, csv_id, _ , rating, _ = line.strip().split(",")

    book_id = csv_book_mapping.get(csv_id)

    if book_id in book_set:
      if user_id not in overlap_users:
        overlap_users[user_id] = 1
      else:
        overlap_users[user_id] += 1

In [9]:
len(overlap_users)

316341

In [11]:
filter_overlap_users = set([k for k in overlap_users if overlap_users[k] > my_books.shape[0]/5])

In [12]:
interactions_list = []

with open(file_path_interaction, 'r') as f:
  while True:
    line = f.readline()
    if not line:
      break

    user_id, csv_id, _ , rating, _ = line.strip().split(",")

    if user_id in filter_overlap_users:
      book_id = csv_book_mapping[csv_id]
      interactions_list.append([user_id,book_id,rating])

In [25]:
interactions = pd.DataFrame(interactions_list, columns=["user_id","book_id","rating"])

In [26]:
interactions = pd.concat([my_books[["user_id","book_id","rating"]], interactions])

In [27]:
interactions

Unnamed: 0,user_id,book_id,rating
0,-1,2517439,5
1,-1,113576,5
2,-1,35100,5
3,-1,228221,5
5,-1,17662739,5
...,...,...,...
5638696,804100,475178,0
5638697,804100,186074,0
5638698,804100,153008,0
5638699,804100,45107,0


In [28]:
interactions["book_id"] = interactions["book_id"].astype(str)
interactions["user_id"] = interactions["user_id"].astype(str)
interactions["rating"] = pd.to_numeric(interactions["rating"])

In [29]:
interactions["user_index"] = interactions["user_id"].astype("category").cat.codes

In [34]:
len(interactions["user_index"].unique())

1259

In [33]:
interactions["book_index"] = interactions["book_id"].astype("category").cat.codes

In [35]:
len(interactions["book_index"].unique())

802870

In [36]:
from scipy.sparse import coo_matrix

rating_mat_coo = coo_matrix((interactions["rating"], (interactions["user_index"], interactions["book_index"])))

In [37]:
rating_mat_coo

<1259x802870 sparse matrix of type '<class 'numpy.int64'>'
	with 5638728 stored elements in COOrdinate format>

In [38]:
ratings_mat = rating_mat_coo.tocsr()

In [40]:
interactions[interactions["user_id"]=="-1"]

Unnamed: 0,user_id,book_id,rating,user_index,book_index
0,-1,2517439,5,0,414880
1,-1,113576,5,0,38971
2,-1,35100,5,0,575858
3,-1,228221,5,0,356004
5,-1,17662739,5,0,214285
6,-1,356824,5,0,581743
7,-1,12125412,5,0,59763
8,-1,139069,5,0,124430
10,-1,76680,5,0,722098
11,-1,1898,5,0,276178


In [41]:
my_index = 0

In [42]:
from sklearn.metrics.pairwise import cosine_similarity

similarity = cosine_similarity(ratings_mat[my_index,:], ratings_mat).flatten()

In [43]:
import numpy as np

indices = np.argpartition(similarity, -15)[15:]

In [44]:
similar_users = interactions[interactions["user_index"].isin(indices)].copy()

In [45]:
similar_users = similar_users[similar_users["user_id"] != "-1"]

In [110]:
similar_users

Unnamed: 0,user_id,book_id,rating,user_index,book_index
0,282,627206,4,555,660744
1,282,960,4,555,789109
2,282,15931,4,555,161780
3,282,24178,3,555,393542
4,282,6310,4,555,662720
...,...,...,...,...,...
5638696,804100,475178,0,1183,617107
5638697,804100,186074,0,1183,258768
5638698,804100,153008,0,1183,141428
5638699,804100,45107,0,1183,611284


In [111]:
book_recs = similar_users.groupby("book_id").rating.agg(['count','mean'])

In [112]:
book_recs

Unnamed: 0_level_0,count,mean
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,943,3.843054
10,45,3.444444
100,3,0.000000
1000,7,0.571429
10000,36,0.250000
...,...,...
9999894,1,0.000000
9999908,1,0.000000
9999925,9,1.222222
999999,4,0.000000


In [113]:
book_titles = pd.read_json("book_titles.json")
book_titles["book_id"] = book_titles["book_id"].astype(str)

In [114]:
book_recs = book_recs.merge(book_titles, how="inner", on="book_id")

In [115]:
book_recs

Unnamed: 0,book_id,count,mean,title,ratings,url,cover_image,mod_title
0,1,943,3.843054,Harry Potter and the Half-Blood Prince (Harry ...,1713866,https://www.goodreads.com/book/show/1.Harry_Po...,https://images.gr-assets.com/books/1361039191m...,harry potter and the halfblood prince harry po...
1,10,45,3.444444,"Harry Potter Collection (Harry Potter, #1-6)",25245,https://www.goodreads.com/book/show/10.Harry_P...,https://images.gr-assets.com/books/1328867351m...,harry potter collection harry potter 16
2,100,3,0.000000,Simply Beautiful Beading,75,https://www.goodreads.com/book/show/100.Simply...,https://s.gr-assets.com/assets/nophoto/book/11...,simply beautiful beading
3,1000,7,0.571429,Millionaire Women Next Door: The Many Journeys...,460,https://www.goodreads.com/book/show/1000.Milli...,https://s.gr-assets.com/assets/nophoto/book/11...,millionaire women next door the many journeys ...
4,10000,36,0.250000,The Face of Another,2079,https://www.goodreads.com/book/show/10000.The_...,https://images.gr-assets.com/books/1320415026m...,the face of another
...,...,...,...,...,...,...,...,...
669647,9999887,3,2.333333,Monkey and Elephant's Worst Fight Ever!,66,https://www.goodreads.com/book/show/9999887-mo...,https://images.gr-assets.com/books/1320415294m...,monkey and elephants worst fight ever
669648,9999894,1,0.000000,"Questions, Questions",151,https://www.goodreads.com/book/show/9999894-qu...,https://images.gr-assets.com/books/1344678736m...,questions questions
669649,9999925,9,1.222222,Spring Is Here,479,https://www.goodreads.com/book/show/9999925-sp...,https://s.gr-assets.com/assets/nophoto/book/11...,spring is here
669650,999999,4,0.000000,Roommates,46,https://www.goodreads.com/book/show/999999.Roo...,https://s.gr-assets.com/assets/nophoto/book/11...,roommates


In [116]:
book_recs["adjested_count"] = book_recs["count"]*(book_recs["count"]/ book_recs["ratings"])

In [117]:
book_recs["score"] = book_recs["mean"] * book_recs["adjested_count"]

In [118]:
book_recs = book_recs[~book_recs["book_id"].isin(my_books["book_id"])]

In [119]:
my_books["mod_title"] = my_books["title"].str.replace("[^a-zA-Z0-9 ]", "", regex=True).str.lower()

In [120]:
my_books["mod_title"] = my_books["mod_title"].str.replace("\\s+"," ",regex=True)

In [121]:
book_recs = book_recs[~book_recs["mod_title"].isin(my_books["mod_title"])]

In [122]:
book_recs = book_recs[book_recs["count"]>2]

In [123]:
book_recs = book_recs[book_recs["mean"]>4]

In [124]:
top_recs = book_recs.sort_values("score",ascending=False)

In [125]:
top_recs

Unnamed: 0,book_id,count,mean,title,ratings,url,cover_image,mod_title,adjested_count,score
331977,2445888,3,4.666667,A Fascination for Fish: Adventures of an Under...,21,https://www.goodreads.com/book/show/2445888.A_...,https://s.gr-assets.com/assets/nophoto/book/11...,a fascination for fish adventures of an underw...,0.428571,2.000000
401212,2853760,3,5.000000,Follow the Polar Bears,26,https://www.goodreads.com/book/show/2853760-fo...,https://s.gr-assets.com/assets/nophoto/book/11...,follow the polar bears,0.346154,1.730769
283733,22220611,3,5.000000,Walt Disney's Cinderella,27,https://www.goodreads.com/book/show/22220611-w...,https://images.gr-assets.com/books/1407104891m...,walt disneys cinderella,0.333333,1.666667
375941,2634243,3,4.333333,Sammy the Seal,26,https://www.goodreads.com/book/show/2634243-sa...,https://s.gr-assets.com/assets/nophoto/book/11...,sammy the seal,0.346154,1.500000
285563,22318047,4,4.500000,টেনিদার অভিযান,59,https://www.goodreads.com/book/show/22318047,https://images.gr-assets.com/books/1401020110m...,,0.271186,1.220339
...,...,...,...,...,...,...,...,...,...,...
299014,22844196,3,4.666667,Harry Potter and the Order of the Phoenix (Har...,4152,https://www.goodreads.com/book/show/22844196-h...,https://images.gr-assets.com/books/1432278775m...,harry potter and the order of the phoenix harr...,0.002168,0.010116
97216,13562891,4,4.750000,Harry Potter and the Sorcerer's Stone (Harry P...,8016,https://www.goodreads.com/book/show/13562891-h...,https://images.gr-assets.com/books/1333153083m...,harry potter and the sorcerers stone harry pot...,0.001996,0.009481
591454,7439970,4,4.250000,The Handmaid's Tale,7809,https://www.goodreads.com/book/show/7439970-th...,https://images.gr-assets.com/books/1384395216m...,the handmaids tale,0.002049,0.008708
92703,13510444,3,5.000000,"The Lord of the Rings (The Lord of the Rings, ...",6226,https://www.goodreads.com/book/show/13510444-t...,https://images.gr-assets.com/books/1330855500m...,the lord of the rings the lord of the rings 13,0.001446,0.007228


In [126]:
def make_clickable(val):
  return '<a target="_blank" href="{}">Goodreads</a>'.format(val)

def show_image(val):
  return '<img src="{}" width=50></img>'.format(val)

In [127]:
top_recs.style.format({'url':make_clickable, 'cover_image':show_image})

Unnamed: 0,book_id,count,mean,title,ratings,url,cover_image,mod_title,adjested_count,score
331977,2445888,3,4.666667,A Fascination for Fish: Adventures of an Underwater Pioneer,21,Goodreads,,a fascination for fish adventures of an underwater pioneer,0.428571,2.0
401212,2853760,3,5.0,Follow the Polar Bears,26,Goodreads,,follow the polar bears,0.346154,1.730769
283733,22220611,3,5.0,Walt Disney's Cinderella,27,Goodreads,,walt disneys cinderella,0.333333,1.666667
375941,2634243,3,4.333333,Sammy the Seal,26,Goodreads,,sammy the seal,0.346154,1.5
285563,22318047,4,4.5,টেনিদার অভিযান,59,Goodreads,,,0.271186,1.220339
597835,766128,3,4.333333,And Then It Rained... And Then the Sun Came Out...,38,Goodreads,,and then it rained and then the sun came out,0.236842,1.026316
628525,852671,3,4.666667,Just for Elephants,42,Goodreads,,just for elephants,0.214286,1.0
11644,104597,3,4.666667,"The Books of Magic, Volume 5: Girl in the Box",43,Goodreads,,the books of magic volume 5 girl in the box,0.209302,0.976744
65027,12670196,6,4.666667,The Road Not Taken,173,Goodreads,,the road not taken,0.208092,0.971098
364471,25855732,4,4.5,Treasure Island: A BabyLit® Shapes Primer (BabyLit Books),78,Goodreads,,treasure island a babylit shapes primer babylit books,0.205128,0.923077
