In [5]:
cd /content/drive/MyDrive/bookRecommendation

/content/drive/MyDrive/bookRecommendation


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**Count the number of lines using wc**

In [6]:
!wc -l goodreads_books.json.gz

7588375 goodreads_books.json.gz


In [7]:
!ls -lh | grep goodreads_books.json.gz

-r-------- 1 root root 2.0G May 16  2019 goodreads_books.json.gz


In [8]:
# Read line by line since the file is very large
import gzip

with gzip.open("goodreads_books.json.gz") as f:
    line = f.readline()

In [None]:
line

b'{"isbn": "0312853122", "text_reviews_count": "1", "series": [], "country_code": "US", "language_code": "", "popular_shelves": [{"count": "3", "name": "to-read"}, {"count": "1", "name": "p"}, {"count": "1", "name": "collection"}, {"count": "1", "name": "w-c-fields"}, {"count": "1", "name": "biography"}], "asin": "", "is_ebook": "false", "average_rating": "4.00", "kindle_asin": "", "similar_books": [], "description": "", "format": "Paperback", "link": "https://www.goodreads.com/book/show/5333265-w-c-fields", "authors": [{"author_id": "604031", "role": ""}], "publisher": "St. Martin\'s Press", "num_pages": "256", "publication_day": "1", "isbn13": "9780312853129", "publication_month": "9", "edition_information": "", "publication_year": "1984", "url": "https://www.goodreads.com/book/show/5333265-w-c-fields", "image_url": "https://images.gr-assets.com/books/1310220028m/5333265.jpg", "book_id": "5333265", "ratings_count": "3", "work_id": "5400751", "title": "W.C. Fields: A Life on Film", "t

In [9]:
# load json using python
import json

data = json.loads(line)
data

{'asin': '',
 'authors': [{'author_id': '604031', 'role': ''}],
 'average_rating': '4.00',
 'book_id': '5333265',
 'country_code': 'US',
 'description': '',
 'edition_information': '',
 'format': 'Paperback',
 'image_url': 'https://images.gr-assets.com/books/1310220028m/5333265.jpg',
 'is_ebook': 'false',
 'isbn': '0312853122',
 'isbn13': '9780312853129',
 'kindle_asin': '',
 'language_code': '',
 'link': 'https://www.goodreads.com/book/show/5333265-w-c-fields',
 'num_pages': '256',
 'popular_shelves': [{'count': '3', 'name': 'to-read'},
  {'count': '1', 'name': 'p'},
  {'count': '1', 'name': 'collection'},
  {'count': '1', 'name': 'w-c-fields'},
  {'count': '1', 'name': 'biography'}],
 'publication_day': '1',
 'publication_month': '9',
 'publication_year': '1984',
 'publisher': "St. Martin's Press",
 'ratings_count': '3',
 'series': [],
 'similar_books': [],
 'text_reviews_count': '1',
 'title': 'W.C. Fields: A Life on Film',
 'title_without_series': 'W.C. Fields: A Life on Film',
 'u

**Parse book metadata**

In [10]:
# Extract only the fileds which are rewuired from the json
def parse_fields(line):
    data = json.loads(line)
    return {
        "book_id": data["book_id"], 
        "title": data["title_without_series"], 
        "ratings": data["ratings_count"], 
        "url": data["url"], 
        "cover_image": data["image_url"]
    }

In [11]:
# go line by line and parse each line
books_titles = []
with gzip.open("goodreads_books.json.gz") as f:
    while True:
        line = f.readline()
        if not line:
            break
        fields = parse_fields(line)
        try:
            ratings = int(fields["ratings"])
        except ValueError:
            continue
        # Only take books with more than 15 ratings to reduce the corpus
        if ratings > 15:
            books_titles.append(fields)

**Process the metadata**

In [12]:
import pandas as pd

titles = pd.DataFrame.from_dict(books_titles)
titles["ratings"] = pd.to_numeric(titles["ratings"])
titles["mod_title"] = titles["title"].str.replace("[^a-zA-Z0-9 ]", "", regex=True)
titles["mod_title"] = titles["mod_title"].str.lower()
titles["mod_title"] = titles["mod_title"].str.replace("\s+", " ", regex=True)
titles = titles[titles["mod_title"].str.len() > 0]
titles.to_json("books_titles.json")

In [13]:
titles

Unnamed: 0,book_id,title,ratings,url,cover_image,mod_title
0,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",140,https://www.goodreads.com/book/show/7327624-th...,https://images.gr-assets.com/books/1304100136m...,the unschooled wizard sun wolf and starhawk 12
1,6066819,Best Friends Forever,51184,https://www.goodreads.com/book/show/6066819-be...,https://s.gr-assets.com/assets/nophoto/book/11...,best friends forever
2,287141,The Aeneid for Boys and Girls,46,https://www.goodreads.com/book/show/287141.The...,https://s.gr-assets.com/assets/nophoto/book/11...,the aeneid for boys and girls
3,6066812,All's Fairy in Love and War (Avalon: Web of Ma...,98,https://www.goodreads.com/book/show/6066812-al...,https://images.gr-assets.com/books/1316637798m...,alls fairy in love and war avalon web of magic 8
4,287149,The Devil's Notebook,986,https://www.goodreads.com/book/show/287149.The...,https://images.gr-assets.com/books/1328768789m...,the devils notebook
...,...,...,...,...,...,...
1308952,17805813,"Ondine (Ondine Quartet, #0.5)",327,https://www.goodreads.com/book/show/17805813-o...,https://images.gr-assets.com/books/1379766592m...,ondine ondine quartet 05
1308953,331839,Jacqueline Kennedy Onassis: Friend of the Arts,18,https://www.goodreads.com/book/show/331839.Jac...,https://s.gr-assets.com/assets/nophoto/book/11...,jacqueline kennedy onassis friend of the arts
1308954,2685097,The Spaniard's Blackmailed Bride,112,https://www.goodreads.com/book/show/2685097-th...,https://s.gr-assets.com/assets/nophoto/book/11...,the spaniards blackmailed bride
1308955,2342551,The Children's Classic Poetry Collection,36,https://www.goodreads.com/book/show/2342551.Th...,https://s.gr-assets.com/assets/nophoto/book/11...,the childrens classic poetry collection


**Build Search engine by using TF-IDF matrix and cosine similarity**

In [14]:
# Search engine queries modified titles
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

tfidf = vectorizer.fit_transform(titles["mod_title"])

In [15]:
# turn search query into vector and match it with the matrix
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re

def make_clickable(val):
    return '<a target="_blank" href="{}">Goodreads</a>'.format(val, val)

def show_image(val):
    return '<a href="{}"><img src="{}" width=50></img></a>'.format(val, val)

def search(query,vectorizer):
    processed = re.sub("[^a-zA-Z0-9 ]", "", query.lower())
    query_vec = vectorizer.transform([query])
    similarity = cosine_similarity(query_vec, tfidf).flatten()
    # find indices of 10 largest similarity values
    indices = np.argpartition(similarity, -10)[-10:]
    results = titles.iloc[indices]
    results = results.sort_values("ratings", ascending=False)
    
    return results.head(5).style.format({'url': make_clickable, 'cover_image': show_image})

In [18]:
search("business", vectorizer)

Unnamed: 0,book_id,title,ratings,url,cover_image,mod_title
888736,290583,The Business,4622,Goodreads,,the business
7261,2355019,The Business,1326,Goodreads,,the business
330002,886124,The Business,295,Goodreads,,the business
1265227,19397193,"Down to Business (Business, #1)",211,Goodreads,,down to business business 1
860562,567680,The Business,79,Goodreads,,the business



**Create list of liked books**

In [None]:
liked_books = ['22543496', '39661', '12816830', '482060', "9401317", "9317691", "8153988"]

In [None]:
!head book_id_map.csv

book_id_csv,book_id
0,34684622
1,34536488
2,34017076
3,71730
4,30422361
5,33503613
6,33517540
7,34467031
8,6383669


In [None]:
# form a dictionary of books where key is csv id from interactions and value is book id from json
csv_book_mapping = {}

with open("book_id_map.csv", "r") as f:
    while True:
        line = f.readline()
        if not line:
            break
        csv_id, book_id = line.strip().split(",")
        csv_book_mapping[csv_id] = book_id

In [None]:
len(csv_book_mapping)

2360651

In [None]:
csv_book_mapping['0']

'34684622'

In [None]:
# count no. of lines interaction data
!wc -l goodreads_interactions.csv

228648343 goodreads_interactions.csv


In [None]:
!ls -lh | grep goodreads_interactions.csv

**Find users who like same books as us**

In [None]:
!head goodreads_interactions.csv

user_id,book_id,is_read,rating,is_reviewed
0,948,1,5,0
0,947,1,5,1
0,946,1,5,0
0,945,1,5,0
0,944,1,5,0
0,943,1,5,0
0,942,1,5,0
0,941,1,5,0
0,940,1,5,0


In [None]:
overlap_users = set()

with open("goodreads_interactions.csv", 'r') as f:
    while True:
        line = f.readline()
        if not line:
            break
        user_id, csv_id, _, rating, _ = line.split(",")
        
        if user_id in overlap_users:
            continue

        try:
            rating = int(rating)
        except ValueError:
            continue
        
        #convert csvid to book id
        book_id = csv_book_mapping[csv_id]
        
        # if the booke id is in our liked books, add the user to the list of overlapped users
        if book_id in liked_books and rating >= 4:
                overlap_users.add(user_id)

In [None]:
# Find books which the overlapped users have read
# reclines contains potential books we want to read
rec_lines = []

with open("goodreads_interactions.csv", 'r') as f:
    while True:
        line = f.readline()
        if not line:
            break
        user_id, csv_id, _, rating, _ = line.split(",")
        
        if user_id in overlap_users:
            book_id = csv_book_mapping[csv_id]
            rec_lines.append([user_id, book_id, rating])

In [None]:
#rank recommendations(books) in the rec_line
import pandas as pd

recs = pd.DataFrame(rec_lines, columns=["user_id", "book_id", "rating"])
recs["book_id"] = recs["book_id"].astype(str)

In [None]:
recs

Unnamed: 0,user_id,book_id,rating
0,64,3590,5
1,64,194373,5
2,64,8921,4
3,64,194366,4
4,64,482060,4
...,...,...,...
1560069,876054,78987,5
1560070,876054,8034188,0
1560071,876054,11870085,5
1560072,876054,89230,0


In [None]:
# Find the books with most ratings
top_recs = recs["book_id"].value_counts().head(10)

In [None]:
top_recs = top_recs.index.values

In [None]:
books_titles = pd.read_json("books_titles.json")
books_titles["book_id"] = books_titles["book_id"].astype(str)

In [None]:
books_titles.head()

Unnamed: 0,book_id,title,ratings,url,cover_image,mod_title
0,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",140,https://www.goodreads.com/book/show/7327624-th...,https://images.gr-assets.com/books/1304100136m...,the unschooled wizard sun wolf and starhawk 12
1,6066819,Best Friends Forever,51184,https://www.goodreads.com/book/show/6066819-be...,https://s.gr-assets.com/assets/nophoto/book/11...,best friends forever
2,287141,The Aeneid for Boys and Girls,46,https://www.goodreads.com/book/show/287141.The...,https://s.gr-assets.com/assets/nophoto/book/11...,the aeneid for boys and girls
3,6066812,All's Fairy in Love and War (Avalon: Web of Ma...,98,https://www.goodreads.com/book/show/6066812-al...,https://images.gr-assets.com/books/1316637798m...,alls fairy in love and war avalon web of magic 8
4,287149,The Devil's Notebook,986,https://www.goodreads.com/book/show/287149.The...,https://images.gr-assets.com/books/1328768789m...,the devils notebook


In [None]:
books_titles[books_titles["book_id"].isin(top_recs)]

Unnamed: 0,book_id,title,ratings,url,cover_image,mod_title
260319,11084145,Steve Jobs,594067,https://www.goodreads.com/book/show/11084145-s...,https://images.gr-assets.com/books/1327861368m...,steve jobs
284473,2767052,"The Hunger Games (The Hunger Games, #1)",4899965,https://www.goodreads.com/book/show/2767052-th...,https://images.gr-assets.com/books/1447303603m...,the hunger games the hunger games 1
463463,4671,The Great Gatsby,2758812,https://www.goodreads.com/book/show/4671.The_G...,https://images.gr-assets.com/books/1490528560m...,the great gatsby
615314,5470,1984,2023937,https://www.goodreads.com/book/show/5470.1984,https://images.gr-assets.com/books/1348990566m...,1984
651619,22543496,Elon Musk: Inventing the Future,63849,https://www.goodreads.com/book/show/22543496-e...,https://images.gr-assets.com/books/1404411386m...,elon musk inventing the future
770177,7613,Animal Farm,1928931,https://www.goodreads.com/book/show/7613.Anima...,https://images.gr-assets.com/books/1424037542m...,animal farm
790927,2657,To Kill a Mockingbird,3255518,https://www.goodreads.com/book/show/2657.To_Ki...,https://images.gr-assets.com/books/1361975680m...,to kill a mockingbird
876816,865,The Alchemist,1342863,https://www.goodreads.com/book/show/865.The_Al...,https://images.gr-assets.com/books/1483412266m...,the alchemist
878545,3,Harry Potter and the Sorcerer's Stone (Harry P...,4765497,https://www.goodreads.com/book/show/3.Harry_Po...,https://images.gr-assets.com/books/1474154022m...,harry potter and the sorcerers stone harry pot...
995137,15881,Harry Potter and the Chamber of Secrets (Harry...,1821802,https://www.goodreads.com/book/show/15881.Harr...,https://images.gr-assets.com/books/1474169725m...,harry potter and the chamber of secrets harry ...


In [None]:
# find books that are more popular among users like us
all_recs = recs["book_id"].value_counts()

In [None]:
all_recs = all_recs.to_frame().reset_index()
all_recs.columns = ["book_id", "book_count"]

In [None]:
all_recs.head(5)

Unnamed: 0,book_id,book_count
0,22543496,2605
1,5470,1507
2,3,1507
3,2657,1382
4,11084145,1314


In [None]:
all_recs = all_recs.merge(books_titles, how="inner", on="book_id")

In [None]:
# Assign a score to compare each book( If a book is more popular among our set and less popular on goodreads it is going to be recommended)
# looking for books that are popular among users like us but not necessarily on all of goodreads
all_recs["score"] = all_recs["book_count"] * (all_recs["book_count"] / all_recs["ratings"])

In [None]:
all_recs.sort_values("score", ascending=False).head(10)

Unnamed: 0,book_id,book_count,title,ratings,url,cover_image,mod_title,score
706,34020885,168,The Seduction Expert (#1),115,https://www.goodreads.com/book/show/34020885-t...,https://images.gr-assets.com/books/1500463565m...,the seduction expert 1,245.426087
0,22543496,2605,Elon Musk: Inventing the Future,63849,https://www.goodreads.com/book/show/22543496-e...,https://images.gr-assets.com/books/1404411386m...,elon musk inventing the future,106.282401
195,482060,379,Sherlock Holmes,3501,https://www.goodreads.com/book/show/482060.She...,https://s.gr-assets.com/assets/nophoto/book/11...,sherlock holmes,41.028563
667,9317691,175,The Name of the Wind (The Kingkiller Chronicle...,1043,https://www.goodreads.com/book/show/9317691-th...,https://images.gr-assets.com/books/1360558233m...,the name of the wind the kingkiller chronicle 1,29.362416
467,21032488,223,"Doors of Stone (The Kingkiller Chronicle, #3)",2059,https://www.goodreads.com/book/show/21032488-d...,https://s.gr-assets.com/assets/nophoto/book/11...,doors of stone the kingkiller chronicle 3,24.152016
8279,26860699,23,"The Olympian Affair (The Cinder Spires, #2)",27,https://www.goodreads.com/book/show/26860699-t...,https://s.gr-assets.com/assets/nophoto/book/11...,the olympian affair the cinder spires 2,19.592593
254,8153988,322,"The Eye of the World (Wheel of Time, #1)",5740,https://www.goodreads.com/book/show/8153988-th...,https://images.gr-assets.com/books/1465920672m...,the eye of the world wheel of time 1,18.063415
5258,18243345,34,"Nightblood (Warbreaker, #2)",66,https://www.goodreads.com/book/show/18243345-n...,https://s.gr-assets.com/assets/nophoto/book/11...,nightblood warbreaker 2,17.515152
490,23316526,215,"The Second Machine Age: Work, Progress, and Pr...",2951,https://www.goodreads.com/book/show/23316526-t...,https://images.gr-assets.com/books/1444712959m...,the second machine age work progress and prosp...,15.664182
548,25111341,197,The Industries of the Future,2765,https://www.goodreads.com/book/show/25111341-t...,https://images.gr-assets.com/books/1487178301m...,the industries of the future,14.035805


In [None]:
all_recs[all_recs["book_count"] > 200].sort_values("score", ascending=False).head(10)

Unnamed: 0,book_id,book_count,title,ratings,url,cover_image,mod_title,score
0,22543496,2605,Elon Musk: Inventing the Future,63849,https://www.goodreads.com/book/show/22543496-e...,https://images.gr-assets.com/books/1404411386m...,elon musk inventing the future,106.282401
195,482060,379,Sherlock Holmes,3501,https://www.goodreads.com/book/show/482060.She...,https://s.gr-assets.com/assets/nophoto/book/11...,sherlock holmes,41.028563
467,21032488,223,"Doors of Stone (The Kingkiller Chronicle, #3)",2059,https://www.goodreads.com/book/show/21032488-d...,https://s.gr-assets.com/assets/nophoto/book/11...,doors of stone the kingkiller chronicle 3,24.152016
254,8153988,322,"The Eye of the World (Wheel of Time, #1)",5740,https://www.goodreads.com/book/show/8153988-th...,https://images.gr-assets.com/books/1465920672m...,the eye of the world wheel of time 1,18.063415
490,23316526,215,"The Second Machine Age: Work, Progress, and Pr...",2951,https://www.goodreads.com/book/show/23316526-t...,https://images.gr-assets.com/books/1444712959m...,the second machine age work progress and prosp...,15.664182
224,27276428,350,The Gene: An Intimate History,9874,https://www.goodreads.com/book/show/27276428-t...,https://images.gr-assets.com/books/1463591739m...,the gene an intimate history,12.40632
94,17660462,527,The Everything Store: Jeff Bezos and the Age o...,22546,https://www.goodreads.com/book/show/17660462-t...,https://images.gr-assets.com/books/1365394361m...,the everything store jeff bezos and the age of...,12.318327
491,22318382,215,Becoming Steve Jobs: The Evolution of a Reckle...,4224,https://www.goodreads.com/book/show/22318382-b...,https://images.gr-assets.com/books/1422811281m...,becoming steve jobs the evolution of a reckles...,10.943419
460,25666050,224,Algorithms to Live By: The Computer Science of...,4639,https://www.goodreads.com/book/show/25666050-a...,https://images.gr-assets.com/books/1454296875m...,algorithms to live by the computer science of ...,10.816124
499,12111823,212,"The Winds of Winter (A Song of Ice and Fire, #6)",4404,https://www.goodreads.com/book/show/12111823-t...,https://images.gr-assets.com/books/1465341854m...,the winds of winter a song of ice and fire 6,10.205268


In [None]:
popular_recs = all_recs[all_recs["book_count"] > 200].sort_values("score", ascending=False)

In [None]:
def make_clickable(val):
    return '<a target="_blank" href="{}">Goodreads</a>'.format(val, val)

def show_image(val):
    return '<a href="{}"><img src="{}" width=50></img></a>'.format(val, val)


popular_recs[~popular_recs["book_id"].isin(liked_books)].head(10).style.format({'url': make_clickable, 'cover_image': show_image})

Unnamed: 0,book_id,book_count,title,ratings,url,cover_image,mod_title,score
467,21032488,223,"Doors of Stone (The Kingkiller Chronicle, #3)",2059,Goodreads,,doors of stone the kingkiller chronicle 3,24.152016
490,23316526,215,"The Second Machine Age: Work, Progress, and Prosperity in a Time of Brilliant Technologies",2951,Goodreads,,the second machine age work progress and prosperity in a time of brilliant technologies,15.664182
224,27276428,350,The Gene: An Intimate History,9874,Goodreads,,the gene an intimate history,12.40632
94,17660462,527,The Everything Store: Jeff Bezos and the Age of Amazon,22546,Goodreads,,the everything store jeff bezos and the age of amazon,12.318327
491,22318382,215,Becoming Steve Jobs: The Evolution of a Reckless Upstart into a Visionary Leader,4224,Goodreads,,becoming steve jobs the evolution of a reckless upstart into a visionary leader,10.943419
460,25666050,224,Algorithms to Live By: The Computer Science of Human Decisions,4639,Goodreads,,algorithms to live by the computer science of human decisions,10.816124
499,12111823,212,"The Winds of Winter (A Song of Ice and Fire, #6)",4404,Goodreads,,the winds of winter a song of ice and fire 6,10.205268
477,20527133,220,"Superintelligence: Paths, Dangers, Strategies",4931,Goodreads,,superintelligence paths dangers strategies,9.815453
507,11223478,209,One Click: Jeff Bezos and the Rise of Amazon.com,4725,Goodreads,,one click jeff bezos and the rise of amazoncom,9.244656
54,18050143,669,"Zero to One: Notes on Startups, or How to Build the Future",52400,Goodreads,,zero to one notes on startups or how to build the future,8.54124
