Let's take a look into the dataset good_reads.json

In [1]:
#counts the no. of lines in a file

!wc -l goodreads_books.json.gz

7588375 goodreads_books.json.gz


In [2]:
# How large this book is, in terms of file size
# To show all the files in the directory and search only the specific file.

!ls -lh | grep goodreads_books.json.gz


-rw-r--r-- 1 ashis 197609 2.0G Sep 17 22:12 goodreads_books.json.gz


To compress the dataset

In [4]:
# In Streaming fashion i.e. line by line reading

import gzip

with gzip.open("goodreads_books.json.gz",'r') as f:
    line = f.readline() # to read single line 


In [10]:
line # a single line

b'{"isbn": "", "text_reviews_count": "2", "series": [], "country_code": "US", "language_code": "ara", "popular_shelves": [{"count": "48397", "name": "to-read"}, {"count": "4724", "name": "classics"}, {"count": "3343", "name": "plays"}, {"count": "2412", "name": "currently-reading"}, {"count": "1734", "name": "shakespeare"}, {"count": "1373", "name": "drama"}, {"count": "1165", "name": "favorites"}, {"count": "1052", "name": "fiction"}, {"count": "706", "name": "classic"}, {"count": "635", "name": "play"}, {"count": "425", "name": "literature"}, {"count": "424", "name": "books-i-own"}, {"count": "412", "name": "owned"}, {"count": "411", "name": "school"}, {"count": "408", "name": "theatre"}, {"count": "263", "name": "poetry"}, {"count": "232", "name": "tragedy"}, {"count": "225", "name": "read-for-school"}, {"count": "212", "name": "british"}, {"count": "182", "name": "for-school"}, {"count": "165", "name": "theater"}, {"count": "159", "name": "english"}, {"count": "155", "name": "owned

In [11]:
# Use json module to load the single line in a Python dictionary

import json

json.loads(line)

{'isbn': '',
 'text_reviews_count': '2',
 'series': [],
 'country_code': 'US',
 'language_code': 'ara',
 'popular_shelves': [{'count': '48397', 'name': 'to-read'},
  {'count': '4724', 'name': 'classics'},
  {'count': '3343', 'name': 'plays'},
  {'count': '2412', 'name': 'currently-reading'},
  {'count': '1734', 'name': 'shakespeare'},
  {'count': '1373', 'name': 'drama'},
  {'count': '1165', 'name': 'favorites'},
  {'count': '1052', 'name': 'fiction'},
  {'count': '706', 'name': 'classic'},
  {'count': '635', 'name': 'play'},
  {'count': '425', 'name': 'literature'},
  {'count': '424', 'name': 'books-i-own'},
  {'count': '412', 'name': 'owned'},
  {'count': '411', 'name': 'school'},
  {'count': '408', 'name': 'theatre'},
  {'count': '263', 'name': 'poetry'},
  {'count': '232', 'name': 'tragedy'},
  {'count': '225', 'name': 'read-for-school'},
  {'count': '212', 'name': 'british'},
  {'count': '182', 'name': 'for-school'},
  {'count': '165', 'name': 'theater'},
  {'count': '159', 'name'

PARSING OUR BOOK METADATA

In [12]:
# Let's scale up the technique

def parse_fields(line): # it will only return the fields we care about
    data = json.loads(line)
    return {
        "book_id": data["book_id"],
        "title": data["title_without_series"],
        "ratings": data["ratings_count"],
        "url": data["url"],
        "cover_image": data["image_url"]
    }

In [13]:
# To create a way to go line by line and parse each line

books_titles = []

with gzip.open("goodreads_books.json.gz", 'r') as f:
    while True:
        line = f.readline() # runs a single line
        if not line:
            break
        fields = parse_fields(line)
        
        try:
            ratings = int(fields["ratings"])
        except ValueError:
            continue
        if ratings > 15:
            books_titles.append(fields)
            
            

PROCESSING BOOK METADATA WITH PYTHON-PANDAS

In [14]:
import pandas as pd

titles = pd.DataFrame.from_dict(books_titles)

In [15]:
# Turn title_rating into a numerical column to do comparision

titles["ratings"] = pd.to_numeric(titles["ratings"])

To construct a search engine we want to minimize our search engine as much as possible. The way we do that by minimizing the number of characters.

In [16]:
titles["mod_title"] = titles["title"].str.replace("[^a-zA-Z0-9]","", regex = True)

In [17]:
titles

Unnamed: 0,book_id,title,ratings,url,cover_image,mod_title
0,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",140,https://www.goodreads.com/book/show/7327624-th...,https://images.gr-assets.com/books/1304100136m...,TheUnschooledWizardSunWolfandStarhawk12
1,6066819,Best Friends Forever,51184,https://www.goodreads.com/book/show/6066819-be...,https://s.gr-assets.com/assets/nophoto/book/11...,BestFriendsForever
2,287141,The Aeneid for Boys and Girls,46,https://www.goodreads.com/book/show/287141.The...,https://s.gr-assets.com/assets/nophoto/book/11...,TheAeneidforBoysandGirls
3,6066812,All's Fairy in Love and War (Avalon: Web of Ma...,98,https://www.goodreads.com/book/show/6066812-al...,https://images.gr-assets.com/books/1316637798m...,AllsFairyinLoveandWarAvalonWebofMagic8
4,287149,The Devil's Notebook,986,https://www.goodreads.com/book/show/287149.The...,https://images.gr-assets.com/books/1328768789m...,TheDevilsNotebook
...,...,...,...,...,...,...
1308952,17805813,"Ondine (Ondine Quartet, #0.5)",327,https://www.goodreads.com/book/show/17805813-o...,https://images.gr-assets.com/books/1379766592m...,OndineOndineQuartet05
1308953,331839,Jacqueline Kennedy Onassis: Friend of the Arts,18,https://www.goodreads.com/book/show/331839.Jac...,https://s.gr-assets.com/assets/nophoto/book/11...,JacquelineKennedyOnassisFriendoftheArts
1308954,2685097,The Spaniard's Blackmailed Bride,112,https://www.goodreads.com/book/show/2685097-th...,https://s.gr-assets.com/assets/nophoto/book/11...,TheSpaniardsBlackmailedBride
1308955,2342551,The Children's Classic Poetry Collection,36,https://www.goodreads.com/book/show/2342551.Th...,https://s.gr-assets.com/assets/nophoto/book/11...,TheChildrensClassicPoetryCollection


In [18]:
titles["mod_title"] = titles["mod_title"].str.lower()

In [19]:
titles["mod_title"] = titles["mod_title"].str.replace("\s+"," ", regex = True)

In [20]:
titles

Unnamed: 0,book_id,title,ratings,url,cover_image,mod_title
0,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",140,https://www.goodreads.com/book/show/7327624-th...,https://images.gr-assets.com/books/1304100136m...,theunschooledwizardsunwolfandstarhawk12
1,6066819,Best Friends Forever,51184,https://www.goodreads.com/book/show/6066819-be...,https://s.gr-assets.com/assets/nophoto/book/11...,bestfriendsforever
2,287141,The Aeneid for Boys and Girls,46,https://www.goodreads.com/book/show/287141.The...,https://s.gr-assets.com/assets/nophoto/book/11...,theaeneidforboysandgirls
3,6066812,All's Fairy in Love and War (Avalon: Web of Ma...,98,https://www.goodreads.com/book/show/6066812-al...,https://images.gr-assets.com/books/1316637798m...,allsfairyinloveandwaravalonwebofmagic8
4,287149,The Devil's Notebook,986,https://www.goodreads.com/book/show/287149.The...,https://images.gr-assets.com/books/1328768789m...,thedevilsnotebook
...,...,...,...,...,...,...
1308952,17805813,"Ondine (Ondine Quartet, #0.5)",327,https://www.goodreads.com/book/show/17805813-o...,https://images.gr-assets.com/books/1379766592m...,ondineondinequartet05
1308953,331839,Jacqueline Kennedy Onassis: Friend of the Arts,18,https://www.goodreads.com/book/show/331839.Jac...,https://s.gr-assets.com/assets/nophoto/book/11...,jacquelinekennedyonassisfriendofthearts
1308954,2685097,The Spaniard's Blackmailed Bride,112,https://www.goodreads.com/book/show/2685097-th...,https://s.gr-assets.com/assets/nophoto/book/11...,thespaniardsblackmailedbride
1308955,2342551,The Children's Classic Poetry Collection,36,https://www.goodreads.com/book/show/2342551.Th...,https://s.gr-assets.com/assets/nophoto/book/11...,thechildrensclassicpoetrycollection


In [21]:
titles = titles[titles["mod_title"].str.len() > 0]

In [22]:
titles.to_json("books_titles.json")

In [23]:
titles

Unnamed: 0,book_id,title,ratings,url,cover_image,mod_title
0,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",140,https://www.goodreads.com/book/show/7327624-th...,https://images.gr-assets.com/books/1304100136m...,theunschooledwizardsunwolfandstarhawk12
1,6066819,Best Friends Forever,51184,https://www.goodreads.com/book/show/6066819-be...,https://s.gr-assets.com/assets/nophoto/book/11...,bestfriendsforever
2,287141,The Aeneid for Boys and Girls,46,https://www.goodreads.com/book/show/287141.The...,https://s.gr-assets.com/assets/nophoto/book/11...,theaeneidforboysandgirls
3,6066812,All's Fairy in Love and War (Avalon: Web of Ma...,98,https://www.goodreads.com/book/show/6066812-al...,https://images.gr-assets.com/books/1316637798m...,allsfairyinloveandwaravalonwebofmagic8
4,287149,The Devil's Notebook,986,https://www.goodreads.com/book/show/287149.The...,https://images.gr-assets.com/books/1328768789m...,thedevilsnotebook
...,...,...,...,...,...,...
1308952,17805813,"Ondine (Ondine Quartet, #0.5)",327,https://www.goodreads.com/book/show/17805813-o...,https://images.gr-assets.com/books/1379766592m...,ondineondinequartet05
1308953,331839,Jacqueline Kennedy Onassis: Friend of the Arts,18,https://www.goodreads.com/book/show/331839.Jac...,https://s.gr-assets.com/assets/nophoto/book/11...,jacquelinekennedyonassisfriendofthearts
1308954,2685097,The Spaniard's Blackmailed Bride,112,https://www.goodreads.com/book/show/2685097-th...,https://s.gr-assets.com/assets/nophoto/book/11...,thespaniardsblackmailedbride
1308955,2342551,The Children's Classic Poetry Collection,36,https://www.goodreads.com/book/show/2342551.Th...,https://s.gr-assets.com/assets/nophoto/book/11...,thechildrensclassicpoetrycollection


BUILDING THE BOOK SEARCH ENGINE

In [25]:
# To create "Term Frequency", "Inverse Document Frequency", and finally Tf*Idf

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

tfidf = vectorizer.fit_transform(titles["mod_title"])

In [26]:
#To do comparision 

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re

def make_clickable(val):
    return '<a target="_blank" href="{}">Goodreads</a>'.format(val, val)

def show_image(val):
    return '<a href="{}"><img src="{}" width=50></img></a>'.format(val, val)

def search(query,vectorizer):
    processed = re.sub("[^a-zA-Z0-9 ]", "", query.lower())
    query_vec = vectorizer.transform([query])
    similarity = cosine_similarity(query_vec, tfidf).flatten()
    indices = np.argpartition(similarity, -10)[-10:]
    results = titles.iloc[indices]
    results = results.sort_values("ratings", ascending=False)
    
    return results.head(5).style.format({'url': make_clickable, 'cover_image': show_image})

In [29]:
search("All's Fairy in Love and War", vectorizer)

Unnamed: 0,book_id,title,ratings,url,cover_image,mod_title
656512,8575298,War,917,Goodreads,,war
406995,6514664,War,333,Goodreads,,war
840752,23167716,In,276,Goodreads,,in
1019009,18040556,Fairy,93,Goodreads,,fairy
22786,8173973,War,69,Goodreads,,war


CREATING THE LIST OF LIKED BOOKS

In [32]:
liked_books = ["17381863", "35005834", "6514664", "7327624", "331839"]