In [18]:
# Explore the data: `wc -l` is a command-line utility that counts the newline characters in a file.
# The exclamation point runs the rest of the line as a shell command.
# NOTE: this file is gzip-compressed, so the number printed counts newline bytes in the
# compressed data, not JSON records; `gzip -dc goodreads_books.json.gz | wc -l` would count records.
!wc -l goodreads_books.json.gz

 7588375 goodreads_books.json.gz


In [20]:
# Show the size of the dataset file: list the directory in long, human-readable form
# and keep only the line for goodreads_books.json.gz
!ls -lh | grep goodreads_books.json.gz

-rw-r--r--@ 1 andrewcrawford  staff   1.9G Aug  5 14:16 goodreads_books.json.gz


In [1]:
# Read the file in a streaming fashion - line by line - so that we do not use as much memory.
# gzip.open decompresses on the fly, so the archive does not have to be unzipped to disk first.
import gzip 

with gzip.open("goodreads_books.json.gz", 'r') as f:
    # Only the first line is read here, as a sample of the data
    line = f.readline()

In [2]:
# Use json to parse the single line read from the file; the result is a dict describing one book
import json

json.loads(line)

{'isbn': '0312853122',
 'text_reviews_count': '1',
 'series': [],
 'country_code': 'US',
 'language_code': '',
 'popular_shelves': [{'count': '3', 'name': 'to-read'},
  {'count': '1', 'name': 'p'},
  {'count': '1', 'name': 'collection'},
  {'count': '1', 'name': 'w-c-fields'},
  {'count': '1', 'name': 'biography'}],
 'asin': '',
 'is_ebook': 'false',
 'average_rating': '4.00',
 'kindle_asin': '',
 'similar_books': [],
 'description': '',
 'format': 'Paperback',
 'link': 'https://www.goodreads.com/book/show/5333265-w-c-fields',
 'authors': [{'author_id': '604031', 'role': ''}],
 'publisher': "St. Martin's Press",
 'num_pages': '256',
 'publication_day': '1',
 'isbn13': '9780312853129',
 'publication_month': '9',
 'edition_information': '',
 'publication_year': '1984',
 'url': 'https://www.goodreads.com/book/show/5333265-w-c-fields',
 'image_url': 'https://images.gr-assets.com/books/1310220028m/5333265.jpg',
 'book_id': '5333265',
 'ratings_count': '3',
 'work_id': '5400751',
 'title': '

In [3]:
# Scaling the same idea, create a dictionary loading each line as a json and extracting the data we want
def parse_fields(line):
    """Pick out the handful of book attributes we need from one JSON line.

    Args:
        line: A single JSON-encoded object (anything ``json.loads`` accepts).

    Returns:
        dict with the keys "book_id", "title", "ratings", "url" and
        "cover_image", whose values are copied unchanged from the parsed record.

    Raises:
        KeyError: if the parsed record lacks one of the source fields.
    """
    record = json.loads(line)
    # (output key, key in the raw record) - order defines the order of the result
    wanted = (
        ("book_id", "book_id"),
        ("title", "title_without_series"),
        ("ratings", "ratings_count"),
        ("url", "url"),
        ("cover_image", "image_url"),
    )
    return {out_key: record[src_key] for out_key, src_key in wanted}

In [4]:
# Stream the whole file one line at a time and keep only the fields we need,
# so that the resulting list of books fits in memory
books_titles = []
with gzip.open("goodreads_books.json.gz", "r") as f:
    # Iterating over the file object yields one line at a time until end of file
    for line in f:
        fields = parse_fields(line)

        try:
            # turn the ratings field into an integer
            ratings = int(fields["ratings"])
        except ValueError:
            # the ratings value cannot be parsed as an integer: skip this book
            continue
        if ratings > 5:  # Only take books with more than five ratings
            books_titles.append(fields)

In [5]:
# Build a DataFrame from the books_titles list: each dictionary becomes one row
# and its keys become the columns
import pandas as pd

titles = pd.DataFrame(books_titles)

In [6]:
# Convert the ratings column to a numeric dtype so that it can be compared and sorted as numbers
titles = titles.assign(ratings=pd.to_numeric(titles["ratings"]))

In [7]:
# Shrink the search space for the search engine by stripping characters we do not need.
# The pattern matches every character outside a-z, A-Z, 0-9 and space; each match is replaced with "" (removed).
cleaned_titles = titles["title"].str.replace("[^a-zA-Z0-9 ]", "", regex=True)
titles["mod_title"] = cleaned_titles

In [8]:
# Inspect the frame to compare the new mod_title column with the original title
titles

Unnamed: 0,book_id,title,ratings,url,cover_image,mod_title
0,1333909,Good Harbor,10,https://www.goodreads.com/book/show/1333909.Go...,https://s.gr-assets.com/assets/nophoto/book/11...,Good Harbor
1,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",140,https://www.goodreads.com/book/show/7327624-th...,https://images.gr-assets.com/books/1304100136m...,The Unschooled Wizard Sun Wolf and Starhawk 12
2,6066819,Best Friends Forever,51184,https://www.goodreads.com/book/show/6066819-be...,https://s.gr-assets.com/assets/nophoto/book/11...,Best Friends Forever
3,287140,Runic Astrology: Starcraft and Timekeeping in ...,15,https://www.goodreads.com/book/show/287140.Run...,https://images.gr-assets.com/books/1413219371m...,Runic Astrology Starcraft and Timekeeping in t...
4,287141,The Aeneid for Boys and Girls,46,https://www.goodreads.com/book/show/287141.The...,https://s.gr-assets.com/assets/nophoto/book/11...,The Aeneid for Boys and Girls
...,...,...,...,...,...,...
1782574,3084038,"This Sceptred Isle, Vol. 10: The Age of Victor...",12,https://www.goodreads.com/book/show/3084038-th...,https://images.gr-assets.com/books/1494763458m...,This Sceptred Isle Vol 10 The Age of Victoria ...
1782575,26168430,Sherlock Holmes and the July Crisis,6,https://www.goodreads.com/book/show/26168430-s...,https://images.gr-assets.com/books/1440592011m...,Sherlock Holmes and the July Crisis
1782576,2342551,The Children's Classic Poetry Collection,36,https://www.goodreads.com/book/show/2342551.Th...,https://s.gr-assets.com/assets/nophoto/book/11...,The Childrens Classic Poetry Collection
1782577,22017381,"101 Nights: Volume One (101 Nights, #1-3)",70,https://www.goodreads.com/book/show/22017381-1...,https://images.gr-assets.com/books/1398621236m...,101 Nights Volume One 101 Nights 13


In [9]:
# Lowercase everything in mod_title (the search function lowercases the query too)
titles["mod_title"] = titles["mod_title"].str.lower()

In [10]:
# Collapse any run of consecutive whitespace (say three spaces in a row) into a single space " ".
# \s+ matches one or more whitespace characters. The pattern is written as a raw string so the
# backslash reaches the regex engine instead of being read as an (invalid) Python string escape.
titles["mod_title"] = titles["mod_title"].str.replace(r"\s+", " ", regex=True)

In [11]:
# Inspect the frame again: mod_title is now lowercase with runs of whitespace collapsed
titles

Unnamed: 0,book_id,title,ratings,url,cover_image,mod_title
0,1333909,Good Harbor,10,https://www.goodreads.com/book/show/1333909.Go...,https://s.gr-assets.com/assets/nophoto/book/11...,good harbor
1,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",140,https://www.goodreads.com/book/show/7327624-th...,https://images.gr-assets.com/books/1304100136m...,the unschooled wizard sun wolf and starhawk 12
2,6066819,Best Friends Forever,51184,https://www.goodreads.com/book/show/6066819-be...,https://s.gr-assets.com/assets/nophoto/book/11...,best friends forever
3,287140,Runic Astrology: Starcraft and Timekeeping in ...,15,https://www.goodreads.com/book/show/287140.Run...,https://images.gr-assets.com/books/1413219371m...,runic astrology starcraft and timekeeping in t...
4,287141,The Aeneid for Boys and Girls,46,https://www.goodreads.com/book/show/287141.The...,https://s.gr-assets.com/assets/nophoto/book/11...,the aeneid for boys and girls
...,...,...,...,...,...,...
1782574,3084038,"This Sceptred Isle, Vol. 10: The Age of Victor...",12,https://www.goodreads.com/book/show/3084038-th...,https://images.gr-assets.com/books/1494763458m...,this sceptred isle vol 10 the age of victoria ...
1782575,26168430,Sherlock Holmes and the July Crisis,6,https://www.goodreads.com/book/show/26168430-s...,https://images.gr-assets.com/books/1440592011m...,sherlock holmes and the july crisis
1782576,2342551,The Children's Classic Poetry Collection,36,https://www.goodreads.com/book/show/2342551.Th...,https://s.gr-assets.com/assets/nophoto/book/11...,the childrens classic poetry collection
1782577,22017381,"101 Nights: Volume One (101 Nights, #1-3)",70,https://www.goodreads.com/book/show/22017381-1...,https://images.gr-assets.com/books/1398621236m...,101 nights volume one 101 nights 13


In [12]:
# Drop every row whose mod_title is now an empty string, keeping only rows where it has characters
has_characters = titles["mod_title"].str.len() > 0
titles = titles[has_characters]

In [13]:
# Persist the cleaned titles to disk for future use
titles.to_json(path_or_buf="books_titles.json")

In [14]:
# scikit-learn (machine learning library) has a Term Frequency - Inverse Document Frequency (TF-IDF) matrix builder
# The vectorizer takes a list of strings and turns it into the TF-IDF matrix

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

# Fit the vectorizer on the cleaned titles and build the matrix in one call
tfidf = vectorizer.fit_transform(titles["mod_title"])

In [15]:
# Add code to turn a search query into a vector and then match it against the matrix and do a comparison
# Use Cosine similarity to do the comparison

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re

# Style this column by using this function to build an HTML element
def make_clickable(val):
    """Build an HTML link labelled "Goodreads" that opens ``val`` in a new tab.

    Args:
        val: The link target; it is converted to text with ``str``.

    Returns:
        str: an ``<a>`` element whose ``href`` is the HTML-escaped value.
    """
    import html  # local import keeps this function self-contained

    # Escape the value (including quotes) so that characters such as " < > &
    # coming from the dataset cannot break out of the href attribute.
    return '<a target="_blank" href="{}">Goodreads</a>'.format(html.escape(str(val), quote=True))

# Using the style method in pandas, show the cover image
def show_image(val):
    """Build an HTML image element, 50 wide, that displays the image at ``val``.

    Args:
        val: The image location; it is converted to text with ``str``.

    Returns:
        str: an ``<img>`` element whose ``src`` is the HTML-escaped value.
    """
    import html  # local import keeps this function self-contained

    # Escape the value (including quotes) so that characters such as " < > &
    # coming from the dataset cannot break out of the src attribute.
    return '<img src="{}" width=50></img>'.format(html.escape(str(val), quote=True))

# Search for a book based upon title as query
def search(query, vectorizer, n_candidates=10, n_results=5):
    """Search for books whose cleaned title is most similar to ``query``.

    Uses the module-level ``tfidf`` matrix and ``titles`` DataFrame, and the
    ``make_clickable`` / ``show_image`` formatters.

    Args:
        query: Free-text title to look for.
        vectorizer: The fitted vectorizer used to turn the query into a vector.
        n_candidates: How many of the most similar titles to consider (default 10).
        n_results: How many of those candidates to return, after sorting them
            by number of ratings (default 5).

    Returns:
        A pandas Styler over the top rows, with ``url`` rendered as a link and
        ``cover_image`` rendered as an image.
    """
    # Lowercase the query and remove every character that is NOT a-z, A-Z, 0-9 or space
    # (the same character filter that was applied to mod_title)
    processed = re.sub("[^a-zA-Z0-9 ]", "", query.lower())
    query_vec = vectorizer.transform([processed])  # Turn query into a vector
    # Similarity of the query against every row of the matrix, flattened to a 1-D numpy array
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Never ask for more candidates than there are rows: argpartition raises
    # when kth is out of bounds.
    k = max(0, min(n_candidates, similarity.shape[0]))
    if k == 0:
        indices = np.array([], dtype=int)
    else:
        # argpartition puts the k largest similarity values (unordered) in the last k slots
        indices = np.argpartition(similarity, -k)[-k:]

    # Positional lookup: row i of the matrix corresponds to row i of titles
    results = titles.iloc[indices]
    results = results.sort_values("ratings", ascending=False)  # Most-rated candidates first

    return results.head(n_results).style.format({'url': make_clickable, 'cover_image': show_image})


In [16]:
# Example query. NOTE(review): "thd" looks like a typo for "the" - confirm whether it is intentional
search("lord of thd rings", vectorizer)

Unnamed: 0,book_id,title,ratings,url,cover_image,mod_title
204655,11047557,The Lord of the Rings,2062,Goodreads,,the lord of the rings
1652008,39,The Lord of the Rings,257,Goodreads,,the lord of the rings
1280866,899801,The Lord of the Rings,77,Goodreads,,the lord of the rings
1332252,2527331,The Lord of the Rings,69,Goodreads,,the lord of the rings
293707,15347,The Lord of the Rings,27,Goodreads,,the lord of the rings


In [17]:
# Presumably Goodreads book ids of books the user liked (11047557 is the book_id of the top search result above).
# NOTE(review): these are ints, whereas book_id values parsed from the JSON are strings and only the
# ratings column is converted to numeric above - confirm that later comparisons account for that.
liked_books = [581526, 20562717, 944652, 11047557]