In [6]:
import pandas as pd
import gzip
import json


In [15]:
def parse_fields(line):
    """Decode one JSON-encoded book record and keep only the fields used here.

    Args:
        line: A single JSON document (``str`` or ``bytes``) describing one book.

    Returns:
        dict: Keys ``goodreads_id``, ``title``, ``cover_image`` and ``ratings``,
        copied from the record's ``book_id``, ``title_without_series``,
        ``image_url`` and ``ratings_count`` entries. Values are passed through
        without any conversion.

    Raises:
        KeyError: If the record lacks any of the four source keys.
    """
    record = json.loads(line)
    # Output field name -> key in the raw record.
    source_keys = {
        "goodreads_id": "book_id",
        "title": "title_without_series",
        "cover_image": "image_url",
        "ratings": "ratings_count",
    }
    return {field: record[source] for field, source in source_keys.items()}

In [16]:
# Stream the gzipped dump one JSON record per line and keep only the books
# that have more than 5 ratings.
# NOTE: the path below is specific to one machine; adjust it before re-running.
books_titles = []
with gzip.open(r"C:\Users\Ben\Desktop\gphc-data\goodreads_books.json.gz") as f:
    # Iterating the file object yields one line at a time and stops at EOF,
    # replacing the manual readline()/break loop.
    for line in f:
        if not line.strip():
            # Tolerate blank lines rather than failing inside json.loads.
            continue
        fields = parse_fields(line)
        try:
            ratings = int(fields["ratings"])
        except (TypeError, ValueError):
            # Rating count is missing or not a number: skip the record.
            continue
        if ratings > 5:
            books_titles.append(fields)

In [19]:
# Spot-check one parsed record to confirm the field mapping.
print(books_titles[10])

{'goodreads_id': '89371', 'title': 'The Te Of Piglet', 'cover_image': 'https://s.gr-assets.com/assets/nophoto/book/111x148-bcc042a9c91a29c1d680899eff700a03.png', 'ratings': '11'}


In [23]:
# One row per kept book; the dict keys of each record become the columns.
titles = pd.DataFrame(books_titles)

In [26]:
# Rating counts arrive as strings; make them numeric so they can be sorted on.
titles["ratings"] = pd.to_numeric(titles["ratings"])

# Build a normalized copy of each title in ``mod_title`` for text matching.
titles["mod_title"] = (
    titles["title"]
    .str.replace(r"[^a-zA-Z0-9 ]", "", regex=True)  # keep ASCII letters, digits and spaces only
    .str.lower()
    .str.replace(r"\s+", " ", regex=True)  # raw string: "\s" is an invalid escape in a plain literal
    .str.strip()  # otherwise a title reduced to a lone space would pass the filter below
)

# Drop titles that have nothing left to match on after normalization.
titles = titles[titles["mod_title"].str.len() > 0]

In [27]:
titles.head()

Unnamed: 0,goodreads_id,title,cover_image,ratings,mod_title
0,1333909,Good Harbor,https://s.gr-assets.com/assets/nophoto/book/11...,10,good harbor
1,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",https://images.gr-assets.com/books/1304100136m...,140,the unschooled wizard sun wolf and starhawk 12
2,6066819,Best Friends Forever,https://s.gr-assets.com/assets/nophoto/book/11...,51184,best friends forever
3,287140,Runic Astrology: Starcraft and Timekeeping in ...,https://images.gr-assets.com/books/1413219371m...,15,runic astrology starcraft and timekeeping in t...
4,287141,The Aeneid for Boys and Girls,https://s.gr-assets.com/assets/nophoto/book/11...,46,the aeneid for boys and girls


In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Vectorizer with default settings; the fitted instance is passed to search()
# later so queries are embedded with the same vocabulary.
vectorizer = TfidfVectorizer()

# Fit on the normalized titles. search() maps row positions of this matrix back
# to books with titles.iloc, so `tfidf` must be rebuilt whenever `titles` changes.
tfidf = vectorizer.fit_transform(titles["mod_title"])

In [29]:
# Load the inventory table whose titles are matched against the Goodreads titles below.
# NOTE: machine-specific absolute path; adjust before re-running elsewhere.
inv_df = pd.read_csv(r"C:\Users\Ben\Desktop\gphc-data\inv_data.csv")

In [48]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re

def search(query, vectorizer):
    """Return the Goodreads id of the most-rated title that resembles ``query``.

    The query is normalized the same way as ``titles["mod_title"]``, embedded
    with ``vectorizer`` and compared by cosine similarity against every row of
    the module-level ``tfidf`` matrix. Among the (up to) ten most similar rows
    of ``titles``, the one with the highest ``ratings`` is selected.

    Args:
        query: Title text to look up.
        vectorizer: The fitted vectorizer that produced ``tfidf``.

    Returns:
        int: The ``goodreads_id`` of the selected row.
    """
    processed = re.sub("[^a-zA-Z0-9 ]", "", query.lower())
    processed = re.sub(r"\s+", " ", processed).strip()
    # Embed the normalized text. The raw query used to be vectorized here, which
    # left `processed` unused and tokenized punctuation differently from the
    # indexed titles (e.g. "don't" vs. the indexed "dont").
    query_vec = vectorizer.transform([processed])
    similarity = cosine_similarity(query_vec, tfidf).flatten()
    # argpartition rejects a kth beyond the array length, so cap the candidate
    # count for small indexes.
    top_k = min(10, similarity.size)
    indices = np.argpartition(similarity, -top_k)[-top_k:]
    # Among the closest matches prefer the most-rated edition.
    results = titles.iloc[indices].sort_values("ratings", ascending=False)

    return int(results.head(1)['goodreads_id'].values[0])

In [49]:
# Smoke test: look up a partial title.
search('harry potter child', vectorizer)

29058155

In [50]:
# Preview the inventory rows before matching.
inv_df.head()

Unnamed: 0.1,Unnamed: 0,title,author_last,author_first
0,0,of mice and men,steinbeck,john
1,1,the unforgotten a novel,powell,laura
2,2,star wars the clone wars,hibbert,clare
3,3,star wars the clone wars,beecroft,simon
4,4,wow the pop up book of sports,foster,bruce


In [54]:
# Match every inventory title to a Goodreads id; search() is called once per row.
inv_df['gr_id'] = inv_df['title'].apply(search, vectorizer=vectorizer)

In [57]:
# Eyeball a random sample of matches (no random_state is given, so rows differ between runs).
inv_df.sample(10)

Unnamed: 0.1,Unnamed: 0,title,author_last,author_first,gr_id
10208,10208,sales questions that close every deal 1 000 fi...,gschwandtner,gerhard,23183
192,192,americas sewing book,ley,sandra,6396253
58,58,the lies that bind,giffin,emily,11524267
10066,10066,always my dad,wyeth,sharon dennis,269518
7784,7784,the elephant alphabet book,yates,gene,1498767
422,422,signing exact english,gustason,gerilee,530316
2691,2691,the night before christmas the classic edition,moore,clement clarke,447041
2804,2804,the mighty miss malone,curtis,christopher paul,11288619
8677,8677,when grandma came,paton walsh,jill,962445
9990,9990,plain jane,michaels,fern,431396


In [85]:
def find_by_index(index):
    """Return the cover-image URL of the book whose Goodreads id equals ``index``.

    Args:
        index: Goodreads id to look up. It is formatted to text before the
            comparison against ``titles['goodreads_id']``.

    Returns:
        The ``cover_image`` value of the first matching row of ``titles``.

    Raises:
        IndexError: If no row of ``titles`` has that id.
    """
    id_matches = titles['goodreads_id'] == f'{index}'
    cover_urls = titles.loc[id_matches, 'cover_image']
    return cover_urls.values[0]

In [86]:
# Try the lookup on a single Goodreads id.
find_by_index(23183)

'https://s.gr-assets.com/assets/nophoto/book/111x148-bcc042a9c91a29c1d680899eff700a03.png'

In [87]:
# Attach the cover-image URL for each matched Goodreads id.
inv_df['cover_img_url'] = inv_df['gr_id'].apply(find_by_index)

In [90]:
# Eyeball a random sample of rows with their cover URLs (unseeded, so rows differ between runs).
inv_df.sample(10)

Unnamed: 0.1,Unnamed: 0,title,author_last,author_first,gr_id,cover_img_url
8740,8740,meet josefina an american girl,tripp,valerie,722946,https://images.gr-assets.com/books/1375159319m...
11341,11341,strange animals,whitcombe,bobbie,3023338,https://s.gr-assets.com/assets/nophoto/book/11...
8771,8771,deadly decisions,reichs,kathleen j,15720979,https://images.gr-assets.com/books/1340577803m...
3391,3391,the wailing wind,hillerman,tony,24874341,https://images.gr-assets.com/books/1423751051m...
773,773,pete the cat the wheels on the bus,dean,james,859744,https://images.gr-assets.com/books/1309213567m...
1838,1838,my friend maggie,harrison,hannah e,27833832,https://images.gr-assets.com/books/1453059498m...
8330,8330,ageless body timeless mind the quantum alterna...,chopra,deepak,836512,https://images.gr-assets.com/books/1333578683m...
4242,4242,mrs mcnosh and the great big squash,weeks,sarah,1059041,https://images.gr-assets.com/books/1358747130m...
6427,6427,ricky ricottas giant robot vs the mutant mosqu...,pilkey,dav,532634,https://s.gr-assets.com/assets/nophoto/book/11...
1615,1615,dangerous,hale,shannon,8585924,https://images.gr-assets.com/books/1369672030m...


In [91]:
# Persist the enriched inventory. index=False keeps the positional row index out
# of the file; writing it is what yields the extra "Unnamed: 0" columns when a
# CSV is read back without index_col.
inv_df.to_csv('inv_data_gr.csv', index=False)