In [4]:
import gzip

with gzip.open("./dataset/goodreads_books.json.gz", 'r') as f:
    line = f.readline()

In [5]:
line

b'{"isbn": "0312853122", "text_reviews_count": "1", "series": [], "country_code": "US", "language_code": "", "popular_shelves": [{"count": "3", "name": "to-read"}, {"count": "1", "name": "p"}, {"count": "1", "name": "collection"}, {"count": "1", "name": "w-c-fields"}, {"count": "1", "name": "biography"}], "asin": "", "is_ebook": "false", "average_rating": "4.00", "kindle_asin": "", "similar_books": [], "description": "", "format": "Paperback", "link": "https://www.goodreads.com/book/show/5333265-w-c-fields", "authors": [{"author_id": "604031", "role": ""}], "publisher": "St. Martin\'s Press", "num_pages": "256", "publication_day": "1", "isbn13": "9780312853129", "publication_month": "9", "edition_information": "", "publication_year": "1984", "url": "https://www.goodreads.com/book/show/5333265-w-c-fields", "image_url": "https://images.gr-assets.com/books/1310220028m/5333265.jpg", "book_id": "5333265", "ratings_count": "3", "work_id": "5400751", "title": "W.C. Fields: A Life on Film", "t

In [7]:
import json

json.loads(line)

{'isbn': '0312853122',
 'text_reviews_count': '1',
 'series': [],
 'country_code': 'US',
 'language_code': '',
 'popular_shelves': [{'count': '3', 'name': 'to-read'},
  {'count': '1', 'name': 'p'},
  {'count': '1', 'name': 'collection'},
  {'count': '1', 'name': 'w-c-fields'},
  {'count': '1', 'name': 'biography'}],
 'asin': '',
 'is_ebook': 'false',
 'average_rating': '4.00',
 'kindle_asin': '',
 'similar_books': [],
 'description': '',
 'format': 'Paperback',
 'link': 'https://www.goodreads.com/book/show/5333265-w-c-fields',
 'authors': [{'author_id': '604031', 'role': ''}],
 'publisher': "St. Martin's Press",
 'num_pages': '256',
 'publication_day': '1',
 'isbn13': '9780312853129',
 'publication_month': '9',
 'edition_information': '',
 'publication_year': '1984',
 'url': 'https://www.goodreads.com/book/show/5333265-w-c-fields',
 'image_url': 'https://images.gr-assets.com/books/1310220028m/5333265.jpg',
 'book_id': '5333265',
 'ratings_count': '3',
 'work_id': '5400751',
 'title': '

In [8]:
def parse_fields(line):
    data = json.loads(line)

    return{
        "book_id": data["book_id"], 
        "title": data["title_without_series"], 
        "ratings": data["ratings_count"], 
        "url": data["url"], 
        "cover_image": data["image_url"]
    }

In [9]:
book_titles = []

with gzip.open("./dataset/goodreads_books.json.gz") as f:
    while True:
        line = f.readline()
        if not line:
            break
        fields = parse_fields(line)

        try:
            ratings = int(fields['ratings'])
        except ValueError:
            continue

        if ratings > 5:
            book_titles.append(fields)

In [10]:
import pandas as pd

titles = pd.DataFrame.from_dict(book_titles)

In [11]:
titles.head(20)

Unnamed: 0,book_id,title,ratings,url,cover_image
0,1333909,Good Harbor,10,https://www.goodreads.com/book/show/1333909.Go...,https://s.gr-assets.com/assets/nophoto/book/11...
1,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",140,https://www.goodreads.com/book/show/7327624-th...,https://images.gr-assets.com/books/1304100136m...
2,6066819,Best Friends Forever,51184,https://www.goodreads.com/book/show/6066819-be...,https://s.gr-assets.com/assets/nophoto/book/11...
3,287140,Runic Astrology: Starcraft and Timekeeping in ...,15,https://www.goodreads.com/book/show/287140.Run...,https://images.gr-assets.com/books/1413219371m...
4,287141,The Aeneid for Boys and Girls,46,https://www.goodreads.com/book/show/287141.The...,https://s.gr-assets.com/assets/nophoto/book/11...
5,378460,The Wanting of Levine,12,https://www.goodreads.com/book/show/378460.The...,https://s.gr-assets.com/assets/nophoto/book/11...
6,6066812,All's Fairy in Love and War (Avalon: Web of Ma...,98,https://www.goodreads.com/book/show/6066812-al...,https://images.gr-assets.com/books/1316637798m...
7,287149,The Devil's Notebook,986,https://www.goodreads.com/book/show/287149.The...,https://images.gr-assets.com/books/1328768789m...
8,6066814,"Crowner Royal (Crowner John Mystery, #13)",186,https://www.goodreads.com/book/show/6066814-cr...,https://images.gr-assets.com/books/1328724803m...
9,33394837,The House of Memory (Pluto's Snitch #2),269,https://www.goodreads.com/book/show/33394837-t...,https://images.gr-assets.com/books/1493114742m...


In [12]:
titles['ratings'] = pd.to_numeric(titles['ratings'])

In [15]:
titles["mod_title"] = titles["title"].str.replace("[^a-zA-Z0-9 ]", "", regex=True)  # removing anything other than alphabets and digits

In [16]:
titles["mod_title"] = titles["mod_title"].str.lower() # all titles are in lower case

In [17]:
titles["mod_title"] = titles["mod_title"].str.replace("\s+", " ", regex=True) # Remove any extra spaces

In [18]:
titles = titles[titles["mod_title"].str.len() > 0] # Remove books with no titles

In [20]:
titles.head(10) # here we can see the new column with modified title

Unnamed: 0,book_id,title,ratings,url,cover_image,mod_title
0,1333909,Good Harbor,10,https://www.goodreads.com/book/show/1333909.Go...,https://s.gr-assets.com/assets/nophoto/book/11...,good harbor
1,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",140,https://www.goodreads.com/book/show/7327624-th...,https://images.gr-assets.com/books/1304100136m...,the unschooled wizard sun wolf and starhawk 12
2,6066819,Best Friends Forever,51184,https://www.goodreads.com/book/show/6066819-be...,https://s.gr-assets.com/assets/nophoto/book/11...,best friends forever
3,287140,Runic Astrology: Starcraft and Timekeeping in ...,15,https://www.goodreads.com/book/show/287140.Run...,https://images.gr-assets.com/books/1413219371m...,runic astrology starcraft and timekeeping in t...
4,287141,The Aeneid for Boys and Girls,46,https://www.goodreads.com/book/show/287141.The...,https://s.gr-assets.com/assets/nophoto/book/11...,the aeneid for boys and girls
5,378460,The Wanting of Levine,12,https://www.goodreads.com/book/show/378460.The...,https://s.gr-assets.com/assets/nophoto/book/11...,the wanting of levine
6,6066812,All's Fairy in Love and War (Avalon: Web of Ma...,98,https://www.goodreads.com/book/show/6066812-al...,https://images.gr-assets.com/books/1316637798m...,alls fairy in love and war avalon web of magic 8
7,287149,The Devil's Notebook,986,https://www.goodreads.com/book/show/287149.The...,https://images.gr-assets.com/books/1328768789m...,the devils notebook
8,6066814,"Crowner Royal (Crowner John Mystery, #13)",186,https://www.goodreads.com/book/show/6066814-cr...,https://images.gr-assets.com/books/1328724803m...,crowner royal crowner john mystery 13
9,33394837,The House of Memory (Pluto's Snitch #2),269,https://www.goodreads.com/book/show/33394837-t...,https://images.gr-assets.com/books/1493114742m...,the house of memory plutos snitch 2


In [21]:
titles.to_json("./dataset/books_title.json")

In [22]:
titles

Unnamed: 0,book_id,title,ratings,url,cover_image,mod_title
0,1333909,Good Harbor,10,https://www.goodreads.com/book/show/1333909.Go...,https://s.gr-assets.com/assets/nophoto/book/11...,good harbor
1,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",140,https://www.goodreads.com/book/show/7327624-th...,https://images.gr-assets.com/books/1304100136m...,the unschooled wizard sun wolf and starhawk 12
2,6066819,Best Friends Forever,51184,https://www.goodreads.com/book/show/6066819-be...,https://s.gr-assets.com/assets/nophoto/book/11...,best friends forever
3,287140,Runic Astrology: Starcraft and Timekeeping in ...,15,https://www.goodreads.com/book/show/287140.Run...,https://images.gr-assets.com/books/1413219371m...,runic astrology starcraft and timekeeping in t...
4,287141,The Aeneid for Boys and Girls,46,https://www.goodreads.com/book/show/287141.The...,https://s.gr-assets.com/assets/nophoto/book/11...,the aeneid for boys and girls
...,...,...,...,...,...,...
1782574,3084038,"This Sceptred Isle, Vol. 10: The Age of Victor...",12,https://www.goodreads.com/book/show/3084038-th...,https://images.gr-assets.com/books/1494763458m...,this sceptred isle vol 10 the age of victoria ...
1782575,26168430,Sherlock Holmes and the July Crisis,6,https://www.goodreads.com/book/show/26168430-s...,https://images.gr-assets.com/books/1440592011m...,sherlock holmes and the july crisis
1782576,2342551,The Children's Classic Poetry Collection,36,https://www.goodreads.com/book/show/2342551.Th...,https://s.gr-assets.com/assets/nophoto/book/11...,the childrens classic poetry collection
1782577,22017381,"101 Nights: Volume One (101 Nights, #1-3)",70,https://www.goodreads.com/book/show/22017381-1...,https://images.gr-assets.com/books/1398621236m...,101 nights volume one 101 nights 13


In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

tfidf = vectorizer.fit_transform(titles["mod_title"])  # a matrix of similarity 

In [26]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re

# Writing html tag to get a Clickable url link of books
def make_clickable(val):
    return '<a target="_blank" href="{}">Goodreads</a>'.format(val, val)

# Showing the book's cover image
def show_image(val):
    return '<a href="{}"><img src="{}" width=50></img></a>'.format(val, val)

def search(query,vectorizer):
    processed = re.sub("[^a-zA-Z0-9 ]", "", query.lower())       # to lower and clean query
    query_vec = vectorizer.transform([processed])                # covert query to matrix
    similarity = cosine_similarity(query_vec, tfidf).flatten()   # Find Similar titles to query
    indices = np.argpartition(similarity, -10)[-10:]             # Return top 10 similar results with their indices
    results = titles.iloc[indices]                               # Find and store results
    results = results.sort_values("ratings", ascending=False)    # Find Best result based on Max no. of Ratings
    
    return results.head(5).style.format({'url': make_clickable, 'cover_image': show_image})

In [28]:
search("The Alchemist", vectorizer)

Unnamed: 0,book_id,title,ratings,url,cover_image,mod_title
1194072,865,The Alchemist,1342863,Goodreads,,the alchemist
397596,71000,The Alchemist,2738,Goodreads,,the alchemist
848815,1175481,The Alchemist,1748,Goodreads,,the alchemist
146065,6721985,The Alchemist,295,Goodreads,,the alchemist
1401437,17470676,The Alchemist,53,Goodreads,,the alchemist


In [29]:
# custom user's licked books, storing their bookid's
liked_books = ["8132407", "31147619", "29983711", "1194072"]

In [31]:
csv_book_mapping = {}

with open("./dataset/book_id_map.csv", "r") as f:
    while True:
        line = f.readline()
        if not line:
            break
        
        csv_id, book_id = line.split(",")
        csv_book_mapping[csv_id] = book_id

In [32]:
len(csv_book_mapping)

2360651

In [33]:
csv_book_mapping

{'book_id_csv': 'book_id\n',
 '0': '34684622\n',
 '1': '34536488\n',
 '2': '34017076\n',
 '3': '71730\n',
 '4': '30422361\n',
 '5': '33503613\n',
 '6': '33517540\n',
 '7': '34467031\n',
 '8': '6383669\n',
 '9': '486625\n',
 '10': '19161852\n',
 '11': '32620335\n',
 '12': '13152847\n',
 '13': '25735618\n',
 '14': '24375664\n',
 '15': '35097384\n',
 '16': '34974754\n',
 '17': '836610\n',
 '18': '6565837\n',
 '19': '25899336\n',
 '20': '26114127\n',
 '21': '18245960\n',
 '22': '6392944\n',
 '23': '22078596\n',
 '24': '6644782\n',
 '25': '46164\n',
 '26': '27833614\n',
 '27': '10419045\n',
 '28': '255127\n',
 '29': '28251002\n',
 '30': '23168277\n',
 '31': '32075825\n',
 '32': '30622471\n',
 '33': '22591134\n',
 '34': '31580409\n',
 '35': '133394\n',
 '36': '23848190\n',
 '37': '16130\n',
 '38': '30139664\n',
 '39': '32191710\n',
 '40': '29939161\n',
 '41': '45252\n',
 '42': '32283423\n',
 '43': '43615\n',
 '44': '29905580\n',
 '45': '18364442\n',
 '46': '873993\n',
 '47': '26721984\n',
 '

In [34]:
overlap_users = set()

with open("./dataset/goodreads_interactions.csv", 'r') as f:
    while True:
        

IndentationError: expected an indented block (1348445689.py, line 4)