In [3]:
import gzip  # Import the gzip module to handle .gz compressed files

# Path to the gzip-compressed file that holds one JSON book record per line
filename = "goodreads_books.json.gz"

# Stream the archive as UTF-8 text and tally its lines without keeping them in memory
with gzip.open(filename, 'rt', encoding='utf-8') as books_file:
    line_count = sum(1 for _ in books_file)

# Report the total (one line per book record)
print(f"Number of lines: {line_count}")

Number of lines: 2360655


In [4]:
import gzip  # Import the gzip module to handle compressed .gz files

# Pull just the first record out of the archive. No mode is passed, so gzip opens
# the file in its default binary mode and `line` is a bytes object.
with gzip.open("goodreads_books.json.gz") as books_file:
    line = books_file.readline()


In [5]:
# Inspect the raw first record (bytes, because the file was opened without a text mode)
line

b'{"isbn": "0312853122", "text_reviews_count": "1", "series": [], "country_code": "US", "language_code": "", "popular_shelves": [{"count": "3", "name": "to-read"}, {"count": "1", "name": "p"}, {"count": "1", "name": "collection"}, {"count": "1", "name": "w-c-fields"}, {"count": "1", "name": "biography"}], "asin": "", "is_ebook": "false", "average_rating": "4.00", "kindle_asin": "", "similar_books": [], "description": "", "format": "Paperback", "link": "https://www.goodreads.com/book/show/5333265-w-c-fields", "authors": [{"author_id": "604031", "role": ""}], "publisher": "St. Martin\'s Press", "num_pages": "256", "publication_day": "1", "isbn13": "9780312853129", "publication_month": "9", "edition_information": "", "publication_year": "1984", "url": "https://www.goodreads.com/book/show/5333265-w-c-fields", "image_url": "https://images.gr-assets.com/books/1310220028m/5333265.jpg", "book_id": "5333265", "ratings_count": "3", "work_id": "5400751", "title": "W.C. Fields: A Life on Film", "t

In [6]:
import json  # Import the json module to parse JSON data

# Parse the raw JSON record (the bytes read from the file above) into a Python dictionary
data = json.loads(line)

# Display the parsed metadata of the first book
data


{'isbn': '0312853122',
 'text_reviews_count': '1',
 'series': [],
 'country_code': 'US',
 'language_code': '',
 'popular_shelves': [{'count': '3', 'name': 'to-read'},
  {'count': '1', 'name': 'p'},
  {'count': '1', 'name': 'collection'},
  {'count': '1', 'name': 'w-c-fields'},
  {'count': '1', 'name': 'biography'}],
 'asin': '',
 'is_ebook': 'false',
 'average_rating': '4.00',
 'kindle_asin': '',
 'similar_books': [],
 'description': '',
 'format': 'Paperback',
 'link': 'https://www.goodreads.com/book/show/5333265-w-c-fields',
 'authors': [{'author_id': '604031', 'role': ''}],
 'publisher': "St. Martin's Press",
 'num_pages': '256',
 'publication_day': '1',
 'isbn13': '9780312853129',
 'publication_month': '9',
 'edition_information': '',
 'publication_year': '1984',
 'url': 'https://www.goodreads.com/book/show/5333265-w-c-fields',
 'image_url': 'https://images.gr-assets.com/books/1310220028m/5333265.jpg',
 'book_id': '5333265',
 'ratings_count': '3',
 'work_id': '5400751',
 'title': '

In [7]:
def parse_fields(line):
    """Extract the book fields used by this notebook from one JSON record.

    Args:
        line: A single JSON-encoded book record.

    Returns:
        dict with the keys "book_id", "title", "ratings", "url" and
        "cover_image", whose values are copied unchanged from the record.

    Raises:
        KeyError: if the record lacks one of the source fields.
    """
    record = json.loads(line)

    # output key -> key in the source record
    wanted = {
        "book_id": "book_id",
        "title": "title_without_series",
        "ratings": "ratings_count",
        "url": "url",
        "cover_image": "image_url",
    }
    return {out_key: record[src_key] for out_key, src_key in wanted.items()}


In [8]:
# Collect the selected fields of every book that passes the ratings filter
books_titles = []

# Stream the archive one record at a time; iteration stops at end of file
with gzip.open("goodreads_books.json.gz") as books_file:
    for raw_record in books_file:
        fields = parse_fields(raw_record)

        # The ratings count arrives as text; drop records where it is not a valid integer
        try:
            ratings = int(fields["ratings"])
        except ValueError:
            continue

        # Keep only books with more than 5 ratings
        if ratings > 5:
            books_titles.append(fields)


In [9]:
import pandas as pd  # Import pandas for data manipulation

# Build a DataFrame with one row per filtered book and one column per extracted field
titles = pd.DataFrame(books_titles)


In [10]:
# The ratings counts were collected as strings; convert the column to a numeric dtype
titles["ratings"] = titles["ratings"].pipe(pd.to_numeric)


In [11]:
# Derive 'mod_title' from 'title' by removing every character that is not an
# ASCII letter, a digit or a space
non_alnum = "[^a-zA-Z0-9 ]"
titles["mod_title"] = titles["title"].str.replace(non_alnum, "", regex=True)


In [12]:
# NOTE(review): identical to the statement in the cell above; re-running it is redundant but harmless
titles["mod_title"] = titles["title"].str.replace("[^a-zA-Z0-9 ]", "", regex=True)

In [13]:
# Lowercase the cleaned titles
titles["mod_title"] = titles["mod_title"].str.lower()

In [14]:
# Collapse every run of whitespace in 'mod_title' into a single space
collapsed = titles["mod_title"].str.replace("\\s+", " ", regex=True)
titles["mod_title"] = collapsed

# Drop rows whose cleaned title ended up empty
has_title = titles["mod_title"].str.len() > 0
titles = titles[has_title]

# Persist the cleaned table as JSON so later cells can reload it
titles.to_json("books_titles.json")


In [15]:
# Display the cleaned titles table
titles


Unnamed: 0,book_id,title,ratings,url,cover_image,mod_title
0,1333909,Good Harbor,10,https://www.goodreads.com/book/show/1333909.Go...,https://s.gr-assets.com/assets/nophoto/book/11...,good harbor
1,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",140,https://www.goodreads.com/book/show/7327624-th...,https://images.gr-assets.com/books/1304100136m...,the unschooled wizard sun wolf and starhawk 12
2,6066819,Best Friends Forever,51184,https://www.goodreads.com/book/show/6066819-be...,https://s.gr-assets.com/assets/nophoto/book/11...,best friends forever
3,287140,Runic Astrology: Starcraft and Timekeeping in ...,15,https://www.goodreads.com/book/show/287140.Run...,https://images.gr-assets.com/books/1413219371m...,runic astrology starcraft and timekeeping in t...
4,287141,The Aeneid for Boys and Girls,46,https://www.goodreads.com/book/show/287141.The...,https://s.gr-assets.com/assets/nophoto/book/11...,the aeneid for boys and girls
...,...,...,...,...,...,...
1782574,3084038,"This Sceptred Isle, Vol. 10: The Age of Victor...",12,https://www.goodreads.com/book/show/3084038-th...,https://images.gr-assets.com/books/1494763458m...,this sceptred isle vol 10 the age of victoria ...
1782575,26168430,Sherlock Holmes and the July Crisis,6,https://www.goodreads.com/book/show/26168430-s...,https://images.gr-assets.com/books/1440592011m...,sherlock holmes and the july crisis
1782576,2342551,The Children's Classic Poetry Collection,36,https://www.goodreads.com/book/show/2342551.Th...,https://s.gr-assets.com/assets/nophoto/book/11...,the childrens classic poetry collection
1782577,22017381,"101 Nights: Volume One (101 Nights, #1-3)",70,https://www.goodreads.com/book/show/22017381-1...,https://images.gr-assets.com/books/1398621236m...,101 nights volume one 101 nights 13


In [16]:
# Import the TfidfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

# Create a TF-IDF vectorizer with scikit-learn's default settings
vectorizer = TfidfVectorizer()

# Learn the vocabulary from the cleaned titles and encode each title as a TF-IDF vector.
# `tfidf` is later compared against query vectors and indexed positionally against `titles`.
tfidf = vectorizer.fit_transform(titles["mod_title"])


In [17]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re

def make_clickable(val):
    """Render a URL as an HTML link labelled "Goodreads" that opens in a new tab.

    Args:
        val: The URL to link to; it is converted with ``str`` before use.

    Returns:
        str: An ``<a>`` element whose ``href`` is the HTML-escaped URL.
    """
    import html  # local import keeps this helper self-contained

    # Escape the value so characters such as quotes or '&' cannot break out of the attribute
    safe_url = html.escape(str(val), quote=True)
    return '<a target="_blank" href="{}">Goodreads</a>'.format(safe_url)

def show_image(val):
    """Render an image URL as a thumbnail (width=50) wrapped in a link to that same URL.

    Args:
        val: The image URL; it is converted with ``str`` before use.

    Returns:
        str: An ``<a>`` element containing an ``<img>``; both the link target and
        the image source are the HTML-escaped URL.
    """
    import html  # local import keeps this helper self-contained

    # Escape the value so characters such as quotes or '&' cannot break out of the attributes
    safe_url = html.escape(str(val), quote=True)
    return '<a href="{}"><img src="{}" width=50></img></a>'.format(safe_url, safe_url)

def search(query, vectorizer):
    """Find books whose cleaned title best matches a free-text query.

    The query is lowercased and stripped of every character that is not an ASCII
    letter, digit or space (the same character filter used for 'mod_title'), and
    that cleaned text is what gets vectorised. Reads the module-level `tfidf`
    matrix and `titles` DataFrame.

    Args:
        query: Search text.
        vectorizer: The fitted vectorizer that produced `tfidf`.

    Returns:
        A pandas Styler over at most 5 rows of `titles`: taken from the 10 titles
        most similar to the query, ordered by 'ratings' descending, with 'url'
        and 'cover_image' rendered through make_clickable / show_image.
    """
    # Clean the query text: lowercase it and drop anything that is not a letter, digit or space
    processed = re.sub("[^a-zA-Z0-9 ]", "", query.lower())

    # Vectorise the *cleaned* query so it is tokenised the same way as the titles
    query_vec = vectorizer.transform([processed])

    # Cosine similarity between the query vector and every title vector
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Positions of the 10 highest similarities (argpartition does not order them)
    indices = np.argpartition(similarity, -10)[-10:]

    # Pick those rows and put the most-rated books first
    results = titles.iloc[indices]
    results = results.sort_values("ratings", ascending=False)

    # Top 5 rows, with the URL and cover columns rendered as HTML
    return results.head(5).style.format({'url': make_clickable, 'cover_image': show_image})


In [18]:
# Look up "harry potter and the prisoner of azkaban" with the TF-IDF vectorizer fitted above.
# search() returns its best matches as a styled table with Goodreads links and cover thumbnails.
search(query="harry potter and the prisoner of azkaban", vectorizer=vectorizer)


Unnamed: 0,book_id,title,ratings,url,cover_image,mod_title
754545,464164,"Harry Potter and the Prisoner of Azkaban (Harry Potter, #3)",22794,Goodreads,,harry potter and the prisoner of azkaban harry potter 3
1236294,28765183,Harry Potter and the Prisoner of Azkaban,3066,Goodreads,,harry potter and the prisoner of azkaban
343661,1132457,"Harry Potter and the Prisoner of Azkaban (Harry Potter, #3)",372,Goodreads,,harry potter and the prisoner of azkaban harry potter 3
1202032,1516338,"Harry Potter and the Prisoner of Azkaban (Harry Potter, #3)",246,Goodreads,,harry potter and the prisoner of azkaban harry potter 3
440131,13564690,Harry Potter and the Prisoner of Azkaban,216,Goodreads,,harry potter and the prisoner of azkaban


In [19]:
# Goodreads book IDs (kept as strings) of the books the user likes
liked_books = [
    "4408",
    "31147619",
    "29983711",
    "9401317",
    "9317691",
    "8153988",
    "20494944",
]

# book_id_map.csv pairs the book IDs used in the CSV files with Goodreads book IDs
df = pd.read_csv("book_id_map.csv")

# Preview the first rows to see the column layout
preview = df.head()
print(preview)


   book_id_csv   book_id
0            0  34684622
1            1  34536488
2            2  34017076
3            3     71730
4            4  30422361


In [20]:
# Lookup table: book ID used in the CSV files (str) -> Goodreads book ID (str)
csv_book_mapping = {}

# Fill the table from the mapping file, one "csv_id,book_id" pair per line.
# Every non-blank line is stored, so the file's first line ends up in the table as well.
with open("book_id_map.csv", "r") as map_file:
    for raw_line in map_file:
        stripped = raw_line.strip()

        # Ignore blank lines
        if not stripped:
            continue

        csv_id, book_id = stripped.split(",")
        csv_book_mapping[csv_id] = book_id


In [21]:
# Look up the Goodreads book ID for CSV book ID '0'; keys and values are both strings
# because the mapping was filled from split text lines
csv_book_mapping['0']


'34684622'

In [22]:
# Count the user-book interactions without materialising the whole CSV: the file is
# parsed in fixed-size chunks and only the running row count is kept in memory.
# NOTE: `df` is deliberately not rebound to the full table here; nothing later in
# this notebook reads it.
interaction_count = 0
for chunk in pd.read_csv("goodreads_interactions.csv", chunksize=1_000_000):
    interaction_count += len(chunk)

# Print the total number of rows (interactions) in the dataset
print("Number of rows:", interaction_count)


Number of rows: 228648342


In [23]:
import os

# Location of the Goodreads interactions file
file_path = "goodreads_interactions.csv"

if not os.path.exists(file_path):
    # Nothing to measure
    print("File not found.")
else:
    # getsize() reports bytes; dividing by 1024 * 1024 expresses the size in MB
    file_size = os.path.getsize(file_path) / (1024 * 1024)

    # Two decimal places are enough here
    print(f"File size: {file_size:.2f} MB")


File size: 4118.56 MB


In [24]:
# Users who gave at least one of the liked books a rating of 4 or higher
overlap_users = set()

with open("goodreads_interactions.csv", 'r') as interactions_file:
    for row in interactions_file:
        # Five comma-separated fields; the third and fifth are not used
        user_id, csv_id, _, rating, _ = row.split(",")

        # This user is already recorded, so the rest of the row does not matter
        if user_id in overlap_users:
            continue

        # Rows whose rating field is not an integer are skipped
        try:
            rating = int(rating)
        except ValueError:
            continue

        # Translate the CSV book id into the Goodreads book id
        book_id = csv_book_mapping[csv_id]

        if rating >= 4 and book_id in liked_books:
            overlap_users.add(user_id)

In [25]:
# Second pass over the interactions file: collect every interaction made by a user
# in overlap_users, translating the CSV book id into the Goodreads book id.
# Each kept row becomes [user_id, book_id, rating], all still strings.
rec_lines = []

with open("goodreads_interactions.csv", 'r') as interactions_file:
    for row in interactions_file:
        user_id, csv_id, _, rating, _ = row.split(",")

        # Rows from other users are of no interest
        if user_id not in overlap_users:
            continue

        rec_lines.append([user_id, csv_book_mapping[csv_id], rating])

In [26]:
# Create a pandas DataFrame from the collected recommendation lines
# Assign column names: user_id, book_id, and rating
# Ensure that the book_id column is of string type for consistency in later processing
import pandas as pd

rec_columns = ["user_id", "book_id", "rating"]
recs = pd.DataFrame(rec_lines, columns=rec_columns)
# Keep book_id as a string so it compares cleanly with other string book ids
recs["book_id"] = recs["book_id"].astype(str)

In [27]:
# Display the interactions collected for the overlapping users
recs

Unnamed: 0,user_id,book_id,rating
0,284,977284,3
1,284,890054,4
2,284,837153,3
3,284,1586480,4
4,284,41814,5
...,...,...,...
1530252,873216,127455,0
1530253,873216,10365343,0
1530254,873216,16131077,0
1530255,873216,18781576,0


In [28]:
# The 10 book ids that occur most often among the collected interactions
book_frequencies = recs["book_id"].value_counts()
top_recs = book_frequencies.head(10).index.values

# Reload the book metadata that was saved to JSON earlier
books_titles = pd.read_json("books_titles.json")

# Cast book_id to str so it can be compared with the string ids in `recs`
books_titles["book_id"] = books_titles["book_id"].astype(str)

# Show the first rows as a quick check of the load
books_titles.head()


Unnamed: 0,book_id,title,ratings,url,cover_image,mod_title
0,1333909,Good Harbor,10,https://www.goodreads.com/book/show/1333909.Go...,https://s.gr-assets.com/assets/nophoto/book/11...,good harbor
1,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",140,https://www.goodreads.com/book/show/7327624-th...,https://images.gr-assets.com/books/1304100136m...,the unschooled wizard sun wolf and starhawk 12
2,6066819,Best Friends Forever,51184,https://www.goodreads.com/book/show/6066819-be...,https://s.gr-assets.com/assets/nophoto/book/11...,best friends forever
3,287140,Runic Astrology: Starcraft and Timekeeping in ...,15,https://www.goodreads.com/book/show/287140.Run...,https://images.gr-assets.com/books/1413219371m...,runic astrology starcraft and timekeeping in t...
4,287141,The Aeneid for Boys and Girls,46,https://www.goodreads.com/book/show/287141.The...,https://s.gr-assets.com/assets/nophoto/book/11...,the aeneid for boys and girls


In [29]:
# Metadata rows of the books whose id is among the top recommendations
is_top_rec = books_titles["book_id"].isin(top_recs)
books_titles[is_top_rec]


Unnamed: 0,book_id,title,ratings,url,cover_image,mod_title
71398,77203,The Kite Runner,1848782,https://www.goodreads.com/book/show/77203.The_...,https://images.gr-assets.com/books/1484565687m...,the kite runner
386663,2767052,"The Hunger Games (The Hunger Games, #1)",4899965,https://www.goodreads.com/book/show/2767052-th...,https://images.gr-assets.com/books/1447303603m...,the hunger games the hunger games 1
546297,5107,The Catcher in the Rye,2086945,https://www.goodreads.com/book/show/5107.The_C...,https://images.gr-assets.com/books/1398034300m...,the catcher in the rye
630937,4671,The Great Gatsby,2758812,https://www.goodreads.com/book/show/4671.The_G...,https://images.gr-assets.com/books/1490528560m...,the great gatsby
838525,5470,1984,2023937,https://www.goodreads.com/book/show/5470.1984,https://images.gr-assets.com/books/1348990566m...,1984
1031472,38447,The Handmaid's Tale,648783,https://www.goodreads.com/book/show/38447.The_...,https://images.gr-assets.com/books/1498057733m...,the handmaids tale
1077226,2657,To Kill a Mockingbird,3255518,https://www.goodreads.com/book/show/2657.To_Ki...,https://images.gr-assets.com/books/1361975680m...,to kill a mockingbird
1195873,18143977,All the Light We Cannot See,498685,https://www.goodreads.com/book/show/18143977-a...,https://images.gr-assets.com/books/1451445646m...,all the light we cannot see
1196415,3,Harry Potter and the Sorcerer's Stone (Harry P...,4765497,https://www.goodreads.com/book/show/3.Harry_Po...,https://images.gr-assets.com/books/1474154022m...,harry potter and the sorcerers stone harry pot...
1446134,29983711,Pachinko,8161,https://www.goodreads.com/book/show/29983711-p...,https://images.gr-assets.com/books/1462393298m...,pachinko


In [30]:
# Tally the interactions per book_id, then reshape the tally into a two-column
# DataFrame: the book id and how many times it occurs
book_counts = recs["book_id"].value_counts()
all_recs = book_counts.to_frame().reset_index()
all_recs.columns = ["book_id", "book_count"]

# Peek at the five most frequent books
all_recs.head(5)


Unnamed: 0,book_id,book_count
0,2767052,1092
1,29983711,1089
2,2657,1074
3,3,1048
4,4671,1028


In [31]:
# Attach the book metadata to the counts; the inner join keeps only ids found in both tables
all_recs = all_recs.merge(books_titles, on="book_id", how="inner")

# score = book_count * (book_count / ratings): books that occur often here but have
# few ratings overall score higher
count_share = all_recs["book_count"] / all_recs["ratings"]
all_recs["score"] = all_recs["book_count"] * count_share

# The ten best-scoring books
all_recs.sort_values("score", ascending=False).head(10)


Unnamed: 0,book_id,book_count,title,ratings,url,cover_image,mod_title,score
1,29983711,1089,Pachinko,8161,https://www.goodreads.com/book/show/29983711-p...,https://images.gr-assets.com/books/1462393298m...,pachinko,145.315648
7261,35069544,29,This Will Be My Undoing: Living at the Interse...,8,https://www.goodreads.com/book/show/35069544-t...,https://images.gr-assets.com/books/1502099764m...,this will be my undoing living at the intersec...,105.125
14752,29749094,15,"Superman (DC Icons, #4)",7,https://www.goodreads.com/book/show/29749094-s...,https://s.gr-assets.com/assets/nophoto/book/11...,superman dc icons 4,32.142857
238,4408,327,East of Eden,3447,https://www.goodreads.com/book/show/4408.East_...,https://images.gr-assets.com/books/1323882457m...,east of eden,31.020888
724,9317691,175,The Name of the Wind (The Kingkiller Chronicle...,1043,https://www.goodreads.com/book/show/9317691-th...,https://images.gr-assets.com/books/1360558233m...,the name of the wind the kingkiller chronicle 1,29.362416
236,32920226,328,"Sing, Unburied, Sing",4592,https://www.goodreads.com/book/show/32920226-s...,https://images.gr-assets.com/books/1499340866m...,sing unburied sing,23.428571
13156,36301023,17,"My Plain Jane (The Lady Janies, #2)",13,https://www.goodreads.com/book/show/36301023-m...,https://images.gr-assets.com/books/1507936746m...,my plain jane the lady janies 2,22.230769
13025,26857046,17,The Invisible Life of Addie La Rue,13,https://www.goodreads.com/book/show/26857046-t...,https://s.gr-assets.com/assets/nophoto/book/11...,the invisible life of addie la rue,22.230769
16306,35430702,14,The Lady's Guide to Petticoats and Piracy,9,https://www.goodreads.com/book/show/35430702-t...,https://s.gr-assets.com/assets/nophoto/book/11...,the ladys guide to petticoats and piracy,21.777778
216,30753987,342,The Leavers,5602,https://www.goodreads.com/book/show/30753987-t...,https://images.gr-assets.com/books/1489158974m...,the leavers,20.878972


In [32]:
# Same ranking, restricted to books that occur more than 200 times
frequent = all_recs[all_recs["book_count"] > 200]
frequent.sort_values("score", ascending=False).head(10)

Unnamed: 0,book_id,book_count,title,ratings,url,cover_image,mod_title,score
1,29983711,1089,Pachinko,8161,https://www.goodreads.com/book/show/29983711-p...,https://images.gr-assets.com/books/1462393298m...,pachinko,145.315648
238,4408,327,East of Eden,3447,https://www.goodreads.com/book/show/4408.East_...,https://images.gr-assets.com/books/1323882457m...,east of eden,31.020888
236,32920226,328,"Sing, Unburied, Sing",4592,https://www.goodreads.com/book/show/32920226-s...,https://images.gr-assets.com/books/1499340866m...,sing unburied sing,23.428571
216,30753987,342,The Leavers,5602,https://www.goodreads.com/book/show/30753987-t...,https://images.gr-assets.com/books/1489158974m...,the leavers,20.878972
249,8153988,322,"The Eye of the World (Wheel of Time, #1)",5740,https://www.goodreads.com/book/show/8153988-th...,https://images.gr-assets.com/books/1465920672m...,the eye of the world wheel of time 1,18.063415
441,33253215,236,The Heart's Invisible Furies,3629,https://www.goodreads.com/book/show/33253215-t...,https://images.gr-assets.com/books/1490803456m...,the hearts invisible furies,15.347479
71,30688435,533,Exit West,21378,https://www.goodreads.com/book/show/30688435-e...,https://images.gr-assets.com/books/1477324680m...,exit west,13.288848
251,32283423,321,American War,7776,https://www.goodreads.com/book/show/32283423-a...,https://images.gr-assets.com/books/1481494946m...,american war,13.251157
228,26025588,335,Behold the Dreamers,8793,https://www.goodreads.com/book/show/26025588-b...,https://images.gr-assets.com/books/1439643293m...,behold the dreamers,12.762993
345,17912498,266,The Queen of the Night,6115,https://www.goodreads.com/book/show/17912498-t...,https://images.gr-assets.com/books/1460425080m...,the queen of the night,11.570891


In [33]:
# Keep only books that occur more than 200 times among the overlapping users, best score first
popular_recs = all_recs[all_recs["book_count"] > 200].sort_values("score", ascending=False)

# make_clickable() and show_image() are already defined in the search cell earlier in
# this notebook, so they are reused here instead of being redefined.

# Display the top 10 recommended books (excluding already liked books), formatted with
# clickable links and cover images
not_liked_yet = ~popular_recs["book_id"].isin(liked_books)
popular_recs[not_liked_yet].head(10).style.format({'url': make_clickable, 'cover_image': show_image})


Unnamed: 0,book_id,book_count,title,ratings,url,cover_image,mod_title,score
236,32920226,328,"Sing, Unburied, Sing",4592,Goodreads,,sing unburied sing,23.428571
216,30753987,342,The Leavers,5602,Goodreads,,the leavers,20.878972
441,33253215,236,The Heart's Invisible Furies,3629,Goodreads,,the hearts invisible furies,15.347479
71,30688435,533,Exit West,21378,Goodreads,,exit west,13.288848
251,32283423,321,American War,7776,Goodreads,,american war,13.251157
228,26025588,335,Behold the Dreamers,8793,Goodreads,,behold the dreamers,12.762993
345,17912498,266,The Queen of the Night,6115,Goodreads,,the queen of the night,11.570891
325,28114515,276,The Wangs vs. the World,7044,Goodreads,,the wangs vs the world,10.81431
108,34273236,459,Little Fires Everywhere,21135,Goodreads,,little fires everywhere,9.968346
104,28815371,467,The Mothers,22346,Goodreads,,the mothers,9.759644


In [34]:
import pandas as pd

# Load the user's own liked books; the first CSV column is used as the row index
my_books = pd.read_csv("liked_books.csv", index_col=0)
# Store book ids as strings
my_books = my_books.astype({"book_id": str})

In [35]:
# Display the user's own liked books
my_books

Unnamed: 0,user_id,book_id,rating,title
0,-1,2517439,5,"The Forever War (The Forever War, #1)"
1,-1,113576,5,The Smartest Guys in the Room: The Amazing Ris...
2,-1,35100,5,Battle Cry of Freedom
3,-1,228221,5,The Mask of Command
5,-1,17662739,5,"2001: A Space Odyssey (Space Odyssey, #1)"
6,-1,356824,5,India After Gandhi: The History of the World's...
7,-1,12125412,5,The Lady or the Tiger?: and Other Logic Puzzles
8,-1,139069,5,Endurance: Shackleton's Incredible Voyage
10,-1,76680,5,"Foundation (Foundation, #1)"
11,-1,1898,5,Into Thin Air: A Personal Account of the Mount...


In [36]:
# Map the book ids used in the interactions CSV to Goodreads book ids (both as strings)
csv_book_mapping = {}

with open("book_id_map.csv", "r") as map_file:
    for raw_line in map_file:
        stripped = raw_line.strip()

        # A blank line (e.g. at the end of the file) has no comma to split on and
        # would make the two-name unpacking below raise ValueError, so skip it
        if not stripped:
            continue

        csv_id, book_id = stripped.split(",")
        csv_book_mapping[csv_id] = book_id

# Convert the user's liked books into a set for faster lookup when filtering later
book_set = set(my_books["book_id"])


In [37]:
# Count, per user, how many interaction rows involve a book from book_set.
# overlap_users maps user_id -> that count; users without such a row are absent.
overlap_users = {}

with open("goodreads_interactions.csv", 'r') as interactions_file:
    for row in interactions_file:
        user_id, csv_id, _, rating, _ = row.split(",")

        # Translate the CSV book id; ids missing from the mapping come back as None
        book_id = csv_book_mapping.get(csv_id)
        if book_id not in book_set:
            continue

        # First hit starts the tally at 1, later hits add to it
        overlap_users[user_id] = overlap_users.get(user_id, 0) + 1


In [38]:
# Number of distinct users with at least one interaction on a book from book_set
len(overlap_users)

316341

In [39]:
# Keep users with a meaningful overlap: those whose count of shared books is strictly
# greater than one fifth (20%) of the number of books in my_books.
min_overlap = my_books.shape[0]/5
filtered_overlap_users = {user for user, shared in overlap_users.items() if shared > min_overlap}
len(filtered_overlap_users)


1258

In [41]:
# Collect every interaction made by a user in filtered_overlap_users as
# [user_id, Goodreads book id, rating], all still strings.
interactions_list = []

with open("goodreads_interactions.csv", 'r') as interactions_file:
    for row in interactions_file:
        user_id, csv_id, _, rating, _ = row.split(",")

        # Rows from other users are of no interest
        if user_id not in filtered_overlap_users:
            continue

        interactions_list.append([user_id, csv_book_mapping[csv_id], rating])


In [42]:
# Total number of interaction records collected for the filtered users
len(interactions_list)


5638701

In [43]:
# Peek at the first collected record: [user_id, book_id, rating]
interactions_list[0]


['282', '627206', '4']

In [44]:
# Turn the collected [user_id, book_id, rating] records into a DataFrame
interaction_columns = ["user_id", "book_id", "rating"]
similar_user_interactions = pd.DataFrame(interactions_list, columns=interaction_columns)

# Put the user's own rows first, followed by the rows of the overlapping users
interactions = pd.concat([my_books[interaction_columns], similar_user_interactions])

# Show the combined table
interactions


Unnamed: 0,user_id,book_id,rating
0,-1,2517439,5
1,-1,113576,5
2,-1,35100,5
3,-1,228221,5
5,-1,17662739,5
...,...,...,...
5638696,804100,475178,0
5638697,804100,186074,0
5638698,804100,153008,0
5638699,804100,45107,0


In [45]:
# Ids become strings and ratings become numbers
interactions = interactions.astype({"book_id": str, "user_id": str})
interactions["rating"] = pd.to_numeric(interactions["rating"])

# Give every distinct user_id / book_id an integer code (pandas category codes),
# stored in user_index and book_index respectively
for id_column, index_column in (("user_id", "user_index"), ("book_id", "book_index")):
    interactions[index_column] = interactions[id_column].astype("category").cat.codes


In [46]:
from scipy.sparse import coo_matrix

# Build the user x book ratings matrix in sparse COO form: every interaction
# contributes its rating at position (user_index, book_index)
row_idx = interactions["user_index"]
col_idx = interactions["book_index"]
ratings_mat_coo = coo_matrix((interactions["rating"], (row_idx, col_idx)))

# (rows, columns) of the matrix, i.e. (users, books)
ratings_mat_coo.shape


(1259, 802870)

In [47]:
# CSR layout of the same matrix, which suits the row lookups done next
ratings_mat = ratings_mat_coo.tocsr()

# Rows belonging to our own user, whose user_id is "-1"
is_me = interactions["user_id"] == "-1"
interactions[is_me]


Unnamed: 0,user_id,book_id,rating,user_index,book_index
0,-1,2517439,5,0,414880
1,-1,113576,5,0,38971
2,-1,35100,5,0,575858
3,-1,228221,5,0,356004
5,-1,17662739,5,0,214285
6,-1,356824,5,0,581743
7,-1,12125412,5,0,59763
8,-1,139069,5,0,124430
10,-1,76680,5,0,722098
11,-1,1898,5,0,276178


In [48]:
# Row of the ratings matrix that belongs to our own user (user_id "-1").
# Looked up from the data instead of hard-coding 0, so it stays correct even if
# the category codes are assigned differently.
my_index = int(interactions.loc[interactions["user_id"] == "-1", "user_index"].iloc[0])

In [49]:
# Import cosine similarity function from scikit-learn
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between our user's ratings row and every row of the ratings
# matrix, flattened to a 1-D array with one score per user
my_ratings = ratings_mat[my_index,:]
similarity = cosine_similarity(my_ratings, ratings_mat).flatten()

# Score of the user at index 0
similarity[0]


0.9999999999999999

In [50]:
import numpy as np

# Positions of the 15 largest similarity scores. argpartition moves them into the
# last 15 slots without fully sorting the array, so they come back in no particular order.
top_k = 15
indices = np.argpartition(similarity, -top_k)[-top_k:]

# Show the selected user indices
indices


array([1188,  942,  218,  129,  496,  435, 1208,  795, 1213, 1210, 1143,
        321,  294,  862,    0], dtype=int64)

In [51]:
# Interactions of the most similar users, leaving out our own rows (user_id "-1")
in_top_users = interactions["user_index"].isin(indices)
not_me = interactions["user_id"] != "-1"
similar_users = interactions[in_top_users & not_me].copy()

# Show the selected interactions
similar_users


Unnamed: 0,user_id,book_id,rating,user_index,book_index
45312,4133,5359,3,942,632143
45313,4133,10464963,4,942,13492
45314,4133,3858,3,942,593622
45315,4133,11827808,4,942,51904
45316,4133,7913305,4,942,732465
...,...,...,...,...,...
5638521,712588,32388712,3,1143,543119
5638522,712588,16322,5,1143,183365
5638523,712588,860543,0,1143,759827
5638524,712588,853510,5,1143,756768


In [52]:
# Per book: how many rating entries the similar users have for it ('count') and
# the average of those ratings ('mean')
ratings_by_book = similar_users.groupby("book_id")["rating"]
book_recs = ratings_by_book.agg(['count', 'mean'])

# Show the per-book statistics
book_recs


Unnamed: 0_level_0,count,mean
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,6,3.833333
100322,1,0.000000
100365,1,0.000000
10046142,1,0.000000
1005,3,0.000000
...,...,...
99561,2,2.500000
99610,1,3.000000
99664,1,4.000000
9969571,3,2.333333


In [53]:
# Reload the saved book metadata with book_id as a string
books_titles = pd.read_json("books_titles.json").astype({"book_id": str})

# Join the per-book statistics with the metadata; the inner join keeps only
# books present in both tables
book_recs = book_recs.merge(books_titles, on="book_id", how="inner")

# Show the joined table
book_recs


Unnamed: 0,book_id,count,mean,title,ratings,url,cover_image,mod_title
0,1,6,3.833333,Harry Potter and the Half-Blood Prince (Harry ...,1713866,https://www.goodreads.com/book/show/1.Harry_Po...,https://images.gr-assets.com/books/1361039191m...,harry potter and the halfblood prince harry po...
1,100322,1,0.000000,Assata: An Autobiography,11057,https://www.goodreads.com/book/show/100322.Assata,https://images.gr-assets.com/books/1328857268m...,assata an autobiography
2,100365,1,0.000000,The Mote in God's Eye,48736,https://www.goodreads.com/book/show/100365.The...,https://images.gr-assets.com/books/1399490037m...,the mote in gods eye
3,10046142,1,0.000000,Dancing in the Glory of Monsters: The Collapse...,2391,https://www.goodreads.com/book/show/10046142-d...,https://images.gr-assets.com/books/1328757755m...,dancing in the glory of monsters the collapse ...
4,1005,3,0.000000,Think and Grow Rich,87634,https://www.goodreads.com/book/show/1005.Think...,https://s.gr-assets.com/assets/nophoto/book/11...,think and grow rich
...,...,...,...,...,...,...,...,...
2849,99561,2,2.500000,Looking for Alaska,804587,https://www.goodreads.com/book/show/99561.Look...,https://images.gr-assets.com/books/1394798630m...,looking for alaska
2850,99610,1,3.000000,The Best Laid Plans,17434,https://www.goodreads.com/book/show/99610.The_...,https://images.gr-assets.com/books/1353374848m...,the best laid plans
2851,99664,1,4.000000,The Painted Veil,24606,https://www.goodreads.com/book/show/99664.The_...,https://images.gr-assets.com/books/1320421719m...,the painted veil
2852,9969571,3,2.333333,Ready Player One,376328,https://www.goodreads.com/book/show/9969571-re...,https://images.gr-assets.com/books/1500930947m...,ready player one


In [55]:
# adjusted_count = count * (count / ratings): the count among similar users,
# scaled by how large that count is relative to the book's overall number of ratings
count_share = book_recs["count"] / book_recs["ratings"]
book_recs["adjusted_count"] = book_recs["count"] * count_share



In [56]:
# Final score: the similar users' average rating multiplied by the adjusted count
mean_rating = book_recs["mean"]
book_recs["score"] = mean_rating * book_recs["adjusted_count"]

In [57]:
# Drop books whose id already appears in my_books
already_liked = book_recs["book_id"].isin(my_books["book_id"])
book_recs = book_recs[~already_liked]

In [59]:
# Clean the my_books titles step by step: strip everything except ASCII letters,
# digits and spaces, lowercase, then collapse runs of whitespace into one space
cleaned = my_books["title"].str.replace("[^a-zA-Z0-9 ]", "", regex=True)
cleaned = cleaned.str.lower()
my_books["mod_title"] = cleaned.str.replace("\\s+", " ", regex=True)


In [60]:
# Apply the remaining filters in one pass. A book is kept when:
#   - its cleaned title does not match the cleaned title of a book in my_books,
#   - its average rating among the similar users is at least 4, and
#   - it has more than 2 ratings from them.
same_title = book_recs["mod_title"].isin(my_books["mod_title"])
well_rated = book_recs["mean"] >= 4
rated_enough = book_recs["count"] > 2
book_recs = book_recs[~same_title & well_rated & rated_enough]

# Highest average rating first
top_recs = book_recs.sort_values("mean", ascending=False)

In [61]:
# make_clickable() and show_image() are already defined in the search cell earlier in
# this notebook, so they are reused here rather than redefined.

# Render the recommendation table with clickable links and cover thumbnails
top_recs.style.format({'url': make_clickable, 'cover_image': show_image})


Unnamed: 0,book_id,count,mean,title,ratings,url,cover_image,mod_title,adjusted_count,score
2265,62291,5,4.8,"A Storm of Swords (A Song of Ice and Fire, #3)",477834,Goodreads,,a storm of swords a song of ice and fire 3,5.2e-05,0.000251
600,157993,3,4.333333,The Little Prince,763309,Goodreads,,the little prince,1.2e-05,5.1e-05
1103,22034,3,4.333333,The Godfather,259150,Goodreads,,the godfather,3.5e-05,0.00015
1176,2318271,3,4.333333,The Last Lecture,245804,Goodreads,,the last lecture,3.7e-05,0.000159
1909,4381,3,4.333333,Fahrenheit 451,591506,Goodreads,,fahrenheit 451,1.5e-05,6.6e-05
243,119322,4,4.25,"The Golden Compass (His Dark Materials, #1)",973154,Goodreads,,the golden compass his dark materials 1,1.6e-05,7e-05
1444,2767793,4,4.25,"The Hero of Ages (Mistborn, #3)",149260,Goodreads,,the hero of ages mistborn 3,0.000107,0.000456
2563,78983,4,4.25,"Kane and Abel (Kane and Abel, #1)",75215,Goodreads,,kane and abel kane and abel 1,0.000213,0.000904
244,119324,3,4.0,"The Subtle Knife (His Dark Materials, #2)",246697,Goodreads,,the subtle knife his dark materials 2,3.6e-05,0.000146
398,13497,4,4.0,"A Feast for Crows (A Song of Ice and Fire, #4)",437398,Goodreads,,a feast for crows a song of ice and fire 4,3.7e-05,0.000146
