# Book Review Data from Goodreads: Preprocessing

+ Uses this Kaggle dataset: https://www.kaggle.com/datasets/bahramjannesarr/goodreads-book-datasets-10m?select=book1000k-1100k.csv
+ Data source: Goodreads (collected before the API was shut down)
+ Books: ~1.8M
+ Reviews: ~100,000

In [6]:
import json
import sys
import time
from pathlib import Path
from typing import List

import implicit
import pandas as pd
from scipy.sparse import csr_matrix, lil_matrix, save_npz

In [7]:
# Collect every regular file below ./data (the glob is recursive), skipping any
# path containing "checkpoint" (presumably Jupyter's .ipynb_checkpoints copies).
data_paths = [
    path
    for path in Path("data").glob("**/*")
    if path.is_file() and "checkpoint" not in str(path)
]

# Bare file names, kept under the original variable name for later cells.
data_files = [path.name for path in data_paths]

# Use each file's real relative path. Rebuilding it as f"data/{name}" would
# point at a non-existent file for anything found in a subdirectory of data/.
book_files = [
    path.as_posix() for path in data_paths if path.name.startswith("book")
]
rating_files = [
    path.as_posix() for path in data_paths if path.name.startswith("user")
]

In [8]:
def df_from_files(files: List) -> pd.DataFrame:
    """Read several CSV files and stack them into a single DataFrame.

    Each file is parsed with its first row as the header and no index
    column. The frames are concatenated row-wise in the order given and
    the result receives a fresh 0..n-1 index.

    Args:
        files: Paths of the CSV files to read.

    Returns:
        One DataFrame containing the rows of all files.
    """
    frames = [pd.read_csv(path, index_col=None, header=0) for path in files]
    return pd.concat(frames, axis=0, ignore_index=True)

In [9]:
# Book metadata: all "book*" CSV files merged into one frame.
books = df_from_files(files=book_files)
# User ratings: all "user*" CSV files merged into one frame.
ratings = df_from_files(files=rating_files)

## Books

In [10]:
# Total number of book rows loaded across all book CSV files.
len(books)

1850310

In [11]:
# How often each title occurs; a count above 1 means several rows share that title.
books["Name"].value_counts()

Name
Collected Poems                                                             24
Jane Eyre                                                                   24
Selected Poems                                                              24
Little Women                                                                23
Hamlet                                                                      23
                                                                            ..
My Sweetest Friend: Recipes for the Perfect Friendship                       1
The King Of The Golden River: A Story                                        1
The Miracles of Archangel Michael                                            1
Fine Homebuilding on Baths and Kitchens                                      1
The Adaptive Economy: Adjustment Policies in Small, Low-Income Countries     1
Name: count, Length: 1636235, dtype: int64

In [12]:
# Inspect every row whose title is exactly "Dune" as an example of a repeated title.
books[books["Name"] == "Dune"]

Unnamed: 0,Id,Name,Authors,ISBN,Rating,PublishYear,PublishMonth,PublishDay,Publisher,RatingDist5,...,RatingDist3,RatingDist2,RatingDist1,RatingDistTotal,CountsOfReview,Language,pagesNumber,Description,Count of text reviews,PagesNumber
232891,1022863,Dune,Frank Herbert,9997516788.0,4.23,1965,1,6,Chilton,5:363642,...,3:95642,2:28756,1:16077,total:712319,35,eng,507.0,Set in the far future amidst a sprawling feuda...,35.0,
1095863,53732,Dune,Frank Herbert,,4.23,1990,1,9,Ace/Berkley Books,5:351539,...,3:92772,2:27968,1:15681,total:689160,772,eng,535.0,,,


In [13]:
# Build a combined "<title> - <authors>" key so that rows are only treated as
# the same book when both the title and the author string match.
# NOTE(review): rows with a missing Name or Authors presumably end up with a
# missing key here — confirm that dropping them from the counts is intended.
books["title-author"] = books["Name"] + " - " + books["Authors"]

In [14]:
# Number of rows per "title - author" key.
book_counts = books["title-author"].value_counts()
# Keep only the keys that occur more than once, i.e. the duplicated books.
duplicate_book_counts = book_counts.loc[book_counts.gt(1)]
duplicate_book_counts

title-author
The Scarlet Letter - Nathaniel Hawthorne                                                         21
Jane Eyre - Charlotte Brontë                                                                     21
Hamlet - William Shakespeare                                                                     20
Madame Bovary - Gustave Flaubert                                                                 20
Mansfield Park - Jane Austen                                                                     20
                                                                                                 ..
Suburban Escape: The Art of California Sprawl - Ann M. Wolfe                                      2
Of Bicycles, Bakelites, and Bulbs: Toward a Theory of Sociotechnical Change - Wiebe E. Bijker     2
The Making of a Confederate: Walter Lenoir's Civil War - William L. Barney                        2
Playing for Keeps: Michael Jordan and the World He Made - David Halberstam             

In [15]:
# For every duplicated "title - author" key, pick the Id of the row with the
# highest CountsOfReview and store it in `most_reviewed_duplicates`
# ({title-author: Id}). Results are cached in best_books.json so that an
# interrupted run can resume where it stopped.

# Glyphs for the partially filled cell of the progress bar, from empty to full.
stages = [" ", "░", "▒", "▓", "█"]

# Progress-bar / checkpoint geometry (loop invariants).
batch_size = 1000  # titles per full bar; the cache is written at each multiple
mini_batch_size = 20  # titles represented by one bar cell
stage_step = mini_batch_size // len(stages)  # titles per glyph step and redraw interval
bar_cells = batch_size // mini_batch_size  # number of cells in the bar

cache_path = 'best_books.json'
total_duplicate_books = len(duplicate_book_counts.index)
counter_width = len(str(total_duplicate_books))

# load cached data
try:
    with open(cache_path, 'r') as f:
        most_reviewed_duplicates = json.load(f)
except FileNotFoundError:
    # First run: there is no cache file yet, so start from scratch.
    most_reviewed_duplicates = {}
if most_reviewed_duplicates:
    print(" [Loaded Cache]")

# Index the rows of all not-yet-cached titles once, instead of scanning the
# whole `books` frame with a boolean mask on every loop iteration.
pending_titles = [
    title
    for title in duplicate_book_counts.index
    if title not in most_reviewed_duplicates
]
pending_rows = books.loc[
    books["title-author"].isin(pending_titles),
    ["Id", "title-author", "CountsOfReview"],
]
# Maps each title-author key to the row labels of its duplicates.
rows_by_title = pending_rows.groupby("title-author", sort=False).groups

for i, book in enumerate(duplicate_book_counts.index):
    if book in most_reviewed_duplicates:
        # skip if already cached
        continue

    duplicates = pending_rows.loc[rows_by_title[book]]
    most_reviewed_entry = duplicates.loc[[duplicates["CountsOfReview"].idxmax()]]
    book_id, title_author, _ = most_reviewed_entry.values[0]
    most_reviewed_duplicates[title_author] = book_id

    # Redraw the bar only every `stage_step` titles.
    if (i % stage_step) == 0:
        if (i % batch_size) == 0 and i > 0:
            # Checkpoint once per completed batch.
            with open(cache_path, 'w') as f:
                json.dump(most_reviewed_duplicates, f)
                print(" Cached progress")
        mini_batches_done = (i % batch_size) // mini_batch_size
        current_stage = stages[(i % mini_batch_size) // stage_step]
        formatted_i = f"{i:{counter_width}d}"
        percent_i = int((i / total_duplicate_books) * 100)
        sys.stdout.write("\r" + "▕" + ("█" * (mini_batches_done)) + current_stage + " " * ((bar_cells - 1) - mini_batches_done) + "▏" + f" [{formatted_i}/{total_duplicate_books}]" + f"({percent_i}%)")
        sys.stdout.flush()

# Final write so that titles processed after the last checkpoint are saved too.
with open(cache_path, 'w') as f:
    json.dump(most_reviewed_duplicates, f)
    print(" Cached progress")

 [Loaded Cache]
 Cached progress


## Ratings

In [46]:
# Display the combined ratings table loaded from the "user*" CSV files.
ratings

Unnamed: 0,ID,Name,Rating
0,6675,"Baxter, the Pig Who Wanted to Be Kosher",it was amazing
1,6675,Set This House in Order,really liked it
2,6675,Paradise Park,really liked it
3,7027,Paradise Park,liked it
4,6675,The Dead Fish Museum,it was amazing
...,...,...,...
362591,5403,"The MacGregors: Alan & Grant (The MacGregors, ...",it was ok
362592,5403,The MacGregors: Serena & Caine (The MacGregors...,it was ok
362593,5403,Time and Again: Time Was / Times Change,it was ok
362594,5403,"Dance Upon The Air (Three Sisters Island, #1)",really liked it


In [53]:
# Number of rating rows per ID (presumably one ID per user — confirm against the dataset).
ratings["ID"].value_counts()

ID
4196    3637
4806    3236
4693    2736
1134    2089
284     2052
        ... 
4482       1
4460       1
4329       1
4399       1
4492       1
Name: count, Length: 8919, dtype: int64

In [54]:
# Mean number of rating rows per ID.
ratings["ID"].value_counts().mean()

np.float64(40.654333445453524)

In [55]:
# Median number of rating rows per ID; compare with the mean above to see how skewed the distribution is.
ratings["ID"].value_counts().median()

np.float64(1.0)