Data from here: https://mengtingwan.github.io/data/goodreads.html#datasets


In [None]:
import gzip
import json
import re
import os
import sys
import numpy as np
import pandas as pd
from itertools import islice
import psycopg2
from psycopg2.extras import execute_values
# Render floats in pandas output with thousands separators (format spec '{:,}').
pd.options.display.float_format = '{:,}'.format

def batched(iterable, n):
    """Yield consecutive tuples of at most ``n`` items taken from ``iterable``.

    Every tuple has exactly ``n`` items except possibly the last one, which
    holds whatever is left over. Nothing is yielded for an empty iterable.

    Example: batched('ABCDEFG', 3) --> ('A','B','C') ('D','E','F') ('G',)

    Raises:
        ValueError: if ``n`` is smaller than one (raised when iteration starts,
            because this is a generator function).
    """
    if n < 1:
        raise ValueError('n must be at least one')
    source = iter(iterable)
    while True:
        chunk = tuple(islice(source, n))
        if not chunk:
            # An empty slice means the underlying iterator is exhausted.
            return
        yield chunk

In [None]:
# setup db

# Connection settings for the local Postgres database that stores the data.
db_settings = {
    "host": "localhost",
    "database": "reviews",
    "user": "postgres",
    "password": "postgres",
}
con = psycopg2.connect(**db_settings)
cur = con.cursor()

# todo support for multiple authors
book_ddl = '''CREATE TABLE IF NOT EXISTS book(
            id TEXT PRIMARY KEY, 
            title TEXT NOT NULL, 
            author TEXT NOT NULL, 
            url TEXT NOT NULL, 
            year TEXT NOT NULL, 
            rating TEXT NOT NULL
            )'''
review_ddl = '''CREATE TABLE IF NOT EXISTS review(
            id TEXT PRIMARY KEY, 
            bookId TEXT NOT NULL, 
            userId TEXT NOT NULL, 
            rating INTEGER NOT NULL, 
            text TEXT NOT NULL, 
            date TEXT NOT NULL, 
            nVotes TEXT NOT NULL,
            FOREIGN KEY (bookId) 
            REFERENCES book (id)
            ON UPDATE CASCADE ON DELETE CASCADE
    )'''

# `book` must be created first: `review` declares a foreign key onto book(id).
for ddl in (book_ddl, review_ddl):
    cur.execute(ddl)
con.commit()

# Create db

In [None]:
# save books into db
#
# The gzip file is read line by line; each line is parsed as one JSON book
# record. Rows are inserted in batches so the whole file never has to sit in
# memory at once, and each batch is committed as it goes (same pattern as the
# reviews import).
file_name = 'goodreads_books.json.gz'
book_batch_size = 100_000

# Explicit column list keeps the insert independent of the table's column
# order. `on conflict do nothing` skips books that are already stored, so the
# cell can be re-run (or resumed after a failure) without a primary-key error.
insert_book_sql = '''
    insert into book (id, title, author, url, year, rating)
    values %s
    on conflict (id) do nothing
'''

with gzip.open(file_name) as books:
    for book_batch in batched(books, book_batch_size):
        parsed = (json.loads(line) for line in book_batch)
        data = [
            (
                d['book_id'],
                d['title'],
                # Only the first listed author's id is stored (see the
                # multiple-authors todo on the table definition).
                d['authors'][0]['author_id'] if d['authors'] else 'No Listed Author',
                d['link'],
                d['publication_year'],
                d['average_rating'],
            )
            for d in parsed
            if d  # skip lines that parse to an empty/falsy value
        ]
        if data:
            execute_values(cur, insert_book_sql, data)
        con.commit()


In [None]:
# save reviews into db
#
# The gzip file is read line by line; each line is parsed as one JSON review
# record. Reviews are inserted and committed one batch at a time.
file_name = 'goodreads_reviews_dedup.json.gz'

# Explicit column list keeps the insert independent of the table's column
# order. Because every batch is committed separately, a failure part-way
# through leaves earlier batches in the table; `on conflict do nothing` lets
# the cell be re-run from the start without a primary-key error on those rows.
insert_review_sql = '''
    insert into review (id, bookId, userId, rating, text, date, nVotes)
    values %s
    on conflict (id) do nothing
'''

with gzip.open(file_name) as reviews:
    for batch_no, review_batch in enumerate(batched(reviews, 1_000_000), start=1):
        parsed = (json.loads(line) for line in review_batch)
        data = [
            (
                d['review_id'],
                d['book_id'],
                d['user_id'],
                d['rating'],
                d['review_text'],
                d['date_added'],
                d['n_votes'],
            )
            for d in parsed
            if d  # skip lines that parse to an empty/falsy value
        ]
        if data:
            execute_values(cur, insert_review_sql, data)
        con.commit()
        print(f'{batch_no} batches processed')
        

# Process Data

In [None]:
# Number of reviews per book, most-reviewed first, including books with none.
# count(review.id) only counts rows where review.id is not null, so a book
# without any matching review (left join yields an all-null review side) gets
# 0. count(*) or count(book.id) would instead report 1 for such books.
cur.execute('''
            select book.id, title, count(review.id) as n_reviews
            from book
            left join review on book.id = review.bookId
            group by book.id
            order by n_reviews desc
            ''')
df = pd.DataFrame(cur.fetchall(), columns=['id', 'title', 'count'])

In [None]:
# Histogram of the per-book review count over all books (log y axis, 100 bins).
df['count'].plot.hist(bins=100, logy=True);

In [None]:
# Same histogram, restricted to books with fewer than 1000 reviews.
df.loc[df['count'] < 1000, 'count'].plot.hist(bins=100, logy=True);

In [None]:
# Same histogram, restricted to books with fewer than 100 reviews.
df.loc[df['count'] < 100, 'count'].plot.hist(bins=100, logy=True);