In [1]:
import os
import pandas as pd
import re
import csv
import pickle
from pathlib import Path
import simplejson as json
from faker import Faker
from datetime import datetime
import random
import numpy as np

In [22]:
# Root folder for everything this notebook writes, and the sub-folder that
# receives the exported CSV files.
base_output_path = "out"
export_path = os.path.join(base_output_path, "export")

# Intermediate artefacts stored directly under the root folder
chosen_book_ids_path = os.path.join(base_output_path, "chosen_book_ids.pickle")
full_book_information_path = os.path.join(base_output_path, "full_book_information.csv")

# Full export files
book_path = os.path.join(export_path, "book.csv")
book_has_author_path = os.path.join(export_path, "book_has_author.csv")
book_similar_to_book_path = os.path.join(export_path, "book_similar_to_book.csv")
author_path = os.path.join(export_path, "author.csv")
review_path = os.path.join(export_path, "review.csv")
user_has_read_book_path = os.path.join(export_path, "user_has_read_book.csv")
user_owns_book_path = os.path.join(export_path, "user_owns_book.csv")
user_path = os.path.join(export_path, "user.csv")

# Sample files (reduced versions of some of the exports above)
review_sample_path = os.path.join(export_path, "review-sample.csv")
user_has_read_book_sample_path = os.path.join(export_path, "user_has_read_book-sample.csv")
user_owns_book_sample_path = os.path.join(export_path, "user_owns_book-sample.csv")
user_sample_path = os.path.join(export_path, "user-sample.csv")

# Create both folders up front (no error if they already exist) so that later
# cells can write into them directly.
for output_dir in (base_output_path, export_path):
    os.makedirs(output_dir, exist_ok=True)

In [7]:
# Generate the list of book ids that we want to use
work_json = os.path.join("meta_data", "goodreads_book_works.json")

# Drop the pickle of a previous run so that a failure part-way through this
# cell cannot leave a stale file behind. Only "the file is not there yet" is an
# expected condition; anything else (e.g. a permission problem) should surface
# instead of being silently swallowed.
try:
    os.remove(chosen_book_ids_path)
except FileNotFoundError:
    pass

chosen_book_ids = set()

# The works file is streamed in chunks so it never has to be held in memory at once
for i, chunk in enumerate(pd.read_json(work_json, chunksize = 15000, lines=True)):
    # Only save works with > 1000 reviews
    chunk = chunk[pd.to_numeric(chunk['text_reviews_count']) > 1000]

    chosen_book_ids.update( chunk['best_book_id'] )

    # i is zero-based, so i + 1 is the number of chunks processed so far
    print(f"Ran through {i + 1} chunk(s)...", end="\r")

print("Finished generating chosen_book_ids")
print("Saving...")

# Persist the set so that later sessions can reload it without re-reading the works file
with open(chosen_book_ids_path, 'wb') as handle:
    pickle.dump(chosen_book_ids, handle, protocol=pickle.HIGHEST_PROTOCOL)

print("Finished saving")

Finished generating chosen_book_ids
Saving...
Finished saving


In [8]:
# Sanity check: number of distinct book ids currently held in chosen_book_ids
len(chosen_book_ids)

8625

In [9]:
# Restore the set of best book ids from the pickle on disk.
# NOTE: unpickling can execute arbitrary code, so this must only ever be
# pointed at a file that was written by this notebook itself.
chosen_book_ids = pickle.loads(Path(chosen_book_ids_path).read_bytes())
print("Loaded chosen_book_ids")

Loaded chosen_book_ids


In [10]:
#book_json = os.path.join("meta_data", "goodreads_books.json")
book_json = os.path.join("genre", "children", "goodreads_books_children.json")

# Both outputs are written in append mode below, so each of them has to be
# cleared first. They are removed independently: inside one shared try block a
# missing first file would skip the second removal, and the new rows would then
# be appended to the stale book file. Only a missing file is tolerated.
for stale_path in (full_book_information_path, book_path):
    try:
        os.remove(stale_path)
    except FileNotFoundError:
        pass

for i, chunk in enumerate(pd.read_json(book_json, chunksize = 10000, lines=True)):
    # Only save books with our chosen book id
    chunk = chunk[chunk["book_id"].isin(chosen_book_ids)]

    # Replace newlines with ' / ' and make everything single quotes
    chunk = chunk.replace({'\n': ' / ', '"': "'"}, regex=True)

    # Replace empty lines with NULL to make importing into mysql better
    chunk = chunk.replace('', 'NULL')

    # Only grab first 150 chars of title or 1500 chars of description
    chunk["title"] = chunk["title"].str[:150]
    chunk["description"] = chunk["description"].str[:1500]

    # Full record (all columns); later cells parse the authors and
    # similar_books columns out of this file. Header only with the first chunk.
    chunk.to_csv(full_book_information_path, index=False, mode='a', header=(i == 0))

    # Reduced column set that goes into the book export
    chunk = chunk[[
        'book_id',
        'title',
        'publication_year',
        'language_code',
        'description',
        'num_pages',
    ]]

    chunk.to_csv(book_path, index=False, mode='a', header=(i == 0))

    # i is zero-based, so i + 1 is the number of chunks processed so far
    print(f"Ran through {i + 1} chunk(s)...", end="\r")

print("Finished generating book information")

Finished generating book information


In [12]:
# Clear the export of a previous run. Only a missing file is an expected
# condition; any other error should surface instead of being swallowed.
try:
    os.remove(book_has_author_path)
except FileNotFoundError:
    pass

# Matches the digits of every 'author_id': '<digits>' entry in the stringified authors column
author_id_pattern = re.compile(r"(?<='author_id': ')\d+(?=')")

def add_authors(author_list, book):
    """Append one (book_id, author_id) tuple per author of ``book``.

    ``book`` must provide 'book_id' and 'authors'; the author ids are pulled
    out of the 'authors' string with ``author_id_pattern`` and converted to
    int. ``author_list`` is extended in place and nothing is returned.
    """
    owner_id = book['book_id']
    found_ids = author_id_pattern.findall(book['authors'])
    author_list.extend((owner_id, int(raw_id)) for raw_id in found_ids)

full_book_information = pd.read_csv(full_book_information_path)

author_list = []

# Collect the pairs with an explicit loop rather than DataFrame.apply: apply is
# meant for computing values, and pandas does not support functions with side
# effects there (older releases could call the function twice for the first
# row, which would duplicate that book's authors).
for book in full_book_information[['book_id', 'authors']].to_dict('records'):
    add_authors(author_list, book)

book_has_author = pd.DataFrame(author_list, columns=['book_id', 'author_id'])

book_has_author.to_csv(book_has_author_path, index=False, mode='w', header=True)

print("Finished generating book has author information")

Finished generating book has author information


In [13]:
# The loop below appends to the author file, so the file of a previous run has
# to go first. Only a missing file is tolerated; other errors should surface.
try:
    os.remove(author_path)
except FileNotFoundError:
    pass

author_json = os.path.join("meta_data", "goodreads_book_authors.json")

# Ids of all authors referenced by the book_has_author export
book_has_author = pd.read_csv(book_has_author_path)
authors = set(book_has_author['author_id'].unique())

for i, chunk in enumerate(pd.read_json(author_json, chunksize = 10000, lines=True)):
    # Only save authors that are referenced by book_has_author
    chunk = chunk[chunk["author_id"].isin(authors)]

    chunk = chunk[[
        "author_id",
        "name",
    ]]

    # Header only with the first chunk, since every chunk is appended to the same file
    chunk.to_csv(author_path, index=False, mode='a', header=(i == 0))

    # i is zero-based, so i + 1 is the number of chunks processed so far
    print(f"Ran through {i + 1} chunk(s)...", end="\r")

print("Finished writing author file")

Finished writing author file


In [14]:
# Clear the export of a previous run. Only a missing file is an expected
# condition; any other error should surface instead of being swallowed.
try:
    os.remove(book_similar_to_book_path)
except FileNotFoundError:
    pass

# Every run of digits in the stringified similar_books column is read as a book id
book_id_pattern = re.compile(r"\d+")

def add_similar_books(book_similar_to_book_list, book):
    """Append (book_id, similar_id) for each similar book that is a chosen book.

    ``book`` must provide 'book_id' and 'similar_books'. Ids that are not in
    the notebook-level ``chosen_book_ids`` set are skipped.
    ``book_similar_to_book_list`` is extended in place; nothing is returned.
    """
    source_id = book['book_id']
    candidate_ids = map(int, book_id_pattern.findall(book['similar_books']))
    book_similar_to_book_list.extend(
        (source_id, candidate) for candidate in candidate_ids if candidate in chosen_book_ids
    )

full_book_information = pd.read_csv(full_book_information_path)

book_similar_to_book_list = []

# Collect the pairs with an explicit loop rather than DataFrame.apply: apply is
# meant for computing values, and pandas does not support functions with side
# effects there (older releases could call the function twice for the first
# row, which would duplicate that book's pairs).
for book in full_book_information[['book_id', 'similar_books']].to_dict('records'):
    add_similar_books(book_similar_to_book_list, book)

book_similar_to_book = pd.DataFrame(book_similar_to_book_list, columns=['from_book_id', 'to_book_id'])

book_similar_to_book.to_csv(book_similar_to_book_path, index=False, mode='w', header=True)

print("Finished generating similar books")

Finished generating similar books


In [15]:
user_id_csv = os.path.join("book_shelves", "user_id_map.csv")

def add_user_ids(user_id_lookup, id_mapping):
    """Record one row of the id map in ``user_id_lookup`` (long id -> numeric id).

    ``id_mapping`` must provide 'user_id_csv' and 'user_id'. The dictionary is
    modified in place and nothing is returned.
    """
    # Shift by one: the csv ids start at 0, which cannot be imported into an unsigned field
    user_id_lookup[id_mapping['user_id']] = int(id_mapping['user_id_csv']) + 1
    
# Maps the long user id used in the raw data to the numeric id used in the exports
user_id_lookup = {}

user_id_mapping_df = pd.read_csv(user_id_csv)

# Fill the lookup with an explicit loop rather than DataFrame.apply: apply is
# meant for computing values (pandas does not support side-effecting functions
# there), and iterating over plain records avoids building one Series per row.
for id_mapping in user_id_mapping_df[['user_id_csv', 'user_id']].to_dict('records'):
    add_user_ids(user_id_lookup, id_mapping)

print("Finished generting user id lookup table")

Finished generting user id lookup table


In [16]:
# Write csv for textual reviews
# The loop below appends to the review file, so the file of a previous run has
# to go first. Only a missing file is tolerated; other errors should surface.
try:
    os.remove(review_path)
except FileNotFoundError:
    pass

# review_json = os.path.join("book_reviews", "goodreads_reviews_dedup.json")
review_json = os.path.join("genre", "children", "goodreads_reviews_children.json")


for i, chunk in enumerate(pd.read_json(review_json, chunksize = 100000, lines=True)):
    # Only save reviews for books in our chosen book id
    chunk = chunk[chunk["book_id"].isin(chosen_book_ids)]

    # Keep the exported columns only. The .copy() detaches the result from the
    # filtered chunk so that the assignments below act on an independent frame
    # (no SettingWithCopyWarning, no ambiguity about what gets modified).
    chunk = chunk[[
        "user_id",
        "book_id",
        "rating",
        "review_text"
    ]].copy()

    # Translate the long user ids into numeric ids.
    # NOTE: ids missing from user_id_lookup come out as NaN here.
    chunk["user_id"] = chunk["user_id"].map(user_id_lookup)

    chunk = chunk.rename(columns = {"review_text": "text"})


    # Replace newlines with ' / ' and make everything single quotes
    chunk = chunk.replace({'\n': ' / ', '"': "'"}, regex=True)

    # Replace empty lines with NULL to make importing into mysql better
    chunk = chunk.replace('', 'NULL')

    # Only grab first 1000 chars of the review text
    chunk["text"] = chunk["text"].str[:1000]

    # Header only with the first chunk, since every chunk is appended to the same file
    chunk.to_csv(review_path, index=False, mode='a', header=(i == 0))

    # i is zero-based, so i + 1 is the number of chunks processed so far
    print(f"Ran through {i + 1} chunk(s)...", end="\r")

print("Finished writing review file")

Finished writing review file


In [21]:
# Write csv for whether a user has read books
# Both files are written in append mode, so each has to be cleared first. They
# are removed independently: inside one shared try block a missing first file
# would skip the second removal and leave stale ownership rows to append to.
# Only a missing file is tolerated; other errors should surface.
for stale_path in (user_has_read_book_path, user_owns_book_path):
    try:
        os.remove(stale_path)
    except FileNotFoundError:
        pass

# interaction_json = os.path.join("book_shelves", "goodreads_interactions_dedup.json")
interaction_json = os.path.join("genre", "children", "goodreads_interactions_children.json")

def date_to_str(row):
    """Return row["read_at"] reformatted as 'YYYY-MM-DD', or "NULL".

    The value is expected in the form "%a %b %d %H:%M:%S %z %Y". An empty
    string, or any value that cannot be parsed with that format, yields the
    literal string "NULL".
    """
    raw_value = row["read_at"]
    if raw_value != "":
        try:
            parsed = datetime.strptime(raw_value, "%a %b %d %H:%M:%S %z %Y")
            return parsed.strftime("%Y-%m-%d")
        except Exception:
            # Unparseable or non-string value: fall through to the NULL marker
            pass
    return "NULL"


# One seeded generator shared by the whole loop, so that the "owned" subset and
# the media types come out the same on every run.
ownership_rng = np.random.RandomState(123)

for i, chunk in enumerate(pd.read_json(interaction_json, chunksize = 1000000, lines=True)):

    chunk = chunk[chunk["is_read"] == True]

    # Only save reads for books in our chosen book id
    # (.copy() so that the column assignments below act on an independent frame)
    chunk = chunk[chunk["book_id"].isin(chosen_book_ids)].copy()

    # Convert the dates value by value. Unlike a row-wise DataFrame.apply this
    # yields a flat list even when the filtered chunk is empty.
    chunk["date_read"] = [date_to_str({"read_at": read_at}) for read_at in chunk["read_at"]]

    # Translate the long user ids into numeric ids.
    # NOTE: ids missing from user_id_lookup come out as NaN here.
    chunk["user_id"] = chunk["user_id"].map(user_id_lookup)

    chunk = chunk[[
        "user_id",
        "book_id",
        "date_read"
    ]]

    # Header only with the first chunk, since every chunk is appended to the same file
    chunk.to_csv(user_has_read_book_path, index=False, mode='a', header=(i == 0))

    # Now onto whether a user owns a books
    chunk = chunk.sample(frac=0.6, random_state=ownership_rng) # Assume 60% of read books are owned

    chunk = chunk[[ "user_id", "book_id" ]].copy()

    # Assign one of the four media types uniformly at random to every owned book
    chunk['media_type'] = ownership_rng.choice(["softcover book", "hardcover book", "ebook", "audiobook"], chunk.shape[0])

    chunk.to_csv(user_owns_book_path, index=False, mode='a', header=(i == 0))

    # i is zero-based, so i + 1 is the number of chunks processed so far
    print(f"Ran through {i + 1} chunk(s)...", end="\r")

print("Finished writing user has read file")

Finished writing user has read file


In [23]:
# Write csv for sample of reviews
# The loop below appends to the sample file, so the file of a previous run has
# to go first. Only a missing file is tolerated; other errors should surface.
try:
    os.remove(review_sample_path)
except FileNotFoundError:
    pass

# Seeded so that the same sample is drawn on every run
review_sample_rng = np.random.RandomState(123)

# keep_default_na=False: by default read_csv parses the literal 'NULL' markers
# in the review file (and texts such as 'NA') as NaN, which to_csv would then
# write back as empty fields. Reading every field verbatim keeps the sample
# rows identical to the rows of the full file.
for i, chunk in enumerate(pd.read_csv(review_path, chunksize = 1000000, keep_default_na=False)):

    # Sample 10% of our actual review count
    chunk = chunk.sample(frac=0.1, random_state=review_sample_rng)

    # Header only with the first chunk, since every chunk is appended to the same file
    chunk.to_csv(review_sample_path, index=False, mode='a', header=(i == 0))

print("Finished writing sample of review file")

Finished writing sample of review file


In [39]:
# Write csv for sample of user has read book
# The has-read sample is written in append mode, so the old file has to go
# first. The user owns book sample is derived from the has-read sample by a
# later cell, so its old version is removed here as well. The files are removed
# independently: inside one shared try block a missing first file would skip
# the second removal. Only a missing file is tolerated.
for stale_path in (user_has_read_book_sample_path, user_owns_book_sample_path):
    try:
        os.remove(stale_path)
    except FileNotFoundError:
        pass

# Seeded so that the same sample is drawn on every run
has_read_sample_rng = np.random.RandomState(123)

for i, chunk in enumerate(pd.read_csv(user_has_read_book_path, chunksize = 100000)):

    # Sample 0.5% of our actual has read book
    chunk = chunk.sample(frac=0.005, random_state=has_read_sample_rng)

    # read_csv parsed the 'NULL' markers as NaN; put the marker back before writing
    chunk = chunk.replace(np.nan, 'NULL')

    # Header only with the first chunk, since every chunk is appended to the same file
    chunk.to_csv(user_has_read_book_sample_path, index=False, mode='a', header=(i == 0))

print("Finished writing sample of user has read and user owns book")

Finished writing sample of user has read and user owns book


In [42]:
# Write csv for just user owns books sample
# The loop below appends to the sample file, so the file of a previous run has
# to go first. Only a missing file is tolerated; other errors should surface.
try:
    os.remove(user_owns_book_sample_path)
except FileNotFoundError:
    pass

# One seeded generator for both the subset and the media types, so that the
# same sample is produced on every run
owns_sample_rng = np.random.RandomState(123)

for i, chunk in enumerate(pd.read_csv(user_has_read_book_sample_path, chunksize = 100000)):

    # Assume 60% of read books are owned
    chunk = chunk.sample(frac=0.6, random_state=owns_sample_rng)

    # .copy() so that the column assignment below acts on an independent frame
    chunk = chunk[[ "user_id", "book_id" ]].copy()

    # Assign one of the four media types uniformly at random to every owned book
    chunk['media_type'] = owns_sample_rng.choice(["softcover book", "hardcover book", "ebook", "audiobook"], chunk.shape[0])

    # Header only with the first chunk, since every chunk is appended to the same file
    chunk.to_csv(user_owns_book_sample_path, index=False, mode='a', header=(i == 0))

print("Finished writing sample of user has read and user owns book")

Finished writing sample of user has read and user owns book


In [40]:
# Write csv for all users
# Clear the export of a previous run. Only a missing file is an expected
# condition; any other error should surface instead of being swallowed.
try:
    os.remove(user_path)
except FileNotFoundError:
    pass

user_ids = set()
chunks_seen = 0

# Collect every user id that appears in the review export or the has-read export
for source_path in (review_path, user_has_read_book_path):
    for chunk in pd.read_csv(source_path, chunksize = 1000000):
        # update() grows the set in place instead of rebuilding it for every
        # chunk; dropna() keeps rows without a user id from becoming a user
        user_ids.update(chunk['user_id'].dropna().unique())
        chunks_seen += 1
        print(f"Ran through {chunks_seen} chunk(s)...", end="\r")

count = len(user_ids)

# Fixed seed so that the generated names and emails are repeatable
fake = Faker()
Faker.seed(123)

def generate_user_info(user_id):
    """Return [user_id, username, email] with a fake, unique username and email."""
    username = fake.unique.user_name()
    email = fake.unique.safe_email()
    return [user_id, username, email]

print("Generting fake user info...", end="\r")
# Walk the ids in sorted order: set iteration order is an implementation
# detail, and with the fixed seed the order decides which identity an id gets
data = [generate_user_info(user_id) for user_id in sorted(user_ids)]

print("Creating fake user data frame...", end="\r")
user_df = pd.DataFrame(data, columns=['user_id', 'username', 'email'])

print("Writing user to file...", end="\r")
user_df.to_csv(user_path, index=False, mode='w', header=True)


print("Finished writing user file")

Finished writing user file
me...

In [41]:
# Write csv for sample users
# Clear the export of a previous run. Only a missing file is an expected
# condition; any other error should surface instead of being swallowed.
try:
    os.remove(user_sample_path)
except FileNotFoundError:
    pass

user_ids = set()
chunks_seen = 0

# Collect every user id that appears in the review sample or the has-read sample
for source_path in (review_sample_path, user_has_read_book_sample_path):
    for chunk in pd.read_csv(source_path, chunksize = 1000000):
        # update() grows the set in place instead of rebuilding it for every
        # chunk; dropna() keeps rows without a user id from becoming a user
        user_ids.update(chunk['user_id'].dropna().unique())
        chunks_seen += 1
        print(f"Ran through {chunks_seen} chunk(s)...", end="\r")

count = len(user_ids)

# Fixed seed so that the generated names and emails are repeatable
fake = Faker()
Faker.seed(123)

def generate_user_info(user_id):
    """Return [user_id, username, email] with a fake, unique username and email."""
    username = fake.unique.user_name()
    email = fake.unique.safe_email()
    return [user_id, username, email]

print("Generting fake user info...", end="\r")
# Walk the ids in sorted order: set iteration order is an implementation
# detail, and with the fixed seed the order decides which identity an id gets
data = [generate_user_info(user_id) for user_id in sorted(user_ids)]

print("Creating fake user data frame...", end="\r")
user_df = pd.DataFrame(data, columns=['user_id', 'username', 'email'])

print("Writing user to file...", end="\r")
user_df.to_csv(user_sample_path, index=False, mode='w', header=True)


print("Finished writing user file")

Finished writing user fileame...
