This notebook merges the book metadata and the review data into a single JSON file.

In [4]:
import json, os

# Input files are read line by line, one JSON object per line;
# MERGED_OUT is where the merged records are written.
META_IN = '../cleaning/filtered_meta_Books.json'
REVIEWS_IN = '../cleaning/filtered_Books.json'
MERGED_OUT = 'data/processed/merged.json'

# Create the output directory up front so the final write cannot fail on a
# missing folder; exist_ok makes re-running this cell harmless.
os.makedirs(os.path.split(MERGED_OUT)[0], exist_ok=True)

# Load metadata into a dict keyed by ASIN.
# Each line of the file is one JSON object; it is parsed exactly once and the
# resulting record is used both for its 'asin' key and as the stored value.
# If the same ASIN appears more than once, the last occurrence wins.
with open(META_IN, 'r', encoding='utf-8') as meta:
    meta_map = {
        record['asin']: record
        for record in map(json.loads, meta)
    }



In [5]:
# Load reviews and merge each one with the metadata of the book it refers to.
# A review is skipped when its book has no metadata, no usable price or no title.
merged = []
with open(REVIEWS_IN, 'r', encoding='utf-8') as revs:
    for line in revs:
        rev = json.loads(line)
        book = meta_map.get(rev['asin'])
        # No metadata for this ASIN: this check must come before any
        # book.get() call, which would raise AttributeError on None.
        if not book:
            continue

        price = book.get('price')
        # Filter out strange prices or books with missing prices (mostly ebooks)
        # in the original data. `not price` covers both None and ''.
        if not price or not price.startswith('$'):
            continue

        # Filter out books without a title (missing or empty).
        if not book.get('title'):
            continue

        # `or [None]` also covers a category list that is present but empty,
        # so the indexing below can never raise IndexError.
        categories = book.get('category') or [None]
        # Avoid genre being just 'Books': prefer the second category when
        # there is one, otherwise fall back to the first (possibly None).
        genre = categories[1] if len(categories) > 1 else categories[0]

        merged.append({
            'asin':    rev['asin'],
            'title':   book.get('title'),
            'author':  book.get('brand'),
            'genre':   genre,
            'price':   price,
            'rating':  rev.get('overall'),
            'text':    rev.get('reviewText'),
            'summary': rev.get('summary')
        })


In [6]:
# Serialise all merged records as a single JSON array in MERGED_OUT.
with open(MERGED_OUT, mode='w', encoding='utf-8') as out_file:
    json.dump(merged, out_file)

# Report how many records were written and show the first five as a sanity check.
print(f"Wrote {len(merged)} records to {MERGED_OUT}")
preview = merged[:5]
print(preview)

Wrote 398639 records to data/processed/merged.json
[{'asin': '0006378560', 'title': 'Africa : Despatches from a Fragile Continent', 'author': "Visit Amazon's Blaine Harden Page", 'genre': 'History', 'price': '$0.96', 'rating': 5.0, 'text': 'A friend gave me this book, and said "you gotta read this".  And you know what, I\'m really happy she did, because I learned a lot from Dispatches.  Harden does a great job of weaving different short stories, each with a unique slant and look at African life.  Both entertaining and educational, I\'m now fascinated with Africa and ready to read more!', 'summary': 'A great overview of African Life and Politics'}, {'asin': '0007130929', 'title': "Bart Simpson's Treehouse of Horror Spine-Tingling Spooktacular", 'author': 'By The Editors', 'genre': 'Humor & Entertainment', 'price': '$48.52', 'rating': 4.0, 'text': 'This  book  is very  funny  the genre of my book is comedy[Alejandro Delgado[', 'summary': 'The Simpsons Crazy  stories'}, {'asin': '00071309