In [1]:
# Cell 1: Imports & Paths
import json, random, os

# Input locations, resolved relative to this notebook's folder
DATA_DIR = os.path.join('..', 'book_data')
META_IN, REVIEWS_IN = (
    os.path.join(DATA_DIR, fname) for fname in ('meta_Books.json', 'Books.json')
)

# Outputs are bare filenames, so they land in the current working directory
META_OUT, REVIEWS_OUT = 'filtered_meta_Books.json', 'filtered_Books.json'

# Fail-fast visibility: report whether each input file is actually present
for label, path in (("Metadata input exists:", META_IN),
                    ("Reviews input exists: ", REVIEWS_IN)):
    print(label, os.path.exists(path))

Metadata input exists: True
Reviews input exists:  True


In [2]:
# Cell 2: Inspect the first metadata record to confirm the schema
with open(META_IN, 'r', encoding='utf-8') as f:
    # Only the first line is read; each line is parsed as one JSON object.
    first_line = f.readline()
sample = json.loads(first_line)

# Look under 'categories' first and fall back to 'category' when that is
# missing or empty.
categories_preview = sample.get('categories') or sample.get('category')

print("Keys in metadata sample:", sample.keys())
print("Sample categories field:", categories_preview)


Keys in metadata sample: dict_keys(['category', 'tech1', 'description', 'fit', 'title', 'also_buy', 'tech2', 'brand', 'feature', 'rank', 'also_view', 'main_cat', 'similar_item', 'date', 'price', 'asin', 'imageURL', 'imageURLHighRes'])
Sample categories field: []


In [3]:
# Cell 3: Collect genres → ASINs

def _iter_genres(cats):
    """Yield every genre label contained in `cats`.

    Accepts a single label string, a flat list of labels, or lists nested
    to any depth; nested lists/tuples are descended into so that only the
    leaf labels are yielded, unchanged.
    """
    if isinstance(cats, str):
        # A bare string is one label, not a sequence of characters.
        yield cats
        return
    for item in cats:
        if isinstance(item, (list, tuple)):
            yield from _iter_genres(item)
        else:
            yield item


# Maps each genre label to the set of ASINs tagged with it.
genre_to_asins = {}
with open(META_IN, 'r', encoding='utf-8') as f:
    for line in f:
        data = json.loads(line)
        asin = data.get('asin')
        # Prefer 'categories'; fall back to 'category' when missing or empty.
        cats = data.get('categories', []) or data.get('category', [])
        # Skip records that cannot be used later: no ASIN (a None entry would
        # match every ASIN-less record when filtering) or no categories.
        if not asin or not cats:
            continue
        # Flatten before mapping: a nested list is unhashable and would raise
        # TypeError if used directly as a dictionary key.
        for genre in _iter_genres(cats):
            genre_to_asins.setdefault(genre, set()).add(asin)

print("Total distinct genres found:", len(genre_to_asins))


Total distinct genres found: 1514


In [4]:
# Cell 4: Sample genres + ASINs (seeded so a re-run selects the same subset)
GENRES_TO_SELECT = 100
ENTRIES_PER_GENRE = 2000
RANDOM_SEED = 42

# A dedicated generator keeps the draw independent of any other use of the
# global `random` state in this kernel.
rng = random.Random(RANDOM_SEED)

# Dict keys iterate in insertion order, so this list is stable for a given
# metadata file.
all_genres = list(genre_to_asins)
selected_genres = rng.sample(all_genres, min(GENRES_TO_SELECT, len(all_genres)))

sampled_asins = set()
for genre in selected_genres:
    # Set iteration order for strings can differ between interpreter runs
    # (hash randomization), so sort to give the seeded draw a stable input.
    # A missing ASIN (None) is dropped: it identifies no product and would
    # otherwise match every record that lacks an 'asin' field downstream.
    asins = sorted(a for a in genre_to_asins[genre] if a is not None)
    sampled = rng.sample(asins, min(ENTRIES_PER_GENRE, len(asins)))
    sampled_asins.update(sampled)
print("Selected genres:", selected_genres)
print("Total ASINs sampled:", len(sampled_asins))


Selected genres: ['Nursing', "Men's Health", '100% money back guarantee', 'Jordan', 'Agnosticism', 'Spiritual Growth', 'Mystery & Thrillers', 'Biomathematics', 'Nationalism', 'Automotive', 'Outdoor Cooking', 'Mirrored lens', 'Medicine &amp; Health Sciences', 'Occultism', 'Repair & Performance', 'Business & Professional Growth', 'Missions & Missionary Work', 'Composers & Musicians', 'Sales &amp; Selling', 'Mushrooms', 'Intelligence & Espionage', 'The Edge browser. Microsoft has retired the wrinkly old Internet Explorer browser and replaced it with an all-new, bare-bones one called Edge. Its designed to eat up very little screen space with controls, so that the Web pages youre reading get as much room as possible.', 'Power Tools', 'Access to this product is valid for 1 course enrollment. After this period, this product can be viewed under the "view expired" link in your CengageBrain account.', 'Author: Dr. Allen Lim, Chef Biju Thomas', 'Foreign Language Study', 'Parenting', 'Meditations'

In [5]:
# Cell 5: Write filtered metadata
# Copies each metadata record whose ASIN was sampled into META_OUT,
# one JSON object per line.
with open(META_IN, 'r', encoding='utf-8') as infile:
    with open(META_OUT, 'w', encoding='utf-8') as outfile:
        count = 0
        for raw in infile:
            record = json.loads(raw)
            if record.get('asin') not in sampled_asins:
                continue
            # The parsed record is re-serialised rather than echoing the raw line.
            outfile.write(json.dumps(record) + "\n")
            count += 1
print(f"Wrote {count} metadata lines to {META_OUT}")


Wrote 28640 metadata lines to filtered_meta_Books.json


In [6]:
# Cell 6: Write filtered reviews
# Keeps only the reviews whose ASIN is in the sampled set and writes them to
# REVIEWS_OUT, one JSON object per line.
with open(REVIEWS_IN, 'r', encoding='utf-8') as infile, \
     open(REVIEWS_OUT, 'w', encoding='utf-8') as outfile:
    count = 0
    # map() is lazy, so lines are still parsed one at a time as they are read.
    for review in map(json.loads, infile):
        keep = review.get('asin') in sampled_asins
        if keep:
            serialised = json.dumps(review)
            outfile.write(serialised + "\n")
            count += 1
print(f"Wrote {count} review lines to {REVIEWS_OUT}")


Wrote 537751 review lines to filtered_Books.json
