How to use: 

1.) Download the Movies & TV metadata and review JSON Lines files (`meta_Movies_and_TV.jsonl` and `Movies_and_TV.jsonl`) from https://amazon-reviews-2023.github.io

2.) With those files in the same directory as this script, run all cells. This randomly removes 99% of the movies and their associated reviews to make the file size more manageable. Estimated local processing time is about 7 minutes. Console messages periodically report progress.


*If you want to use the already reduced JSON (recommended), see the data folder. We could not include the JSON file that this script operates on because it is substantially larger than GitHub's repository size limits, and obtaining it requires substantial download and processing time.

Credit: Script development assisted by ChatGPT 

In [1]:
import json
import random

# Input and output file names
file = "Movies_and_TV.jsonl"  # Original dataset
reduced_file = "All_Movies_tv_reduced.jsonl"  # First-stage filtered dataset

# Step 1: Count reviews per parent_asin in one streaming pass, keeping only
# the per-product counters in memory (never the reviews themselves).
parent_asin_counts = {}
total_reviews = 0

print("Counting unique parent_asins...")

# JSON text is UTF-8 by specification, so the encoding is named explicitly
# instead of relying on the platform's locale default.
with open(file, 'r', encoding="utf-8") as fp:
    for line in fp:
        line = line.strip()

        # Tolerate blank lines (e.g. a trailing newline at end of file),
        # which json.loads would otherwise reject.
        if not line:
            continue

        review = json.loads(line)

        # Ensure parent_asin exists
        if "parent_asin" in review:
            parent_asin = review["parent_asin"]
            parent_asin_counts[parent_asin] = parent_asin_counts.get(parent_asin, 0) + 1

        total_reviews += 1

        # Print progress every 1 million reviews
        if total_reviews % 1_000_000 == 0:
            print(f"Processed {total_reviews:,} reviews...")

print(f"Total reviews: {total_reviews:,}")
print(f"Total unique parent_asins: {len(parent_asin_counts):,}")

# Step 2: Randomly select 1% of parent_asins
# The fraction lives in one place so the arithmetic and the progress message
# cannot drift apart.
SAMPLE_FRACTION = 0.01

# int() truncates, so fewer than 1 / SAMPLE_FRACTION unique parent_asins
# yields an empty selection.
num_selected = int(SAMPLE_FRACTION * len(parent_asin_counts))

# random.sample needs a sequence, so the dict's keys are copied into a list.
# No seed is set here, so the selection differs from run to run.
selected_parents = set(random.sample(list(parent_asin_counts), num_selected))

print(f"Selected {num_selected:,} parent_asins (~{SAMPLE_FRACTION:.0%})")

# Step 3: Stream through the file again and write only selected reviews
print("Filtering and writing selected reviews...")
written_reviews = 0

# The input is read as UTF-8 explicitly rather than with the platform's
# locale default. json.dumps escapes non-ASCII characters by default, so the
# output bytes are the same under any ASCII-compatible encoding; UTF-8 is
# named only for symmetry.
with open(file, 'r', encoding="utf-8") as fp, \
        open(reduced_file, 'w', encoding="utf-8") as out_fp:
    for i, line in enumerate(fp, 1):
        line = line.strip()

        # Only parse non-blank lines; json.loads rejects an empty string.
        if line:
            review = json.loads(line)

            # Keep the review only if its parent_asin was sampled. A review
            # without the key yields None from .get().
            if review.get("parent_asin") in selected_parents:
                out_fp.write(json.dumps(review) + "\n")
                written_reviews += 1

        # Print progress every 500,000 reviews
        if i % 500_000 == 0:
            print(f"Scanned {i:,} reviews, written {written_reviews:,} so far...")

print(f"Final written reviews: {written_reviews:,}")
print(f"Reduced dataset saved to {reduced_file}")

# Step 4: Rewrite the reduced file so each review carries only the fields
# needed downstream, then swap the result in over the original.
import os

print("Filtering and overwriting the reduced dataset...")

# Output goes to a scratch file first; the reduced file is replaced only
# after the whole pass has finished.
temp_file = "temp_filtered.jsonl"

# Fields to keep, in the order they are written to each output record.
kept_fields = ("rating", "title", "text", "parent_asin")

with open(reduced_file, 'r') as src, open(temp_file, 'w') as dst:
    for line_no, raw_line in enumerate(src, 1):
        record = json.loads(raw_line.strip())

        # Absent fields become None via .get().
        slim_record = {field: record.get(field) for field in kept_fields}

        # all() tests truthiness, so a record is dropped when any kept field
        # is missing, None, an empty string, or zero.
        if all(slim_record.values()):
            dst.write(json.dumps(slim_record) + "\n")

        # Progress report every 500,000 input lines.
        if line_no % 500_000 == 0:
            print(f"Processed {line_no:,} reviews...")

# Replace the reduced file with the trimmed version.
os.replace(temp_file, reduced_file)

print(f"Finished processing and overwriting {reduced_file}")


Counting unique parent_asins...
Processed 1,000,000 reviews...
Processed 2,000,000 reviews...
Processed 3,000,000 reviews...
Processed 4,000,000 reviews...
Processed 5,000,000 reviews...
Processed 6,000,000 reviews...
Processed 7,000,000 reviews...
Processed 8,000,000 reviews...
Processed 9,000,000 reviews...
Processed 10,000,000 reviews...
Processed 11,000,000 reviews...
Processed 12,000,000 reviews...
Processed 13,000,000 reviews...
Processed 14,000,000 reviews...
Processed 15,000,000 reviews...
Processed 16,000,000 reviews...
Processed 17,000,000 reviews...
Total reviews: 17,328,314
Total unique parent_asins: 747,764
Selected 7,477 parent_asins (~1%)
Filtering and writing selected reviews...
Scanned 500,000 reviews, written 4,588 so far...
Scanned 1,000,000 reviews, written 9,038 so far...
Scanned 1,500,000 reviews, written 13,570 so far...
Scanned 2,000,000 reviews, written 18,123 so far...
Scanned 2,500,000 reviews, written 22,683 so far...
Scanned 3,000,000 reviews, written 27,50

In [2]:
# File paths
reviews_file = "All_Movies_tv_reduced.jsonl"  # The reduced review dataset
movies_file = "meta_Movies_and_TV.jsonl"  # Original movie metadata file
output_file = "movies_metadata_reduced.jsonl"  # Final filtered movie metadata

# Step 1: Collect every parent_asin that appears in the reduced reviews, so
# the metadata can later be filtered down to the same products.
print("Loading parent_asins from reduced reviews...")

valid_parent_asins = set()
with open(reviews_file, 'r') as reviews_fp:
    for line_no, raw_line in enumerate(reviews_fp, 1):
        record = json.loads(raw_line.strip())

        # Records without a parent_asin key contribute nothing.
        if "parent_asin" in record:
            valid_parent_asins.add(record["parent_asin"])

        # Progress report every 1 million input lines.
        if line_no % 1_000_000 == 0:
            print(f"Processed {line_no:,} reviews...")

print(f"Total unique parent_asins in reduced reviews: {len(valid_parent_asins):,}")

# Step 2: Filter movie metadata based on valid parent_asins
print("Filtering movie metadata...")
kept_movies = 0

# The input is read as UTF-8 explicitly rather than with the platform's
# locale default. json.dumps escapes non-ASCII characters by default, so the
# output bytes are the same under any ASCII-compatible encoding; UTF-8 is
# named only for symmetry.
with open(movies_file, 'r', encoding="utf-8") as fp, \
        open(output_file, 'w', encoding="utf-8") as out_fp:
    for i, line in enumerate(fp, 1):
        line = line.strip()

        # Only parse non-blank lines; json.loads rejects an empty string.
        if line:
            movie = json.loads(line)

            # Keep only movies with parent_asin in valid_parent_asins. A
            # record without the key yields None from .get().
            if movie.get("parent_asin") in valid_parent_asins:
                out_fp.write(json.dumps(movie) + "\n")
                kept_movies += 1

        # Print progress every 500,000 movies
        if i % 500_000 == 0:
            print(f"Scanned {i:,} movies, kept {kept_movies:,} so far...")

print(f"Final kept movies: {kept_movies:,}")
print(f"Filtered movie metadata saved to {output_file}")


Loading parent_asins from reduced reviews...
Total unique parent_asins in reduced reviews: 7,477
Filtering movie metadata...
Scanned 500,000 movies, kept 5,037 so far...
Final kept movies: 7,477
Filtered movie metadata saved to movies_metadata_reduced.jsonl
