In [1]:
import json
import pandas as pd
import random
import gc  # For garbage collection / memory management
from collections import Counter
from google.colab import drive
# Mount Google Drive so the project folder is reachable from the Colab VM
drive.mount('/content/gdrive')

# Project root on Drive; the review datasets live in its "Datasets/" subfolder
absolute_path = "/content/gdrive/My Drive/Projects/amazon-reviews/"
dataset_path = f"{absolute_path}Datasets/"

Mounted at /content/gdrive


# Download and Extract Amazon Review Datasets

This section downloads the review datasets from UCSD and extracts them to the dataset path.

## Important Notes on Handling Large Files

The UCSD datasets contain millions of reviews. Here is how we handle them efficiently:

1. **Chunked Processing**: Files are processed in 100K review chunks to avoid memory overflow
2. **No Early Stopping**: By default, we process entire files to ensure enough samples for rare ratings (2.0, 3.0)
3. **Colab Compatibility**: Works on T4 GPU (free tier), but L4 or A100 recommended for faster processing
4. **Memory Issues**: If you run into memory errors, reduce the parameters below (note that this may leave too few samples after deduplication):
   - Set `CHUNK_SIZE` to 50000
   - Set `MAX_FILTERED_PER_CATEGORY` to 100000 (or higher if possible)

In [2]:
import gzip
import shutil
import os
from urllib.request import urlretrieve
from urllib.parse import urlparse

def download_and_extract(url, dataset_path):
    """
    Download a gzip-compressed JSONL file and extract it into ``dataset_path``.

    Nothing is downloaded when the extracted file already exists. The archive
    is extracted into a temporary ``.part`` file that is renamed only after the
    extraction finished, so an interrupted run never leaves a truncated
    ``.jsonl`` behind that a later run would mistake for a complete one.

    Args:
        url: URL of the archive; the last path component must end in ``.gz``.
        dataset_path: Existing directory that receives the extracted file.

    Returns:
        Path of the extracted JSONL file inside ``dataset_path``.

    Raises:
        ValueError: If the filename in ``url`` does not end in ``.gz``.
    """
    # Parse filename from URL
    filename = os.path.basename(urlparse(url).path)
    if not filename.endswith('.gz'):
        # Without the suffix the archive and the extracted file would share
        # one path and the extraction would overwrite its own input.
        raise ValueError(f"Expected a URL ending in '.gz', got: {url}")

    # Strip only the trailing '.gz'. A blanket str.replace on the full path
    # would also rewrite '.gz' inside directory names or the middle of the name.
    jsonl_name = filename[:-len('.gz')]
    gz_path = os.path.join(dataset_path, filename)
    jsonl_path = os.path.join(dataset_path, jsonl_name)

    # Skip if already extracted
    if os.path.exists(jsonl_path):
        print(f"{jsonl_name} already exists, skipping download.")
        return jsonl_path

    # Download compressed file
    print(f"Downloading {filename}...")
    urlretrieve(url, gz_path)
    print(f"Downloaded {filename}")

    # Extract to a temporary file first, then move it into place
    print(f"Extracting {filename}...")
    part_path = jsonl_path + '.part'
    try:
        with gzip.open(gz_path, 'rb') as f_in, open(part_path, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    except BaseException:
        # Also covers KeyboardInterrupt: drop the partial output, then re-raise
        if os.path.exists(part_path):
            os.remove(part_path)
        raise
    os.replace(part_path, jsonl_path)
    print(f"Extracted to {os.path.basename(jsonl_path)}")

    # Clean up compressed file to save space
    os.remove(gz_path)
    print(f"Removed {filename} to save space\n")

    return jsonl_path

# Filter Reviews and Count

This section filters reviews based on required criteria and returns statistics.

In [3]:
def is_valid_review(review):
    """
    Tell whether a parsed review record carries every field we rely on.

    A review is accepted only when it is a dict with:
      * ``rating``: a real number (not a bool, not NaN),
      * ``title`` and ``text``: non-blank strings,
      * ``images``: a non-empty list,
      * ``verified_purchase``: the boolean ``True``.

    Anything else, including JSON values that are not objects (``null``,
    lists, bare numbers), yields ``False`` instead of raising.

    Args:
        review: Value produced by ``json.loads`` for one JSONL line.

    Returns:
        bool: ``True`` when the review meets all criteria.
    """
    # A JSONL line can hold any JSON value; only objects can be reviews
    if not isinstance(review, dict):
        return False

    rating = review.get("rating")
    # bool is a subclass of int, so it has to be excluded explicitly
    if isinstance(rating, bool) or not isinstance(rating, (int, float)):
        return False
    if pd.isna(rating):
        return False

    for field in ("title", "text"):
        value = review.get(field)
        if not isinstance(value, str) or not value.strip():
            return False

    images = review.get("images")
    if not isinstance(images, list) or not images:
        return False

    return review.get("verified_purchase") is True

def filter_and_count_reviews(jsonl_path, chunk_size=100000, max_filtered=None):
    """
    Stream a JSONL review file and keep the reviews accepted by
    ``is_valid_review``.

    Each line is parsed and validated as it is read, so only the accepted
    reviews are held in memory. Progress is reported once per ``chunk_size``
    successfully parsed reviews, and the early-stop check runs at those same
    points (so the result may exceed ``max_filtered`` by up to one chunk's
    worth of accepted reviews).

    Args:
        jsonl_path: Path of the JSONL file (read as UTF-8).
        chunk_size: Number of parsed reviews between progress reports.
        max_filtered: Stop once at least this many reviews were accepted;
            ``None`` (or 0) scans the whole file.

    Returns:
        tuple: ``(filtered_df, total_count, filtered_count)`` where
        ``total_count`` counts every line read, including lines that were not
        valid JSON, and ``filtered_df`` holds the accepted reviews.
    """
    category_name = os.path.basename(jsonl_path).replace('.jsonl', '')
    print(f"\n{'='*60}")
    print(f"Processing: {category_name}")
    print(f"{'='*60}")

    filtered_reviews = []
    total_count = 0   # every line read, malformed ones included
    chunk_num = 0
    pending = 0       # parsed reviews since the last progress report

    print("Loading and filtering reviews in chunks...")

    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line in f:
            total_count += 1
            try:
                review = json.loads(line)
            except json.JSONDecodeError:
                # Malformed line: counted as scanned, otherwise ignored
                continue

            pending += 1
            if is_valid_review(review):
                filtered_reviews.append(review)

            # Report (and possibly stop) whenever a full chunk has been parsed
            if pending >= chunk_size:
                chunk_num += 1
                pending = 0
                print(f"  Chunk {chunk_num}: Processed {total_count:,} total | "
                      f"Filtered so far: {len(filtered_reviews):,}")

                # Early exit if we have enough
                if max_filtered and len(filtered_reviews) >= max_filtered:
                    print(f"  Reached target of {max_filtered:,} filtered reviews. Stopping early.")
                    break

    # Report the trailing, incomplete chunk (if any)
    if pending:
        chunk_num += 1
        print(f"  Chunk {chunk_num}: Processed {total_count:,} total | "
              f"Filtered so far: {len(filtered_reviews):,}")

    # Convert to DataFrame
    filtered_df = pd.DataFrame(filtered_reviews)
    filtered_count = len(filtered_df)

    # An empty file must not crash the summary with a division by zero
    retention = (filtered_count / total_count * 100) if total_count else 0.0

    print(f"\n{'='*60}")
    print(f"Total reviews scanned: {total_count:,}")
    print(f"Filtered reviews: {filtered_count:,}")
    print(f"Retention rate: {retention:.2f}%")
    print(f"{'='*60}")

    return filtered_df, total_count, filtered_count

# Create Balanced Datasets

This section creates balanced datasets with a maximum of 10,000 entries each.

In [4]:
import re

def preprocess_text(text):
    """
    Clean and normalize text for NLP/LLM models.

    Lowercases the text, replaces HTML tags and URLs with a space, keeps only
    ``a-z``, ``0-9``, whitespace and the punctuation ``. , ! ? ' -``, then
    collapses runs of whitespace and trims the edges.

    Args:
        text: Raw text; any non-string value is treated as missing.

    Returns:
        str: The cleaned text, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""

    # Normalize to lowercase
    text = text.lower()

    # Replace HTML tags with a space *before* touching URLs. Substituting a
    # space keeps "good.<br />bad" from collapsing into "good.bad", and doing
    # it first keeps markup glued to a URL from being treated as part of it.
    # A tag must start with a letter, so things like "<3" are left alone.
    text = re.sub(r'</?[a-z][^<>]*>', ' ', text)

    # Strip out URLs: everything up to the next whitespace
    text = re.sub(r'https?://\S+', ' ', text)

    # Keep only alphanumeric characters and basic punctuation
    text = re.sub(r"[^a-z0-9\s.,!?'\-]", ' ', text)

    # Collapse multiple spaces into one, then clean up edges
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def create_balanced_dataset(filtered_df, category_name, dataset_path, max_entries=10000):
    """
    Build a rating-balanced sample for one category and save it as CSV.

    Steps: keep one review per ASIN, clean ``title`` and ``text`` with
    ``preprocess_text``, drop rows whose title or text ended up empty, then
    draw up to ``max_entries // <number of rating classes>`` rows from every
    rating (fixed ``random_state`` for reproducibility). The sample is only
    perfectly balanced, and only reaches ``max_entries``, when every rating has
    enough rows and ``max_entries`` divides evenly; otherwise a warning is
    printed and the smaller sample is kept.

    Args:
        filtered_df: DataFrame with at least the columns ``rating``, ``title``,
            ``text``, ``images``, ``asin``, ``user_id``, ``verified_purchase``.
        category_name: Used in log messages and in the output filename.
        dataset_path: Directory that receives ``<category_name>_balanced.csv``.
        max_entries: Target size of the balanced dataset.

    Returns:
        str: Path of the written CSV file.

    Raises:
        ValueError: If there are no reviews to sample from (before or after
            cleaning) or ``max_entries`` is smaller than the number of rating
            classes.
    """
    print(f"\nCreating balanced dataset for {category_name}...")

    if filtered_df.empty:
        raise ValueError(f"No filtered reviews available for {category_name}")

    # Work with a copy to avoid pandas warnings
    filtered_df = filtered_df.copy()

    # Remove duplicate products (same ASIN); the first review per ASIN is kept
    print(f"Original filtered dataset: {len(filtered_df):,} entries")
    filtered_df = filtered_df.drop_duplicates(subset=["asin"])
    print(f"After removing duplicate products: {len(filtered_df):,} entries")

    # Clean and normalize all text fields
    print("\nPreprocessing text fields...")
    filtered_df['title'] = filtered_df['title'].apply(preprocess_text)
    filtered_df['text'] = filtered_df['text'].apply(preprocess_text)
    print("Text preprocessing complete")

    # Drop any entries that became empty after preprocessing
    before_cleaning = len(filtered_df)
    filtered_df = filtered_df[
        (filtered_df['title'].str.strip() != '') &
        (filtered_df['text'].str.strip() != '')
    ].copy()
    after_cleaning = len(filtered_df)
    removed = before_cleaning - after_cleaning

    if removed > 0:
        print(f"Removed {removed:,} entries with empty title or text after preprocessing")
    print(f"Entries after text cleaning: {after_cleaning:,}")

    if after_cleaning == 0:
        raise ValueError(f"No reviews left for {category_name} after text cleaning")

    # Shuffle for random sampling
    filtered_df = filtered_df.sample(frac=1, random_state=42).reset_index(drop=True)
    print("Dataset shuffled for random sampling")

    # Calculate how many samples we need per rating.
    # tolist() yields plain Python numbers, so the log shows 1.0 rather than
    # np.float64(1.0) on recent NumPy versions.
    ratings = sorted(filtered_df["rating"].unique().tolist())
    print(f"\nRatings found: {ratings}")

    samples_per_rating = max_entries // len(ratings)
    if samples_per_rating < 1:
        raise ValueError(
            f"max_entries={max_entries} is smaller than the number of rating classes ({len(ratings)})"
        )
    print(f"Target samples per rating: {samples_per_rating}")
    print(f"Target total entries: {samples_per_rating * len(ratings)}")

    # Sample equally from each rating class
    balanced_samples = []

    for r in ratings:
        subset = filtered_df[filtered_df["rating"] == r]
        available = len(subset)
        to_sample = min(samples_per_rating, available)

        if to_sample < samples_per_rating:
            print(f"  Warning: Rating {r} only has {available:,} samples (need {samples_per_rating:,})")

        if to_sample > 0:
            sampled = subset.sample(n=to_sample, random_state=42)
            balanced_samples.append(sampled)

        print(f"  Rating {r}: {available:,} available, sampled {to_sample:,}")

    # Combine all rating samples
    balanced_df = pd.concat(balanced_samples, ignore_index=True)

    # Shuffle again to mix up the ratings
    balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)
    print(f"\nFinal balanced dataset created: {len(balanced_df):,} entries")

    # Select only the columns we need
    final_df = balanced_df[["rating", "title", "text", "images", "asin", "user_id", "verified_purchase"]].copy()

    # Show final distribution
    print("\nFinal rating distribution:")
    rating_counts = Counter(final_df["rating"])
    for r in sorted(rating_counts.keys()):
        print(f"  Rating {r}: {rating_counts[r]:,}")

    total_final = sum(rating_counts.values())
    print(f"\nTotal entries: {total_final:,}")

    if total_final == max_entries:
        print(f"Success: Exactly {max_entries:,} entries as required")
    else:
        print(f"Warning: Got {total_final:,} entries instead of {max_entries:,}")

    # Show a preview of the cleaned data
    print("\nSample of preprocessed data:")
    sample_row = final_df.iloc[0]
    print(f"  Rating: {sample_row['rating']}")
    print(f"  Title: {sample_row['title'][:80]}...")
    print(f"  Text: {sample_row['text'][:80]}...")

    # Save to CSV
    output_filename = f"{category_name}_balanced.csv"
    output_path = os.path.join(dataset_path, output_filename)
    final_df.to_csv(output_path, index=False)
    print(f"\nSaved balanced dataset to: {output_filename}")
    print("Ready for NLP/LLM processing")

    return output_path

# Main Execution: Process All Categories

Define the URLs and process each category.

In [5]:
# Download URLs of the 5 review categories (UCSD McAuley Lab, Amazon Reviews 2023).
# Every archive lives in the same folder, so only the category name varies.
urls = [
    f"https://mcauleylab.ucsd.edu/public_datasets/data/amazon_2023/raw/review_categories/{name}.jsonl.gz"
    for name in ("Electronics", "Amazon_Fashion", "Books", "Automotive", "Video_Games")
]

print(f"Dataset path: {dataset_path}")
print(f"Processing {len(urls)} categories...\n")

Dataset path: /content/gdrive/My Drive/Projects/amazon-reviews/Datasets/
Processing 5 categories...



## Configuration

In [6]:
# Configuration for processing large datasets
# Adjust based on your Colab tier and available memory

# Number of reviews to process at once (lower = less memory, slower)
CHUNK_SIZE = 100000  # Free tier: 50000-100000, Pro: 200000+

# Max reviews to collect per category before stopping
# Set to None to process entire file (recommended for L4 GPU)
MAX_FILTERED_PER_CATEGORY = None  # Process all data to ensure balanced dataset

# Target size for final balanced dataset
MAX_BALANCED_ENTRIES = 10000

print("Configuration:")
print(f"  Chunk size: {CHUNK_SIZE:,} reviews")
# The filter step treats any falsy limit (None or 0) as "no limit", so the
# summary uses the same truthiness test instead of checking for None only.
print(f"  Max filtered per category: {'No limit (process entire file)' if not MAX_FILTERED_PER_CATEGORY else f'{MAX_FILTERED_PER_CATEGORY:,} reviews'}")
print(f"  Max balanced entries: {MAX_BALANCED_ENTRIES:,} reviews")

# Only claim whole-file processing when no early-stop limit is configured
if not MAX_FILTERED_PER_CATEGORY:
    print("\nNote: Processing entire files to ensure enough samples for rare ratings (2.0, 3.0)")
    print("This may take longer but guarantees a truly balanced dataset.")
else:
    print("\nNote: Early stopping is enabled, so rare ratings (2.0, 3.0) may end up with too few samples.")
    print("Set MAX_FILTERED_PER_CATEGORY = None to process entire files.")

Configuration:
  Chunk size: 100,000 reviews
  Max filtered per category: No limit (process entire file)
  Max balanced entries: 10,000 reviews

Note: Processing entire files to ensure enough samples for rare ratings (2.0, 3.0)
This may take longer but guarantees a truly balanced dataset.


## Step 1: Download and Extract Files

In [7]:
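# NOTE: this cell is disabled. Uncomment it to download and extract all datasets;
# Step 2 below reads the extracted .jsonl files directly from dataset_path.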
# jsonl_paths = []

# for url in urls:
#     jsonl_path = download_and_extract(url, dataset_path)
#     jsonl_paths.append(jsonl_path)

# print(f"\n{'='*60}")
# print(f"All {len(jsonl_paths)} datasets downloaded and extracted")
# print(f"{'='*60}")

## Step 2: Filter Reviews and Collect Statistics

In [8]:
# Filter every category's JSONL file and gather per-category statistics.
# The extracted files are expected to sit directly inside dataset_path.
jsonl_paths = [
    dataset_path + f"{name}.jsonl"
    for name in ("Amazon_Fashion", "Automotive", "Books", "Electronics", "Video_Games")
]

filtered_data = []   # one {'category', 'df'} record per category
statistics = []      # one summary row per category

for jsonl_path in jsonl_paths:
    category_name = os.path.basename(jsonl_path).replace('.jsonl', '')
    filtered_df, total, filtered = filter_and_count_reviews(
        jsonl_path, chunk_size=CHUNK_SIZE, max_filtered=MAX_FILTERED_PER_CATEGORY
    )

    filtered_data.append({'category': category_name, 'df': filtered_df})
    statistics.append({
        'Category': category_name,
        'Total Reviews': total,
        'Filtered Reviews': filtered,
        'Retention %': f"{filtered / total * 100:.2f}%",
    })

    # Ask the garbage collector to reclaim what the last file left behind
    gc.collect()

# Display summary statistics
stats_df = pd.DataFrame(statistics)
print(f"\n{'=' * 80}")
print("SUMMARY STATISTICS")
print('=' * 80)
print(stats_df.to_string(index=False))
print('=' * 80)


Processing: Amazon_Fashion
Loading and filtering reviews in chunks...
  Chunk 1: Processed 100,000 total | Filtered so far: 4,350
  Chunk 2: Processed 200,000 total | Filtered so far: 9,537
  Chunk 3: Processed 300,000 total | Filtered so far: 13,920
  Chunk 4: Processed 400,000 total | Filtered so far: 18,454
  Chunk 5: Processed 500,000 total | Filtered so far: 23,422
  Chunk 6: Processed 600,000 total | Filtered so far: 27,938
  Chunk 7: Processed 700,000 total | Filtered so far: 32,550
  Chunk 8: Processed 800,000 total | Filtered so far: 37,138
  Chunk 9: Processed 900,000 total | Filtered so far: 41,751
  Chunk 10: Processed 1,000,000 total | Filtered so far: 47,289
  Chunk 11: Processed 1,100,000 total | Filtered so far: 52,641
  Chunk 12: Processed 1,200,000 total | Filtered so far: 58,268
  Chunk 13: Processed 1,300,000 total | Filtered so far: 63,593
  Chunk 14: Processed 1,400,000 total | Filtered so far: 69,369
  Chunk 15: Processed 1,500,000 total | Filtered so far: 75,12

## Step 3: Create Balanced Datasets (Max 10,000 entries each)

In [9]:
# Build and save one balanced CSV per filtered category
output_paths = []

for data in filtered_data:
    category, df = data['category'], data['df']
    output_path = create_balanced_dataset(
        df, category, dataset_path, max_entries=MAX_BALANCED_ENTRIES
    )
    output_paths.append(output_path)

# Final report: list the files that were written
print(f"\n{'=' * 80}")
print("ALL DATASETS CREATED SUCCESSFULLY")
print('=' * 80)
print(f"\nCreated {len(output_paths)} balanced datasets:")
for path in output_paths:
    print(f"  {os.path.basename(path)}")


Creating balanced dataset for Amazon_Fashion...
Original filtered dataset: 133,984 entries
After removing duplicate products: 98,464 entries

Preprocessing text fields...
Text preprocessing complete
Removed 269 entries with empty title or text after preprocessing
Entries after text cleaning: 98,195
Dataset shuffled for random sampling

Ratings found: [np.float64(1.0), np.float64(2.0), np.float64(3.0), np.float64(4.0), np.float64(5.0)]
Target samples per rating: 2000
Target total entries: 10000
  Rating 1.0: 16,129 available, sampled 2,000
  Rating 2.0: 6,946 available, sampled 2,000
  Rating 3.0: 7,696 available, sampled 2,000
  Rating 4.0: 12,549 available, sampled 2,000
  Rating 5.0: 54,875 available, sampled 2,000

Final balanced dataset created: 10,000 entries

Final rating distribution:
  Rating 1.0: 2,000
  Rating 2.0: 2,000
  Rating 3.0: 2,000
  Rating 4.0: 2,000
  Rating 5.0: 2,000

Total entries: 10,000
Success: Exactly 10,000 entries as required

Sample of preprocessed data: