In [2]:
# Import libraries
import pandas as pd
import numpy as np
import gzip
import json
import urllib.request
import warnings
warnings.filterwarnings('ignore')

# Try to import tqdm for progress bars (optional)
try:
    from tqdm import tqdm
    HAS_TQDM = True
except ImportError:
    HAS_TQDM = False
    print("üí° Tip: Install tqdm for progress bars: pip install tqdm")

print("‚úÖ Libraries imported")

üí° Tip: Install tqdm for progress bars: pip install tqdm
‚úÖ Libraries imported


In [3]:
# ============================================================================
# CONFIGURATION
# ============================================================================

SAMPLE_SIZE = 50000  # Number of reviews to load (increase for final run)
RANDOM_STATE = 42

print(f"Configuration:")
print(f"  Sample size: {SAMPLE_SIZE:,}")
print(f"  Random state: {RANDOM_STATE}")

Configuration:
  Sample size: 50,000
  Random state: 42


In [None]:
# ============================================================================
# LOAD AMAZON ELECTRONICS REVIEWS (2014 Dataset - Reliable)
# ============================================================================

print("=" * 70)
print("LOADING AMAZON ELECTRONICS REVIEWS DATASET")
print("Source: UCSD McAuley Lab (Stanford SNAP)")
print("=" * 70)

url = "http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz"

print(f"\nDownloading {SAMPLE_SIZE:,} reviews...")
print("‚è≥ This may take 1-2 minutes...\n")

try:
    import time
    from tqdm import tqdm
    
    # Download file with progress
    start_time = time.time()
    print("üì• Downloading dataset...")
    urllib.request.urlretrieve(url, 'data/electronics.json.gz')
    download_time = time.time() - start_time
    print(f"‚úÖ Download complete ({download_time:.1f}s)")
    
    # Load JSONL.gz file with progress bar
    print("\nüìñ Loading reviews...")
    reviews = []
    start_time = time.time()
    
    with gzip.open('data/electronics.json.gz', 'rt', encoding='utf-8') as f:
        # Use tqdm for progress bar if available, otherwise fallback
        try:
            for i, line in enumerate(tqdm(f, total=SAMPLE_SIZE, desc="Loading", unit="reviews")):
                if i >= SAMPLE_SIZE:
                    break
                try:
                    review = json.loads(line)
                    # Validate required fields
                    if 'overall' in review and 'reviewText' in review:
                        reviews.append(review)
                except json.JSONDecodeError:
                    continue  # Skip malformed lines
        except:
            # Fallback without tqdm
            for i, line in enumerate(f):
                if i >= SAMPLE_SIZE:
                    break
                if i % 10000 == 0 and i > 0:
                    print(f"  Loaded {i:,} reviews...")
                try:
                    review = json.loads(line)
                    if 'overall' in review and 'reviewText' in review:
                        reviews.append(review)
                except json.JSONDecodeError:
                    continue
    
    load_time = time.time() - start_time
    df_raw = pd.DataFrame(reviews)
    
    print(f"\n‚úÖ SUCCESS! Loaded {len(df_raw):,} reviews in {load_time:.1f}s")
    print(f"   Columns: {df_raw.columns.tolist()}")
    print(f"   Memory usage: {df_raw.memory_usage(deep=True).sum() / 1024**2:.1f} MB")
    
except ImportError:
    # Fallback if tqdm not available
    print("‚ö†Ô∏è  tqdm not available, using basic progress...")
    try:
        urllib.request.urlretrieve(url, 'data/electronics.json.gz')
        reviews = []
        with gzip.open('data/electronics.json.gz', 'rt', encoding='utf-8') as f:
            for i, line in enumerate(f):
                if i >= SAMPLE_SIZE:
                    break
                if i % 10000 == 0 and i > 0:
                    print(f"  Loaded {i:,} reviews...")
                try:
                    review = json.loads(line)
                    if 'overall' in review and 'reviewText' in review:
                        reviews.append(review)
                except json.JSONDecodeError:
                    continue
        df_raw = pd.DataFrame(reviews)
        print(f"\n‚úÖ SUCCESS! Loaded {len(df_raw):,} reviews")
        print(f"   Columns: {df_raw.columns.tolist()}")
    except Exception as e:
        print(f"‚ùå Error: {e}")
        df_raw = None
except Exception as e:
    print(f"‚ùå Error: {e}")
    import traceback
    traceback.print_exc()
    df_raw = None

LOADING AMAZON ELECTRONICS REVIEWS DATASET
Source: UCSD McAuley Lab (Stanford SNAP)

Downloading 50,000 reviews...
‚è≥ This may take 1-2 minutes...

‚ö†Ô∏è  tqdm not available, using basic progress...
  Loaded 10,000 reviews...
  Loaded 20,000 reviews...
  Loaded 30,000 reviews...
  Loaded 40,000 reviews...

‚úÖ SUCCESS! Loaded 50,000 reviews
   Columns: ['reviewerID', 'asin', 'reviewerName', 'helpful', 'reviewText', 'overall', 'summary', 'unixReviewTime', 'reviewTime']


In [5]:
# Preview raw data
print("\nüìã Raw Data Preview:")
df_raw.head(3)


üìã Raw Data Preview:


Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,AO94DHGC771SJ,528881469,amazdnu,"[0, 0]",We got this GPS for my husband who is an (OTR)...,5.0,Gotta have GPS!,1370131200,"06 2, 2013"
1,AMO214LNFCEI4,528881469,Amazon Customer,"[12, 15]","I'm a professional OTR truck driver, and I bou...",1.0,Very Disappointed,1290643200,"11 25, 2010"
2,A3N7T0DY83Y4IG,528881469,C. A. Freeman,"[43, 45]","Well, what can I say. I've had this unit in m...",3.0,1st impression,1283990400,"09 9, 2010"


Step 2: Data Cleaning

In [6]:
# ============================================================================
# DATA CLEANING
# ============================================================================

print("=" * 70)
print("DATA CLEANING")
print("=" * 70)

# Validate input
if df_raw is None or len(df_raw) == 0:
    raise ValueError("‚ùå No data loaded! Check previous cell for errors.")

# Rename columns to standard names
df = df_raw.rename(columns={
    'overall': 'rating',
    'reviewText': 'text'
}).copy()

print(f"\nOriginal size: {len(df):,} reviews")

# Keep only relevant columns (with error handling)
required_cols = ['text', 'rating']
missing_cols = [col for col in required_cols if col not in df.columns]
if missing_cols:
    raise ValueError(f"‚ùå Missing required columns: {missing_cols}")

df = df[required_cols].copy()

# Step 1: Remove missing values
missing_before = df.isna().sum()
print(f"\nüìä Missing values:")
print(f"   text: {missing_before['text']:,} ({missing_before['text']/len(df)*100:.2f}%)")
print(f"   rating: {missing_before['rating']:,} ({missing_before['rating']/len(df)*100:.2f}%)")

df = df.dropna(subset=['text', 'rating'])
print(f"‚úÖ After removing nulls: {len(df):,} reviews ({len(df)/len(df_raw)*100:.1f}% retained)")

# Step 2: Remove very short reviews
short_reviews = (df['text'].str.len() < 10).sum()
print(f"\nüìè Short reviews (<10 chars): {short_reviews:,}")
df = df[df['text'].str.len() >= 10]
print(f"‚úÖ After removing short reviews: {len(df):,} reviews")

# Step 3: Convert rating to integer (with validation)
try:
    df['rating'] = pd.to_numeric(df['rating'], errors='coerce')
    invalid_ratings = df['rating'].isna().sum()
    if invalid_ratings > 0:
        print(f"‚ö†Ô∏è  Found {invalid_ratings:,} invalid ratings (non-numeric)")
        df = df.dropna(subset=['rating'])
    df['rating'] = df['rating'].astype(int)
except Exception as e:
    print(f"‚ùå Error converting ratings: {e}")
    raise

# Step 4: Verify ratings are 1-5
invalid_range = (~df['rating'].between(1, 5)).sum()
if invalid_range > 0:
    print(f"‚ö†Ô∏è  Found {invalid_range:,} ratings outside 1-5 range")
df = df[df['rating'].between(1, 5)]
print(f"‚úÖ After rating validation: {len(df):,} reviews")

# Step 5: Remove duplicate reviews (optional but recommended)
duplicates = df.duplicated(subset=['text']).sum()
if duplicates > 0:
    print(f"\nüîÑ Found {duplicates:,} duplicate reviews")
    df = df.drop_duplicates(subset=['text'], keep='first')
    print(f"‚úÖ After removing duplicates: {len(df):,} reviews")

# Reset index
df = df.reset_index(drop=True)

# Final validation
print(f"\n{'='*70}")
print(f"‚úÖ Final cleaned dataset: {len(df):,} reviews")
print(f"   Data retention: {len(df)/len(df_raw)*100:.1f}%")
print(f"   Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")

DATA CLEANING

Original size: 50,000 reviews

üìä Missing values:
   text: 0 (0.00%)
   rating: 0 (0.00%)
‚úÖ After removing nulls: 50,000 reviews (100.0% retained)

üìè Short reviews (<10 chars): 40
‚úÖ After removing short reviews: 49,960 reviews
‚úÖ After rating validation: 49,960 reviews

üîÑ Found 7 duplicate reviews
‚úÖ After removing duplicates: 49,953 reviews

‚úÖ Final cleaned dataset: 49,953 reviews
   Data retention: 99.9%
   Memory usage: 30.6 MB


In [7]:
# Preview cleaned data
print("\nüìã Cleaned Data Preview:")
df.head()


üìã Cleaned Data Preview:


Unnamed: 0,text,rating
0,We got this GPS for my husband who is an (OTR)...,5
1,"I'm a professional OTR truck driver, and I bou...",1
2,"Well, what can I say. I've had this unit in m...",3
3,"Not going to write a long review, even thought...",2
4,I've had mine for a year and here's what we go...,1


Step 3: Class Distribution

In [8]:
# ============================================================================
# CLASS DISTRIBUTION
# ============================================================================

print("=" * 70)
print("CLASS DISTRIBUTION")
print("=" * 70)

rating_counts = df['rating'].value_counts().sort_index()

print("\nüìä Rating Distribution:")
for rating, count in rating_counts.items():
    pct = count / len(df) * 100
    bar = '‚ñà' * int(pct / 2)
    print(f"   {rating} ‚≠ê: {count:>6,} ({pct:>5.1f}%) {bar}")

print(f"\n   Total: {len(df):,} reviews")

CLASS DISTRIBUTION

üìä Rating Distribution:
   1 ‚≠ê:  2,835 (  5.7%) ‚ñà‚ñà
   2 ‚≠ê:  2,160 (  4.3%) ‚ñà‚ñà
   3 ‚≠ê:  3,963 (  7.9%) ‚ñà‚ñà‚ñà
   4 ‚≠ê: 10,101 ( 20.2%) ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà
   5 ‚≠ê: 30,894 ( 61.8%) ‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà

   Total: 49,953 reviews


In [9]:
# Check class imbalance
print("\n‚ö†Ô∏è Class Imbalance Analysis:")
majority_class = rating_counts.max()
minority_class = rating_counts.min()
imbalance_ratio = majority_class / minority_class

print(f"   Majority class (5-star): {majority_class:,}")
print(f"   Minority class: {minority_class:,}")
print(f"   Imbalance ratio: {imbalance_ratio:.1f}:1")


‚ö†Ô∏è Class Imbalance Analysis:
   Majority class (5-star): 30,894
   Minority class: 2,160
   Imbalance ratio: 14.3:1


Step 4: Text Statistics

In [10]:
# ============================================================================
# TEXT STATISTICS
# ============================================================================

print("=" * 70)
print("TEXT STATISTICS")
print("=" * 70)

# Calculate text length
df['text_length'] = df['text'].str.len()
df['word_count'] = df['text'].str.split().str.len()

print("\nüìè Review Length (characters):")
print(f"   Min:    {df['text_length'].min():,}")
print(f"   Max:    {df['text_length'].max():,}")
print(f"   Mean:   {df['text_length'].mean():,.0f}")
print(f"   Median: {df['text_length'].median():,.0f}")

print("\nüìù Word Count:")
print(f"   Min:    {df['word_count'].min():,}")
print(f"   Max:    {df['word_count'].max():,}")
print(f"   Mean:   {df['word_count'].mean():,.0f}")
print(f"   Median: {df['word_count'].median():,.0f}")

# Drop helper columns before saving
df = df.drop(columns=['text_length', 'word_count'])

TEXT STATISTICS

üìè Review Length (characters):
   Min:    10
   Max:    15,567
   Mean:   578
   Median: 337

üìù Word Count:
   Min:    2
   Max:    2,845
   Mean:   105
   Median: 63


Step 5: Save Cleaned Data

In [None]:
# ============================================================================
# SAVE CLEANED DATA
# ============================================================================

print("=" * 70)
print("SAVING DATA")
print("=" * 70)

# Save to CSV
output_file = 'data/amazon_electronics_cleaned.csv'
df.to_csv(output_file, index=False)

print(f"\n‚úÖ Saved to: {output_file}")
print(f"   Rows: {len(df):,}")
print(f"   Columns: {list(df.columns)}")

# Verify save
df_verify = pd.read_csv(output_file)
print(f"\n‚úÖ Verified: Loaded {len(df_verify):,} rows from saved file")

SAVING DATA

‚úÖ Saved to: amazon_electronics_cleaned.csv
   Rows: 49,953
   Columns: ['text', 'rating']

‚úÖ Verified: Loaded 49,953 rows from saved file


In [12]:
# Download for Google Colab
try:
    from google.colab import files
    files.download(output_file)
    print("üì• Download started...")
except:
    print("Not in Colab - file saved locally")

Not in Colab - file saved locally


In [13]:
# Final summary
print("\n" + "=" * 70)
print("üìã NOTEBOOK 1 COMPLETE")
print("=" * 70)
print(f"\nDataset: Amazon Electronics Reviews")
print(f"Source: UCSD McAuley Lab")
print(f"Reviews: {len(df):,}")
print(f"Output: {output_file}")


üìã NOTEBOOK 1 COMPLETE

Dataset: Amazon Electronics Reviews
Source: UCSD McAuley Lab
Reviews: 49,953
Output: amazon_electronics_cleaned.csv
