# 📊 Notebook 1: Data Loading & Preprocessing
## Final Project - Ordinal vs Nominal Sentiment Analysis
### Atharv Chaudhary

---

**Purpose:** Load the Amazon Electronics Reviews dataset, clean it, and save the result for use by the other notebooks in this project.

**Output:** `amazon_electronics_cleaned.csv`

In [None]:
# Import libraries
import gzip
import json
import os
import urllib.request
import warnings

import numpy as np
import pandas as pd
warnings.filterwarnings('ignore')

print("‚úÖ Libraries imported")

## Step 1: Load Dataset

In [None]:
# ============================================================================
# CONFIGURATION
# ============================================================================
# Settings a reader may want to tune before running the rest of the notebook.

RANDOM_STATE = 42    # NOTE(review): not referenced by any other cell visible in this notebook
SAMPLE_SIZE = 50000  # Number of reviews to load (increase for final run)

# Echo the active settings so they are recorded alongside the run's output.
config_report = [
    "Configuration:",
    f"  Sample size: {SAMPLE_SIZE:,}",
    f"  Random state: {RANDOM_STATE}",
]
print("\n".join(config_report))

In [None]:
# ============================================================================
# LOAD AMAZON ELECTRONICS REVIEWS (2014 Dataset - Reliable)
# ============================================================================
# Fetches the gzipped JSON-lines archive (skipped when a local copy already
# exists), then parses the first SAMPLE_SIZE records into `df_raw`.
#
# NOTE(review): the sample is the first SAMPLE_SIZE lines of the file, not a
# random draw, and RANDOM_STATE is not used here - confirm this is intended.

print("=" * 70)
print("LOADING AMAZON ELECTRONICS REVIEWS DATASET")
print("Source: UCSD McAuley Lab (Stanford SNAP)")
print("=" * 70)

url = "http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz"
local_path = 'electronics.json.gz'  # delete this file to force a fresh download

# The whole archive is downloaded; only the first SAMPLE_SIZE records are parsed.
print(f"\nLoading the first {SAMPLE_SIZE:,} reviews (the full archive is downloaded on first run)...")
print("‚è≥ This may take 1-2 minutes...\n")

try:
    if os.path.exists(local_path):
        print(f"  Using previously downloaded file: {local_path}")
    else:
        # Download under a temporary name and rename on success, so an
        # interrupted transfer is never mistaken for a complete archive.
        partial_path = local_path + '.part'
        urllib.request.urlretrieve(url, partial_path)
        os.replace(partial_path, local_path)

    # Load JSONL.gz file (one JSON object per line)
    reviews = []
    with gzip.open(local_path, 'rt', encoding='utf-8') as f:
        for line in f:
            if len(reviews) >= SAMPLE_SIZE:
                break
            if not line.strip():
                continue  # tolerate blank lines instead of failing in json.loads
            reviews.append(json.loads(line))
            loaded = len(reviews)
            if loaded % 10000 == 0 and loaded < SAMPLE_SIZE:
                print(f"  Loaded {loaded:,} reviews...")

    if not reviews:
        raise ValueError(f"No reviews found in {local_path}")

    df_raw = pd.DataFrame(reviews)
    print(f"\n‚úÖ SUCCESS! Loaded {len(df_raw):,} reviews")
    print(f"   Columns: {df_raw.columns.tolist()}")

except Exception as e:
    print(f"‚ùå Error: {e}")
    # Clear any stale frame from an earlier run, then fail here rather than
    # letting later cells crash on a missing/None `df_raw`.
    df_raw = None
    raise

In [None]:
# Show the first three raw records to eyeball the column layout.
print("\nüìã Raw Data Preview:")
df_raw.head(n=3)

## Step 2: Data Cleaning

In [None]:
# ============================================================================
# DATA CLEANING
# ============================================================================
# Builds the cleaned frame `df` (columns: text, rating) from `df_raw`:
#   1. drop rows with a missing text or rating
#   2. drop reviews shorter than MIN_TEXT_CHARS characters (padding ignored)
#   3. keep only whole-number ratings from 1 to 5, stored as int
# Each step derives a new frame; nothing is assigned into a filtered slice.

MIN_TEXT_CHARS = 10  # shortest review, after trimming whitespace, that is kept

print("=" * 70)
print("DATA CLEANING")
print("=" * 70)

# Rename columns to standard names
df = df_raw.rename(columns={
    'overall': 'rating',
    'reviewText': 'text'
})

print(f"\nOriginal size: {len(df):,} reviews")

# Keep only relevant columns
df = df[['text', 'rating']]

# Step 1: Remove missing values
missing_before = df.isna().sum()
print(f"\nMissing values: text={missing_before['text']}, rating={missing_before['rating']}")
df = df.dropna(subset=['text', 'rating'])
print(f"After removing nulls: {len(df):,} reviews")

# Step 2: Remove very short reviews. Length is measured on the trimmed text so
# whitespace padding cannot push an empty review over the threshold.
text_chars = df['text'].str.strip().str.len()
df = df[text_chars >= MIN_TEXT_CHARS]
print(f"After removing short reviews (<{MIN_TEXT_CHARS} chars): {len(df):,} reviews")

# Steps 3-4: Validate ratings, then convert to integer. Validation runs before
# the cast so a fractional value (e.g. 4.5) is dropped rather than truncated
# into a neighbouring class; non-numeric values are coerced to NaN and dropped.
ratings = pd.to_numeric(df['rating'], errors='coerce')
is_valid_rating = ratings.between(1, 5) & (ratings % 1 == 0)
df = df.loc[is_valid_rating].assign(rating=ratings.loc[is_valid_rating].astype(int))
print(f"After rating validation: {len(df):,} reviews")

# Reset index
df = df.reset_index(drop=True)

# Invariants the downstream notebooks rely on.
assert list(df.columns) == ['text', 'rating']
assert not df.isna().any().any(), "cleaned data still contains missing values"
assert df['rating'].between(1, 5).all(), "rating outside 1-5 after cleaning"

print(f"\n‚úÖ Final cleaned dataset: {len(df):,} reviews")

In [None]:
# Show the first five cleaned rows (text + integer rating).
print("\nüìã Cleaned Data Preview:")
df.head(n=5)

## Step 3: Class Distribution

In [None]:
# ============================================================================
# CLASS DISTRIBUTION
# ============================================================================
# Counts reviews per star rating and prints each class with its share of the
# dataset and a text bar (one block character per 2 percentage points).

print("=" * 70)
print("CLASS DISTRIBUTION")
print("=" * 70)

# Sorted by rating value so classes print in ascending star order.
rating_counts = df['rating'].value_counts().sort_index()
total_reviews = len(df)

print("\nüìä Rating Distribution:")
for stars, n_reviews in rating_counts.items():
    share = n_reviews / total_reviews * 100
    blocks = '‚ñà' * int(share / 2)
    print(f"   {stars} ‚≠ê: {n_reviews:>6,} ({share:>5.1f}%) {blocks}")

print(f"\n   Total: {total_reviews:,} reviews")

In [None]:
# Check class imbalance: compare the largest and smallest rating classes.
# Only ratings present in the data appear in `rating_counts`, so a class with
# zero reviews is not reflected in the ratio.
print("\n‚ö†Ô∏è Class Imbalance Analysis:")
if rating_counts.empty:
    print("   No ratings available - skipping imbalance analysis")
else:
    # Look up which rating is actually the majority instead of assuming 5-star.
    majority_rating = rating_counts.idxmax()
    majority_class = rating_counts.max()
    minority_class = rating_counts.min()
    imbalance_ratio = majority_class / minority_class

    print(f"   Majority class ({majority_rating}-star): {majority_class:,}")
    print(f"   Minority class: {minority_class:,}")
    print(f"   Imbalance ratio: {imbalance_ratio:.1f}:1")

## Step 4: Text Statistics

In [None]:
# ============================================================================
# TEXT STATISTICS
# ============================================================================
# Prints min / max / mean / median of review length in characters and in
# whitespace-separated words. The measurements live in standalone Series, so
# `df` is never modified and cannot end up with helper columns if this cell is
# interrupted before finishing.

print("=" * 70)
print("TEXT STATISTICS")
print("=" * 70)

# Calculate text length
text_length = df['text'].str.len()
word_count = df['text'].str.split().str.len()

for heading, values in (
    ("\nüìè Review Length (characters):", text_length),
    ("\nüìù Word Count:", word_count),
):
    print(heading)
    print(f"   Min:    {values.min():,}")
    print(f"   Max:    {values.max():,}")
    print(f"   Mean:   {values.mean():,.0f}")
    print(f"   Median: {values.median():,.0f}")

## Step 5: Save Cleaned Data

In [None]:
# ============================================================================
# SAVE CLEANED DATA
# ============================================================================
# Writes `df` to CSV, then reloads the file and checks that it matches what
# was written (row count and column names) instead of only printing a count.

print("=" * 70)
print("SAVING DATA")
print("=" * 70)

# Save to CSV
output_file = 'amazon_electronics_cleaned.csv'
df.to_csv(output_file, index=False)

print(f"\n‚úÖ Saved to: {output_file}")
print(f"   Rows: {len(df):,}")
print(f"   Columns: {list(df.columns)}")

# Verify save
df_verify = pd.read_csv(output_file)
if len(df_verify) != len(df) or list(df_verify.columns) != list(df.columns):
    raise RuntimeError(
        f"Saved file does not match the cleaned data: "
        f"expected {len(df):,} rows with columns {list(df.columns)}, "
        f"got {len(df_verify):,} rows with columns {list(df_verify.columns)}"
    )

# read_csv treats some literal strings (e.g. 'NA') as missing by default, so
# report any values that did not survive the round trip.
missing_after_reload = int(df_verify.isna().sum().sum())
if missing_after_reload:
    print(f"   Warning: {missing_after_reload:,} values were read back as missing")

print(f"\n‚úÖ Verified: Loaded {len(df_verify):,} rows from saved file")

In [None]:
# Download for Google Colab
# Only a failed import means "not running in Colab". Errors raised by the
# download itself are allowed to surface instead of being reported as
# "Not in Colab".
try:
    from google.colab import files
except ImportError:
    in_colab = False
else:
    in_colab = True

if in_colab:
    files.download(output_file)
    print("üì• Download started...")
else:
    print("Not in Colab - file saved locally")

---
## ✅ Summary

**Data loaded and cleaned!**

| Metric | Value |
|--------|-------|
| Total reviews | See output above |
| Columns | text, rating |
| Rating range | 1-5 |
| Output file | amazon_electronics_cleaned.csv |

**Next:** Run `2_EDA_Visualization.ipynb`

In [None]:
# Final summary: closing banner with the dataset, its source, the number of
# cleaned reviews, and the file written by this notebook.
banner = "=" * 70
summary_lines = [
    "\n" + banner,
    "üìã NOTEBOOK 1 COMPLETE",
    banner,
    "\nDataset: Amazon Electronics Reviews",
    "Source: UCSD McAuley Lab",
    f"Reviews: {len(df):,}",
    f"Output: {output_file}",
    "\n‚Üí Next: Run Notebook 2 (EDA & Visualization)",
]
for summary_line in summary_lines:
    print(summary_line)