In [1]:
# SIMPLE DATASET LOADING SCRIPT
# Try each method until one works!

import pandas as pd
import numpy as np

print("=" * 70)
print("LOADING AMAZON ELECTRONICS REVIEWS DATASET")
print("=" * 70)

# ============================================================================
# METHOD 1: HuggingFace Datasets (RECOMMENDED - Most Reliable)
# ============================================================================
print("\n[METHOD 1] Trying HuggingFace Datasets...")
print("Run this first: pip install datasets")
print("-" * 70)

try:
    from datasets import load_dataset
    
    print("Loading Electronics reviews (first 10,000)...")
    print("This may take 2-3 minutes on first download...\n")
    
    dataset = load_dataset(
        "McAuley-Lab/Amazon-Reviews-2023", 
        "raw_review_Electronics",
        split="full[:10000]",  # Load first 10k reviews
        trust_remote_code=True
    )
    
    # Convert to pandas DataFrame
    df = pd.DataFrame(dataset)
    
    print("‚úÖ SUCCESS! Dataset loaded via HuggingFace")
    print(f"   Shape: {df.shape}")
    print(f"   Columns: {df.columns.tolist()}")
    print("\nFirst 3 reviews:")
    print(df.head(3))
    
    # Show class distribution
    if 'rating' in df.columns:
        print("\nRating Distribution:")
        print(df['rating'].value_counts().sort_index())
    
except ImportError:
    print("‚ùå 'datasets' package not installed")
    print("   Install with: pip install datasets")
    df = None
    
except Exception as e:
    print(f"‚ùå Error: {e}")
    df = None

# ============================================================================
# METHOD 2: Kaggle API (If Method 1 fails)
# ============================================================================
if df is None:
    print("\n" + "=" * 70)
    print("[METHOD 2] Trying Kaggle API...")
    print("Run this first: pip install kaggle")
    print("-" * 70)
    
    try:
        import kaggle
        import zipfile
        import json
        import os
        
        print("Downloading dataset from Kaggle...")
        kaggle.api.dataset_download_files(
            'wajahat1064/amazon-reviews-data-2023',
            path='./data',
            unzip=True
        )
        
        # Find Electronics file
        data_dir = './data'
        files = os.listdir(data_dir)
        print(f"Available files: {files}")
        
        # Try to find Electronics file
        electronics_file = None
        for f in files:
            if 'Electronics' in f or 'electronics' in f:
                electronics_file = f
                break
        
        if electronics_file:
            file_path = os.path.join(data_dir, electronics_file)
            print(f"Loading {electronics_file}...")
            
            # Load JSONL file
            reviews = []
            with open(file_path, 'r') as f:
                for i, line in enumerate(f):
                    if i >= 10000:  # Limit to 10k
                        break
                    reviews.append(json.loads(line))
            
            df = pd.DataFrame(reviews)
            print(f"‚úÖ SUCCESS! Loaded {len(df)} reviews")
            print(f"   Columns: {df.columns.tolist()}")
            print("\nFirst 3 reviews:")
            print(df.head(3))
        else:
            print("‚ùå Could not find Electronics file")
            df = None
            
    except ImportError:
        print("‚ùå 'kaggle' package not installed")
        print("   Install with: pip install kaggle")
        df = None
    except Exception as e:
        print(f"‚ùå Error: {e}")
        df = None

# ============================================================================
# METHOD 3: Old Amazon Dataset from UCSD (Backup - smaller but works!)
# ============================================================================
if df is None:
    print("\n" + "=" * 70)
    print("[METHOD 3] Trying Old Amazon Dataset (2014)...")
    print("This is smaller but reliable!")
    print("-" * 70)
    
    try:
        import gzip
        import json
        import urllib.request
        
        url = "http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz"
        
        print("Downloading from UCSD (may take 1-2 minutes)...")
        
        # Download file
        urllib.request.urlretrieve(url, 'electronics.json.gz')
        
        # Load JSONL.gz file
        reviews = []
        with gzip.open('electronics.json.gz', 'rt', encoding='utf-8') as f:
            for i, line in enumerate(f):
                if i >= 10000:  # Limit to 10k
                    break
                reviews.append(json.loads(line))
        
        df = pd.DataFrame(reviews)
        print(f"‚úÖ SUCCESS! Loaded {len(df)} reviews from 2014 dataset")
        print(f"   Columns: {df.columns.tolist()}")
        print("\nFirst 3 reviews:")
        print(df.head(3))
        
        # Show rating distribution
        if 'overall' in df.columns:
            print("\nRating Distribution:")
            print(df['overall'].value_counts().sort_index())
        
    except Exception as e:
        print(f"‚ùå Error: {e}")
        df = None

# ============================================================================
# SUMMARY
# ============================================================================
print("\n" + "=" * 70)
if df is not None:
    print("‚úÖ DATASET LOADED SUCCESSFULLY!")
    print("=" * 70)
    print(f"Total reviews: {len(df)}")
    print(f"Columns: {list(df.columns)}")
    print("\nDataFrame 'df' is ready to use!")
    print("\nNext: Run the analysis script to get preliminary results")
else:
    print("‚ùå ALL METHODS FAILED")
    print("=" * 70)
    print("\nQuick Fix - Install datasets package:")
    print("  !pip install datasets")
    print("\nThen run this script again!")

LOADING AMAZON ELECTRONICS REVIEWS DATASET

[METHOD 1] Trying HuggingFace Datasets...
Run this first: pip install datasets
----------------------------------------------------------------------


  from .autonotebook import tqdm as notebook_tqdm
`trust_remote_code` is not supported anymore.
Please check that the Hugging Face dataset 'McAuley-Lab/Amazon-Reviews-2023' isn't based on a loading script and remove `trust_remote_code`.
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.


Loading Electronics reviews (first 10,000)...
This may take 2-3 minutes on first download...

‚ùå Error: Dataset scripts are no longer supported, but found Amazon-Reviews-2023.py

[METHOD 2] Trying Kaggle API...
Run this first: pip install kaggle
----------------------------------------------------------------------
‚ùå 'kaggle' package not installed
   Install with: pip install kaggle

[METHOD 3] Trying Old Amazon Dataset (2014)...
This is smaller but reliable!
----------------------------------------------------------------------
Downloading from UCSD (may take 1-2 minutes)...
‚úÖ SUCCESS! Loaded 10000 reviews from 2014 dataset
   Columns: ['reviewerID', 'asin', 'reviewerName', 'helpful', 'reviewText', 'overall', 'summary', 'unixReviewTime', 'reviewTime']

First 3 reviews:
       reviewerID        asin     reviewerName   helpful  \
0   AO94DHGC771SJ  0528881469          amazdnu    [0, 0]   
1   AMO214LNFCEI4  0528881469  Amazon Customer  [12, 15]   
2  A3N7T0DY83Y4IG  0528881469    C

In [3]:
# PRELIMINARY ANALYSIS SCRIPT
# Run this AFTER loading the dataset (df should already exist)

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, mean_absolute_error
import re
import warnings
warnings.filterwarnings('ignore')

print("=" * 70)
print("PRELIMINARY RESULTS ANALYSIS")
print("=" * 70)

# Check that df exists
try:
    print(f"\n‚úì Dataset loaded: {len(df)} reviews")
except NameError:
    print("‚ùå ERROR: 'df' not found. Please run the loading script first!")
    exit()

# ============================================================================
# STEP 1: DATA PREPROCESSING
# ============================================================================
print("\n[1/5] Preprocessing data...")

# Use the correct column names from the loaded dataset
text_col = 'reviewText'
rating_col = 'overall'

# Clean data
df_clean = df[[text_col, rating_col]].dropna()
print(f"‚úì Removed {len(df) - len(df_clean)} rows with missing values")

# Convert ratings to integers (1-5)
df_clean[rating_col] = df_clean[rating_col].astype(int)
df_clean = df_clean[df_clean[rating_col].isin([1, 2, 3, 4, 5])]
print(f"‚úì Final dataset: {len(df_clean)} reviews")

# Show class distribution
print("\nüìä Class Distribution:")
dist = df_clean[rating_col].value_counts().sort_index()
for rating, count in dist.items():
    pct = (count / len(df_clean)) * 100
    print(f"   {rating}-star: {count:5d} ({pct:5.1f}%)")

# ============================================================================
# STEP 2: TEXT PREPROCESSING
# ============================================================================
print("\n[2/5] Preprocessing text...")

def clean_text(text):
    """Basic text cleaning"""
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    text = ' '.join(text.split())
    return text

df_clean['cleaned_text'] = df_clean[text_col].apply(clean_text)
df_clean = df_clean[df_clean['cleaned_text'].str.len() >= 10]
print(f"‚úì Text cleaned. Reviews after filtering: {len(df_clean)}")

# ============================================================================
# STEP 3: FEATURE EXTRACTION
# ============================================================================
print("\n[3/5] Extracting TF-IDF features...")

X = df_clean['cleaned_text']
y = df_clean[rating_col]

# Train-test split (80-20)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# TF-IDF Vectorization (max 1000 features)
vectorizer = TfidfVectorizer(max_features=1000, min_df=5, max_df=0.8)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

print(f"‚úì Training set: {X_train_tfidf.shape[0]} samples")
print(f"‚úì Test set: {X_test_tfidf.shape[0]} samples")
print(f"‚úì Features: {X_train_tfidf.shape[1]} TF-IDF terms")

# ============================================================================
# STEP 4: TRAIN MODELS
# ============================================================================
print("\n[4/5] Training classification models...")

print("   Training Naive Bayes...")
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)
nb_pred = nb_model.predict(X_test_tfidf)

print("   Training Logistic Regression...")
lr_model = LogisticRegression(max_iter=500, random_state=42, multi_class='multinomial')
lr_model.fit(X_train_tfidf, y_train)
lr_pred = lr_model.predict(X_test_tfidf)

print("‚úì Models trained successfully")

# ============================================================================
# STEP 5: EVALUATE AND GENERATE RESULTS
# ============================================================================
print("\n[5/5] Generating Results...")
print("\n" + "=" * 70)
print("PRELIMINARY RESULTS SUMMARY")
print("=" * 70)

# Calculate metrics
nb_acc = accuracy_score(y_test, nb_pred)
nb_mae = mean_absolute_error(y_test, nb_pred)
nb_cm = confusion_matrix(y_test, nb_pred)

lr_acc = accuracy_score(y_test, lr_pred)
lr_mae = mean_absolute_error(y_test, lr_pred)
lr_cm = confusion_matrix(y_test, lr_pred)

print(f"\nüìä MULTINOMIAL NAIVE BAYES:")
print(f"   Accuracy: {nb_acc*100:.1f}% ({nb_acc:.4f})")
print(f"   MAE: {nb_mae:.2f}")

print(f"\nüìä LOGISTIC REGRESSION (Nominal):")
print(f"   Accuracy: {lr_acc*100:.1f}% ({lr_acc:.4f})")
print(f"   MAE: {lr_mae:.2f}")

# Confusion Matrix Analysis
print("\nüìà CONFUSION MATRIX ANALYSIS:")

def calc_adjacent_errors(cm):
    """Calculate % of errors that are adjacent ratings (off by 1)"""
    total_errors = np.sum(cm) - np.trace(cm)
    if total_errors == 0:
        return 0
    adjacent_errors = 0
    for i in range(len(cm)):
        for j in range(len(cm)):
            if abs(i - j) == 1:  # Adjacent ratings
                adjacent_errors += cm[i][j]
    return (adjacent_errors / total_errors) * 100

nb_adj = calc_adjacent_errors(nb_cm)
lr_adj = calc_adjacent_errors(lr_cm)

print(f"   Naive Bayes: {nb_adj:.1f}% of errors are adjacent ratings (e.g., 4‚Üî5)")
print(f"   Logistic Regression: {lr_adj:.1f}% of errors are adjacent ratings")

# Per-class F1 scores
print("\nüìä PER-CLASS F1-SCORES:")
nb_report = classification_report(y_test, nb_pred, output_dict=True, zero_division=0)
lr_report = classification_report(y_test, lr_pred, output_dict=True, zero_division=0)

print("\nNaive Bayes:")
for rating in [1, 2, 3, 4, 5]:
    if str(rating) in nb_report:
        f1 = nb_report[str(rating)]['f1-score']
        print(f"   {rating}-star: F1 = {f1:.3f}")

print("\nLogistic Regression:")
for rating in [1, 2, 3, 4, 5]:
    if str(rating) in lr_report:
        f1 = lr_report[str(rating)]['f1-score']
        print(f"   {rating}-star: F1 = {f1:.3f}")

# ============================================================================
# GENERATE PARAGRAPH FOR PROPOSAL
# ============================================================================
print("\n" + "=" * 70)
print("üìù COPY THIS PARAGRAPH INTO YOUR PROPOSAL:")
print("=" * 70)

# Get F1 scores for 5-star and 3-star
f1_5star = lr_report.get('5', {}).get('f1-score', 0)
f1_3star = lr_report.get('3', {}).get('f1-score', 0)

paragraph = f"""Initial experiments on {len(df_clean):,} Electronics reviews show promising 
directions. Using TF-IDF features (max 1,000 features), Multinomial Naive Bayes 
achieved {nb_acc*100:.1f}% accuracy with MAE of {nb_mae:.2f}, while Logistic Regression 
(nominal treatment) achieved {lr_acc*100:.1f}% accuracy with MAE of {lr_mae:.2f}. 
Confusion matrix analysis reveals that {lr_adj:.0f}% of misclassifications occur 
between adjacent ratings (particularly 4‚Üî5 stars), supporting our hypothesis that 
ordinal treatment may improve performance. The class imbalance is evident‚Äîmodels 
achieve {f1_5star*100:.0f}% F1-score for 5-star reviews but only {f1_3star*100:.0f}% 
for 3-star reviews. These preliminary findings motivate our investigation into 
whether ordinal methods can reduce MAE by better modeling rating structure while 
addressing the adjacent-rating confusion problem."""

print("\n" + paragraph)

print("\n" + "=" * 70)
print("‚úÖ PRELIMINARY RESULTS COMPLETE!")
print("=" * 70)
print("\nüìã NEXT STEPS:")
print("   1. Copy the paragraph above")
print("   2. Paste it into the 'Preliminary Results' section of your proposal")
print("   3. Add your 2 groupmates' names")
print("   4. Export to PDF and submit!")
print("\n" + "=" * 70)

PRELIMINARY RESULTS ANALYSIS

‚úì Dataset loaded: 10000 reviews

[1/5] Preprocessing data...
‚úì Removed 0 rows with missing values
‚úì Final dataset: 10000 reviews

üìä Class Distribution:
   1-star:   572 (  5.7%)
   2-star:   450 (  4.5%)
   3-star:   822 (  8.2%)
   4-star:  2095 ( 20.9%)
   5-star:  6061 ( 60.6%)

[2/5] Preprocessing text...
‚úì Text cleaned. Reviews after filtering: 9991

[3/5] Extracting TF-IDF features...
‚úì Training set: 7992 samples
‚úì Test set: 1999 samples
‚úì Features: 1000 TF-IDF terms

[4/5] Training classification models...
   Training Naive Bayes...
   Training Logistic Regression...
‚úì Models trained successfully

[5/5] Generating Results...

PRELIMINARY RESULTS SUMMARY

üìä MULTINOMIAL NAIVE BAYES:
   Accuracy: 60.5% (0.6053)
   MAE: 0.74

üìä LOGISTIC REGRESSION (Nominal):
   Accuracy: 62.6% (0.6258)
   MAE: 0.62

üìà CONFUSION MATRIX ANALYSIS:
   Naive Bayes: 53.4% of errors are adjacent ratings (e.g., 4‚Üî5)
   Logistic Regression: 62.4% of