In [1]:
!pip install datasets pandas scikit-learn



In [3]:
# Import required libraries
from datasets import load_dataset
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset from Hugging Face
print("Loading Amazon Product Reviews dataset...")
dataset = load_dataset("m-ric/amazon_product_reviews_datafiniti")

# Convert to pandas DataFrame for easier manipulation
df = dataset['train'].to_pandas()

print(f"Original dataset size: {len(df)}")
print("\nAvailable columns:")
print(df.columns.tolist())

Loading Amazon Product Reviews dataset...
Original dataset size: 6000

Available columns:
['brand', 'primaryCategories', 'reviews.numHelpful', 'reviews.rating', 'reviews.text']


In [4]:
# See what columns are actually available
print("Available columns in the dataset:")
print(df.columns.tolist())
print(f"\nTotal columns: {len(df.columns)}")

# look at the first row to understand the structure
print("\nFirst row sample:")
print(df.iloc[1])

Available columns in the dataset:
['brand', 'primaryCategories', 'reviews.numHelpful', 'reviews.rating', 'reviews.text']

Total columns: 5

First row sample:
brand                                     2
primaryCategories           Health & Beauty
reviews.numHelpful                      NaN
reviews.rating                            5
reviews.text          always need batteries
Name: 1, dtype: object


In [5]:
# explore the dataset with the correct columns
print("Dataset sample and structure:")
print(f"First review text: {df['reviews.text'].iloc[0]}")
print(f"First review rating: {df['reviews.rating'].iloc[0]}")
print(f"Brand: {df['brand'].iloc[0]}")
print(f"Primary Category: {df['primaryCategories'].iloc[0]}")

print("\nRating distribution:")
rating_counts = df['reviews.rating'].value_counts().sort_index()
print(rating_counts)

print(f"\nTotal reviews: {len(df)}")

Dataset sample and structure:
First review text: Amazon's batteries are great. I've had no problems with them leaking or corroding. They last as long as any other AA battery I've used too.The AmazonBasics brand has been great for so many different things, and I definitely recommend these batteries.
First review rating: 5
Brand: 2
Primary Category: Health & Beauty

Rating distribution:
1     502
2     306
3     627
4    2786
5    1779
Name: reviews.rating, dtype: int64

Total reviews: 6000


In [6]:
# Check for missing values in key columns
print("Missing values in key columns:")
print(f"reviews.text: {df['reviews.text'].isna().sum()} missing")
print(f"reviews.rating: {df['reviews.rating'].isna().sum()} missing")
print(f"brand: {df['brand'].isna().sum()} missing")
print(f"primaryCategories: {df['primaryCategories'].isna().sum()} missing")

print(f"\nTotal rows in dataset: {len(df)}")

Missing values in key columns:
reviews.text: 0 missing
reviews.rating: 0 missing
brand: 0 missing
primaryCategories: 0 missing

Total rows in dataset: 6000


In [7]:
# Define the preprocessing function
def preprocess_reviews_data(df, sample_size=300):
    """
    Preprocess the Amazon reviews dataset for sentiment classification
    """
    # Create a clean DataFrame with only the columns we need
    clean_data = []
    
    for idx, row in df.iterrows():
        review_text = row.get('reviews.text', '')
        review_rating = row.get('reviews.rating', None)
        
        # Skip rows with missing text or rating
        if not review_text or pd.isna(review_rating) or review_text == '':
            continue
            
        # Convert rating to binary sentiment
        if review_rating in [4, 5]:
            sentiment = 1  # Positive
        elif review_rating in [1, 2]:
            sentiment = 0  # Negative
        else:
            continue  # Skip 3-star (neutral) reviews
            
        clean_data.append({
            'text': str(review_text).strip().lower(),
            'label': sentiment,
            'rating': review_rating,
            'brand': row.get('brand', 'unknown'),
            'category': row.get('primaryCategories', 'unknown')
        })
    
    # Convert to DataFrame
    clean_df = pd.DataFrame(clean_data)
    
    # Remove duplicates and very short reviews
    clean_df = clean_df.drop_duplicates(subset=['text'])
    clean_df = clean_df[clean_df['text'].str.len() > 10]
    
    print(f"After cleaning: {len(clean_df)} reviews")
    
    # Balance the classes and take a sample
    positive_reviews = clean_df[clean_df['label'] == 1]
    negative_reviews = clean_df[clean_df['label'] == 0]
    
    print(f"Positive reviews available: {len(positive_reviews)}")
    print(f"Negative reviews available: {len(negative_reviews)}")
    
    # Take equal samples from each class (adjust if one class has fewer)
    n_each = min(len(positive_reviews), len(negative_reviews), sample_size // 2)
    
    balanced_df = pd.concat([
        positive_reviews.sample(n_each, random_state=42),
        negative_reviews.sample(n_each, random_state=42)
    ])
    
    return balanced_df

# Now apply preprocessing
print("Preprocessing data...")
processed_df = preprocess_reviews_data(df, sample_size=300)

print(f"\nFinal processed dataset size: {len(processed_df)}")
print("\nClass distribution:")
print(processed_df['label'].value_counts())
print(f"Positive reviews: {(processed_df['label'] == 1).sum()}")
print(f"Negative reviews: {(processed_df['label'] == 0).sum()}")

Preprocessing data...
After cleaning: 4158 reviews
Positive reviews available: 3463
Negative reviews available: 695

Final processed dataset size: 300

Class distribution:
1    150
0    150
Name: label, dtype: int64
Positive reviews: 150
Negative reviews: 150


In [8]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(
    processed_df, 
    test_size=0.3, 
    random_state=42,
    stratify=processed_df['label']
)

print(f"Train: {len(train_df)}, Test: {len(test_df)}")

Train: 210, Test: 90


In [10]:
# Test the baseline on RANDOM examples
class NaiveBaseline:
    def __init__(self):
        self.positive_words = ['great', 'good', 'excellent', 'love', 'awesome', 'perfect', 'best', 'recommend']
        self.negative_words = ['bad', 'terrible', 'awful', 'hate', 'worst', 'waste', 'broken', 'disappointed']
    
    def predict(self, text):
        text = text.lower()
        pos_count = sum(1 for word in self.positive_words if word in text)
        neg_count = sum(1 for word in self.negative_words if word in text)
        
        return 1 if pos_count > neg_count else 0

# Create the baseline classifier
baseline = NaiveBaseline()

# Test on RANDOM examples from your dataset
print("TESTING NAIVE BASELINE ON RANDOM REVIEWS:")
print("=" * 60)

# Get 6 random examples (3 positive, 3 negative)
random_positive = processed_df[processed_df['label'] == 1].sample(3, random_state=None)  # None = different each time
random_negative = processed_df[processed_df['label'] == 0].sample(3, random_state=None)

sample_reviews = pd.concat([random_positive, random_negative])

# Test each review
for i, (index, review) in enumerate(sample_reviews.iterrows()):
    text = review['text']
    true_label = review['label']
    predicted_label = baseline.predict(text)
    
    # Count keywords found
    pos_count = sum(1 for word in baseline.positive_words if word in text.lower())
    neg_count = sum(1 for word in baseline.negative_words if word in text.lower())
    
    print(f"\nExample {i+1}:")
    print(f"Review: {text[:10000]}")
    print(f"True Rating: {review['rating']} stars ‚Üí True Label: {'POSITIVE' if true_label == 1 else 'NEGATIVE'}")
    print(f"Predicted: {'POSITIVE' if predicted_label == 1 else 'NEGATIVE'}")
    print(f"Keywords found: {pos_count} positive, {neg_count} negative")
    print(f"Correct? {'‚úÖ YES' if true_label == predicted_label else '‚ùå NO'}")
    print("-" * 50)

TESTING NAIVE BASELINE ON RANDOM REVIEWS:

Example 1:
Review: great quality batteries. comes in a small box, perfect for storing.
True Rating: 5 stars ‚Üí True Label: POSITIVE
Predicted: POSITIVE
Keywords found: 2 positive, 0 negative
Correct? ‚úÖ YES
--------------------------------------------------

Example 2:
Review: best product value , amazon know what they doing (:good price for quantity , for sure will buy more .
True Rating: 5 stars ‚Üí True Label: POSITIVE
Predicted: POSITIVE
Keywords found: 2 positive, 0 negative
Correct? ‚úÖ YES
--------------------------------------------------

Example 3:
Review: amazon kindle fire has a lot of free app and can be used by any one that wants to get online anywhere
True Rating: 4 stars ‚Üí True Label: POSITIVE
Predicted: NEGATIVE
Keywords found: 0 positive, 0 negative
Correct? ‚ùå NO
--------------------------------------------------

Example 4:
Review: i'm planning on returning this tablet just as soon as i can!`
True Rating: 2 stars ‚Üí T

In [11]:
# Let's also test some tricky cases with random sampling
print("\nTESTING RANDOM TRICKY CASES:")
print("=" * 60)

# Find reviews that contain both positive and negative words (mixed sentiment)
mixed_reviews = []
for index, row in processed_df.iterrows():
    text = row['text'].lower()
    pos_count = sum(1 for word in baseline.positive_words if word in text)
    neg_count = sum(1 for word in baseline.negative_words if word in text)
    
    # Look for reviews that have both positive and negative keywords
    if pos_count > 0 and neg_count > 0:
        mixed_reviews.append((index, row, pos_count, neg_count))

# Take 3 random mixed reviews
if mixed_reviews:
    import random
    random_mixed = random.sample(mixed_reviews, min(3, len(mixed_reviews)))
    
    for i, (index, review, pos_count, neg_count) in enumerate(random_mixed):
        predicted = baseline.predict(review['text'])
        
        print(f"\nMixed Sentiment Example {i+1}:")
        print(f"Text: {review['text'][:10000]}")
        print(f"True Rating: {review['rating']} stars ‚Üí True Label: {'POSITIVE' if review['label'] == 1 else 'NEGATIVE'}")
        print(f"Predicted: {'POSITIVE' if predicted == 1 else 'NEGATIVE'}")
        print(f"Keywords found: {pos_count} positive, {neg_count} negative")
        print(f"Correct? {'‚úÖ YES' if review['label'] == predicted else '‚ùå NO'}")
        print("-" * 50)
else:
    print("No mixed sentiment reviews found in this sample.")


TESTING RANDOM TRICKY CASES:

Mixed Sentiment Example 1:
Text: really good price for a reason, first two batteries i used were dead. do not waste your money!
True Rating: 1 stars ‚Üí True Label: NEGATIVE
Predicted: NEGATIVE
Keywords found: 1 positive, 1 negative
Correct? ‚úÖ YES
--------------------------------------------------

Mixed Sentiment Example 2:
Text: it's not bad for the novelty of alexa. might as well get the echo. sound it not bad and i'm no audiophile. might be good when my echo dots come in. is it worth it? there are better bluetooth speakers out there.
True Rating: 4 stars ‚Üí True Label: POSITIVE
Predicted: NEGATIVE
Keywords found: 1 positive, 1 negative
Correct? ‚ùå NO
--------------------------------------------------

Mixed Sentiment Example 3:
Text: the tap is a great concept, i love my echo so a portable one was that much better. however the sound is pretty terrible, no bass and can be scratchy.. it was a good concept just poor execution.
True Rating: 2 stars ‚Ü

In [12]:
# Run the baseline on the entire test set to get overall accuracy
print("\nEVALUATING NAIVE BASELINE ON ENTIRE TEST SET:")
print("=" * 60)

from sklearn.metrics import accuracy_score, classification_report

# Get predictions for all test examples
y_true = test_df['label'].tolist()
y_pred_baseline = []
for text in test_df['text']:
    y_pred_baseline.append(baseline.predict(text))

# Calculate overall accuracy
baseline_accuracy = accuracy_score(y_true, y_pred_baseline)
print(f"Overall Baseline Accuracy: {baseline_accuracy:.3f}")

# Detailed performance report
print("\nDetailed Performance Report:")
print(classification_report(y_true, y_pred_baseline, target_names=['negative', 'positive']))

# Show some random correct and incorrect predictions
print("\nRANDOM CORRECT AND INCORRECT PREDICTIONS:")
print("=" * 60)

# Add predictions to test_df for analysis
test_df_with_pred = test_df.copy()
test_df_with_pred['predicted'] = y_pred_baseline
test_df_with_pred['correct'] = test_df_with_pred['label'] == test_df_with_pred['predicted']

# Get 2 correct and 2 incorrect random examples
correct_examples = test_df_with_pred[test_df_with_pred['correct'] == True].sample(2, random_state=None)
incorrect_examples = test_df_with_pred[test_df_with_pred['correct'] == False].sample(min(2, len(test_df_with_pred[test_df_with_pred['correct'] == False])), random_state=None)

print("CORRECT PREDICTIONS:")
for i, (index, row) in enumerate(correct_examples.iterrows()):
    print(f"\nCorrect Example {i+1}:")
    print(f"Text: {row['text'][:10000]}")
    print(f"True: {'POSITIVE' if row['label'] == 1 else 'NEGATIVE'}, Predicted: {'POSITIVE' if row['predicted'] == 1 else 'NEGATIVE'}")

print("\nINCORRECT PREDICTIONS:")
for i, (index, row) in enumerate(incorrect_examples.iterrows()):
    print(f"\nIncorrect Example {i+1}:")
    print(f"Text: {row['text'][:10000]}")
    print(f"True: {'POSITIVE' if row['label'] == 1 else 'NEGATIVE'}, Predicted: {'POSITIVE' if row['predicted'] == 1 else 'NEGATIVE'}")


EVALUATING NAIVE BASELINE ON ENTIRE TEST SET:
Overall Baseline Accuracy: 0.756

Detailed Performance Report:
              precision    recall  f1-score   support

    negative       0.69      0.91      0.79        45
    positive       0.87      0.60      0.71        45

    accuracy                           0.76        90
   macro avg       0.78      0.76      0.75        90
weighted avg       0.78      0.76      0.75        90


RANDOM CORRECT AND INCORRECT PREDICTIONS:
CORRECT PREDICTIONS:

Correct Example 1:
Text: wore out quickly
True: NEGATIVE, Predicted: NEGATIVE

Correct Example 2:
Text: good deal for rechargeable. they all work and seem to preform well.
True: POSITIVE, Predicted: POSITIVE

INCORRECT PREDICTIONS:

Incorrect Example 1:
Text: i don't recommend buying this. after 1 month of buying this, it won't charge or turn on.
True: NEGATIVE, Predicted: POSITIVE

Incorrect Example 2:
Text: better than buying 12 duracell betters for the same price. used primarily for my kids

In [13]:
!pip install transformers torch



In [14]:
# =============================================================================
# AI PIPELINE IMPLEMENTATION
# =============================================================================

from transformers import DistilBertTokenizer, DistilBertModel
import torch
import numpy as np
from sklearn.linear_model import LogisticRegression

# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Load pre-trained model and tokenizer
print("Loading DistilBERT model and tokenizer...")
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')
model.to(device)
model.eval()  # Set model to evaluation mode

print("Model loaded successfully!")

# Function to get embeddings from the model
def get_embeddings(texts, batch_size=16):
    """
    Get [CLS] token embeddings for a list of texts in batches to save memory
    """
    embeddings = []
    
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i + batch_size]
        
        # Tokenize the batch
        inputs = tokenizer(
            batch_texts, 
            return_tensors='pt', 
            truncation=True, 
            padding=True, 
            max_length=128
        )
        inputs = {key: value.to(device) for key, value in inputs.items()}
        
        with torch.no_grad():  # Disable gradient calculation for inference
            outputs = model(**inputs)
            # Extract the [CLS] token embedding (first token)
            cls_embeddings = outputs.last_hidden_state[:, 0, :]
            embeddings.append(cls_embeddings.cpu().numpy())
    
    # Combine all batches
    return np.vstack(embeddings)

print("\nGenerating embeddings for training data...")
# Generate embeddings for the training texts
X_train_embeddings = get_embeddings(train_df['text'].tolist())
y_train = train_df['label'].tolist()

print("Generating embeddings for testing data...")
# Generate embeddings for the testing texts
X_test_embeddings = get_embeddings(test_df['text'].tolist())
y_test = test_df['label'].tolist()

print(f"Training embeddings shape: {X_train_embeddings.shape}")
print(f"Test embeddings shape: {X_test_embeddings.shape}")

# Train a classifier on the embeddings
print("\nTraining Logistic Regression classifier on embeddings...")
clf = LogisticRegression(random_state=42, max_iter=1000)
clf.fit(X_train_embeddings, y_train)

# Evaluate the AI Pipeline
y_pred_ai = clf.predict(X_test_embeddings)
ai_accuracy = accuracy_score(y_test, y_pred_ai)

print(f"\n{'=' * 60}")
print("AI PIPELINE PERFORMANCE")
print(f"{'=' * 60}")
print(f"Overall AI Pipeline Accuracy: {ai_accuracy:.3f}")
print("\nDetailed Performance Report:")
print(classification_report(y_test, y_pred_ai, target_names=['negative', 'positive']))

Using device: cpu
Loading DistilBERT model and tokenizer...


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Model loaded successfully!

Generating embeddings for training data...
Generating embeddings for testing data...
Training embeddings shape: (210, 768)
Test embeddings shape: (90, 768)

Training Logistic Regression classifier on embeddings...

AI PIPELINE PERFORMANCE
Overall AI Pipeline Accuracy: 0.856

Detailed Performance Report:
              precision    recall  f1-score   support

    negative       0.85      0.87      0.86        45
    positive       0.86      0.84      0.85        45

    accuracy                           0.86        90
   macro avg       0.86      0.86      0.86        90
weighted avg       0.86      0.86      0.86        90



In [24]:
# =============================================================================
# TEST AI PIPELINE ON RANDOM TRICKY CASES
# =============================================================================

print(f"\n{'=' * 60}")
print("TESTING AI PIPELINE ON RANDOM MIXED SENTIMENT REVIEWS")
print(f"{'=' * 60}")

# Find NEW random mixed sentiment reviews each time
mixed_reviews_new = []
for index, row in processed_df.iterrows():
    text = row['text'].lower()
    pos_count = sum(1 for word in baseline.positive_words if word in text)
    neg_count = sum(1 for word in baseline.negative_words if word in text)
    
    # Look for reviews that have both positive and negative keywords
    if pos_count > 0 and neg_count > 0:
        mixed_reviews_new.append((index, row, pos_count, neg_count))

# Take 3 RANDOM mixed reviews (different each time)
if mixed_reviews_new:
    random_mixed_new = random.sample(mixed_reviews_new, min(3, len(mixed_reviews_new)))
    
    for i, (index, review, pos_count, neg_count) in enumerate(random_mixed_new):
        # Get AI prediction
        review_embedding = get_embeddings([review['text']])
        ai_prediction = clf.predict(review_embedding)[0]
        
        print(f"\nMixed Sentiment Example {i+1}:")
        print(f"Text: {review['text'][:10000]}")
        print(f"True Rating: {review['rating']} stars ‚Üí True Label: {'POSITIVE' if review['label'] == 1 else 'NEGATIVE'}")
        print(f"Baseline Pred: {'POSITIVE' if baseline.predict(review['text']) == 1 else 'NEGATIVE'}")
        print(f"AI Pred: {'POSITIVE' if ai_prediction == 1 else 'NEGATIVE'}")
        print(f"Keywords found: {pos_count} positive, {neg_count} negative")
        baseline_correct = '‚úÖ' if baseline.predict(review['text']) == review['label'] else '‚ùå'
        ai_correct = '‚úÖ' if ai_prediction == review['label'] else '‚ùå'
        print(f"Baseline: {baseline_correct}, AI: {ai_correct}")
        print("-" * 50)
else:
    print("No mixed sentiment reviews found in this sample.")

# =============================================================================
# TEST ON RANDOM REGULAR REVIEWS (POSITIVE AND NEGATIVE)
# =============================================================================

print(f"\n{'=' * 60}")
print("TESTING AI PIPELINE ON RANDOM REGULAR REVIEWS")
print(f"{'=' * 60}")

# Get 2 random positive and 2 random negative reviews (different each time)
random_positive = processed_df[processed_df['label'] == 1].sample(2, random_state=None)
random_negative = processed_df[processed_df['label'] == 0].sample(2, random_state=None)
random_reviews = pd.concat([random_positive, random_negative])

print("RANDOM REVIEW TESTING:")
for i, (index, review) in enumerate(random_reviews.iterrows()):
    # Get AI prediction
    review_embedding = get_embeddings([review['text']])
    ai_prediction = clf.predict(review_embedding)[0]
    
    print(f"\nRandom Review {i+1}:")
    print(f"Text: {review['text'][:10000]}")
    print(f"True Rating: {review['rating']} stars ‚Üí True Label: {'POSITIVE' if review['label'] == 1 else 'NEGATIVE'}")
    print(f"Baseline Pred: {'POSITIVE' if baseline.predict(review['text']) == 1 else 'NEGATIVE'}")
    print(f"AI Pred: {'POSITIVE' if ai_prediction == 1 else 'NEGATIVE'}")
    baseline_correct = '‚úÖ' if baseline.predict(review['text']) == review['label'] else '‚ùå'
    ai_correct = '‚úÖ' if ai_prediction == review['label'] else '‚ùå'
    print(f"Baseline: {baseline_correct}, AI: {ai_correct}")
    print("-" * 50)

print("\n" + "=" * 60)
print("ANALYSIS COMPLETE!")
print("=" * 60)

# =============================================================================
# RANDOM CORRECT AND INCORRECT PREDICTIONS FROM AI PIPELINE
# =============================================================================

print(f"\n{'=' * 60}")
print("RANDOM CORRECT AND INCORRECT PREDICTIONS FROM AI PIPELINE:")
print(f"{'=' * 60}") 

# Create a fresh copy with AI predictions to ensure we have the right columns
test_analysis_df = test_df.copy()
test_analysis_df['ai_pred'] = y_pred_ai
test_analysis_df['ai_correct'] = test_analysis_df['label'] == test_analysis_df['ai_pred']

# Get 2 correct and 2 incorrect random examples from AI (DIFFERENT EACH TIME)
correct_ai_examples = test_analysis_df[test_analysis_df['ai_correct'] == True].sample(2, random_state=None)
incorrect_ai_examples = test_analysis_df[test_analysis_df['ai_correct'] == False].sample(
    min(2, len(test_analysis_df[test_analysis_df['ai_correct'] == False])), 
    random_state=None  # No fixed random state = different each time
)

print("CORRECT AI PREDICTIONS:")
for i, (index, row) in enumerate(correct_ai_examples.iterrows()):
    print(f"\nCorrect AI Example {i+1}:")
    print(f"Text: {row['text'][:10000]}")
    print(f"True: {'POSITIVE' if row['label'] == 1 else 'NEGATIVE'}, AI Predicted: {'POSITIVE' if row['ai_pred'] == 1 else 'NEGATIVE'}")
    print(f"Rating: {row['rating']} stars")
    print("-" * 50)

print("\nINCORRECT AI PREDICTIONS:")
if len(incorrect_ai_examples) > 0:
    for i, (index, row) in enumerate(incorrect_ai_examples.iterrows()):
        print(f"\nIncorrect AI Example {i+1}:")
        print(f"Text: {row['text'][:10000]}")
        print(f"True: {'POSITIVE' if row['label'] == 1 else 'NEGATIVE'}, AI Predicted: {'POSITIVE' if row['ai_pred'] == 1 else 'NEGATIVE'}")
        print(f"Rating: {row['rating']} stars")
        print("-" * 50)
else:
    print("No incorrect predictions found! Perfect AI performance!")

# =============================================================================
# FINAL SUMMARY WITH RANDOM INSIGHTS
# =============================================================================

print(f"\n{'=' * 60}")
print("FINAL SUMMARY")
print(f"{'=' * 60}")

# Calculate some final statistics
total_test_reviews = len(test_df)
correct_ai_predictions = (test_analysis_df['ai_correct'] == True).sum()
incorrect_ai_predictions = (test_analysis_df['ai_correct'] == False).sum()

print(f"Total Test Reviews: {total_test_reviews}")
print(f"Correct AI Predictions: {correct_ai_predictions} ({correct_ai_predictions/total_test_reviews*100:.1f}%)")
print(f"Incorrect AI Predictions: {incorrect_ai_predictions} ({incorrect_ai_predictions/total_test_reviews*100:.1f}%)")
print(f"Final AI Pipeline Accuracy: {ai_accuracy:.3f}")

if 'baseline_accuracy' in locals():
    print(f"Baseline Accuracy: {baseline_accuracy:.3f}")
    improvement = ai_accuracy - baseline_accuracy
    print(f"Improvement: +{improvement:.3f} ({improvement*100:.1f}%)")

# Show a random interesting fact
interesting_facts = [
    f"AI correctly classified {correct_ai_predictions} out of {total_test_reviews} reviews",
    f"That's like getting {correct_ai_predictions} test questions right out of {total_test_reviews}",
    f"The AI makes mistakes on only {incorrect_ai_predictions} reviews",
    f"Run this again to see different examples of AI performance!"
]

print(f"\nüí° Random Insight: {random.choice(interesting_facts)}")

print(f"\n{'=' * 60}")
print("ANALYSIS COMPLETED SUCCESSFULLY!")
print("=" * 60)
print("üí° TIP: Run this code again to see different random examples!")
print("=" * 60)


TESTING AI PIPELINE ON RANDOM MIXED SENTIMENT REVIEWS

Mixed Sentiment Example 1:
Text: i accidentally cracked my screen on the first day when the voyage was too close to my rocking chair and neither best buy nor amazon would do anything to help. well, amazon offered $15 off purchasing the paperwhite and that wasn't something i wanted at all. now i'm stuck with a $199 broken voyage. not a happy customer. i realize it was my fault but thought they should do something more to minimize the loss and keep my business.
True Rating: 2 stars ‚Üí True Label: NEGATIVE
Baseline Pred: NEGATIVE
AI Pred: NEGATIVE
Keywords found: 1 positive, 1 negative
Baseline: ‚úÖ, AI: ‚úÖ
--------------------------------------------------

Mixed Sentiment Example 2:
True Rating: 1 stars ‚Üí True Label: NEGATIVE
Baseline Pred: NEGATIVE
AI Pred: NEGATIVE
Keywords found: 1 positive, 2 negative
Baseline: ‚úÖ, AI: ‚úÖ
--------------------------------------------------

Mixed Sentiment Example 3:
Text: very disappointe

In [17]:
# =============================================================================
# COMPARISON: BASELINE vs AI PIPELINE
# =============================================================================

print(f"\n{'=' * 60}")
print("COMPARISON: Naive Baseline vs. AI Pipeline")
print(f"{'=' * 60}")

# Use the correct variable names that you defined earlier
print(f"Naive Baseline Accuracy: {baseline_accuracy:.3f}")  # Changed from 'accuracy' to 'baseline_accuracy'
print(f"AI Pipeline Accuracy:    {ai_accuracy:.3f}")        # Changed from 'accuracy_ai' to 'ai_accuracy'
print(f"Improvement:             +{ai_accuracy - baseline_accuracy:.3f}")

# Show some examples where AI pipeline corrects baseline errors
print(f"\n{'=' * 60}")
print("EXAMPLES WHERE AI PIPELINE CORRECTS BASELINE ERRORS")
print(f"{'=' * 60}")

# Add AI predictions to test dataframe - use consistent variable names
test_df_with_pred = test_df.copy()
test_df_with_pred['baseline_pred'] = y_pred_baseline  # Changed from 'y_pred' to 'y_pred_baseline'
test_df_with_pred['ai_pred'] = y_pred_ai              # This should be correct
test_df_with_pred['baseline_correct'] = test_df_with_pred['label'] == test_df_with_pred['baseline_pred']
test_df_with_pred['ai_correct'] = test_df_with_pred['label'] == test_df_with_pred['ai_pred']

# Find examples where baseline was wrong but AI was right
corrected_examples = test_df_with_pred[
    (test_df_with_pred['baseline_correct'] == False) & 
    (test_df_with_pred['ai_correct'] == True)
]

if len(corrected_examples) > 0:
    sample_corrected = corrected_examples.sample(min(2, len(corrected_examples)), random_state=42)
    
    for i, (idx, row) in enumerate(sample_corrected.iterrows()):
        print(f"\nCorrected Example {i+1}:")
        print(f"Review: {row['text'][:10000]}")
        print(f"True Rating: {row['rating']} stars ‚Üí True Label: {'POSITIVE' if row['label'] == 1 else 'NEGATIVE'}")
        print(f"Baseline Prediction: {'POSITIVE' if row['baseline_pred'] == 1 else 'NEGATIVE'}")
        print(f"AI Prediction: {'POSITIVE' if row['ai_pred'] == 1 else 'NEGATIVE'}")
        print("‚úÖ AI pipeline corrected baseline error!")
        print("-" * 50)
else:
    print("No correction examples found in this sample.")


COMPARISON: Naive Baseline vs. AI Pipeline
Naive Baseline Accuracy: 0.756
AI Pipeline Accuracy:    0.856
Improvement:             +0.100

EXAMPLES WHERE AI PIPELINE CORRECTS BASELINE ERRORS

Corrected Example 1:
Review: works as well as the name brand batteries but so much cheaper! easy to unpack also.
True Rating: 5 stars ‚Üí True Label: POSITIVE
Baseline Prediction: NEGATIVE
AI Prediction: POSITIVE
‚úÖ AI pipeline corrected baseline error!
--------------------------------------------------

Corrected Example 2:
Review: i don't recommend buying this. after 1 month of buying this, it won't charge or turn on.
True Rating: 1 stars ‚Üí True Label: NEGATIVE
Baseline Prediction: POSITIVE
AI Prediction: NEGATIVE
‚úÖ AI pipeline corrected baseline error!
--------------------------------------------------
