### Amazon Product Reviews

In [1]:
# amazon_reviews_nlp.ipynb
import spacy
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import random

# Load the small English spaCy pipeline, installing it on first use.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # spacy.load signals a missing model package with OSError.
    print("Downloading spaCy model...")
    import subprocess
    import sys

    # Use the interpreter that is running this code (sys.executable) rather
    # than whatever "python" resolves to on PATH, so the model is installed
    # into the same environment. check=True raises CalledProcessError when
    # the download fails instead of falling through to a second load error.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
        check=True,
    )
    nlp = spacy.load("en_core_web_sm")

print("spaCy model loaded successfully!")

# Sample Amazon reviews as (review text, star rating) pairs.
# In practice these would be loaded from a CSV file.
_sample_rows = [
    ("I absolutely love my new iPhone 14 Pro from Apple! The camera quality is amazing and battery life lasts all day.", 5),
    ("Samsung Galaxy S23 has terrible battery life. Very disappointed with this purchase.", 2),
    ("The Sony WH-1000XM4 headphones are incredible. Noise cancellation works perfectly!", 5),
    ("Microsoft Surface Laptop is okay but overheats frequently. Not worth the price.", 3),
    ("Google Pixel 7 Pro has an outstanding camera but the software has some bugs.", 4),
    ("This HP laptop stopped working after 2 months. Worst purchase ever!", 1),
    ("Apple MacBook Air with M2 chip is fantastic for programming and creative work.", 5),
    ("Dell XPS 13 is lightweight and powerful, perfect for business trips.", 4),
    ("The Bose QuietComfort headphones broke after 3 weeks. Poor build quality.", 2),
    ("Lenovo ThinkPad is reliable as always. Great for office use.", 4),
    ("Amazon Kindle Paperwhite is easy on the eyes and holds charge for weeks.", 5),
    ("This Asus gaming laptop has amazing graphics but the fan is too loud.", 3),
    ("Nintendo Switch OLED is perfect for family game nights. Kids love it!", 5),
    ("Canon EOS R5 camera produces stunning photos but is quite expensive.", 4),
    ("Fitbit Charge 5 tracks my fitness accurately and syncs well with my phone.", 4),
]

# One record per review, keyed the way the rest of the notebook expects.
reviews_data = [
    {"review": review_text, "rating": stars}
    for review_text, stars in _sample_rows
]

# Tabulate the records and preview the first rows.
df = pd.DataFrame(reviews_data)
print("Sample Reviews Data:")
print(df.head())

# Named Entity Recognition (NER)
def extract_entities(text):
    """Return the ORG and PRODUCT entities that the spaCy pipeline finds in *text*.

    Each entity is a dict holding its surface text ('text'), its entity
    label ('label') and its character offsets in *text* ('start', 'end').
    Entities with any other label are dropped.
    """
    wanted_labels = ['ORG', 'PRODUCT']
    return [
        {
            'text': span.text,
            'label': span.label_,
            'start': span.start_char,
            'end': span.end_char,
        }
        for span in nlp(text).ents
        if span.label_ in wanted_labels
    ]

# Run NER over every review and store the entity list next to its text.
df['entities'] = df['review'].apply(extract_entities)

print("\nNamed Entities Found:")
# Only reviews with at least one ORG/PRODUCT entity are listed.
for i, found in df['entities'].items():
    if not found:
        continue
    print(f"Review {i+1}: {found}")

# Rule-based sentiment analysis
# The lexicons are module-level constants so they are built once rather than
# on every call.
_POSITIVE_WORDS = frozenset({
    'love', 'amazing', 'excellent', 'fantastic', 'perfect', 'great', 'good',
    'outstanding', 'incredible', 'awesome', 'best', 'wonderful', 'superb',
    'brilliant', 'exceptional', 'stellar', 'phenomenal', 'remarkable',
})

_NEGATIVE_WORDS = frozenset({
    'terrible', 'awful', 'horrible', 'disappointed', 'bad', 'worst', 'poor',
    'broken', 'junk', 'garbage', 'trash', 'useless', 'worthless', 'frustrating',
    'annoying', 'problem', 'issue', 'defective', 'failed',
})


def _count_lexicon_hits(doc, lexicon):
    """Count the tokens of *doc* whose surface form or lemma is in *lexicon*."""
    return sum(
        1
        for token in doc
        # Checking the lemma as well lets inflected forms such as "loved" or
        # "problems" match their lexicon entry; the surface form is still
        # checked so that every word matched before keeps matching.
        if token.text in lexicon or token.lemma_ in lexicon
    )


def rule_based_sentiment(text):
    """Classify *text* as 'positive', 'negative' or 'neutral'.

    The text is lower-cased and tokenised with the spaCy pipeline, then
    tokens are counted against a positive and a negative word list. The
    label is whichever list has more hits; a tie (including no hits at
    all) yields 'neutral'.

    Negation is not handled: "not good" still counts as one positive hit.
    """
    doc = nlp(text.lower())

    positive_count = _count_lexicon_hits(doc, _POSITIVE_WORDS)
    negative_count = _count_lexicon_hits(doc, _NEGATIVE_WORDS)

    if positive_count > negative_count:
        return 'positive'
    if negative_count > positive_count:
        return 'negative'
    return 'neutral'

# Label every review with the word-list sentiment classifier.
df['predicted_sentiment'] = df['review'].map(rule_based_sentiment)

# Compare with actual ratings
def rating_to_sentiment(rating):
    """Map a star rating to a sentiment label.

    Ratings of 4 or more are 'positive', ratings of 2 or less are
    'negative', and anything in between is 'neutral'.
    """
    if rating >= 4:
        return 'positive'
    return 'negative' if rating <= 2 else 'neutral'

# Derive the reference label for each review from its star rating.
df['actual_sentiment'] = df['rating'].map(rating_to_sentiment)

print("\nSentiment Analysis Results:")
comparison_columns = ['review', 'rating', 'predicted_sentiment', 'actual_sentiment']
print(df[comparison_columns])

# Visualize sentiment distribution
# value_counts() orders its result by frequency, so a positional colour list
# would paint whichever label is most common green, and the two panels could
# use the same colour for different sentiments. Fix both the bar order and the
# colour of each label so the panels are directly comparable.
sentiment_colors = {'positive': 'green', 'negative': 'red', 'neutral': 'blue'}
sentiment_order = list(sentiment_colors)
bar_colors = [sentiment_colors[label] for label in sentiment_order]

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

panels = [
    ('actual_sentiment', 'Actual Sentiment Distribution'),
    ('predicted_sentiment', 'Predicted Sentiment Distribution'),
]
for ax, (column, title) in zip(axes, panels):
    # reindex() imposes the fixed label order; a label with no reviews is
    # shown as a zero-height bar rather than being dropped.
    counts = df[column].value_counts().reindex(sentiment_order, fill_value=0)
    counts.plot(kind='bar', ax=ax, color=bar_colors)
    ax.set_title(title)
    ax.set_xlabel('Sentiment')
    ax.set_ylabel('Count')

plt.tight_layout()
plt.show()

# Extract and analyze brands and products
def extract_brands_products(entities):
    """Split entity dicts into brand names and product names.

    Entities labelled 'ORG' are treated as brands and entities labelled
    'PRODUCT' as products; any other label is ignored. Returns a
    ``(brands, products)`` tuple of lists holding the entities' 'text'
    values in their original order.
    """
    buckets = {'ORG': [], 'PRODUCT': []}

    for item in entities:
        bucket = buckets.get(item['label'])
        if bucket is not None:
            bucket.append(item['text'])

    return buckets['ORG'], buckets['PRODUCT']

# Split each review's entities into a brands column and a products column.
brand_product_pairs = df['entities'].apply(extract_brands_products)
df['brands'], df['products'] = zip(*brand_product_pairs)

# Flatten the per-review brand lists and tally how often each brand occurs.
all_brands = []
for review_brands in df['brands']:
    all_brands.extend(review_brands)
brand_counts = Counter(all_brands)

print("\nMost Mentioned Brands:")
for brand, count in brand_counts.most_common():
    print(f"{brand}: {count} mentions")

# Collect, per brand, the predicted sentiment of every review that mentions it.
brand_sentiment = {}
for brands_in_review, predicted in zip(df['brands'], df['predicted_sentiment']):
    for brand in brands_in_review:
        brand_sentiment.setdefault(brand, []).append(predicted)

print("\nBrand Sentiment Analysis:")
for brand, sentiments in brand_sentiment.items():
    positive_hits = sentiments.count('positive')
    # Every list has at least one entry, so the division is safe.
    positive_pct = (positive_hits / len(sentiments)) * 100
    print(f"{brand}: {positive_pct:.1f}% positive")

# Print every entity (all labels, not just ORG/PRODUCT) found in the first review.
sample_review = df['review'].iloc[0]
doc = nlp(sample_review)

print("\nSample Review NER Visualization:")
print(f"Review: {sample_review}")
print("Entities found:")
for ent in doc.ents:
    entity_line = f"  - {ent.text} ({ent.label_})"
    print(entity_line)

# Create a summary report
# Share of reviews where the word-list label equals the rating-derived label.
accuracy = (df['predicted_sentiment'] == df['actual_sentiment']).mean()
# Share of reviews that were predicted positive.
positive_share = (df['predicted_sentiment'] == 'positive').mean()

# most_common(1) returns an empty list when no ORG entity was recognised in
# any review; fall back to a placeholder instead of raising IndexError.
top_brands = brand_counts.most_common(1)
most_mentioned_brand = top_brands[0][0] if top_brands else "N/A"

print("\n" + "="*50)
print("NLP ANALYSIS SUMMARY REPORT")
print("="*50)
print(f"Total Reviews Analyzed: {len(df)}")
print(f"Accuracy of Rule-based Sentiment: {accuracy:.2%}")
print(f"Most Mentioned Brand: {most_mentioned_brand}")
print(f"Overall Positive Reviews: {positive_share:.2%}")

ModuleNotFoundError: No module named 'spacy'