# In [None]:
import spacy
import pandas as pd
from collections import Counter
import re

# Load the small English spaCy pipeline used by the analyzer below.
# Install it first with: python -m spacy download en_core_web_sm
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # IOError (caught originally) is just an alias of OSError in Python 3.
    # Raise SystemExit instead of calling exit(): exit() is an interactive-shell
    # helper injected by the `site` module (absent under `python -S`) and it
    # terminates with status 0.  SystemExit with a message prints the message
    # to stderr and ends the process with a non-zero status, so callers and
    # shells can tell that start-up failed.
    raise SystemExit(
        "Please install the English model: python -m spacy download en_core_web_sm"
    ) from None

# Sample Amazon product reviews data.
# Ten short free-text reviews used as demo input by the `__main__` section at
# the bottom of the file.  Each one mentions a brand listed in
# `AmazonReviewAnalyzer.brand_patterns`, and the set mixes enthusiastic,
# lukewarm and negative wording so every sentiment branch gets exercised.
sample_reviews = [
    "I absolutely love my new iPhone 14! Apple has outdone themselves with this product. The camera quality is amazing and the battery life is fantastic.",
    "The Samsung Galaxy S23 is okay but I expected better. The screen is nice but Samsung could improve the software experience.",
    "Terrible experience with this Sony headphones. The sound quality is poor and they broke after just one week. Would not recommend Sony products.",
    "Amazing Nike Air Max sneakers! Nike always delivers quality products. Very comfortable and stylish. Highly recommend!",
    "The Canon EOS R5 camera is professional grade. Canon has created an excellent tool for photographers. Worth every penny!",
    "Disappointed with my new MacBook Pro. Apple used to make better laptops. This one overheats and the keyboard feels cheap.",
    "Love my new AirPods Pro! Apple's noise cancellation technology is incredible. Best purchase I've made this year.",
    "The Dell XPS 13 laptop is decent. Dell has good build quality but the price is a bit high for what you get.",
    "Awful Microsoft Surface tablet. Microsoft needs to improve their hardware quality. Screen flickering issues from day one.",
    "Excellent Amazon Echo Dot! Amazon's Alexa is so helpful. Great value for money and works perfectly with my smart home setup."
]

class AmazonReviewAnalyzer:
    """Rule-based entity extraction and sentiment scoring for product reviews.

    The analyzer combines a spaCy pipeline (tokenisation and named entities)
    with small hand-written lexicons: sentiment word lists, a list of known
    brand names and a regex heuristic for product names.
    """

    # Words that, as a whole word, mark a capitalised phrase as a product name.
    _PRODUCT_KEYWORDS = frozenset({
        'pro', 'max', 'plus', 'air', 'mini', 'ultra', 'echo', 'galaxy',
        'iphone', 'macbook', 'surface', 'xps',
    })

    # Capitalised sentence openers that are never part of a product name.
    _LEADING_FILLER = frozenset({
        'the', 'a', 'an', 'i', 'my', 'our', 'this', 'that', 'these', 'those',
    })

    # Tokens that invert the polarity of a nearby sentiment word.
    _NEGATION_WORDS = frozenset({
        'not', 'no', 'never', 'nothing', 'nowhere', 'nobody', 'none',
        'neither', "n't", 'dont', "don't",
    })

    # A token made up only of these characters ends the scope of a negation.
    _CLAUSE_BREAKS = '.,;:!?'

    # A product phrase starts with a capitalised or camel-case word ("Galaxy",
    # "iPhone") and continues with further such words or model numbers ("14",
    # "S23").  Lower-case words end the phrase, so whole sentence fragments
    # are no longer captured.
    _NAME_WORD = r"(?:[A-Z]|[a-z]+[A-Z])[A-Za-z0-9\-]*"
    _FOLLOW_WORD = r"(?:[A-Z]|[a-z]+[A-Z]|\d)[A-Za-z0-9\-]*"
    _PRODUCT_RE = re.compile(
        rf"(?<![A-Za-z0-9\-]){_NAME_WORD}(?:[ \t]+{_FOLLOW_WORD})*"
    )

    def __init__(self, nlp_model=None):
        """Set up the pipeline and the lexicons.

        Args:
            nlp_model: Callable that turns a string into a spaCy-like document
                (iterable of tokens with ``.text`` and an ``.ents`` attribute).
                Defaults to the module-level ``nlp`` pipeline.
        """
        self.nlp = nlp if nlp_model is None else nlp_model

        # Define positive and negative sentiment words
        self.positive_words = {
            'love', 'amazing', 'excellent', 'fantastic', 'great', 'wonderful',
            'perfect', 'outstanding', 'brilliant', 'awesome', 'incredible',
            'superb', 'remarkable', 'impressive', 'beautiful', 'comfortable',
            'stylish', 'recommend', 'best', 'quality', 'helpful', 'value'
        }

        self.negative_words = {
            'hate', 'terrible', 'awful', 'horrible', 'bad', 'poor', 'worst',
            'disappointing', 'useless', 'broken', 'cheap', 'overpriced',
            'disappointed', 'issues', 'problems', 'flickering', 'overheats',
            'expensive', 'slow', 'laggy', 'defective', 'failed', 'annoying',
            'frustrating', 'waste', 'regret', 'faulty', 'unreliable'
        }

        # Brand patterns for better extraction
        self.brand_patterns = {
            'Apple', 'Samsung', 'Sony', 'Nike', 'Canon', 'Dell',
            'Microsoft', 'Amazon', 'Google', 'LG', 'HP', 'Lenovo',
            'Adidas', 'Puma', 'Nikon', 'Panasonic', 'Bose', 'JBL'
        }

    def extract_entities(self, text):
        """Extract named entities from text with focus on products and brands.

        Args:
            text: The raw review text.

        Returns:
            A dict with the keys ``PERSON``, ``ORG``, ``PRODUCT``, ``BRANDS``,
            ``MONEY`` and ``GPE``, each mapping to a list of strings.
            ``BRANDS`` is alphabetically sorted and free of duplicates;
            ``PRODUCT`` holds spaCy's PRODUCT entities followed by the
            phrases found by the product-name heuristic.
        """
        doc = self.nlp(text)

        entities = {
            'PERSON': [],
            'ORG': [],  # Organizations (often brands)
            'PRODUCT': [],  # Products
            'BRANDS': [],  # Detected brands
            'MONEY': [],
            'GPE': []  # Geopolitical entities
        }

        # Extract standard spaCy entities
        for ent in doc.ents:
            if ent.label_ in entities:
                entities[ent.label_].append(ent.text.strip())

        # Custom brand detection (case-insensitive).  Brands must match as
        # whole words: plain substring search reported "HP" for "touchpad"
        # and "LG" for "algorithm".  Sorting makes the result order stable
        # across runs (set iteration order is not).
        for brand in sorted(set(self.brand_patterns)):
            whole_word = rf"(?<!\w){re.escape(brand)}(?!\w)"
            if re.search(whole_word, text, re.IGNORECASE):
                entities['BRANDS'].append(brand)

        # Sentence openers and sentiment adjectives are capitalised at the
        # start of a sentence but are not part of the product name.
        filler = self._LEADING_FILLER.union(self.positive_words, self.negative_words)

        for match in self._PRODUCT_RE.finditer(text):
            words = match.group().split()
            while words and (words[0].lower() in filler or words[0][0].isdigit()):
                words.pop(0)
            if not words:
                continue

            candidate = ' '.join(words).strip('-')
            has_digit = any(char.isdigit() for char in candidate)

            # Skip single words that are likely not products
            if len(words) == 1 and not has_digit:
                continue

            # Likely a product when it carries a model number or a known
            # product keyword (compared word by word, not as a substring, so
            # "pro" no longer fires on "product").
            has_keyword = any(word.lower() in self._PRODUCT_KEYWORDS for word in words)
            if (has_digit or has_keyword) and candidate not in entities['PRODUCT']:
                entities['PRODUCT'].append(candidate)

        return entities

    def analyze_sentiment(self, text):
        """Score the sentiment of ``text`` with lexicon and negation rules.

        Every token found in the positive or negative lexicon counts one
        point.  A negation word flips the first sentiment word among the next
        three tokens, in either direction ("not good" is negative, "not bad"
        is positive); clause punctuation ends the negation's reach so it
        cannot leak into the following sentence.  The phrases
        "not/don't/wouldn't recommend" add two negative points and
        "highly/definitely recommend" add two positive points.

        Args:
            text: The raw review text.

        Returns:
            A dict with ``sentiment`` ("POSITIVE", "NEGATIVE" or "NEUTRAL"),
            ``confidence`` (share of the winning score, rounded to two
            decimals; 0.5 for NEUTRAL), ``positive_score`` and
            ``negative_score``.
        """
        text_lower = text.lower()
        doc = self.nlp(text_lower)

        positive_score = 0
        negative_score = 0
        negation_reach = 0  # how many more tokens the last negation covers

        for token in doc:
            word = token.text

            if word in self._NEGATION_WORDS:
                negation_reach = 3
                continue
            if not word.strip(self._CLAUSE_BREAKS):
                # Pure punctuation: the negation does not cross this point.
                negation_reach = 0
                continue

            if word in self.positive_words:
                is_positive = True
            elif word in self.negative_words:
                is_positive = False
            else:
                if negation_reach:
                    negation_reach -= 1
                continue

            if negation_reach:
                # Only the first sentiment word after a negation is flipped.
                is_positive = not is_positive
                negation_reach = 0

            if is_positive:
                positive_score += 1
            else:
                negative_score += 1

        # Check for multi-word phrases like "not recommend"
        if "not recommend" in text_lower or "don't recommend" in text_lower or "wouldn't recommend" in text_lower:
            negative_score += 2
        if "highly recommend" in text_lower or "definitely recommend" in text_lower:
            positive_score += 2

        # Determine overall sentiment; ties (including 0:0) are NEUTRAL.
        total_words = positive_score + negative_score
        if positive_score > negative_score:
            sentiment = "POSITIVE"
            confidence = positive_score / total_words
        elif negative_score > positive_score:
            sentiment = "NEGATIVE"
            confidence = negative_score / total_words
        else:
            sentiment = "NEUTRAL"
            confidence = 0.5

        return {
            'sentiment': sentiment,
            'confidence': round(confidence, 2),
            'positive_score': positive_score,
            'negative_score': negative_score
        }

    def analyze_review(self, review_text):
        """Run entity extraction and sentiment analysis on a single review.

        Args:
            review_text: The raw review text.

        Returns:
            A dict with the keys ``review`` (the input text), ``entities``
            (result of ``extract_entities``) and ``sentiment`` (result of
            ``analyze_sentiment``).
        """
        return {
            'review': review_text,
            'entities': self.extract_entities(review_text),
            'sentiment': self.analyze_sentiment(review_text)
        }

    def analyze_multiple_reviews(self, reviews):
        """Analyze several reviews, print a report and return the results.

        A per-review section and a summary (sentiment distribution, five most
        mentioned brands and products) are printed to stdout.

        Args:
            reviews: Iterable of review strings.  Any iterable works,
                including generators; it is consumed once.

        Returns:
            A list with one ``analyze_review`` result per review, in input
            order.
        """
        results = []
        all_brands = []
        all_products = []
        sentiment_counts = Counter()

        print("=== AMAZON REVIEWS NLP ANALYSIS ===\n")

        for i, review in enumerate(reviews, 1):
            result = self.analyze_review(review)
            results.append(result)

            # Only mark the preview as truncated when it really is.
            preview = review if len(review) <= 100 else review[:100] + "..."

            print(f"REVIEW {i}:")
            print(f"Text: {preview}")
            print(f"Sentiment: {result['sentiment']['sentiment']} (Confidence: {result['sentiment']['confidence']})")

            # Display entities (sorted so the report is reproducible)
            entities = result['entities']
            if entities['BRANDS']:
                print(f"Brands: {', '.join(sorted(set(entities['BRANDS'])))}")
                all_brands.extend(entities['BRANDS'])

            if entities['PRODUCT']:
                print(f"Products: {', '.join(sorted(set(entities['PRODUCT'])))}")
                all_products.extend(entities['PRODUCT'])

            if entities['ORG']:
                print(f"Organizations: {', '.join(sorted(set(entities['ORG'])))}")

            sentiment_counts[result['sentiment']['sentiment']] += 1
            print("-" * 60)

        # Summary statistics.  Count the collected results instead of calling
        # len() on the input, which fails for generators.
        total_reviews = len(results)
        print("\n=== ANALYSIS SUMMARY ===")
        print(f"Total Reviews Analyzed: {total_reviews}")

        print("\nSentiment Distribution:")
        for sentiment, count in sentiment_counts.items():
            percentage = (count / total_reviews) * 100
            print(f"  {sentiment}: {count} ({percentage:.1f}%)")

        if all_brands:
            print("\nMost Mentioned Brands:")
            for brand, count in Counter(all_brands).most_common(5):
                print(f"  {brand}: {count} mentions")

        if all_products:
            print("\nMost Mentioned Products:")
            for product, count in Counter(all_products).most_common(5):
                print(f"  {product}: {count} mentions")

        return results

# Demo entry point: analyze the bundled sample reviews, then show a detailed
# breakdown for one additional review.
if __name__ == "__main__":
    analyzer = AmazonReviewAnalyzer()

    # Analyze sample reviews (prints a per-review report and a summary)
    results = analyzer.analyze_multiple_reviews(sample_reviews)

    # Example of analyzing a single review
    print("\n" + "="*60)
    print("SINGLE REVIEW DETAILED ANALYSIS:")
    print("="*60)

    single_review = "The new Apple iPhone 14 Pro Max is absolutely incredible! Apple has created the best smartphone ever. The camera quality is outstanding and the A16 chip makes everything lightning fast. Highly recommend this product to everyone!"

    single_result = analyzer.analyze_review(single_review)
    sentiment_info = single_result['sentiment']

    print(f"Review: {single_result['review']}")
    print("\nSentiment Analysis:")
    print(f"  Sentiment: {sentiment_info['sentiment']}")
    print(f"  Confidence: {sentiment_info['confidence']}")
    print(f"  Positive Words Found: {sentiment_info['positive_score']}")
    print(f"  Negative Words Found: {sentiment_info['negative_score']}")

    print("\nNamed Entity Recognition:")
    for entity_type, entities in single_result['entities'].items():
        if entities:
            # sorted() keeps the output identical from run to run; joining a
            # bare set prints in an order that depends on string hashing.
            print(f"  {entity_type}: {', '.join(sorted(set(entities)))}")