In [13]:
import sys
sys.path.append('../src')
import utils
import pickle
import json
import os
from collections import defaultdict
import math
import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.chunk import RegexpParser
from nltk.stem.porter import PorterStemmer
import jellyfish

class AdvancedReviewSummarizerModel:
    """
    Advanced model for review summarization that extracts meaningful phrases with context.
    """
    
    def __init__(self, model_path="advanced_review_model.pkl"):
        """Build an untrained summarizer.

        Args:
            model_path: Pickle location that ``save`` falls back to when it
                is called without an explicit path.
        """
        # Tunable parameters; they are written out again by save().
        self.SUMMARY_SIZE_FACTOR = 3
        self.RF_WEIGHT = 2
        self.LEVENSHTEIN_THRESHOLD = 0.85

        self.model_path = model_path

        # Word -> inverse document frequency; stays empty until train() runs.
        self.idf = {}
        self.stemmer = PorterStemmer()

        # Make sure the NLTK data is present before building the lookup tables.
        self._load_nltk_resources()
        self._initialize_stopwords()
        self._initialize_punctuation()

        # Regex table consumed by _extract_semantic_phrases().
        self._initialize_semantic_patterns()
        
    def _load_nltk_resources(self):
        """Download the required NLTK resources that are not installed locally.

        Presence is checked with ``nltk.data.find``, which only inspects the
        local NLTK data directories.  The previous ``Downloader.is_installed``
        check has to consult the remote package index, so merely constructing
        the model needed network access even when every resource was already
        on disk.
        """
        # Package id -> location of that package inside an NLTK data directory.
        resource_paths = {
            'punkt': 'tokenizers/punkt',
            'averaged_perceptron_tagger': 'taggers/averaged_perceptron_tagger',
            'stopwords': 'corpora/stopwords',
            'maxent_ne_chunker': 'chunkers/maxent_ne_chunker',
            'words': 'corpora/words',
        }

        for resource, data_path in resource_paths.items():
            try:
                nltk.data.find(data_path)
            except LookupError:
                # Only a missing resource triggers a (network) download.
                nltk.download(resource)
    
    def _initialize_stopwords(self):
        """Initialize the stopwords list"""
        # Minimal stopwords for better phrase extraction
        self.stop_words = set(['a', 'an', 'the', 'is', 'are', 'was', 'were', 'be', 'been',
                              'has', 'have', 'had', 'do', 'does', 'did', 'will', 'would',
                              'can', 'could', 'should', 'i', 'me', 'my', 'we', 'our', 'you',
                              'he', 'him', 'his', 'she', 'her', 'it', 'its', 'they', 'them',
                              'this', 'that', 'these', 'those', 'and', 'or', 'but', 'in', 'on',
                              'at', 'to', 'for', 'of', 'with', 'by'])
    
    def _initialize_punctuation(self):
        """Initialize punctuation sets"""
        self.punc = ''',.;:?!'"()[]{}<>|\/@#^&*_~=+\n\t—–-'''
        self.fullstop = '.'
    
    def _initialize_semantic_patterns(self):
        """Initialize semantic patterns for phrase extraction"""
        self.semantic_patterns = [
            # Feature-quality patterns
            (r'(?P<feature>audio|sound|battery|microphone|app|customer)\s+(?P<quality>quality|life|interface|support)', 'FEATURE_QUALITY'),
            
            # Aspect-evaluation patterns
            (r'(?P<aspect>volume|battery|price|design|build|screen)\s+(?P<eval>is\s+)?(?P<quality>great|good|excellent|poor|bad|high|low|enough|not\s+enough)', 'ASPECT_EVALUATION'),
            
            # Problem-solution patterns
            (r'(?P<problem>connection\s+issues?|problems?|noise)\s+(?P<action>fixed|resolved|solved)', 'PROBLEM_SOLUTION'),
            
            # Comfort-usage patterns
            (r'(?P<comfort>comfortable|easy)\s+to\s+(?P<action>wear|use|hold|set\s?up)', 'COMFORT_USAGE'),
            
            # Performance patterns
            (r'(?P<device>microphone|camera)\s+(?P<action>picks?\s?up|captures?)\s+(?P<object>(?:background\s+)?noise|sound)', 'PERFORMANCE'),
            
            # Quality adjective patterns
            (r'(?P<adjective>crisp|clear|sleek|premium|confusing)\s+(?P<feature>audio|sound|design|interface|app)', 'QUALITY_ADJECTIVE'),
            
            # Durability patterns
            (r'(?P<action>scratches?)\s+(?P<manner>easily|quickly)', 'DURABILITY'),
            
            # Value patterns
            (r'(?P<worth>worth\s+the|high)\s+(?P<aspect>price|cost|money)', 'VALUE'),
            
            # Satisfaction patterns
            (r'(?P<feeling>love|like|hate)\s+(?P<feature>the\s+)?(?P<aspect>audio|battery|design|app|interface)', 'SATISFACTION'),
            
            # Comparison patterns
            (r'(?P<comparison>better|worse)\s+than\s+(?P<reference>expected|advertised)', 'COMPARISON'),
            
            # Update patterns
            (r'(?P<type>firmware|software)\s+update\s+(?P<result>fixed|solved|improved)', 'UPDATE_RESULT'),
        ]
        
        # Compile patterns
        self.compiled_patterns = [(re.compile(pattern, re.IGNORECASE), label) for pattern, label in self.semantic_patterns]
    
    def train(self, dataframe):
        """Train the model by computing IDF values"""
        print("Training advanced model...")
        
        vocabulary = set()
        doc_f = defaultdict(lambda: 0)
        
        # Process each review
        for i, row in dataframe.iterrows():
            cleaned_review = self._text_preprocess_clean(row['all_reviews'])
            vocabulary.update(cleaned_review)
            
            unique_words = set(cleaned_review)
            for word in unique_words:
                doc_f[word] += 1
        
        # Calculate IDF
        DOC_COUNT = len(dataframe)
        for word in vocabulary:
            self.idf[word] = math.log10(DOC_COUNT / float(doc_f[word]))
        
        print(f"Model trained on {DOC_COUNT} documents")
        print(f"Vocabulary size: {len(vocabulary)}")
        
        return self
    
    def summarize_new_product(self, reviews):
        """Extract meaningful keyphrases using advanced semantic analysis"""
        if isinstance(reviews, list):
            all_reviews = ' '.join(reviews)
        else:
            all_reviews = reviews
        
        # Extract semantic phrases
        semantic_phrases = self._extract_semantic_phrases(all_reviews)
        
        # Extract contextual phrases
        contextual_phrases = self._extract_contextual_phrases(all_reviews)
        
        # Extract syntactic phrases
        syntactic_phrases = self._extract_syntactic_phrases(all_reviews)
        
        # Combine and score all phrases
        all_phrases = semantic_phrases + contextual_phrases + syntactic_phrases
        
        # Score phrases by importance and frequency
        phrase_scores = self._score_phrases(all_phrases, all_reviews)
        
        # Get top 10 unique phrases
        top_phrases = self._get_top_phrases(phrase_scores, 10)
        
        return top_phrases
    
    def _extract_semantic_phrases(self, text):
        """Extract phrases using semantic patterns"""
        phrases = []
        sentences = sent_tokenize(text)
        
        for sentence in sentences:
            for pattern, label in self.compiled_patterns:
                matches = pattern.finditer(sentence)
                for match in matches:
                    # Extract the matched groups
                    groups = match.groupdict()
                    
                    # Construct meaningful phrase based on pattern type
                    if label == 'FEATURE_QUALITY':
                        phrase = f"{groups['feature']} {groups['quality']}"
                    elif label == 'ASPECT_EVALUATION':
                        if 'quality' in groups:
                            phrase = f"{groups['aspect']} is {groups['quality']}"
                        else:
                            phrase = f"{groups['aspect']} {groups.get('eval', '')}"
                    elif label == 'PROBLEM_SOLUTION':
                        phrase = f"{groups['problem']} {groups['action']}"
                    elif label == 'COMFORT_USAGE':
                        phrase = f"{groups['comfort']} to {groups['action']}"
                    elif label == 'PERFORMANCE':
                        phrase = f"{groups['device']} {groups['action']} {groups['object']}"
                    elif label == 'QUALITY_ADJECTIVE':
                        phrase = f"{groups['adjective']} {groups['feature']}"
                    elif label == 'DURABILITY':
                        phrase = f"{groups['action']} {groups['manner']}"
                    elif label == 'VALUE':
                        phrase = f"{groups['worth']} {groups['aspect']}"
                    elif label == 'SATISFACTION':
                        phrase = f"{groups['feeling']} {groups.get('feature', '')} {groups['aspect']}"
                    elif label == 'COMPARISON':
                        phrase = f"{groups['comparison']} than {groups['reference']}"
                    elif label == 'UPDATE_RESULT':
                        phrase = f"{groups['type']} update {groups['result']}"
                    
                    # Clean and validate phrase
                    phrase = self._clean_phrase(phrase)
                    if phrase:
                        phrases.append((phrase, label))
        
        return phrases
    
    def _extract_contextual_phrases(self, text):
        """Grow phrases around domain nouns using their POS-tagged neighbours.

        For every token that is one of the important nouns, up to two
        directly preceding adjective/adverb tokens and up to two directly
        following noun/preposition tokens are attached.  Expansion on each
        side stops at the first token with a different tag.

        Returns:
            A list of ``(phrase, 'CONTEXTUAL')`` tuples for phrases made of
            at least two tokens.
        """
        important_nouns = {'battery', 'audio', 'sound', 'quality', 'life', 'design',
                           'microphone', 'app', 'interface', 'customer', 'support',
                           'price', 'value', 'connection', 'volume', 'screen',
                           'setup', 'update', 'noise', 'comfort', 'build'}
        modifier_tags = ('JJ', 'RB')
        follower_tags = ('NN', 'IN')

        found = []
        for sentence in sent_tokenize(text):
            tagged = nltk.pos_tag(word_tokenize(sentence.lower()))

            for idx, (token, _tag) in enumerate(tagged):
                if token not in important_nouns:
                    continue

                # Walk left over at most two adjacent adjectives/adverbs.
                before = []
                for prev_token, prev_tag in reversed(tagged[max(0, idx - 2):idx]):
                    if not prev_tag.startswith(modifier_tags):
                        break
                    before.insert(0, prev_token)

                # Walk right over at most two adjacent nouns/prepositions.
                after = []
                for next_token, next_tag in tagged[idx + 1:idx + 3]:
                    if not next_tag.startswith(follower_tags):
                        break
                    after.append(next_token)

                parts = before + [token] + after
                if len(parts) < 2:
                    continue

                cleaned = self._clean_phrase(' '.join(parts))
                if cleaned:
                    found.append((cleaned, 'CONTEXTUAL'))

        return found
    
    def _extract_syntactic_phrases(self, text):
        """Extract phrases using syntactic chunking"""
        phrases = []
        sentences = sent_tokenize(text)
        
        # Define chunking grammar
        grammar = r"""
        NP: {<DT|JJ|NN.*>+}          # chunk sequences of DT, JJ, NN
        VP: {<VB.*><RB>*<NP>}        # verb + optional adverb + noun phrase
        ADJ_NP: {<JJ><NN>}           # adjective + noun
        """
        
        cp = RegexpParser(grammar)
        
        for sentence in sentences:
            words = word_tokenize(sentence.lower())
            pos_tags = nltk.pos_tag(words)
            
            # Parse the sentence
            tree = cp.parse(pos_tags)
            
            # Extract noun phrases
            for subtree in tree:
                if isinstance(subtree, nltk.Tree):
                    if subtree.label() in ['NP', 'ADJ_NP']:
                        phrase = ' '.join(word for word, tag in subtree.leaves())
                        phrase = self._clean_phrase(phrase)
                        if phrase and len(phrase.split()) >= 2:
                            phrases.append((phrase, 'SYNTACTIC'))
        
        return phrases
    
    def _score_phrases(self, phrases, text):
        """Score phrases based on multiple factors"""
        phrase_scores = defaultdict(float)
        
        for phrase, phrase_type in phrases:
            # Base frequency score
            frequency = text.lower().count(phrase.lower())
            score = frequency * 2.0  # Base score multiplier
            
            # Type-based scoring
            if phrase_type == 'FEATURE_QUALITY':
                score *= 3.0  # Highest priority for feature-quality phrases
            elif phrase_type in ['PROBLEM_SOLUTION', 'UPDATE_RESULT']:
                score *= 2.5  # High priority for problems and solutions
            elif phrase_type in ['PERFORMANCE', 'COMFORT_USAGE']:
                score *= 2.0  # Medium priority for performance descriptions
            elif phrase_type == 'SEMANTIC':
                score *= 1.8
            elif phrase_type == 'CONTEXTUAL':
                score *= 1.5
            else:
                score *= 1.2
            
            # Length-based scoring
            word_count = len(phrase.split())
            if word_count == 2:
                score *= 1.2  # Sweet spot for two-word phrases
            elif word_count == 3:
                score *= 1.1
            
            # Keyword importance
            important_keywords = {'quality', 'battery', 'life', 'audio', 'sound', 'microphone', 
                                'connection', 'interface', 'support', 'customer', 'design', 
                                'comfort', 'volume', 'update', 'fixed', 'issue', 'problem'}
            
            for keyword in important_keywords:
                if keyword in phrase.lower():
                    score *= 1.3
            
            phrase_scores[phrase] = score
        
        return phrase_scores
    
    def _get_top_phrases(self, phrase_scores, n):
        """Get top N unique phrases"""
        # Sort by score
        sorted_phrases = sorted(phrase_scores.items(), key=lambda x: x[1], reverse=True)
        
        # Remove overlapping phrases (keep the highest scoring)
        final_phrases = []
        seen_words = set()
        
        for phrase, score in sorted_phrases:
            words = set(phrase.lower().split())
            
            # Check if this phrase is too similar to already selected ones
            is_similar = False
            for selected_phrase in final_phrases:
                selected_words = set(selected_phrase.lower().split())
                overlap = len(words.intersection(selected_words))
                
                # Consider phrases similar if they share more than 50% of words
                if overlap > len(words) * 0.5 or overlap > len(selected_words) * 0.5:
                    is_similar = True
                    break
            
            if not is_similar and len(final_phrases) < n:
                final_phrases.append(phrase)
        
        return final_phrases
    
    def _clean_phrase(self, phrase):
        """Clean and normalize a phrase"""
        if not phrase:
            return None
        
        # Remove extra spaces
        phrase = re.sub(r'\s+', ' ', phrase).strip()
        
        # Remove phrases that are too short or just punctuation
        if len(phrase) < 3 or all(c in self.punc for c in phrase):
            return None
        
        # Remove leading/trailing punctuation
        phrase = phrase.strip(self.punc)
        
        # Normalize common variations
        replacements = {
            'apps': 'app',
            'batteries': 'battery',
            'interfaces': 'interface',
            'noises': 'noise',
            'connections': 'connection',
            'microphones': 'microphone',
            'issues': 'issue',
            'problems': 'problem',
            'updates': 'update'
        }
        
        for old, new in replacements.items():
            phrase = phrase.replace(old, new)
        
        return phrase if phrase else None
    
    def _text_preprocess_clean(self, review):
        """Clean and tokenize text for IDF calculation"""
        # Basic cleaning
        review = re.sub(r'[^\w\s]', ' ', review)
        review = review.lower()
        review = re.sub(r'\s+', ' ', review)
        
        # Tokenize
        tokens = word_tokenize(review)
        
        # Remove very short tokens and numbers
        tokens = [t for t in tokens if len(t) > 2 and not t.isdigit()]
        
        return tokens
    
    def save(self, path=None):
        """Save the trained model to disk"""
        if path is None:
            path = self.model_path
            
        model_data = {
            'idf': self.idf,
            'stop_words': self.stop_words,
            'punc': self.punc,
            'fullstop': self.fullstop,
            'semantic_patterns': self.semantic_patterns,
            'parameters': {
                'SUMMARY_SIZE_FACTOR': self.SUMMARY_SIZE_FACTOR,
                'RF_WEIGHT': self.RF_WEIGHT,
                'LEVENSHTEIN_THRESHOLD': self.LEVENSHTEIN_THRESHOLD
            }
        }
        
        with open(path, 'wb') as f:
            pickle.dump(model_data, f)
        
        print(f"Model saved to {path}")
    
    @classmethod
    def load(cls, path="advanced_review_model.pkl"):
        """Restore a model previously written by ``save``.

        Args:
            path: Pickle file to read; it also becomes the new instance's
                ``model_path``.

        Returns:
            A new instance carrying the stored state.
        """
        model = cls(model_path=path)

        # NOTE: unpickling can execute arbitrary code embedded in the file,
        # so only load model files from a trusted source.
        with open(path, 'rb') as handle:
            model_data = pickle.load(handle)

        for attribute in ('idf', 'stop_words', 'punc', 'fullstop', 'semantic_patterns'):
            setattr(model, attribute, model_data[attribute])

        # Only the regex sources are persisted, so compile them again.
        recompiled = [
            (re.compile(regex, re.IGNORECASE), tag)
            for regex, tag in model.semantic_patterns
        ]
        model.compiled_patterns = recompiled

        params = model_data['parameters']
        for name in ('SUMMARY_SIZE_FACTOR', 'RF_WEIGHT', 'LEVENSHTEIN_THRESHOLD'):
            setattr(model, name, params[name])

        print(f"Model loaded from {path}")
        print(f"Vocabulary size: {len(model.idf)}")

        return model


# Usage example: train on the review corpus, persist the model, reload it and
# summarize a handful of new reviews.
if __name__ == "__main__":
    # Already imported at the top of this cell; repeated here so the demo
    # block states its own dependency.
    import utils
    
    # Training phase (one-time)
    print("Loading training data...")
    file2 = os.path.join("data", 'asin_numreviews_allreview.csv')
    # presumably returns a pandas DataFrame with an 'all_reviews' column,
    # which train() reads -- verify against utils.csv_to_dataframe
    df_allreview = utils.csv_to_dataframe(file2)
    
    # Create and train model, then write it to the default model_path
    model = AdvancedReviewSummarizerModel()
    model.train(df_allreview)
    model.save()
    
    # Load trained model (for future use) from the default path
    loaded_model = AdvancedReviewSummarizerModel.load()
    
    # Summarize new product reviews
    new_reviews = [
        "This phone case is really good! It's durable and protective.",
        "I love the design, but it's a bit bulky for my pocket.",
        "Great value for money. The screen protector works perfectly."
    ]
    
    
    summary = loaded_model.summarize_new_product(new_reviews)
    print("\nSummary for new product:")
    # One keyphrase per line, best first
    for phrase in summary:
        print(phrase)

  self.punc = ''',.;:?!'"()[]{}<>|\/@#^&*_~=+\n\t—–-'''


Loading training data...
Training advanced model...
Model trained on 129396 documents
Vocabulary size: 220849
Model saved to advanced_review_model.pkl
Model loaded from advanced_review_model.pkl
Vocabulary size: 220849

Summary for new product:
screen protector
love the design
great value for money
this phone case


In [14]:
# For Untitled1.ipynb
#
# Prerequisite: the ReviewSummarizerModel class must be available in this
# notebook. The simple option is to copy the entire ReviewSummarizerModel
# class definition from the artifact above and paste it directly into a cell
# of this notebook; the alternative is to import it.
#
# Once the class is available (pasted, or the import works), use the model:
try:
    model = ReviewSummarizerModel.load("review_summarizer_model.pkl")

    new_product_reviews = [
        "The product arrived two days late and the packaging was torn, but the item itself was undamaged.",
        "Works flawlessly with my iPhone but refuses to pair with my Android tablet.",
        "Feels durable and has survived a few accidental drops without any damage.",
        "Interface is smooth and fast way better than my previous device.",
        "Disappointed that it doesn’t support voice commands as advertised.",
        "Great for everyday tasks but struggles with anything resource heavy.",
        "Really appreciated the free accessories included in the box!",
        "Overheats when used for more than an hour straight  needs better ventilation.",
        "Setup was intuitive and the user manual was actually helpful.",
        "Not bad overall, but I expected better performance for the price.",
    ]

    summary = model.summarize_new_product(new_product_reviews)

    # Numbered list of keyphrases, best first.
    print("\nSummary keyphrases:")
    for i, phrase in enumerate(summary, start=1):
        print(f"{i}. {phrase}")
except Exception as e:
    # Demo cell: report any failure (missing class, missing pickle, ...)
    # instead of showing a traceback.
    print("Error:", e)

Model loaded from review_summarizer_model.pkl
Vocabulary size: 194095

Summary keyphrases:
1. better performance
2. support voice
3. previous device
4. android tablet
5. better ventilation
6. fast way
7. everyday tasks
8. resource heavy
9. user manual
10. voice commands
