In [1]:
import sys
sys.path.append('../src')
import utils
import pickle
import json
import os
from collections import defaultdict, Counter
import math
import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.chunk import RegexpParser
from nltk.stem.porter import PorterStemmer
from nltk.corpus import wordnet
import jellyfish
import spacy

class SophisticatedReviewSummarizerModel:
    """
    Sophisticated model for review summarization that understands context and extracts truly meaningful phrases.
    """
    
    def __init__(self, model_path="sophisticated_review_model.pkl"):
        """Set up NLTK resources, word lists, parameters, patterns and spaCy.

        Args:
            model_path: Default file path used by ``save`` when it is called
                without an explicit path.
        """
        self.model_path = model_path

        # Initialize core components
        self.idf = {}  # word -> IDF value; filled by train() or load()
        self.stemmer = PorterStemmer()

        # Load NLTK resources
        self._load_nltk_resources()
        self._initialize_stopwords()
        self._initialize_punctuation()

        # Parameters
        self.SUMMARY_SIZE_FACTOR = 3
        self.RF_WEIGHT = 2
        self.LEVENSHTEIN_THRESHOLD = 0.85

        # Initialize semantic patterns and dependencies
        self._initialize_semantic_patterns()
        self._initialize_aspect_dictionaries()

        # Try to load spacy model (fallback if not available).
        # Loading stays best-effort, but only ordinary errors are caught so
        # that KeyboardInterrupt / SystemExit are never swallowed here.
        try:
            self.nlp = spacy.load("en_core_web_sm")
        except Exception:
            print("Spacy model not found. Some features may be limited.")
            self.nlp = None
        
    def _load_nltk_resources(self):
        """Fetch every required NLTK package that is not installed yet."""
        downloader = nltk.downloader.Downloader()
        required = ('punkt', 'averaged_perceptron_tagger', 'stopwords',
                    'maxent_ne_chunker', 'words', 'wordnet')

        for package in required:
            # Already present: nothing to download for this package.
            if downloader.is_installed(package):
                continue
            downloader.download(package)
    
    def _initialize_stopwords(self):
        """Initialize the stopwords list with product review specific words"""
        # Minimal stopwords to preserve context
        self.stop_words = set(['a', 'an', 'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'])
        
        # Words to exclude from keyphrases
        self.filter_words = set(['get', 'got', 'getting', 'gets', 'make', 'makes', 'making', 'made',
                                'go', 'goes', 'going', 'went', 'come', 'comes', 'coming', 'came',
                                'thing', 'things', 'stuff', 'bit', 'lot', 'very', 'really', 'just',
                                'like', 'know', 'think', 'want', 'wanted', 'need', 'needed'])
    
    def _initialize_punctuation(self):
        """Initialize punctuation sets"""
        self.punc = ''',.;:?!'"()[]{}<>|\/@#^&*_~=+\n\t—–-•'''
        self.fullstop = '.'
    
    def _initialize_semantic_patterns(self):
        """Initialize semantic patterns for phrase extraction"""
        # Enhanced semantic patterns with more specificity
        self.semantic_patterns = [
            # Product attributes with descriptors
            (r'(?P<descriptor>crisp|clear|tinny|muffled|loud|soft|sharp)\s+(?P<attribute>audio|sound|music|voice)', 'SOUND_QUALITY'),
            (r'(?P<attribute>audio|sound|music)\s+(?P<descriptor>quality|clarity|is\s+(?:crisp|clear|muffled|tinny))', 'SOUND_QUALITY'),
            
            # Battery performance with context
            (r'(?P<attribute>battery|charge)\s+(?P<performance>life|lasts|duration)\s+(?P<descriptor>great|amazing|excellent|poor|short|long|all\s+day)', 'BATTERY_PERFORMANCE'),
            (r'(?P<duration>all\s+day|hours?|days?)\s+(?P<performance>battery|charge|power)', 'BATTERY_DURATION'),
            
            # Connection and connectivity issues
            (r'(?P<issue>connection|connectivity|pairing)\s+(?P<problem>issues?|problems?|difficulties|dropped)', 'CONNECTION_ISSUES'),
            (r'(?P<result>fixed|resolved|solved)\s+(?P<issue>connection|connectivity|pairing)', 'CONNECTION_RESOLVED'),
            
            # Comfort and wearability
            (r'(?P<comfort>comfortable|uncomfortable)\s+to\s+(?P<action>wear|use|hold)', 'COMFORT_LEVEL'),
            (r'(?P<comfort>no|zero|minimal)\s+(?P<issue>(?:ear\s+)?fatigue|discomfort)', 'COMFORT_POSITIVE'),
            
            # User interface experiences
            (r'(?P<interface>app|interface|software)\s+(?P<quality>confusing|intuitive|user-friendly|difficult|easy)', 'INTERFACE_QUALITY'),
            (r'(?P<missing>lacks?|missing)\s+(?P<feature>basic\s+features?|functionality)', 'INTERFACE_ISSUES'),
            
            # Customer support experiences
            (r'(?P<service>customer\s+support|tech\s+support|service)\s+(?P<quality>friendly|helpful|quick|slow|poor)', 'SUPPORT_QUALITY'),
            (r'(?P<action>resolved?|fixed|helped)\s+(?P<object>my\s+)?(?P<issue>issue|problem)', 'SUPPORT_RESOLUTION'),
            
            # Price and value perception
            (r'(?P<value>worth\s+the|overpriced|expensive|cheap|affordable)\s+(?P<aspect>price|cost|money)', 'VALUE_PERCEPTION'),
            (r'(?P<price>price)\s+(?P<evaluation>is\s+)?(?P<adjective>high|low|fair|reasonable)', 'PRICE_EVALUATION'),
            
            # Durability and build quality
            (r'(?P<action>scratches?|dents?|cracks?)\s+(?P<manner>easily|quickly|readily)', 'DURABILITY_ISSUE'),
            (r'(?P<quality>build|construction)\s+(?P<evaluation>quality|is\s+(?:good|poor|excellent|solid))', 'BUILD_QUALITY'),
            
            # Microphone performance
            (r'(?P<device>microphone?|mic)\s+(?P<action>picks?\s?up|captures?)\s+(?P<issue>background\s+noise|ambient\s+sounds?)', 'MIC_NOISE_ISSUE'),
            (r'(?P<device>microphone?|mic)\s+(?P<quality>quality|performance)\s+(?P<evaluation>good|poor|excellent|bad)', 'MIC_QUALITY'),
            
            # Setup and installation
            (r'(?P<process>setup|installation|configuration)\s+(?P<evaluation>seamless|easy|difficult|complicated)', 'SETUP_EXPERIENCE'),
            (r'(?P<evaluation>easy|hard|difficult)\s+to\s+(?P<action>set\s?up|install|configure)', 'SETUP_EASE'),
            
            # Volume and loudness
            (r'(?P<issue>volume|loudness)\s+(?P<evaluation>not\s+enough|insufficient|too\s+low|perfect)\s+(?P<context>outdoors?|in\s+public)?', 'VOLUME_ISSUE'),
            (r'(?P<evaluation>loud\s+enough|too\s+quiet|perfect\s+volume)', 'VOLUME_EVALUATION'),
            
            # Software updates and fixes
            (r'(?P<type>firmware|software)\s+(?P<action>update)\s+(?P<result>fixed|solved|improved|broke)', 'UPDATE_RESULT'),
            (r'(?P<action>update)\s+(?P<result>fixed|resolved)\s+(?P<issue>everything|issues?|problems?)', 'UPDATE_FIX'),
        ]
        
        # Compile patterns
        self.compiled_patterns = [(re.compile(pattern, re.IGNORECASE), label) for pattern, label in self.semantic_patterns]
    
    def _initialize_aspect_dictionaries(self):
        """Initialize dictionaries for product aspects and their modifiers"""
        # Product aspects and related words
        self.product_aspects = {
            'sound': ['audio', 'music', 'voice', 'volume', 'sound', 'noise'],
            'battery': ['battery', 'charge', 'power', 'juice'],
            'design': ['design', 'look', 'appearance', 'aesthetics', 'build'],
            'comfort': ['comfort', 'fit', 'wear', 'ergonomic'],
            'interface': ['app', 'interface', 'software', 'ui', 'menu'],
            'support': ['support', 'service', 'help', 'assistance'],
            'connectivity': ['connection', 'bluetooth', 'wireless', 'pairing', 'connect'],
            'microphone': ['microphone', 'mic', 'calling', 'call'],
            'durability': ['durability', 'build', 'quality', 'sturdy', 'solid'],
            'setup': ['setup', 'installation', 'configuration', 'install'],
            'price': ['price', 'cost', 'value', 'money', 'expensive', 'cheap'],
            'performance': ['performance', 'speed', 'response', 'lag']
        }
        
        # Sentiment modifiers
        self.sentiment_modifiers = {
            'positive': ['excellent', 'great', 'amazing', 'fantastic', 'superb', 'wonderful', 
                        'perfect', 'love', 'best', 'impressive', 'outstanding', 'superior'],
            'negative': ['poor', 'bad', 'terrible', 'awful', 'horrible', 'worst', 'disappointing',
                        'frustrating', 'annoying', 'hate', 'subpar', 'mediocre'],
            'neutral': ['okay', 'fine', 'average', 'decent', 'adequate', 'acceptable']
        }
    
    def summarize_new_product(self, reviews):
        """Extract meaningful keyphrases using sophisticated analysis"""
        if isinstance(reviews, list):
            all_reviews = ' '.join(reviews)
        else:
            all_reviews = reviews
        
        # Multiple extraction methods
        semantic_phrases = self._extract_semantic_phrases(all_reviews)
        dependency_phrases = self._extract_dependency_phrases(all_reviews) if self.nlp else []
        contextual_phrases = self._extract_contextual_phrases(all_reviews)
        compound_phrases = self._extract_compound_phrases(all_reviews)
        
        # Combine all phrases
        all_phrases = semantic_phrases + dependency_phrases + contextual_phrases + compound_phrases
        
        # Advanced scoring and ranking
        phrase_scores = self._advanced_score_phrases(all_phrases, all_reviews)
        
        # Extract top phrases with diversity
        top_phrases = self._extract_diverse_phrases(phrase_scores, 10)
        
        return top_phrases
    
    def _extract_semantic_phrases(self, text):
        """Collect phrases matched by the compiled semantic patterns.

        Every sentence of ``text`` is scanned with every pattern in
        ``self.compiled_patterns``. A match is turned into a phrase (from a
        per-label template, or by joining the matched groups for labels
        without one), passed through ``_clean_phrase`` and kept if it
        survives cleaning.

        Args:
            text: Review text to scan.

        Returns:
            list of ``(phrase, label, 1.0)`` tuples in match order.
        """
        # Label -> phrase template filled from the match's named groups.
        templates = {
            'BATTERY_PERFORMANCE': "battery life {descriptor}",
            'CONNECTION_ISSUES': "{issue} {problem}",
            'CONNECTION_RESOLVED': "{issue} {result}",
            'COMFORT_LEVEL': "{comfort} to {action}",
            'COMFORT_POSITIVE': "no {issue}",
            'INTERFACE_QUALITY': "app interface {quality}",
            'INTERFACE_ISSUES': "lacks basic features",
            'SUPPORT_QUALITY': "customer support {quality}",
            'SUPPORT_RESOLUTION': "support {action} issue",
            'VALUE_PERCEPTION': "{value} {aspect}",
            'DURABILITY_ISSUE': "{action} {manner}",
            'MIC_NOISE_ISSUE': "microphone picks up noise",
            'SETUP_EXPERIENCE': "setup {evaluation}",
            'UPDATE_RESULT': "{type} update {result}",
        }

        phrases = []
        for sentence in sent_tokenize(text):
            for pattern, label in self.compiled_patterns:
                for match in pattern.finditer(sentence):
                    groups = match.groupdict()

                    if label == 'SOUND_QUALITY':
                        # groupdict() always contains both names, so a key
                        # test cannot tell the two sound patterns apart.
                        # Keep the words in the order they were written so
                        # the phrase still occurs verbatim in the text
                        # (frequency scoring counts literal occurrences).
                        if match.start('attribute') < match.start('descriptor'):
                            phrase = f"{groups['attribute']} {groups['descriptor']}"
                        else:
                            phrase = f"{groups['descriptor']} {groups['attribute']}"
                    elif label == 'VOLUME_ISSUE':
                        phrase = f"volume {groups['evaluation']}"
                        # The context group is optional in the pattern.
                        if groups.get('context'):
                            phrase += f" {groups['context']}"
                    elif label in templates:
                        phrase = templates[label].format(**groups)
                    else:
                        # Default construction: every group that took part.
                        phrase = ' '.join(v for v in groups.values() if v)

                    # Clean and validate phrase
                    phrase = self._clean_phrase(phrase)
                    if phrase:
                        phrases.append((phrase, label, 1.0))

        return phrases
    
    def _extract_dependency_phrases(self, text):
        """Extract phrases using dependency parsing (if spacy is available)"""
        if not self.nlp:
            return []
        
        phrases = []
        doc = self.nlp(text)
        
        for sent in doc.sents:
            # Find product aspects and their relations
            for token in sent:
                # Look for product aspect words
                if token.lemma_ in sum(self.product_aspects.values(), []):
                    # Get modifiers and complements
                    modifiers = []
                    complements = []
                    
                    for child in token.children:
                        if child.dep_ in ['amod', 'compound']:  # adjective/compound modifiers
                            modifiers.append(child.text)
                        elif child.dep_ in ['prep', 'pobj']:  # prepositional phrases
                            complements.append(child.text)
                    
                    # Construct phrases
                    if modifiers:
                        phrase = ' '.join(modifiers + [token.text])
                        phrases.append((phrase, 'DEPENDENCY', 0.8))
                    
                    if complements and modifiers:
                        phrase = ' '.join(modifiers + [token.text] + complements)
                        phrases.append((phrase, 'DEPENDENCY', 0.9))
        
        return phrases
    
    def _extract_contextual_phrases(self, text):
        """Build phrases from an aspect word plus nearby descriptive words.

        Each sentence is lower-cased, tokenized and POS-tagged. For every
        token that is an aspect term, the next three tokens are inspected and
        those that look descriptive are appended to the phrase.

        Returns:
            list of ``(phrase, 'CONTEXTUAL', 0.7)`` tuples.
        """
        collected = []

        for sentence in sent_tokenize(text):
            tagged = nltk.pos_tag(word_tokenize(sentence.lower()))

            for idx, (word, _tag) in enumerate(tagged):
                # A word listed under several aspects is processed once per aspect.
                for terms in self.product_aspects.values():
                    if word not in terms:
                        continue

                    parts = [word]
                    # Look-ahead window: at most the three following tokens.
                    for next_word, next_tag in tagged[idx + 1:idx + 4]:
                        if next_tag.startswith(('JJ', 'RB', 'VB')):
                            # Adjective, adverb or verb.
                            parts.append(next_word)
                        elif next_tag.startswith('NN') and next_word in terms:
                            # Another term of the same aspect.
                            parts.append(next_word)
                        elif next_word not in self.stop_words and next_word not in self.punc:
                            # Otherwise accept only known sentiment words.
                            is_sentiment = any(
                                next_word in sentiment_set
                                for sentiment_set in self.sentiment_modifiers.values()
                            )
                            if is_sentiment:
                                parts.append(next_word)

                    # Nothing beyond the aspect word itself: no phrase.
                    if len(parts) < 2:
                        continue

                    cleaned = self._clean_phrase(' '.join(parts))
                    if cleaned:
                        collected.append((cleaned, 'CONTEXTUAL', 0.7))

        return collected
    
    def _extract_compound_phrases(self, text):
        """Extract negation phrases and "<comparative> than <aspect>" phrases.

        Negation: a negation word is paired with the first non-stop-word among
        the next three tokens, extended by the token after that when it is an
        aspect term. Comparison: a comparative followed by "than" yields one
        phrase per aspect that has a term later in the sentence (the first
        such term in that aspect's list is used).

        Returns:
            list of ``(phrase, 'COMPOUND', 0.8)`` and
            ``(phrase, 'COMPARISON', 0.9)`` tuples.
        """
        # Loop-invariant lookups, built once per call.
        negation_words = {'not', 'no', 'never', 'hardly', 'barely', 'scarcely'}
        comparison_words = {'better', 'worse', 'best', 'worst', 'more', 'less'}
        all_aspect_terms = {
            term for terms in self.product_aspects.values() for term in terms
        }

        phrases = []
        for sentence in sent_tokenize(text):
            words = word_tokenize(sentence.lower())
            last_index = len(words) - 1

            # Look for negative phrases
            for i, word in enumerate(words):
                if word not in negation_words or i >= last_index:
                    continue
                for j in range(i + 1, min(i + 4, len(words))):
                    next_word = words[j]
                    if next_word in self.stop_words:
                        continue

                    phrase = f"{word} {next_word}"
                    # Extend with the following word when it names an aspect.
                    if j < last_index and words[j + 1] in all_aspect_terms:
                        phrase += f" {words[j + 1]}"

                    phrase = self._clean_phrase(phrase)
                    if phrase:
                        phrases.append((phrase, 'COMPOUND', 0.8))
                    # Only the first non-stop-word after the negation is used.
                    break

            # Look for comparison phrases
            for i, word in enumerate(words):
                if word not in comparison_words or i >= len(words) - 2:
                    continue
                if words[i + 1] != 'than':
                    continue
                following = set(words[i + 2:])
                for aspect_terms in self.product_aspects.values():
                    # First term of this aspect that appears after "than".
                    term = next((t for t in aspect_terms if t in following), None)
                    if term is None:
                        continue
                    phrase = self._clean_phrase(f"{word} than {term}")
                    if phrase:
                        phrases.append((phrase, 'COMPARISON', 0.9))

        return phrases
    
    def _advanced_score_phrases(self, phrases, text):
        """Advanced scoring with multiple factors"""
        phrase_scores = defaultdict(float)
        
        for phrase, phrase_type, base_weight in phrases:
            # Calculate frequency
            frequency = text.lower().count(phrase.lower())
            
            # Base score
            score = frequency * base_weight * 2.0
            
            # Type-based multipliers
            type_multipliers = {
                'SOUND_QUALITY': 2.5,
                'BATTERY_PERFORMANCE': 2.5,
                'CONNECTION_ISSUES': 2.0,
                'COMFORT_LEVEL': 2.0,
                'INTERFACE_QUALITY': 2.0,
                'SUPPORT_QUALITY': 1.8,
                'MIC_NOISE_ISSUE': 2.2,
                'VALUE_PERCEPTION': 1.9,
                'UPDATE_RESULT': 2.3,
                'DEPENDENCY': 1.5,
                'CONTEXTUAL': 1.3,
                'COMPOUND': 1.4
            }
            score *= type_multipliers.get(phrase_type, 1.0)
            
            # Word count bonus
            word_count = len(phrase.split())
            if word_count == 2:
                score *= 1.3  # Optimal length
            elif word_count == 3:
                score *= 1.1
            
            # Aspect coverage bonus
            covered_aspects = []
            for aspect, terms in self.product_aspects.items():
                if any(term in phrase.lower() for term in terms):
                    covered_aspects.append(aspect)
            
            if covered_aspects:
                score *= (1 + 0.2 * len(covered_aspects))
            
            # Sentiment clarity bonus
            for sentiment, words in self.sentiment_modifiers.items():
                if any(word in phrase.lower() for word in words):
                    score *= 1.2
                    break
            
            phrase_scores[phrase] = score
        
        return phrase_scores
    
    def _extract_diverse_phrases(self, phrase_scores, n):
        """Extract diverse phrases covering different aspects"""
        sorted_phrases = sorted(phrase_scores.items(), key=lambda x: x[1], reverse=True)
        
        final_phrases = []
        covered_aspects = set()
        
        for phrase, score in sorted_phrases:
            if len(final_phrases) >= n:
                break
            
            # Check if this phrase covers a new aspect
            phrase_aspects = set()
            for aspect, terms in self.product_aspects.items():
                if any(term in phrase.lower() for term in terms):
                    phrase_aspects.add(aspect)
            
            # Add if it covers new aspects or is very high scoring
            if not phrase_aspects.intersection(covered_aspects) or score > 5.0:
                final_phrases.append(phrase)
                covered_aspects.update(phrase_aspects)
        
        # If we don't have enough diverse phrases, fill with highest scoring ones
        while len(final_phrases) < n and sorted_phrases:
            phrase, _ = sorted_phrases.pop(0)
            if phrase not in final_phrases:
                final_phrases.append(phrase)
        
        return final_phrases
    
    def _clean_phrase(self, phrase):
        """Clean and normalize a phrase"""
        if not phrase:
            return None
        
        # Remove extra spaces
        phrase = re.sub(r'\s+', ' ', phrase).strip()
        
        # Remove artifacts
        phrase = phrase.replace('_', ' ')
        
        # Remove leading/trailing punctuation
        phrase = phrase.strip(self.punc)
        
        # Skip if too short or just stopwords
        words = phrase.split()
        if len(words) < 2 or all(w in self.stop_words for w in words):
            return None
        
        # Remove phrases starting or ending with filter words
        if words[0] in self.filter_words or words[-1] in self.filter_words:
            return None
        
        return phrase
    
    def train(self, dataframe):
        """Train the model by computing IDF values"""
        print("Training sophisticated model...")
        
        vocabulary = set()
        doc_f = defaultdict(lambda: 0)
        
        for i, row in dataframe.iterrows():
            cleaned_review = self._text_preprocess_clean(row['all_reviews'])
            vocabulary.update(cleaned_review)
            
            unique_words = set(cleaned_review)
            for word in unique_words:
                doc_f[word] += 1
        
        DOC_COUNT = len(dataframe)
        for word in vocabulary:
            self.idf[word] = math.log10(DOC_COUNT / float(doc_f[word]))
        
        print(f"Model trained on {DOC_COUNT} documents")
        print(f"Vocabulary size: {len(vocabulary)}")
        
        return self
    
    def _text_preprocess_clean(self, review):
        """Turn raw review text into the token list used for IDF counting.

        Non-word, non-space characters become spaces, the text is
        lower-cased and whitespace runs are collapsed before tokenizing.
        Tokens of one or two characters and purely numeric tokens are dropped.
        """
        normalized = re.sub(r'[^\w\s]', ' ', review).lower()
        normalized = re.sub(r'\s+', ' ', normalized)

        return [
            token
            for token in word_tokenize(normalized)
            if len(token) > 2 and not token.isdigit()
        ]
    
    def save(self, path=None):
        """Save the trained model to disk"""
        if path is None:
            path = self.model_path
            
        model_data = {
            'idf': self.idf,
            'stop_words': self.stop_words,
            'filter_words': self.filter_words,
            'punc': self.punc,
            'fullstop': self.fullstop,
            'semantic_patterns': self.semantic_patterns,
            'product_aspects': self.product_aspects,
            'sentiment_modifiers': self.sentiment_modifiers,
            'parameters': {
                'SUMMARY_SIZE_FACTOR': self.SUMMARY_SIZE_FACTOR,
                'RF_WEIGHT': self.RF_WEIGHT,
                'LEVENSHTEIN_THRESHOLD': self.LEVENSHTEIN_THRESHOLD
            }
        }
        
        with open(path, 'wb') as f:
            pickle.dump(model_data, f)
        
        print(f"Model saved to {path}")
    
    @classmethod
    def load(cls, path="sophisticated_review_model.pkl"):
        """Load a trained model from disk.

        Args:
            path: Pickle file previously written by ``save``.

        Returns:
            A new model instance whose state is taken from the file.

        Raises:
            OSError: If ``path`` cannot be opened.
            KeyError: If the file lacks one of the expected entries.
        """
        # SECURITY: pickle.load can execute arbitrary code while
        # deserializing. Only load model files from a trusted source.
        # The file is read before the model is constructed so that a missing
        # or unreadable file fails before the constructor's resource loading.
        with open(path, 'rb') as f:
            model_data = pickle.load(f)

        model = cls(model_path=path)

        model.idf = model_data['idf']
        model.stop_words = model_data['stop_words']
        model.filter_words = model_data['filter_words']
        model.punc = model_data['punc']
        model.fullstop = model_data['fullstop']
        model.semantic_patterns = model_data['semantic_patterns']
        model.product_aspects = model_data['product_aspects']
        model.sentiment_modifiers = model_data['sentiment_modifiers']

        # Recompile patterns: only the pattern sources are stored on disk.
        model.compiled_patterns = [(re.compile(pattern, re.IGNORECASE), label)
                                  for pattern, label in model.semantic_patterns]

        params = model_data['parameters']
        model.SUMMARY_SIZE_FACTOR = params['SUMMARY_SIZE_FACTOR']
        model.RF_WEIGHT = params['RF_WEIGHT']
        model.LEVENSHTEIN_THRESHOLD = params['LEVENSHTEIN_THRESHOLD']

        print(f"Model loaded from {path}")
        print(f"Vocabulary size: {len(model.idf)}")

        return model

[nltk_data] Downloading package punkt to /Users/spartan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/spartan/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/spartan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/spartan/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [None]:
import sys
sys.path.append('../src')
import pickle
import json
import os
from collections import defaultdict, Counter
import math
import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.chunk import RegexpParser
from nltk.stem.porter import PorterStemmer
from nltk.corpus import wordnet
from nltk.parse import CoreNLPParser
import jellyfish

class AdvancedReviewSummarizerModel:
    """
    Advanced model for review summarization without spacy dependency.
    Uses NLTK for all NLP operations.
    """
    
    def __init__(self, model_path="advanced_review_model.pkl"):
        """Create an untrained summarizer with its default lexical resources.

        Args:
            model_path: Default file used by ``save()`` when no path is given.
        """
        self.model_path = model_path

        # Core state: IDF table (filled by train()/load()) and a Porter stemmer.
        self.idf, self.stemmer = {}, PorterStemmer()

        # NLTK data is checked first, then the word lists and punctuation set.
        for prepare in (
            self._load_nltk_resources,
            self._initialize_stopwords,
            self._initialize_punctuation,
        ):
            prepare()

        # Tunables; within this class they are only persisted by save()/load().
        self.SUMMARY_SIZE_FACTOR, self.RF_WEIGHT, self.LEVENSHTEIN_THRESHOLD = 3, 2, 0.85

        # Regex patterns, aspect/sentiment vocabularies and the chunk parser.
        for prepare in (
            self._initialize_semantic_patterns,
            self._initialize_aspect_dictionaries,
            self._initialize_dependency_parser,
        ):
            prepare()
        
    def _load_nltk_resources(self):
        """Download each required NLTK data package that is not installed yet."""
        required_packages = ['punkt', 'averaged_perceptron_tagger', 'stopwords', 'maxent_ne_chunker', 'words', 'wordnet']
        downloader = nltk.downloader.Downloader()

        for package_id in required_packages:
            # Already present locally: nothing to fetch.
            if downloader.is_installed(package_id):
                continue
            downloader.download(package_id)
    
    def _initialize_stopwords(self):
        """Initialize the stopwords list with product review specific words"""
        # Minimal stopwords to preserve context
        self.stop_words = set(['a', 'an', 'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'])
        
        # Words to exclude from keyphrases
        self.filter_words = set(['get', 'got', 'getting', 'gets', 'make', 'makes', 'making', 'made',
                                'go', 'goes', 'going', 'went', 'come', 'comes', 'coming', 'came',
                                'thing', 'things', 'stuff', 'bit', 'lot', 'very', 'really', 'just',
                                'like', 'know', 'think', 'want', 'wanted', 'need', 'needed'])
    
    def _initialize_punctuation(self):
        """Initialize punctuation sets"""
        self.punc = ''',.;:?!'"()[]{}<>|\/@#^&*_~=+\n\t—–-•'''
        self.fullstop = '.'
    
    def _initialize_semantic_patterns(self):
        """Initialize semantic patterns for phrase extraction"""
        # Enhanced semantic patterns with more specificity
        self.semantic_patterns = [
            # Product attributes with descriptors
            (r'(?P<descriptor>crisp|clear|tinny|muffled|loud|soft|sharp)\s+(?P<attribute>audio|sound|music|voice)', 'SOUND_QUALITY'),
            (r'(?P<attribute>audio|sound|music)\s+(?P<descriptor>quality|clarity|is\s+(?:crisp|clear|muffled|tinny))', 'SOUND_QUALITY'),
            
            # Battery performance with context
            (r'(?P<attribute>battery|charge)\s+(?P<performance>life|lasts|duration)\s+(?P<descriptor>great|amazing|excellent|poor|short|long|all\s+day)', 'BATTERY_PERFORMANCE'),
            (r'(?P<duration>all\s+day|hours?|days?)\s+(?P<performance>battery|charge|power)', 'BATTERY_DURATION'),
            
            # Connection and connectivity issues
            (r'(?P<issue>connection|connectivity|pairing)\s+(?P<problem>issues?|problems?|difficulties|dropped)', 'CONNECTION_ISSUES'),
            (r'(?P<result>fixed|resolved|solved)\s+(?P<issue>connection|connectivity|pairing)', 'CONNECTION_RESOLVED'),
            
            # Comfort and wearability
            (r'(?P<comfort>comfortable|uncomfortable)\s+to\s+(?P<action>wear|use|hold)', 'COMFORT_LEVEL'),
            (r'(?P<comfort>no|zero|minimal)\s+(?P<issue>(?:ear\s+)?fatigue|discomfort)', 'COMFORT_POSITIVE'),
            
            # User interface experiences
            (r'(?P<interface>app|interface|software)\s+(?P<quality>confusing|intuitive|user-friendly|difficult|easy)', 'INTERFACE_QUALITY'),
            (r'(?P<missing>lacks?|missing)\s+(?P<feature>basic\s+features?|functionality)', 'INTERFACE_ISSUES'),
            
            # Customer support experiences
            (r'(?P<service>customer\s+support|tech\s+support|service)\s+(?P<quality>friendly|helpful|quick|slow|poor)', 'SUPPORT_QUALITY'),
            (r'(?P<action>resolved?|fixed|helped)\s+(?P<object>my\s+)?(?P<issue>issue|problem)', 'SUPPORT_RESOLUTION'),
            
            # Price and value perception
            (r'(?P<value>worth\s+the|overpriced|expensive|cheap|affordable)\s+(?P<aspect>price|cost|money)', 'VALUE_PERCEPTION'),
            (r'(?P<price>price)\s+(?P<evaluation>is\s+)?(?P<adjective>high|low|fair|reasonable)', 'PRICE_EVALUATION'),
            
            # Durability and build quality
            (r'(?P<action>scratches?|dents?|cracks?)\s+(?P<manner>easily|quickly|readily)', 'DURABILITY_ISSUE'),
            (r'(?P<quality>build|construction)\s+(?P<evaluation>quality|is\s+(?:good|poor|excellent|solid))', 'BUILD_QUALITY'),
            
            # Microphone performance
            (r'(?P<device>microphone?|mic)\s+(?P<action>picks?\s?up|captures?)\s+(?P<issue>background\s+noise|ambient\s+sounds?)', 'MIC_NOISE_ISSUE'),
            (r'(?P<device>microphone?|mic)\s+(?P<quality>quality|performance)\s+(?P<evaluation>good|poor|excellent|bad)', 'MIC_QUALITY'),
            
            # Setup and installation
            (r'(?P<process>setup|installation|configuration)\s+(?P<evaluation>seamless|easy|difficult|complicated)', 'SETUP_EXPERIENCE'),
            (r'(?P<evaluation>easy|hard|difficult)\s+to\s+(?P<action>set\s?up|install|configure)', 'SETUP_EASE'),
            
            # Volume and loudness
            (r'(?P<issue>volume|loudness)\s+(?P<evaluation>not\s+enough|insufficient|too\s+low|perfect)\s+(?P<context>outdoors?|in\s+public)?', 'VOLUME_ISSUE'),
            (r'(?P<evaluation>loud\s+enough|too\s+quiet|perfect\s+volume)', 'VOLUME_EVALUATION'),
            
            # Software updates and fixes
            (r'(?P<type>firmware|software)\s+(?P<action>update)\s+(?P<result>fixed|solved|improved|broke)', 'UPDATE_RESULT'),
            (r'(?P<action>update)\s+(?P<result>fixed|resolved)\s+(?P<issue>everything|issues?|problems?)', 'UPDATE_FIX'),
        ]
        
        # Compile patterns
        self.compiled_patterns = [(re.compile(pattern, re.IGNORECASE), label) for pattern, label in self.semantic_patterns]
    
    def _initialize_aspect_dictionaries(self):
        """Initialize dictionaries for product aspects and their modifiers"""
        # Product aspects and related words
        self.product_aspects = {
            'sound': ['audio', 'music', 'voice', 'volume', 'sound', 'noise'],
            'battery': ['battery', 'charge', 'power', 'juice'],
            'design': ['design', 'look', 'appearance', 'aesthetics', 'build'],
            'comfort': ['comfort', 'fit', 'wear', 'ergonomic'],
            'interface': ['app', 'interface', 'software', 'ui', 'menu'],
            'support': ['support', 'service', 'help', 'assistance'],
            'connectivity': ['connection', 'bluetooth', 'wireless', 'pairing', 'connect'],
            'microphone': ['microphone', 'mic', 'calling', 'call'],
            'durability': ['durability', 'build', 'quality', 'sturdy', 'solid'],
            'setup': ['setup', 'installation', 'configuration', 'install'],
            'price': ['price', 'cost', 'value', 'money', 'expensive', 'cheap'],
            'performance': ['performance', 'speed', 'response', 'lag']
        }
        
        # Sentiment modifiers
        self.sentiment_modifiers = {
            'positive': ['excellent', 'great', 'amazing', 'fantastic', 'superb', 'wonderful', 
                        'perfect', 'love', 'best', 'impressive', 'outstanding', 'superior'],
            'negative': ['poor', 'bad', 'terrible', 'awful', 'horrible', 'worst', 'disappointing',
                        'frustrating', 'annoying', 'hate', 'subpar', 'mediocre'],
            'neutral': ['okay', 'fine', 'average', 'decent', 'adequate', 'acceptable']
        }
    
    def _initialize_dependency_parser(self):
        """Build the NLTK ``RegexpParser`` chunker used in place of a dependency parser.

        Stores the grammar source in ``chunk_grammar`` and the parser built
        from it in ``chunk_parser``.
        """
        # NOTE(review): the NP rule already accepts <JJ><NN> sequences, so the
        # later ADJ_NP rule may never get a chance to match - confirm.
        grammar = r"""
        NP: {<DT|JJ|NN.*>+}          # noun phrases
        VP: {<VB.*><RB>*<NP>}        # verb phrases
        PREP: {<IN><NP>}             # prepositional phrases
        ADJ_NP: {<JJ><NN>}           # adjective + noun
        SUBJ_PRED: {<NP><VB.*>}      # subject + predicate
        """
        self.chunk_grammar = grammar
        self.chunk_parser = RegexpParser(grammar)
    
    def summarize_new_product(self, reviews, num_phrases=10):
        """Extract the most meaningful keyphrases from a product's reviews.

        Args:
            reviews: Either one string holding all review text, or a list/tuple
                of review strings (joined with single spaces before analysis).
            num_phrases: Maximum number of keyphrases to return. Defaults to 10,
                the value that was previously hard-coded.

        Returns:
            A list of at most ``num_phrases`` phrase strings, chosen by
            ``_extract_diverse_phrases`` from the scored candidates.
        """
        if isinstance(reviews, (list, tuple)):
            all_reviews = ' '.join(reviews)
        else:
            all_reviews = reviews

        # Run every extractor over the same text. The order is kept stable
        # because the concatenated list is what the scorer receives.
        extractors = (
            self._extract_semantic_phrases,
            self._extract_dependency_phrases,
            self._extract_contextual_phrases,
            self._extract_compound_phrases,
            self._extract_linguistic_patterns,
        )
        all_phrases = []
        for extract in extractors:
            all_phrases.extend(extract(all_reviews))

        # Score the candidates, then pick the top ones with aspect diversity.
        phrase_scores = self._advanced_score_phrases(all_phrases, all_reviews)
        return self._extract_diverse_phrases(phrase_scores, num_phrases)
    
    def _extract_dependency_phrases(self, text):
        """Collect chunked noun phrases that mention a product-aspect term.

        Each sentence is lower-cased, tokenized, POS-tagged and chunked with
        ``self.chunk_parser``. Top-level ``NP`` / ``ADJ_NP`` chunks whose text
        contains any aspect term (substring test) are cleaned and kept.

        Returns:
            List of ``(phrase, 'DEPENDENCY', 0.8)`` tuples.
        """
        wanted_labels = ('NP', 'ADJ_NP')
        found = []

        for sentence in sent_tokenize(text):
            tagged = nltk.pos_tag(word_tokenize(sentence.lower()))

            for node in self.chunk_parser.parse(tagged):
                # Plain (word, tag) leaves and other chunk types are ignored.
                if not isinstance(node, nltk.Tree) or node.label() not in wanted_labels:
                    continue

                candidate = ' '.join(token for token, _ in node.leaves())
                mentions_aspect = any(
                    term in candidate
                    for terms in self.product_aspects.values()
                    for term in terms
                )
                if not mentions_aspect:
                    continue

                cleaned = self._clean_phrase(candidate)
                if cleaned:
                    found.append((cleaned, 'DEPENDENCY', 0.8))

        return found
    
    def _extract_linguistic_patterns(self, text):
        """Collect two-word phrases matching four POS-based patterns.

        Patterns, evaluated at every token of every sentence:
            1. adjective followed by an aspect noun (weight 0.9)
            2. aspect noun with a following JJ/RB/VB word and/or a preceding
               JJ/RB word (weight 0.85 each)
            3. verb followed by an adverb (weight 0.8)
            4. "not" / "no" / "never" followed by a JJ/RB/VB word (weight 0.9)

        A term listed under several aspects yields one entry per aspect for
        patterns 1 and 2. Phrases are not passed through ``_clean_phrase``.

        Returns:
            List of ``(phrase, 'LINGUISTIC', weight)`` tuples.
        """
        modifier_tags = ('JJ', 'RB', 'VB')
        negators = ('not', 'no', 'never')
        aspect_term_lists = list(self.product_aspects.values())

        def aspect_hits(token):
            # Number of aspect lists that contain this exact token.
            return sum(1 for terms in aspect_term_lists if token in terms)

        collected = []
        for sentence in sent_tokenize(text):
            tagged = nltk.pos_tag(word_tokenize(sentence.lower()))
            final_index = len(tagged) - 1

            for index, (token, pos) in enumerate(tagged):
                upcoming = tagged[index + 1] if index < final_index else None
                previous = tagged[index - 1] if index > 0 else None

                # Pattern 1: adjective + aspect noun
                if pos.startswith('JJ') and upcoming is not None and upcoming[1].startswith('NN'):
                    entry = (f"{token} {upcoming[0]}", 'LINGUISTIC', 0.9)
                    collected.extend([entry] * aspect_hits(upcoming[0]))

                # Pattern 2: aspect noun with a neighbouring modifier
                if pos.startswith('NN'):
                    around = []
                    if upcoming is not None and upcoming[1].startswith(modifier_tags):
                        around.append((f"{token} {upcoming[0]}", 'LINGUISTIC', 0.85))
                    if previous is not None and previous[1].startswith(('JJ', 'RB')):
                        around.append((f"{previous[0]} {token}", 'LINGUISTIC', 0.85))
                    # Repeated once per aspect list that names the noun.
                    collected.extend(around * aspect_hits(token))

                # Pattern 3: verb + adverb (e.g. "scratches easily")
                if pos.startswith('VB') and upcoming is not None and upcoming[1].startswith('RB'):
                    collected.append((f"{token} {upcoming[0]}", 'LINGUISTIC', 0.8))

                # Pattern 4: negation + adjective/adverb/verb (e.g. "not enough")
                if token in negators and upcoming is not None and upcoming[1].startswith(modifier_tags):
                    collected.append((f"{token} {upcoming[0]}", 'LINGUISTIC', 0.9))

        return collected
    
    def _extract_semantic_phrases(self, text):
        """Extract phrases by matching the compiled semantic regex patterns.

        Every sentence is scanned with every pattern in
        ``self.compiled_patterns``; each match is turned into a short phrase
        according to its label, cleaned with ``_clean_phrase`` and kept when
        the cleaned phrase is non-empty.

        Args:
            text: Review text to scan (original casing is kept; the patterns
                are compiled case-insensitively).

        Returns:
            List of ``(phrase, label, 1.0)`` tuples.
        """
        # Labels whose phrase is a fixed template over the named groups.
        templates = {
            'CONNECTION_ISSUES': "{issue} {problem}",
            'CONNECTION_RESOLVED': "{issue} {result}",
            'COMFORT_LEVEL': "{comfort} to {action}",
            'COMFORT_POSITIVE': "no {issue}",
            'INTERFACE_QUALITY': "app interface {quality}",
            'INTERFACE_ISSUES': "lacks basic features",
            'SUPPORT_QUALITY': "customer support {quality}",
            'SUPPORT_RESOLUTION': "support {action} issue",
            'VALUE_PERCEPTION': "{value} {aspect}",
            'DURABILITY_ISSUE': "{action} {manner}",
            'MIC_NOISE_ISSUE': "microphone picks up noise",
            'SETUP_EXPERIENCE': "setup {evaluation}",
            'UPDATE_RESULT': "{type} update {result}",
        }

        phrases = []
        for sentence in sent_tokenize(text):
            for pattern, label in self.compiled_patterns:
                for match in pattern.finditer(sentence):
                    # groupdict() holds EVERY named group of the pattern (None
                    # when it did not participate), so key-presence tests are
                    # always true; test the values instead.
                    groups = match.groupdict()

                    if label == 'SOUND_QUALITY':
                        descriptor = groups.get('descriptor')
                        attribute = groups.get('attribute')
                        if descriptor and attribute:
                            # Keep the reviewer's word order: "crisp audio"
                            # but "audio quality" (not "quality audio").
                            if match.start('attribute') < match.start('descriptor'):
                                phrase = f"{attribute} {descriptor}"
                            else:
                                phrase = f"{descriptor} {attribute}"
                        else:
                            phrase = f"{attribute or ''} {descriptor or 'quality'}"

                    elif label == 'BATTERY_PERFORMANCE':
                        descriptor = groups.get('descriptor')
                        if descriptor:
                            phrase = f"battery life {descriptor}"
                        else:
                            phrase = f"{groups['performance']}"

                    elif label == 'VOLUME_ISSUE':
                        # The context group is optional in the pattern.
                        if groups.get('context'):
                            phrase = f"volume {groups['evaluation']} {groups['context']}"
                        else:
                            phrase = f"volume {groups['evaluation']}"

                    elif label in templates:
                        phrase = templates[label].format(**groups)

                    else:
                        # Default construction: matched groups in pattern order.
                        phrase = ' '.join(v for v in groups.values() if v)

                    # Clean and validate phrase
                    phrase = self._clean_phrase(phrase)
                    if phrase:
                        phrases.append((phrase, label, 1.0))

        return phrases
    
    def _extract_contextual_phrases(self, text):
        """Build phrases from an aspect term plus descriptive words that follow it.

        For each token that appears in an aspect's term list, the next three
        tokens are inspected. A following token is appended when it is tagged
        JJ/RB/VB, when it is a noun from the same aspect list, or when it is a
        non-stop-word, non-punctuation sentiment modifier. Phrases that gained
        at least one extra word are cleaned and kept.

        Returns:
            List of ``(phrase, 'CONTEXTUAL', 0.7)`` tuples.
        """
        descriptive_tags = ('JJ', 'RB', 'VB')
        results = []

        for sentence in sent_tokenize(text):
            tagged = nltk.pos_tag(word_tokenize(sentence.lower()))

            for position, (token, _) in enumerate(tagged):
                lookahead = tagged[position + 1:position + 4]

                # A token listed under several aspects is processed once per aspect.
                for terms in self.product_aspects.values():
                    if token not in terms:
                        continue

                    parts = [token]
                    for candidate, candidate_tag in lookahead:
                        if candidate_tag.startswith(descriptive_tags):
                            keep = True
                        elif candidate_tag.startswith('NN') and candidate in terms:
                            keep = True
                        elif candidate in self.stop_words or candidate in self.punc:
                            keep = False
                        else:
                            keep = any(
                                candidate in modifiers
                                for modifiers in self.sentiment_modifiers.values()
                            )
                        if keep:
                            parts.append(candidate)

                    # Only the bare aspect term was found: no context, no phrase.
                    if len(parts) < 2:
                        continue

                    cleaned = self._clean_phrase(' '.join(parts))
                    if cleaned:
                        results.append((cleaned, 'CONTEXTUAL', 0.7))

        return results
    
    def _extract_compound_phrases(self, text):
        """Extract negation phrases and "X than <aspect>" comparison phrases.

        Negation: for each negation word, the first non-stop-word among the
        next three tokens forms ``"<negation> <word>"``, extended by the token
        after it when that token is an aspect term.
        Comparison: for each comparison word directly followed by "than", one
        ``"<word> than <term>"`` phrase is produced per aspect whose term list
        has a term later in the sentence (first such term per aspect).

        Returns:
            List of ``(phrase, 'COMPOUND', 0.8)`` and
            ``(phrase, 'COMPARISON', 0.9)`` tuples.
        """
        # Loop-invariant lookups, built once per call instead of once per
        # token: the flattened aspect vocabulary used to be re-created with
        # sum(..., []) inside the innermost loop.
        negation_words = {'not', 'no', 'never', 'hardly', 'barely', 'scarcely'}
        comparison_words = {'better', 'worse', 'best', 'worst', 'more', 'less'}
        aspect_term_lists = list(self.product_aspects.values())
        all_aspect_terms = {term for terms in aspect_term_lists for term in terms}

        phrases = []
        for sentence in sent_tokenize(text):
            words = word_tokenize(sentence.lower())
            word_total = len(words)

            # Negative phrases
            for i, word in enumerate(words):
                if word not in negation_words:
                    continue
                for j in range(i + 1, min(i + 4, word_total)):
                    next_word = words[j]
                    if next_word in self.stop_words:
                        continue

                    phrase = f"{word} {next_word}"
                    # Extend with the following token when it names an aspect.
                    if j + 1 < word_total and words[j + 1] in all_aspect_terms:
                        phrase += f" {words[j + 1]}"

                    phrase = self._clean_phrase(phrase)
                    if phrase:
                        phrases.append((phrase, 'COMPOUND', 0.8))
                    # Only the first non-stop-word after the negation is used.
                    break

            # Comparison phrases
            for i, word in enumerate(words):
                if word not in comparison_words or i >= word_total - 2:
                    continue
                if words[i + 1] != 'than':
                    continue

                following = set(words[i + 2:])
                for aspect_terms in aspect_term_lists:
                    for term in aspect_terms:
                        if term in following:
                            phrase = self._clean_phrase(f"{word} than {term}")
                            if phrase:
                                phrases.append((phrase, 'COMPARISON', 0.9))
                            # One phrase per aspect; move on to the next aspect.
                            break

        return phrases
    
    def _advanced_score_phrases(self, phrases, text):
        """Advanced scoring with multiple factors"""
        phrase_scores = defaultdict(float)
        
        for phrase, phrase_type, base_weight in phrases:
            # Calculate frequency
            frequency = text.lower().count(phrase.lower())
            
            # Base score
            score = frequency * base_weight * 2.0
            
            # Type-based multipliers
            type_multipliers = {
                'SOUND_QUALITY': 2.5,
                'BATTERY_PERFORMANCE': 2.5,
                'CONNECTION_ISSUES': 2.0,
                'COMFORT_LEVEL': 2.0,
                'INTERFACE_QUALITY': 2.0,
                'SUPPORT_QUALITY': 1.8,
                'MIC_NOISE_ISSUE': 2.2,
                'VALUE_PERCEPTION': 1.9,
                'UPDATE_RESULT': 2.3,
                'DEPENDENCY': 1.5,
                'CONTEXTUAL': 1.3,
                'COMPOUND': 1.4,
                'LINGUISTIC': 1.6
            }
            score *= type_multipliers.get(phrase_type, 1.0)
            
            # Word count bonus
            word_count = len(phrase.split())
            if word_count == 2:
                score *= 1.3  # Optimal length
            elif word_count == 3:
                score *= 1.1
            
            # Aspect coverage bonus
            covered_aspects = []
            for aspect, terms in self.product_aspects.items():
                if any(term in phrase.lower() for term in terms):
                    covered_aspects.append(aspect)
            
            if covered_aspects:
                score *= (1 + 0.2 * len(covered_aspects))
            
            # Sentiment clarity bonus
            for sentiment, words in self.sentiment_modifiers.items():
                if any(word in phrase.lower() for word in words):
                    score *= 1.2
                    break
            
            phrase_scores[phrase] = score
        
        return phrase_scores
    
    def _extract_diverse_phrases(self, phrase_scores, n):
        """Extract diverse phrases covering different aspects"""
        sorted_phrases = sorted(phrase_scores.items(), key=lambda x: x[1], reverse=True)
        
        final_phrases = []
        covered_aspects = set()
        
        for phrase, score in sorted_phrases:
            if len(final_phrases) >= n:
                break
            
            # Check if this phrase covers a new aspect
            phrase_aspects = set()
            for aspect, terms in self.product_aspects.items():
                if any(term in phrase.lower() for term in terms):
                    phrase_aspects.add(aspect)
            
            # Add if it covers new aspects or is very high scoring
            if not phrase_aspects.intersection(covered_aspects) or score > 5.0:
                final_phrases.append(phrase)
                covered_aspects.update(phrase_aspects)
        
        # If we don't have enough diverse phrases, fill with highest scoring ones
        while len(final_phrases) < n and sorted_phrases:
            phrase, _ = sorted_phrases.pop(0)
            if phrase not in final_phrases:
                final_phrases.append(phrase)
        
        return final_phrases
    
    def _clean_phrase(self, phrase):
        """Clean and normalize a phrase"""
        if not phrase:
            return None
        
        # Remove extra spaces
        phrase = re.sub(r'\s+', ' ', phrase).strip()
        
        # Remove artifacts
        phrase = phrase.replace('_', ' ')
        
        # Remove leading/trailing punctuation
        phrase = phrase.strip(self.punc)
        
        # Skip if too short or just stopwords
        words = phrase.split()
        if len(words) < 2 or all(w in self.stop_words for w in words):
            return None
        
        # Remove phrases starting or ending with filter words
        if words[0] in self.filter_words or words[-1] in self.filter_words:
            return None
        
        return phrase
    
    def train(self, dataframe):
        """Compute IDF values from a corpus and store them in ``self.idf``.

        Each row's ``'all_reviews'`` value is treated as one document. For
        every token, ``idf = log10(number_of_rows / rows_containing_token)``.

        Args:
            dataframe: Object supporting ``iterrows()`` and ``len()`` whose rows
                expose an ``'all_reviews'`` text field.

        Returns:
            ``self``, so calls can be chained.
        """
        print("Training advanced model...")

        document_frequency = Counter()
        vocabulary = set()

        for _, record in dataframe.iterrows():
            tokens = self._text_preprocess_clean(record['all_reviews'])
            # Count each token once per document, however often it repeats.
            distinct = set(tokens)
            vocabulary |= distinct
            document_frequency.update(distinct)

        total_documents = len(dataframe)
        self.idf.update(
            (term, math.log10(total_documents / float(document_frequency[term])))
            for term in vocabulary
        )

        print(f"Model trained on {total_documents} documents")
        print(f"Vocabulary size: {len(vocabulary)}")

        return self
    
    def _text_preprocess_clean(self, review):
        """Normalize review text and return its tokens for IDF counting.

        Non-word, non-space characters become spaces, the text is lower-cased,
        whitespace runs are collapsed, and the result is tokenized. Tokens of
        one or two characters and all-digit tokens are dropped.

        Returns:
            List of token strings.
        """
        without_punctuation = re.sub(r'[^\w\s]', ' ', review)
        normalized = re.sub(r'\s+', ' ', without_punctuation.lower())

        return [
            token
            for token in word_tokenize(normalized)
            if not (len(token) <= 2 or token.isdigit())
        ]
    
    def save(self, path=None):
        """Save the trained model to disk"""
        if path is None:
            path = self.model_path
            
        model_data = {
            'idf': self.idf,
            'stop_words': self.stop_words,
            'filter_words': self.filter_words,
            'punc': self.punc,
            'fullstop': self.fullstop,
            'semantic_patterns': self.semantic_patterns,
            'product_aspects': self.product_aspects,
            'sentiment_modifiers': self.sentiment_modifiers,
            'parameters': {
                'SUMMARY_SIZE_FACTOR': self.SUMMARY_SIZE_FACTOR,
                'RF_WEIGHT': self.RF_WEIGHT,
                'LEVENSHTEIN_THRESHOLD': self.LEVENSHTEIN_THRESHOLD
            }
        }
        
        with open(path, 'wb') as f:
            pickle.dump(model_data, f)
        
        print(f"Model saved to {path}")
    
    @classmethod
    def load(cls, path="advanced_review_model.pkl"):
        """Build a model and restore its data from a file written by ``save()``.

        A fresh instance is constructed first (so the stemmer and chunk parser
        come from the constructor), then the pickled values overwrite the
        defaults and the regex patterns are recompiled.

        Args:
            path: Pickle file to read; also becomes the model's ``model_path``.

        Returns:
            The restored model instance.
        """
        model = cls(model_path=path)

        # NOTE: unpickling can execute arbitrary code - only load files that
        # come from a trusted source.
        with open(path, 'rb') as source:
            model_data = pickle.load(source)

        restored_attributes = (
            'idf', 'stop_words', 'filter_words', 'punc', 'fullstop',
            'semantic_patterns', 'product_aspects', 'sentiment_modifiers',
        )
        for attribute in restored_attributes:
            setattr(model, attribute, model_data[attribute])

        # Compiled regexes are not pickled; rebuild them from the sources.
        model.compiled_patterns = [
            (re.compile(expression, re.IGNORECASE), tag)
            for expression, tag in model.semantic_patterns
        ]

        stored_parameters = model_data['parameters']
        for parameter in ('SUMMARY_SIZE_FACTOR', 'RF_WEIGHT', 'LEVENSHTEIN_THRESHOLD'):
            setattr(model, parameter, stored_parameters[parameter])

        print(f"Model loaded from {path}")
        print(f"Vocabulary size: {len(model.idf)}")

        return model

# Usage example: train on the review corpus, persist the model, reload it and
# summarize a small set of sample reviews.
if __name__ == "__main__":
    # Project-local helper module; imported here because this cell's import
    # block does not import it.
    import utils
    
    # Training phase (one-time)
    print("Loading training data...")
    # Relative path: resolved against the current working directory.
    file2 = os.path.join("data", 'asin_numreviews_allreview.csv')
    # presumably returns a pandas DataFrame with an 'all_reviews' column
    # (train() iterates rows and reads row['all_reviews']) - verify in utils
    df_allreview = utils.csv_to_dataframe(file2)
    
    # Create and train model
    model = AdvancedReviewSummarizerModel()
    model.train(df_allreview)
    # Written to the default model_path ("advanced_review_model.pkl").
    model.save()
    
    # Load trained model (for future use); reads the same default path
    loaded_model = AdvancedReviewSummarizerModel.load()
    
    # Summarize new product reviews
    new_reviews = [
        "This phone case is really good! It's durable and protective.",
        "I love the design, but it's a bit bulky for my pocket.",
        "Great value for money. The screen protector works perfectly."
    ]
    
    
    # The list is joined into one text before phrase extraction.
    summary = loaded_model.summarize_new_product(new_reviews)
    print("\nSummary for new product:")
    # One keyphrase per line.
    for phrase in summary:
        print(phrase)

  self.punc = ''',.;:?!'"()[]{}<>|\/@#^&*_~=+\n\t—–-•'''


Loading training data...
Training advanced model...
