# Building Classifier and Model Accuracy

In [19]:
!pip uninstall typing_extensions torch torchvision torchaudio
!pip install typing_extensions==4.7.1
!pip install torch torchvision torchaudio
!pip install transformers

^C
Collecting typing_extensions==4.7.1
  Downloading typing_extensions-4.7.1-py3-none-any.whl.metadata (3.1 kB)
Downloading typing_extensions-4.7.1-py3-none-any.whl (33 kB)
Installing collected packages: typing_extensions
  Attempting uninstall: typing_extensions
    Found existing installation: typing_extensions 4.12.2
    Uninstalling typing_extensions-4.12.2:
      Successfully uninstalled typing_extensions-4.12.2
Successfully installed typing_extensions-4.7.1


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torch 2.6.0 requires typing-extensions>=4.10.0, but you have typing-extensions 4.7.1 which is incompatible.


Collecting torchvision
  Downloading torchvision-0.21.0-cp311-cp311-win_amd64.whl.metadata (6.3 kB)
Collecting torchaudio
  Downloading torchaudio-2.6.0-cp311-cp311-win_amd64.whl.metadata (6.7 kB)
Collecting typing-extensions>=4.10.0 (from torch)
  Using cached typing_extensions-4.12.2-py3-none-any.whl.metadata (3.0 kB)
Downloading torchvision-0.21.0-cp311-cp311-win_amd64.whl (1.6 MB)
   ---------------------------------------- 0.0/1.6 MB ? eta -:--:--
    --------------------------------------- 0.0/1.6 MB 640.0 kB/s eta 0:00:03
   --- ------------------------------------ 0.1/1.6 MB 1.8 MB/s eta 0:00:01
   --------- ------------------------------ 0.4/1.6 MB 3.4 MB/s eta 0:00:01
   --------- ------------------------------ 0.4/1.6 MB 3.4 MB/s eta 0:00:01
   ------------------- -------------------- 0.8/1.6 MB 3.7 MB/s eta 0:00:01
   ------------------------- -------------- 1.0/1.6 MB 4.0 MB/s eta 0:00:01
   --------------------------------- ------ 1.3/1.6 MB 4.3 MB/s eta 0:00:01
   ------

## KNN

## Classification based on Model

In [7]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter

class BrandSentimentClassifier:
    def __init__(self, k=5):
        """Initialize the classifier"""
        self.k = k
        self.vectorizer = TfidfVectorizer(
            stop_words='english',
            min_df=2,
            max_df=0.95,
            ngram_range=(1, 2)
        )

    def prepare_data(self, df, brand):
        """Prepare data for specific brand"""
        # Filter reviews for specific brand
        brand_df = df[df['Product Brand'] == brand].copy()
        
        if len(brand_df) < 10:  # Too little data
            return None, None
            
        # Set initial labels (based on keywords)
        positive_keywords = {
       
            'good', 'great', 'nice', 'fine', 'decent', 'positive',
            
          
            'excellent', 'amazing', 'fantastic', 'awesome', 'perfect', 'superb',
            'outstanding', 'exceptional', 'remarkable', 'brilliant', 'wonderful',
            'impressive', 'incredible', 'marvelous', 'magnificent',
            
         
            'love', 'like', 'enjoy', 'admire', 'appreciate', 'satisfied',
            'happy', 'pleased', 'delighted', 'fond', 
            
            
            'best', 'superior', 'premium', 'top-notch', 'first-rate', 'high-end',
            'leading', 'outstanding', 'elite', 'prime', 'stellar',
            
           
            'fast', 'smooth', 'stable', 'reliable', 'efficient', 'powerful',
            'responsive', 'quick', 'seamless', 'robust',
            
          
            'recommend', 'worth', 'recommended', 'valuable', 'worthwhile',
            
          
            'exceeded', 'surpassed', 'beyond', 'surprised', 'impressive',
            'extraordinary', 'exceptional', 'astounding',
            
        
            'quality', 'premium', 'refined', 'polished', 'solid', 'durable',
            'well-built', 'well-made',
            
          
            'innovative', 'advanced', 'cutting-edge', 'revolutionary', 'state-of-the-art',
            'breakthrough', 'pioneering',
            
      
            'comfortable', 'convenient', 'handy', 'user-friendly', 'intuitive',
            'practical', 'versatile', 'flexible'
        }
        brand_df['Label'] = brand_df['Text'].str.lower().apply(
            lambda x: 1 if any(word in str(x) for word in positive_keywords) else 0
        )
        
        return brand_df['Text'], brand_df['Label']

    def classify_review(self, test_vec, train_vecs, train_labels):
        """KNN classification based on similarity"""
        # Calculate similarity
        similarities = cosine_similarity(test_vec, train_vecs).flatten()
        
        # Get indices of k most similar documents
        top_k_indices = np.argsort(similarities)[-self.k:][::-1]
        
        # Get labels for these k documents
        top_k_labels = [train_labels[i] for i in top_k_indices]
        
        # Return majority label
        return Counter(top_k_labels).most_common(1)[0][0]

    def train_and_evaluate(self, df, brand):
        """Train and evaluate the model"""
        print(f"\nProcessing {brand}...")
        
        # Prepare data
        X, y = self.prepare_data(df, brand)
        if X is None:
            print(f"Insufficient data for {brand}")
            return None
            
        # Split training and test sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        
        # Convert to TF-IDF features
        X_train_tfidf = self.vectorizer.fit_transform(X_train)
        X_test_tfidf = self.vectorizer.transform(X_test)
        
        # Reset indices
        y_train = y_train.reset_index(drop=True)
        y_test = y_test.reset_index(drop=True)
        
        # Predict test set
        y_pred = []
        for i in range(X_test_tfidf.shape[0]):
            test_vector = X_test_tfidf[i:i+1]
            pred_label = self.classify_review(test_vector, X_train_tfidf, y_train)
            y_pred.append(pred_label)
            
        # Evaluate results
        accuracy = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred)
        pos_ratio = sum(y_pred) / len(y_pred)
        
        print(f"Accuracy: {accuracy:.2f}")
        print(f"Positive ratio: {pos_ratio:.2%}")
        print(f"Negative ratio: {(1-pos_ratio):.2%}")
        print("\nClassification Report:")
        print(report)
        
        return {
            'accuracy': accuracy,
            'report': report,
            'positive_ratio': pos_ratio,
            'negative_ratio': 1 - pos_ratio,
            'predictions': y_pred,
            'true_labels': y_test
        }

def main():
    # Load data
    print("Loading data...")
    df = pd.read_csv('all_brands_combined_mobile_data_final.csv')
    
    # Create classifier
    classifier = BrandSentimentClassifier(k=5)
    
    # Brands to analyze
    brands = ['Google Pixel', 'Oppo Find X3 Pro', 'Samsung S24 Ultra', 
              'Xiaomi 14 Ultra', 'iPhone']
    
    # Analyze each brand
    results = {}
    for brand in brands:
        result = classifier.train_and_evaluate(df, brand)
        if result:
            results[brand] = result
    
    # Print overall comparison results
    print("\nOverall Brand Sentiment Comparison:")
    print("-" * 50)
    
    # Create comparison table
    comparison_data = []
    for brand in brands:
        if brand in results:
            comparison_data.append({
                'Brand': brand,
                'Accuracy': f"{results[brand]['accuracy']:.2f}",
                'Positive Ratio': f"{results[brand]['positive_ratio']:.2%}",
                'Negative Ratio': f"{results[brand]['negative_ratio']:.2%}"
            })
    
    comparison_df = pd.DataFrame(comparison_data)
    print("\nBrand Sentiment Summary:")
    print(comparison_df.to_string(index=False))

if __name__ == "__main__":
    main()

Loading data...

Processing Google Pixel...
Accuracy: 0.65
Positive ratio: 24.78%
Negative ratio: 75.22%

Classification Report:
              precision    recall  f1-score   support

           0       0.64      0.86      0.73       518
           1       0.68      0.38      0.49       402

    accuracy                           0.65       920
   macro avg       0.66      0.62      0.61       920
weighted avg       0.66      0.65      0.63       920


Processing Oppo Find X3 Pro...
Accuracy: 0.68
Positive ratio: 23.60%
Negative ratio: 76.40%

Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.86      0.77       210
           1       0.64      0.40      0.49       129

    accuracy                           0.68       339
   macro avg       0.67      0.63      0.63       339
weighted avg       0.68      0.68      0.66       339


Processing Samsung S24 Ultra...
Accuracy: 0.79
Positive ratio: 13.92%
Negative ratio: 86.08%

Class

In [12]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter

class BrandSentimentClassifier:
    def __init__(self, k=5):
        """Initialize the classifier"""
        self.k = k
        self.vectorizer = TfidfVectorizer(
            stop_words='english',
            min_df=2,
            max_df=0.95,
            ngram_range=(1, 2)
        )

    def prepare_data(self, df, brand):
        """Prepare data for specific brand"""
        # Filter reviews for specific brand
        brand_df = df[df['Product Brand'] == brand].copy()
        
        if len(brand_df) < 10:  # Too little data
            return None, None
            
     
        positive_keywords = {
          
            'good', 'great', 'nice', 'fine', 'decent', 'positive',
          
            'excellent', 'amazing', 'fantastic', 'awesome', 'perfect', 'superb',
            'outstanding', 'exceptional', 'remarkable', 'brilliant', 'wonderful',
            'impressive', 'incredible', 'marvelous', 'magnificent',
            
        
            'love', 'like', 'enjoy', 'admire', 'appreciate', 'satisfied',
            'happy', 'pleased', 'delighted', 'fond',
            
         
            'best', 'superior', 'premium', 'top-notch', 'first-rate', 'high-end',
            'leading', 'outstanding', 'elite', 'prime', 'stellar',
            
          
            'fast', 'smooth', 'stable', 'reliable', 'efficient', 'powerful',
            'responsive', 'quick', 'seamless', 'robust',
            
     
            'recommend', 'worth', 'recommended', 'valuable', 'worthwhile',
            
         
            'exceeded', 'surpassed', 'beyond', 'surprised', 'impressive',
            'extraordinary', 'exceptional', 'astounding',
            
         
            'quality', 'premium', 'refined', 'polished', 'solid', 'durable',
            'well-built', 'well-made',
            
         
            'innovative', 'advanced', 'cutting-edge', 'revolutionary', 'state-of-the-art',
            'breakthrough', 'pioneering',
            
         
            'comfortable', 'convenient', 'handy', 'user-friendly', 'intuitive',
            'practical', 'versatile', 'flexible'
        }

       
        negative_keywords = {
          
            'bad', 'poor', 'terrible', 'horrible', 'awful', 'negative',
            
      
            'defective', 'broken', 'faulty', 'damaged', 'malfunctioning',
            'defect', 'flawed', 'inferior', 'cheap', 'low-quality',
            
  
            'slow', 'sluggish', 'laggy', 'unstable', 'unreliable', 'weak',
            'unresponsive', 'crash', 'freeze', 'hang', 'bug', 'glitch',
            
          
            'hate', 'dislike', 'disappointed', 'disappointing', 'dissatisfied',
            'unhappy', 'frustrated', 'annoyed', 'annoying', 'irritating',
            
        
            'expensive', 'overpriced', 'costly', 'pricey', 'overvalued',
            'not worth', 'waste', 'wasted',
            
        
            'difficult', 'complicated', 'confusing', 'awkward', 'clunky',
            'cumbersome', 'unintuitive', 'impractical',
            
           
            'rude', 'unhelpful', 'unresponsive', 'poor service', 'bad support',
            
           
            'avoid', 'return', 'returned', 'refund', 'regret',
            'mistake', 'error', 'problem', 'issue', 'concern',
            
       
            'worst', 'disaster', 'catastrophe', 'nightmare', 'horrible',
            'terrible', 'useless', 'worthless', 'garbage', 'junk',
            
           
            'warranty', 'repair', 'replace', 'replacement', 'defect',
            'malfunction', 'break', 'broke'
        }
        
        
        def determine_sentiment(text):
            text = str(text).lower()
            has_positive = any(word in text for word in positive_keywords)
            has_negative = any(word in text for word in negative_keywords)
            
            if has_positive and not has_negative:
                return 1 
            elif has_negative and not has_positive:
                return 0  
            elif has_positive and has_negative:
              
                positive_count = sum(1 for word in positive_keywords if word in text)
                negative_count = sum(1 for word in negative_keywords if word in text)
                return 1 if positive_count > negative_count else 0
            else:
                return 0
        
        brand_df['Label'] = brand_df['Text'].apply(determine_sentiment)
        
        return brand_df['Text'], brand_df['Label']

    def classify_review(self, test_vec, train_vecs, train_labels):
        """KNN classification based on similarity"""
        # Calculate similarity
        similarities = cosine_similarity(test_vec, train_vecs).flatten()
        
        # Get indices of k most similar documents
        top_k_indices = np.argsort(similarities)[-self.k:][::-1]
        
        # Get labels for these k documents
        top_k_labels = [train_labels[i] for i in top_k_indices]
        
        # Return majority label
        return Counter(top_k_labels).most_common(1)[0][0]

    def train_and_evaluate(self, df, brand):
        """Train and evaluate the model"""
        print(f"\nProcessing {brand}...")
        
        # Prepare data
        X, y = self.prepare_data(df, brand)
        if X is None:
            print(f"Insufficient data for {brand}")
            return None
            
        # Split training and test sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        
        # Convert to TF-IDF features
        X_train_tfidf = self.vectorizer.fit_transform(X_train)
        X_test_tfidf = self.vectorizer.transform(X_test)
        
        # Reset indices
        y_train = y_train.reset_index(drop=True)
        y_test = y_test.reset_index(drop=True)
        
        # Predict test set
        y_pred = []
        for i in range(X_test_tfidf.shape[0]):
            test_vector = X_test_tfidf[i:i+1]
            pred_label = self.classify_review(test_vector, X_train_tfidf, y_train)
            y_pred.append(pred_label)
            
        # Evaluate results
        accuracy = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred)
        pos_ratio = sum(y_pred) / len(y_pred)
        
        print(f"Accuracy: {accuracy:.2f}")
        print(f"Positive ratio: {pos_ratio:.2%}")
        print(f"Negative ratio: {(1-pos_ratio):.2%}")
        print("\nClassification Report:")
        print(report)
        
        return {
            'accuracy': accuracy,
            'report': report,
            'positive_ratio': pos_ratio,
            'negative_ratio': 1 - pos_ratio,
            'predictions': y_pred,
            'true_labels': y_test
        }

def main():
    # Load data
    print("Loading data...")
    df = pd.read_csv('all_brands_combined_mobile_data_final.csv')
    
    # Create classifier
    classifier = BrandSentimentClassifier(k=5)
    
    # Brands to analyze
    brands = ['Google Pixel', 'Oppo Find X3 Pro', 'Samsung S24 Ultra', 
              'Xiaomi 14 Ultra', 'iPhone']
    
    # Analyze each brand
    results = {}
    for brand in brands:
        result = classifier.train_and_evaluate(df, brand)
        if result:
            results[brand] = result
    
    # Print overall comparison results
    print("\nOverall Brand Sentiment Comparison:")
    print("-" * 50)
    
    # Create comparison table
    comparison_data = []
    for brand in brands:
        if brand in results:
            comparison_data.append({
                'Brand': brand,
                'Accuracy': f"{results[brand]['accuracy']:.2f}",
                'Positive Ratio': f"{results[brand]['positive_ratio']:.2%}",
                'Negative Ratio': f"{results[brand]['negative_ratio']:.2%}"
            })
    
    comparison_df = pd.DataFrame(comparison_data)
    print("\nBrand Sentiment Summary:")
    print(comparison_df.to_string(index=False))

if __name__ == "__main__":
    main()

Loading data...

Processing Google Pixel...
Accuracy: 0.72
Positive ratio: 14.13%
Negative ratio: 85.87%

Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.93      0.82       621
           1       0.65      0.28      0.39       299

    accuracy                           0.72       920
   macro avg       0.69      0.60      0.60       920
weighted avg       0.70      0.72      0.68       920


Processing Oppo Find X3 Pro...
Accuracy: 0.71
Positive ratio: 17.70%
Negative ratio: 82.30%

Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.89      0.81       233
           1       0.57      0.32      0.41       106

    accuracy                           0.71       339
   macro avg       0.65      0.60      0.61       339
weighted avg       0.69      0.71      0.68       339


Processing Samsung S24 Ultra...
Accuracy: 0.82
Positive ratio: 8.76%
Negative ratio: 91.24%

Classi

In [21]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter

class MobileReviewClassifier:
    def __init__(self, k=5):
        """
        Initialize the classifier
        k: number of neighbors in KNN algorithm
        """
        self.k = k
        self.vectorizer = TfidfVectorizer(
            stop_words='english',
            min_df=2,  # minimum document frequency
            max_df=0.95,  # maximum document frequency
            ngram_range=(1, 2)  # use unigrams and bigrams
        )
        
        self.dimension_keywords = {
            'battery life': ['battery', 'battery life', 'charge', 'charging', 'power'],
            'camera': ['camera', 'photo', 'picture', 'image', 'photography', 'shoot'],
            'design': ['design', 'build', 'look', 'build quality', 'material'],
            'display': ['display', 'screen', 'resolution', 'brightness', 'color'],
            'performance': ['performance', 'speed', 'fast', 'slow', 'processor', 'lag'],
            'software': ['software', 'system', 'os', 'android', 'ios', 'interface'],
            'speaker': ['speaker', 'sound', 'audio', 'volume', 'music']
        }

    def prepare_data(self, df, brand, dimension):
        """Prepare data for specific brand and dimension"""
        # Filter reviews for specific brand
        brand_df = df[df['Product Brand'] == brand].copy()
        
        # Filter reviews containing relevant keywords
        keywords = self.dimension_keywords[dimension]
        mask = brand_df['Text'].str.lower().apply(lambda x: any(keyword in x for keyword in keywords))
        dimension_df = brand_df[mask].copy()
        
        if len(dimension_df) < 10:  # Too little data
            return None, None
            
        # Set initial labels for training (needs to be adjusted based on actual situation)
        # Assumes reviews containing positive keywords are positive
        positive_keywords = {'good', 'great', 'excellent', 'amazing', 'love', 'perfect'}
        dimension_df['Label'] = dimension_df['Text'].str.lower().apply(
            lambda x: 1 if any(word in x for word in positive_keywords) else 0
        )
        
        return dimension_df['Text'], dimension_df['Label']

    def classify_review(self, test_vec, train_vecs, train_labels):
        """KNN classification based on similarity"""
        # Calculate similarity
        similarities = cosine_similarity(test_vec, train_vecs).flatten()
        
        # Get indices of k most similar documents
        top_k_indices = np.argsort(similarities)[-self.k:][::-1]
        
        # Get labels for these k documents
        top_k_labels = [train_labels[i] for i in top_k_indices]
        
        # Return majority label
        return Counter(top_k_labels).most_common(1)[0][0]

    def train_and_evaluate(self, df, brand, dimension):
        """Train and evaluate the model"""
        # Prepare data
        X, y = self.prepare_data(df, brand, dimension)
        if X is None:
            print(f"Insufficient data for {brand} - {dimension}")
            return None
            
        # Split training and test sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        
        # Convert text to TF-IDF features
        X_train_tfidf = self.vectorizer.fit_transform(X_train)
        X_test_tfidf = self.vectorizer.transform(X_test)
        
        # Reset indices
        y_train = y_train.reset_index(drop=True)
        y_test = y_test.reset_index(drop=True)
        
        # Predict test set
        y_pred = []
        for i in range(X_test_tfidf.shape[0]):
            test_vector = X_test_tfidf[i:i+1]
            pred_label = self.classify_review(test_vector, X_train_tfidf, y_train)
            y_pred.append(pred_label)
            
        # Evaluate results
        accuracy = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred)
        
        return {
            'accuracy': accuracy,
            'report': report,
            'positive_ratio': sum(y_pred) / len(y_pred),
            'negative_ratio': 1 - (sum(y_pred) / len(y_pred))
        }

def main():
    # Load data
    print("Loading data...")
    df = pd.read_csv('all_brands_combined_mobile_data_final.csv')
    
    # Create classifier
    classifier = MobileReviewClassifier(k=5)
    
    # Define brands to analyze
    brands = ['Google Pixel', 'Oppo Find X3 Pro', 'Samsung S24 Ultra', 
              'Xiaomi 14 Ultra', 'iPhone']
    
    # Analyze each dimension for each brand
    results = {}
    
    for brand in brands:
        print(f"\nProcessing {brand}...")
        brand_results = {}
        
        for dimension in classifier.dimension_keywords.keys():
            print(f"Training classifier for {dimension}")
            result = classifier.train_and_evaluate(df, brand, dimension)
            
            if result:
                brand_results[dimension] = result
                print(f"Accuracy: {result['accuracy']:.2f}")
                print(f"Positive ratio: {result['positive_ratio']:.2%}")
                print(f"Negative ratio: {result['negative_ratio']:.2%}")
                print("\nClassification Report:")
                print(result['report'])
        
        results[brand] = brand_results
    
    # Print overall results
    print("\nOverall Sentiment Analysis Results:")
    for brand in brands:
        if brand in results:
            print(f"\n{brand} Analysis:")
            print("-" * 50)
            
            for dimension, metrics in results[brand].items():
                print(f"\n{dimension.title()}:")
                print(f"Accuracy: {metrics['accuracy']:.2f}")
                print(f"Positive ratio: {metrics['positive_ratio']:.2%}")
                print(f"Negative ratio: {metrics['negative_ratio']:.2%}")

if __name__ == "__main__":
    main()

Loading data...

Processing Google Pixel...
Training classifier for battery life
Accuracy: 0.74
Positive ratio: 43.69%
Negative ratio: 56.31%

Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.75      0.78        63
           1       0.64      0.72      0.68        40

    accuracy                           0.74       103
   macro avg       0.73      0.74      0.73       103
weighted avg       0.75      0.74      0.74       103

Training classifier for camera
Accuracy: 0.76
Positive ratio: 40.97%
Negative ratio: 59.03%

Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.79      0.80        86
           1       0.69      0.71      0.70        58

    accuracy                           0.76       144
   macro avg       0.75      0.75      0.75       144
weighted avg       0.76      0.76      0.76       144

Training classifier for design
Accuracy: 0.75
Positive ratio: 36

## Word2Dec

In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from gensim.models import Word2Vec
import nltk
from nltk.tokenize import word_tokenize
import re

class Word2VecSvmClassifier:
    def __init__(self, vector_size=100):
        """
        Initialize the classifier
        vector_size: Dimension of Word2Vec vectors, higher dimensions can capture richer semantic information but require more computational resources
        """
        # Download required NLTK data
        nltk.download('punkt', quiet=True)
        
        self.dimension_keywords = {
            'battery life': ['battery', 'battery life', 'charge', 'charging', 'power'],
            'camera': ['camera', 'photo', 'picture', 'image', 'photography', 'shoot'],
            'design': ['design', 'build', 'look', 'build quality', 'material'],
            'display': ['display', 'screen', 'resolution', 'brightness', 'color'],
            'performance': ['performance', 'speed', 'fast', 'slow', 'processor', 'lag'],
            'software': ['software', 'system', 'os', 'android', 'ios', 'interface'],
            'speaker': ['speaker', 'sound', 'audio', 'volume', 'music']
        }
        
        self.target_brands = [
            'Google Pixel',
            'Oppo Find X3 Pro',
            'Samsung S24 Ultra',
            'Xiaomi 14 Ultra',
            'iPhone'
        ]
        
        self.dimensions = list(self.dimension_keywords.keys())
        self.vector_size = vector_size
        self.word2vec_model = None
        
        # Create an SVM classifier for each dimension of each brand
        self.classifiers = {
            brand: {dim: SVC(kernel='rbf', probability=True) 
                   for dim in self.dimensions}
            for brand in self.target_brands
        }

    def preprocess_text(self, text):
        """
        Clean and tokenize text
        Convert text to lowercase, remove special characters, and tokenize
        """
        if not isinstance(text, str):
            return []
        # Convert to lowercase
        text = text.lower()
        # Remove special characters
        text = re.sub(r'[^a-zA-Z\s]', ' ', text)
        # Tokenize
        return word_tokenize(text)

    def train_word2vec(self, texts):
        """
        Train Word2Vec model
        Use all review texts to learn word vectors to capture domain-specific word semantics
        """
        print("Training Word2Vec model...")
        # Preprocess all texts
        processed_texts = [self.preprocess_text(text) for text in texts]
        
        # Train Word2Vec model
        self.word2vec_model = Word2Vec(
            sentences=processed_texts,
            vector_size=self.vector_size,  # Word vector dimension
            window=5,  # Context window size
            min_count=1,  # Minimum word frequency
            workers=4  # Number of training threads
        )
        print("Word2Vec model training completed")

    def get_text_vector(self, text):
        """
        Convert text to vector representation
        Represent entire text by averaging word vectors
        """
        tokens = self.preprocess_text(text)
        vectors = []
        
        for token in tokens:
            try:
                # Get word vector
                vector = self.word2vec_model.wv[token]
                vectors.append(vector)
            except KeyError:
                continue
        
        if not vectors:
            return np.zeros(self.vector_size)
        
        # Return average of all word vectors
        return np.mean(vectors, axis=0)

    def prepare_dimension_data(self, texts, brand, dimension):
        """
        Prepare training data for specific brand and dimension
        """
        X = []  # Store text vectors
        y = []  # Store sentiment labels
        
        for text in texts:
            # Check if text contains relevant dimension keywords
            if any(keyword in text.lower() for keyword in self.dimension_keywords[dimension]):
                # Get vector representation of text
                text_vector = self.get_text_vector(text)
                X.append(text_vector)
                
                # Determine sentiment label (simplified version, should use annotated data in practice)
                sentiment = 1 if any(pos in text.lower() 
                    for pos in ['good', 'great', 'excellent', 'amazing']) else 0
                y.append(sentiment)
        
        if not X:
            return None, None
            
        return np.array(X), np.array(y)

    def train_and_evaluate(self, df):
        """
        Train models and evaluate performance
        """
        # First train Word2Vec model
        self.train_word2vec(df['Text'])
        
        # Filter target brand data
        df = df[df['Product Brand'].isin(self.target_brands)]
        
        # Split training and test sets
        train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
        
        results = {}
        
        for brand in self.target_brands:
            print(f"\nProcessing {brand}...")
            brand_results = {}
            
            brand_train = train_df[train_df['Product Brand'] == brand]
            brand_test = test_df[test_df['Product Brand'] == brand]
            
            for dimension in self.dimensions:
                print(f"Training classifier for {dimension}")
                
                # Prepare training data
                X_train, y_train = self.prepare_dimension_data(
                    brand_train['Text'], brand, dimension
                )
                
                if X_train is None or len(X_train) < 10:
                    print(f"Insufficient data for {brand} - {dimension}")
                    continue
                
                # Prepare test data
                X_test, y_test = self.prepare_dimension_data(
                    brand_test['Text'], brand, dimension
                )
                
                if X_test is None or len(X_test) < 5:
                    print(f"Insufficient test data for {brand} - {dimension}")
                    continue
                
                # Train SVM classifier
                classifier = self.classifiers[brand][dimension]
                classifier.fit(X_train, y_train)
                
                # Predict and evaluate
                y_pred = classifier.predict(X_test)
                accuracy = accuracy_score(y_test, y_pred)
                report = classification_report(y_test, y_pred)
                
                # Save results
                brand_results[dimension] = {
                    'accuracy': accuracy,
                    'report': report,
                    'positive_ratio': sum(y_pred == 1) / len(y_pred),
                    'negative_ratio': sum(y_pred == 0) / len(y_pred)
                }
                
                print(f"Accuracy: {accuracy:.2f}")
                print("\nClassification Report:")
                print(report)
            
            results[brand] = brand_results
        
        return results

def main():
    # Load data
    df = pd.read_csv('all_brands_combined_mobile_data_final.csv')
    
    # Create classifier and train/evaluate
    classifier = Word2VecSvmClassifier(vector_size=100)
    results = classifier.train_and_evaluate(df)
    
    # Print overall results
    print("\nOverall Sentiment Analysis Results:")
    for brand in classifier.target_brands:
        if brand in results:
            print(f"\n{brand} Analysis:")
            print("-" * 50)
            
            for dimension, metrics in results[brand].items():
                print(f"\n{dimension.title()}:")
                print(f"Accuracy: {metrics['accuracy']:.2f}")
                print(f"Positive ratio: {metrics['positive_ratio']:.2%}")
                print(f"Negative ratio: {metrics['negative_ratio']:.2%}")

if __name__ == "__main__":
    main()

Training Word2Vec model...
Word2Vec model training completed

Processing Google Pixel...
Training classifier for battery life
Accuracy: 0.77

Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.99      0.87        75
           1       0.80      0.15      0.26        26

    accuracy                           0.77       101
   macro avg       0.79      0.57      0.56       101
weighted avg       0.78      0.77      0.71       101

Training classifier for camera
Accuracy: 0.77

Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.99      0.86       115
           1       0.86      0.15      0.25        41

    accuracy                           0.77       156
   macro avg       0.81      0.57      0.56       156
weighted avg       0.79      0.77      0.70       156

Training classifier for design
Accuracy: 0.73

Classification Report:
              precision    recall  f1-sco

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.83

Classification Report:
              precision    recall  f1-score   support

           0       0.83      1.00      0.90        71
           1       0.00      0.00      0.00        15

    accuracy                           0.83        86
   macro avg       0.41      0.50      0.45        86
weighted avg       0.68      0.83      0.75        86

Training classifier for performance
Accuracy: 0.70

Classification Report:
              precision    recall  f1-score   support

           0       0.69      1.00      0.82        76
           1       1.00      0.08      0.15        37

    accuracy                           0.70       113
   macro avg       0.85      0.54      0.48       113
weighted avg       0.79      0.70      0.60       113

Training classifier for software
Accuracy: 0.81

Classification Report:
              precision    recall  f1-score   support

           0       0.81      1.00      0.89       244
           1       1.00      0.03      0.06        

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.75

Classification Report:
              precision    recall  f1-score   support

           0       0.74      1.00      0.85        54
           1       1.00      0.17      0.30        23

    accuracy                           0.75        77
   macro avg       0.87      0.59      0.57        77
weighted avg       0.82      0.75      0.68        77

Training classifier for design
Accuracy: 0.76

Classification Report:
              precision    recall  f1-score   support

           0       0.76      1.00      0.87        26
           1       0.00      0.00      0.00         8

    accuracy                           0.76        34
   macro avg       0.38      0.50      0.43        34
weighted avg       0.58      0.76      0.66        34

Training classifier for display
Accuracy: 0.73

Classification Report:
              precision    recall  f1-score   support

           0       0.73      1.00      0.84        24
           1       0.00      0.00      0.00         9

  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.84

Classification Report:
              precision    recall  f1-score   support

           0       0.84      1.00      0.91        79
           1       1.00      0.06      0.12        16

    accuracy                           0.84        95
   macro avg       0.92      0.53      0.52        95
weighted avg       0.87      0.84      0.78        95

Training classifier for speaker
Accuracy: 0.71

Classification Report:
              precision    recall  f1-score   support

           0       0.71      1.00      0.83         5
           1       0.00      0.00      0.00         2

    accuracy                           0.71         7
   macro avg       0.36      0.50      0.42         7
weighted avg       0.51      0.71      0.60         7


Processing Samsung S24 Ultra...
Training classifier for battery life
Accuracy: 0.82

Classification Report:
              precision    recall  f1-score   support

           0       0.82      1.00      0.90        46
           1      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

Accuracy: 0.89

Classification Report:
              precision    recall  f1-score   support

           0       0.89      1.00      0.94        59
           1       0.00      0.00      0.00         7

    accuracy                           0.89        66
   macro avg       0.45      0.50      0.47        66
weighted avg       0.80      0.89      0.84        66

Training classifier for performance
Accuracy: 0.71

Classification Report:
              precision    recall  f1-score   support

           0       0.71      1.00      0.83        15
           1       0.00      0.00      0.00         6

    accuracy                           0.71        21
   macro avg       0.36      0.50      0.42        21
weighted avg       0.51      0.71      0.60        21

Training classifier for software
Accuracy: 0.84

Classification Report:
              precision    recall  f1-score   support

           0       0.84      1.00      0.91        64
           1       0.00      0.00      0.00        

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.82

Classification Report:
              precision    recall  f1-score   support

           0       0.82      1.00      0.90       199
           1       0.00      0.00      0.00        45

    accuracy                           0.82       244
   macro avg       0.41      0.50      0.45       244
weighted avg       0.67      0.82      0.73       244

Training classifier for camera


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.79

Classification Report:
              precision    recall  f1-score   support

           0       0.79      1.00      0.88       153
           1       0.00      0.00      0.00        40

    accuracy                           0.79       193
   macro avg       0.40      0.50      0.44       193
weighted avg       0.63      0.79      0.70       193

Training classifier for design
Accuracy: 0.76

Classification Report:
              precision    recall  f1-score   support

           0       0.76      1.00      0.86        57
           1       0.00      0.00      0.00        18

    accuracy                           0.76        75
   macro avg       0.38      0.50      0.43        75
weighted avg       0.58      0.76      0.66        75

Training classifier for display


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.83

Classification Report:
              precision    recall  f1-score   support

           0       0.83      1.00      0.91       103
           1       0.00      0.00      0.00        21

    accuracy                           0.83       124
   macro avg       0.42      0.50      0.45       124
weighted avg       0.69      0.83      0.75       124

Training classifier for performance
Accuracy: 0.82

Classification Report:
              precision    recall  f1-score   support

           0       0.82      1.00      0.90        53
           1       0.00      0.00      0.00        12

    accuracy                           0.82        65
   macro avg       0.41      0.50      0.45        65
weighted avg       0.66      0.82      0.73        65

Training classifier for software
Accuracy: 0.83

Classification Report:
              precision    recall  f1-score   support

           0       0.83      1.00      0.91       198
           1       0.00      0.00      0.00        

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
