# 🛍️ ReviewSense: Product Review Analysis Engine

> *ReviewSense is a comprehensive, end-to-end Natural Language Processing application built to extract deep, actionable insights from unstructured product reviews.*  
Where a simple star rating only tells part of the story, ReviewSense dives into the text to uncover what customers are saying, why they're saying it, and how they feel about specific product features.  

## Imports

In [None]:
import pytorch_lightning as pl
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
import os
import numpy as np
from sklearn.model_selection import train_test_split, ParameterGrid, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
from transformers import T5ForConditionalGeneration, T5Tokenizer
from transformers import AutoModelForSequenceClassification, get_linear_schedule_with_warmup, AutoConfig
from torch.optim import AdamW
import torch
from torchmetrics.functional import accuracy
from transformers import T5ForConditionalGeneration, T5Tokenizer, AutoTokenizer, pipeline



## Prepare the data

In [None]:
def explore_and_preprocess_reviews(
    train_path='data/train.csv',
    test_path='data/test.csv',
    output_dir='data'
):
    """
    Merge the raw Amazon review splits into one cleaned sentiment CSV.

    The input is the Amazon Sentiment Analysis dataset
    (https://www.kaggle.com/datasets/kritanjalijain/amazon-reviews); the
    train/test CSVs must first be extracted from the zip into the data folder.
    Both files are read without a header row and given the column names
    sentiment_orig / title / review. They are concatenated, rows containing any
    missing value are dropped, the label is remapped (1 -> 0, anything else
    -> 1), and title and review are joined into a single 'full_text' column.
    The result is written to '<output_dir>/reviews_processed.csv'.

    Args:
        train_path (str): Path to the training CSV file.
        test_path (str): Path to the testing CSV file.
        output_dir (str): Directory to save the processed file (created if
                          it does not exist).

    Returns:
        None. If either input file is missing, a hint is printed and the
        function returns without writing anything.
    """
    print(f"Loading data from '{train_path}' and '{test_path}'...")

    # The raw files carry no header row, so the column names are supplied here.
    # Original label encoding: 1 = Negative, 2 = Positive.
    raw_columns = ['sentiment_orig', 'title', 'review']
    try:
        splits = [
            pd.read_csv(csv_path, header=None, names=raw_columns)
            for csv_path in (train_path, test_path)
        ]
    except FileNotFoundError:
        print(f"\nERROR: Make sure '{train_path}' and '{test_path}' are in the specified directory.")
        print("This script is designed for the 'Amazon Reviews for Sentiment Analysis' dataset from Kaggle.")
        return

    # Both splits are processed together; rows with any missing field are dropped.
    reviews = pd.concat(splits, ignore_index=True).dropna()

    print("\n--- Preprocessing Data for Sentiment Analysis ---")

    # Binary target: 0 = Negative (original label 1), 1 = Positive (otherwise).
    sentiment = reviews['sentiment_orig'].ne(1).astype(int)

    # Title and body are joined into one text field, separated by ". ".
    titles = reviews['title'].astype(str)
    bodies = reviews['review'].astype(str)
    full_text = titles + ". " + bodies

    processed_df = pd.DataFrame({'full_text': full_text, 'sentiment': sentiment})

    # Persist the two-column result for the modelling steps.
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, 'reviews_processed.csv')
    processed_df.to_csv(output_path, index=False)
    print(f"\nSaved {len(processed_df)} processed reviews to '{output_path}'")


In [None]:
#--- Preprocess the Reviews Dataset ---
# Runs the preprocessing step with its default arguments: reads
# data/train.csv and data/test.csv and writes data/reviews_processed.csv.
print("\n--- Preprocessing started ---")
explore_and_preprocess_reviews()
# NOTE: this line is printed even when the function returned early because the
# input files were missing; check the output above for an ERROR message.
print("\n--- Preprocessing finished ---")

## Define a baseline model (Multinomial Naive Bayes)

In [None]:
def train_baseline_sentiment_model(data_path='data/reviews_processed.csv', grid_search=True, nb__alpha=0.1, tfidf__max_df=0.75, tfidf__ngram_range=(1, 2), sample_size: int = 50000):
    """
    Trains and evaluates a TF-IDF + Multinomial Naive Bayes sentiment baseline.

    The data is split 80/20 (stratified) into training and test sets. When
    grid_search is enabled, hyperparameters are selected on a validation split
    carved out of the training set, so the test set is only used once, for the
    final report. The final model is always refit on the full training set.

    Args:
        data_path (str): Path to the processed reviews CSV file (needs the
                         'full_text' and 'sentiment' columns).
        grid_search (bool): If True, performs a grid search over n-gram range,
                            max_df and alpha; otherwise the values below are used.
        nb__alpha (float): Alpha for MultinomialNB (used when grid_search is False).
        tfidf__max_df (float): max_df for TfidfVectorizer (used when grid_search is False).
        tfidf__ngram_range (tuple): ngram_range for TfidfVectorizer (used when grid_search is False).
        sample_size (int, optional): Number of reviews to use. If None (or 0),
                                     or larger than the dataset, all rows are used.

    Returns:
        None. Metrics are printed and the confusion matrix is plotted. If
        data_path does not exist, an error is printed and nothing else happens.
    """
    # --- 1. Load Data ---
    print(f"Loading data from '{data_path}'...")
    if not os.path.exists(data_path):
        print(f"\nERROR: '{data_path}' not found. Please run the EDA script first!")
        return

    df = pd.read_csv(data_path)
    df.dropna(inplace=True)

    # --- 2. Sample Data ---
    if sample_size:
        if sample_size < len(df):
            print(f"Using a sample of {sample_size} reviews for training the baseline model.")
            df = df.sample(n=sample_size, random_state=42)
        else:
            # DataFrame.sample raises when n exceeds the row count, so fall
            # back to the full dataset instead of crashing.
            print(f"Requested a sample of {sample_size} reviews but only {len(df)} are available; using all of them.")

    # --- 3. Train-Test Split ---
    print("Splitting data into training and testing sets...")
    X_train, X_test, y_train, y_test = train_test_split(
        df['full_text'],
        df['sentiment'],
        test_size=0.2,
        random_state=42,
        stratify=df['sentiment']
    )

    # --- 4. Create a Pipeline ---
    # Named model_pipeline so it does not shadow the `pipeline` factory
    # imported from transformers at the top of the file.
    model_pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english')),
        ('nb', MultinomialNB()),
    ])

    best_params = None

    if grid_search:
        # --- 5a. Perform Grid Search ---
        print("Performing Grid Search to find the best hyperparameters...")
        parameters = {
            'tfidf__ngram_range': [(1, 1), (1, 2)],
            'tfidf__max_df': [0.5, 0.75, 1.0],
            'nb__alpha': [0.1, 0.5, 1.0],
        }

        # Candidates are compared on a validation split of the training data.
        # Scoring them on the test set would leak it into model selection and
        # make the final accuracy optimistic.
        X_fit, X_val, y_fit, y_val = train_test_split(
            X_train,
            y_train,
            test_size=0.2,
            random_state=42,
            stratify=y_train
        )

        best_score = -1
        for params in tqdm(list(ParameterGrid(parameters)), desc="Grid Search Progress"):
            model_pipeline.set_params(**params)
            model_pipeline.fit(X_fit, y_fit)
            score = model_pipeline.score(X_val, y_val)
            if score > best_score:
                best_score = score
                best_params = params

        print(f"\nBest score on validation set: {best_score:.4f}")
        print("Best parameters found:")
        print(best_params)

    else:
        # --- 5b. Use provided hyperparameters ---
        print("Skipping grid search and using provided hyperparameters...")
        best_params = {
            'nb__alpha': nb__alpha,
            'tfidf__max_df': tfidf__max_df,
            'tfidf__ngram_range': tfidf__ngram_range
        }

    # --- 6. Train the Final Model ---
    # Refit on the complete training set with the selected hyperparameters.
    print("\nTraining final model...")
    best_model = model_pipeline.set_params(**best_params)
    best_model.fit(X_train, y_train)
    print("Model training complete.")

    # --- 7. Evaluate the Best Model ---
    print("\n--- Model Evaluation ---")
    y_pred = best_model.predict(X_test)

    test_accuracy = accuracy_score(y_test, y_pred)
    target_names = ['Negative', 'Positive']

    print(f"Accuracy: {test_accuracy:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=target_names))

    print("Confusion Matrix:")
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Greens',
                xticklabels=target_names, yticklabels=target_names)
    plt.title('Confusion Matrix for Naive Bayes on Amazon Reviews')
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.show()

In [None]:
#--- Train the base model ---
# Trains the Naive Bayes baseline on a 150,000-review sample, skipping the grid
# search and using the function's default hyperparameters
# (alpha=0.1, max_df=0.75, ngram_range=(1, 2)).
train_baseline_sentiment_model(sample_size=150000, grid_search=False)

## Define the dataset and lightning DataModule

In [None]:
class ReviewDataset(Dataset):
    """
    Map-style PyTorch Dataset over a DataFrame of reviews.

    Each item is produced lazily: the row's text is tokenized on access
    (padded/truncated to a fixed length) and returned together with its
    sentiment label as tensors.

    Attributes:
        data (pd.DataFrame): Rows with 'full_text' and 'sentiment' columns.
        tokenizer: The Hugging Face tokenizer used to encode the text.
        max_token_len (int): Fixed sequence length every sample is padded or
                             truncated to.
    """
    def __init__(self, data: pd.DataFrame, tokenizer, max_token_len: int):
        """
        Stores the data source and tokenization settings.

        Args:
            data (pd.DataFrame): DataFrame with 'full_text' and 'sentiment'
                                 columns; rows are accessed by position.
            tokenizer: The pre-trained tokenizer instance.
            max_token_len (int): The maximum length for tokenized sequences.
        """
        self.data = data
        self.tokenizer = tokenizer
        self.max_token_len = max_token_len

    def __len__(self):
        """
        Returns the number of rows in the underlying DataFrame.
        """
        return len(self.data.index)

    def __getitem__(self, index: int):
        """
        Tokenizes and returns the sample at the given position.

        Args:
            index (int): Positional index of the row to fetch.

        Returns:
            dict: Tensors ready for the model:
                  - 'input_ids': 1-D token IDs of the review text.
                  - 'attention_mask': 1-D attention mask for the review text.
                  - 'labels': The sentiment label as a long tensor.
        """
        row = self.data.iloc[index]

        encoded = self.tokenizer.encode_plus(
            str(row["full_text"]),
            add_special_tokens=True,
            max_length=self.max_token_len,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='pt',
        )

        # The tokenizer returns a leading batch dimension; flatten it away so
        # the DataLoader can stack samples itself.
        sample = {
            "input_ids": encoded["input_ids"].flatten(),
            "attention_mask": encoded["attention_mask"].flatten(),
            "labels": torch.tensor(row["sentiment"], dtype=torch.long),
        }
        return sample

class ReviewDataModule(pl.LightningDataModule):
    """
    PyTorch Lightning DataModule to handle the Amazon Reviews dataset.

    This class encapsulates all the steps needed to process the data:
    loading, splitting, and creating PyTorch DataLoaders for training,
    validation, testing, and prediction. It allows for using a smaller random
    sample of the full dataset for faster experimentation.

    Attributes:
        data_path (str): Path to the processed CSV file.
        batch_size (int): The size of each data batch.
        max_token_len (int): The maximum sequence length for the tokenizer.
        tokenizer: The Hugging Face tokenizer instance.
        num_workers (int): Number of subprocesses used for data loading.
        sample_size (int, optional): The number of samples to use. If None,
                                     the full dataset is used.
        train_df, val_df, test_df (pd.DataFrame): The splits; None until
                                     setup() has run.
    """
    def __init__(self, data_path: str, batch_size: int = 16, max_token_len: int = 256, model_name='distilbert-base-uncased', num_workers: int = 0, sample_size: int = None):
        """
        Initializes the ReviewDataModule.

        Args:
            data_path (str): The path to the processed CSV data file.
            batch_size (int): The number of samples per batch.
            max_token_len (int): Maximum length of tokenized sequences.
            model_name (str): The name of the pre-trained model to use for the tokenizer.
            num_workers (int): Number of subprocesses to use for data loading.
            sample_size (int, optional): If specified, a random sample of this
                                         size will be used from the dataset.
                                         Defaults to None, which uses the full dataset.
        """
        super().__init__()
        self.data_path = data_path
        self.batch_size = batch_size
        self.max_token_len = max_token_len
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.num_workers = num_workers
        self.sample_size = sample_size
        self.train_df = None
        self.val_df = None
        self.test_df = None

    def setup(self, stage=None):
        """
        Loads and splits the data for training, validation, and testing.

        Reads the CSV, drops rows with missing values, optionally takes a
        random sample, and performs a stratified 81/9/10 train/val/test split
        (10% test, then 10% of the remainder for validation). Indices of the
        resulting DataFrames are reset so positional access is contiguous.

        The method is a no-op once the splits exist: Lightning calls setup()
        once per stage, and the split is deterministic (fixed random_state), so
        re-reading the CSV would only repeat the same work.

        Args:
            stage: Stage name passed by Lightning; unused, every split is
                   prepared regardless of stage.
        """
        already_split = (
            self.train_df is not None
            and self.val_df is not None
            and self.test_df is not None
        )
        if already_split:
            return

        df = pd.read_csv(self.data_path)
        df.dropna(inplace=True)

        # If a sample size is provided, sample the dataframe. A request larger
        # than the dataset would make DataFrame.sample raise, so in that case
        # the full dataset is used instead.
        if self.sample_size:
            if self.sample_size < len(df):
                print(f"Using a sample of {self.sample_size} reviews.")
                df = df.sample(n=self.sample_size, random_state=42)
            else:
                print(f"Requested a sample of {self.sample_size} reviews but only {len(df)} are available; using all of them.")

        # Stratified split to maintain label distribution
        train_val_df, test_df = train_test_split(df, test_size=0.1, random_state=42, stratify=df.sentiment)
        train_df, val_df = train_test_split(train_val_df, test_size=0.1, random_state=42, stratify=train_val_df.sentiment)

        # Reset indices to prevent KeyErrors
        self.train_df = train_df.reset_index(drop=True)
        self.val_df = val_df.reset_index(drop=True)
        self.test_df = test_df.reset_index(drop=True)

        print(f"Size of training set: {len(self.train_df)}")
        print(f"Size of validation set: {len(self.val_df)}")
        print(f"Size of test set: {len(self.test_df)}")

    def _make_loader(self, frame, shuffle=False):
        """Builds a DataLoader over one split; shared so all loaders use the same settings."""
        return DataLoader(
            ReviewDataset(frame, self.tokenizer, self.max_token_len),
            batch_size=self.batch_size,
            shuffle=shuffle,
            num_workers=self.num_workers
        )

    def train_dataloader(self):
        """Returns the (shuffled) DataLoader for the training set."""
        return self._make_loader(self.train_df, shuffle=True)

    def val_dataloader(self):
        """Returns the DataLoader for the validation set."""
        return self._make_loader(self.val_df)

    def test_dataloader(self):
        """Returns the DataLoader for the test set."""
        return self._make_loader(self.test_df)

    def predict_dataloader(self):
        """
        Returns the DataLoader used by Trainer.predict.

        Serves the test split in its stored order, so predictions line up
        row-for-row with `test_df.sentiment`.
        """
        return self._make_loader(self.test_df)
        

In [None]:
# --- Configuration ---
# `data_path` is a notebook-level variable; the DistilBERT training cell below
# reuses it.
data_path = "data/reviews_processed.csv"
BATCH_SIZE = 64
MAX_TOKEN_LEN = 256

# Sanity check of the data pipeline: build the datamodule on a 100,000-review
# sample and look at a single training batch before any model is trained.
print("Initializing ReviewDataModule...")
review_datamodule = ReviewDataModule(
    data_path=data_path,
    batch_size=BATCH_SIZE,
    max_token_len=MAX_TOKEN_LEN,
    model_name='distilbert-base-uncased',
    sample_size=100000 # Pass the sample size to the datamodule
)
# setup() is called by hand here because no Trainer is involved in this cell.
review_datamodule.setup()

# Fetch one batch from the training dataloader to inspect its contents
print("\n--- Fetching one batch from the training dataloader ---")
train_batch = next(iter(review_datamodule.train_dataloader()))

# Every sample is padded/truncated to MAX_TOKEN_LEN, so input_ids and
# attention_mask are expected to be (BATCH_SIZE, MAX_TOKEN_LEN) and labels
# (BATCH_SIZE,).
print("\n--- Example Batch ---")
print(f"Input IDs shape: {train_batch['input_ids'].shape}")
print(f"Attention Mask shape: {train_batch['attention_mask'].shape}")
print(f"Labels: {train_batch['labels']}")
print(f"Labels shape: {train_batch['labels'].shape}")

## Fine-tune DistilBERT

In [None]:
class SentimentClassifier(pl.LightningModule):
    """
    PyTorch Lightning module for the sentiment classification model.

    Wraps a Hugging Face sequence-classification model and defines the
    training / validation / test / predict steps plus the optimizer and
    learning-rate schedule.
    """

    # Dropout fields differ between architectures: BERT-style configs use the
    # first two names, DistilBERT uses the last three. Only the fields a given
    # config actually defines are overridden.
    _DROPOUT_FIELDS = (
        "hidden_dropout_prob",
        "attention_probs_dropout_prob",
        "dropout",
        "attention_dropout",
        "seq_classif_dropout",
    )

    def __init__(self, model_name='distilbert-base-uncased', n_classes=2, learning_rate=2e-5, n_warmup_steps=0, n_training_steps=0, dropout_prob=0.2): # Added dropout
        """
        Builds the classifier.

        Args:
            model_name (str): Name of the pre-trained transformer to fine-tune.
            n_classes (int): Number of output classes.
            learning_rate (float): Peak learning rate for AdamW.
            n_warmup_steps (int): Number of linear warm-up steps.
            n_training_steps (int): Total optimizer steps for the linear decay.
                If 0 or less, the step count is taken from the attached Trainer
                when the optimizer is configured.
            dropout_prob (float): Dropout probability applied to every dropout
                field the model configuration exposes.
        """
        super().__init__()
        self.save_hyperparameters()

        # Configure dropout. Setting a field the config does not define would
        # be silently ignored by the model, so only known fields are touched.
        config = AutoConfig.from_pretrained(model_name)
        for field in self._DROPOUT_FIELDS:
            if hasattr(config, field):
                setattr(config, field, dropout_prob)
        config.num_labels = n_classes

        self.model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)

    def forward(self, input_ids, attention_mask, labels=None):
        """Runs the wrapped model; the output carries `.logits` and, when labels are given, `.loss`."""
        return self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

    def _batch_accuracy(self, logits, labels):
        """Accuracy of argmax predictions; honours the configured number of classes."""
        preds = torch.argmax(logits, dim=1)
        # Micro-averaged multiclass accuracy equals plain accuracy, so the
        # two-class case gives the same value as the binary task.
        return accuracy(preds, labels, task='multiclass', num_classes=self.hparams.n_classes)

    def training_step(self, batch, batch_idx):
        """Computes and logs the training loss for one batch."""
        output = self.forward(**batch)
        self.log("train_loss", output.loss, prog_bar=True, logger=True)
        return output.loss

    def validation_step(self, batch, batch_idx):
        """Logs validation loss and accuracy for one batch."""
        output = self.forward(**batch)
        val_acc = self._batch_accuracy(output.logits, batch['labels'])
        self.log("val_loss", output.loss, prog_bar=True, logger=True)
        self.log("val_accuracy", val_acc, prog_bar=True, logger=True)
        return output.loss

    def test_step(self, batch, batch_idx):
        """Logs test accuracy for one batch."""
        output = self.forward(**batch)
        test_acc = self._batch_accuracy(output.logits, batch['labels'])
        self.log("test_accuracy", test_acc)
        return test_acc

    def predict_step(self, batch, batch_idx, dataloader_idx=0):
        """Returns the predicted class index for every sample in the batch (labels are not used)."""
        output = self.forward(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
        return torch.argmax(output.logits, dim=1)

    def configure_optimizers(self):
        """Sets up AdamW with a per-step linear warm-up / linear decay schedule."""
        optimizer = AdamW(self.parameters(), lr=self.hparams.learning_rate, weight_decay=0.01)

        # With a total of 0 steps the linear schedule multiplies the learning
        # rate by 0 from the first step on, i.e. nothing would be learned.
        # Fall back to the Trainer's own estimate of the number of steps.
        total_steps = self.hparams.n_training_steps
        if total_steps <= 0:
            total_steps = int(self.trainer.estimated_stepping_batches)

        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.hparams.n_warmup_steps,
            num_training_steps=total_steps
        )
        return dict(optimizer=optimizer, lr_scheduler=dict(scheduler=scheduler, interval='step'))


In [None]:
def train_sentiment_model(data_path='data/reviews_processed.csv', model_name='distilbert-base-uncased', n_epochs=5, sample_size: int = None):
    """
    Main function to train the sentiment analysis model on the Amazon Reviews dataset.

    Builds the datamodule and model, trains with checkpointing (best val_loss)
    and early stopping, evaluates on the test split, and plots a confusion
    matrix of the test predictions.

    Args:
        data_path (str): Path to the processed data file.
        model_name (str): Name of the transformer model to use.
        n_epochs (int): Maximum number of epochs for training.
        sample_size (int, optional): The number of reviews to use for training.
                                     If None, the full dataset is used.

    Returns:
        None. The best checkpoint is written to
        'checkpoints/sentiment-binary-best-checkpoint.ckpt' and logs to
        'lightning_logs/sentiment-classifier-binary'.
    """
    # --- 1. Hyperparameters ---
    BATCH_SIZE = 64
    MAX_TOKEN_LEN = 256
    LEARNING_RATE = 2e-5
    N_CLASSES = 2  # Negative, Positive

    # --- 2. Initialize DataModule ---
    print("Initializing ReviewDataModule...")
    review_datamodule = ReviewDataModule(
        data_path=data_path,
        batch_size=BATCH_SIZE,
        max_token_len=MAX_TOKEN_LEN,
        model_name=model_name,
        sample_size=sample_size # Pass the sample size to the datamodule
    )
    # setup() is called up front because the size of the training loader is
    # needed to compute the scheduler's step counts before the Trainer exists.
    review_datamodule.setup()

    # The scheduler steps once per batch: total steps = batches/epoch * epochs,
    # with the first 10% used for warm-up.
    n_training_steps = len(review_datamodule.train_dataloader()) * n_epochs
    n_warmup_steps = int(n_training_steps * 0.1)

    # --- 3. Initialize Model ---
    print("Initializing SentimentClassifier model...")
    model = SentimentClassifier(
        model_name=model_name,
        n_classes=N_CLASSES,
        learning_rate=LEARNING_RATE,
        n_warmup_steps=n_warmup_steps,
        n_training_steps=n_training_steps
    )

    # --- 4. Configure Training Callbacks ---
    checkpoint_callback = ModelCheckpoint(
        dirpath="checkpoints",
        filename="sentiment-binary-best-checkpoint",
        save_top_k=1,
        verbose=True,
        monitor="val_loss",
        mode="min"
    )
    logger = TensorBoardLogger("lightning_logs", name="sentiment-classifier-binary")
    early_stopping_callback = EarlyStopping(monitor='val_loss', patience=2)

    # --- 5. Initialize Trainer ---
    print("Initializing PyTorch Lightning Trainer...")
    trainer = pl.Trainer(
        logger=logger,
        callbacks=[checkpoint_callback, early_stopping_callback],
        max_epochs=n_epochs,
        accelerator='gpu' if torch.cuda.is_available() else 'cpu',
        devices=1,
    )

    # --- 6. Start Training ---
    print(f"Starting training with {model_name} for up to {n_epochs} epochs...")
    trainer.fit(model, datamodule=review_datamodule)

    # --- 7. Evaluate on Test Set and Generate Confusion Matrix ---
    print("\nTraining complete. Evaluating on the test set...")
    trainer.test(model, datamodule=review_datamodule)

    # Predict on the test loader explicitly. Passing only the datamodule would
    # require it to define predict_dataloader(); the test loader is unshuffled,
    # so predictions stay aligned with test_df.sentiment.
    predictions = trainer.predict(model, dataloaders=review_datamodule.test_dataloader())
    if predictions:
        all_preds = torch.cat(predictions).cpu().numpy()
        true_labels = review_datamodule.test_df.sentiment.to_numpy()
        target_names = ['Negative', 'Positive'] # Updated labels

        cm = confusion_matrix(true_labels, all_preds)
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='YlGnBu',
                    xticklabels=target_names, yticklabels=target_names)
        plt.title('Confusion Matrix for Sentiment Analysis')
        plt.xlabel('Predicted Label')
        plt.ylabel('True Label')
        plt.show()



In [None]:
#--- Train DistilBert ---
# Fine-tunes the default model (distilbert-base-uncased, up to 5 epochs) on a
# 100,000-review sample. `data_path` is the notebook-level variable defined in
# the datamodule configuration cell above.
train_sentiment_model(data_path=data_path, sample_size=100000)

## Define the models

In [None]:
class ReviewSummarizer:
    """
    A class to handle the summarization of product reviews using a pre-trained T5 model.
    """
    def __init__(self, model_name='t5-small', force_cpu=False, max_input_tokens: int = 512):
        """
        Initializes the summarizer with a pre-trained T5 model and tokenizer.

        Args:
            model_name (str): The name of the pre-trained T5 model to use.
            force_cpu (bool): If True, forces the model to run on the CPU even
                              when a GPU is available (same switch as the other
                              model wrappers in this file).
            max_input_tokens (int): Reviews longer than this many tokens are
                                    truncated before summarization.
        """
        print(f"Loading summarization model: {model_name}...")
        self.model_name = model_name
        self.max_input_tokens = max_input_tokens
        if force_cpu or not torch.cuda.is_available():
            self.device = 'cpu'
        else:
            self.device = 'cuda'

        # Load the tokenizer and model from Hugging Face
        self.tokenizer = T5Tokenizer.from_pretrained(self.model_name)
        self.model = T5ForConditionalGeneration.from_pretrained(self.model_name).to(self.device)
        print("Summarization model loaded successfully.")

    def summarize(self, text: str, max_length: int = 50, min_length: int = 10) -> str:
        """
        Generates a summary for a given text.

        Args:
            text (str): The review text to summarize.
            max_length (int): The maximum length of the generated summary.
            min_length (int): The minimum length of the generated summary.

        Returns:
            str: The generated summary, or an empty string when `text` is not
                 a string or contains nothing but whitespace.
        """
        if not isinstance(text, str):
            return ""
        review = text.strip()
        if not review:
            # Nothing to summarize; avoid sending a bare task prefix to the model.
            return ""

        # T5 models require a prefix for the task. For summarization, it's "summarize: "
        prompt = f"summarize: {review}"

        # Tokenize the input text, cutting over-long reviews to the configured budget.
        tokenized_text = self.tokenizer.encode(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=self.max_input_tokens,
        ).to(self.device)

        # Generate the summary with beam search.
        summary_ids = self.model.generate(
            tokenized_text,
            max_length=max_length,
            min_length=min_length,
            length_penalty=2.0,
            num_beams=4,
            early_stopping=True
        )

        # Decode the best beam and return it
        return self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)

class AspectAnalyzer:
    """
    Aspect-Based Sentiment Analysis (ABSA) on top of a pre-trained
    text-classification pipeline.
    """
    def __init__(self, model_name='yangheng/deberta-v3-base-absa-v1.1', force_cpu=False):
        """
        Loads the ABSA model into a Hugging Face pipeline.

        Args:
            model_name (str): The name of the pre-trained ABSA model.
            force_cpu (bool): If True, forces the model to run on the CPU.
        """
        print(f"Loading Aspect-Based Sentiment Analysis model: {model_name}...")
        self.model_name = model_name

        # Pipeline device convention: 0 is the first GPU, -1 is the CPU.
        gpu_allowed = not force_cpu
        self.device = 0 if (gpu_allowed and torch.cuda.is_available()) else -1
        if force_cpu:
            print("Forcing ABSA model to run on CPU.")

        print(f"Using device: {self.device} (0 for GPU, -1 for CPU)")

        self.absa_pipeline = pipeline(
            "text-classification",
            model=self.model_name,
            tokenizer=self.model_name,
            device=self.device
        )
        print("ABSA model loaded successfully.")

    def analyze(self, text: str, aspects: list) -> dict:
        """
        Scores the sentiment expressed towards each aspect in a review.

        Args:
            text (str): The review text.
            aspects (list): Aspect terms to evaluate against the text.

        Returns:
            dict: Maps each aspect to {'sentiment': <label>, 'score': <score>}.
                  Empty when the text is missing / not a string or no aspects
                  are given.
        """
        has_input = text and isinstance(text, str) and aspects
        if not has_input:
            return {}

        # One query per aspect: the review and the aspect joined by a separator
        # token. Other ABSA models may expect a different input format.
        queries = []
        for aspect in aspects:
            queries.append(f"{text} [SEP] {aspect}")
        predictions = self.absa_pipeline(queries)

        # Pair every aspect with its prediction in a user-friendly dictionary.
        return {
            aspect: {'sentiment': prediction['label'], 'score': prediction['score']}
            for aspect, prediction in zip(aspects, predictions)
        }

class FineTunedSentimentClassifier:
    """
    This class handles loading the fine-tuned checkpoint and making predictions.
    """
    def __init__(self, checkpoint_path, model_name='distilbert-base-uncased', force_cpu=False, max_token_len: int = 256):
        """
        Loads the fine-tuned model and its tokenizer.

        Args:
            checkpoint_path (str): Path to the Lightning checkpoint to load.
            model_name (str): Name of the base model; used for the tokenizer.
            force_cpu (bool): If True, forces the model to run on the CPU.
            max_token_len (int): Reviews are truncated to this many tokens.
                Defaults to 256, the sequence length used when the model was
                fine-tuned in this file.
        """
        self.device = 'cpu' if force_cpu else ('cuda' if torch.cuda.is_available() else 'cpu')
        self.max_token_len = max_token_len
        print(f"Loading fine-tuned sentiment model from checkpoint: {checkpoint_path}...")
        print(f"Using device: {self.device}")

        self.model = SentimentClassifier.load_from_checkpoint(checkpoint_path, map_location=self.device)
        self.model.to(self.device)
        self.model.eval() # Set model to evaluation mode (disables dropout)

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # Index 0 / 1 follow the label encoding used for training.
        self.labels = ['NEGATIVE', 'POSITIVE']
        print("Fine-tuned sentiment model loaded successfully.")

    def classify(self, text: str) -> dict:
        """
        Predicts the sentiment of a single review.

        Args:
            text (str): The review text.

        Returns:
            dict: {'label': 'NEGATIVE' or 'POSITIVE', 'score': probability of
                  the predicted class}.
        """
        # A single sequence needs no padding; truncation keeps it within the
        # length the model was fine-tuned with.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_token_len,
            return_token_type_ids=False,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        input_ids = encoding["input_ids"].to(self.device)
        attention_mask = encoding["attention_mask"].to(self.device)

        with torch.no_grad():
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)

        probabilities = torch.softmax(outputs.logits, dim=1)
        prediction_idx = torch.argmax(probabilities, dim=1).item()
        return {'label': self.labels[prediction_idx], 'score': probabilities[0][prediction_idx].item()}

class AspectExtractor:
    """
    This class uses a Part-of-Speech (POS) tagging model to first extract all
    potential aspect terms (nouns) from a review text. It then filters these
    nouns against a pre-defined dictionary of valid aspects for a given
    product category to return only the relevant features.
    """
    def __init__(self, model_name="vblagoje/bert-english-uncased-finetuned-pos", force_cpu=False):
        """
        Loads the POS tagging model into a token-classification pipeline.

        Args:
            model_name (str): Name of the pre-trained POS tagging model.
            force_cpu (bool): If True, forces the model to run on the CPU.
        """
        self.model_name = model_name
        self.device = 'cpu' if force_cpu else ('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"Loading Part-of-Speech (POS) tagging model: {self.model_name}...")
        print(f"Using device: {self.device}")

        self.pipeline = pipeline(
            "token-classification",
            model=self.model_name,
            # Pipeline device convention: -1 is the CPU, 0 the first GPU.
            device=-1 if self.device == 'cpu' else 0,
            aggregation_strategy="simple"
        )
        print("POS tagging model loaded successfully.")

    def extract(self, text: str, aspect_dictionary: list) -> list:
        """
        Extracts aspects from the given text that are present in the provided
        aspect dictionary.

        An aspect counts as found when it appears as a whole word (or whole
        word sequence) inside one of the noun groups tagged by the POS model.
        A dictionary entry such as 'camera' is therefore still detected when
        the tagger merges neighbouring nouns into 'camera quality'.

        Args:
            text (str): The review text to analyze.
            aspect_dictionary (list): A list of valid, known aspects for the
                                      product category.

        Returns:
            list: The lower-cased aspects found in the text, without
                  duplicates, in the order they appear in the dictionary
                  (so the result is the same on every run).
        """
        if not text or not aspect_dictionary:
            return []

        # 1. Extract all noun groups from the text using the POS model.
        # Whitespace is normalized and each group is padded with spaces so a
        # plain substring test only matches on word boundaries.
        noun_tags = {'NOUN', 'PROPN'}
        noun_phrases = {
            f" {' '.join(output['word'].lower().split())} "
            for output in self.pipeline(text)
            if output['entity_group'] in noun_tags
        }

        # 2. Keep the dictionary aspects that occur inside any noun group.
        final_aspects = []
        seen = set()
        for aspect in aspect_dictionary:
            lowered = aspect.lower()
            needle = " ".join(lowered.split())
            if not needle or lowered in seen:
                continue
            if any(f" {needle} " in phrase for phrase in noun_phrases):
                seen.add(lowered)
                final_aspects.append(lowered)

        return final_aspects
    

In [None]:
# --- Configuration ---
# --- IMPORTANT: UPDATE THIS PATH ---
# Path to the best checkpoint file saved while training the sentiment model.
# The default matches the ModelCheckpoint settings used by
# train_sentiment_model (dirpath "checkpoints", filename
# "sentiment-binary-best-checkpoint").
SENTIMENT_CHECKPOINT_PATH = "checkpoints/sentiment-binary-best-checkpoint.ckpt"

# --- Pre-defined Aspect Dictionaries for Different Product Categories ---
# Maps a product category to the aspect terms that may be reported for it.
# The key order is the order of the numbered menu shown to the user, and the
# "Default" entry is what main() falls back to on an invalid menu choice.
ASPECT_DICTIONARIES = {
    "Phone": ['camera', 'battery', 'battery life', 'screen', 'performance', 'price', 'design'],
    "Coffee Maker": ['ease of use', 'design', 'noise level', 'coffee quality', 'brew time', 'cleaning'],
    "Book": ['plot', 'characters', 'writing style', 'pacing', 'ending'],
    "Default": ['quality', 'price', 'service', 'design', 'features'] # A fallback list
}

def _resolve_category(choice, categories):
    """Maps a 1-based menu choice to a category name, falling back to 'Default'."""
    try:
        position = int(choice)
    except ValueError:
        position = 0
    # An explicit range check: plain list indexing would accept 0 and negative
    # numbers and silently pick a category from the end of the list.
    if 1 <= position <= len(categories):
        return categories[position - 1]
    print("Invalid choice. Using 'Default' category.")
    return "Default"


def main():
    """
    Main function to run the command-line review analysis tool.

    Loads the summarizer, aspect analyzer, aspect extractor and (when the
    checkpoint file exists) the fine-tuned sentiment classifier, all on CPU,
    then repeatedly asks for a review and a product category and prints the
    overall sentiment, a summary and per-aspect sentiments. Typing 'quit'
    ends the loop. If the checkpoint is missing, the tool still runs but the
    overall sentiment is reported as unavailable.
    """
    # --- 1. Load All Models ---
    print("--- Initializing all models ---")
    sentiment_classifier, summarizer, aspect_analyzer, aspect_extractor = None, None, None, None
    try:
        summarizer = ReviewSummarizer(force_cpu=True)
        aspect_analyzer = AspectAnalyzer(force_cpu=True)
        aspect_extractor = AspectExtractor(force_cpu=True)

        if not os.path.exists(SENTIMENT_CHECKPOINT_PATH):
            print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
            print("!!! WARNING: Sentiment checkpoint path not found or not set.         !!!")
            print("!!! Please update the 'SENTIMENT_CHECKPOINT_PATH' variable in main.py")
            print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
            print("\n--- Models loaded; overall sentiment is disabled (no checkpoint) ---\n")
        else:
            sentiment_classifier = FineTunedSentimentClassifier(
                checkpoint_path=SENTIMENT_CHECKPOINT_PATH, force_cpu=True
            )
            print("\n--- All models loaded successfully ---\n")
    except Exception as e:
        # Top-level boundary: model loading can fail in many ways (download,
        # memory, bad checkpoint); report the reason and stop.
        print(f"An error occurred during model initialization: {e}")
        return

    categories = list(ASPECT_DICTIONARIES)

    # --- 2. Interactive Loop ---
    while True:
        print("\n==================================================")
        print("          Product Review Analysis Tool          ")
        print("==================================================")

        # Get user input
        try:
            review_text = input("Enter the product review text (or type 'quit' to exit):\n> ")
        except (EOFError, KeyboardInterrupt):
            # Closed input stream or an interrupt is treated like 'quit'.
            break
        if review_text.strip().lower() == 'quit':
            break
        if not review_text.strip():
            print("No review text entered. Please try again.")
            continue

        print("\nAvailable Product Categories:")
        for i, category in enumerate(categories, 1):
            print(f"{i}. {category}")

        try:
            category_choice = input(f"Select a product category (1-{len(categories)}):\n> ")
        except (EOFError, KeyboardInterrupt):
            break
        product_category = _resolve_category(category_choice, categories)

        # --- 3. Run Analysis ---
        print("\n--- Analyzing Review... ---")

        # a. Overall Sentiment (only when the fine-tuned checkpoint was loaded)
        sentiment_result = None
        if sentiment_classifier is not None:
            sentiment_result = sentiment_classifier.classify(review_text)

        # b. Summary
        summary_result = summarizer.summarize(review_text)

        # c. Aspect Extraction and Analysis
        aspect_dictionary = ASPECT_DICTIONARIES.get(product_category)
        extracted_aspects = aspect_extractor.extract(review_text, aspect_dictionary)
        aspect_results = None
        if extracted_aspects:
            aspect_results = aspect_analyzer.analyze(review_text, extracted_aspects)

        # --- 4. Display Results ---
        print("\n-------------------- ANALYSIS RESULTS --------------------")
        print("\n[ Overall Sentiment ]")
        if sentiment_result is not None:
            print(f"  - Sentiment: {sentiment_result['label']} (Score: {sentiment_result['score']:.2f})")
        else:
            print("  - Unavailable: the fine-tuned sentiment checkpoint was not loaded.")

        print("\n[ Generated Summary ]")
        print(f"  - {summary_result}")

        print("\n[ Detected Aspect Sentiments ]")
        if aspect_results:
            for aspect, result in aspect_results.items():
                print(f"  - {aspect.title()}: {result['sentiment']} (Score: {result['score']:.2f})")
        else:
            print("  - No relevant aspects from the dictionary were detected in the review.")
        print("----------------------------------------------------------")


In [None]:
# --- Run the workflow ---
# Starts the interactive analysis loop; it blocks on user input until 'quit'
# is entered.
main()