In [1]:
import os
os.chdir("../")
%pwd

'd:\\AI\\NLP\\HandsOn\\sentiment-analysis'

In [2]:
from dataclasses import dataclass
from pathlib import Path
import numpy as np
from typing import Dict, List, Tuple, Union, Optional, Any
from transformers import TrainingArguments

@dataclass
class TransformerModelConfig:
    """Configuration class for transformer-based model settings"""
    root_dir: Path
    data_file_path: Path
    model_name: str
    text_column: str
    label_column: str
    max_length: int
    batch_size: int
    num_labels: int
    labels: List[str]  

In [3]:
from SentiScope.constants import (CONFIG_FILE_PATH,
                                  PARAMS_FILE_PATH)
from SentiScope.utils.file_utils import (create_directories,
                                            get_size)
from SentiScope.utils.config_utils import (read_yaml,
                                           Settings,
                                           get_settings)

In [4]:
import json
import os
from pathlib import Path
from typing import Dict, Any
from transformers import TrainingArguments
from SentiScope.logging import logger


class ConfigurationManager:
    """Configuration manager for transformer-based sentiment analysis"""
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        
        # Read configuration and parameter files
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        
        # Create root artifacts directory
        create_directories([self.config.artifacts_root])
        
    def get_latest_report_transformers(self) -> Dict:
        """Locate the latest report.json file based on the timestamp folder."""
        config = self.config.feature_transformation
        profiling_dir = Path(config.root_dir)

        # Get all subdirectories in data_profiling
        timestamp_dirs = [d for d in profiling_dir.iterdir() if d.is_dir()]
        
        if not timestamp_dirs:
            raise FileNotFoundError("No timestamp folders found in data_profiling.")

        # Sort directories by name (assuming timestamp format)
        latest_dir = sorted(timestamp_dirs, key=lambda x: x.name, reverse=True)[0]
        report_path = latest_dir / "metadata.json"

        if not report_path.exists():
            raise FileNotFoundError(f"report.json not found in {latest_dir}.")

        # Load the report.json file
        with open(report_path, "r") as f:
            report_data = json.load(f)

        return report_data  




    def get_transformer_config(self) -> TransformerModelConfig:
        """Create and return transformer model configuration"""
        # Access transformer-specific config section
        config = self.config.transformer_model
        report_data = self.get_latest_report_transformation()
        
        create_directories([config.root_dir])
        
        timestamp = report_data["timestamp"]
        # data_file_path = Path(str(config.data_file).format(timestamp=timestamp))
        data_file_path = Path(config.data_file_path).joinpath(f"{timestamp}", "test_split.csv")
        
        # Create and return the transformer config
        transformer_config = TransformerModelConfig(
            root_dir=config.root_dir,
            data_file_path=  data_file_path,
            model_name=config.model_name,
            text_column=config.text_column,
            label_column=config.label_column,
            max_length=config.max_length,
            batch_size=config.batch_size,
            num_labels=config.num_labels,
            labels = config.labels
        )
        
        return transformer_config

In [11]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np
import pandas as pd
from SentiScope.logging import logger
class TransformerSentiment:
    def __init__(self, config: TransformerModelConfig):
        logger.info("Initializing TransformerSentiment...")
        self.config = config
        self.MODEL = self.config.model_name
        
        logger.info(f"Loading tokenizer and model for {self.MODEL}...")
        self.tokenizer = AutoTokenizer.from_pretrained(self.MODEL)
        self.model = AutoModelForSequenceClassification.from_pretrained(self.MODEL)
        self.labels = self.config.labels
        logger.info("Tokenizer and model loaded successfully.")
        
    def predict_single_sentiment(self, text):
        """
        Predict sentiment for a single text input
        Returns: tuple (sentiment, confidence)
        """  
        # Tokenize the input text
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True, padding=True)
        
        # Get model predictions
        outputs = self.model(**inputs)
        scores = outputs.logits.detach().numpy()
        probabilities = torch.nn.functional.softmax(torch.tensor(scores), dim=-1)
        
        # Determine the sentiment
        sentiment_index = np.argmax(probabilities.numpy(), axis=1)[0]
        sentiment = self.labels[sentiment_index]
        confidence = probabilities[0][sentiment_index].item()

        return sentiment, confidence

    def predict_dataframe_sentiment(self, df):
        """
        Predict sentiment for all texts in a DataFrame
        Args:
            df: pandas DataFrame
        Returns: DataFrame with added sentiment and confidence columns
        """
        logger.info("Predicting sentiment for DataFrame...")
        
        # Create new columns for results
        df['sentiment'] = ''
        df['confidence'] = 0.0
        
        # Process each text in the DataFrame
        for idx in df.index:
            text = df.loc[idx, self.config.text_column]
            sentiment, confidence = self.predict_single_sentiment(text)
            df.loc[idx, 'sentiment'] = sentiment
            df.loc[idx, 'confidence'] = confidence
        
        logger.info("Sentiment prediction for DataFrame completed.")
        return df


In [12]:
# Initialize configuration
config_manager = ConfigurationManager()
transformer_config = config_manager.get_transformer_config()
Transformer_Sentiment = TransformerSentiment(config=transformer_config)
data = pd.read_csv("artifacts/feature_transformation/20250116_043222/test_split.csv").head(50)
results = Transformer_Sentiment.predict_dataframe_sentiment(data)
results

[2025-01-17 03:31:21,557: INFO: config_utils: yaml file: config\config.yaml loaded successfully]
[2025-01-17 03:31:21,559: INFO: config_utils: yaml file: params.yaml loaded successfully]
[2025-01-17 03:31:21,560: INFO: file_utils: created directory at: artifacts]
[2025-01-17 03:31:21,561: INFO: file_utils: created directory at: artifacts/transformer_models]
[2025-01-17 03:31:21,562: INFO: 3086270261: Initializing TransformerSentiment...]
[2025-01-17 03:31:21,562: INFO: 3086270261: Loading tokenizer and model for cardiffnlp/twitter-roberta-base-sentiment...]
[2025-01-17 03:31:22,851: INFO: 3086270261: Tokenizer and model loaded successfully.]
[2025-01-17 03:31:23,076: INFO: 3086270261: Predicting sentiment for DataFrame...]


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[2025-01-17 03:31:26,706: INFO: 3086270261: Sentiment prediction for DataFrame completed.]


Unnamed: 0.1,Unnamed: 0,tweets,labels,processed_text,text_length,sentiment,confidence
0,162912,How will ChatGPT affect the Web3 space? Indust...,bad,chatgpt affect web3 space industri answer,104,neutral,0.849885
1,77419,Thank you ChatGPT 🥲 https://t.co/zc3V7LZyhk ht...,neutral,thank chatgpt,67,good,0.968757
2,32239,ChatGPT prompt: “Write a letter from Demeter t...,neutral,chatgpt prompt write letter demet hade form co...,144,good,0.915747
3,89499,The ChatGPT screenplays and stores are crap an...,bad,chatgpt screenplay store crap dull think thing...,207,bad,0.972925
4,173018,A New Chat Bot Is a ‘Code Red’ for Google’s Se...,bad,new chat bot code red googl search busi,83,neutral,0.585799
5,122726,ChatGPT is already at 1 million users in just ...,neutral,chatgpt alreadi 1 million user 6 day perspect ...,279,good,0.761117
6,8476,How un-American #ChatGPT https://t.co/oLS2gOeN1M,bad,chatgpt,48,bad,0.82305
7,181423,Has anyone found a way to get ChatGPT to conti...,neutral,anyon found way get chatgpt continu gener code...,278,neutral,0.504812
8,21748,I just used ChatGPT to rubber-duck a solution ...,neutral,use chatgpt solut moder complex holi moli,120,neutral,0.486404
9,192367,Please check my new video. ChatGPT vs. Chatson...,good,pleas check new video chatgpt chatson comprehe...,124,neutral,0.816349
