In [21]:
import os

In [22]:
%pwd

'/Users/amoghagadde/Desktop/Amogha/Projects/Data_Science'

In [23]:
# Step up one directory from the notebook's folder; presumably so the project's
# relative config/artifact paths resolve from the parent directory (TODO confirm).
# NOTE(review): the move is relative, so re-running this cell climbs another level.
os.chdir("../")

In [24]:
%pwd

'/Users/amoghagadde/Desktop/Amogha/Projects'

In [25]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelEvaluationConfig:
    """Immutable settings for the model-evaluation stage.

    Instances are built by ``ConfigurationManager.get_model_evaluation_config``
    from the ``model_evaluation`` section of the loaded config and consumed by
    ``ModelEvaluation``.

    NOTE(review): the values are passed through from the config without
    conversion, so at runtime they may be plain strings rather than ``Path``
    objects despite the annotations.
    """
    # Directory created for this stage before the config object is returned.
    root_dir: Path
    # Dataset location handed to ``load_from_disk``.
    data_path: Path
    # Model location handed to ``AutoModelForSeq2SeqLM.from_pretrained``.
    model_path: Path
    # Tokenizer location handed to ``AutoTokenizer.from_pretrained``.
    tokenizer_path: Path
    # Destination handed to ``DataFrame.to_csv`` for the ROUGE scores.
    metric_file_name: Path

In [26]:
from textSummarizer.constants import *
from textSummarizer.utils.common import read_yaml, create_directories

In [27]:
class ConfigurationManager:
    """Reads the project's YAML files and hands out per-stage config objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """Load both YAML files and make sure the artifacts root exists.

        Args:
            config_filepath: Location of the main config YAML.
            params_filepath: Location of the params YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_model_evaluation_config(self) -> ModelEvaluationConfig:
        """Build the model-evaluation settings from the ``model_evaluation`` section.

        The stage's ``root_dir`` is created before the settings object is built.

        Returns:
            A ``ModelEvaluationConfig`` whose fields are copied one-to-one from
            the same-named entries of the config section.
        """
        section = self.config.model_evaluation
        create_directories([section.root_dir])

        field_names = (
            "root_dir",
            "data_path",
            "model_path",
            "tokenizer_path",
            "metric_file_name",
        )
        return ModelEvaluationConfig(
            **{name: getattr(section, name) for name in field_names}
        )

In [28]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import load_dataset, load_from_disk, load_metric
import torch
import pandas as pd
from tqdm import tqdm

In [29]:
class ModelEvaluation:
    """Scores a seq2seq summarization model with ROUGE and saves the result.

    The tokenizer, model and dataset locations, plus the output CSV path, are
    read from the ``ModelEvaluationConfig`` supplied at construction.
    """

    def __init__(self, config: "ModelEvaluationConfig"):
        """Keep the evaluation settings for later use by ``evaluate``."""
        self.config = config

    def generate_batch_sized_chunks(self, list_of_elements, batch_size):
        """Yield successive ``batch_size``-long slices of ``list_of_elements``.

        The final slice is shorter when the length is not a multiple of
        ``batch_size``; an empty input yields nothing.
        """
        for i in range(0, len(list_of_elements), batch_size):
            yield list_of_elements[i : i + batch_size]

    def calculate_metric_on_test_ds(self, dataset, metric, model, tokenizer,
                                    batch_size=16, device=torch.device("cpu"),
                                    column_text="article",
                                    column_summary="highlights"):
        """Generate summaries batch by batch and score them against references.

        Args:
            dataset: Mapping indexable by column name; each column is a
                sliceable sequence of strings.
            metric: Object exposing ``add_batch(predictions=, references=)``
                and ``compute()``.
            model: Object exposing ``generate(input_ids=, attention_mask=, ...)``.
            tokenizer: Callable that encodes a batch of texts and also exposes
                ``decode``.
            batch_size: Number of examples generated per step.
            device: Device the encoded inputs are moved to before generation;
                it should match the device the model lives on.
            column_text: Name of the column holding the source texts.
            column_summary: Name of the column holding the reference summaries.

        Returns:
            Whatever ``metric.compute()`` returns after all batches were added.
        """
        article_batches = list(
            self.generate_batch_sized_chunks(dataset[column_text], batch_size))
        target_batches = list(
            self.generate_batch_sized_chunks(dataset[column_summary], batch_size))

        for article_batch, target_batch in tqdm(
                zip(article_batches, target_batches), total=len(article_batches)):

            inputs = tokenizer(article_batch, max_length=1024, truncation=True,
                               padding="max_length", return_tensors="pt")

            # Beam search with 8 beams, capped at 128 generated tokens.
            # length_penalty is the exponent applied to the sequence length when
            # beam scores are length-normalised (see the transformers docs).
            summaries = model.generate(
                input_ids=inputs["input_ids"].to(device),
                attention_mask=inputs["attention_mask"].to(device),
                length_penalty=0.8, num_beams=8, max_length=128)

            # Decode the generated token ids back to text.
            decoded_summaries = [
                tokenizer.decode(s, skip_special_tokens=True,
                                 clean_up_tokenization_spaces=True)
                for s in summaries]

            # Replace the literal "<n>" line-break marker that some
            # summarization checkpoints (e.g. Pegasus) emit. The previous
            # replace("", " ") matched the empty string and therefore inserted
            # a space between every character, corrupting the ROUGE inputs.
            decoded_summaries = [d.replace("<n>", " ") for d in decoded_summaries]

            metric.add_batch(predictions=decoded_summaries, references=target_batch)

        # Compute and return the aggregated scores.
        return metric.compute()

    def evaluate(self):
        """Run ROUGE evaluation and write the F-measures to the metrics CSV.

        Loads the tokenizer, model and dataset from the configured paths,
        scores the first 10 examples of the ``test`` split (``dialogue`` ->
        ``summary``) on CPU, and writes one row holding the mid F-measure of
        rouge1, rouge2, rougeL and rougeLsum to ``config.metric_file_name``.
        """
        device = torch.device("cpu")
        tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_path)
        model_t5 = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_path).to(device)

        # Load the dataset from disk.
        dataset_samsum_pt = load_from_disk(self.config.data_path)

        rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]

        # NOTE(review): load_metric is deprecated in newer `datasets` releases
        # in favour of the separate `evaluate` package - confirm the pinned version.
        rouge_metric = load_metric('rouge', trust_remote_code=True)

        # Only a 10-example slice is scored to keep the CPU run short. The
        # device is passed explicitly so inputs always follow the model.
        score = self.calculate_metric_on_test_ds(
            dataset_samsum_pt['test'][0:10], rouge_metric, model_t5, tokenizer,
            batch_size=2, device=device,
            column_text='dialogue', column_summary='summary')

        rouge_dict = {rn: score[rn].mid.fmeasure for rn in rouge_names}

        df = pd.DataFrame(rouge_dict, index=['t5'])
        df.to_csv(self.config.metric_file_name, index=False)


**Values (length_penalty=0.8):**

`length_penalty` only matters for beam search: each beam's score (its summed log-probability, a negative number) is divided by `length ** length_penalty`.
A value of 1.0 (default) divides the score by the sequence length; a value of 0.0 applies no length normalization.
Values greater than 0.0 favor longer sequences, and the larger the value, the stronger that preference.
Values less than 0.0 favor shorter sequences. The `0.8` used here therefore still favors longer beams, only slightly less strongly than the default of 1.0.

**Values (num_beams=8):**

num_beams = 1: This is equivalent to greedy search, where only the most probable sequence is kept at each step.
num_beams > 1: Multiple sequences are explored simultaneously, leading to potentially more accurate or higher-quality outputs.
Trade-offs:
Higher values for num_beams typically find higher-probability (and often higher-quality) outputs, but they also increase computational cost and memory usage. Beam search by itself does not make the outputs more diverse.

**Values (max_length=128):**

Any positive integer value can be used. The appropriate value typically depends on the nature of the task:
For summarization, a lower max_length might be appropriate.
For text generation tasks that require detailed descriptions, a higher max_length might be used.
Example:
If max_length = 128, the model will stop generating text once it reaches 128 tokens, regardless of whether the sequence is complete or not.
This can help prevent the model from generating overly verbose or redundant content.

**ROUGE** (Recall-Oriented Understudy for Gisting Evaluation) is a set of metrics used to evaluate the quality of machine-generated text, particularly in the context of text summarization and machine translation. ROUGE compares the overlap between the generated text (e.g., a summary or translation) and one or more reference texts, usually created by humans. 

Here are the key ROUGE metrics:

### 1. **ROUGE-N**
- **Definition**: Measures the overlap of n-grams between the generated text and the reference text.
- **Common Variants**:
  - **ROUGE-1**: Measures the overlap of unigrams (single words).
  - **ROUGE-2**: Measures the overlap of bigrams (two consecutive words).
- **Example**:
  - Reference: "The cat is on the mat."
  - Generated: "The cat sat on the mat."
  - ROUGE-1 would count the overlapping words like "The," "cat," "on," "the," and "mat."
  - ROUGE-2 would count the overlapping bigrams "The cat," "on the," and "the mat."

### 2. **ROUGE-L**
- **Definition**: Measures the longest common subsequence (LCS) between the generated and reference texts.
- **Purpose**: Unlike ROUGE-N, which looks at contiguous sequences of words, ROUGE-L focuses on the longest sequence of words that appear in the same order in both texts. This is useful for capturing sentence structure and word order.
- **Example**:
  - Reference: "The cat is on the mat."
  - Generated: "The mat is where the cat sat."
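  - The LCS is "the is the" (length 3): these words appear in the same order in both sentences, though not contiguously. "cat" and "mat" cannot be added because their order relative to "is" differs between the two sentences.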

### 3. **ROUGE-Lsum**
- **Definition**: A variant of ROUGE-L that is often used specifically for summarization tasks. It compares the longest common subsequence in a sentence-by-sentence manner between the generated and reference summaries.

### 4. **ROUGE-W**
- **Definition**: A weighted version of ROUGE-L that gives more importance to longer sequences.

### 5. **ROUGE-S (ROUGE-Skip-Bigram)**
- **Definition**: Measures the overlap of skip-bigrams, which are pairs of words that appear in the same order but are not necessarily adjacent.

### **How ROUGE is Used**:
- **Text Summarization**: ROUGE is widely used to evaluate automatic summarization systems by comparing generated summaries against one or more human-written reference summaries.
- **Machine Translation**: In some cases, ROUGE is also used to evaluate translations by comparing them to reference translations.
- **Evaluation**: ROUGE scores are generally reported as precision, recall, and F1 scores:
  - **Precision**: The proportion of n-grams in the generated summary that are also in the reference summary.
  - **Recall**: The proportion of n-grams in the reference summary that are also in the generated summary.
  - **F1 Score**: The harmonic mean of precision and recall.

### **Why ROUGE?**
- **Efficiency**: ROUGE provides a quick and automatic way to evaluate the quality of generated text, which is especially useful when human evaluation is not feasible.
- **Standard Metric**: It is a standard and widely accepted metric in the fields of natural language processing and text summarization.

### **Limitations**:
- **Surface-Level**: ROUGE focuses on surface-level similarities (exact word matches) and does not account for semantic meaning or coherence.
- **Sensitivity to Synonyms**: It may not give credit for using synonyms or paraphrasing, even if the meaning is preserved.
- **Context Ignorance**: ROUGE does not consider the broader context or the logical flow of the generated text.

Despite its limitations, ROUGE remains a popular and useful metric, particularly when used in combination with other evaluation methods.

In [30]:
# Run the evaluation stage end to end: load the config, build the evaluator,
# then score the model and write the metrics file.
# Each object gets its own name (the evaluator no longer overwrites the config
# variable), and the former `except Exception as e: raise e` wrapper is dropped
# because it only re-raised: any failure still propagates unchanged.
config_manager = ConfigurationManager()
model_evaluation_config = config_manager.get_model_evaluation_config()
model_evaluation = ModelEvaluation(config=model_evaluation_config)
model_evaluation.evaluate()

[2024-08-10 23:57:06,235: INFO: common: yaml file: /Users/amoghagadde/Desktop/Amogha/Projects/Data_Science/Text-Summarizer/config/config.yaml loaded successfully]
[2024-08-10 23:57:06,237: INFO: common: yaml file: /Users/amoghagadde/Desktop/Amogha/Projects/Data_Science/Text-Summarizer/params.yaml loaded successfully]
[2024-08-10 23:57:06,239: INFO: common: created directory at: artifacts]
[2024-08-10 23:57:06,239: INFO: common: created directory at: /Users/amoghagadde/Desktop/Amogha/Projects/Data_Science/Text-Summarizer/artifacts/model_evaluation]


100%|██████████| 5/5 [00:37<00:00,  7.59s/it]

[2024-08-10 23:57:44,996: INFO: rouge_scorer: Using default tokenizer.]



