# Model Evaluation Module

In [1]:
import os
%pwd

'c:\\Users\\amman\\Documents\\Generative AI\\End-to-End-Text-Summariser-Project\\notebooks'

In [2]:
os.chdir("../")
%pwd

'c:\\Users\\amman\\Documents\\Generative AI\\End-to-End-Text-Summariser-Project'

In [3]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelEvaluationConfig:
    root_dir: Path
    data_path: Path 
    model_path: Path
    tokenizer_path: Path 
    metric_file_name: Path


In [4]:
from src.textSummariser.constants import * 
from src.textSummariser.utils.common import read_yaml, create_directories


In [5]:
class ConfigurationManager:
    def __init__(self, config_path=CONFIG_FILE_PATH, params_path=PARAMS_FILE_PATH):
        self.config = read_yaml(config_path)
        self.params = read_yaml(params_path)

        create_directories([self.config.artifacts_root])

    def get_model_evaluation_config(self) -> ModelEvaluationConfig:
        config = self.config.model_evaluation
        
        create_directories([config.root_dir])

        model_evaluation_config = ModelEvaluationConfig(
            root_dir=config.root_dir,
            data_path=config.data_path,
            model_path=config.model_path,
            tokenizer_path=config.tokenizer_path,
            metric_file_name=config.metric_file_name
        )

        return model_evaluation_config

In [6]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import load_from_disk
from tqdm import tqdm
import torch 
import pandas as pd
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


[2025-02-18 11:46:49,341: INFO: config: PyTorch version 2.6.0 available.]


In [9]:
class ModelEvauluation:
    def __init__(self, config: ModelEvaluationConfig):
        self.config=config 

    def generate_batch_sized_chunks(self,list_of_elements, batch_size):
        for i in range(0, len(list_of_elements), batch_size):
            yield list_of_elements[i: i + batch_size]

    def calculate_metric_on_test_ds(self, dataset, metric, model, tokenizer,
                                    batch_size=16, device="cuda" if torch.cuda.is_available() else "cpu",
                                    column_text="article",
                                    column_summary="hihglights"):
        
        article_batches = list(self.generate_batch_sized_chunks(dataset[column_text], batch_size))
        target_batches = list(self.generate_batch_sized_chunks(dataset[column_summary], batch_size))

        for article_batch, target_batch in tqdm(zip(article_batches, target_batches), total=len(article_batches)):
            inputs = tokenizer(article_batch, max_length=1024, truncation=True,
                               padding="max_length", return_tensors="pt")
            
            summaries = model.generate(input_ids=inputs['input_ids'].to(device),
                                       attention_mask=inputs['attention_mask'].to(device),
                                       length_penalty=0.8, num_beams=8, max_length=128)
            
            decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True,
                                                  clean_up_tokenization_spaces=True) for s in summaries]
            
            decoded_summaries = [d.replace("", " ") for d in decoded_summaries]
            
            metric.add_batch(predictions=decoded_summaries, references=target_batch)

        score = metric.compute()
        print(score)
        print(type(score))
        return score
    
    def evaluate(self):
        device = "cuda" if torch.cuda.is_available() else "cpu"
        tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_path)
        model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(self.config.model_path)

        dataset_samsum_pt = load_from_disk(self.config.data_path)

        rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]

        rouge_metric = evaluate.load('rouge')
        
        score = self.calculate_metric_on_test_ds(dataset_samsum_pt['test'][0:10], 
                                                 rouge_metric, model_pegasus, tokenizer,
                                                 batch_size=2, device=device, column_text = 'dialogue', column_summary= 'summary')
        

        rouge_dict = dict((rn, score[rn]) for rn in rouge_names)

        df = pd.DataFrame(rouge_dict, index=['pegasus'])
        df.to_csv(self.config.metric_file_name, index=False)


In [10]:
config = ConfigurationManager()
model_evaluation_config = config.get_model_evaluation_config()
model_evaluation = ModelEvauluation(config=model_evaluation_config)
model_evaluation.evaluate()

[2025-02-18 11:58:29,554: INFO: common: yaml file: config\config.yaml loaded successfully.]
[2025-02-18 11:58:29,558: INFO: common: yaml file: params.yaml loaded successfully.]
[2025-02-18 11:58:29,560: INFO: common: Created directory at: artifacts]
[2025-02-18 11:58:29,562: INFO: common: Created directory at: artifacts/model_evaluation]


100%|██████████| 5/5 [07:00<00:00, 84.19s/it]

[2025-02-18 12:05:37,435: INFO: rouge_scorer: Using default tokenizer.]





{'rouge1': np.float64(0.022542099025845155), 'rouge2': np.float64(0.0), 'rougeL': np.float64(0.022510047743793872), 'rougeLsum': np.float64(0.021594081357239253)}
<class 'dict'>
