In [1]:
import os 
os.chdir("../")

In [11]:
import logging
from pathlib import Path
from dataclasses import dataclass
from transformers import BartForConditionalGeneration, BartTokenizer, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import load_from_disk
import pandas as pd
import torch
from src.textSummarizer.utils.common import create_directories
from typing import Dict, Tuple
import evaluate 
logger = logging.getLogger(__name__)

In [12]:
@dataclass(frozen=True)
class ModelEvaluationConfig:
    root_dir: Path
    data_path: Path
    model_path: Path
    tokenizer_path: Path
    metric_file_name: str
    batch_size: int = 8
    max_length: int = 128
    num_beams: int = 4

In [13]:

class ModelEvaluation:
    def __init__(self, config: ModelEvaluationConfig):
        self.config = config
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info(f"Evaluation will run on: {self.device}")

    def load_components(self) -> Tuple[BartForConditionalGeneration, BartTokenizer]:
        """Load model and tokenizer"""
        try:
            model = BartForConditionalGeneration.from_pretrained(str(self.config.model_path))
            model = model.to(self.device)
            
            tokenizer = BartTokenizer.from_pretrained(str(self.config.tokenizer_path))
            
            return model, tokenizer
        except Exception as e:
            logger.error(f"Error loading components: {str(e)}")
            raise

    def load_datasets(self):
        """Load the test dataset for evaluation"""
        try:
            test_path = Path(self.config.data_path) / "transformed_test_data"
            test_dataset = load_from_disk(test_path)
            logger.info(f"Loaded test dataset with {len(test_dataset)} samples")
            return test_dataset
        except Exception as e:
            logger.error(f"Error loading datasets: {str(e)}")
            raise

    def compute_metrics(self, eval_pred, tokenizer) -> Dict[str, float]:
        """Compute evaluation metrics (ROUGE)"""
        rouge = evaluate.load("rouge")
        predictions, labels = eval_pred

        # Replace -100 in labels with pad_token_id for decoding
        labels = [[(label if label != -100 else tokenizer.pad_token_id) for label in l] for l in labels]

        # Decode predicted and reference summaries
        decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

        # Clean text
        decoded_preds = [pred.strip() for pred in decoded_preds]
        decoded_labels = [label.strip() for label in decoded_labels]

        # Compute ROUGE
        result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
        result = {k: round(v * 100, 4) for k, v in result.items()}

        return result

    def evaluate(self):
        """Complete evaluation pipeline"""
        try:
            model, tokenizer = self.load_components()
            test_dataset = self.load_datasets()
            
            # Prepare data collator
            data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

            # Prepare the trainer
            trainer = Seq2SeqTrainer(
                model=model,
                args=Seq2SeqTrainingArguments(
                    output_dir=self.config.root_dir,
                    per_device_eval_batch_size=self.config.batch_size,
                    predict_with_generate=True,
                    logging_dir=os.path.join(self.config.root_dir, "logs"),
                ),
                data_collator=data_collator,
                compute_metrics=lambda eval_pred: self.compute_metrics(eval_pred, tokenizer),  # Pass tokenizer here
            )

            logger.info("Starting evaluation...")
            metrics = trainer.evaluate(eval_dataset=test_dataset)
            logger.info(f"Evaluation metrics: {metrics}")

            # Save metrics
            metrics_path = Path(self.config.root_dir) / self.config.metric_file_name
            pd.DataFrame([metrics]).to_csv(metrics_path, index=False)
            logger.info(f"Metrics saved to {metrics_path}")

            return metrics
        except Exception as e:
            logger.error(f"Evaluation failed: {str(e)}")
            raise

In [14]:
# Configure paths (should match your config.yaml)
eval_config = ModelEvaluationConfig(
    root_dir=Path("artifacts/model_evaluation"),
    data_path=Path("artifacts/data_transformation"),
    model_path=Path("artifacts/model_trainer/final_model"),
    tokenizer_path=Path("artifacts/model_trainer/final_model"),
    metric_file_name="evaluation_metrics.csv"
)

# Run evaluation
evaluator = ModelEvaluation(eval_config)
results = evaluator.evaluate()

# Output results
print("ROUGE Scores:", results)


[2025-07-05 21:49:16,512: INFO: 1843591608: Evaluation will run on: cuda]
[2025-07-05 21:49:17,142: INFO: 1843591608: Loaded test dataset with 150 samples]
[2025-07-05 21:49:17,663: INFO: 1843591608: Starting evaluation...]


[2025-07-05 21:50:06,196: INFO: rouge_scorer: Using default tokenizer.]
[2025-07-05 21:50:06,582: INFO: 1843591608: Evaluation metrics: {'eval_loss': 1.1355241537094116, 'eval_model_preparation_time': 0.002, 'eval_rouge1': 25.1023, 'eval_rouge2': 11.8723, 'eval_rougeL': 20.1832, 'eval_rougeLsum': 23.3543, 'eval_runtime': 48.9063, 'eval_samples_per_second': 3.067, 'eval_steps_per_second': 0.388}]
[2025-07-05 21:50:06,592: INFO: 1843591608: Metrics saved to artifacts\model_evaluation\evaluation_metrics.csv]
ROUGE Scores: {'eval_loss': 1.1355241537094116, 'eval_model_preparation_time': 0.002, 'eval_rouge1': 25.1023, 'eval_rouge2': 11.8723, 'eval_rougeL': 20.1832, 'eval_rougeLsum': 23.3543, 'eval_runtime': 48.9063, 'eval_samples_per_second': 3.067, 'eval_steps_per_second': 0.388}


In [None]:
import os
os.chdir("../")

import logging
from pathlib import Path
from dataclasses import dataclass
from transformers import BartForConditionalGeneration, BartTokenizer, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import load_from_disk
import pandas as pd
import torch
import yaml
from src.textSummarizer.constants import *
from typing import Dict, Tuple
import evaluate

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

@dataclass(frozen=True)
class ModelEvaluationConfig:
    root_dir: Path
    data_path: Path
    model_path: Path
    tokenizer_path: Path
    metric_file_name: str

@dataclass
class Seq2SeqTrainingArgumentsConfig:
    output_dir: Path
    eval_strategy: str
    eval_steps: int
    logging_steps: int
    save_steps: int
    per_device_train_batch_size: int
    per_device_eval_batch_size: int
    gradient_accumulation_steps: int
    num_train_epochs: int
    save_total_limit: int
    predict_with_generate: bool
    fp16: bool
    logging_dir: Path
    report_to: str
    learning_rate: float
    weight_decay: float
    warmup_steps: int
    gradient_checkpointing: bool

class ConfigurationManager:
    def __init__(self, config_path: str=CONFIG_FILE_PATH, params_path: str=PARAMS_FILE_PATH):
        self.config_path = config_path
        self.params_path = params_path
        self.model_evaluation_config = self.load_model_evaluation_config()
        self.training_args_config = self.load_training_args_config()

    def load_model_evaluation_config(self) -> ModelEvaluationConfig:
        """Load model evaluation configuration from YAML file."""
        with open(self.config_path, 'r') as file:
            config = yaml.safe_load(file)
            model_eval_config = config['model_evaluation']
            return ModelEvaluationConfig(
                root_dir=Path(model_eval_config['root_dir']),
                data_path=Path(model_eval_config['data_path']),
                model_path=Path(model_eval_config['model_path']),
                tokenizer_path=Path(model_eval_config['tokenizer_path']),
                metric_file_name=model_eval_config['metric_file_name']
            )

    def load_training_args_config(self) -> Seq2SeqTrainingArgumentsConfig:
        """Load training arguments configuration from YAML file."""
        with open(self.params_path, 'r') as file:
            params = yaml.safe_load(file)
            training_args = params['Seq2SeqTrainingArguments']
            return Seq2SeqTrainingArgumentsConfig(
                output_dir=Path(training_args['output_dir']),
                eval_strategy=training_args['eval_strategy'],
                eval_steps=training_args['eval_steps'],
                logging_steps=training_args['logging_steps'],
                save_steps=training_args['save_steps'],
                per_device_train_batch_size=training_args['per_device_train_batch_size'],
                per_device_eval_batch_size=training_args['per_device_eval_batch_size'],
                gradient_accumulation_steps=training_args['gradient_accumulation_steps'],
                num_train_epochs=training_args['num_train_epochs'],
                save_total_limit=training_args['save_total_limit'],
                predict_with_generate=training_args['predict_with_generate'],
                fp16=training_args['fp16'],
                logging_dir=Path(training_args['logging_dir']),
                report_to=training_args['report_to'],
                learning_rate=training_args['learning_rate'],
                weight_decay=training_args['weight_decay'],
                warmup_steps=training_args['warmup_steps'],
                gradient_checkpointing=training_args['gradient_checkpointing']
            )

class ModelEvaluation:
    def __init__(self, config: ModelEvaluationConfig, training_args_config: Seq2SeqTrainingArgumentsConfig):
        self.config = config
        self.training_args_config = training_args_config  # Store training args config
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info(f"Evaluation will run on: {self.device}")

    def load_components(self) -> Tuple[BartForConditionalGeneration, BartTokenizer]:
        """Load model and tokenizer"""
        try:
            model = BartForConditionalGeneration.from_pretrained(str(self.config.model_path))
            model = model.to(self.device)
            
            tokenizer = BartTokenizer.from_pretrained(str(self.config.tokenizer_path))
            
            return model, tokenizer
        except Exception as e:
            logger.error(f"Error loading components: {str(e)}")
            raise

    def load_datasets(self):
        """Load the test dataset for evaluation"""
        try:
            test_path = Path(self.config.data_path) / "transformed_test_data"
            test_dataset = load_from_disk(test_path)
            logger.info(f"Loaded test dataset with {len(test_dataset)} samples")
            return test_dataset
        except Exception as e:
            logger.error(f"Error loading datasets: {str(e)}")
            raise

    def compute_metrics(self, eval_pred, tokenizer) -> Dict[str, float]:
        """Compute evaluation metrics (ROUGE)"""
        rouge = evaluate.load("rouge")
        predictions, labels = eval_pred

        # Replace -100 in labels with pad_token_id for decoding
        labels = [[(label if label != -100 else tokenizer.pad_token_id) for label in l] for l in labels]

        # Decode predicted and reference summaries
        decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

        # Clean text
        decoded_preds = [pred.strip() for pred in decoded_preds]
        decoded_labels = [label.strip() for label in decoded_labels]

        # Compute ROUGE
        result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
        result = {k: round(v * 100, 4) for k, v in result.items()}

        return result

    def evaluate(self):
        """Complete evaluation pipeline"""
        try:
            model, tokenizer = self.load_components()
            test_dataset = self.load_datasets()
            
            # Prepare data collator
            data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

            # Prepare the trainer using training arguments from the configuration
            trainer = Seq2SeqTrainer(
                model=model,
                args=Seq2SeqTrainingArguments(
                    output_dir=self.training_args_config.output_dir,
                    per_device_eval_batch_size=self.training_args_config.per_device_eval_batch_size,
                    predict_with_generate=self.training_args_config.predict_with_generate,
                    logging_dir=self.training_args_config.logging_dir,
                ),
                data_collator=data_collator,
                compute_metrics=lambda eval_pred: self.compute_metrics(eval_pred, tokenizer),  # Pass tokenizer here
            )

            logger.info("Starting evaluation...")
            metrics = trainer.evaluate(eval_dataset=test_dataset)
            logger.info(f"Evaluation metrics: {metrics}")

            # Save metrics
            metrics_path = Path(self.config.root_dir) / self.config.metric_file_name
            pd.DataFrame([metrics]).to_csv(metrics_path, index=False)
            logger.info(f"Metrics saved to {metrics_path}")

            return metrics
        except Exception as e:
            logger.error(f"Evaluation failed: {str(e)}")
            raise

# Usage Example
if __name__ == "__main__":
    # Load configurations
    config_manager = ConfigurationManager()

    # Access model evaluation config
    eval_config = config_manager.model_evaluation_config

    # Access training arguments config
    training_args_config = config_manager.training_args_config

    # Run evaluation
    evaluator = ModelEvaluation(eval_config, training_args_config)
    results = evaluator.evaluate()

    # Output results
    print("ROUGE Scores:", results)


INFO:__main__:Evaluation will run on: cuda


INFO:__main__:Loaded test dataset with 150 samples
INFO:__main__:Starting evaluation...


ERROR:__main__:Evaluation failed: name 'evaluate' is not defined


NameError: name 'evaluate' is not defined