In [74]:
import os

# Display the kernel's current working directory. The run cell further down
# opens "config/config.yaml" through a relative path, so it matters where the
# kernel is standing. (A preceding `%pwd` would be redundant: only the last
# expression of a cell is displayed.)
os.getcwd()

'c:\\Users\\Admin\\Desktop\\text-summarizer\\text-summarizer-project'

In [75]:
from dataclasses import dataclass
from pathlib import Path        

@dataclass
class ModelEvaluationConfig:
    """Filesystem locations consumed by the model-evaluation stage.

    Instances are built by ``ConfigurationManager.get_model_evaluation_config``
    from the ``model_evaluation`` section of the YAML configuration; every
    value is wrapped in ``pathlib.Path`` there.
    """

    # Taken from the `root_dir` config key; not read by the evaluation code
    # visible in this notebook.
    root_dir: Path 
    # Directory handed to `T5ForConditionalGeneration.from_pretrained`.
    model_path: Path
    # Directory handed to `datasets.load_from_disk`; its "test" split is scored.
    data_path: Path
    # CSV file the computed metrics are written to.
    metric_file_name: Path
    # Directory handed to `T5TokenizerFast.from_pretrained`.
    tokenizer_path: Path



In [76]:
!pip install python-box




In [77]:
import sys

# Directory added to the import path; the cells below import the
# `text_summarizer` package, presumably from here.
# NOTE(review): hard-coded absolute path for one machine - adjust when the
# notebook is run elsewhere.
PROJECT_SRC_DIR = r"c:\Users\Admin\Desktop\text-summarizer\text-summarizer-project\src"

# Add the entry only once so re-running this cell does not pile up duplicates
# in sys.path.
if PROJECT_SRC_DIR not in sys.path:
    sys.path.append(PROJECT_SRC_DIR)


In [78]:
from text_summarizer.constants import *
from text_summarizer.utils.common import read_yaml, create_directories



In [79]:
class ConfigurationManager:
    """Read the project's YAML configuration and build typed stage configs.

    Attributes:
        config: Parsed content of the YAML file (an empty dict when the file
            contains no document).
    """

    def __init__(self, config_path: Path):
        """Load the YAML file at ``config_path``.

        Args:
            config_path: Location of the YAML configuration file. A plain
                string is accepted as well and converted to ``Path``.

        Raises:
            FileNotFoundError: If ``config_path`` does not exist.
        """
        # Imported locally so this class does not depend on `yaml` having been
        # brought into the notebook namespace by some other cell.
        import yaml

        config_path = Path(config_path)
        if not config_path.exists():
            raise FileNotFoundError(f"Config file not found: {config_path}")
        # Read as UTF-8 explicitly instead of relying on the platform default
        # encoding.
        with open(config_path, "r", encoding="utf-8") as f:
            # An empty YAML document parses to None; fall back to an empty
            # mapping so a missing section surfaces as a KeyError, not a
            # TypeError on None.
            self.config = yaml.safe_load(f) or {}

    def get_model_evaluation_config(self) -> ModelEvaluationConfig:
        """Build the evaluation config from the ``model_evaluation`` section.

        Returns:
            A ``ModelEvaluationConfig`` whose fields are the section's
            ``root_dir``, ``model_path``, ``tokenizer_path``, ``data_path``
            and ``metric_file_name`` values, each wrapped in ``Path``.

        Raises:
            KeyError: If the section or one of those keys is missing.
        """
        cfg = self.config["model_evaluation"]
        return ModelEvaluationConfig(
            root_dir=Path(cfg["root_dir"]),
            model_path=Path(cfg["model_path"]),
            tokenizer_path=Path(cfg["tokenizer_path"]),
            data_path=Path(cfg["data_path"]),
            metric_file_name=Path(cfg["metric_file_name"])
        )

In [80]:
import os
from text_summarizer.logging import logger

In [81]:
!pip install evaluate





In [82]:
from transformers import T5ForConditionalGeneration, T5TokenizerFast
from datasets import  Dataset , load_from_disk,load_dataset
import torch
from evaluate import load
import pandas as pd
from tqdm import tqdm

In [83]:
    
class ConfigurationManager:
    """Read the project's YAML configuration and build typed stage configs.

    NOTE(review): a class with this name is also defined in an earlier cell of
    this notebook; once both cells have run, this later definition is the one
    in effect. Consider keeping a single definition.

    Attributes:
        config: Parsed content of the YAML file (an empty dict when the file
            contains no document).
    """

    def __init__(self, config_path: Path):
        """Load the YAML file at ``config_path``.

        Args:
            config_path: Location of the YAML configuration file. A plain
                string is accepted as well and converted to ``Path``.

        Raises:
            FileNotFoundError: If ``config_path`` does not exist.
        """
        # Imported locally so this class does not depend on `yaml` having been
        # brought into the notebook namespace by some other cell.
        import yaml

        config_path = Path(config_path)
        if not config_path.exists():
            raise FileNotFoundError(f"Config file not found: {config_path}")
        # Read as UTF-8 explicitly instead of relying on the platform default
        # encoding.
        with open(config_path, "r", encoding="utf-8") as f:
            # An empty YAML document parses to None; fall back to an empty
            # mapping so a missing section surfaces as a KeyError, not a
            # TypeError on None.
            self.config = yaml.safe_load(f) or {}

    def get_model_evaluation_config(self) -> ModelEvaluationConfig:
        """Build the evaluation config from the ``model_evaluation`` section.

        Returns:
            A ``ModelEvaluationConfig`` whose fields are the section's
            ``root_dir``, ``model_path``, ``tokenizer_path``, ``data_path``
            and ``metric_file_name`` values, each wrapped in ``Path``.

        Raises:
            KeyError: If the section or one of those keys is missing.
        """
        cfg = self.config["model_evaluation"]
        return ModelEvaluationConfig(
            root_dir=Path(cfg["root_dir"]),
            model_path=Path(cfg["model_path"]),
            tokenizer_path=Path(cfg["tokenizer_path"]),
            data_path=Path(cfg["data_path"]),
            metric_file_name=Path(cfg["metric_file_name"])
        )

# =======================
# MODEL EVALUATION CLASS
# =======================
class ModelEvaluation:
    """Score a saved T5 summarization model on a test split with ROUGE.

    The model and tokenizer are loaded once in the constructor;
    ``evaluate`` then loads the dataset, generates summaries batch by batch
    and writes the resulting metrics to a CSV file.
    """

    def __init__(self, config: ModelEvaluationConfig):
        """Load the model and tokenizer named by ``config``.

        Args:
            config: Paths for the model, tokenizer, dataset and metrics file.
        """
        self.config = config
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # Pass forward-slash paths to `from_pretrained`. An earlier run of this
        # notebook on Windows failed with HFValidationError when a backslash
        # path was passed as the model location.
        model_dir = self.config.model_path.as_posix()
        tokenizer_dir = self.config.tokenizer_path.as_posix()

        logger.info(f"Loading model from {model_dir}")
        self.model = T5ForConditionalGeneration.from_pretrained(model_dir).to(self.device)

        logger.info(f"Loading tokenizer from {tokenizer_dir}")
        self.tokenizer = T5TokenizerFast.from_pretrained(tokenizer_dir)

    def generate_batch_sized_chunks(self, list_of_elements, batch_size):
        """Yield consecutive slices of ``list_of_elements`` of ``batch_size`` items.

        The last slice is shorter when the length is not a multiple of
        ``batch_size``.

        Raises:
            ValueError: If ``batch_size`` is not positive (raised when the
                generator is first advanced).
        """
        # A negative step would make range() empty and silently skip all data.
        if batch_size <= 0:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
        for i in range(0, len(list_of_elements), batch_size):
            yield list_of_elements[i:i + batch_size]

    def calculate_metrics_on_test_ds(
        self, dataset, metric, batch_size=4, column_text="article", column_summary="highlights",
        max_input_length=512, max_summary_length=150, num_beams=2
    ):
        """Generate a summary for every row of ``dataset`` and score them.

        Args:
            dataset: Object supporting ``len()`` and integer indexing, where
                each row is a mapping containing ``column_text`` and
                ``column_summary``.
            metric: Object exposing ``compute(predictions=..., references=...)``.
            batch_size: Number of rows tokenized and generated per step.
            column_text: Row key holding the text to summarize.
            column_summary: Row key holding the reference summary.
            max_input_length: Inputs are truncated to this many tokens.
            max_summary_length: ``max_length`` passed to ``generate``.
            num_beams: Beam count passed to ``generate``.

        Returns:
            Whatever ``metric.compute`` returns for the collected predictions
            and references.

        Raises:
            ValueError: If ``batch_size`` is not positive.
        """
        if batch_size <= 0:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

        self.model.eval()
        preds, refs = [], []

        row_indices = list(range(len(dataset)))
        # The batches come from a generator, which has no length; pass the
        # batch count (ceiling division) so tqdm can show progress and an ETA.
        num_batches = (len(row_indices) + batch_size - 1) // batch_size
        batches = self.generate_batch_sized_chunks(row_indices, batch_size)

        for batch in tqdm(batches, total=num_batches):
            texts = [dataset[i][column_text] for i in batch]
            summaries = [dataset[i][column_summary] for i in batch]

            inputs = self.tokenizer(
                texts,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=max_input_length
            ).to(self.device)

            # Inference only: no gradients needed.
            with torch.no_grad():
                summary_ids = self.model.generate(
                    input_ids=inputs["input_ids"],
                    attention_mask=inputs["attention_mask"],
                    max_length=max_summary_length,
                    num_beams=num_beams,
                    early_stopping=True
                )

            decoded_preds = self.tokenizer.batch_decode(summary_ids, skip_special_tokens=True)
            preds.extend(decoded_preds)
            refs.extend(summaries)

        return metric.compute(predictions=preds, references=refs)

    def evaluate(self):
        """Compute ROUGE on the test split and save the result as a CSV file.

        The dataset is read with ``load_from_disk`` from ``config.data_path``
        and its ``"test"`` entry is scored. The metrics are written as a
        single-row CSV to ``config.metric_file_name``.

        Raises:
            FileNotFoundError: Propagated from ``load_from_disk`` when
                ``config.data_path`` is not a saved ``Dataset`` /
                ``DatasetDict`` directory.
        """
        logger.info("Loading ROUGE metric")
        metric = load("rouge")

        logger.info(f"Loading test dataset from {self.config.data_path}")
        test_dataset = load_from_disk(self.config.data_path.as_posix())['test']

        logger.info("Calculating metrics on test dataset")
        results = self.calculate_metrics_on_test_ds(test_dataset, metric)

        logger.info(f"Saving metrics to {self.config.metric_file_name}")
        # to_csv does not create missing directories; make sure the target
        # folder exists so the computed metrics are not lost at the last step.
        self.config.metric_file_name.parent.mkdir(parents=True, exist_ok=True)
        df = pd.DataFrame([results])
        df.to_csv(self.config.metric_file_name, index=False)
        logger.info("Evaluation complete!")

# =======================
# USAGE
# =======================



In [84]:
import os

# Switch the kernel's working directory so that relative paths used by the
# run cell (e.g. "config/config.yaml") are resolved against this folder.
# NOTE(review): hard-coded absolute path for one machine.
PROJECT_ROOT = r"c:\Users\Admin\Desktop\text-summarizer\text-summarizer-project"
os.chdir(PROJECT_ROOT)




In [85]:
if __name__ == "__main__":
    # Run the evaluation stage end to end. The config path is relative to the
    # kernel's current working directory. Exceptions propagate unchanged; a
    # try/except that merely re-raises them adds nothing.
    config_manager = ConfigurationManager(Path("config/config.yaml"))
    eval_config = config_manager.get_model_evaluation_config()
    evaluator = ModelEvaluation(eval_config)
    evaluator.evaluate()

2025-12-03 18:27:56,015 - INFO - 437885248 - Loading model from artifacts/model_trainer/t5-summarizer
2025-12-03 18:27:56,822 - INFO - 437885248 - Loading tokenizer from artifacts/model_trainer/tokenizer
2025-12-03 18:27:57,037 - INFO - 437885248 - Loading ROUGE metric
2025-12-03 18:28:02,160 - INFO - 437885248 - Loading test dataset from artifacts\data_ingestion


FileNotFoundError: Directory artifacts/data_ingestion is neither a `Dataset` directory nor a `DatasetDict` directory.

In [None]:
# class ModelEvaluation:
#     def __init__(self, config: ModelEvaluationConfig):
#         self.config = config
    
#     def generate_batch_sized_chunks(self, list_of_elements, batch_size):
#         for i in range(0, len(list_of_elements), batch_size):
#             yield list_of_elements[i:i + batch_size]
    
#     def calculate_metrics_on_test_ds(
#         self, dataset, metric, model, tokenizer,
#         batch_size=4,
#         device="cuda" if torch.cuda.is_available() else "cpu",
#         column_text="article",
#         column_summary="highlights"
#     ):
#         logger.info(f"Using device: {device}")
#         model.to(device)

#         preds = []
#         refs = []

#         for batch in tqdm(self.generate_batch_sized_chunks(list(range(len(dataset))), batch_size)):
#             texts = [dataset[i][column_text] for i in batch]
#             summaries = [dataset[i][column_summary] for i in batch]

#             inputs = tokenizer(
#                 texts,
#                 return_tensors="pt",
#                 padding=True,
#                 truncation=True,
#                 max_length=512
#             ).to(device)

#             with torch.no_grad():
#                 summary_ids = model.generate(
#                     input_ids=inputs["input_ids"],
#                     attention_mask=inputs["attention_mask"],
#                     max_length=150,
#                     num_beams=2,
#                     early_stopping=True
#                 )

#             decoded_preds = tokenizer.batch_decode(summary_ids, skip_special_tokens=True)
#             decoded_labels = summaries

#             preds.extend(decoded_preds)
#             refs.extend(decoded_labels)

#         return metric.compute(predictions=preds, references=refs)
    
#     def evaluate(self):
#         logger.info("Loading model and tokenizer")

#         # Fix Windows path issue
#         model_path = str(self.config.model_path).replace("\\", "/")
#         tokenizer_path = str(self.config.tokenizer_path).replace("\\", "/")

#         model = T5ForConditionalGeneration.from_pretrained(model_path)
#         tokenizer = T5TokenizerFast.from_pretrained(tokenizer_path)

#         logger.info("Loading ROUGE metric")
#         metric = load("rouge")

#         logger.info("Loading test dataset")
#         test_dataset = load_from_disk(self.config.data_path)['test']

#         logger.info("Calculating metrics on test dataset")
#         results = self.calculate_metrics_on_test_ds(test_dataset, metric, model, tokenizer)

#         logger.info("Saving metrics to CSV")
#         df = pd.DataFrame([results])
#         df.to_csv(self.config.metric_file_name, index=False)

#         logger.info(f"Metrics saved to {self.config.metric_file_name}")


In [None]:
# import os
# import yaml
# import torch
# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# from pathlib import Path
# from text_summarizer.logging import logger

# CONFIG_FILE_PATH = "config/config.yaml"

# class ConfigurationManager:
#     def __init__(self, config_filepath=CONFIG_FILE_PATH):
#         self.config_filepath = config_filepath
#         self.config = self.read_yaml_file(self.config_filepath)

#     @staticmethod
#     def read_yaml_file(path):
#         if not os.path.exists(path):
#             raise FileNotFoundError(f"YAML file not found: {path}")
#         with open(path, "r") as file:
#             return yaml.safe_load(file)

#     def get_model_evaluation_config(self):
#         try:
#             return self.config["model_evaluation"]
#         except KeyError:
#             raise KeyError("Missing 'model_evaluation' section in config.yaml")

# class ModelEvaluation:
#     def __init__(self, config):
#         self.config = config
#         self.model_dir = Path(self.config["model_dir"])
#         self.tokenizer_dir = Path(self.config["tokenizer_dir"])
#         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#         self._load_model_and_tokenizer()

#     def _load_model_and_tokenizer(self):
#         model_path = self.model_dir / "model.safetensors"
#         if not model_path.exists():
#             raise FileNotFoundError(f"Model file not found: {model_path}")
#         if not self.tokenizer_dir.exists():
#             raise FileNotFoundError(f"Tokenizer folder not found: {self.tokenizer_dir}")

#         self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_dir)
#         self.model = AutoModelForSeq2SeqLM.from_pretrained(model_path, device_map="auto")
#         self.model.to(self.device)
#         logger.info(f"Model and tokenizer loaded from {self.model_dir} and {self.tokenizer_dir}")

#     def evaluate(self, input_texts):
#         self.model.eval()
#         results = []
#         for text in input_texts:
#             inputs = self.tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(self.device)
#             with torch.no_grad():
#                 summary_ids = self.model.generate(**inputs, max_length=150, num_beams=4)
#                 summary = self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)
#                 results.append(summary)
#         return results

# # Usage
# if __name__ == "__main__":
#     config_manager = ConfigurationManager()
#     eval_config = config_manager.get_model_evaluation_config()
#     evaluator = ModelEvaluation(config=eval_config)

#     sample_texts = [
#         "This is a long text that we want to summarize using our model."
#     ]
#     summaries = evaluator.evaluate(sample_texts)
#     for i, summary in enumerate(summaries):
#         print(f"Original: {sample_texts[i]}\nSummary: {summary}\n")


FileNotFoundError: YAML file not found: config/config.yaml

2025-12-03 16:58:36,585 - INFO - common - YAML file: C:\Users\Admin\Desktop\text-summarizer\text-summarizer-project\config\config.yaml loaded successfully.
2025-12-03 16:58:36,592 - INFO - common - YAML file: C:\Users\Admin\Desktop\text-summarizer\text-summarizer-project\config\params.yaml loaded successfully.
2025-12-03 16:58:36,594 - INFO - common - Directory created at: artifacts
2025-12-03 16:58:36,600 - INFO - 3102288557 - Using device: cpu
2025-12-03 16:58:36,602 - INFO - 3102288557 - Loading model and tokenizer


HFValidationError: Repo id must use alphanumeric chars, '-', '_' or '.'. The name cannot start or end with '-' or '.' and the maximum length is 96: 'C:\Users\Admin\Desktop\text-summarizer\text-summarizer-project\research\artifacts\model_trainer\t5-summarizer'.