In [1]:
import os
# Move the kernel's working directory one level up from where the notebook
# was started (presumably to the project root so that the relative paths in
# the config files and the `TextSummarizer` package resolve — confirm).
# NOTE(review): a relative chdir is not idempotent; re-running this cell
# moves up one more level each time. Run it exactly once per kernel.
os.chdir("../")
%pwd

'd:\\AI\\NLP\\HandsOn\\Text Summarization'

In [2]:
from dataclasses import dataclass
from pathlib import Path
import torch

@dataclass(frozen=True)
class TrainingConfig:
    """Immutable bundle of settings used to fine-tune the summarization model.

    Instances are created by ``ConfigurationManager.get_model_summarizer_config``
    and consumed by ``SummarizationModel``. The dataclass is frozen, so fields
    cannot be reassigned after construction.
    """
    root_dir: Path  # directory that ConfigurationManager creates before building this config
    data_path: Path  # location passed to `load_from_disk` to read the dataset
    checkpoint: str  # identifier given to `from_pretrained` for tokenizer and model
    max_length: int  # token limit used for tokenizer truncation and for generation
    min_length: int  # minimum generated length used by `SummarizationModel.predict`
    output_dir: str  # where training arguments write to and the model is saved
    learning_rate: float
    train_batch_size: int  # per-device batch size for training
    eval_batch_size: int  # per-device batch size for evaluation
    weight_decay: float
    save_total_limit: int
    num_train_epochs: int
    prefix: str  # text prepended to every input before tokenization
    push_to_hub: bool
    # The default is computed once, when the class body is executed, not per instance.
    device: str = "cuda" if torch.cuda.is_available() else "cpu"

In [3]:
from TextSummarizer.constants import *
from TextSummarizer.utils.file_utils import *
from TextSummarizer.utils.config_utils import *
from TextSummarizer.utils.model_utils import *
from TextSummarizer.utils.lib_utils import *

In [4]:
class ConfigurationManager:
    """Reads the project's YAML config/params files and builds typed configs.

    On construction both files are parsed with ``read_yaml`` and the
    ``artifacts_root`` directory named in the config file is created.
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """Load both YAML files and make sure the artifacts root exists.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML (holds the ``model`` section).
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_model_summarizer_config(self) -> TrainingConfig:
        """Build the ``TrainingConfig`` from the ``model`` section of the params file.

        The section's ``root_dir`` is created as a side effect.

        Returns:
            TrainingConfig populated from the params file.
        """
        config = self.params.model

        create_directories([config.root_dir])

        model_summarize_config = TrainingConfig(
            root_dir=config.root_dir,
            data_path=config.data_path,
            checkpoint=config.checkpoint,
            max_length=config.max_length,
            min_length=config.min_length,
            output_dir=config.output_dir,
            # PyYAML loads scientific notation without a dot (e.g. ``2e-5``) as a
            # string, so both float hyper-parameters are coerced explicitly.
            learning_rate=float(config.learning_rate),
            train_batch_size=config.train_batch_size,
            eval_batch_size=config.eval_batch_size,
            weight_decay=float(config.weight_decay),
            save_total_limit=config.save_total_limit,
            num_train_epochs=config.num_train_epochs,
            push_to_hub=config.push_to_hub,
            device=config.device,
            prefix=config.prefix,
        )

        return model_summarize_config

In [5]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer ,AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import pipeline
from datasets import Dataset
from sklearn.model_selection import train_test_split
import torch
from tqdm import tqdm
import evaluate
import nltk
from datasets import DatasetDict, load_dataset, load_from_disk 
from torch.utils.data import DataLoader
from pathlib import Path
from transformers import DataCollatorForSeq2Seq
from huggingface_hub import HfApi

  from .autonotebook import tqdm as notebook_tqdm


[2024-12-25 09:09:14,888: INFO: config: PyTorch version 2.5.1+cu121 available.]


In [9]:
class SummarizationModel:
    """Fine-tunes, evaluates and runs a seq2seq summarization model.

    The tokenizer, model, data collator and training arguments are all created
    in ``__init__`` from a ``TrainingConfig``; the remaining methods cover
    dataset loading, training, saving/uploading, single-text prediction and
    ROUGE evaluation.
    """

    def __init__(self, config: TrainingConfig):
        """Load tokenizer/model from the checkpoint and prepare training arguments.

        Args:
            config: Training settings (paths, checkpoint, lengths, hyper-parameters).
        """
        logger.info("Initializing SummarizationModel with provided configuration.")
        self.config = config
        self.settings = get_settings()
        self.model_name = "TED_FineTuned_google-t5"
        self.data_path = self.config.data_path
        self.output_dir = self.config.output_dir
        self.device = self.config.device
        self.checkpoint = self.config.checkpoint
        self.max_length = self.config.max_length
        self.min_length = self.config.min_length
        self.prefix = self.config.prefix
        # ROUGE metric is loaded lazily (see _get_rouge) and then reused.
        self._rouge = None
        logger.info("Loading tokenizer and model from checkpoint: %s", self.checkpoint)
        self.tokenizer = AutoTokenizer.from_pretrained(self.checkpoint)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(self.checkpoint).to(self.device)
        logger.info("Tokenizer and model loaded successfully.")
        self.data_collator = DataCollatorForSeq2Seq(
            tokenizer=self.tokenizer,
            model=self.model,
            padding=True,
            max_length=self.max_length
        )
        logger.info("Data collator initialized.")
        # Mixed precision is only valid on a CUDA device; requesting it on CPU
        # makes the training arguments fail, so enable it conditionally.
        use_fp16 = torch.cuda.is_available() and str(self.device).startswith("cuda")
        self.training_args = Seq2SeqTrainingArguments(
            output_dir=self.output_dir,
            eval_strategy="epoch",
            learning_rate=self.config.learning_rate,
            per_device_train_batch_size=self.config.train_batch_size,
            per_device_eval_batch_size=self.config.eval_batch_size,
            weight_decay=self.config.weight_decay,
            save_total_limit=self.config.save_total_limit,
            num_train_epochs=self.config.num_train_epochs,
            predict_with_generate=True,
            fp16=use_fp16,
            push_to_hub=self.config.push_to_hub,
        )
        logger.info("Training arguments configured.")

    def _get_rouge(self):
        """Return the ROUGE metric, loading it only on first use."""
        if getattr(self, "_rouge", None) is None:
            self._rouge = evaluate.load("rouge")
        return self._rouge

    def load_data_into_DatasetDict(self) -> DatasetDict:
        """Load the dataset stored at ``self.data_path`` with ``load_from_disk``."""
        logger.info("Loading dataset from path: %s", self.data_path)
        loaded_dataset = load_from_disk(self.data_path)
        logger.info("Dataset loaded successfully with %d splits.", len(loaded_dataset))
        return loaded_dataset

    def compute_metrics(self, eval_pred):
        """Compute ROUGE scores and mean generated length for the trainer.

        Args:
            eval_pred: Pair ``(predictions, labels)`` of token-id arrays.

        Returns:
            Dict of metric name to value rounded to 4 decimals, including ``gen_len``.
        """
        logger.info("Computing evaluation metrics.")
        predictions, labels = eval_pred
        if isinstance(predictions, tuple):
            predictions = predictions[0]
        pad_id = self.tokenizer.pad_token_id
        # -100 is used as padding/ignore index and cannot be decoded, so both
        # arrays are mapped back to the pad token before decoding.
        predictions = np.where(predictions != -100, predictions, pad_id)
        labels = np.where(labels != -100, labels, pad_id)
        decoded_preds = self.tokenizer.batch_decode(predictions, skip_special_tokens=True)
        decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True)
        result = self._get_rouge().compute(
            predictions=decoded_preds, references=decoded_labels, use_stemmer=True
        )
        prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
        result["gen_len"] = np.mean(prediction_lens)
        logger.info("Metrics computed: %s", result)
        return {k: round(v, 4) for k, v in result.items()}

    def model_trainer(self, dataset):
        """Train on ``dataset["train"]``, evaluating on ``dataset["test"]``.

        Returns:
            The ``Seq2SeqTrainer`` after ``train()`` has finished.
        """
        logger.info("Initializing Seq2SeqTrainer for training.")
        trainer = Seq2SeqTrainer(
            model=self.model,
            args=self.training_args,
            train_dataset=dataset["train"],
            eval_dataset=dataset["test"],
            tokenizer=self.tokenizer,
            data_collator=self.data_collator,
            compute_metrics=self.compute_metrics,
        )
        logger.info("Training started.")
        trainer.train()
        logger.info("Training completed.")
        return trainer

    def save_model(self, model):
        """Persist the trained model to ``self.output_dir``.

        Args:
            model: Either a trainer (anything exposing ``save_model``) or a
                model exposing ``save_pretrained``. Trainers do not provide
                ``save_pretrained``, so both shapes are handled.
        """
        logger.info("Saving model to output directory: %s", self.output_dir)
        if hasattr(model, "save_model"):
            model.save_model(self.output_dir)
        else:
            model.save_pretrained(self.output_dir)
            # Keep the tokenizer next to the weights so the directory can be
            # loaded again with from_pretrained.
            self.tokenizer.save_pretrained(self.output_dir)
        logger.info("Model saved successfully.")

    def upload_trained_model(self, trainer: Seq2SeqTrainer):
        """Upload the trained model through ``HuggingFaceModelUploader``.

        Returns:
            True when the upload finished.

        Raises:
            Exception: Any upload error is logged and re-raised unchanged.
        """
        try:
            uploader = HuggingFaceModelUploader(
                model_name=self.model_name,
                output_dir=self.output_dir,
                private=False
            )
            uploader.run(trainer=trainer)
            return True
        except Exception as e:
            logger.error(f"Upload failed: {str(e)}")
            raise

    def predict(self, text: str) -> str:
        """Generate a summary for a single text.

        Args:
            text: Raw input; ``self.prefix`` is prepended before tokenization.

        Returns:
            The decoded summary string.
        """
        logger.info("Generating summary for input text.")
        # Make sure dropout is disabled even if the model was left in train mode.
        self.model.eval()
        inputs = self.tokenizer(
            self.prefix + text,
            max_length=self.max_length,
            truncation=True,
            return_tensors="pt"
        ).to(self.device)
        summary_ids = self.model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=self.max_length,
            min_length=self.min_length,
            num_beams=4,
            length_penalty=2.0,
            early_stopping=True
        )
        summary = self.tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        logger.info("Summary generated: %s", summary)
        return summary

    def evaluate_model(self, dataset):
        """Generate summaries for ``dataset["test"]`` and score them with ROUGE.

        The ``text`` column is re-tokenized with the prefix, summaries are
        generated batch by batch, and compared to the ``summary`` column.

        Returns:
            Dict of ROUGE scores rounded to 4 decimals.
        """
        logger.info("Evaluating the model on the test dataset.")

        def tokenize_function(batch):
            # No padding here: the data collator pads each evaluation batch to
            # its own longest sequence.
            return self.tokenizer(
                [self.prefix + text for text in batch["text"]],
                truncation=True,
                max_length=self.max_length,
                return_tensors=None  # Don't convert to tensors in map
            )

        # Tokenize the test dataset
        tokenized_dataset = dataset["test"].map(
            tokenize_function,
            batched=True,
            remove_columns=dataset["test"].column_names
        )

        # Create dataloader for batched processing
        eval_dataloader = DataLoader(
            tokenized_dataset,
            batch_size=self.training_args.per_device_eval_batch_size,
            collate_fn=self.data_collator
        )

        predictions = []
        references = dataset["test"]["summary"]

        # Generate predictions in batches
        self.model.eval()
        with torch.no_grad():
            for batch in eval_dataloader:
                # Only tensor values are moved to the device; anything else is kept as is.
                processed_batch = {
                    k: v.to(self.device) if isinstance(v, torch.Tensor) else v
                    for k, v in batch.items()
                }

                # Ensure we have input_ids
                if processed_batch.get("input_ids") is None:
                    logger.warning("Skipping batch due to missing input_ids")
                    continue

                outputs = self.model.generate(
                    input_ids=processed_batch["input_ids"],
                    # Without the mask the model would attend to padding tokens
                    # in every sequence shorter than the batch maximum.
                    attention_mask=processed_batch.get("attention_mask"),
                    max_length=self.max_length,
                    num_beams=4,
                    early_stopping=True
                )
                predictions.extend(
                    self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
                )

        scores = self._get_rouge().compute(
            predictions=predictions,
            references=references,
            use_stemmer=True
        )

        logger.info("Evaluation completed with scores: %s", scores)
        return {k: round(v, 4) for k, v in scores.items()}

In [None]:
try:
    # Load configuration and initialize summarization model
    config = ConfigurationManager()
    model_summarizer_config = config.get_model_summarizer_config()
    model_summarizer = SummarizationModel(config=model_summarizer_config)

    # Load dataset
    data_set = model_summarizer.load_data_into_DatasetDict()

    # Train the summarizer (returns the trainer, not the bare model)
    trained_summarizer = model_summarizer.model_trainer(data_set)

    # Save the trained model locally first so a failed upload cannot lose it.
    # save_model expects an object with `save_pretrained`, i.e. the model the
    # trainer wraps.
    model_summarizer.save_model(trained_summarizer.model)

    # Upload model to the Hugging Face Hub. SummarizationModel has no
    # `push_model_to_hub` method; `upload_trained_model` is the upload entry point.
    model_summarizer.upload_trained_model(trained_summarizer)

    # Evaluate the model on the test dataset
    logger.info("Starting model evaluation...")
    evaluation_results = model_summarizer.evaluate_model(data_set)
    logger.info(f"Evaluation Results: {evaluation_results}")


except Exception as e:
    logger.error(f"An error occurred: {e}")
    # Bare raise keeps the original traceback intact.
    raise


In [None]:
# Test prediction on a single example.
# Depends on `model_summarizer` created by the training cell above, so that
# cell must have been run in this kernel first.
logger.info("Starting prediction...")
sample_text = "Imagine a world overwhelmed by information, where sifting through endless articles, reports, and data consumes valuable time and energy. AI summarization offers a powerful solution, employing natural language processing to condense lengthy texts into concise summaries. These systems utilize two primary approaches: extractive summarization, selecting key sentences directly from the source, and abstractive summarization, generating new, more human-like summaries that capture the core meaning. This technology finds diverse applications, from streamlining news consumption and accelerating research analysis to optimizing business workflows by summarizing meetings and customer feedback. While challenges remain in handling complex language, nuanced meanings, and ensuring factual accuracy, ongoing research continually improves the fluency, coherence, and contextual understanding of AI-generated summaries. Ultimately, AI summarization promises to revolutionize how we process information, empowering us to access key insights quickly and efficiently, unlocking the potential of knowledge for a more informed future."
# `predict` prepends the configured prefix and runs beam-search generation.
predicted_summary = model_summarizer.predict(sample_text)
logger.info(f"Predicted Summary: {predicted_summary}")

In [None]:
# Stand-alone inference check: load the fine-tuned model back from the
# Hugging Face Hub and summarize one text, independently of `model_summarizer`.
from transformers import AutoModelForSeq2SeqLM , AutoTokenizer

# Load the model and tokenizer from the Hub
model_name = "AbdallahElraey/HFmodels"  # Hub repository id of the uploaded model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Example text for inference
text = "Imagine a world overwhelmed by information, where sifting through endless articles, reports, and data consumes valuable time and energy. AI summarization offers a powerful solution, employing natural language processing to condense lengthy texts into concise summaries. These systems utilize two primary approaches: extractive summarization, selecting key sentences directly from the source, and abstractive summarization, generating new, more human-like summaries that capture the core meaning. This technology finds diverse applications, from streamlining news consumption and accelerating research analysis to optimizing business workflows by summarizing meetings and customer feedback. While challenges remain in handling complex language, nuanced meanings, and ensuring factual accuracy, ongoing research continually improves the fluency, coherence, and contextual understanding of AI-generated summaries. Ultimately, AI summarization promises to revolutionize how we process information, empowering us to access key insights quickly and efficiently, unlocking the potential of knowledge for a more informed future."

# Tokenize and generate
# NOTE(review): unlike SummarizationModel.predict, no prefix is prepended to
# the text and generate() is called without explicit length/beam arguments,
# so the two code paths can produce different summaries — confirm intended.
inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
outputs = model.generate(**inputs)
result = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(result)

------------