In [None]:
import os

# Step up to the parent directory (presumably the project root -- confirm) only
# once per kernel session. An unconditional os.chdir("../") climbs one level
# further every time this cell is re-run, which silently breaks every relative
# path used by the cells below.
if "_NOTEBOOK_START_DIR" not in globals():
    # Remember where the kernel started; the name doubles as the "already moved" marker.
    _NOTEBOOK_START_DIR = os.getcwd()
    os.chdir("../")

# Last expression of the cell: display the current working directory
# (plain-Python equivalent of the `%pwd` magic).
os.getcwd()

In [None]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class SummarizationModelConfig:
    """Immutable bundle of settings consumed by the summarization model stage.

    Instances are frozen: assigning to a field after construction raises
    ``dataclasses.FrozenInstanceError``.
    """

    # Working directory of this stage.
    root_dir: Path
    # Location of the input data for this stage.
    data_path: Path
    # Identifier used to load the tokenizer/model.
    # NOTE(review): annotated as Path, but it looks like a model name such as
    # "facebook/bart-large-cnn" rather than a filesystem path -- confirm.
    tokenizer_name: Path

In [1]:
from TextSummarizer.constants import *
from TextSummarizer.utils.file_utils import *
from TextSummarizer.utils.config_utils import *
from TextSummarizer.utils.lib_utils import *

In [None]:
class ConfigurationManager:
    """Loads the project's YAML settings and hands out typed config objects."""

    def __init__(self, config_filepath=CONFIG_FILE_PATH, params_filepath=PARAMS_FILE_PATH):
        """Read both YAML files and set up the artifacts root.

        Args:
            config_filepath: Path of the main configuration YAML
                (defaults to ``CONFIG_FILE_PATH``).
            params_filepath: Path of the parameters YAML
                (defaults to ``PARAMS_FILE_PATH``).
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        # The artifacts root named in the config is passed to create_directories
        # before any stage-specific directory is requested.
        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_transformation_config(self) -> SummarizationModelConfig:
        """Build the config object from the ``data_transformation`` section.

        The section's ``root_dir`` is passed to ``create_directories`` first.

        Returns:
            SummarizationModelConfig: Holds ``root_dir``, ``data_path`` and
            ``tokenizer_name`` exactly as read from the YAML section.
        """
        section = self.config.data_transformation
        create_directories([section.root_dir])

        return SummarizationModelConfig(
            root_dir=section.root_dir,
            data_path=section.data_path,
            tokenizer_name=section.tokenizer_name,
        )

In [None]:
# `transformers` exposes no `AutoModelForSeq2SeqGeneration`; the auto-class for
# encoder-decoder (seq2seq) models such as BART is `AutoModelForSeq2SeqLM`.
# Importing the nonexistent name raised ImportError and aborted this whole cell,
# so `torch` was never imported either.
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch

# Keep the previously used name bound so cells that still reference it keep working.
AutoModelForSeq2SeqGeneration = AutoModelForSeq2SeqLM

In [None]:
class SummarizationModel:
    """
    Handles all model-related operations and text generation.

    Holds a pre-trained seq2seq model (``self.model``) and the tokenizer helper
    (``self.tokenizer_handler``) used to prepare its input and decode its output.
    """

    def __init__(self, model_name="facebook/bart-large-cnn", config=None):
        """
        Initialize the model and tokenizer handler.

        The class previously declared two ``__init__`` methods (one taking a
        config, one taking a model name); the second silently replaced the
        first. Both call styles are supported by this single constructor.

        Args:
            model_name (str): Name of the pre-trained model to use. A config
                object (anything exposing ``tokenizer_name``) is also accepted
                in this position, so ``SummarizationModel(config)`` works.
            config (SummarizationModelConfig, optional): When given, its
                ``tokenizer_name`` is used as the model name.
        """
        # Imported locally so the class works regardless of which names the
        # notebook's import cell managed to bind. AutoModelForSeq2SeqLM is the
        # transformers auto-class for seq2seq models.
        from transformers import AutoModelForSeq2SeqLM

        # Allow the config to be passed as the first positional argument.
        if config is None and hasattr(model_name, "tokenizer_name"):
            config = model_name

        if config is not None:
            logger.info("Initializing SummarizationModel with config")
            model_name = config.tokenizer_name

        self.config = config
        self.model_name = model_name
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        # NOTE(review): TextTokenizer is not defined in the visible cells;
        # presumably it comes from one of the project's star imports -- confirm.
        self.tokenizer_handler = TextTokenizer(model_name)

    def generate_summary(self, text, max_summary_length=150, min_summary_length=40):
        """
        Generate summary from input text.

        Args:
            text (str): Input text to summarize
            max_summary_length (int): Maximum length of the summary
            min_summary_length (int): Minimum length of the summary; capped at
                ``max_summary_length`` so the two bounds can never conflict

        Returns:
            str: Generated summary, or an empty string when ``text`` is empty
            or contains only whitespace
        """
        # Nothing to summarize: skip the model call instead of forcing it to
        # emit at least `min_length` tokens for an empty input.
        if not text or not text.strip():
            return ""

        # Get tokenized input
        inputs = self.tokenizer_handler.tokenize_text(text)

        # Generate summary (beam search; a requested minimum above the maximum is capped)
        summary_ids = self.model.generate(
            inputs["input_ids"],
            max_length=max_summary_length,
            min_length=min(min_summary_length, max_summary_length),
            length_penalty=2.0,
            num_beams=4,
            early_stopping=True,
        )

        # Decode the first (only) generated sequence
        return self.tokenizer_handler.decode_tokens(summary_ids[0])

In [None]:
# Example usage
def main():
    """Demo run: build the summarizer, inspect the tokenized input, print a
    summary of a short sample passage, and report the vocabulary size."""
    summarizer = SummarizationModel()

    # Sample passage, spelled out line by line so its exact whitespace
    # (including the trailing spaces) is visible.
    sample_text = (
        "\n"
        "    Artificial intelligence has transformed various sectors of society, from healthcare \n"
        "    to transportation. Machine learning algorithms now power everything from \n"
        "    recommendation systems to autonomous vehicles. Despite these advances, \n"
        "    challenges remain regarding AI ethics and bias.\n"
        "    "
    )

    # Tokenize once purely for inspection
    encoded = summarizer.tokenizer_handler.tokenize_text(sample_text)
    print("Tokenized input shape:", encoded["input_ids"].shape)

    # Summarize and show the result
    print("\nGenerated summary:", summarizer.generate_summary(sample_text))

    # Report how many entries the tokenizer vocabulary has
    vocabulary = summarizer.tokenizer_handler.get_vocabulary()
    print("\nVocabulary size:", len(vocabulary))


if __name__ == "__main__":
    main()

In [None]:
# Build the configuration objects and run this notebook's pipeline step.
# NOTE(review): `DataTransformation` is not defined or imported by name in any
# visible cell (this cell looks carried over from a data-transformation
# notebook). Unless one of the star imports provides it, the third line raises
# NameError -- confirm whether SummarizationModel was meant to be used here.
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    data_transformation.convert()
except Exception as e:
    # The exception is re-raised unchanged, so this try/except adds no handling of its own.
    raise e