# BART-Large-CNN Text Summarization


## Libraries and Dependencies

In [None]:
import re
import textwrap
from typing import List, Dict, Any, Optional

import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

## Implementation

In [None]:
class TextSummarizer:
    """Abstractive text summarizer built on a Hugging Face seq2seq model.

    The instance owns a tokenizer, a model and a ``summarization`` pipeline
    that share the same loaded weights. Texts longer than the tokenizer's
    ``model_max_length`` are summarized chunk by chunk and the partial
    summaries are summarized again.
    """

    # Model tried when the requested one cannot be loaded.
    FALLBACK_MODEL = "t5-small"

    # Tokens held back from the model window when sizing chunks.
    _CHUNK_TOKEN_MARGIN = 50

    # Static catalogue returned (as copies) by get_available_models().
    _MODEL_CATALOG = (
        {
            "name": "facebook/bart-large-cnn",
            "description": "Good balance between quality and speed, trained on CNN/DM dataset",
            "size": "400MB",
            "strengths": "General purpose summarization, news articles",
        },
        {
            "name": "t5-small",
            "description": "Smaller and faster model, good for resource-constrained environments",
            "size": "242MB",
            "strengths": "Speed, works on lower-end hardware",
        },
        {
            "name": "google/pegasus-xsum",
            "description": "State-of-the-art model for extreme summarization (very concise)",
            "size": "2.2GB",
            "strengths": "Very concise summaries, news articles",
        },
        {
            "name": "philschmid/bart-large-cnn-samsum",
            "description": "Fine-tuned for dialogue summarization",
            "size": "400MB",
            "strengths": "Conversations, chat logs, meetings",
        },
        {
            "name": "sshleifer/distilbart-cnn-12-6",
            "description": "Distilled version of BART, faster with similar quality",
            "size": "306MB",
            "strengths": "Good balance of speed and quality",
        },
    )

    def __init__(self, model_name: str = "facebook/bart-large-cnn"):
        """
        Initialize the text summarizer with a Hugging Face model.

        If the requested model cannot be loaded, ``FALLBACK_MODEL`` is tried
        once before giving up.

        Args:
            model_name (str): Name of the Hugging Face model to use.
                Default: "facebook/bart-large-cnn" (good balance of quality and speed)
                Other options:
                - "t5-small" (faster, lower quality)
                - "google/pegasus-xsum" (news-focused)
                - "philschmid/bart-large-cnn-samsum" (optimized for dialogues)

        Raises:
            RuntimeError: If both the requested and the fallback model fail
                to load.
        """
        print(f"Loading model: {model_name} (this may take a minute on first run)")
        try:
            self._load_model(model_name)
        except Exception as e:
            # Loading can fail for many unrelated reasons (unknown model name,
            # network, disk, memory), so every failure triggers the fallback.
            print(f"Error loading model: {e}")
            print("Trying to load a smaller model as fallback...")
            try:
                self._load_model(self.FALLBACK_MODEL)
            except Exception as e2:
                raise RuntimeError(
                    f"Failed to load both primary and fallback models: {e2}"
                ) from e2
            print(f"Fallback model loaded: {self.FALLBACK_MODEL}")
        else:
            print(f"Model loaded successfully. Using device: {self.device}")

    def _load_model(self, model_name: str) -> None:
        """Load tokenizer, model and pipeline for ``model_name`` onto this instance.

        The already-loaded model and tokenizer objects are handed to the
        pipeline so the weights are held in memory only once. Attributes are
        assigned only after every step succeeded, so a failed load never
        leaves the instance half-initialized.
        """
        use_cuda = torch.cuda.is_available()
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        summarization_pipeline = pipeline(
            "summarization",
            model=model,
            tokenizer=tokenizer,
            device=0 if use_cuda else -1,  # Use GPU if available
        )

        self.tokenizer = tokenizer
        self.model = model
        self.summarization_pipeline = summarization_pipeline
        self.device = "cuda" if use_cuda else "cpu"
        # Store model name for reference
        self.model_name = model_name

    @staticmethod
    def _split_sentences(text: str) -> List[str]:
        """Split ``text`` into sentences after collapsing all whitespace.

        A boundary is whitespace that follows '.', '!' or '?', so decimals
        such as "3.5" stay intact and each sentence keeps its own punctuation.
        """
        flattened = " ".join(text.split())
        if not flattened:
            return []
        return [part for part in re.split(r"(?<=[.!?])\s+", flattened) if part]

    @classmethod
    def _to_bullet_points(cls, summary: str) -> str:
        """Render ``summary`` as one "• sentence" line per sentence."""
        bullets = []
        for sentence in cls._split_sentences(summary):
            if not any(ch.isalnum() for ch in sentence):
                continue  # stray punctuation only, nothing worth a bullet
            if sentence[-1].isalnum():
                sentence += "."  # close a fragment that lost its full stop
            bullets.append(f"• {sentence}")
        return "\n".join(bullets)

    def _generate_summary(self, text: str, max_length: int, min_length: int,
                          num_beams: int, temperature: float,
                          truncation: bool = False) -> str:
        """Run the pipeline once on ``text`` and return the summary string.

        ``truncation=True`` asks the pipeline to cut the input to the model
        window instead of failing on an over-long input.
        """
        options = {
            "max_length": max_length,
            "min_length": min_length,
            "num_beams": num_beams,
            "temperature": temperature,
            "early_stopping": True,
        }
        if truncation:
            options["truncation"] = True
        output = self.summarization_pipeline(text, **options)
        return output[0]['summary_text']

    def summarize(self, text: str, max_length: int = 150, min_length: int = 30,
                  style: str = "concise", preserve_key_points: bool = True) -> str:
        """
        Summarize the given text using the loaded model.

        Args:
            text (str): The text to summarize.
            max_length (int): Maximum token length for the summary.
            min_length (int): Minimum token length for the summary.
            style (str): Style of summary ("concise", "detailed", "bullet-points").
                Any other value is treated like "concise".
            preserve_key_points (bool): Whether to try to preserve key information
                (uses a wider beam search).

        Returns:
            str: The generated summary, or "" when ``text`` is empty or blank.
        """
        if not text or not text.strip():
            return ""

        # "detailed" doubles the length budget with fixed floors; the other
        # styles keep the caller's limits ("bullet-points" is post-processed).
        if style == "detailed":
            max_length = max(max_length * 2, 250)
            min_length = max(min_length * 2, 100)

        # NOTE: in transformers, temperature only influences sampling-based
        # decoding; the pipeline is called here without enabling sampling, so
        # the beam width is the setting that actually changes the search.
        if preserve_key_points:
            num_beams, temperature = 6, 0.7
        else:
            num_beams, temperature = 4, 1.0

        # Handle long texts by chunking if needed
        token_count = len(self.tokenizer.encode(text, truncation=False))
        if token_count > self.tokenizer.model_max_length:
            result = self._summarize_long_text(
                text, max_length, min_length, num_beams, temperature
            )
        else:
            result = self._generate_summary(
                text, max_length, min_length, num_beams, temperature
            )

        # Applied after both paths so long inputs honour the style too.
        if style == "bullet-points":
            return self._to_bullet_points(result)
        return result

    def _chunk_sentences(self, sentences: List[str], max_tokens: int) -> List[str]:
        """Greedily pack ``sentences`` into chunks of at most ``max_tokens`` tokens.

        A single sentence that alone exceeds the budget becomes its own
        (over-long) chunk; callers must summarize chunks with truncation.
        """
        chunks = []
        current = []
        current_tokens = 0
        for sentence in sentences:
            sentence_tokens = len(self.tokenizer.encode(sentence))
            if current and current_tokens + sentence_tokens > max_tokens:
                chunks.append(' '.join(current))
                current = []
                current_tokens = 0
            current.append(sentence)
            current_tokens += sentence_tokens
        if current:
            chunks.append(' '.join(current))
        return chunks

    def _summarize_long_text(self, text: str, max_length: int, min_length: int,
                             num_beams: int, temperature: float) -> str:
        """
        Handle summarization of texts longer than the model's maximum input length.
        Uses a chunking approach to process long documents.

        Args:
            text (str): The long text to summarize.
            max_length (int): Maximum token length for the final summary.
            min_length (int): Minimum token length for the final summary.
            num_beams (int): Number of beams for beam search.
            temperature (float): Temperature for generation.

        Returns:
            str: The generated summary, or "" when ``text`` has no content.
        """
        sentences = self._split_sentences(text)
        if not sentences:
            return ""

        # Leave a margin below the model window when sizing chunks.
        max_tokens = max(1, self.tokenizer.model_max_length - self._CHUNK_TOKEN_MARGIN)
        chunks = self._chunk_sentences(sentences, max_tokens)

        # Share the length budget between chunks: at least 30 tokens of room
        # each, and a per-chunk minimum capped at 20 so short chunks are not
        # forced to pad their output.
        chunk_max_length = max(30, max_length // len(chunks))
        chunk_min_length = min(min_length // len(chunks), 20)

        chunk_summaries = []
        for i, chunk in enumerate(chunks, start=1):
            print(f"Summarizing chunk {i}/{len(chunks)}...")
            chunk_summaries.append(
                self._generate_summary(
                    chunk, chunk_max_length, chunk_min_length,
                    num_beams, temperature, truncation=True,
                )
            )

        # If we only had one chunk, its summary is the final summary
        if len(chunk_summaries) == 1:
            return chunk_summaries[0]

        combined_summary = " ".join(chunk_summaries)
        combined_tokens = len(self.tokenizer.encode(combined_summary))
        if combined_tokens > max_tokens:
            # Recurse only while the text actually shrinks; otherwise the
            # recursion would never end, so fall through to a truncated pass.
            input_tokens = len(self.tokenizer.encode(text, truncation=False))
            if combined_tokens < input_tokens:
                return self._summarize_long_text(
                    combined_summary, max_length, min_length, num_beams, temperature
                )

        # Final summary of the combined chunk summaries
        return self._generate_summary(
            combined_summary, max_length, min_length,
            num_beams, temperature, truncation=True,
        )

    def batch_summarize(self, texts: List[str], **kwargs) -> List[str]:
        """
        Summarize multiple texts, one after another.

        Args:
            texts (List[str]): List of texts to summarize.
            **kwargs: Additional arguments passed to summarize method.

        Returns:
            List[str]: List of summaries, in the order of ``texts``.
        """
        return [self.summarize(text, **kwargs) for text in texts]

    def evaluate_summary(self, original_text: str, summary: str) -> Dict[str, float]:
        """
        Evaluate the quality of a summary compared to the original text using ROUGE metrics.

        Falls back to a plain word-overlap score when the optional ``rouge``
        package is not installed.

        Args:
            original_text (str): The original text.
            summary (str): The summary to evaluate.

        Returns:
            Dict[str, float]: Dictionary with evaluation metrics; always
            contains an "overall_score" entry.
        """
        try:
            from rouge import Rouge
        except ImportError:
            # If rouge is not installed, return basic word overlap analysis
            print("Rouge package not found. Using basic evaluation.")
            return self._basic_evaluation(original_text, summary)

        metric_names = (
            "rouge-1_precision", "rouge-1_recall", "rouge-1_f1",
            "rouge-2_f1", "rouge-l_f1", "overall_score",
        )
        if not original_text.strip() or not summary.strip():
            # Nothing to compare; score an empty side as zero overlap rather
            # than handing the scorer an empty string.
            return {name: 0.0 for name in metric_names}

        # The summary is the hypothesis, the original text the reference.
        scores = Rouge().get_scores(summary, original_text)[0]

        evaluation = {
            "rouge-1_precision": scores["rouge-1"]["p"],
            "rouge-1_recall": scores["rouge-1"]["r"],
            "rouge-1_f1": scores["rouge-1"]["f"],
            "rouge-2_f1": scores["rouge-2"]["f"],
            "rouge-l_f1": scores["rouge-l"]["f"],
        }

        # Overall score (weighted average of F1 scores)
        evaluation["overall_score"] = (
            0.4 * evaluation["rouge-1_f1"] +
            0.3 * evaluation["rouge-2_f1"] +
            0.3 * evaluation["rouge-l_f1"]
        )
        return evaluation

    def _basic_evaluation(self, original_text: str, summary: str) -> Dict[str, float]:
        """Score the summary by overlap of lower-cased, whitespace-split word sets.

        Used when Rouge is not available. Precision is relative to the
        summary's vocabulary, recall to the original's; empty inputs score 0.
        """
        orig_words = set(original_text.lower().split())
        summary_words = set(summary.lower().split())

        overlap = len(orig_words & summary_words)
        precision = overlap / len(summary_words) if summary_words else 0.0
        recall = overlap / len(orig_words) if orig_words else 0.0
        total = precision + recall
        f1 = 2 * precision * recall / total if total else 0.0

        return {
            "word_precision": precision,
            "word_recall": recall,
            "word_f1": f1,
            "overall_score": f1
        }

    def get_available_models(self) -> List[Dict[str, str]]:
        """
        Returns a list of recommended free models for summarization.

        Returns:
            List[Dict[str, str]]: One dict per model with the keys "name",
            "description", "size" and "strengths". The list and its dicts are
            fresh copies, so callers may modify them freely.
        """
        return [dict(entry) for entry in self._MODEL_CATALOG]



## Main function

In [None]:
# Example usage: load the default model, summarize one news article in three
# styles, score the concise summary and print everything.
if __name__ == "__main__":
    # Sample text to summarize
    sample_text = """
    An ancient jawbone dredged from the Taiwanese seabed has revealed new insights into the appearance and sweeping geographic range of an enigmatic human species called the Denisovans.

    The fossil was discovered by fishers trawling the Penghu Channel off Taiwan and is thought to be the most complete fossil that has been genetically identified as Denisovan. The male individual, who lived at least 10,000 years ago, had a strong jaw and very large, powerful molars.

    “From a tooth or a small bone fragment, there’s the mystery of their appearance,” said Prof Enrico Cappellini, of the University of Copenhagen, a co-senior author on the paper. A Denisovan jaw discovered in Tibet had begun to fill in this picture, and the latest discovery adds to the evidence of a prominent jaw with huge teeth.

    “Now we have a richer image,” Cappellini said. “Of course it would be good to have a skull and the rest of the skeleton, but it’s a step forward.”

    The fossil has been dated to one of two glacial periods when the channel is known to have been above sea level, either between 10,000 and 70,000 years ago or between 130,000 and 190,000 years ago.
    The discovery reveals an impressive geographic range for the ancient species, which lived at the same time as – and interbred with – modern humans and Neanderthals.

    The first Denisovan fossils, identified through analysis of ancient DNA, came from a cave in Siberia and comprised just a finger fragment and a few teeth. Since then, further discoveries show Denisovans also weathered the incredibly harsh conditions of the high-altitude Tibetan plateau, where temperatures can plunge to -30C. By contrast, in south-east Asia they would have lived alongside water buffaloes in a balmy climate.

    “These are climate and environmental conditions that are quite different,” Cappellini said. “The cold environment in Siberia, high altitude in Tibet. We cannot infer anything of their cognitive abilities … but they had an ability to adapt to environments that are quite diverse.”

    """

    try:
        # Load first, so the loader's progress messages do not end up
        # underneath the "Available summarization models" heading.
        summarizer = TextSummarizer()  # Initialize with default model

        # Print available models
        print("Available summarization models:")
        for i, model in enumerate(summarizer.get_available_models(), start=1):
            print(f"{i}. {model['name']} - {model['description']} ({model['size']})")

        print("\nUsing default model:", summarizer.model_name)

        # Generate a concise summary
        print("\nGenerating concise summary...")
        concise_summary = summarizer.summarize(
            sample_text,
            max_length=75,
            min_length=30,
            style="concise"
        )

        # Generate a detailed summary
        print("Generating detailed summary...")
        detailed_summary = summarizer.summarize(
            sample_text,
            style="detailed"
        )

        # Generate bullet points
        print("Generating bullet points...")
        bullet_points = summarizer.summarize(
            sample_text,
            style="bullet-points"
        )

        # Evaluate the concise summary
        print("Evaluating summary...")
        evaluation = summarizer.evaluate_summary(sample_text, concise_summary)

        # Print results
        print("\n--- ORIGINAL TEXT ---")
        # Collapse the literal's indentation and blank lines before wrapping;
        # textwrap.fill would otherwise keep them as wide gaps in the output.
        print(textwrap.fill(" ".join(sample_text.split()), width=80))

        print("\n--- CONCISE SUMMARY ---")
        print(textwrap.fill(concise_summary, width=80))

        print("\n--- DETAILED SUMMARY ---")
        print(textwrap.fill(detailed_summary, width=80))

        print("\n--- BULLET POINTS ---")
        print(bullet_points)

        print("\n--- EVALUATION ---")
        for metric, score in evaluation.items():
            print(f"{metric}: {score:.3f}")

    except Exception as e:
        # Top-level boundary of the demo: report the error and the usual
        # setup fixes instead of a bare traceback.
        print(f"Error: {e}")
        print("\nTroubleshooting:")
        print("1. Make sure you have installed the required packages:")
        print("   pip install torch transformers")
        print("2. For evaluation metrics, install rouge:")
        print("   pip install rouge")
        print("3. If you have limited RAM, try using a smaller model:")
        print("   summarizer = TextSummarizer('t5-small')")

Available summarization models:
Loading model: facebook/bart-large-cnn (this may take a minute on first run)


Device set to use cpu


Model loaded successfully. Using device: cpu
1. facebook/bart-large-cnn - Good balance between quality and speed, trained on CNN/DM dataset (400MB)
2. t5-small - Smaller and faster model, good for resource-constrained environments (242MB)
3. google/pegasus-xsum - State-of-the-art model for extreme summarization (very concise) (2.2GB)
4. philschmid/bart-large-cnn-samsum - Fine-tuned for dialogue summarization (400MB)
5. sshleifer/distilbart-cnn-12-6 - Distilled version of BART, faster with similar quality (306MB)

Using default model: facebook/bart-large-cnn

Generating concise summary...
Generating detailed summary...
Generating bullet points...
Evaluating summary...
Rouge package not found. Using basic evaluation.

--- ORIGINAL TEXT ---
     An ancient jawbone dredged from the Taiwanese seabed has revealed new
insights into the appearance and sweeping geographic range of an enigmatic human
species called the Denisovans.      The fossil was discovered by fishers
trawling the Penghu Cha