# Mercedes F1 Infringement Profile - Pegasus Summarization

This notebook applies the pretrained Pegasus model (abstractive summarization) to Mercedes F1 infringement documents.

## Objective:
- Process all `no_footer_` files from the preprocessed dataset
- Apply Pegasus model for abstractive summarization
- Maintain temporal analysis (year-by-year summaries)
- Generate abstractive summaries that compose new sentences rather than only extracting existing ones

## Input:
- `pre_proc_op/` folder containing `no_footer_*.txt` files organized by year

## Output:
- Console summaries for each year
- Overall consolidated summary
- Results saved in `pegasus_results/` folder


In [1]:
# Import required libraries
import os
import re
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Pegasus implementation
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
import torch

print("Libraries imported successfully")


Libraries imported successfully


In [2]:
# Configuration: where the preprocessed documents live and which years to cover
processed_base_path = Path("pre_proc_op")
years = [str(year_number) for year_number in range(2020, 2025)]

# Pegasus model configuration
MODEL_NAME = "google/pegasus-xsum"  # XSum checkpoint used for abstractive summarization
MAX_LENGTH = 512  # Upper bound passed as max_length when generating a summary
MIN_LENGTH = 50   # Lower bound passed as min_length when generating a summary
# Intended input-token budget.
# NOTE(review): no cell visible here reads this constant; confirm the value
# against the chosen checkpoint before relying on it.
MAX_INPUT_LENGTH = 1024

# Echo the settings so the run is self-describing in the cell output.
print(
    "Configuration:",
    f"Processed base path: {processed_base_path}",
    f"Years to analyze: {years}",
    f"Model: {MODEL_NAME}",
    f"Max summary length: {MAX_LENGTH}",
    f"Min summary length: {MIN_LENGTH}",
    sep="\n",
)


Configuration:
Processed base path: pre_proc_op
Years to analyze: ['2020', '2021', '2022', '2023', '2024']
Model: google/pegasus-xsum
Max summary length: 512
Min summary length: 50


In [3]:
# Initialize Pegasus summarizer
print("Loading Pegasus model...")
print("This may take a few minutes on first run (downloading model)...")

# Decide the device once so the load call and the status messages always agree.
use_gpu = torch.cuda.is_available()

try:
    # Use pipeline for simplicity
    summarizer = pipeline(
        "summarization",
        model=MODEL_NAME,
        device=0 if use_gpu else -1,  # 0 = first CUDA device, -1 = CPU
        framework="pt"
    )
    print("Pegasus model loaded successfully")
    print(f"Using device: {'GPU' if use_gpu else 'CPU'}")
except Exception as e:
    print(f"Error loading model: {e}")
    if not use_gpu:
        # The failed attempt already targeted the CPU, so a "fallback" would
        # just repeat the identical call. Surface the real error instead.
        raise
    print("Falling back to CPU...")
    summarizer = pipeline(
        "summarization",
        model=MODEL_NAME,
        device=-1,
        framework="pt"
    )
    print("Pegasus model loaded on CPU")


Loading Pegasus model...
This may take a few minutes on first run (downloading model)...


config.json: 0.00B [00:00, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

: 

In [None]:
# Helper functions to chunk long texts
def _split_oversized(sentence, max_chars):
    """Break one over-long sentence into pieces of at most max_chars characters.

    Cuts at the last space that keeps the piece within the limit; when there is
    no such space (one very long token) it cuts at exactly max_chars.
    """
    pieces = []
    remaining = sentence
    while len(remaining) > max_chars:
        cut = remaining.rfind(' ', 0, max_chars + 1)
        if cut <= 0:
            # No usable word boundary: fall back to a hard cut.
            cut = max_chars
        head = remaining[:cut].rstrip()
        if head:
            pieces.append(head)
        remaining = remaining[cut:].lstrip()
    if remaining:
        pieces.append(remaining)
    return pieces


def chunk_text(text, max_chars=4000):
    """
    Split text into chunks of at most max_chars characters.

    Text that already fits is returned unchanged as a single-element list.
    Longer text is split on '. ' sentence boundaries (a rough approximation)
    and the sentences are packed greedily, joined by single spaces. A sentence
    that alone exceeds max_chars is broken up further so that every returned
    chunk respects the limit.

    Note: the character count is only a rough proxy for the model's token
    limit; it does not guarantee that a chunk fits the model input.

    Args:
        text: The text to split.
        max_chars: Maximum number of characters per chunk.

    Returns:
        A list of chunk strings. May be empty when an over-long text contains
        only whitespace.

    Raises:
        ValueError: If splitting is required and max_chars is less than 1.
    """
    if len(text) <= max_chars:
        return [text]
    if max_chars < 1:
        raise ValueError("max_chars must be a positive integer")

    # str.split drops the '. ' separator, so restore the period on every piece
    # except the last one (which keeps whatever ending the text already had).
    fragments = text.split('. ')
    sentences = [fragment + '.' for fragment in fragments[:-1]]
    sentences.append(fragments[-1])

    # Normalise to non-empty units that each fit within max_chars.
    units = []
    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            continue
        if len(sentence) > max_chars:
            units.extend(_split_oversized(sentence, max_chars))
        else:
            units.append(sentence)

    # Greedy packing: extend the current chunk while the result still fits.
    chunks = []
    current_chunk = ""
    for unit in units:
        candidate = f"{current_chunk} {unit}" if current_chunk else unit
        if len(candidate) <= max_chars:
            current_chunk = candidate
        else:
            chunks.append(current_chunk)
            current_chunk = unit

    if current_chunk:
        chunks.append(current_chunk)

    return chunks

# Helper functions to summarize long texts
def _summarize_chunk(chunk, summarizer, max_length, min_length):
    """Run the summarizer on one piece of text and return the generated summary string."""
    result = summarizer(
        chunk,
        max_length=max_length,
        min_length=min_length,
        do_sample=False,
        # Ask the pipeline to clip inputs that exceed the model's input limit
        # instead of passing them through unmodified. Chunking is by
        # characters, so a chunk can still be too long in tokens.
        truncation=True,
    )
    return result[0]['summary_text']


def summarize_long_text(text, summarizer, max_length=MAX_LENGTH, min_length=MIN_LENGTH):
    """
    Summarize text, handling long inputs by chunking.

    Short text is summarized in one call. Longer text is split with
    chunk_text, each chunk is summarized, and the chunk summaries are joined
    with spaces. If the joined result is itself longer than the chunking
    budget, it is re-chunked and summarized again for a bounded number of
    rounds.

    Args:
        text: The text to summarize.
        summarizer: Callable summarization pipeline returning a list of dicts
            with a 'summary_text' key.
        max_length: Passed to the summarizer as max_length.
        min_length: Passed to the summarizer as min_length.

    Returns:
        The summary string. Returns "" for empty/whitespace-only input or when
        summarization fails for every chunk.
    """
    recombine_limit = 4000   # same character budget chunk_text uses by default
    max_reduce_rounds = 3    # guard against summaries that refuse to shrink

    if not text or not text.strip():
        # Nothing to summarize; avoid sending an empty prompt to the model.
        return ""

    chunks = chunk_text(text)

    if len(chunks) == 1:
        # Single chunk - summarize directly
        try:
            return _summarize_chunk(chunks[0], summarizer, max_length, min_length)
        except Exception as e:
            print(f"Error in summarization: {e}")
            return ""

    # Multiple chunks - summarize each and combine; a failing chunk is
    # reported and skipped so the remaining chunks still contribute.
    chunk_summaries = []
    for i, chunk in enumerate(chunks):
        try:
            chunk_summaries.append(
                _summarize_chunk(chunk, summarizer, max_length, min_length)
            )
        except Exception as e:
            print(f"Error summarizing chunk {i+1}/{len(chunks)}: {e}")

    combined = " ".join(chunk_summaries)

    # If combined is still too long, re-chunk and summarize again rather than
    # feeding one over-long input to the model (which would be truncated).
    rounds = 0
    while len(combined) > recombine_limit and rounds < max_reduce_rounds:
        reduced_parts = []
        try:
            for piece in chunk_text(combined):
                reduced_parts.append(
                    _summarize_chunk(piece, summarizer, max_length, min_length)
                )
        except Exception as e:
            print(f"Error in final summarization: {e}")
            return combined
        reduced = " ".join(reduced_parts)
        if not reduced or len(reduced) >= len(combined):
            # No progress; keep the best result so far.
            break
        combined = reduced
        rounds += 1

    return combined

print("Helper functions defined")


In [None]:
# Process documents year by year
print("MERCEDES F1 INFRINGEMENT PROFILE - PEGASUS SUMMARIZATION")

yearly_summaries = {}  # year -> summary text
yearly_stats = {}      # year -> {'files_processed', 'total_chars', 'summary_length'}

for year in years:
    print(f"\nPROCESSING {year} - MERCEDES F1 INFRINGEMENTS")

    year_path = processed_base_path / year

    if not year_path.exists():
        print(f"Folder {year_path} does not exist")
        continue

    # Get all no_footer_ files. glob order depends on the filesystem, so sort
    # to keep the combined text (and therefore the summary) reproducible.
    no_footer_files = sorted(year_path.glob("no_footer_*.txt"))

    if not no_footer_files:
        print(f"No no_footer_ files found in {year}")
        continue

    print(f"Found {len(no_footer_files)} processed documents in {year}")

    # Read all non-empty documents for the year; unreadable files are
    # reported and skipped (best effort).
    documents = []
    for file_path in no_footer_files:
        try:
            content = file_path.read_text(encoding='utf-8').strip()
        except Exception as e:
            print(f"Error reading {file_path.name}: {e}")
            continue
        if content:
            documents.append(content)

    if not documents:
        print(f"No valid content found in {year}")
        continue

    # Join once instead of growing a string inside the loop.
    combined_text = " ".join(documents)
    total_chars = sum(len(document) for document in documents)
    processed_files = len(documents)

    print(f"Processed {processed_files} files, {total_chars:,} total characters")

    # Generate summary using Pegasus
    print(f"\nGenerating Pegasus summary for {year}...")
    print("This may take a moment...")

    summary = summarize_long_text(
        combined_text,
        summarizer,
        max_length=MAX_LENGTH,
        min_length=MIN_LENGTH
    )

    if not summary:
        # An empty result means summarization failed; do not record the year
        # as processed or feed an empty entry into the overall summary.
        print(f"No summary could be generated for {year}; skipping")
        continue

    # Store results
    yearly_summaries[year] = summary
    yearly_stats[year] = {
        'files_processed': processed_files,
        'total_chars': total_chars,
        'summary_length': len(summary)
    }

    # Display summary
    print(f"\nPEGASUS SUMMARY FOR {year}:")
    print("-" * 50)
    print(summary)
    print("-" * 50)
    print(f"Summary length: {len(summary):,} characters")
    # total_chars > 0 here because every kept document is non-empty.
    print(f"Compression ratio: {(len(summary)/total_chars)*100:.1f}%")

print(f"\nProcessed {len(yearly_summaries)} years successfully")


In [None]:
# Generate overall consolidated summary
print("\nOVERALL CONSOLIDATED SUMMARY - ALL YEARS")

if not yearly_summaries:
    print("No summaries generated. Please check the input files.")
else:
    # Label each yearly summary with its year and concatenate them into one text.
    labelled_summaries = []
    for year, summary in yearly_summaries.items():
        labelled_summaries.append(f"{year} Summary: {summary} ")
    all_summaries_text = " " + "".join(labelled_summaries)

    # Summarize the concatenated yearly summaries.
    print("Generating overall consolidated summary...")
    print("This may take a moment...")

    overall_summary = summarize_long_text(
        all_summaries_text.strip(),
        summarizer,
        max_length=MAX_LENGTH,
        min_length=MIN_LENGTH
    )

    banner = "=" * 50
    print("\nOVERALL MERCEDES F1 INFRINGEMENT PROFILE (PEGASUS):")
    print(banner)
    print(overall_summary)
    print(banner)

    # Aggregate the per-year statistics.
    per_year_stats = list(yearly_stats.values())
    total_files = sum(entry['files_processed'] for entry in per_year_stats)
    total_chars = sum(entry['total_chars'] for entry in per_year_stats)
    overall_ratio = (len(overall_summary) / total_chars) * 100

    print("\nOVERALL STATISTICS:")
    print(f"Total documents processed: {total_files}")
    print(f"Total characters analyzed: {total_chars:,}")
    print(f"Overall summary length: {len(overall_summary):,} characters")
    print(f"Overall compression ratio: {overall_ratio:.1f}%")

    print("\nYEARLY BREAKDOWN:")
    for year, stats in yearly_stats.items():
        # Report 0 rather than dividing by zero for a year without characters.
        if stats['total_chars'] > 0:
            compression = (stats['summary_length'] / stats['total_chars']) * 100
        else:
            compression = 0
        print(f"{year}: {stats['files_processed']} docs, {stats['total_chars']:,} chars -> {stats['summary_length']:,} chars ({compression:.1f}%)")


In [None]:
# Save results to files
print("\nSAVING RESULTS")

# Create output directory (no error if it already exists)
output_dir = Path("pegasus_results")
output_dir.mkdir(exist_ok=True)

rule = "=" * 50

# One text file per year: title, rule, blank line, then the summary.
for year, summary in yearly_summaries.items():
    output_file = output_dir / f"pegasus_summary_{year}.txt"
    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines([
            f"Mercedes F1 Infringement Summary - {year}\n",
            rule + "\n\n",
            summary,
        ])
    print(f"Saved {year} summary to {output_file}")

# Overall summary file, followed by a statistics section.
if yearly_summaries:
    overall_file = output_dir / "pegasus_overall_summary.txt"
    with open(overall_file, 'w', encoding='utf-8') as f:
        f.writelines([
            "Mercedes F1 Infringement Profile - Overall Summary (Pegasus)\n",
            rule + "\n\n",
            overall_summary,
            "\n\n" + rule + "\n",
            "STATISTICS\n",
            rule + "\n",
        ])
        # Recompute the totals here so this cell does not depend on values
        # left behind by the reporting cell.
        total_files = sum(stats['files_processed'] for stats in yearly_stats.values())
        total_chars = sum(stats['total_chars'] for stats in yearly_stats.values())
        f.writelines([
            f"Total documents processed: {total_files}\n",
            f"Total characters analyzed: {total_chars:,}\n",
            f"Overall summary length: {len(overall_summary):,} characters\n",
            f"Overall compression ratio: {(len(overall_summary)/total_chars)*100:.1f}%\n",
        ])
    print(f"Saved overall summary to {overall_file}")

print("\nAll results saved successfully!")
