In [2]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

model = AutoModelForSeq2SeqLM.from_pretrained("./brevity_small_stage2")
tokenizer = AutoTokenizer.from_pretrained("./brevity_small_stage2")

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()

def chunk_text(text, chunk_size=512):
    """ Function to split text into chunks smaller than the max token length. """
    tokens = tokenizer.encode(text, truncation=False)
    chunks = [tokens[i:i + chunk_size] for i in range(0, len(tokens), chunk_size)]
    return chunks

def summarize_chunks(chunks):
    """ Function to summarize each chunk and return the combined summary. """
    summaries = []
    
    for chunk in chunks:
        # Decode tokens back into text
        chunk_text = tokenizer.decode(chunk, skip_special_tokens=True)
        
        # Dynamically adjust chunk and summary lengths based on the input size
        chunk_size = 512 if len(chunk_text.split()) < 1000 else 1024  # Adjust input chunk size
        target_length = 150 if len(chunk_text.split()) < 1000 else 250  # Adjust output summary length

        inputs = tokenizer(
            chunk_text,
            return_tensors="pt",
            max_length=chunk_size,
            truncation=True,
            padding="max_length"
        ).to(device)

        with torch.no_grad():
            summary_ids = model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_length=target_length,  # Dynamically adjusted target length
                min_length=target_length // 2,
                num_beams=4,
                early_stopping=True,
                length_penalty=1.2,
                no_repeat_ngram_size=3
            )

        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        summaries.append(summary)

    # Combine summaries
    combined_summary = " ".join(summaries)
    return combined_summary

# Main code for reading text and summarizing it
with open('sample.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Step 1: Split the text into chunks
chunks = chunk_text(text)

# Step 2: Summarize each chunk
final_summary = summarize_chunks(chunks)

# Print the final summary
print("\n✅ Combined Chunk Summary:\n", final_summary)



✅ Combined Chunk Summary:
 Sentiment analysis is used to classify customer reviews on various online platforms .
Toxicity classification is a branch of NLP where the aim is to classify hostile intent .
Spam detection is a prevalent binary classification problem in NLP .
Machine translation automates translation between different languages .
Named entity recognition aims to extract entities in a piece of text into predefined categories . This is the latest in a series of attempts to solve the problems of the world's most serious problems .
It is the first attempt at solving the problem of global economic woes .
Inventive efforts have been made in the U.S. have failed to reach the same level of success as in the world of economic prosperity .
The world's best-selling company is now looking for ways to solve this problem .


In [3]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, GenerationConfig
import torch
from difflib import SequenceMatcher

# Load model and tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained("./brevity_small_stage3")
tokenizer = AutoTokenizer.from_pretrained("./brevity_small_stage3")

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()

def is_similar(s1, s2, threshold = 0.85):
    return SequenceMatcher(None, s1.strip(), s2.strip()).ratio() > threshold

def filter_redundant(summaries):
    filtered = []

    for summary in summaries:
        if all(not is_similar(summary, existing) for existing in filtered):
            filtered.append(summary)
    return filtered

def chunk_text(text, chunk_size=512, overlap = 50):
    """ Function to split text into chunks smaller than the max token length. """
    tokens = tokenizer.encode(text, truncation=False)

    stride = chunk_size - overlap
    chunks = []

    for i in range(0, len(tokens), stride):
        chunk = tokens[i:i + chunk_size]
        chunks.append(chunk)

        # Stop if the chunk is smaller than chunk_size (last part)
        if i + chunk_size >= len(tokens):
            break
    return chunks

def summarize_chunks(chunks):
    """ Function to summarize each chunk and return the combined summary. """
    summaries = []

    for chunk in chunks:
        # Decode tokens back into text
        chunk_text = tokenizer.decode(chunk, skip_special_tokens=True)
        
        # Dynamically adjust chunk and summary lengths based on the input size
        chunk_size = 512 if len(chunk_text.split()) < 1000 else 1024
        target_length = 150 if len(chunk_text.split()) < 1000 else 250

        inputs = tokenizer(
            chunk_text,
            return_tensors="pt",
            max_length=chunk_size,
            truncation=True,
            padding="max_length"
        ).to(device)

        with torch.no_grad():
            summary_ids = model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_length=target_length,
                min_length=target_length // 2,
                num_beams=4,
                early_stopping=True,
                length_penalty=1.2,
                no_repeat_ngram_size=3,
            )

        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        summaries.append(summary)

    summaries = filter_redundant(summaries)

    # Combine summaries
    combined_summary = " ".join(summaries)
    return combined_summary

# Main code for reading text and summarizing it
with open('longerVariants.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Step 1: Split the text into chunks
chunks = chunk_text(text)

# Step 2: Summarize each chunk
final_summary = summarize_chunks(chunks)

# Print the final summary
print("\n✅ Combined Chunk Summary:\n", final_summary)

Token indices sequence length is longer than the specified maximum sequence length for this model (1068 > 1024). Running this sequence through the model will result in indexing errors



✅ Combined Chunk Summary:
 Climate change is already here, unfolding in real-time across every continent .
Many governments still prioritize short-term economic interests over long-term sustainability .
Collaboration is the cornerstone of climate action .
Technology, regulation, education, and most importantly, collaboration are needed .
The power of informed citizens cannot be overstated, says co-authors . Climate justice must be at the heart of every conversation, authors say .
They say people least responsible for global emissions often bear the brunt of its consequences .
The climate crisis is no longer an environmental issue; it is an everything issue .
It affects jobs, migration, public health, geopolitical stability, and intergenerational justice .
Author: "Let us not wait for perfect solutions but commit to imperfect action that paves the way" The time to act is not tomorrow or next year -- it is now .
Let us not divide the world into victims and saviors but recognize our shar