In [2]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

model = AutoModelForSeq2SeqLM.from_pretrained("./brevity_small_stage2")
tokenizer = AutoTokenizer.from_pretrained("./brevity_small_stage2")

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()

def chunk_text(text, chunk_size=512):
    """ Function to split text into chunks smaller than the max token length. """
    tokens = tokenizer.encode(text, truncation=False)
    chunks = [tokens[i:i + chunk_size] for i in range(0, len(tokens), chunk_size)]
    return chunks

def summarize_chunks(chunks):
    """ Function to summarize each chunk and return the combined summary. """
    summaries = []
    
    for chunk in chunks:
        # Decode tokens back into text
        chunk_text = tokenizer.decode(chunk, skip_special_tokens=True)
        
        # Dynamically adjust chunk and summary lengths based on the input size
        chunk_size = 512 if len(chunk_text.split()) < 1000 else 1024  # Adjust input chunk size
        target_length = 150 if len(chunk_text.split()) < 1000 else 250  # Adjust output summary length

        inputs = tokenizer(
            chunk_text,
            return_tensors="pt",
            max_length=chunk_size,
            truncation=True,
            padding="max_length"
        ).to(device)

        with torch.no_grad():
            summary_ids = model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_length=target_length,  # Dynamically adjusted target length
                min_length=target_length // 2,
                num_beams=4,
                early_stopping=True,
                length_penalty=1.2,
                no_repeat_ngram_size=3
            )

        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        summaries.append(summary)

    # Combine summaries
    combined_summary = " ".join(summaries)
    return combined_summary

# Main code for reading text and summarizing it
with open('sample.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Step 1: Split the text into chunks
chunks = chunk_text(text)

# Step 2: Summarize each chunk
final_summary = summarize_chunks(chunks)

# Print the final summary
print("\n✅ Combined Chunk Summary:\n", final_summary)



✅ Combined Chunk Summary:
 Sentiment analysis is used to classify customer reviews on various online platforms .
Toxicity classification is a branch of NLP where the aim is to classify hostile intent .
Spam detection is a prevalent binary classification problem in NLP .
Machine translation automates translation between different languages .
Named entity recognition aims to extract entities in a piece of text into predefined categories . This is the latest in a series of attempts to solve the problems of the world's most serious problems .
It is the first attempt at solving the problem of global economic woes .
Inventive efforts have been made in the U.S. have failed to reach the same level of success as in the world of economic prosperity .
The world's best-selling company is now looking for ways to solve this problem .


In [2]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, GenerationConfig
import torch

# Load model and tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained("./brevity_small_stage3")
tokenizer = AutoTokenizer.from_pretrained("./brevity_small_stage3")

# Override generation config to fix early_stopping issue
# model.generation_config = GenerationConfig.from_pretrained(
#     "./brevity_small_stage3",
#     early_stopping=True
# )

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()

# print("Device:", device)
# print("Model loaded successfully.")

def chunk_text(text, chunk_size=512):
    """ Function to split text into chunks smaller than the max token length. """
    tokens = tokenizer.encode(text, truncation=False)
    chunks = [tokens[i:i + chunk_size] for i in range(0, len(tokens), chunk_size)]
    return chunks

def summarize_chunks(chunks):
    """ Function to summarize each chunk and return the combined summary. """
    summaries = []

    for chunk in chunks:
        # Decode tokens back into text
        chunk_text = tokenizer.decode(chunk, skip_special_tokens=True)
        
        # Dynamically adjust chunk and summary lengths based on the input size
        chunk_size = 512 if len(chunk_text.split()) < 1000 else 1024
        target_length = 150 if len(chunk_text.split()) < 1000 else 250

        inputs = tokenizer(
            chunk_text,
            return_tensors="pt",
            max_length=chunk_size,
            truncation=True,
            padding="max_length"
        ).to(device)

        with torch.no_grad():
            summary_ids = model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_length=target_length,
                min_length=target_length // 2,
                num_beams=4,
                early_stopping=True,
                length_penalty=1.2,
                no_repeat_ngram_size=3,
            )

        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        summaries.append(summary)

    # Combine summaries
    combined_summary = " ".join(summaries)
    return combined_summary

# Main code for reading text and summarizing it
with open('sample.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Step 1: Split the text into chunks
chunks = chunk_text(text)

# Step 2: Summarize each chunk
final_summary = summarize_chunks(chunks)

# Print the final summary
print("\n✅ Combined Chunk Summary:\n", final_summary)


✅ Combined Chunk Summary:
 Ukraine has launched a 'large-scale' drone attack on Russian military bombers in Siberia .
More than 40 warplanes were reportedly hit, including Tu-95 and Tu-22 strategic bombers .
Drones were smuggled into Russia and concealed under roofs of wooden sheds .
Ukraine's SBU domestic intelligence agency said it had hit Russian planes worth a combined $7bn (£5.2bn) in the wave of drone strikes . Ukrainian president: 'We had been preparing the operation for more than a year and a half'
He said: 'Thirty-four per cent of the strategic cruise missile carriers at the airfields were hit.
In total, 117 drones were used in the operation.
The office of Ukraine's operation in Russia had been located next to the FSB office .
