In [18]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import torch

In [19]:
# Prüfe ob GPU verfügbar ist
device = 0 if torch.cuda.is_available() else -1
print(f"Using device: {'GPU' if device == 0 else 'CPU'}")

Using device: CPU


In [20]:
text = """
"Share This On:

Pin 11 Shares

(NEWS ROOM GUYANA) — Three persons are currently hospitalized in a serious condition following an accident on the Crabwood Creek Public Road on New Year’s morning.

According to information received, motorcar PNN 7976 driven by 22-year-old Seeram Ramdat was speeding when it collided with a utility pole, injuring the driver and two passengers.

The News Room understands that while driving over the Blackwater Creek Bridge, Ramdat lost control of the vehicle which turned turtle and careened about 200 feet away before crashing into the utility pole and coming to a halt on a resident’s bridge.

The two occupants, 32-year-old Keron Phillips and 45-year-old Ramnand Kishwar were removed from the wreck in semi-conscious states and rushed to the Skeldon hospital.

The driver fled the scene and was subsequently apprehended at his Lot 80 Grant 1718 Crabwood Creek home in a traumatic state. He was also taken to the Skeldon Hospital where he is admitted in a stable condition.

The News Room understands that the vehicle is owned by an elderly woman, and Ramdatt took it without her knowledge.

Police Commissioner Leslie James on Wednesday disclosed that there has been an 8% increase in road fatalities in 2018.

( 0 ) ( 0 )"

"""

max_new_tokens = 128
min_length = 100

# mT5_multilingual_XLSum

In [21]:
model_name = "csebuetnlp/mT5_multilingual_XLSum"

# 3. Tokenizer und Modell laden
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# 4. Pipeline definieren
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, device=device)

Device set to use cpu


In [22]:
def summarize_article_mt5(text):
    # Spezifisches Prepending wie im XLSum Training
    formatted_text = "summarize: " + text.strip()

    input_ids = tokenizer.encode(formatted_text, return_tensors="pt", truncation=True, max_length=512)
    input_ids = input_ids.to(model.device)

    output_ids = model.generate(
        input_ids,
        max_new_tokens=max_new_tokens,
        min_length=min_length,
        num_beams=4,
        length_penalty=1.5,  # Optional: Bevorzuge etwas längere Texte
        no_repeat_ngram_size=3,
        early_stopping=True,
    )

    summary = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return summary

summary = summarize_article_mt5(text)
print("🔎 Zusammenfassung:\n", summary)

🔎 Zusammenfassung:


In [24]:
import pandas as pd
from tqdm import tqdm

df = pd.read_csv("data/full_dataset.csv")

for idx, row in tqdm(df.iterrows(), total=len(df)):    
    summary1 = summarize_article_mt5(row["text1"])
    summary2 = summarize_article_mt5(row["text2"])

    df.at[idx, "summary1"] = summary1
    df.at[idx, "summary2"] = summary2
    
df.to_csv("data/full_dataset_with_summaries.csv", index=False)

  1%|          | 16/2736 [16:00<45:21:32, 60.03s/it] 


KeyboardInterrupt: 

In [11]:
prompt = f"Summarize the contents of the following article in its original language, preserving as much information as possible. Focus on the content.\n\n{text}"

summary = summarizer(prompt, max_new_tokens=max_new_tokens, max_length = max_new_tokens, min_length=min_length, do_sample=False)[0]

print("Zusammenfassung: ", summary)

Both `max_new_tokens` (=128) and `max_length`(=128) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)




## MT5 Sum chunked Summaries

In [23]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import torch
import nltk
import textwrap

# Download Punkt tokenizer
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

# 1. Set device
device = 0 if torch.cuda.is_available() else -1

# 2. Load summarization model and tokenizer
model_name = "csebuetnlp/mT5_multilingual_XLSum"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, device=device)

# 3. Define helper to split long text into chunks of approx. 512 tokens
def chunk_text(text, tokenizer, max_tokens=512):
    sentences = sent_tokenize(text)
    chunks = []
    current_chunk = ""

    for sentence in sentences:
        candidate = f"{current_chunk} {sentence}".strip()
        tokenized_len = len(tokenizer.encode(candidate, add_special_tokens=False))
        if tokenized_len <= max_tokens:
            current_chunk = candidate
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sentence
    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks

# 4. Summarize a single long text hierarchically
def summarize_hierarchical(text, max_new_tokens=128):
    chunks = chunk_text(text, tokenizer)
    chunk_summaries = []

    for chunk in chunks:
        summary = summarizer(
            chunk,
            max_new_tokens=max_new_tokens,
            max_length = max_new_tokens,
            min_length=min_length,
            no_repeat_ngram_size=4,
            do_sample=False
        )[0]['summary_text']
        chunk_summaries.append(summary)

    # Meta-summarization step
    meta_input = " ".join(chunk_summaries)
    final_summary = summarizer(
        meta_input,
        max_new_tokens=max_new_tokens,
        max_length = max_new_tokens,
        min_length=min_length,
        length_penalty=1.2,
        no_repeat_ngram_size=4,
        do_sample=False
    )[0]['summary_text']

    return final_summary


# Run hierarchical summarization
summary_result = summarize_hierarchical(text)
print(summary_result)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\laraw\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Device set to use cpu
Both `max_new_tokens` (=128) and `max_length`(=128) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Your max_length is set to 128, but your input_length is only 117. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=58)
Both `max_new_tokens` (=128) and `max_length`(=128) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


The number of people killed in a crash on New Year's Day has risen by almost a third, according to the latest figures from the Department of Motoring and Crime (DMRC) and the Department of Drivers and Drivers (DfD) in England and Wales, which have been linked to an 8% increase in fatalities. . . (Below is a full transcript of a crash in Northamptonshire.)


# MT5 Base

In [25]:
# Lade den Summarizer
model_name = "google/mt5-base"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

summarizer = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=device)

Device set to use cpu


In [28]:
prompt = f"Summarize: {text}"

summary = summarizer(prompt, max_new_tokens=max_new_tokens, min_length=min_length, do_sample=False)[0]

print("Zusammenfassung: ", summary)


Zusammenfassung:  {'generated_text': '<extra_id_0>, which is a lot more serious, <extra_id_1> is a lot more serious." "Share This On: Pin 11 Shares <extra_id_2>, <extra_id_40> <extra_id_41> <extra_id_41> <extra_id_56>.. "Pin 11 Shares Share This On: Pin 11 Shares Share This On: Pin 11 Shares "Share This On: Pin 11 Shares " Share This On: Pin 11 Shares " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " " "'}


# Flan T5 Base

In [None]:
# Modellname
model_name = "google/flan-t5-base"

# Tokenizer und Modell laden
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Pipeline erstellen
summarizer = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=device)

Device set to use cpu


In [None]:
# prompt = f"Summarize the following article in its original language, preserving as much information as possible, while being concise. Avoid teaser-like introductions or vague phrases. Shorten the input to about 128 tokens.\n\n{text}"

prompt = f"Summarize the following article as a short and dense summary. Focus only on the facts, avoid introductions or generic phrases. Keep the summary under 100 words.\n\n{text}"

summary = summarizer(prompt, max_new_tokens=max_new_tokens, min_length=min_length, do_sample=False)[0]

print("Zusammenfassung: ", summary)


Zusammenfassung:  {'generated_text': 'Three persons are currently hospitalized in a serious condition following an accident on the Crabwood Creek Public Road on New Year’s morning. According to information received, motorcar PNN 7976 driven by 22-year-old Seeram Ramdat was speeding when it collided with a utility pole, injuring the driver and two passengers. The News Room understands that while driving over the Blackwater Creek Bridge, Ramdat lost control of the vehicle which turned turtle and careened about 200 feet away before crashing into the utility pole and coming to a halt on a resident’'}


# M-Bart Large

In [None]:
import pandas as pd
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast, pipeline

# Modell laden
model_name = "facebook/mbart-large-cc25"
tokenizer = MBart50TokenizerFast.from_pretrained(model_name)
model = MBartForConditionalGeneration.from_pretrained(model_name)

summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, device=device)


sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'MBartTokenizer'. 
The class this function is called from is 'MBart50TokenizerFast'.
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/205 [00:00<?, ?B/s]

Device set to use cpu


In [None]:
def summarize_text_m_bart(text, target_token_count=128):
    try:
        summary = summarizer(
            text,
            max_length=target_token_count,
            min_length=target_token_count//2,
            do_sample=False
        )[0]["summary_text"]
        return summary
    except Exception as e:
        return f"[Fehler: {e}]"


summary = summarize_text_m_bart(text)

print("Zusammenfassung: ", summary)

Zusammenfassung:  ( 0 ) Police Commissioner Police Commissioner Leslie James on Wednesday disclosed that there has been an 8% increase in road fatal fatal fatal fatalities in road fatalities in road fatalities in road fatalities in road fatalities in police. Police. Police. Police. Police. Police. Police. Police. Police. Police. Police, , owned by an 8% increase in an 8% increase in road fatalities in 2018. ( 0 )""""


In [None]:
# Modell laden
model_name = "facebook/mbart-large-cc25"
tokenizer = MBart50TokenizerFast.from_pretrained(model_name)
model = MBartForConditionalGeneration.from_pretrained(model_name)

# Sprache setzen (für englischen Text)
tokenizer.src_lang = "en_XX"

def summarize(text, target_lang="en_XX", max_tokens=128):
    inputs = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True)
    
    # Optional: Zielsprachen-Token setzen (bei multilingualer Ausgabe)
    generated_ids = model.generate(
        inputs["input_ids"],
        max_length=max_tokens,
        min_length=int(max_tokens * 0.5),
        length_penalty=1.0,
        num_beams=4,
        early_stopping=True,
        decoder_start_token_id=tokenizer.lang_code_to_id[target_lang],
    )
    return tokenizer.decode(generated_ids[0], skip_special_tokens=True)

summary = summarize(text)
print("Zusammenfassung:", summary)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'MBartTokenizer'. 
The class this function is called from is 'MBart50TokenizerFast'.


Zusammenfassung: "Share This On: Pin 11 Shares (NEWS ROOM GUYANA) — Three persons are currently hospitalized in a serious condition following an accident on the Crabwood Creek Public Road on New Year’s morning. According to information received, motorcar PNN 7976 driven by 22-year-old Seeram Ramdat was speeding when it collided with a utility pole, injuring the driver and two passengers. The News Room understands that while driving over the Blackwater Creek Bridge, Ramdat lost control of the vehicle which turned turtle and careened about 200 feet
