<a href="https://colab.research.google.com/github/DmitriyShalashov/T5-Text-Summarization-ROUGE-BERT-scores/blob/main/Text_Summarization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Загрузка данных

In [1]:
import logging
logging.basicConfig(level=logging.ERROR)

In [2]:
import pandas as pd
import os
import string
import re
from datasets import Dataset
from transformers import  Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq, T5ForConditionalGeneration

In [3]:
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("gowrishankarp/newspaper-text-summarization-cnn-dailymail")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/gowrishankarp/newspaper-text-summarization-cnn-dailymail?dataset_version_number=2...


100%|██████████| 503M/503M [00:23<00:00, 22.3MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/gowrishankarp/newspaper-text-summarization-cnn-dailymail/versions/2


In [5]:
for root, dirs, files in os.walk(path):
    level = root.replace(path, '').count(os.sep)
    indent = ' ' * 2 * level
    print(f"{indent}{os.path.basename(root)}/")
    subindent = ' ' * 2 * (level + 1)
    for file in files:
        print(f"{subindent}{file}")

2/
  cnn_dailymail/
    validation.csv
    train.csv
    test.csv


In [6]:
train_dir = path + '/cnn_dailymail/train.csv'
valid_dir = path + '/cnn_dailymail/train.csv'
test_dir = path + '/cnn_dailymail/train.csv'

In [7]:
df_train=pd.read_csv(train_dir)
df_val=pd.read_csv(valid_dir)
df_test=pd.read_csv(test_dir)

In [8]:
df_train.sample(5)

Unnamed: 0,id,article,highlights
236465,be1382972f8b42ad73c32717652f012e09b32f15,London (CNN) -- At least 57 people were arrest...,People accused of having weapons with a violen...
39372,6f3a4791e6b3455876606cf9cd3970034450f51e,"By . Charlie Mccann . PUBLISHED: . 06:58 EST, ...",The first DIY device of its kind it uses light...
11674,212b9806868ab1b04d8afff7282fb0ac58e12e78,Brutal: A youngster said to have been killed f...,IS seized 149 schoolchildren and subjected the...
140783,420bc3f1e7929964d3596df2fb38e3b4e8e7579c,Allyson Ng was jailed for fraudulently helping...,"Allyson Ng, 45, charged £110 to get Mandarin s..."
37621,6a8bd18665530faa59095d2449bb4f16530d2003,By . Daily Mail Reporter . PUBLISHED: . 10:08 ...,"Sharon Carpenter, 61, died in Ehab Aly Mohamed..."


In [9]:
df_train = df_train.sample(10000, random_state=42)
df_val = df_val.sample(4000, random_state=42)
df_test = df_test.sample(2000, random_state=42)

# Фильтрация данных стоп-слова + пунктуация

In [10]:
stop_words = set(stopwords.words("english"))
negation_words= {
    "not", "no", "never", "none", "nobody", "nothing", "neither", "nowhere",
    "hasn't", "haven't", "hadn't", "doesn't", "don't", "didn't",
    "won't", "wouldn't", "can't", "couldn't", "isn't", "aren't", "wasn't", "weren't",
    "without", "nor"
}
filtered_stopwords = stop_words - negation_words
punctuation = set(string.punctuation)

def preprocessing_text(text):
  text =text.lower()

  text = re.sub(r'<[^>]+>', '', text)

  text = re.sub(r"http\S+", "", text)

  text = re.sub(r"[^\w\s]", "", text)

  text = re.sub(r'\s+', ' ', text).strip()

  words = text.split()

  filtered = [word for word in words if word not in filtered_stopwords]

  return " ".join(filtered)
  return text

In [11]:
df_train["article"] = df_train["article"].apply(preprocessing_text)
df_train["article"].iloc[0]

'mia de graaf britons flocked beaches across southern coast yesterday millions look set bask glorious sunshine today temperatures soared 17c brighton dorset people starting long weekend deck chairs sea figures asda suggest unexpected sunshine also inspired wave impromptu barbecues sales sausages equipment expected triple april suns brighton beach packed britons enjoying unexpected sunshine start long weekend temperatures hit 17c across south coast although frost set hit south tonight temperatures dropping 1c britons stocking barbecue luck tomorrow forecasters predicting dry sunny weather across southern england southern wales south midlands weymouth dorset sun came time towns annual kite festival held beach good weather not enjoyed heavy rain poured across north west unfortunately dark clouds intermittent rain across northern england scotland set last throughout long weekend tuesday however north east enjoyed bright spell midday today sun shining harrogate york ahead rainy weekend met 

In [12]:
df_train["highlights"] = df_train["highlights"].apply(preprocessing_text)
df_train["highlights"].iloc[0]

'people enjoyed temperatures 17c brighton beach west sussex weymouth dorset asda claims sell million sausages long weekend despite night temperatures dropping minus 1c good weather not enjoyed north west scotland seen heavy rain'

In [13]:
df_val["article"] = df_val["article"].apply(preprocessing_text)
df_val["article"].iloc[0]

'mia de graaf britons flocked beaches across southern coast yesterday millions look set bask glorious sunshine today temperatures soared 17c brighton dorset people starting long weekend deck chairs sea figures asda suggest unexpected sunshine also inspired wave impromptu barbecues sales sausages equipment expected triple april suns brighton beach packed britons enjoying unexpected sunshine start long weekend temperatures hit 17c across south coast although frost set hit south tonight temperatures dropping 1c britons stocking barbecue luck tomorrow forecasters predicting dry sunny weather across southern england southern wales south midlands weymouth dorset sun came time towns annual kite festival held beach good weather not enjoyed heavy rain poured across north west unfortunately dark clouds intermittent rain across northern england scotland set last throughout long weekend tuesday however north east enjoyed bright spell midday today sun shining harrogate york ahead rainy weekend met 

In [14]:
df_val["highlights"] = df_val["highlights"].apply(preprocessing_text)
df_val["highlights"].iloc[0]

'people enjoyed temperatures 17c brighton beach west sussex weymouth dorset asda claims sell million sausages long weekend despite night temperatures dropping minus 1c good weather not enjoyed north west scotland seen heavy rain'

# Токенизация

In [15]:
from transformers import T5Tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-small")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [16]:
df_train["input_len"] = df_train["article"].apply(lambda x: len(tokenizer.encode(x, truncation=False)))
df_val["input_len"] = df_val["article"].apply(lambda x: len(tokenizer.encode(x, truncation=False)))

print("Train max:", df_train["input_len"].max())
print("Val max:", df_val["input_len"].max())

Token indices sequence length is longer than the specified maximum sequence length for this model (541 > 512). Running this sequence through the model will result in indexing errors


Train max: 2167
Val max: 2103


In [17]:
max_input_len = df_train["input_len"].quantile(0.95)
max_target_len = df_train["highlights"].apply(lambda x: len(tokenizer.encode(x, truncation=False))).quantile(0.95)
print(max_input_len)
print(max_target_len)

1215.0
88.0


In [18]:
def tokenization(examples):
    model_inputs = tokenizer(
        examples["article"],
        max_length=int(max_input_len),
        truncation=True,
        padding="max_length"
    )

    labels = tokenizer(
        examples["highlights"],
        max_length=int(max_target_len),
        truncation=True,
        padding="max_length"
        )

    labels_ids = labels["input_ids"]


    new_labels = []
    for label_seq in labels_ids:
        new_labels.append([
            (label if label != tokenizer.pad_token_id else -100) for label in label_seq
        ])

    model_inputs["labels"] = new_labels

    return model_inputs

In [19]:
train_dataset = Dataset.from_pandas(df_train)
val_dataset = Dataset.from_pandas(df_val)
test_dataset = Dataset.from_pandas(df_test)

In [20]:
train_dataset

Dataset({
    features: ['id', 'article', 'highlights', 'input_len', '__index_level_0__'],
    num_rows: 10000
})

In [21]:
tokenized_train = train_dataset.map(tokenization, batched=True, batch_size=16, remove_columns=train_dataset.column_names)
tokenized_val = val_dataset.map(tokenization, batched=True, batch_size=16, remove_columns=val_dataset.column_names)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

#  Обучение

In [22]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./t5-model_result",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=5,
    predict_with_generate=True,
    report_to="none",
    no_cuda=False,
    dataloader_num_workers=0,
    fp16=True
)

In [23]:
model = T5ForConditionalGeneration.from_pretrained("t5-small")


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [24]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
)

  trainer = Seq2SeqTrainer(


In [25]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [26]:
torch.cuda.is_available()

True

In [27]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,2.8626,2.56273
2,2.7303,2.500046
3,2.7068,2.455302
4,2.685,2.439349
5,2.7001,2.431238


TrainOutput(global_step=25000, training_loss=2.7587990673828124, metrics={'train_runtime': 7502.8996, 'train_samples_per_second': 6.664, 'train_steps_per_second': 3.332, 'total_flos': 1.6058621952e+16, 'train_loss': 2.7587990673828124, 'epoch': 5.0})

In [52]:
from google.colab import drive
drive.mount('/content/drive')

# Сохраняем модель в Google Drive
model.save_pretrained("/content/drive/MyDrive/my_t5_model")
tokenizer.save_pretrained("/content/drive/MyDrive/my_t5_model")

Mounted at /content/drive


('/content/drive/MyDrive/my_t5_model/tokenizer_config.json',
 '/content/drive/MyDrive/my_t5_model/special_tokens_map.json',
 '/content/drive/MyDrive/my_t5_model/spiece.model',
 '/content/drive/MyDrive/my_t5_model/added_tokens.json')

# Результаты

In [35]:
from tqdm import tqdm
tqdm.pandas()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def summarize_text(article):
    article = preprocessing_text(article)

    inputs = tokenizer(
        article,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=512
    )

    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    outputs = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=150,
        min_length=30,
        num_beams=4,
        length_penalty=2.0,
        early_stopping=True
    )

    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return summary


df_test["predicted_summary"] = df_test["article"].progress_apply(summarize_text)

100%|██████████| 2000/2000 [27:28<00:00,  1.21it/s]


In [38]:
print("The Article :\n",df_test["article"].iloc[0])

print(f"The Highlights:\n",df_test["highlights"].iloc[0])

print(f"The Predicted Summary:\n",df_test["predicted_summary"].iloc[0])

The Article :
 By . Mia De Graaf . Britons flocked to beaches across the southern coast yesterday as millions look set to bask in glorious sunshine today. Temperatures soared to 17C in Brighton and Dorset, with people starting their long weekend in deck chairs by the sea. Figures from Asda suggest the unexpected sunshine has also inspired a wave of impromptu barbecues, with sales of sausages and equipment expected to triple those in April. Sun's out: Brighton beach was packed with Britons enjoying the unexpected sunshine to start the long weekend as temperatures hit 17C across the south coast . Although frost is set to hit the south tonight - with temperatures dropping to 1C - Britons stocking up for a barbecue will be in luck tomorrow, with forecasters predicting dry and sunny weather across southern England, southern Wales and the south Midlands. In Weymouth, Dorset, the sun came out in time for the town's annual kite festival, held on the beach. But the good weather has not been enj

In [41]:
!pip install rouge_score
!pip install bert_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=9526d68b0634b54165f7a0440b9d59f2bd87e4403838c3209ba47b757c9c6768
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2
Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert_score
Successfully installed bert_score-0.3.13


In [42]:
from rouge_score import rouge_scorer
from bert_score import score

refs = df_test["highlights"].tolist()
preds = df_test["predicted_summary"].tolist()


P, R, F1 = score(cands=preds, refs=refs, lang="en", verbose=False)

print(f"BERTScore Precision: {P.mean().item():.4f}")
print(f"BERTScore Recall:    {R.mean().item():.4f}")
print(f"BERTScore F1:        {F1.mean().item():.4f}")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore Precision: 0.8182
BERTScore Recall:    0.8226
BERTScore F1:        0.8203


In [50]:
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

rouge1_list = []


for ref, pred in zip(df_test["highlights"], df_test["predicted_summary"]):
    scores = scorer.score(ref, pred)
    rouge1_list.append(scores['rouge1'].fmeasure)

print(f"Average ROUGE-1 F1: {sum(rouge1_list)/len(rouge1_list):.4f}")

Average ROUGE-1 F1: 0.2569
