### Load libraries

In [1]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import Dataset,DataLoader
from tqdm.auto import tqdm
import torch.utils.data as data
import pandas as pd
import torch
import transformers

torch.backends.cudnn.enabled = True
torch.backends.cudnn.benchmark = True

  from .autonotebook import tqdm as notebook_tqdm


### Load tokenizer and model

In [2]:
tokenizer = T5Tokenizer.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base")

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


### Data processing

In [3]:
from datasets import load_dataset
from sklearn.model_selection import train_test_split

dataset = load_dataset("json", data_files="train.json")
body_text = dataset['train']['body']
title_text = dataset['train']['title']
input_text = []
summary = []
data_size = 100000
for body in range(data_size):
    input_text.append('summarize: ' + str(body_text[body]))
for title in range(data_size):
    summary.append(str(title_text[title]))

input_train, input_val, summary_train, summary_val = train_test_split(input_text, summary, test_size=0.1, random_state=42)


Found cached dataset json (C:/Users/user/.cache/huggingface/datasets/json/default-ca6af2841e934a3b/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)
100%|██████████| 1/1 [00:00<00:00, 42.04it/s]


### Build datasets

In [4]:
class HeadlineGenerationDataset(Dataset):
    def __init__(self, text, summary, tokenizer, max_len = 512):
        self.data = []
        for t, s in zip(text, summary):
            input_t = tokenizer(t, truncation=True, padding="max_length", max_length=max_len)
            label_t = tokenizer(s, truncation=True, padding="max_length", max_length=max_len)

            #轉換-100
            for cnt,tmp in enumerate(label_t['input_ids']):
                if tmp == 0:
                    label_t['input_ids'][cnt] = -100
                    
            self.data.append({'input_ids':torch.tensor(input_t['input_ids']),
                              'attention_mask':torch.tensor(input_t['attention_mask']),
                              'labels':torch.tensor(label_t['input_ids'])})

    def __getitem__(self, index):
        
         
        return self.data[index]
        

    def __len__(self):
        return len(self.data)

### Dataloader

In [5]:
train_set = HeadlineGenerationDataset(input_train, summary_train, tokenizer,max_len = 512)
train_loader = DataLoader(train_set,batch_size = 2,shuffle = True, num_workers = 0, pin_memory = True)
val_set = HeadlineGenerationDataset(input_val, summary_val, tokenizer,max_len = 512)
val_loader = DataLoader(val_set,batch_size = 2,shuffle = True, num_workers = 0, pin_memory = True)

### Valdation model

In [6]:
def valdation(val_loader, model, tokenizer):
    model.eval()
    preds = []
    targets = []
    with torch.no_grad():
        for batch in tqdm(val_loader):
            input_ids = batch['input_ids'].cuda()
            attention_mask = batch['attention_mask'].cuda()
            labels = batch['labels'].cuda()
            outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask)
            for i in range(len(outputs)):
                output = tokenizer.decode(outputs[i], skip_special_tokens=True)
                preds.append(output)
                label = labels[i].cpu().numpy()
                label = label[label != -100]  # filter out padding labels
                targets.append(tokenizer.decode(label, skip_special_tokens=True))
    return preds, targets


### Evaluation

In [7]:
import evaluate

def evaluation(outputs, targets):
    metric_rouge = evaluate.load("rouge", rouge_types=["rouge1", "rouge2", "rougeL", "rougeLsum"])
    rouge = metric_rouge.compute(predictions=outputs, references=targets, use_stemmer=True)
    return rouge

### Train model

In [10]:
import os

model.cuda()
optimizer = torch.optim.AdamW(params = model.parameters(), lr = 1e-4)
if os.path.exists('checkpoint.pth'):
    checkpoint = torch.load('checkpoint.pth')
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

for epoch in range(10):
    model.train()
    train = tqdm(train_loader)
    for data in train:
        for key in data.keys():
            data[key] = data[key].cuda()
        outputs = model(**data)
        loss = outputs.loss
        train.set_description(f'Epoch {epoch+1}')
        train.set_postfix({'Loss': loss.item()})
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict()
    }
    torch.save(checkpoint, 'checkpoint.pth')
    outputs, targets = valdation(val_loader, model, tokenizer)
    rouge = evaluation(outputs, targets)
    print("Rouge scores: " , rouge)
        
    
    model.save_pretrained('model_{}'.format(epoch+1))

Epoch 1: 100%|██████████| 45000/45000 [5:28:51<00:00,  2.28it/s, Loss=1.93]      
100%|██████████| 5000/5000 [22:59<00:00,  3.63it/s]


Rouge scores:  {'rouge1': 0.4383037130718228, 'rouge2': 0.25733217517877927, 'rougeL': 0.40248471906622996, 'rougeLsum': 0.4025951650614007}


Epoch 2: 100%|██████████| 45000/45000 [5:23:41<00:00,  2.32it/s, Loss=1.84]      
100%|██████████| 5000/5000 [21:01<00:00,  3.96it/s]


Rouge scores:  {'rouge1': 0.4478721153096462, 'rouge2': 0.26634157589627416, 'rougeL': 0.41275185668574155, 'rougeLsum': 0.41256062064590165}


Epoch 3: 100%|██████████| 45000/45000 [5:21:39<00:00,  2.33it/s, Loss=1.38]      
100%|██████████| 5000/5000 [21:08<00:00,  3.94it/s]


Rouge scores:  {'rouge1': 0.45478295791469847, 'rouge2': 0.27161383269310724, 'rougeL': 0.41781696653018463, 'rougeLsum': 0.41804808595261844}


Epoch 4: 100%|██████████| 45000/45000 [5:21:40<00:00,  2.33it/s, Loss=1.55]      
100%|██████████| 5000/5000 [21:24<00:00,  3.89it/s]


Rouge scores:  {'rouge1': 0.45993916006354185, 'rouge2': 0.27430596267822527, 'rougeL': 0.42164516271097885, 'rougeLsum': 0.4217303176463495}


Epoch 5: 100%|██████████| 45000/45000 [5:25:42<00:00,  2.30it/s, Loss=0.768]     
100%|██████████| 5000/5000 [22:30<00:00,  3.70it/s]


Rouge scores:  {'rouge1': 0.4622527862773976, 'rouge2': 0.2788786859745255, 'rougeL': 0.4245154737074517, 'rougeLsum': 0.4246260630353029}


Epoch 6: 100%|██████████| 45000/45000 [5:26:47<00:00,  2.30it/s, Loss=1.33]      
100%|██████████| 5000/5000 [20:13<00:00,  4.12it/s]


Rouge scores:  {'rouge1': 0.4659912351844707, 'rouge2': 0.2829143937043717, 'rougeL': 0.4288290651995629, 'rougeLsum': 0.42884417264946995}


Epoch 7: 100%|██████████| 45000/45000 [5:22:03<00:00,  2.33it/s, Loss=0.892]     
100%|██████████| 5000/5000 [20:59<00:00,  3.97it/s]


Rouge scores:  {'rouge1': 0.4641527924506976, 'rouge2': 0.282577865960329, 'rougeL': 0.4277944873590433, 'rougeLsum': 0.4280565851014666}


Epoch 8: 100%|██████████| 45000/45000 [5:21:41<00:00,  2.33it/s, Loss=1.89]      
100%|██████████| 5000/5000 [20:55<00:00,  3.98it/s]


Rouge scores:  {'rouge1': 0.46440037254909167, 'rouge2': 0.28339393737741037, 'rougeL': 0.42864769094796507, 'rougeLsum': 0.4285585096304675}


Epoch 9: 100%|██████████| 45000/45000 [5:25:16<00:00,  2.31it/s, Loss=0.668]     
100%|██████████| 5000/5000 [21:01<00:00,  3.96it/s]


Rouge scores:  {'rouge1': 0.46214239154382775, 'rouge2': 0.28171389806537184, 'rougeL': 0.4259167077192326, 'rougeLsum': 0.42579484142673707}


Epoch 10: 100%|██████████| 45000/45000 [5:21:31<00:00,  2.33it/s, Loss=0.61]      
100%|██████████| 5000/5000 [21:27<00:00,  3.88it/s]


Rouge scores:  {'rouge1': 0.45998606430860156, 'rouge2': 0.2802696819136486, 'rougeL': 0.42387975099678343, 'rougeLsum': 0.4236974574441758}
