Install all the dependencies required for this script

In [None]:
pip install torch transformers datasets peft evaluate py7zr nltk absl-py rouge_score bert_score

Defaulting to user installation because normal site-packages is not writeable


You should consider upgrading via the '/cvmfs/hpc.rug.nl/versions/2023.01/rocky8/x86_64/intel/icelake/software/Python/3.10.4-GCCcore-11.3.0/bin/python -m pip install --upgrade pip' command.[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


Import the dependencies

In [2]:
import functools
import multiprocessing
import os
import pickle

import numpy as np
import pandas as pd
import torch as t

import datasets
import evaluate
import peft
import py7zr
import transformers
from peft import LoraConfig, get_peft_model
from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer


Create global device references: the CPU for outputs, and the GPU (when CUDA is available, otherwise the CPU) for running the model

In [3]:
# Results are always collected on the CPU.
output_device = t.device('cpu')
# The model runs on CUDA when a GPU is visible to torch, otherwise on the CPU.
model_run_device = t.device('cuda' if t.cuda.is_available() else 'cpu')

Load Datasets

In [4]:
# CNN/DailyMail already exposes the "article" / "highlights" columns used below.
cnn = datasets.load_dataset('abisee/cnn_dailymail', '3.0.0')

# SAMSum uses "dialogue" / "summary"; rename them so both corpora share one schema.
samsum = (
    datasets.load_dataset('Samsung/samsum')
    .rename_column("dialogue", "article")
    .rename_column("summary", "highlights")
)

Load tokenizer for t5-base

In [5]:
# Shared tokenizer for the "t5-base" checkpoint, used by the filtering and
# tokenization cells below. `legacy` is not passed, so the legacy-behaviour
# notice printed under this cell is expected.
tokenizer = T5Tokenizer.from_pretrained("t5-base")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Filter the CNN/DailyMail dataset to only include articles that fit within the 512-token input limit. Luckily, the dataset is large enough to still have plenty of articles left.

In [10]:
def is_short_article(example, max_tokens=512, prefix="summarize: "):
    """Return True when the article fits into the model input without truncation.

    The article is measured the way `tokenize_data` feeds it to the model,
    i.e. with the task prefix prepended, so an article kept by this filter is
    never cut off later by the `max_length=512` truncation.

    Args:
        example: A single dataset row containing an "article" string.
        max_tokens: Maximum number of token ids allowed for the prefixed
            article.
        prefix: Task prefix that is prepended before tokenizing.

    Returns:
        bool: True if the prefixed article tokenizes to at most `max_tokens` ids.
    """
    # Truncation is disabled on purpose: a truncated sequence would be
    # indistinguishable from one that is exactly at the limit. The tokenizer
    # may log a "sequence length is longer than..." notice for long articles.
    token_ids = tokenizer(
        prefix + example["article"],
        truncation=False,
        padding=False,
    )["input_ids"]
    return len(token_ids) <= max_tokens

# Keep only the training articles accepted by `is_short_article`.
cnn_short = cnn["train"].filter(is_short_article)

# Cache the filtered split so later cells can reload it instead of re-filtering.
# The directory is created first so the dump cannot fail with FileNotFoundError
# after the (slow) filtering pass has already finished.
os.makedirs('./preprocessing', exist_ok=True)
with open('./preprocessing/cnn_short.pickle', 'wb') as file:
    pickle.dump(cnn_short, file)

Function for tokenizing the data. Pads to the longest sequence in the batch being processed. Prepends "summarize: " to every article, because summarization is a pre-trained task for T5 that is accessed with this prefix.

In [11]:
def tokenize_data(examples):
    """Tokenize a batch of articles and their reference summaries for T5.

    Args:
        examples: Batch mapping with an "article" list (source texts) and a
            "highlights" list (reference summaries).

    Returns:
        The tokenizer output for the prefixed articles (it carries
        `input_ids` and `attention_mask`) with an extra `labels` entry that
        holds the token ids of the summaries. Articles are truncated to 512
        tokens and summaries to 128 tokens; both are padded to the longest
        sequence of the batch passed in, so different batches can end up
        with different widths.
    """
    # "summarize: " is the task prefix prepended to every source text.
    inputs = ["summarize: " + doc for doc in examples["article"]]
    model_inputs = tokenizer(
        inputs,
        max_length=512,
        truncation=True,  # True selects the "longest_first" strategy
        padding="longest",
        is_split_into_words=False,
        return_tensors="pt",
        return_attention_mask=True
    )
    # `text_target` replaces the deprecated `as_target_tokenizer()` context
    # manager; only the token ids of the summaries are kept as labels.
    # NOTE(review): label padding uses the tokenizer's pad id, not -100 —
    # confirm that the training code masks padded label positions in the loss.
    target_encoding = tokenizer(
        text_target=examples["highlights"],
        max_length=128,
        truncation=True,
        padding="longest",
        is_split_into_words=False,
        return_tensors="pt"
    )
    model_inputs['labels'] = target_encoding['input_ids']
    return model_inputs

Function for creating train, validation, and test sets of tokenized inputs, attention masks, and labels. Each split is padded to a common length and the result is saved as a pickle file.

In [12]:
def _pad_rows(rows, width, pad_value=0):
    """Right-pad every list in `rows` with `pad_value` up to `width` entries."""
    return [row + [pad_value] * (width - len(row)) for row in rows]


def _tokenize_and_pad(split, num_proc):
    """Tokenize one split with `tokenize_data` and pad its columns to uniform widths."""
    tokenized = split.map(tokenize_data, batched=True, num_proc=num_proc)

    # Read every column exactly once; only these three columns are kept.
    input_ids = tokenized['input_ids']
    attention_mask = tokenized['attention_mask']
    labels = tokenized['labels']

    # `tokenize_data` pads per batch only, so rows from different batches can
    # differ in length; re-pad to the longest row of the whole split.
    input_width = max(len(row) for row in input_ids)
    label_width = max(len(row) for row in labels)

    # NOTE(review): labels are padded with 0 rather than -100 — confirm that
    # the training code masks padded label positions in the loss.
    return datasets.Dataset.from_dict({
        'input_ids': t.LongTensor(_pad_rows(input_ids, input_width)),
        'attention_mask': t.LongTensor(_pad_rows(attention_mask, input_width)),
        'labels': t.LongTensor(_pad_rows(labels, label_width)),
    })


def create_token_set(file_name, tokens_train, tokens_validation, tokens_test, num_proc=None):
    """Tokenize the three splits, pad them, and cache the result as a pickle.

    Args:
        file_name: Stem of the output file; the result is written to
            `./preprocessing/{file_name}_tokens.pickle`.
        tokens_train: Training split with "article" and "highlights" columns.
        tokens_validation: Validation split with the same columns.
        tokens_test: Test split with the same columns.
        num_proc: Number of worker processes used for tokenization. Defaults
            to `multiprocessing.cpu_count()`; pass a smaller value when the
            job may use fewer cores than the machine has.

    Returns:
        A `datasets.DatasetDict` with "train", "validation" and "test"
        entries, each holding `input_ids`, `attention_mask` and `labels`
        columns padded with 0 to the longest row of that split.
    """
    if num_proc is None:
        num_proc = multiprocessing.cpu_count()

    tokens = datasets.DatasetDict({
        'train': _tokenize_and_pad(tokens_train, num_proc),
        'validation': _tokenize_and_pad(tokens_validation, num_proc),
        'test': _tokenize_and_pad(tokens_test, num_proc),
    })

    # Create the cache directory so the dump cannot fail after the slow
    # tokenization has already been done.
    os.makedirs('./preprocessing', exist_ok=True)
    with open(f'./preprocessing/{file_name}_tokens.pickle', 'wb') as file:
        pickle.dump(tokens, file)
    return tokens

Generate the token sets used (see the explanation in the report) and save them in pickle format.

In [None]:
# SAMSum: take the leading rows of each official split.
samsum_train, samsum_validation, samsum_test = (
    samsum[split_name].select(range(split_size))
    for split_name, split_size in (("train", 14000), ("validation", 800), ("test", 800))
)

# CNN/DailyMail: reload the cached filtered articles, shuffle once with a
# fixed seed, then cut three non-overlapping index ranges.
with open('./preprocessing/cnn_short.pickle', 'rb') as file:
    cnn_short = pickle.load(file)
cnn_shuffled = cnn_short.shuffle(seed=42)
cnn_train = cnn_shuffled.select(range(0, 14000))
cnn_validation = cnn_shuffled.select(range(14000, 14800))
cnn_test = cnn_shuffled.select(range(14800, 15600))

# Mixed: the first half-sized slice of each corpus, concatenated and reshuffled.
mixed_train, mixed_validation, mixed_test = (
    datasets.concatenate_datasets(
        [samsum_part.select(range(half)), cnn_part.select(range(half))]
    ).shuffle(seed=42)
    for samsum_part, cnn_part, half in (
        (samsum_train, cnn_train, 7000),
        (samsum_validation, cnn_validation, 400),
        (samsum_test, cnn_test, 400),
    )
)

# Store the untokenized test splits under ./preprocessing/<name>_test.pickle.
for _name, _test_split in (("cnn", cnn_test), ("samsum", samsum_test), ("mixed", mixed_test)):
    with open(f'./preprocessing/{_name}_test.pickle', 'wb') as file:
        pickle.dump(_test_split, file)

# Tokenize, pad and cache every corpus variant.
samsum_tokens = create_token_set("samsum", samsum_train, samsum_validation, samsum_test)
cnn_tokens = create_token_set("cnn", cnn_train, cnn_validation, cnn_test)
mixed_tokens = create_token_set("mixed", mixed_train, mixed_validation, mixed_test)

Map (num_proc=64):   0%|          | 0/14000 [00:00<?, ? examples/s]









Map (num_proc=64):   0%|          | 0/800 [00:00<?, ? examples/s]









Map (num_proc=64):   0%|          | 0/800 [00:00<?, ? examples/s]









Map (num_proc=64):   0%|          | 0/14000 [00:00<?, ? examples/s]









Map (num_proc=64):   0%|          | 0/800 [00:00<?, ? examples/s]





