In [61]:
import transformers 
import datasets 
import evaluate 
import rouge_score
from transformers import AutoTokenizer, DataCollatorForSeq2Seq, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, Trainer
from datasets import load_dataset
import numpy as np
import torch
from tqdm import tqdm

In [58]:
data_files = {"train": "../data/Dataset1/train_summary.csv", "validation": "../data/Dataset1/val_summary.csv", "test": "../data/Dataset1/test_summary.csv"}

# reduce number of training samples to help with training
train_ds = load_dataset("csv", data_files=data_files, sep="\t", split="train[:25%]")
val_ds = load_dataset("csv", data_files=data_files, sep="\t", split="validation[:25%]")
test_ds = load_dataset("csv", data_files=data_files, sep="\t", split="test")

test_ds

Dataset({
    features: ['review/summary', 'review/text'],
    num_rows: 1000
})

In [59]:
# rename columns to align with hf dataset format

train_ds = train_ds.rename_column("review/summary", "summary")
val_ds = val_ds.rename_column("review/summary", "summary")
test_ds = test_ds.rename_column("review/summary", "summary")

train_ds = train_ds.rename_column("review/text", "text")
val_ds = val_ds.rename_column("review/text", "text")
test_ds = test_ds.rename_column("review/text", "text")

train_ds

Dataset({
    features: ['summary', 'text'],
    num_rows: 1000
})

In [21]:
# load tokenizer for preprocessing
checkpoint = "google-t5/t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
prefix = "summarize: " # instructions included in prompt for the model

def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["text"]]

    # tokenize inputs
    model_inputs = tokenizer(
        inputs,
        max_length=1024,
        truncation=True
    )

    # tokenize labels
    labels = tokenizer(
        text_target=examples["summary"],
        max_length=128,
        truncation=True,
    )["input_ids"]

    # replace padding token id with -100 for T5 loss
    labels = [
        [(token if token != tokenizer.pad_token_id else -100) for token in seq]
        for seq in labels
    ]

    model_inputs["labels"] = labels
    return model_inputs


# apply the preprocessing function, batch for faster mapping
tokenized_train = train_ds.map(preprocess_function, batched=True)
tokenized_val = val_ds.map(preprocess_function, batched=True)
tokenized_test = test_ds.map(preprocess_function, batched=True)

# pad the text in each review to the longest length in a batch
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

Map: 100%|██████████| 1000/1000 [00:00<00:00, 5890.34 examples/s]


In [29]:
# define evaluation metrics
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [30]:
# load model

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
device


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


device(type='cuda')

In [32]:
# to speed up training, freeze the encoder layer
for param in model.encoder.parameters():
    param.requires_grad = False

# freeze all decoder layers except last
num_decoder_layers = len(model.decoder.block)
for i, block in enumerate(model.decoder.block):
    if i != num_decoder_layers - 1:
        for param in block.parameters():
            param.requires_grad = False

# retain language model head for training
for param in model.lm_head.parameters():
    param.requires_grad = True

training_args = Seq2SeqTrainingArguments(
    output_dir="t5_summarizer",
    optim="adamw_torch",
    eval_strategy="epoch",
    learning_rate=2e-4,                  # higher LR since fewer params update
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=32,      # simulate batch size 32
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=10,
    predict_with_generate=True,
    fp16=True,
    gradient_checkpointing=True,         # huge memory reduction
)
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,5.114164,0.1411,0.0408,0.1283,0.1287,19.656
2,No log,5.038428,0.1414,0.0416,0.1303,0.1307,19.6
3,No log,4.979244,0.1441,0.0419,0.1326,0.133,19.476
4,No log,4.929502,0.1431,0.042,0.1314,0.132,19.42
5,No log,4.885674,0.1413,0.0417,0.1293,0.1298,19.38
6,No log,4.851595,0.1382,0.0401,0.1271,0.1277,19.256
7,No log,4.826695,0.1401,0.0396,0.1279,0.1284,19.208
8,No log,4.811897,0.1393,0.0383,0.1271,0.1276,19.176
9,No log,4.801108,0.1403,0.0387,0.1279,0.1285,19.176
10,No log,4.796399,0.1378,0.0359,0.1258,0.1261,19.096


TrainOutput(global_step=20, training_loss=5.022878646850586, metrics={'train_runtime': 761.1054, 'train_samples_per_second': 13.139, 'train_steps_per_second': 0.026, 'total_flos': 2032324211638272.0, 'train_loss': 5.022878646850586, 'epoch': 10.0})

In [57]:
predictions = [test_ds["summary"][0]]
references = [test_ds["text"][0]]
results = rouge.compute(predictions = predictions, references = references)
print(results)

{'rouge1': np.float64(0.019801980198019802), 'rouge2': np.float64(0.0), 'rougeL': np.float64(0.0165016501650165), 'rougeLsum': np.float64(0.0165016501650165)}


In [60]:
len(test_ds["summary"])

1000

In [None]:
# conduct inference
tokenizer = AutoTokenizer.from_pretrained(r"\Users\sorui\Desktop\NLPExplorationProject\Application-2\t5_summarizer\checkpoint-20")

model = AutoModelForSeq2SeqLM.from_pretrained(r"\Users\sorui\Desktop\NLPExplorationProject\Application-2\t5_summarizer\checkpoint-20")

predictions = []
references = []

for i in tqdm(range(len(test_ds["summary"])), desc = "Generating summaries"):
    references.append(test_ds["summary"][i])
    inputs = tokenizer(test_ds["text"][i], return_tensors="pt", truncation=True).input_ids
    outputs = model.generate(inputs, max_new_tokens=100, do_sample=False)
    predictions.append(tokenizer.decode(outputs[0], skip_special_tokens=True))
    
results = rouge.compute(predictions = predictions, references = references, use_aggregator=True)
results = {k: float(v) for k, v in results.items()}
print(results)

{'rouge1': 0.09580201404263411, 'rouge2': 0.021806565429959714, 'rougeL': 0.08418221112029639, 'rougeLsum': 0.08417014998309315}
