In [1]:
from transformers import BertGenerationEncoder, BertGenerationDecoder, EncoderDecoderModel
import numpy as np
from datasets import Dataset
from transformers import BertTokenizer
from collections import defaultdict
import torch
from torch.utils.data import DataLoader
from transformers import AdamW
from transformers import get_scheduler
from tqdm.auto import tqdm

In [3]:
import datasets
# train_data = datasets.load_dataset("cnn_dailymail", "3.0.0", split="train")
idx_intent = np.load('data/total_idx_intent.npy', allow_pickle=True).item()
idx_titles = np.load('data/total_idx_titles.npy', allow_pickle=True).item()

intents = []
titles = []
for k,v in idx_intent.items():
    intents.append(v)
    titles.append(idx_titles[k])
    
# split train, test set = 8:2
test_num = int(len(intents)*0.2)

train_intent = intents[:-test_num]
train_titles = titles[:-test_num]
test_intent = intents[-test_num:]
test_titles = titles[-test_num:]

In [4]:
train_intent.pop(1031)
train_titles.pop(1031)
test_intent.pop(816)
test_titles.pop(816)

'Nestle Hot Cocoa Mix Rich Chocolate - 70/0.75oz. Envelopes, Community Coffee Whole Bean Coffee, French Roast, 12-Ounce Bags (Pack of 3)'

In [5]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("t5-base", do_lower_case=True)

In [6]:
title_intent = defaultdict(list)
# encoder_max_length=512
# decoder_max_length=32

for i in range(len(train_intent)):
    # token_titles = tokenizer(train_titles[i], add_special_tokens=False, return_tensors="pt", padding="max_length",truncation=True, max_length=encoder_max_length)
    # title_intent['titles'].append(torch.as_tensor(token_titles.input_ids, dtype=torch.int))
    # title_intent['attention_mask'].append(torch.as_tensor(token_titles.attention_mask, dtype=torch.int))
    # token_labels = tokenizer(train_intent[i], add_special_tokens=False, return_tensors="pt", padding="max_length",truncation=True, max_length=decoder_max_length)
    # title_intent['labels'].append(torch.as_tensor(token_labels.input_ids, dtype=torch.int))
    # title_intent['decoder_attention_mask'].append(torch.as_tensor(token_labels.attention_mask, dtype=torch.int))
    title_intent['titles'].append(train_titles[i])
    title_intent['intents'].append(train_intent[i])
    
test_title_intent = defaultdict(list)
for i in range(len(test_intent)):
    # token_titles = tokenizer(test_titles[i], add_special_tokens=False, return_tensors="pt", padding="max_length",truncation=True, max_length=encoder_max_length)
    # test_title_intent['titles'].append(torch.as_tensor(token_titles.input_ids, dtype=torch.int))
    # test_title_intent['attention_mask'].append(torch.as_tensor(token_titles.attention_mask, dtype=torch.int))
    # token_labels = tokenizer(test_intent[i], add_special_tokens=False, return_tensors="pt", padding="max_length",truncation=True, max_length=decoder_max_length)
    # test_title_intent['labels'].append(torch.as_tensor(token_labels.input_ids, dtype=torch.int))
    # test_title_intent['decoder_attention_mask'].append(torch.as_tensor(token_labels.attention_mask, dtype=torch.int))
    test_title_intent['titles'].append(test_titles[i])
    test_title_intent['intents'].append(test_intent[i])
    
dataset = Dataset.from_dict(title_intent)
vali_dataset = Dataset.from_dict(test_title_intent)

# dataset.set_format("torch")
# vali_dataset.set_format("torch")

In [7]:
encoder_max_length=512
decoder_max_length=64

prefix = "summarize: "

def process_data_to_model_inputs(batch):
  # tokenize the inputs and labels
    inputs = [prefix + doc for doc in batch["titles"]]
    model_inputs = tokenizer(inputs, max_length=encoder_max_length, truncation=True)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(batch["intents"], max_length=decoder_max_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
#     inputs = tokenizer(batch["titles"], padding="max_length", truncation=True, max_length=encoder_max_length)
#     outputs = tokenizer(batch["intents"], padding="max_length", truncation=True, max_length=decoder_max_length)

#     batch["input_ids"] = inputs.input_ids
#     batch["attention_mask"] = inputs.attention_mask
#   # batch["decoder_input_ids"] = outputs.input_ids
#     batch["decoder_attention_mask"] = outputs.attention_mask
#     batch["labels"] = outputs.input_ids

#   # because BERT automatically shifts the labels, the labels correspond exactly to `decoder_input_ids`. 
#   # We have to make sure that the PAD token is ignored
#     batch["labels"] = [[-100 if token == tokenizer.pad_token_id else token for token in labels] for labels in batch["labels"]]

#     return batch

In [8]:
batch_size = 4

train_data = dataset.map(
    process_data_to_model_inputs, 
    batched=True, 
    batch_size=batch_size, 
    remove_columns=["titles", "intents"]
)

  0%|          | 0/1089 [00:00<?, ?ba/s]

In [9]:
train_data.set_format(
    type="torch",
)

In [10]:
val_data = vali_dataset.map(
    process_data_to_model_inputs, 
    batched=True, 
    batch_size=batch_size, 
    remove_columns=["titles", "intents"]
)

  0%|          | 0/272 [00:00<?, ?ba/s]

In [11]:
val_data.set_format(
    type="torch",
)

In [12]:
from transformers import AutoModelForSeq2SeqLM
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")

In [13]:
from transformers import DataCollatorForSeq2Seq
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [17]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    evaluation_strategy="steps",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    fp16=True, 
    output_dir="/home/workshop/dataset/fkd/bertGeneration/t5",
    logging_steps=2500,
    save_steps=10000,
    eval_steps=2500,
    learning_rate=7e-5,
    num_train_epochs=3,
    # logging_steps=1000,
    # save_steps=500,
    # eval_steps=7500,
    # warmup_steps=2000,
    # save_total_limit=3,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [15]:
rouge = datasets.load_metric("rouge")
def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)
    # print(pred_str)
    rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2"])["rouge2"].mid

    return {
        "rouge2_precision": round(rouge_output.precision, 4),
        "rouge2_recall": round(rouge_output.recall, 4),
        "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
    }

In [18]:
# instantiate trainer
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=val_data,
    data_collator=data_collator,
)
trainer.train()

Using amp fp16 backend
***** Running training *****
  Num examples = 4355
  Num Epochs = 3
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 13065


Step,Training Loss,Validation Loss,Rouge2 Precision,Rouge2 Recall,Rouge2 Fmeasure
2500,2.926,2.596875,0.0596,0.0506,0.0523
5000,2.6009,2.551652,0.0697,0.0594,0.0611
7500,2.2962,2.53199,0.0669,0.0587,0.0599
10000,2.0983,2.584431,0.0773,0.0675,0.0691
12500,1.943,2.588775,0.0752,0.0669,0.0686


***** Running Evaluation *****
  Num examples = 1087
  Batch size = 1
***** Running Evaluation *****
  Num examples = 1087
  Batch size = 1
***** Running Evaluation *****
  Num examples = 1087
  Batch size = 1
***** Running Evaluation *****
  Num examples = 1087
  Batch size = 1
Saving model checkpoint to /home/workshop/dataset/fkd/bertGeneration/t5/checkpoint-10000
Configuration saved in /home/workshop/dataset/fkd/bertGeneration/t5/checkpoint-10000/config.json
Model weights saved in /home/workshop/dataset/fkd/bertGeneration/t5/checkpoint-10000/pytorch_model.bin
tokenizer config file saved in /home/workshop/dataset/fkd/bertGeneration/t5/checkpoint-10000/tokenizer_config.json
Special tokens file saved in /home/workshop/dataset/fkd/bertGeneration/t5/checkpoint-10000/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1087
  Batch size = 1


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=13065, training_loss=2.3520315807530854, metrics={'train_runtime': 2082.0537, 'train_samples_per_second': 6.275, 'train_steps_per_second': 6.275, 'total_flos': 1389644284999680.0, 'train_loss': 2.3520315807530854, 'epoch': 3.0})

In [14]:

model = AutoModelForSeq2SeqLM.from_pretrained("/home/workshop/dataset/fkd/bertGeneration/t5/checkpoint-3000/").to(device)

In [22]:
model.config

T5Config {
  "_name_or_path": "/home/workshop/dataset/fkd/bertGeneration/t5/checkpoint-3000/",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "prefix": "translate English to German: "
    

In [12]:
tokenizer = AutoTokenizer.from_pretrained("/home/workshop/dataset/fkd/bertGeneration/t5/checkpoint-3000/")

In [24]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

def generate_summary(batch):
    # cut off at BERT max length 512
    inputs = tokenizer(batch["titles"], padding="max_length", truncation=True, max_length=512, return_tensors="pt")
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)

    outputs = model.generate(input_ids, attention_mask=attention_mask)

    output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    batch["pred_summary"] = output_str

    return batch

In [25]:
batch_size = 4  # change to 64 for full evaluation

results = vali_dataset.map(generate_summary, batched=True, batch_size=batch_size, remove_columns=["titles"])


  0%|          | 0/272 [00:00<?, ?ba/s]

In [26]:
rouge_output = rouge.compute(predictions=results["pred_summary"], references=results["intents"], rouge_types=["rouge2"])["rouge2"].mid

In [27]:
round(rouge_output.precision, 4),round(rouge_output.recall, 4),round(rouge_output.fmeasure, 4)

(0.0646, 0.0551, 0.0568)

In [28]:
rouge_output = rouge.compute(predictions=results["pred_summary"], references=results["intents"], rouge_types=["rouge1"])["rouge1"].mid

In [29]:
round(rouge_output.precision, 4),round(rouge_output.recall, 4),round(rouge_output.fmeasure, 4)

(0.3101, 0.2338, 0.2438)

In [30]:
rouge_output = rouge.compute(predictions=results["pred_summary"], references=results["intents"], rouge_types=["rougeL"])["rougeL"].mid

In [31]:
round(rouge_output.precision, 4),round(rouge_output.recall, 4),round(rouge_output.fmeasure, 4)

(0.3086, 0.2328, 0.2426)

In [40]:
0.0646*0.0551*2

0.007118920000000001

In [41]:
(0.0646+0.0551)

0.1197

In [42]:
0.007118920000000001/0.1197

0.05947301587301588

In [43]:
rouge_output = rouge.compute(predictions=results["pred_summary"], references=results["intents"], rouge_types=["rougeL"])["rougeL"]

In [44]:
rouge_output

AggregateScore(low=Score(precision=0.28496243483594, recall=0.21267733726076649, fmeasure=0.22256775223265482), mid=Score(precision=0.3086465588995487, recall=0.2327505923239594, fmeasure=0.24264675530895946), high=Score(precision=0.3321674442546107, recall=0.25151860336405263, fmeasure=0.261203010433095))