In [1]:
from transformers import BertGenerationEncoder, BertGenerationDecoder, EncoderDecoderModel
import numpy as np
from datasets import Dataset
from transformers import BertTokenizer
from collections import defaultdict
import torch
from torch.utils.data import DataLoader
from transformers import AdamW
from transformers import get_scheduler
from tqdm.auto import tqdm

In [2]:
import datasets
# train_data = datasets.load_dataset("cnn_dailymail", "3.0.0", split="train")
idx_intent = np.load('data/total_idx_intent.npy', allow_pickle=True).item()
idx_titles = np.load('data/total_idx_titles.npy', allow_pickle=True).item()

intents = []
titles = []
for k,v in idx_intent.items():
    intents.append(v)
    titles.append(idx_titles[k])
    
# split train, test set = 8:2
test_num = int(len(intents)*0.2)

train_intent = intents[:-test_num]
train_titles = titles[:-test_num]
test_intent = intents[-test_num:]
test_titles = titles[-test_num:]

In [3]:
train_intent.pop(1031)
train_titles.pop(1031)
test_intent.pop(816)
test_titles.pop(816)

'Nestle Hot Cocoa Mix Rich Chocolate - 70/0.75oz. Envelopes, Community Coffee Whole Bean Coffee, French Roast, 12-Ounce Bags (Pack of 3)'

In [4]:
from transformers import BartTokenizerFast
tokenizer = BartTokenizerFast.from_pretrained("facebook/bart-base", do_lower_case=True)

In [5]:
title_intent = defaultdict(list)
# encoder_max_length=512
# decoder_max_length=32

for i in range(len(train_intent)):
    # token_titles = tokenizer(train_titles[i], add_special_tokens=False, return_tensors="pt", padding="max_length",truncation=True, max_length=encoder_max_length)
    # title_intent['titles'].append(torch.as_tensor(token_titles.input_ids, dtype=torch.int))
    # title_intent['attention_mask'].append(torch.as_tensor(token_titles.attention_mask, dtype=torch.int))
    # token_labels = tokenizer(train_intent[i], add_special_tokens=False, return_tensors="pt", padding="max_length",truncation=True, max_length=decoder_max_length)
    # title_intent['labels'].append(torch.as_tensor(token_labels.input_ids, dtype=torch.int))
    # title_intent['decoder_attention_mask'].append(torch.as_tensor(token_labels.attention_mask, dtype=torch.int))
    title_intent['titles'].append(train_titles[i])
    title_intent['intents'].append(train_intent[i])
    
test_title_intent = defaultdict(list)
for i in range(len(test_intent)):
    # token_titles = tokenizer(test_titles[i], add_special_tokens=False, return_tensors="pt", padding="max_length",truncation=True, max_length=encoder_max_length)
    # test_title_intent['titles'].append(torch.as_tensor(token_titles.input_ids, dtype=torch.int))
    # test_title_intent['attention_mask'].append(torch.as_tensor(token_titles.attention_mask, dtype=torch.int))
    # token_labels = tokenizer(test_intent[i], add_special_tokens=False, return_tensors="pt", padding="max_length",truncation=True, max_length=decoder_max_length)
    # test_title_intent['labels'].append(torch.as_tensor(token_labels.input_ids, dtype=torch.int))
    # test_title_intent['decoder_attention_mask'].append(torch.as_tensor(token_labels.attention_mask, dtype=torch.int))
    test_title_intent['titles'].append(test_titles[i])
    test_title_intent['intents'].append(test_intent[i])
    
dataset = Dataset.from_dict(title_intent)
vali_dataset = Dataset.from_dict(test_title_intent)

# dataset.set_format("torch")
# vali_dataset.set_format("torch")

In [6]:
encoder_max_length=512
decoder_max_length=64

def process_data_to_model_inputs(batch):
  # tokenize the inputs and labels
    inputs = tokenizer(batch["titles"], padding="max_length", truncation=True, max_length=encoder_max_length)
    outputs = tokenizer(batch["intents"], padding="max_length", truncation=True, max_length=decoder_max_length)

    batch["input_ids"] = inputs.input_ids
    batch["attention_mask"] = inputs.attention_mask
  # batch["decoder_input_ids"] = outputs.input_ids
    batch["decoder_attention_mask"] = outputs.attention_mask
    batch["labels"] = outputs.input_ids

  # because BERT automatically shifts the labels, the labels correspond exactly to `decoder_input_ids`. 
  # We have to make sure that the PAD token is ignored
    batch["labels"] = [[-100 if token == tokenizer.pad_token_id else token for token in labels] for labels in batch["labels"]]

    return batch

In [7]:
batch_size = 4

train_data = dataset.map(
    process_data_to_model_inputs, 
    batched=True, 
    batch_size=batch_size, 
    remove_columns=["titles", "intents"]
)

  0%|          | 0/1089 [00:00<?, ?ba/s]

In [8]:
train_data.set_format(
    type="torch", columns=["input_ids", "attention_mask", "decoder_attention_mask", "labels"],
)

In [9]:
val_data = vali_dataset.map(
    process_data_to_model_inputs, 
    batched=True, 
    batch_size=batch_size, 
    remove_columns=["titles", "intents"]
)

  0%|          | 0/272 [00:00<?, ?ba/s]

In [10]:
val_data.set_format(
    type="torch", columns=["input_ids", "attention_mask", "decoder_attention_mask", "labels"],
)

In [11]:
from transformers import AutoModelForSeq2SeqLM
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-base")

In [12]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    evaluation_strategy="steps",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    fp16=True, 
    output_dir="/home/workshop/dataset/fkd/bertGeneration/bart/",
    logging_steps=500,
    save_steps=1000,
    eval_steps=500,
    learning_rate=0.00002,
    num_train_epochs=3,
    # logging_steps=1000,
    # save_steps=500,
    # eval_steps=7500,
    # warmup_steps=2000,
    # save_total_limit=3,
)

In [13]:
rouge = datasets.load_metric("rouge")
def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)
    # print(pred_str)
    rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2"])["rouge2"].mid

    return {
        "rouge2_precision": round(rouge_output.precision, 4),
        "rouge2_recall": round(rouge_output.recall, 4),
        "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
    }

In [14]:
# instantiate trainer
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=val_data,
)
trainer.train()

Using amp fp16 backend
***** Running training *****
  Num examples = 4355
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 3267


Step,Training Loss,Validation Loss,Rouge2 Precision,Rouge2 Recall,Rouge2 Fmeasure
500,3.0732,2.468399,0.0542,0.0498,0.0498
1000,2.6735,2.368719,0.0739,0.0618,0.0645
1500,2.3198,2.374057,0.0738,0.0688,0.0684
2000,2.2568,2.299472,0.0757,0.0666,0.0673
2500,2.0582,2.309595,0.08,0.0725,0.0724
3000,1.9896,2.286724,0.0779,0.0694,0.0691


***** Running Evaluation *****
  Num examples = 1087
  Batch size = 4
***** Running Evaluation *****
  Num examples = 1087
  Batch size = 4
Saving model checkpoint to /home/workshop/dataset/fkd/bertGeneration/bart/checkpoint-1000
Configuration saved in /home/workshop/dataset/fkd/bertGeneration/bart/checkpoint-1000/config.json
Model weights saved in /home/workshop/dataset/fkd/bertGeneration/bart/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in /home/workshop/dataset/fkd/bertGeneration/bart/checkpoint-1000/tokenizer_config.json
Special tokens file saved in /home/workshop/dataset/fkd/bertGeneration/bart/checkpoint-1000/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 1087
  Batch size = 4
***** Running Evaluation *****
  Num examples = 1087
  Batch size = 4
Saving model checkpoint to /home/workshop/dataset/fkd/bertGeneration/bart/checkpoint-2000
Configuration saved in /home/workshop/dataset/fkd/bertGeneration/bart/checkpoint-2000/config.json
Model we

TrainOutput(global_step=3267, training_loss=2.3597072464703857, metrics={'train_runtime': 677.9933, 'train_samples_per_second': 19.27, 'train_steps_per_second': 4.819, 'total_flos': 3983103413452800.0, 'train_loss': 2.3597072464703857, 'epoch': 3.0})

In [15]:
from transformers import AutoModelForSeq2SeqLM

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

model = AutoModelForSeq2SeqLM.from_pretrained("/home/workshop/dataset/fkd/bertGeneration/bart/checkpoint-3000/").to(device)

loading configuration file /home/workshop/dataset/fkd/bertGeneration/bart/checkpoint-3000/config.json
Model config BartConfig {
  "_name_or_path": "facebook/bart-base",
  "activation_dropout": 0.1,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.0,
  "d_model": 768,
  "decoder_attention_heads": 12,
  "decoder_ffn_dim": 3072,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 12,
  "encoder_ffn_dim": 3072,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "init_std": 0.02,
  "is_encoder_de

In [16]:
tokenizer = BartTokenizerFast.from_pretrained("/home/workshop/dataset/fkd/bertGeneration/bart/checkpoint-3000/")

Didn't find file /home/workshop/dataset/fkd/bertGeneration/bart/checkpoint-3000/added_tokens.json. We won't load it.
loading file /home/workshop/dataset/fkd/bertGeneration/bart/checkpoint-3000/vocab.json
loading file /home/workshop/dataset/fkd/bertGeneration/bart/checkpoint-3000/merges.txt
loading file /home/workshop/dataset/fkd/bertGeneration/bart/checkpoint-3000/tokenizer.json
loading file None
loading file /home/workshop/dataset/fkd/bertGeneration/bart/checkpoint-3000/special_tokens_map.json
loading file /home/workshop/dataset/fkd/bertGeneration/bart/checkpoint-3000/tokenizer_config.json


In [17]:
def generate_summary(batch):
    # cut off at BERT max length 512
    inputs = tokenizer(batch["titles"], padding="max_length", truncation=True, max_length=512, return_tensors="pt")
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)

    outputs = model.generate(input_ids, attention_mask=attention_mask)

    output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    batch["pred_summary"] = output_str

    return batch

In [18]:
batch_size = 4  # change to 64 for full evaluation

results = vali_dataset.map(generate_summary, batched=True, batch_size=batch_size, remove_columns=["titles"])


  0%|          | 0/272 [00:00<?, ?ba/s]

In [19]:
rouge_output = rouge.compute(predictions=results["pred_summary"], references=results["intents"], rouge_types=["rouge2"])["rouge2"].mid

In [20]:
round(rouge_output.precision, 4),round(rouge_output.recall, 4),round(rouge_output.fmeasure, 4)

(0.0779, 0.0694, 0.0691)

In [21]:
rouge_output = rouge.compute(predictions=results["pred_summary"], references=results["intents"], rouge_types=["rouge1"])["rouge1"].mid

In [22]:
round(rouge_output.precision, 4),round(rouge_output.recall, 4),round(rouge_output.fmeasure, 4)

(0.2961, 0.2421, 0.2448)

In [23]:
rouge_output = rouge.compute(predictions=results["pred_summary"], references=results["intents"], rouge_types=["rougeL"])["rougeL"].mid

In [24]:
round(rouge_output.precision, 4),round(rouge_output.recall, 4),round(rouge_output.fmeasure, 4)

(0.2931, 0.2401, 0.2422)

In [None]:
#lr = 0.0001
(0.282, 0.2271, 0.2327)
(0.0731, 0.0654, 0.0659)
(0.2798, 0.2257, 0.231)

#lr = 0.00005 5e-5
(0.2854, 0.2317, 0.2372)
(0.0786, 0.0699, 0.0707)
(0.2814, 0.2288, 0.2342)

In [5]:
# idx_intent = np.load('./total_idx_intent.npy', allow_pickle=True).item()
# idx_titles = np.load('./total_idx_titles.npy', allow_pickle=True).item()

all_bundle = np.load('data/food_evaluation_replace.npy', allow_pickle=True).item()

intents = []
titles = []
for k,v in all_bundle.items():
    intents.append(v[2])
    titles.append(v[1])
    
# split train, test set = 8:2
# test_num = int(len(intents)*0.2)

# train_intent = intents[:-test_num]
# train_titles = titles[:-test_num]
# test_intent = intents[-test_num:]
# test_titles = titles[-test_num:]

# title_intent = defaultdict(list)
# # encoder_max_length=512
# # decoder_max_length=32

# for i in range(len(train_intent)):
#     # token_titles = tokenizer(train_titles[i], add_special_tokens=False, return_tensors="pt", padding="max_length",truncation=True, max_length=encoder_max_length)
#     # title_intent['titles'].append(torch.as_tensor(token_titles.input_ids, dtype=torch.int))
#     # title_intent['attention_mask'].append(torch.as_tensor(token_titles.attention_mask, dtype=torch.int))
#     # token_labels = tokenizer(train_intent[i], add_special_tokens=False, return_tensors="pt", padding="max_length",truncation=True, max_length=decoder_max_length)
#     # title_intent['labels'].append(torch.as_tensor(token_labels.input_ids, dtype=torch.int))
#     # title_intent['decoder_attention_mask'].append(torch.as_tensor(token_labels.attention_mask, dtype=torch.int))
#     title_intent['titles'].append(train_titles[i])
#     title_intent['intents'].append(train_intent[i])
    
test_title_intent = defaultdict(list)
for i in range(len(intents)):
    # token_titles = tokenizer(test_titles[i], add_special_tokens=False, return_tensors="pt", padding="max_length",truncation=True, max_length=encoder_max_length)
    # test_title_intent['titles'].append(torch.as_tensor(token_titles.input_ids, dtype=torch.int))
    # test_title_intent['attention_mask'].append(torch.as_tensor(token_titles.attention_mask, dtype=torch.int))
    # token_labels = tokenizer(test_intent[i], add_special_tokens=False, return_tensors="pt", padding="max_length",truncation=True, max_length=decoder_max_length)
    # test_title_intent['labels'].append(torch.as_tensor(token_labels.input_ids, dtype=torch.int))
    # test_title_intent['decoder_attention_mask'].append(torch.as_tensor(token_labels.attention_mask, dtype=torch.int))
    test_title_intent['titles'].append(titles[i])
    test_title_intent['intents'].append(intents[i])
    
# dataset = Dataset.from_dict(title_intent)
vali_dataset = Dataset.from_dict(test_title_intent)

In [6]:
finalresults = []
for i in range(len(results["pred_summary"])):
    finalresults.append((results["pred_summary"][i], results["intents"][i]))
np.save('bart_food_replace.npy', finalresults)

NameError: name 'results' is not defined