In [1]:
#model_checkpoint = "t5-base"
# model_checkpoint = "unicamp-dl/ptt5-base-t5-vocab"
model_checkpoint = 'unicamp-dl/ptt5-base-portuguese-vocab'
#model_checkpoint = "facebook/bart-base"
#model_checkpoint = "unicamp-dl/ptt5-base-t5-vocab"
#model_checkpoint = 'neuralmind/bert-base-portuguese-cased'

In [2]:
K_Retrieval = 10
max_input_length = 1024
max_target_length = 32


train_path = "finetune_qa_pairs_14k_news/train_doc.pickle"
val_path = "finetune_qa_pairs_14k_news/val_doc.pickle"
test_path = "finetune_qa_pairs_14k_news/test_doc.pickle"
model_name = "QA-ptt5-base-news-k10"

In [3]:
import pickle

with open(train_path, 'rb') as f:
    train_docs = pickle.load(f)

with open(val_path, 'rb') as f:
    val_docs = pickle.load(f)
    
with open(test_path, 'rb') as f:
    test_docs = pickle.load(f)

08/29/2021 22:52:26 - INFO - faiss.loader -   Loading faiss with AVX2 support.
08/29/2021 22:52:26 - INFO - faiss.loader -   Loading faiss.
08/29/2021 22:52:27 - INFO - farm.modeling.prediction_head -   Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .


In [4]:
import pandas as pd
import ast

def preprocess(docs):
    questions = []
    answer = []
    for instance in docs:
        question = "question: " + instance["question"][0]
        documents = ""
        for i in range (K_Retrieval):
            if i < len(instance["documents"]):
                document_dict = ast.literal_eval(str(instance["documents"][i]))
                document = document_dict["meta"]["title"] + " " + document_dict["text"]
                question += "  context: "+document
        questions.append(question)
        answer.append(instance["answer"][0])

    pd_dataset = pd.DataFrame({'question': questions, 'answer' : answer})
    pd_dataset.head()
    
    return(pd_dataset)

pd_dataset_train = preprocess(train_docs)

pd_dataset_val = preprocess(val_docs)

pd_dataset_test = preprocess(test_docs)


In [5]:
from datasets import load_dataset, load_metric, DatasetDict, Dataset

#raw_datasets = load_dataset("xsum")
dataset_train = Dataset.from_pandas(df=pd_dataset_train)
dataset_val = Dataset.from_pandas(df=pd_dataset_val)
dataset_test = Dataset.from_pandas(df=pd_dataset_test)

#dataset.train_test_split()
#train_validtest = dataset.train_test_split(0.1)
#valid_test = train_validtest['test'].train_test_split(0.5)

raw_datasets = DatasetDict({
    'train': dataset_train,
    'validation': dataset_val,
    'test': dataset_test})
metric = load_metric("rouge")



In [6]:
import datasets
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=5):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)
    
    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, datasets.ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])
    display(HTML(df.to_html()))

In [7]:
from transformers import AutoTokenizer, T5Tokenizer, T5TokenizerFast, TFT5Model, TFT5ForConditionalGeneration

#tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
tokenizer = T5Tokenizer.from_pretrained(model_checkpoint)

In [8]:
def preprocess_input(examples):
    model_inputs = tokenizer(examples["question"], max_length=max_input_length, truncation=True)
    return model_inputs

def preprocess_function(examples):
    model_inputs = preprocess_input(examples)
    # Setup the tokenizer for targets
    labels = tokenizer(examples["answer"], max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [9]:
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

  0%|          | 0/11 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

In [10]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [11]:
batch_size = 4
args = Seq2SeqTrainingArguments(
    model_name,
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=30,
    predict_with_generate=True,
    fp16=True,
    gradient_accumulation_steps = 4
)

In [12]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [13]:
import nltk
#nltk.download('punkt')
import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Extract a few results
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    
    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    
    return {k: round(v, 4) for k, v in result.items()}

In [14]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [15]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mnakasato[0m (use `wandb login --relogin` to force relogin)




Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Runtime,Samples Per Second
0,No log,3.324145,18.1353,3.3236,18.1187,18.0836,3.2258,182.1947,11.839
1,4.330000,3.03697,21.3038,4.512,21.2129,21.1854,3.235,178.4415,12.088
2,4.330000,2.900413,23.117,5.0876,23.0849,23.0523,3.3769,180.2598,11.966
3,3.023300,2.809012,23.6988,5.5757,23.6511,23.6144,3.4019,180.9013,11.924
4,2.785700,2.738003,24.5407,6.0877,24.4559,24.4293,3.4618,181.3445,11.894
5,2.785700,2.678118,24.73,6.3682,24.6749,24.6259,3.4975,181.7821,11.866
6,2.625100,2.632679,25.377,6.6874,25.2882,25.2355,3.5257,181.9612,11.854
7,2.497300,2.590995,26.0007,7.0869,25.9029,25.9079,3.5828,181.9209,11.857
8,2.497300,2.558925,26.0237,7.2552,25.9238,25.9233,3.5906,182.8035,11.8
9,2.397700,2.531365,26.2737,7.281,26.1833,26.1516,3.6203,183.542,11.752


08/29/2021 23:11:46 - INFO - /home/blab-answerer/anaconda3/lib/python3.8/site-packages/datasets/metric.py -   Removing /home/blab-answerer/.cache/huggingface/metrics/rouge/default/default_experiment-1-0.arrow
08/29/2021 23:28:43 - INFO - /home/blab-answerer/anaconda3/lib/python3.8/site-packages/datasets/metric.py -   Removing /home/blab-answerer/.cache/huggingface/metrics/rouge/default/default_experiment-1-0.arrow
08/29/2021 23:45:37 - INFO - /home/blab-answerer/anaconda3/lib/python3.8/site-packages/datasets/metric.py -   Removing /home/blab-answerer/.cache/huggingface/metrics/rouge/default/default_experiment-1-0.arrow
08/30/2021 00:02:34 - INFO - /home/blab-answerer/anaconda3/lib/python3.8/site-packages/datasets/metric.py -   Removing /home/blab-answerer/.cache/huggingface/metrics/rouge/default/default_experiment-1-0.arrow
08/30/2021 00:19:32 - INFO - /home/blab-answerer/anaconda3/lib/python3.8/site-packages/datasets/metric.py -   Removing /home/blab-answerer/.cache/huggingface/metric

08/30/2021 05:25:11 - INFO - /home/blab-answerer/anaconda3/lib/python3.8/site-packages/datasets/metric.py -   Removing /home/blab-answerer/.cache/huggingface/metrics/rouge/default/default_experiment-1-0.arrow
08/30/2021 05:42:10 - INFO - /home/blab-answerer/anaconda3/lib/python3.8/site-packages/datasets/metric.py -   Removing /home/blab-answerer/.cache/huggingface/metrics/rouge/default/default_experiment-1-0.arrow
08/30/2021 05:59:08 - INFO - /home/blab-answerer/anaconda3/lib/python3.8/site-packages/datasets/metric.py -   Removing /home/blab-answerer/.cache/huggingface/metrics/rouge/default/default_experiment-1-0.arrow
08/30/2021 06:16:10 - INFO - /home/blab-answerer/anaconda3/lib/python3.8/site-packages/datasets/metric.py -   Removing /home/blab-answerer/.cache/huggingface/metrics/rouge/default/default_experiment-1-0.arrow
08/30/2021 06:33:08 - INFO - /home/blab-answerer/anaconda3/lib/python3.8/site-packages/datasets/metric.py -   Removing /home/blab-answerer/.cache/huggingface/metric

TrainOutput(global_step=9420, training_loss=2.3208622375617867, metrics={'train_runtime': 30571.031, 'train_samples_per_second': 0.308, 'total_flos': 4.1663920860261274e+17, 'epoch': 30.0, 'init_mem_cpu_alloc_delta': 1899499520, 'init_mem_gpu_alloc_delta': 891614208, 'init_mem_cpu_peaked_delta': 94273536, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 4253302784, 'train_mem_gpu_alloc_delta': 2704384512, 'train_mem_cpu_peaked_delta': 206872576, 'train_mem_gpu_peaked_delta': 10222008320})

In [16]:
a = trainer.predict(tokenized_datasets['test'], max_length=max_target_length)

06/18/2021 12:38:13 - INFO - /home/blab-answerer/anaconda3/lib/python3.8/site-packages/datasets/metric.py -   Removing /home/blab-answerer/.cache/huggingface/metrics/rouge/default/default_experiment-1-0.arrow


In [17]:
test_outputs = []
test_labels = []
for i in range(len(a.predictions)):
    test_outputs.append(tokenizer.decode(a.predictions[i]))
    test_labels.append(raw_datasets['test'][i]['answer'])
pd_output = pd.DataFrame({'predicted' : test_outputs, 'label' : test_labels})

In [18]:
pd_output.head()

Unnamed: 0,predicted,label
0,<pad> Mais de 90%</s> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> ...,85%
1,<pad> Florestas do Brasil</s> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad...,Bacia do Rio da Madeira no Brasil
2,<pad> mais de 400.</s> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>...,58.
3,<pad> O bendito</s> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <p...,Falcão
4,<pad> O ribeirinho</s> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>...,Falcão
