<a href="https://colab.research.google.com/github/EdmilsonSantana/tcc-2022-2/blob/main/notebooks/PTT5_Fine_Tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Instalação de pacotes

In [1]:
%%bash
pip install nltk
pip install datasets
pip install transformers[torch]
pip install tokenizers
pip install evaluate
pip install rouge_score
pip install sentencepiece
pip install huggingface_hub

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 542.0/542.0 kB 9.4 MB/s eta 0:00:00
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 116.3/116.3 kB 13.7 MB/s eta 0:00:00
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 194.1/194.1 kB 9.1 MB/s eta 0:00:00
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 134.8/134.8 kB 14.0 MB/s eta 0:00:00
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.23.0-py3-none-any.whl (401 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 401.2/401.2 kB 19.6 MB/s eta 0:00:00
Installing collected packages: xxhash, dill, multiprocess, huggingfa

In [2]:
import evaluate
import nltk
import json
import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import T5Tokenizer
from transformers import T5ForConditionalGeneration
from transformers import Seq2SeqTrainingArguments
from transformers import Seq2SeqTrainer
from transformers import EvalPrediction
from transformers import DataCollatorForSeq2Seq
import gc
import torch

In [3]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## Preparação dos dados

In [4]:
DATA_DIR = '/content/drive/MyDrive/tcc'

In [5]:
with open(f"{DATA_DIR}/vehicle_repair_and_maintenance_qa.json", 'r', encoding='utf-8') as fp:
    data = json.load(fp)

In [6]:
questions = [entry['data']['question'] for entry in data]
answers = [entry['data']['answer'] for entry in data]

In [7]:
df_qa = pd.DataFrame({'question': questions, 'answer': answers})

In [8]:
df_qa.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2676 entries, 0 to 2675
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   question  2676 non-null   object
 1   answer    2676 non-null   object
dtypes: object(2)
memory usage: 41.9+ KB


In [9]:
df_qa.drop_duplicates(subset=['question'], inplace=True)

In [10]:
dataset = Dataset.from_dict({'question': df_qa['question'], 'answer': df_qa['answer']})

In [11]:
dataset

Dataset({
    features: ['question', 'answer'],
    num_rows: 2547
})

In [12]:
dataset.save_to_disk(f"{DATA_DIR}/vehicle_repair_and_maintenance_qa.hf")

Saving the dataset (0/1 shards):   0%|          | 0/2547 [00:00<?, ? examples/s]

## Definição de métrica

In [13]:
metric = evaluate.load("rouge")
def calculate_rogue(predictions: list, labels: list) -> dict:
  decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip(), language='portuguese')) for pred in predictions]
  decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip(), language='portuguese')) for label in labels]
  return metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

## Treinamento do modelo

In [14]:
model_name = 'unicamp-dl/ptt5-base-portuguese-vocab'
max_length = 512
learning_rate = 3e-4
weight_decay = 0.01
n_epochs = 20
train_batch_size = 8

In [15]:
def tokenize_data(examples):
    model_inputs = tokenizer(examples['question'], max_length=max_length, truncation=True)
    labels = tokenizer(text_target=examples['answer'], max_length=max_length, truncation=True)
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

In [16]:
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
tokenized_dataset = dataset.map(tokenize_data, batched=True)

training_args = Seq2SeqTrainingArguments(
    output_dir=DATA_DIR,
    num_train_epochs=n_epochs,
    per_device_train_batch_size=train_batch_size,
    weight_decay=weight_decay,
    save_strategy="epoch",
    logging_strategy="epoch",
    learning_rate=learning_rate
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

trainer.train()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/756k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/456 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

Map:   0%|          | 0/2547 [00:00<?, ? examples/s]

Step,Training Loss
319,2.8312
638,1.9875
957,1.5207
1276,1.2009
1595,0.9739
1914,0.8024
2233,0.6684
2552,0.5731
2871,0.4921
3190,0.4213


TrainOutput(global_step=6380, training_loss=0.6943159563788053, metrics={'train_runtime': 2559.1185, 'train_samples_per_second': 19.905, 'train_steps_per_second': 2.493, 'total_flos': 1325591905628160.0, 'train_loss': 0.6943159563788053, 'epoch': 20.0})

In [17]:
final_model_dir = f'{DATA_DIR}/final_model'

In [18]:
trainer.save_model(final_model_dir)

## Avaliação do modelo

In [75]:
del model
del tokenizer
torch.cuda.empty_cache()
gc.collect()

107

In [53]:
tokenizer = T5Tokenizer.from_pretrained(final_model_dir)
model = T5ForConditionalGeneration.from_pretrained(final_model_dir).to('cuda')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [21]:
def inference(model, tokenizer, questions):
  model.eval()

  inputs = tokenizer(questions, return_tensors="pt", padding=True).to('cuda')
  outputs = model.generate(**inputs, max_new_tokens=512)
  return tokenizer.batch_decode(outputs, skip_special_tokens=True)

In [22]:
!pip install more-itertools
import more_itertools as mit
import pandas as pd



In [51]:
questions = dataset['question']
answers = dataset['answer']

In [67]:
preds = []
batch_size = 100
for chunk in mit.chunked(questions, batch_size):
  preds.extend(inference(model, tokenizer, chunk))

In [68]:
assert(len(preds) == len(questions))

In [69]:
df = pd.DataFrame({'inputs': questions, 'preds': preds, 'labels': answers})

In [70]:
df.to_csv(f'{DATA_DIR}/train_result.csv')
df = pd.read_csv(f'{DATA_DIR}/train_result.csv')

In [71]:
df.head()

Unnamed: 0.1,Unnamed: 0,inputs,preds,labels
0,0,Quais foram as principais formas de transporte...,"Na origem da civilização, o homem utilizou pri...","Na origem da civilização, o homem utilizou pri..."
1,1,Quando foi construído o primeiro veículo com p...,O primeiro triciclo a vapor foi construído em ...,O primeiro triciclo a vapor foi construído em ...
2,2,Quais foram os problemas que a máquina a vapor...,A máquina a vapor apresentava baixo rendimento...,A máquina a vapor apresentava baixo rendimento...
3,3,Quem foi responsável pela invenção da máquina ...,A máquina endotérmica de quatro tempos movida ...,A máquina endotérmica de quatro tempos movida ...
4,4,Quem construiu o primeiro automóvel com motor ...,O primeiro automóvel com motor de quatro tempo...,O primeiro automóvel com motor de quatro tempo...


In [72]:
print("Exact matches:", df[df['preds'] == df['labels']].shape[0])

Exact matches: 1876


In [74]:
not_exact_matches = df[df['preds'] != df['labels']]
labels = not_exact_matches['labels'].astype(str).values
preds = not_exact_matches['preds'].astype(str).values
calculate_rogue(preds, labels)

{'rouge1': 0.6397551795544161,
 'rouge2': 0.5629563553854164,
 'rougeL': 0.6136698256825557,
 'rougeLsum': 0.6207585337282113}

## Publicação do modelo

In [30]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [31]:
model.save_pretrained(f"{DATA_DIR}/emgs/ptt5-qa")

In [32]:
model.push_to_hub("emgs/ptt5-qa")

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/emgs/ptt5-qa/commit/7cdd1fb349d9b40148c7ed9179629ef280bf7871', commit_message='Upload T5ForConditionalGeneration', commit_description='', oid='7cdd1fb349d9b40148c7ed9179629ef280bf7871', pr_url=None, pr_revision=None, pr_num=None)

In [33]:
tokenizer.save_pretrained(f"{DATA_DIR}/emgs/ptt5-qa")

('/content/drive/MyDrive/tcc/emgs/ptt5-qa/tokenizer_config.json',
 '/content/drive/MyDrive/tcc/emgs/ptt5-qa/special_tokens_map.json',
 '/content/drive/MyDrive/tcc/emgs/ptt5-qa/spiece.model',
 '/content/drive/MyDrive/tcc/emgs/ptt5-qa/added_tokens.json')

In [34]:
tokenizer.push_to_hub("emgs/ptt5-qa")

CommitInfo(commit_url='https://huggingface.co/emgs/ptt5-qa/commit/62d08f882a70f521f92cd589330dcf8cb7404c9c', commit_message='Upload tokenizer', commit_description='', oid='62d08f882a70f521f92cd589330dcf8cb7404c9c', pr_url=None, pr_revision=None, pr_num=None)

## Explorando modelo

In [None]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [76]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

model = T5ForConditionalGeneration.from_pretrained('emgs/ptt5-qa').to("cuda")
tokenizer = T5Tokenizer.from_pretrained('emgs/ptt5-qa')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [94]:
inference(model, tokenizer, ["O micrômetro centesimal foi inventado por quem?"])

['O micrômetro centesimal foi inventado por Pierre Vernier.']

In [38]:
inference(model, tokenizer, ["Quais são algumas das características que a árvore de manivelas deve possuir?"])

['Deve ser feita de aços especiais que garantam uma resistência, de acordo com a potência do motor; a árvore de manivelas deve ser maciça, pesada e balanceada.']