In [None]:
!pip install datasets evaluate transformers[sentencepiece]
!apt install git-lfs

In [33]:
from datasets import load_dataset

raw_datasets = load_dataset("squad")



  0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
from transformers import AutoTokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained("t5-base")         

Downloading:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [None]:
encoder_max_len = 250
decoder_max_len = 54

In [None]:
def preprocess_data(example,
           encoder_max_len=encoder_max_len, decoder_max_len=decoder_max_len):
  
    context = example['context']
    question = example['question']
    answer = example['answers']['text']
  
    question_plus = f"answer_me: {str(question)}"
    question_plus += f" context: {str(context)} </s>"
    
    answer_plus = ', '.join([i for i in list(answer)])
    answer_plus = f"{answer_plus} </s>"
    
    encoder_inputs = tokenizer(question_plus, truncation=True, 
                               return_tensors='tf', max_length=encoder_max_len,
                              pad_to_max_length=True)
    
    decoder_inputs = tokenizer(answer_plus, truncation=True, 
                               return_tensors='tf', max_length=decoder_max_len,
                              pad_to_max_length=True)
    
    input_ids = encoder_inputs['input_ids'][0]
    input_attention = encoder_inputs['attention_mask'][0]
    target_ids = decoder_inputs['input_ids'][0]
    target_attention = decoder_inputs['attention_mask'][0]
    
    outputs = {'input_ids':input_ids, 'attention_mask': input_attention, 
               'labels':target_ids, 'decoder_attention_mask':target_attention}
    return outputs

In [None]:
train_dataset = raw_datasets["train"].map(
    preprocess_data,
    remove_columns=raw_datasets["train"].column_names)

  0%|          | 0/87599 [00:00<?, ?ex/s]



In [None]:
test_dataset = raw_datasets["validation"].map(
    preprocess_data,
    remove_columns=raw_datasets["validation"].column_names)

  0%|          | 0/10570 [00:00<?, ?ex/s]

In [None]:
len(raw_datasets["train"]), len(train_dataset)

(87599, 87599)

In [None]:
len(raw_datasets["validation"]), len(test_dataset)

(10570, 10570)

In [None]:
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator(return_tensors="tf")

In [None]:
train_dataset.features

{'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'labels': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'decoder_attention_mask': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None)}

In [None]:
tf_train_dataset = train_dataset.to_tf_dataset(
   columns = ['input_ids', 'attention_mask', 'labels', 'decoder_attention_mask'],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=16,
)
tf_eval_dataset = test_dataset.to_tf_dataset(
    columns=['input_ids', 'attention_mask', 'labels', 'decoder_attention_mask'],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=16,
)

In [None]:
from transformers import TFT5ForConditionalGeneration

In [None]:
model = TFT5ForConditionalGeneration.from_pretrained("t5-base")

Downloading:   0%|          | 0.00/892M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at t5-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [None]:
from transformers import create_optimizer
import tensorflow as tf

# The number of training steps is the number of samples in the dataset, divided by the batch size then multiplied
# by the total number of epochs. Note that the tf_train_dataset here is a batched tf.data.Dataset,
# not the original Hugging Face Dataset, so its len() is already num_samples // batch_size.
num_train_epochs = 3
num_train_steps = len(tf_train_dataset) * num_train_epochs
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
model.compile(optimizer=optimizer)

# Train in mixed-precision float16
tf.keras.mixed_precision.set_global_policy("mixed_float16")

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [None]:
model.fit(tf_train_dataset,  epochs=num_train_epochs)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f94e3514ed0>

In [9]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [7]:
%pwd

'/content'

In [10]:
%cd "/content/gdrive/MyDrive/Transformers_files"

/content/gdrive/MyDrive/Transformers_files


In [None]:
model.save_pretrained("scratch_T5_squad")
tokenizer.save_pretrained("scratch_T5_squad")

('scratch_T5_squad/tokenizer_config.json',
 'scratch_T5_squad/special_tokens_map.json',
 'scratch_T5_squad/spiece.model',
 'scratch_T5_squad/added_tokens.json',
 'scratch_T5_squad/tokenizer.json')

## Prueba del modelo

In [4]:
from transformers import TFT5ForConditionalGeneration

In [11]:
model_checkpoint = "scratch_T5_squad"
modelo_prueba = TFT5ForConditionalGeneration.from_pretrained(model_checkpoint)
tokenizer_prueba = AutoTokenizer.from_pretrained(model_checkpoint)

All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at scratch_T5_squad.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [12]:
context = "I am Laura Camila and I am colombian"
question = "Where from is she?"

In [None]:
input_text =  f"answer_me: {question} context: {context} </s>"
encoded_input = tokenizer_prueba(input_text, 
                         return_tensors='tf', pad_to_max_length=True, truncation=True, max_length=250)
input_ids = encoded_input["input_ids"]
attention_mask = encoded_input["attention_mask"]

In [14]:
generated_answer = modelo_prueba.generate(input_ids, attention_mask=attention_mask)
generated_answer



<tf.Tensor: shape=(1, 6), dtype=int32, numpy=array([[    0,     3,  8135,    51, 12032,     1]], dtype=int32)>

In [15]:
tokenizer_prueba.decode(generated_answer.numpy()[0])

'<pad> colombian</s>'

In [16]:
tokenizer_prueba.decode(generated_answer.numpy()[0], clean_up_tokenization_spaces=True, skip_special_tokens=True)

'colombian'

## Evaluar el modelo

In [27]:
model = modelo_prueba
tokenizer = tokenizer_prueba

In [50]:
predicciones = []
j= 1

for i in raw_datasets['validation']:
    if j%1000==0: print(j)
    j = j+1
    id = i["id"]
    context_id = i["context"]
    question_id = i['question']

    input_text =  f"answer_me: {question_id} context: {context_id} </s>"
    encoded_input = tokenizer_prueba(input_text, 
                            return_tensors='tf', pad_to_max_length=True, truncation=True, max_length=250)
    input_ids = encoded_input["input_ids"]
    attention_mask = encoded_input["attention_mask"]    

    generated_answer = model.generate(input_ids, attention_mask=attention_mask)

    respuesta = tokenizer.decode(generated_answer.numpy()[0], clean_up_tokenization_spaces=True, skip_special_tokens=True)

    predicciones.append( {"id": id, "prediction_text": respuesta})

    if j ==5000:
      break



1000
2000
3000
4000


In [None]:
predicciones

In [52]:
theoretical_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in raw_datasets['validation']][0:4999]

In [53]:
metric = evaluate.load("squad")
metric.compute(predictions=predicciones, references=theoretical_answers)

{'exact_match': 80.97619523904781, 'f1': 88.54090361057868}