In [1]:
import os
from transformers import pipeline, AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
import datasets
import torch
import torch.nn as nn

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cuda


In [3]:
dataset_name = "flax-sentence-embeddings/stackexchange_titlebody_best_and_down_voted_answer_jsonl"
ai_dataset = datasets.load_dataset(dataset_name, 'ai')['train']
ds_dataset = datasets.load_dataset(dataset_name, 'datascience')['train']
se_dataset = datasets.load_dataset(dataset_name, 'softwareengineering')['train']

Downloading builder script:   0%|          | 0.00/14.7k [00:00<?, ?B/s]

Downloading and preparing dataset stack_exchange/ai to /root/.cache/huggingface/datasets/flax-sentence-embeddings___stack_exchange/ai/1.1.0/a767719a162391b61f7fecca12b41572102b8cf2909d9c06f55eb7a70c7aa579...


Downloading data:   0%|          | 0.00/146k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset stack_exchange downloaded and prepared to /root/.cache/huggingface/datasets/flax-sentence-embeddings___stack_exchange/ai/1.1.0/a767719a162391b61f7fecca12b41572102b8cf2909d9c06f55eb7a70c7aa579. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Downloading and preparing dataset stack_exchange/datascience to /root/.cache/huggingface/datasets/flax-sentence-embeddings___stack_exchange/datascience/1.1.0/a767719a162391b61f7fecca12b41572102b8cf2909d9c06f55eb7a70c7aa579...


Downloading data:   0%|          | 0.00/261k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset stack_exchange downloaded and prepared to /root/.cache/huggingface/datasets/flax-sentence-embeddings___stack_exchange/datascience/1.1.0/a767719a162391b61f7fecca12b41572102b8cf2909d9c06f55eb7a70c7aa579. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Downloading and preparing dataset stack_exchange/softwareengineering to /root/.cache/huggingface/datasets/flax-sentence-embeddings___stack_exchange/softwareengineering/1.1.0/a767719a162391b61f7fecca12b41572102b8cf2909d9c06f55eb7a70c7aa579...


Downloading data:   0%|          | 0.00/5.01M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset stack_exchange downloaded and prepared to /root/.cache/huggingface/datasets/flax-sentence-embeddings___stack_exchange/softwareengineering/1.1.0/a767719a162391b61f7fecca12b41572102b8cf2909d9c06f55eb7a70c7aa579. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [4]:
combined_dataset = datasets.concatenate_datasets([ai_dataset, ds_dataset, se_dataset])
# Change column names
combined_dataset = combined_dataset.rename_column("title_body", "question")
combined_dataset = combined_dataset.rename_column("upvoted_answer", "answer")

In [5]:
del ai_dataset, ds_dataset, se_dataset

In [6]:
# Fine-tune T5 on the dataset
model_name = "t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [7]:
def preprocess_function(examples):
    inputs = [f"question: {q} context and answer: {a}" for q, a in zip(examples["question"], examples["answer"])]
    model_inputs = tokenizer(inputs, max_length=512, padding="max_length", truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["question"], max_length=512, padding="max_length", truncation=True)


    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [8]:
tokenized_datasets = combined_dataset.map(preprocess_function, batched=True, num_proc=4)

        

#1:   0%|          | 0/2 [00:00<?, ?ba/s]

#0:   0%|          | 0/2 [00:00<?, ?ba/s]

#2:   0%|          | 0/2 [00:00<?, ?ba/s]

#3:   0%|          | 0/2 [00:00<?, ?ba/s]

  "`as_target_tokenizer` is deprecated and will be removed in v5 of Transformers. You can tokenize your "
  "`as_target_tokenizer` is deprecated and will be removed in v5 of Transformers. You can tokenize your "
  "`as_target_tokenizer` is deprecated and will be removed in v5 of Transformers. You can tokenize your "
  "`as_target_tokenizer` is deprecated and will be removed in v5 of Transformers. You can tokenize your "


In [9]:
def split_train_validation(examples):
    train_size = int(0.9 * len(examples))
    return {"train": examples.select(range(train_size)), "validation": examples.select(range(train_size, len(examples)))}

In [10]:
split = split_train_validation(tokenized_datasets)
print(len(split['train']))
print(len(split['validation']))

4223
470


In [11]:
training_args = Seq2SeqTrainingArguments(
    "test-t5",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    num_train_epochs=1,
    predict_with_generate=True,
)

In [12]:
model = model.to(device)

In [13]:
trainer = Seq2SeqTrainer(
    model,
    training_args,
    train_dataset=split["train"],
    eval_dataset=split["validation"],
    tokenizer=tokenizer,
)

In [14]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: downvoted_answer, answer, question. If downvoted_answer, answer, question are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 4223
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 1056
  Number of trainable parameters = 222903552
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c t

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,0.0313,0.005845


Saving model checkpoint to test-t5/checkpoint-500
Configuration saved in test-t5/checkpoint-500/config.json
Configuration saved in test-t5/checkpoint-500/generation_config.json
Model weights saved in test-t5/checkpoint-500/pytorch_model.bin
tokenizer config file saved in test-t5/checkpoint-500/tokenizer_config.json
Special tokens file saved in test-t5/checkpoint-500/special_tokens_map.json
Copy vocab file to test-t5/checkpoint-500/spiece.model
Saving model checkpoint to test-t5/checkpoint-1000
Configuration saved in test-t5/checkpoint-1000/config.json
Configuration saved in test-t5/checkpoint-1000/generation_config.json
Model weights saved in test-t5/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in test-t5/checkpoint-1000/tokenizer_config.json
Special tokens file saved in test-t5/checkpoint-1000/special_tokens_map.json
Copy vocab file to test-t5/checkpoint-1000/spiece.model
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditio

TrainOutput(global_step=1056, training_loss=0.4252825099410433, metrics={'train_runtime': 805.9463, 'train_samples_per_second': 5.24, 'train_steps_per_second': 1.31, 'total_flos': 2571629171834880.0, 'train_loss': 0.4252825099410433, 'epoch': 1.0})