<a href="https://colab.research.google.com/github/ComponentSoftTeam/Flan-T5-K8S-QA/blob/main/notebooks/flan_t5_large_k8s.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# This notebook is used to fine-tune flan-t5-large model on k8s question-command pairs gathered from k8s documentation

In [2]:
!pip install transformers[torch] tokenizers datasets evaluate peft accelerate bitsandbytes rouge_score sentencepiece tensorboard py7zr loralib huggingface_hub --upgrade

Collecting transformers[torch]
  Using cached transformers-4.34.1-py3-none-any.whl (7.7 MB)
Collecting tokenizers
  Using cached tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
Collecting datasets
  Using cached datasets-2.14.6-py3-none-any.whl (493 kB)
Collecting evaluate
  Using cached evaluate-0.4.1-py3-none-any.whl (84 kB)
Collecting peft
  Using cached peft-0.5.0-py3-none-any.whl (85 kB)
Collecting accelerate
  Using cached accelerate-0.24.0-py3-none-any.whl (260 kB)
Collecting bitsandbytes
  Using cached bitsandbytes-0.41.1-py3-none-any.whl (92.6 MB)
Collecting rouge_score
  Using cached rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentencepiece
  Using cached sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
Collecting tensorboard
  Using cached tensorboard-2.15.0-py3-none-any.whl (5.6 MB)
Collecting py7zr
  Using cached py7zr-0.20.6-py3-none-any.whl (

In [3]:
import nltk
from datasets import load_dataset, Dataset
import evaluate
import numpy as np
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer

## Loading the dataset

In [4]:
dataset = load_dataset("ComponentSoft/k8s-kubectl-35k")

Downloading readme:   0%|          | 0.00/719 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/3.52M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/34884 [00:00<?, ? examples/s]

In [5]:
dataset = dataset['train'].train_test_split(test_size=0.1, shuffle=True)

In [6]:
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-large")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-large")

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading (…)lve/main/config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [7]:
def preprocess_function(examples):
    """tokenize the text, and set the labels"""

    inputs = [doc for doc in examples['question']]

    inputs = tokenizer(inputs, max_length=128, truncation=True, padding='max_length', return_tensors='pt')

    labels = tokenizer(text_target=examples['command'], max_length=128, truncation=True, padding='max_length', return_tensors='pt')

    return {
        'input_ids': inputs.input_ids,
        'attention_mask': inputs.attention_mask,
        'labels': labels.input_ids,
    }


In [8]:
# Map the preprocessing function across our dataset
tokenized_dataset_train = dataset['train'].map(preprocess_function, batched=True)
tokenized_dataset_eval = dataset['test'].map(preprocess_function, batched=True)

Map:   0%|          | 0/31395 [00:00<?, ? examples/s]

Map:   0%|          | 0/3489 [00:00<?, ? examples/s]

## Set up model for LoRA

In [9]:
from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training, TaskType

lora_config = LoraConfig(
  r=64,
  lora_alpha=32,
  target_modules=["q"],
  lora_dropout=0.05,
  bias="none",
  task_type=TaskType.SEQ_2_SEQ_LM
)

model = prepare_model_for_int8_training(model)

model = get_peft_model(model, lora_config)



In [10]:
def compute_metrics(p):
    logits = p.predictions
    target_ids = p.label_ids
    loss_fn = torch.nn.CrossEntropyLoss()
    loss = loss_fn(logits.view(-1, logits.shape[-1]), target_ids.view(-1))
    return {"eval_loss": loss.item()}

In [11]:
from transformers import DataCollatorForSeq2Seq

label_pad_token_id = -100

data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8,
)

## Set up training args and start training

In [12]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [16]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

output_dir="flan-t5-large-k8s-question-code-v2"

# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    learning_rate=5e-5,
    num_train_epochs=5,
    logging_dir=f"{output_dir}/logs",
    logging_strategy="steps",
    logging_steps=500,
    save_strategy="steps",
    bf16=True,
    do_eval=True,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,
    evaluation_strategy="steps"
)

# Create Trainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset_train,
    eval_dataset=tokenized_dataset_eval,
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!


In [17]:
trainer.train()

Step,Training Loss,Validation Loss
500,2.8647,0.714687
1000,1.3481,0.567802
1500,0.9153,0.429866
2000,0.7295,0.379392
2500,0.6248,0.348617
3000,0.5694,0.32397
3500,0.5172,0.302746
4000,0.4879,0.285063
4500,0.457,0.268244
5000,0.4203,0.255715


Step,Training Loss,Validation Loss
500,2.8647,0.714687
1000,1.3481,0.567802
1500,0.9153,0.429866
2000,0.7295,0.379392
2500,0.6248,0.348617
3000,0.5694,0.32397
3500,0.5172,0.302746
4000,0.4879,0.285063
4500,0.457,0.268244
5000,0.4203,0.255715


TrainOutput(global_step=9815, training_loss=0.6333093589747736, metrics={'train_runtime': 3321.1477, 'train_samples_per_second': 47.265, 'train_steps_per_second': 2.955, 'total_flos': 9.15854539751424e+16, 'train_loss': 0.6333093589747736, 'epoch': 5.0})

In [18]:
model.push_to_hub('ComponentSoft/flan-t5-large-k8s-question-code-v2')

adapter_model.bin:   0%|          | 0.00/37.8M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ComponentSoft/flan-t5-large-k8s-question-code-v2/commit/0f4f2d7b6013e542e74735926ffed24fa43a79cc', commit_message='Upload model', commit_description='', oid='0f4f2d7b6013e542e74735926ffed24fa43a79cc', pr_url=None, pr_revision=None, pr_num=None)

In [19]:
tokenizer.push_to_hub('ComponentSoft/flan-t5-large-k8s-question-code-v2')

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ComponentSoft/flan-t5-large-k8s-question-code-v2/commit/4c845fb33ff7efaf355bf39d26e97cab24a5dad5', commit_message='Upload tokenizer', commit_description='', oid='4c845fb33ff7efaf355bf39d26e97cab24a5dad5', pr_url=None, pr_revision=None, pr_num=None)