# This notebook fine-tunes the flan-t5-large model (using LoRA adapters) on Kubernetes question-command pairs gathered from the Kubernetes documentation

In [None]:
!pip install transformers[torch] tokenizers datasets evaluate peft accelerate bitsandbytes rouge_score sentencepiece tensorboard py7zr loralib sklearn huggingface_hub --upgrade

Collecting transformers[torch]
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m61.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m106.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m51.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft
  Downloading peft-0.5.0-py3-none-any.whl (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m

In [None]:
import nltk
from datasets import load_dataset, Dataset
import evaluate
import numpy as np
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer

## Loading the dataset

In [None]:
# Fetch the question/command pairs from the Hugging Face Hub. The recorded run
# reports a single "train" split with 34,884 examples.
dataset = load_dataset("ComponentSoft/k8s-kubectl-35k")

Downloading readme:   0%|          | 0.00/719 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/3.52M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/34884 [00:00<?, ? examples/s]

In [None]:
# Hold out 10% of the rows as a "test" split; the rows are not shuffled before
# splitting (shuffle=False).
# NOTE(review): `dataset` is rebound here, so re-running this cell splits the
# already-reduced train split again — reload the dataset before re-running.
# NOTE(review): if the source rows are grouped by topic, an unshuffled hold-out
# may not be representative — confirm, or shuffle with a fixed seed.
dataset = dataset['train'].train_test_split(test_size=0.1, shuffle=False)

In [None]:
# The tokenizer and the base seq2seq model are loaded from the same checkpoint.
MODEL_CHECKPOINT = "google/flan-t5-large"

tokenizer = T5Tokenizer.from_pretrained(MODEL_CHECKPOINT)
model = T5ForConditionalGeneration.from_pretrained(MODEL_CHECKPOINT)

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of question/command pairs for seq2seq training.

    Args:
        examples: A batch mapping with a 'question' column (model inputs) and a
            'command' column (target texts), as passed by
            ``Dataset.map(..., batched=True)``.

    Returns:
        A dict with 'input_ids' and 'attention_mask' for the questions and
        'labels' for the commands. Every sequence is truncated/padded to 512
        tokens. Padding positions in 'labels' are set to -100 so that they do
        not contribute to the training loss.
    """
    questions = list(examples['question'])

    model_inputs = tokenizer(
        questions,
        max_length=512,
        truncation=True,
        padding='max_length',
        return_tensors='pt',
    )

    targets = tokenizer(
        text_target=examples['command'],
        max_length=512,
        truncation=True,
        padding='max_length',
        return_tensors='pt',
    )

    # The targets are already padded to max_length here, so the data collator
    # never gets a chance to pad the labels itself. Mask the pad tokens now;
    # otherwise the loss is dominated by trivially predictable padding.
    target_ids = targets.input_ids
    label_ids = target_ids.masked_fill(target_ids == tokenizer.pad_token_id, -100)

    return {
        'input_ids': model_inputs.input_ids,
        'attention_mask': model_inputs.attention_mask,
        'labels': label_ids,
    }


In [None]:
# Tokenize both splits in batches; the first result is the training set and the
# second is the held-out evaluation set.
tokenized_dataset_train, tokenized_dataset_eval = (
    dataset[split_name].map(preprocess_function, batched=True)
    for split_name in ('train', 'test')
)

## Set up model for LoRA

In [None]:
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training

# LoRA adapter settings: rank-16 adapters with alpha 32 and 5% dropout, attached
# to the modules named "q" and "v" (the attention query/value projections in
# T5 — TODO confirm against model.named_modules()). No bias terms are trained.
lora_config = LoraConfig(
  r=16,
  lora_alpha=32,
  target_modules=["q", "v"],
  lora_dropout=0.05,
  bias="none",
  task_type=TaskType.SEQ_2_SEQ_LM
)

# prepare_model_for_kbit_training is the maintained replacement for
# prepare_model_for_int8_training, which was deprecated and later removed from
# PEFT; the old import fails with the unpinned `--upgrade` install.
model = prepare_model_for_kbit_training(model)

# Wrap the base model so that only the LoRA adapter weights are trainable.
# NOTE(review): `model` is rebound here, so re-running this cell wraps an
# already-wrapped model — reload the base model before re-running.
model = get_peft_model(model, lora_config)

In [None]:
from transformers import DataCollatorForSeq2Seq

# Value the collator uses when it pads label sequences.
label_pad_token_id = -100

# Batches are padded so that their length is a multiple of 8.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    pad_to_multiple_of=8,
    label_pad_token_id=label_pad_token_id,
)

## Set up training args and start training

In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login; the push_to_hub calls at the end of the
# notebook need an authenticated session.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Directory that receives the run's outputs and TensorBoard logs.
output_dir = "flan-t5-large-k8s-question-code"

training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    # --- optimisation ---
    auto_find_batch_size=True,
    learning_rate=1e-3,  # higher learning rate
    num_train_epochs=5,
    # --- logging: every 500 steps, reported to TensorBoard ---
    logging_dir=f"{output_dir}/logs",
    logging_strategy="steps",
    logging_steps=500,
    report_to="tensorboard",
    # --- checkpointing: nothing is saved during training ---
    save_strategy="no",
)

# Wire the model, arguments, collator and both tokenized splits together.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset_train,
    eval_dataset=tokenized_dataset_eval,
    data_collator=data_collator,
)

# Turned off to silence warnings while training; re-enable it for inference.
model.config.use_cache = False


In [None]:
# Run the fine-tuning. The recorded run took about 26,388 s (~7.3 h) for
# 5 epochs / 39,245 optimisation steps, and no checkpoints are saved on the way
# (save_strategy="no").
trainer.train()

Step,Training Loss
500,0.1229
1000,0.0948
1500,0.0835
2000,0.0823
2500,0.0677
3000,0.8934
3500,0.1138
4000,0.0804
4500,0.0656
5000,0.0593


TrainOutput(global_step=39245, training_loss=0.05676162180011782, metrics={'train_runtime': 26387.9365, 'train_samples_per_second': 5.949, 'train_steps_per_second': 1.487, 'total_flos': 3.640663824924672e+17, 'train_loss': 0.05676162180011782, 'epoch': 5.0})

In [None]:
# Upload the trained PEFT model to the Hub; the recorded output shows only the
# adapter weights (adapter_model.bin, 19.0M) being uploaded.
# NOTE(review): the recorded commit URL below points at a `flan-t5-xl-...` repo,
# which does not match this repo id — the output looks stale; confirm.
model.push_to_hub('ComponentSoft/flan-t5-large-k8s-question-code')

adapter_model.bin:   0%|          | 0.00/19.0M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ComponentSoft/flan-t5-xl-k8s-question-code/commit/0c1b88c6830dd57fff2eb1cf9402c7971b6b264b', commit_message='Upload model', commit_description='', oid='0c1b88c6830dd57fff2eb1cf9402c7971b6b264b', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Upload the tokenizer files to the same Hub repository as the adapter weights.
tokenizer.push_to_hub('ComponentSoft/flan-t5-large-k8s-question-code')

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ComponentSoft/flan-t5-xl-k8s-question-code/commit/290160a0fae6e3aa608f88c2e1a45678184f4377', commit_message='Upload tokenizer', commit_description='', oid='290160a0fae6e3aa608f88c2e1a45678184f4377', pr_url=None, pr_revision=None, pr_num=None)