<a href="https://colab.research.google.com/github/tmskss/Flan-T5-K8S-QA/blob/main/notebooks/flan_t5_k8s.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# This notebook is used to fine-tune flan-t5-base model on k8s question-command pairs gathered from k8s documentation

## Installing base requirements

In [None]:
!pip install transformers[torch] tokenizers datasets evaluate rouge_score sentencepiece huggingface_hub --upgrade

In [2]:
import nltk
from datasets import load_dataset
import evaluate
import numpy as np
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer

## Loading the dataset

In [3]:
dataset = load_dataset("ComponentSoft/k8s-kubectl")

Downloading readme:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/204k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/77.5k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/3161 [00:00<?, ? examples/s]

Generating validate split:   0%|          | 0/316 [00:00<?, ? examples/s]

In [4]:
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Downloading (…)lve/main/config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [5]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [6]:
def preprocess_function(examples):
    """tokenize the text, and set the labels"""

    inputs = [doc for doc in examples['question']]
    inputs = tokenizer(inputs, max_length=256, truncation=True, padding='max_length', return_tensors='pt')

    labels = tokenizer(text_target=examples['code'], max_length=256, truncation=True, padding='max_length', return_tensors='pt')

    return {
        'input_ids': inputs.input_ids,
        'attention_mask': inputs.attention_mask,
        'labels': labels.input_ids,
    }


In [7]:
# Map the preprocessing function across our dataset
tokenized_dataset_train = dataset['train'].map(preprocess_function, batched=True)
tokenized_dataset_eval = dataset['validate'].map(preprocess_function, batched=True)

Map:   0%|          | 0/3161 [00:00<?, ? examples/s]

Map:   0%|          | 0/316 [00:00<?, ? examples/s]

## Set up training args and start training

In [8]:
# Set up training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./flan-t5-base-k8s-question-code",
    evaluation_strategy="steps",
    learning_rate=5e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=1,
    weight_decay=0.01,
    num_train_epochs=5,
    push_to_hub=False
)

# Set up trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset_train,
    eval_dataset=tokenized_dataset_eval,
    data_collator=None
)

# Train the model
trainer.train()

Step,Training Loss,Validation Loss
500,0.561,0.035498
1000,0.0315,0.010901
1500,0.0157,0.006889
2000,0.0079,0.005116
2500,0.0057,0.003869
3000,0.0035,0.00348
3500,0.0024,0.003303


TrainOutput(global_step=3955, training_loss=0.07957126126729433, metrics={'train_runtime': 772.1439, 'train_samples_per_second': 20.469, 'train_steps_per_second': 5.122, 'total_flos': 5411294977720320.0, 'train_loss': 0.07957126126729433, 'epoch': 5.0})

In [17]:
model.push_to_hub('ComponentSoft/flan-t5-base-k8s-question-code')

CommitInfo(commit_url='https://huggingface.co/ComponentSoft/flan-t5-base-k8s-question-code/commit/2c4266f9c35b02bac6959c878e1a405de94727f1', commit_message='Upload T5ForConditionalGeneration', commit_description='', oid='2c4266f9c35b02bac6959c878e1a405de94727f1', pr_url=None, pr_revision=None, pr_num=None)

In [14]:
tokenizer.push_to_hub('ComponentSoft/flan-t5-base-k8s-question-code')

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ComponentSoft/flan-t5-base-k8s-question-code/commit/456dd1e0f3c9249a39cb3fd06fc1c05bdf134f64', commit_message='Upload tokenizer', commit_description='', oid='456dd1e0f3c9249a39cb3fd06fc1c05bdf134f64', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Define your text input
input_text = "List all  pods in Kubernetes"

# Tokenize input text
input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True).to('cuda')
eval_model.to('cuda')

# Generate text based on the input
output_ids = eval_model.generate(input_ids, max_length=100, num_return_sequences=1, num_beams=8)

# Decode the generated output
generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Print the generated text
print(generated_text)


kubectl get pods
