<a href="https://colab.research.google.com/github/tmskss/Flan-T5-K8S-QA/blob/main/flan_t5_k8s_cot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# This notebook is used to fine-tune flan-t5-base model on k8s question-command pairs gathered from k8s documentation with Chain of (CoT)

In [2]:
!pip install transformers[torch] tokenizers datasets evaluate rouge_score sentencepiece huggingface_hub --upgrade

Collecting transformers[torch]
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m57.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m106.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m52.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collectin

In [3]:
import nltk
from datasets import load_dataset
import evaluate
import numpy as np
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer

## Loading the dataset

In [4]:
dataset = load_dataset("Kristofy/k8s-kubectl")

Downloading readme:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/204k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/77.5k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/3161 [00:00<?, ? examples/s]

Generating validate split:   0%|          | 0/316 [00:00<?, ? examples/s]

In [5]:
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Downloading (…)lve/main/config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [6]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [26]:
def preprocess_function(examples):
    """tokenize the text, and set the labels"""

    inputs = [doc + "\nLet's think step by step.\n" for doc in examples['question']]
    inputs = tokenizer(inputs, max_length=512, truncation=True, padding='max_length', return_tensors='pt')

    labels = []
    for i in range(0, len(examples['question'])):
      labels.append('#Steps:\n' + examples['cot'][i] + '\n#Code:\n' + examples['code'][i])

    labels = tokenizer(text_target=labels, max_length=512, truncation=True, padding='max_length', return_tensors='pt')

    return {
        'input_ids': inputs.input_ids,
        'attention_mask': inputs.attention_mask,
        'labels': labels.input_ids,
    }


In [27]:
# Map the preprocessing function across our dataset
tokenized_dataset_train = dataset['train'].map(preprocess_function, batched=True)
tokenized_dataset_eval = dataset['validate'].map(preprocess_function, batched=True)

Map:   0%|          | 0/3161 [00:00<?, ? examples/s]

Map:   0%|          | 0/316 [00:00<?, ? examples/s]

## Set up training args and start training

In [31]:
# Set up training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./flan-t5-base-k8s-question-code-cot",
    evaluation_strategy="steps",
    learning_rate=5e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    num_train_epochs=5,
    push_to_hub=True
)

# Set up trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset_train,
    eval_dataset=tokenized_dataset_eval,
    data_collator=None
)

# Train the model
trainer.train()

Step,Training Loss,Validation Loss
500,0.1495,0.029869
1000,0.0388,0.008621
1500,0.0176,0.004847


TrainOutput(global_step=1980, training_loss=0.0544975555304325, metrics={'train_runtime': 1150.9677, 'train_samples_per_second': 13.732, 'train_steps_per_second': 1.72, 'total_flos': 1.082258995544064e+16, 'train_loss': 0.0544975555304325, 'epoch': 5.0})

In [32]:
trainer.push_to_hub("first finetune")

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

'https://huggingface.co/tmskss/flan-t5-base-k8s-question-code-cot/tree/main/'

In [33]:
from transformers import AutoModelForSeq2SeqLM

eval_model = AutoModelForSeq2SeqLM.from_pretrained("tmskss/flan-t5-base-k8s-question-code-cot")

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.56k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

In [36]:
# Define your text input
input_text = "Initialize a cluster role named 'pod-reader' with ResourceName specified\nLet's think step by step"

# Tokenize input text
input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True).to('cuda')
eval_model.to('cuda')

# Generate text based on the input
output_ids = eval_model.generate(input_ids, max_length=512, num_return_sequences=1, num_beams=4)

# Decode the generated output
generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Print the generated text
print(generated_text)


#Steps: 1) Use the 'create' subcommand to create a resource 2) Set the resource type to 'clusterrole' and provide the name 'pod-reader' 3) Specify the'verb' flag as 'get' to allow the role to have permissions to only perform 'get' operations 4) Specify the'resource' flag as 'pods' to limit the role's permissions to only pods 5) Use the'resource-name' flag twice to specify the specific resource names as'readablepod' and 'anotherpod' #Code: kubectl create clusterrole pod-reader --verb=get --resource=pods --resource-name=readablepod --resource-name=anotherpod
