<a href="https://colab.research.google.com/github/tmskss/Flan-T5-K8S-QA/blob/main/notebooks/flan_t5_k8s_cot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# This notebook fine-tunes the flan-t5-base model with Chain of Thought (CoT) prompting on Kubernetes (k8s) question-command pairs gathered from the k8s documentation

In [None]:
!pip install transformers[torch] tokenizers datasets evaluate rouge_score sentencepiece huggingface_hub --upgrade

In [None]:
import nltk
from datasets import load_dataset
import evaluate
import numpy as np
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer

## Loading the dataset

In [None]:
# Hub dataset used for fine-tuning; later cells read its 'train' and 'validate'
# splits and the 'question', 'cot' and 'code' columns.
DATASET_ID = 'ComponentSoft/k8s-kubectl'
dataset = load_dataset(DATASET_ID)

In [None]:
# Load the tokenizer and the model from the same pretrained checkpoint.
BASE_CHECKPOINT = 'google/flan-t5-base'
tokenizer = T5Tokenizer.from_pretrained(BASE_CHECKPOINT)
model = T5ForConditionalGeneration.from_pretrained(BASE_CHECKPOINT)

In [None]:
# Log in to the Hugging Face Hub interactively so that the uploads performed
# further down (trainer push_to_hub=True and the explicit push_to_hub calls)
# can authenticate.
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of questions and build the CoT + command training labels.

    Each question is suffixed with the "Let's think step by step." prompt and
    each target is assembled as ``#Steps:\\n<cot>\\n#Code:\\n<code>``. Uses the
    notebook-level ``tokenizer``.

    Args:
        examples: batch dict as passed by ``Dataset.map(..., batched=True)``;
            its 'question', 'cot' and 'code' entries are read as parallel lists.

    Returns:
        dict with 'input_ids', 'attention_mask' and 'labels'. Every sequence is
        truncated/padded to 512 tokens. Padding positions in 'labels' are set
        to -100 so that they are ignored by the loss.
    """
    prompts = [question + "\nLet's think step by step.\n" for question in examples['question']]
    # Plain Python lists are enough here: Dataset.map stores the result in Arrow
    # anyway, so requesting framework tensors is unnecessary.
    model_inputs = tokenizer(prompts, max_length=512, truncation=True, padding='max_length')

    targets = [
        '#Steps:\n' + cot + '\n#Code:\n' + code
        for cot, code in zip(examples['cot'], examples['code'])
    ]
    tokenized_targets = tokenizer(text_target=targets, max_length=512, truncation=True, padding='max_length')

    # Without this masking the model would be trained to predict the padding
    # tokens, which dominate the fixed-length label sequences.
    pad_token_id = tokenizer.pad_token_id
    labels = [
        [token_id if token_id != pad_token_id else -100 for token_id in sequence]
        for sequence in tokenized_targets['input_ids']
    ]

    return {
        'input_ids': model_inputs['input_ids'],
        'attention_mask': model_inputs['attention_mask'],
        'labels': labels,
    }


In [None]:
# Tokenize the training and validation splits with the same batched preprocessing function
tokenized_dataset_train, tokenized_dataset_eval = (
    dataset[split_name].map(preprocess_function, batched=True)
    for split_name in ('train', 'validate')
)

## Set up training args and start training

In [None]:
# Set up training arguments
from dataclasses import fields

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`.
# The install cell upgrades to an unpinned version, so pick whichever keyword
# the installed Seq2SeqTrainingArguments actually declares.
eval_strategy_key = (
    "eval_strategy"
    if any(f.name == "eval_strategy" for f in fields(Seq2SeqTrainingArguments))
    else "evaluation_strategy"
)

training_args = Seq2SeqTrainingArguments(
    output_dir="./flan-t5-base-k8s-question-code-cot",
    learning_rate=5e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    num_train_epochs=5,
    push_to_hub=True,
    **{eval_strategy_key: "steps"},  # evaluate periodically during training
)

# Set up trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset_train,
    eval_dataset=tokenized_dataset_eval,
    # None selects the Trainer's default collator; the examples were already
    # padded to a fixed length during preprocessing.
    data_collator=None
)

# Train the model
trainer.train()

In [None]:
# Upload the model (trained in place by the trainer above) to this Hub repository.
model.push_to_hub('ComponentSoft/flan-t5-base-k8s-question-code-cot')

In [None]:
# Upload the tokenizer to the same Hub repository id that the model is pushed to.
tokenizer.push_to_hub('ComponentSoft/flan-t5-base-k8s-question-code-cot')