<a href="https://colab.research.google.com/github/tmskss/Flan-T5-K8S-QA/blob/main/flan_t5_k8s.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# This notebook is used to fine-tune flan-t5-base model on k8s question-answer pairs gathered from stackoverflow

## Installing base requirements

In [None]:
!pip install transformers[torch] tokenizers datasets evaluate rouge_score sentencepiece huggingface_hub --upgrade

In [2]:
import nltk
from datasets import load_dataset
import evaluate
import numpy as np
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer

## Loading the dataset

In [3]:
dataset = load_dataset("tmskss/k8s-stackoverflow-qa", split='train[:15%]')
dataset = dataset.train_test_split(test_size=0.2)

Downloading readme:   0%|          | 0.00/215 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/91.3M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [20]:
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")

In [5]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [21]:
prefix = "answer the question: "

def preprocess_function(examples):
    """Add prefix to the sentences, tokenize the text, and set the labels"""

    inputs = [prefix + doc for doc in examples['instruction']]
    inputs = tokenizer(inputs, max_length=512, truncation=True, padding='longest', return_tensors='pt')

    labels = tokenizer(text_target=examples['output'], max_length=512, truncation=True, padding='longest', return_tensors='pt')

    return {
        'input_ids': inputs.input_ids,
        'attention_mask': inputs.attention_mask,
        'labels': labels.input_ids,
    }


In [None]:
# Map the preprocessing function across our dataset
tokenized_dataset = dict()
tokenized_dataset['train'] = dataset['train'].map(preprocess_function, batched=True)
tokenized_dataset['test'] = dataset['test'].map(preprocess_function, batched=True)

## Testing script on a possibly correct dataset

In [25]:
dataset = load_dataset("truthful_qa", "generation")

In [28]:
prefix = "answer the question: "

def preprocess_function(examples):
    """Add prefix to the sentences, tokenize the text, and set the labels"""

    inputs = [prefix + doc for doc in examples['question']]
    inputs = tokenizer(inputs, max_length=512, truncation=True, padding='longest', return_tensors='pt')

    labels = tokenizer(text_target=examples['best_answer'], max_length=512, truncation=True, padding='longest', return_tensors='pt')

    return {
        'input_ids': inputs.input_ids,
        'attention_mask': inputs.attention_mask,
        'labels': labels.input_ids,
    }


In [29]:
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/817 [00:00<?, ? examples/s]

In [33]:
print(tokenized_dataset)

DatasetDict({
    validation: Dataset({
        features: ['type', 'category', 'question', 'best_answer', 'correct_answers', 'incorrect_answers', 'source', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 817
    })
})


In [36]:
# Set up training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./flan-t5-base-dummy-test",
    evaluation_strategy="epoch",
    learning_rate=3e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=2,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=5,
    predict_with_generate=True,
    tf32=True
)

# Set up trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['validation'],
    eval_dataset=tokenized_dataset['validation'],
    data_collator=None
)

# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,0.185867
2,No log,0.09692
3,0.252500,0.051598
4,0.252500,0.032065
5,0.114200,0.022737


TrainOutput(global_step=1025, training_loss=0.1813702218125506, metrics={'train_runtime': 205.4518, 'train_samples_per_second': 19.883, 'train_steps_per_second': 4.989, 'total_flos': 382434306508800.0, 'train_loss': 0.1813702218125506, 'epoch': 5.0})

In [40]:
# Define your text input
input_text = "answer the question: What is the spiciest part of a chili pepper?"

# Tokenize input text
input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True).to('cuda')

# Generate text based on the input
output_ids = model.generate(input_ids, max_length=100, num_return_sequences=1, num_beams=4)

# Decode the generated output
generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Print the generated text
print(generated_text)

The spiciest part of a chili pepper is the placenta


## Set up training args and start training

In [12]:
# Set up training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./flan-t5-large-k8s-stackoverflow-qa",
    evaluation_strategy="epoch",
    learning_rate=3e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=2,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=2,
    predict_with_generate=True,
    tf32=True,
    push_to_hub=True
)

# Set up trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=None
)

# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss
1,1.0364,0.954494
2,0.9407,0.932356


TrainOutput(global_step=2112, training_loss=1.0895198439106797, metrics={'train_runtime': 381.2719, 'train_samples_per_second': 22.152, 'train_steps_per_second': 5.539, 'total_flos': 5783460598775808.0, 'train_loss': 1.0895198439106797, 'epoch': 2.0})

In [13]:
trainer.push_to_hub("second finetune")

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

'https://huggingface.co/tmskss/flan-t5-large-k8s-stackoverflow-qa/tree/main/'

In [14]:
from transformers import AutoModelForSeq2SeqLM

eval_model = AutoModelForSeq2SeqLM.from_pretrained("tmskss/flan-t5-large-k8s-stackoverflow-qa")

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.56k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

In [30]:
# Define your text input
input_text = "answer the question: What happens to you if you eat watermelon seeds?"

# Tokenize input text
input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)

# Generate text based on the input
output_ids = eval_model.generate(input_ids, max_length=100, num_return_sequences=1, num_beams=4)

# Decode the generated output
generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Print the generated text
print(generated_text)




You will be able to ingest the watermelon in a very small amount of time. If you do not ingest the watermelon in a very small amount of time, you will be able to ingest the watermelon in a very small amount of time. If you do not ingest the watermelon in a very small amount of time, you will be able to ingest the watermelon in
