<a href="https://colab.research.google.com/github/tmskss/Flan-T5-K8S-QA/blob/main/notebooks/evaluate_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Evaluate a fine-tuned Flan-T5 model with ROUGE metrics on the `ComponentSoft/k8s-kubectl` validation split

In [1]:
!pip install transformers[torch] sentencepiece tokenizers evaluate datasets rouge_score

Collecting transformers[torch]
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m65.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m29.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m56.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[2K   

In [2]:
from datasets import load_dataset

# Fetch every split of the question/command dataset from the Hub ...
k8s_splits = load_dataset('ComponentSoft/k8s-kubectl')

# ... and keep only the held-out 'validate' split for evaluation
dataset = k8s_splits['validate']

Downloading readme:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/204k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/77.5k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/3161 [00:00<?, ? examples/s]

Generating validate split:   0%|          | 0/316 [00:00<?, ? examples/s]

In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Single source of truth for the checkpoint so the tokenizer and the model cannot drift apart
MODEL_NAME = 'ComponentSoft/flan-t5-base-k8s-question-code'

# Prefer the GPU, but fall back to CPU so the notebook still runs on a runtime without CUDA.
# Code that moves tensors should target `model.device` rather than a literal device string.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(DEVICE)

Downloading (…)okenizer_config.json:   0%|          | 0.00/21.0k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/2.63k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading (…)lve/main/config.json:   0%|          | 0.00/1.56k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

In [4]:
import evaluate

# Per-example accumulators: one ROUGE result dict and one formatted report per evaluated item
rouge_scores, outputs = [], []

# ROUGE metric object loaded through the `evaluate` library
rouge = evaluate.load('rouge')

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [5]:
# Start from empty accumulators so re-running this cell never appends duplicate entries
rouge_scores = []
outputs = []

for item in dataset:
  question, reference = item['question'], item['code']

  # Tokenize the question and place it on whichever device the model lives on
  input_ids = tokenizer.encode(question, return_tensors="pt", max_length=512, truncation=True).to(model.device)

  # Generate output with beam search and keep the single best sequence
  output_ids = model.generate(input_ids, max_length=256, num_return_sequences=1, num_beams=4)

  # Decode output
  generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

  # Create rouge metrics for this single prediction/reference pair
  # NOTE(review): the recorded output scores 'kubectl create -f./pod.json' against
  # 'kubectl create -f ./pod.json' as a perfect 1.0, so ROUGE as used here does not penalise
  # whitespace/punctuation differences that can matter in a shell command -- consider
  # reporting an exact-match rate alongside it.
  results = rouge.compute(predictions=[generated_text], references=[reference])
  rouge_scores.append(results)

  # Build the per-item report once, then both print it and keep it for the output file
  report = 'Question: ' + question + '\nGenerated output: ' + generated_text + '\nReference: ' + reference + '\nROUGE scores: ' + str(results) + '\n--------------------------------------------------------------------------'
  print(report)
  outputs.append(report)

Question: Create a pod using the data in pod.json
Generated output: kubectl create -f./pod.json
Reference: kubectl create -f ./pod.json
ROUGE scores: {'rouge1': 1.0, 'rouge2': 1.0, 'rougeL': 1.0, 'rougeLsum': 1.0}
--------------------------------------------------------------------------
Question: Create a pod based on the JSON passed into stdin
Generated output: cat pod.json | kubectl create -f -
Reference: cat pod.json | kubectl create -f -
ROUGE scores: {'rouge1': 1.0, 'rouge2': 1.0, 'rougeL': 1.0, 'rougeLsum': 1.0}
--------------------------------------------------------------------------
Question: Edit the data in docker-registry.yaml in JSON then create the resource using the edited data
Generated output: kubectl create -f docker-registry.yaml --edit -o json
Reference: kubectl create -f docker-registry.yaml --edit -o json
ROUGE scores: {'rouge1': 1.0, 'rouge2': 1.0, 'rougeL': 1.0, 'rougeLsum': 1.0}
--------------------------------------------------------------------------
Questio

In [6]:
# Mean of each ROUGE variant over all evaluated examples, keyed as 'avg_<variant>'
avg_scores = {
    f'avg_{metric}': sum(score[metric] for score in rouge_scores) / len(rouge_scores)
    for metric in ('rouge1', 'rouge2', 'rougeL', 'rougeLsum')
}

print('Average ROUGE scores: ' + str(avg_scores))

Average ROUGE scores: {'avg_rouge1': 0.964986437780824, 'avg_rouge2': 0.9513970308309867, 'avg_rougeL': 0.9636096500116312, 'avg_rougeLsum': 0.9636096500116312}


In [7]:
# Dump output to a txt file: every per-item report followed by the averaged scores
file_name = 'output.txt'

# Explicit UTF-8 so the file is written identically on every platform; without it open()
# uses the locale's default encoding, which may fail on non-ASCII text in questions or outputs.
with open(file_name, 'w', encoding='utf-8') as file:
    # `entry` (not `item`) so this loop does not overwrite the notebook-level `item` variable
    for entry in outputs:
        file.write(f"{entry}\n")

    file.write('Average ROUGE scores: ' + str(avg_scores))