## Import required libraries and packages

In [1]:
import evaluate
import yaml
import nltk
nltk.download('punkt_tab', quiet=True)
import pathlib
import numpy as np

In [2]:
OUTPUT_PATH = "../output_dir"

## Helper functions

In [3]:
def create_path_generator(file_path):
    return pathlib.Path(f"{file_path}").glob("**/*")

In [4]:
def read_yaml_file(file_path, content_type):
    with open(file_path) as file:
        instructions = yaml.load(file, Loader=yaml.Loader)
    return yaml.dump(instructions[content_type])

In [5]:
path_generator = create_path_generator(
    file_path=f"{OUTPUT_PATH}/intermediate_responses/"
)
correct_response_file_paths = [str(path) for path in path_generator]
correct_response_file_paths = correct_response_file_paths[1:]

In [6]:
path_generator = create_path_generator(file_path=f"{OUTPUT_PATH}/gpt_responses/")
gpt_responses_file_paths = [str(path) for path in path_generator]

In [7]:
bleu = evaluate.load("bleu")
google_bleu = evaluate.load("google_bleu")
rouge = evaluate.load("rouge")
meteor = evaluate.load("meteor")

Downloading builder script:   0%|          | 0.00/7.02k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Amilas_Windows_VM\AppData\Roaming\nltk_data..
[nltk_data]     .
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Amilas_Windows_VM\AppData\Roaming\nltk_data..
[nltk_data]     .
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Amilas_Windows_VM\AppData\Roaming\nltk_data..
[nltk_data]     .
[nltk_data]   Package omw-1.4 is already up-to-date!


In [10]:
bleu_scores = []
gleu_scores = []
rouge_scores = []
meteor_scores = []
for test_index in range(len(correct_response_file_paths)):
    correct_file_path = correct_response_file_paths[test_index]
    predicted_file_path = gpt_responses_file_paths[test_index]

    # Correct instructions
    correct_instructions = read_yaml_file(
        file_path=correct_file_path, content_type="intermediate response"
    ).astype(str).split("- Step")
    correct_instructions = [
        instruction.strip().replace("\n", "")
        for instruction in correct_instructions
        if instruction != ""
    ]

    # predicted instructions
    predicted_instructions = read_yaml_file(
        file_path=predicted_file_path, content_type="gpt_response"
    ).astype(str).split("- Step")
    predicted_instructions = [
        instruction.strip().replace("\n", "")
        for instruction in predicted_instructions
        if instruction != ""
    ]
    print(f"Predicted file: {predicted_file_path}")
    print(f"Correct file: {correct_file_path}")
    if len(predicted_instructions) != len(correct_instructions):
        print(f"Predicted: {predicted_instructions}")
        print(f"Correct: {correct_instructions}")

    # BLEU
    bleu_results = bleu.compute(predictions=predicted_instructions, references=correct_instructions)
    bleu_scores.append(bleu_results['bleu'])

    # GLEU
    google_bleu_results = google_bleu.compute(predictions=predicted_instructions, references=correct_instructions)
    gleu_scores.append(google_bleu_results['google_bleu'])

    # Rouge-L
    rouge_results = rouge.compute(predictions=predicted_instructions, references=correct_instructions)
    rouge_scores.append(rouge_results['rougeL'])

    # METEOR
    meteor_results = meteor.compute(predictions=predicted_instructions, references=correct_instructions)
    meteor_results['meteor']

Predicted file: ..\output_dir\gpt_responses\2_Dragging_gpt_response.yaml
Correct file: ..\output_dir\intermediate_responses\2_Dragging.yaml
Predicted file: ..\output_dir\gpt_responses\2_FutureValue_gpt_response.yaml
Correct file: ..\output_dir\intermediate_responses\2_FutureValue.yaml
Predicted file: ..\output_dir\gpt_responses\3_Dragging_gpt_response.yaml
Correct file: ..\output_dir\intermediate_responses\3_Dragging.yaml
Predicted: ['- \'Step 1: Fill down column B from cell B2 to B122 using the formula in B2.\'- \'Step 2: Create a new sheet named "ScatterChart".\'- \'Step 3: Generate a scatter chart in "ScatterChart" based on data from columns A  and B in "Sheet1".\'- \'Step 4: Set the chart title to "Acceleration vs Hanging Mass".\'- \'Step 5: Label the X-axis as "Hanging Mass (m2) (kg)".\'- \'Step 6: Label the Y-axis as "Acceleration (m/s^2)".\'']
Correct: ['1. Fill out the rest of the rows in column B using the formula in B2.', '2. Create a new sheet for the scatter chart.', '3. Cr

ValueError: Mismatch in the number of predictions (1) and references (6)

In [None]:
print(f"BLEU: {np.mean(bleu_scores):.3f} {u'\u00B1'} {np.std(bleu_scores):.3f}")
print(f"GLEU: {np.mean(gleu_scores):.3f} {u'\u00B1'} {np.std(gleu_scores):.3f}")
print(f"ROUGE-L: {np.mean(rouge_scores):.3f} {u'\u00B1'} {np.std(rouge_scores):.3f}")
print(f"METEOR: {np.mean(meteor_scores):.3f} {u'\u00B1'} {np.std(meteor_scores):.3f}")