## Import required libraries and packages

In [1]:
import evaluate
import yaml
import nltk
nltk.download('punkt_tab', quiet=True)
import pathlib
import numpy as np
from tqdm import tqdm

In [2]:
# Root folder of the evaluation artefacts. It is a relative path, so it is
# resolved against the kernel's current working directory.
OUTPUT_PATH = "../output_dir"
# Model identifier; it is interpolated into the path
# f"{OUTPUT_PATH}/{model_name}/gpt_responses/" when the predictions are listed.
model_name = "gpt-4o-mini"

## Helper functions

In [3]:
def create_path_generator(file_path):
    """Return a lazy iterator over everything located beneath ``file_path``.

    The search is recursive and matches entries of any kind (files and
    directories alike). Entries are yielded in whatever order the filesystem
    reports them, so callers that need a stable order must sort the result.

    Args:
        file_path: Directory to search, as a string or path-like object.

    Returns:
        A generator of ``pathlib.Path`` objects.
    """
    search_root = pathlib.Path(str(file_path))
    # rglob("*") is the recursive form of glob: same matches as glob("**/*").
    return search_root.rglob("*")

In [4]:
def read_yaml_file(file_path, content_type):
    """Load a YAML file and return one of its entries re-serialised as YAML text.

    Args:
        file_path: Path of the YAML file to open.
        content_type: Key looked up in the loaded document.

    Returns:
        str: ``yaml.dump`` output for the value stored under ``content_type``.

    Raises:
        KeyError: If the loaded mapping has no ``content_type`` key.

    NOTE(review): ``yaml.Loader`` is the full loader and can construct
    arbitrary Python objects; if these files are not fully trusted, consider
    switching to ``yaml.safe_load``.
    NOTE(review): the file is opened with the platform's default text
    encoding -- confirm this matches how the files were written.
    """
    with open(file_path) as yaml_stream:
        parsed_document = yaml.load(yaml_stream, Loader=yaml.Loader)
    selected_section = parsed_document[content_type]
    return yaml.dump(selected_section)

In [5]:
# Collect the reference ("correct") response entries as plain strings.
path_generator = create_path_generator(file_path=f"{OUTPUT_PATH}/intermediate_responses/")
# NOTE(review): the first entry yielded by the glob is discarded here. Glob
# order is not guaranteed, so which entry gets dropped depends on the
# filesystem -- confirm what is meant to be skipped.
correct_response_file_paths = list(map(str, path_generator))[1:]

In [6]:
# Collect the model's response entries as plain strings.
path_generator = create_path_generator(
    file_path=f"{OUTPUT_PATH}/{model_name}/gpt_responses/"
)
gpt_responses_file_paths = list(map(str, path_generator))

In [7]:
# Load the four text-overlap metrics used by the scoring loop, in the same
# order as before: BLEU, Google-BLEU (GLEU), ROUGE and METEOR.
# The recorded cell output shows NLTK resources being downloaded at this point.
bleu, google_bleu, rouge, meteor = (
    evaluate.load(metric_id)
    for metric_id in ("bleu", "google_bleu", "rouge", "meteor")
)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Amilas_Windows_VM\AppData\Roaming\nltk_data..
[nltk_data]     .
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Amilas_Windows_VM\AppData\Roaming\nltk_data..
[nltk_data]     .
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Amilas_Windows_VM\AppData\Roaming\nltk_data..
[nltk_data]     .
[nltk_data]   Package omw-1.4 is already up-to-date!


In [8]:
def _extract_instruction_steps(file_path, content_type):
    """Load one YAML entry and split it into cleaned per-step strings.

    The entry is serialised by ``read_yaml_file``; single and double quotes
    are removed and the text is split on the literal marker ``"- Step"``.
    Empty segments are dropped, and each remaining segment is stripped and
    has its newlines removed.

    Args:
        file_path: Path of the YAML file to read.
        content_type: Key of the entry to extract from the loaded document.

    Returns:
        list[str]: The cleaned segments, in document order.
    """
    raw_text = read_yaml_file(file_path=file_path, content_type=content_type)
    segments = raw_text.replace("'", "").replace('"', "").split("- Step")
    return [
        segment.strip().replace("\n", "") for segment in segments if segment != ""
    ]


# Globbing yields entries in arbitrary, filesystem-dependent order. Sorting
# both lists makes the index-based pairing below deterministic.
# NOTE(review): this assumes matching reference/prediction files sort into the
# same relative position in their respective folders -- confirm the naming.
ordered_correct_paths = sorted(correct_response_file_paths)
ordered_predicted_paths = sorted(gpt_responses_file_paths)

bleu_scores = []
gleu_scores = []
rouge_scores = []
meteor_scores = []
# Only the first 10 (reference, prediction) pairs are scored.
for test_index in tqdm(range(10)):
    correct_file_path = ordered_correct_paths[test_index]
    predicted_file_path = ordered_predicted_paths[test_index]

    # Same parsing for both sides so the texts are directly comparable.
    correct_steps = _extract_instruction_steps(
        file_path=correct_file_path, content_type="intermediate response"
    )
    predicted_steps = _extract_instruction_steps(
        file_path=predicted_file_path, content_type="gpt_response"
    )

    # Compare step counts BEFORE the steps are collapsed into one string;
    # after joining, both lists always have length 1 and the check can never
    # fire.
    if len(predicted_steps) != len(correct_steps):
        print(f"Predicted file: {predicted_file_path}")
        print(f"Correct file: {correct_file_path}")
        print(f"Predicted: {predicted_steps}")
        print(f"Correct: {correct_steps}")

    # Each metric receives one prediction document and one reference document.
    predicted_instructions = ["\n".join(predicted_steps)]
    correct_instructions = ["\n".join(correct_steps)]

    # BLEU
    bleu_results = bleu.compute(
        predictions=predicted_instructions, references=correct_instructions
    )
    bleu_scores.append(bleu_results["bleu"])

    # GLEU
    google_bleu_results = google_bleu.compute(
        predictions=predicted_instructions, references=correct_instructions
    )
    gleu_scores.append(google_bleu_results["google_bleu"])

    # Rouge-L
    rouge_results = rouge.compute(
        predictions=predicted_instructions, references=correct_instructions
    )
    rouge_scores.append(rouge_results["rougeL"])

    # METEOR
    meteor_results = meteor.compute(
        predictions=predicted_instructions, references=correct_instructions
    )
    meteor_scores.append(meteor_results["meteor"])

100%|██████████| 10/10 [00:09<00:00,  1.09it/s]


In [9]:
# Report every metric as "mean +/- standard deviation" over the scored test
# cases (np.std with its default ddof=0, i.e. the population deviation).
for metric_label, metric_values in (
    ("BLEU", bleu_scores),
    ("GLEU", gleu_scores),
    ("ROUGE-L", rouge_scores),
    ("METEOR", meteor_scores),
):
    print(
        f"{metric_label}: {np.mean(metric_values):.3f} +/- {np.std(metric_values):.3f}"
    )

BLEU: 0.236 +/- 0.176
GLEU: 0.304 +/- 0.142
ROUGE-L: 0.551 +/- 0.112
METEOR: 0.637 +/- 0.165
