In [1]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer

from data_parser import load_dataset

In [2]:
model_id = "microsoft/phi-1_5"

# Load the Phi-1.5 causal language model and its matching tokenizer.
phi_model = AutoModelForCausalLM.from_pretrained(model_id)
phi_tokenizer = AutoTokenizer.from_pretrained(model_id)

# Reuse the end-of-sequence token as the padding token so prompts of different
# lengths can be padded into a single batch.
phi_tokenizer.pad_token = phi_tokenizer.eos_token
# A decoder-only model continues from the last position of every row. With
# right-side padding that position is a pad token for the shorter prompts, which
# corrupts batched generation; left-side padding keeps the real prompt adjacent
# to the generated tokens.
phi_tokenizer.padding_side = "left"

# NLTK BLEU smoothing (method1), passed to sentence_bleu during evaluation so
# that hypotheses with no higher-order n-gram overlap do not collapse to zero.
smoothing_function = SmoothingFunction().method1

In [3]:
# Relative path to the CodeXGLUE method-generation test split (a .jsonl file).
codexglue_data_path = "../data/CodeXGLUE/codexglue_method_generation/test.jsonl"
# Batch size handed to the loader.
batch_size = 8
# load_dataset is the project-local helper imported from data_parser; the
# evaluation loop iterates over its result and reads batch[0] / batch[1].
# NOTE(review): the name suggests a one-shot generator -- if so, this cell must
# be re-run before the evaluation loop is re-run; confirm against data_parser.
tasks_generator = load_dataset(codexglue_data_path, batch_size)

In [4]:
# Evaluate the model on every batch and report the mean sentence-level BLEU.
#
# Token budgets are kept separate: the prompt is truncated to max_prompt_tokens,
# and generation may add up to max_new_tokens on top of it. (generate's
# `max_length` counts prompt tokens too, so a long prompt would leave no room
# for any completion.)
max_prompt_tokens = 500
max_new_tokens = 400

bleu_scores = []
for batch in tqdm(tasks_generator):
    # Each batch is indexed positionally: prompts first, reference texts second.
    instructions, targets = batch[0], batch[1]

    model_inputs = phi_tokenizer(
        instructions,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_prompt_tokens,
    ).to(phi_model.device)  # keep inputs on the same device as the model

    model_outputs = phi_model.generate(
        **model_inputs,
        max_new_tokens=max_new_tokens,
        # Explicit pad id: the tokenizer's pad token is not part of the model's
        # generation config, and generate needs it to pad finished sequences.
        pad_token_id=phi_tokenizer.pad_token_id,
    )

    # A causal LM returns prompt + continuation. Drop the (padded) prompt columns
    # so that BLEU is computed on the generated text only.
    prompt_width = model_inputs["input_ids"].shape[1]
    completion_ids = model_outputs[:, prompt_width:]
    output_texts = phi_tokenizer.batch_decode(completion_ids, skip_special_tokens=True)

    for output_text, target in zip(output_texts, targets):
        # Whitespace tokenisation for both hypothesis and reference.
        output_tokens = output_text.split()
        target_tokens = target.split()

        bleu_score = sentence_bleu([target_tokens], output_tokens, smoothing_function=smoothing_function)
        bleu_scores.append(bleu_score)

if bleu_scores:
    avg_bleu_score = sum(bleu_scores) / len(bleu_scores)
    print(f"Average BLEU score: {avg_bleu_score}")
else:
    # Nothing was scored (empty dataset or an already-consumed generator);
    # avoid a ZeroDivisionError and make the situation visible.
    avg_bleu_score = None
    print("Average BLEU score: undefined (no samples were evaluated)")

2it [08:09, 244.64s/it]


KeyboardInterrupt: 