In [None]:
import json
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Identifier of the checkpoint scored in this cell.
model_name = "gokul-pv/Llama-3.2-1B-Instruct-16bit-CodeArchitect"

# Both objects are resolved from the same identifier so that the vocabulary
# used for tokenization matches the weights being evaluated.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

def calculate_perplexity(text):
    """Return the perplexity of ``text`` under the module-level causal LM.

    The text is tokenized with the global ``tokenizer`` and scored by the
    global ``model`` with the input ids reused as labels; the result is the
    exponential of the loss returned by the model.

    Args:
        text: String to score.

    Returns:
        The perplexity as a Python float. ``nan`` is returned when the text
        tokenizes to fewer than two tokens, because a next-token loss needs at
        least one (context, target) pair.
    """
    # Cap the sequence at the model's context window when the config exposes
    # one; with max_length=None the tokenizer falls back to its own limit.
    max_len = getattr(model.config, "max_position_embeddings", None)
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_len)

    # Nothing to predict for an empty / single-token sequence: report nan
    # explicitly instead of running a forward pass with no targets.
    if inputs["input_ids"].shape[1] < 2:
        return float("nan")

    # Keep the tensors on the same device as the model weights.
    inputs = inputs.to(model.device)
    with torch.no_grad():
        outputs = model(**inputs, labels=inputs["input_ids"])
    return torch.exp(outputs.loss).item()

# Read the validation samples. JSON text is UTF-8, so the encoding is given
# explicitly rather than inherited from the platform's locale default.
with open("data/codearchitect_val_data.json", "r", encoding="utf-8") as f:
    data = json.load(f)["ValidationData"]

# Per-sample perplexities, in file order.
perplexities = []

for sample in data:
    sample_id = sample.get("id", "N/A")  # the id is optional and only used for display
    input_code = sample["inputCode"]
    perp = calculate_perplexity(input_code)
    perplexities.append(perp)
    print(f"ID: {sample_id}, Perplexity: {perp:.4f}")

# Report the arithmetic mean of the per-sample perplexities
# (each sample weighs the same regardless of its token count).
if perplexities:
    avg_perplexity = sum(perplexities) / len(perplexities)
    print(f"\nAverage Perplexity: {avg_perplexity:.4f}")
else:
    print("No validation samples found.")


ID: 62, Perplexity: 7.1007
ID: 63, Perplexity: 12.7885
ID: 64, Perplexity: 11.0817
ID: 65, Perplexity: 3.9705
ID: 66, Perplexity: 4.7665

Average Perplexity: 7.9416


In [None]:
import torch
import json

from transformers import AutoModelForCausalLM, AutoTokenizer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from bert_score import score as bert_score_fn

# Load the model and tokenizer
model_name = "gokul-pv/Llama-3.2-1B-Instruct-16bit-CodeArchitect"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the validation data from a JSON file. The encoding is stated
# explicitly so non-ASCII characters in the samples decode the same way on
# every platform instead of depending on the locale default.
with open("data/codearchitect_val_data.json", "r", encoding="utf-8") as f:
    data = json.load(f)["ValidationData"]

def evaluate_model(input_code, max_new_tokens=150, temperature=0.7):
    """Sample a completion for ``input_code`` from the module-level model.

    Args:
        input_code: Prompt text passed to the global ``tokenizer``.
        max_new_tokens: Upper bound on the number of tokens to generate.
        temperature: Sampling temperature forwarded to ``model.generate``.

    Returns:
        The decoded text of the newly generated tokens only (the prompt is not
        echoed back), with special tokens removed.
    """
    # Tokenize the prompt and keep the tensors on the model's device.
    inputs = tokenizer(input_code, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    # Generate output tokens
    with torch.no_grad():
        output_tokens = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,  # Avoid warnings in case the model doesn't have a pad_token_id
        )

    # For a causal LM, generate() returns the prompt ids followed by the
    # continuation. Decode only the continuation so the prediction can be
    # compared against the reference without the input code mixed in.
    generated_tokens = output_tokens[0][prompt_length:]
    prediction = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    return prediction


# Prepare lists to store metric scores for each sample
bleu_scores = []
rouge1_scores = []
rougeL_scores = []

# Instantiate a ROUGE scorer (using stemmer for normalization)
rouge_scorer_inst = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)

# Use smoothing for BLEU to handle cases with very few tokens
smoothing_fn = SmoothingFunction().method1

# Store predictions and references for BERTScore calculation
predictions = []
references = []

for sample in data:
    input_code = sample["inputCode"]
    expected_output = sample["outputText"]

    # Get model prediction
    predicted_output = evaluate_model(input_code)

    # Append for BERTScore evaluation later
    predictions.append(predicted_output)
    references.append(expected_output)

    # Calculate BLEU score (tokenize by splitting on whitespace)
    reference_tokens = expected_output.split()
    candidate_tokens = predicted_output.split()
    bleu = sentence_bleu(
        [reference_tokens],
        candidate_tokens,
        smoothing_function=smoothing_fn
    )
    bleu_scores.append(bleu)

    # Calculate ROUGE scores (reference first, prediction second)
    rouge_scores = rouge_scorer_inst.score(expected_output, predicted_output)
    rouge1_scores.append(rouge_scores['rouge1'].fmeasure)
    rougeL_scores.append(rouge_scores['rougeL'].fmeasure)

# Calculate average BLEU and ROUGE scores across the validation set
avg_bleu = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0
avg_rouge1 = sum(rouge1_scores) / len(rouge1_scores) if rouge1_scores else 0
avg_rougeL = sum(rougeL_scores) / len(rougeL_scores) if rougeL_scores else 0

# Calculate BERTScore (F1 score) for the entire corpus.
# Guarded like the averages above: with no samples the scorer is skipped and
# the average falls back to 0 instead of failing on empty input lists.
if predictions:
    P, R, F1 = bert_score_fn(predictions, references, lang="en", model_type="bert-base-uncased")
    avg_bert = F1.mean().item()
else:
    avg_bert = 0

# Print out the results with the metric ranges and interpretation
print(f"Average BLEU Score: {avg_bleu:.4f} (Range: 0 to 1, higher is better)")
print(f"Average ROUGE-1 F1 Score: {avg_rouge1:.4f} (Range: 0 to 1, higher is better)")
print(f"Average ROUGE-L F1 Score: {avg_rougeL:.4f} (Range: 0 to 1, higher is better)")
print(f"Average BERTScore F1: {avg_bert:.4f} (Range: 0 to 1, higher is better)")


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Average BLEU Score: 0.0040 (Range: 0 to 1, higher is better)
Average ROUGE-1 F1 Score: 0.1536 (Range: 0 to 1, higher is better)
Average ROUGE-L F1 Score: 0.1025 (Range: 0 to 1, higher is better)
Average BERTScore F1: 0.4911 (Range: 0 to 1, higher is better)
