Importing Libraries

In [16]:
import torch
from datasets import load_dataset
from transformers import (
    RobertaTokenizer,
    RobertaForSequenceClassification,
    Trainer,
    TrainingArguments
)
from peft import PeftModel, PeftConfig
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import os

Global Variables

In [17]:
# Hugging Face model id of the backbone; used for the base classifier and as
# the fallback tokenizer source inside evaluate_adapter.
BASE_MODEL = "microsoft/codebert-base"
# Local directories holding the LoRA adapters to evaluate. Each experiment is
# skipped when its directory does not exist.
BIGVUL_ADAPTER_PATH = "./model-bigvul-lora"
JULIET_ADAPTER_PATH = "./model-juliet-lora"
# Passed to TrainingArguments as per_device_eval_batch_size.
BATCH_SIZE = 32

Helper Functions

In [18]:
def compute_metrics(pred):
    """Compute binary-classification metrics from a Trainer prediction object.

    Args:
        pred: Object exposing ``label_ids`` (ground-truth labels) and
            ``predictions`` (per-class scores). The predicted class is the
            arg-max over the last axis of ``predictions``.

    Returns:
        dict: ``accuracy``, ``f1``, ``precision`` and ``recall`` computed with
        ``average='binary'``.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # zero_division=0 makes the "no positive predictions / no positive labels"
    # case explicit: the affected metric is reported as 0 without emitting
    # sklearn's UndefinedMetricWarning (same value as the default, minus the
    # warning noise in the notebook output).
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='binary', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [19]:
def evaluate_adapter(adapter_path, dataset, dataset_name):
    """Evaluate a LoRA adapter (on top of BASE_MODEL) against a labelled dataset.

    Args:
        adapter_path: Directory containing the saved PEFT adapter (and,
            optionally, the tokenizer that was saved alongside it).
        dataset: Dataset with a ``code`` text column and a ``labels`` column.
        dataset_name: Human-readable name used only in progress messages.

    Returns:
        dict: The metrics returned by ``Trainer.predict`` (keys are prefixed
        with ``test_``, e.g. ``test_accuracy``).
    """
    print("\n========================================================")
    print(f" EVALUATING MODEL: {adapter_path}")
    print(f" ON DATASET:       {dataset_name}")
    print("========================================================")

    # 1. Load Tokenizer
    # Prefer the tokenizer saved with the adapter so preprocessing matches training.
    try:
        tokenizer = RobertaTokenizer.from_pretrained(adapter_path)
    except Exception as exc:
        # Best-effort fallback when the adapter directory has no tokenizer files.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate, and the reason is reported instead of hidden.
        print("Warning: Could not load tokenizer from adapter path. Using base tokenizer.")
        print(f"  Reason: {exc!r}")
        tokenizer = RobertaTokenizer.from_pretrained(BASE_MODEL)

    # 2. Tokenize Dataset
    print(f"Tokenizing {dataset_name}...")

    def tokenize_function(examples):
        # Fixed-length padding keeps every example the same size, so no
        # padding-aware collator is needed at prediction time.
        return tokenizer(examples["code"], padding="max_length", truncation=True, max_length=512)

    tokenized_dataset = dataset.map(tokenize_function, batched=True)
    # The raw text column is no longer needed once the token ids exist.
    tokenized_dataset = tokenized_dataset.remove_columns(["code"])
    tokenized_dataset.set_format("torch")

    # 3. Load Model (Base + Adapter)
    print("Loading Model...")
    # NOTE(review): BASE_MODEL ships without a classification head, so
    # transformers reports the classifier weights as newly initialised here.
    # The metrics are only meaningful if the adapter checkpoint also restores
    # the trained classifier weights -- confirm how the adapter was saved.
    base_model = RobertaForSequenceClassification.from_pretrained(BASE_MODEL, num_labels=2)
    # Attach the trained LoRA adapter to the base model.
    model = PeftModel.from_pretrained(base_model, adapter_path)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # 4. Run Evaluation
    trainer = Trainer(
        model=model,
        args=TrainingArguments(
            output_dir="./temp_eval_output",
            per_device_eval_batch_size=BATCH_SIZE,
            remove_unused_columns=False
        ),
        compute_metrics=compute_metrics
    )

    print("Running Prediction...")
    result = trainer.predict(tokenized_dataset)

    print("\n--- RESULTS ---")
    metrics = result.metrics
    print(f"Accuracy:  {metrics['test_accuracy']:.4f}")
    print(f"Precision: {metrics['test_precision']:.4f}")
    print(f"Recall:    {metrics['test_recall']:.4f}")
    print(f"F1 Score:  {metrics['test_f1']:.4f}")
    print("--------------------------------------------------------\n")
    return metrics

Load and Structure Datasets

In [20]:
# A. Prepare Big-Vul Test Set
# Load the test split and rename the source/label columns to the names the
# evaluation helper expects ("code" and "labels").
bigvul_dataset = (
    load_dataset("bstee615/bigvul", "default", split="test")
    .rename_column("func_before", "code")
    .rename_column("vul", "labels")
)
# Drop every column other than the two used for evaluation.
bigvul_test = bigvul_dataset.remove_columns(
    [name for name in bigvul_dataset.column_names if name not in ('code', 'labels')]
)

# B. Prepare Juliet Test Set
# Load the raw test split; it is reshaped into (code, labels) rows further
# down by restructure_juliet, which reads its 'good' and 'bad' fields.
print("Loading Juliet Test Set...")
juliet_dataset = load_dataset("LorenzH/juliet_test_suite_c_1_3", "default", split="test")

# Progress message for the restructuring step applied below.
print("Restructuring Juliet Test Set...")
def restructure_juliet(examples):
    """Flatten a batch of good/bad code pairs into labelled single examples.

    Args:
        examples: Batch mapping with parallel ``'good'`` and ``'bad'`` lists.

    Returns:
        dict: ``{"code": [...], "labels": [...]}`` where each non-empty good
        snippet is labelled 0 and each non-empty bad snippet is labelled 1.
        Within a pair the good snippet comes first; falsy entries are skipped.
    """
    codes, labels = [], []
    for pair in zip(examples['good'], examples['bad']):
        # Position in the pair doubles as the label: 0 = good, 1 = bad.
        for label, snippet in enumerate(pair):
            if snippet:
                codes.append(snippet)
                labels.append(label)
    return {"code": codes, "labels": labels}

# Apply restructuring. The batched map may return a different number of rows
# than it receives, so all original columns are dropped in the same call.
original_cols = juliet_dataset.column_names
juliet_test = juliet_dataset.map(
    restructure_juliet,
    batched=True,
    remove_columns=original_cols,
)

Loading Juliet Test Set...
Restructuring Juliet Test Set...


Main Evaluation

In [21]:

# Experiment A: Big-Vul Model -> Juliet Data
# Skipped (with a message) when the adapter directory is absent.
if not os.path.exists(BIGVUL_ADAPTER_PATH):
    print(f"Skipping Big-Vul evaluation: {BIGVUL_ADAPTER_PATH} not found.")
else:
    evaluate_adapter(BIGVUL_ADAPTER_PATH, juliet_test, "Juliet (Synthetic)")

# Experiment B: Juliet Model -> Big-Vul Data
# Same guard as above for the Juliet adapter.
if not os.path.exists(JULIET_ADAPTER_PATH):
    print(f"Skipping Juliet evaluation: {JULIET_ADAPTER_PATH} not found.")
else:
    evaluate_adapter(JULIET_ADAPTER_PATH, bigvul_test, "Big-Vul (Real-World)")


 EVALUATING MODEL: ./model-bigvul-lora
 ON DATASET:       Juliet (Synthetic)
Tokenizing Juliet (Synthetic)...
Loading Model...


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Running Prediction...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])



--- RESULTS ---
Accuracy:  0.5000
Precision: 0.0000
Recall:    0.0000
F1 Score:  0.0000
--------------------------------------------------------


 EVALUATING MODEL: ./model-juliet-lora
 ON DATASET:       Big-Vul (Real-World)
Tokenizing Big-Vul (Real-World)...
Loading Model...


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Running Prediction...



--- RESULTS ---
Accuracy:  0.9421
Precision: 0.0868
Recall:    0.0906
F1 Score:  0.0887
--------------------------------------------------------

