Import Libraries

In [5]:
from datasets import load_dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
import torch

Load Datasets and Model

In [2]:
# --- 1. LOAD DATASETS ---
# Both corpora are fetched with `load_dataset` using their "default" configuration.
# The status messages say "from cache", but no cache-only option is passed here.
print("Loading Big-Vul from cache...")
bigvul_dataset = load_dataset(path="bstee615/bigvul", name="default")

print("Loading Juliet from cache...")
juliet_dataset = load_dataset(path="LorenzH/juliet_test_suite_c_1_3", name="default")

# --- 2. LOAD TOKENIZER AND MODEL ---
model_name = "microsoft/codebert-base"

print(f"\nLoading tokenizer for '{model_name}'...")
# The RoBERTa-specific tokenizer class is used explicitly (not AutoTokenizer).
tokenizer = RobertaTokenizer.from_pretrained(model_name)

print(f"Loading model '{model_name}' for sequence classification...")
# The checkpoint has no classification head, so the `classifier.*` weights are
# randomly initialised at load time. Seed torch first so that initialisation
# (and therefore the fine-tuning result) is repeatable across kernel restarts.
torch.manual_seed(42)
# The RoBERTa-specific model class is used explicitly (not
# AutoModelForSequenceClassification); num_labels=2 gives a binary classifier head.
model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=2)

print("\n--- Tokenizer and Model Loaded Successfully ---")
print(f"Model type: {type(model)}")

Loading Big-Vul from cache...
Loading Juliet from cache...

Loading tokenizer for 'microsoft/codebert-base'...
Loading model 'microsoft/codebert-base' for sequence classification...


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Tokenizer and Model Loaded Successfully ---
Model type: <class 'transformers.models.roberta.modeling_roberta.RobertaForSequenceClassification'>


In [3]:
# Choose CUDA when torch reports it as available, otherwise fall back to the CPU,
# then place the model on that device.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
model.to(device)
print(f"--- Model loaded and moved to {device} ---")

--- Model loaded and moved to cuda ---


In [None]:
# --- 3. PREPARE BIG-VUL DATASET ---

# The tokenization step reads a 'code' column, so 'func' is renamed to 'code'.
# NOTE(review): assumes the Big-Vul splits expose a 'func' column -- confirm
# against the dataset's actual column names before running.
print("Renaming 'func' column to 'code'...")
# Guard on the target column so re-running this cell is a no-op instead of
# failing because 'func' was already renamed by a previous execution. On a
# first run a missing 'func' column still raises, exactly as before.
if "code" not in bigvul_dataset["train"].column_names:
    bigvul_dataset = bigvul_dataset.rename_column("func", "code")

# Define our tokenization function
def tokenize_function(examples):
    """Tokenize the ``"code"`` entries of a batch with the notebook's tokenizer.

    Args:
        examples: Mapping with a ``"code"`` key holding the source text(s) to
            encode (a batch of rows when used with ``.map(..., batched=True)``).

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts when
        asked to pad to ``max_length`` and truncate at 512 tokens.
    """
    # Fixed-length encoding: every sequence is padded or truncated to 512 tokens.
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 512,
    }
    source_texts = examples["code"]
    return tokenizer(source_texts, **encoding_options)

print("Tokenizing Big-Vul dataset...")
# Each split goes through the same pipeline:
#   1. tokenize in batches (batched=True hands several rows to tokenize_function at once),
#   2. drop the raw 'code' text, which is no longer needed once token IDs exist,
#   3. rename 'label' -> 'labels' for the trainer.
# NOTE(review): assumes each split carries a 'label' column -- confirm against the dataset.
tokenized_bigvul_train = (
    bigvul_dataset["train"]
    .map(tokenize_function, batched=True)
    .remove_columns(["code"])
    .rename_column("label", "labels")
)
tokenized_bigvul_val = (
    bigvul_dataset["validation"]
    .map(tokenize_function, batched=True)
    .remove_columns(["code"])
    .rename_column("label", "labels")
)

# Have both splits hand back torch tensors.
for tokenized_split in (tokenized_bigvul_train, tokenized_bigvul_val):
    tokenized_split.set_format("torch")

print("--- Tokenization Complete ---")


# --- 4. SET UP AND RUN THE TRAINER ---

# Training configuration for the Big-Vul fine-tuning run.
# Evaluation and checkpointing share the same 500-step cadence, so every saved
# checkpoint has a matching evaluation result for `load_best_model_at_end`.
# NOTE(review): newer transformers releases may expect `eval_strategy` in place
# of `evaluation_strategy` -- confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results-bigvul",          # Where checkpoints are written
    num_train_epochs=1,                     # Single pass over the training split
    per_device_train_batch_size=8,          # Batch size for training (adjust based on your GPU memory)
    per_device_eval_batch_size=8,           # Batch size for evaluation
    warmup_steps=500,                       # Number of steps to "warm up" the learning rate
    weight_decay=0.01,                      # Strength of weight decay
    logging_dir="./logs-bigvul",            # Where to store logs
    logging_steps=100,                      # Log metrics every 100 steps
    evaluation_strategy="steps",            # Evaluate during training
    eval_steps=500,                         # Evaluate every 500 steps
    save_strategy="steps",                  # Save the model checkpoint
    save_steps=500,                         # Save every 500 steps
    save_total_limit=2,                     # Cap retained checkpoints so a long run cannot fill the disk
    load_best_model_at_end=True,            # Load the best model at the end
)

# Wire the model, the training configuration and the tokenized Big-Vul splits
# into a Trainer. No metric function is supplied here.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": tokenized_bigvul_train,
    "eval_dataset": tokenized_bigvul_val,
}
trainer = Trainer(**trainer_kwargs)

# Run the fine-tuning loop.
print("--- Starting Fine-Tuning on Big-Vul... ---")
trainer.train()

print("--- Training Complete ---")

# Write the model and its tokenizer side by side into the same directory.
best_model_dir = "./model-trained-on-bigvul"
trainer.save_model(best_model_dir)
tokenizer.save_pretrained(best_model_dir)
print("--- Best model saved to ./model-trained-on-bigvul ---")