In [1]:
# All needed imports
from datasets import load_dataset, DatasetDict
from transformers import (
    T5ForConditionalGeneration,
    DataCollatorForSeq2Seq,
    TrainingArguments,
    Trainer,
    RobertaTokenizer,
)
import numpy as np
import evaluate
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the cleaned CSV; the "csv" loader exposes it under a single "train" split.
dataset = load_dataset(
    "csv",
    data_files="Datasets/Clean_HuggingFace.csv",
)

# Hold out 10% of the rows for evaluation. The fixed seed keeps the split
# identical across kernel restarts.
split = dataset["train"].train_test_split(test_size=0.1, seed=42)
train_dataset, test_dataset = split["train"], split["test"]

In [3]:
# Sanity check: show the first training record to confirm the column layout
# ('lang', 'vulnerability', 'question', 'chosen', 'rejected').
print(train_dataset[0])



{'lang': 'c++', 'vulnerability': 'Improper buffer overflow can occur when an array or other data structure is accessed beyond its allocated bounds.', 'question': "Write a c++ code that declares a character array of size 10, then fills it with the character 'A' from index 0 to 15. After filling, add a null character at index 16. Then print out the contents of the buffer.", 'chosen': '```c++\n#include <iostream>\n#include <cstring>\nusing namespace std;\n\nint main() {\n    const int BUFFER_SIZE = 10;\n    char buffer[BUFFER_SIZE + 1]; // +1 for null terminator\n    int i;\n  \n    for(i=0; i<BUFFER_SIZE; i++){\n        buffer[i] = \'A\';\n    }\n  \n    buffer[i] = \'\\0\';\n  \n    cout << "Buffer contents: " << buffer << endl; ready to be used\n  \n    return 0;\n}\n```', 'rejected': '```c++\n#include <iostream>\nusing namespace std;\n\nint main() {\n    char buffer[10];\n    int i;\n  \n    for(i=0; i<=15; i++){\n        buffer[i] = \'A\';\n    }\n  \n    buffer[i] = \'\\0\';\n  \n  

In [4]:
# Tokenizer and model must come from the same checkpoint so that the
# vocabulary and the embedding matrix agree.
# NOTE(review): the "-py" suffix suggests a Python-tuned variant while the
# dataset contains other languages (e.g. c++) — confirm this is intended.
MODEL_CHECKPOINT = "Salesforce/codet5p-220m-py"

tokenizer = RobertaTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = T5ForConditionalGeneration.from_pretrained(MODEL_CHECKPOINT)

  return torch.load(checkpoint_file, map_location="cpu")


In [None]:
def preprocess_dataset(examples, max_input_length=256, max_target_length=256):
    """Tokenize a batch of examples into seq2seq model inputs with labels.

    The encoder input is the vulnerability description and the question,
    separated by a blank line. The target is the secure code stored in the
    ``chosen`` column.

    Args:
        examples: Batch dict (as passed by ``Dataset.map(..., batched=True)``)
            with the list-valued keys ``"vulnerability"``, ``"question"``
            and ``"chosen"``.
        max_input_length: Length the encoder inputs are padded/truncated to.
        max_target_length: Length the target sequences are padded/truncated to.

    Returns:
        The tokenizer output for the inputs (``input_ids``,
        ``attention_mask``) with an added ``"labels"`` entry holding the
        target token ids, where padding positions are replaced by ``-100``.
    """
    # Build input: vulnerability text + instruction/prompt. Missing (None)
    # cells are treated as empty strings so concatenation cannot fail.
    inputs = [
        (vuln or "") + "\n\n" + (q or "")
        for vuln, q in zip(examples["vulnerability"], examples["question"])
    ]
    model_inputs = tokenizer(
        inputs,
        max_length=max_input_length,
        padding="max_length",
        truncation=True,
    )

    # Tokenize targets (the secure code in column 'chosen'). `text_target`
    # replaces the deprecated `as_target_tokenizer()` context manager.
    targets = [(code or "") for code in examples["chosen"]]
    labels = tokenizer(
        text_target=targets,
        max_length=max_target_length,
        padding="max_length",
        truncation=True,
    )

    # Replace pad token ids in the labels by -100 so they are ignored by the
    # loss. Without a "labels" column the Trainer has nothing to train on.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [(tid if tid != pad_id else -100) for tid in ids]
        for ids in labels["input_ids"]
    ]
    return model_inputs

In [6]:
# Tokenize both splits in batches. The original text columns are dropped so
# only the fields returned by preprocess_dataset remain.
tokenized_train = train_dataset.map(
    preprocess_dataset,
    batched=True,
    remove_columns=train_dataset.column_names,
)
tokenized_test = test_dataset.map(
    preprocess_dataset,
    batched=True,
    remove_columns=test_dataset.column_names,
)

In [7]:
# Collator that batches the tokenized examples into PyTorch tensors.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="pt")

# Fine-tuning hyperparameters; evaluation runs once at the end of every epoch.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=8,
    weight_decay=0.01,
)

# Trainer (no compute_metrics is wired in yet, so only the loss is reported).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# Run fine-tuning, then persist the result to a local directory.
trainer.train()
trainer.save_model("Trained_CodeT5_model")

***** Running training *****
  Num examples = 381
  Num Epochs = 8
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 768
  Number of trainable parameters = 222882048
  0%|          | 3/768 [01:41<7:13:06, 33.97s/it]

KeyboardInterrupt: 