In [None]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes
!pip install git+https://github.com/huggingface/peft.git

In [None]:
import os
import json
import concurrent.futures
import pandas as pd
import random
import torch
import transformers
import torch
import re
from google.colab import userdata
from datasets import load_dataset
from peft import LoraConfig
from transformers import AutoTokenizer, AutoModelForCausalLM, logging
from transformers import BitsAndBytesConfig, GemmaTokenizer, pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
os.environ["HF_TOKEN"] = userdata.get('HF_TOKEN')

In [None]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/mistral-7b-v0.3-bnb-4bit",      # New Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/llama-3-8b-bnb-4bit",           # Llama-3 15 trillion tokens model 2x faster!
    "unsloth/llama-3-8b-Instruct-bnb-4bit",
    "unsloth/llama-3-70b-bnb-4bit",
    "unsloth/Phi-3-mini-4k-instruct",        # Phi-3 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",             # Gemma 2.2x faster!
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-Instruct-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [None]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

<a name="Data"></a>
### Data Prep
We now use the Alpaca dataset from [yahma](https://huggingface.co/datasets/yahma/alpaca-cleaned), which is a filtered version of 52K of the original [Alpaca dataset](https://crfm.stanford.edu/2023/03/13/alpaca.html). You can replace this code section with your own data prep.

**[NOTE]** To train only on completions (ignoring the user's input) read TRL's docs [here](https://huggingface.co/docs/trl/sft_trainer#train-on-completions-only).

**[NOTE]** Remember to add the **EOS_TOKEN** to the tokenized output!! Otherwise you'll get infinite generations!

If you want to use the `llama-3` template for ShareGPT datasets, try our conversational [notebook](https://colab.research.google.com/drive/1XamvWYinY6FOSX9GLvnqSjjsNflxdhNc?usp=sharing).

For text completions like novel writing, try this [notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing).

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
from datasets import load_dataset

def format_for_training(examples):
    formatted_texts = []
    for instruction, inp, out in zip(examples['instruction'], examples['input'], examples['output']):
        formatted_text = f"{instruction}\n{inp}\nResponse: {out}"
        formatted_texts.append(formatted_text)
    return {'text': formatted_texts}

# Load your dataset
dataset = load_dataset('json', data_files='/content/drive/MyDrive/fine_tuning_dataset.json', split='train')

# Apply the formatting function
formatted_dataset = dataset.map(format_for_training, batched=True)

# Verify that 'text' is now part of the dataset
print(formatted_dataset['text'][0])  # This should print the first 'text' field to confirm it's correct

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/30871 [00:00<?, ? examples/s]

Given the following summary and function, determine if the function contains the described vulnerability.
Does the following function have a vulnerability? 
Function:
static char *make_filename_safe(const char *filename TSRMLS_DC)
{
	if (*filename && strncmp(filename, ":memory:", sizeof(":memory:")-1)) {
		char *fullpath = expand_filepath(filename, NULL TSRMLS_CC);

		if (!fullpath) {
			return NULL;
		}

		if (PG(safe_mode) && (!php_checkuid(fullpath, NULL, CHECKUID_CHECK_FILE_AND_DIR))) {
			efree(fullpath);
			return NULL;
		}

		if (php_check_open_basedir(fullpath TSRMLS_CC)) {
			efree(fullpath);
			return NULL;
		}
		return fullpath;
	}
	return estrdup(filename);
}
Is the function vulnerable? Yes or No. Explain your answer in one sentence.
Response: Yes, this function is vulnerable because it falls under the classification of: Weaknesses in this category are related to the management of permissions, privileges, and other security features that are used to perform access control.


In [None]:
# If already loaded just pull model from Drive
model_directory = "/content/drive/MyDrive/Models"

model = FastLanguageModel.from_pretrained(model_directory)
tokenizer = FastLanguageModel.from_pretrained(model_directory)

In [None]:
def tokenize_data(examples):
    return tokenizer(
        examples['text'],  # Tokenize the 'text' field
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt"  # Ensure PyTorch tensors are returned
    )

# Tokenize the formatted dataset
tokenized_dataset = formatted_dataset.map(tokenize_data, batched=True)
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

# Check the output to ensure correct formatting and tokenization
for i in range(3):
    print(tokenized_dataset[i]['input_ids'].shape, tokenized_dataset[i]['attention_mask'].shape)

Map:   0%|          | 0/30871 [00:00<?, ? examples/s]

torch.Size([512]) torch.Size([512])
torch.Size([512]) torch.Size([512])
torch.Size([512]) torch.Size([512])


In [None]:
alpaca_prompt = """
### Instruction:
{}

### Input:
{}
"""

<a name="Train"></a>
### Train the model
Now let's use Huggingface TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). We do 60 steps to speed things up, but you can set `num_train_epochs=1` for a full run, and turn off `max_steps=None`. We also support TRL's `DPOTrainer`!

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenized_dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=4,                     # Increase number of processors if possible
    packing=False,                          # Turn off packing if not needed, based on your context length
    args = TrainingArguments(
      output_dir="outputs",                   # Output directory
      per_device_train_batch_size=4,          # Batch size per device during training, adjust as per GPU memory
      gradient_accumulation_steps=4,          # Number of updates steps to accumulate before performing a backward/update pass.
      warmup_steps=100,                       # Number of steps to perform learning rate warmup
      max_steps=60,                           # Total number of training steps to perform
      learning_rate=3e-4,                     # Start with a slightly higher learning rate
      fp16=not is_bfloat16_supported(),       # Use mixed precision if BFloat16 is not supported
      bf16=is_bfloat16_supported(),           # Use BFloat16 if supported by the hardware
      logging_steps=10,                       # Log every 10 steps
      optim="adamw_8bit",                     # Using 8-bit precision AdamW optimizer
      weight_decay=0.02,                      # Slightly higher weight decay
      lr_scheduler_type="cosine",             # Using cosine annealing learning rate scheduler
      seed=3407                               # Seed for reproducibility
    ),

)

max_steps is given, it will override any value given in num_train_epochs


In [None]:
#@title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

In [None]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 30,871 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 4
\        /    Total batch size = 16 | Total steps = 60
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
10,1.5077
20,1.2085
30,1.0323
40,0.9478
50,0.9793
60,0.9444


In [None]:
#@title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

476.2773 seconds used for training.
7.94 minutes used for training.
Peak reserved memory = 7.535 GB.
Peak reserved memory for training = 1.941 GB.
Peak reserved memory % of max memory = 51.092 %.
Peak reserved memory for training % of max memory = 13.161 %.


<a name="Inference"></a>
### Inference
Let's run the model! You can change the instruction and input - leave the output blank!

In [None]:
# alpaca_prompt = Copied from above
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    alpaca_prompt.format(
        "Given the summary and function, determine if the function contains the vulnerability.",
        '''void vulnerable_function(char *input) {
              char buffer[10];
              strcpy(buffer, input);
        }, Is the function vulnerable? Yes or No. Explain your answer in one sentence.'''
    )
], return_tensors = "pt").to("cuda")

# Generate response
outputs = model.generate(
    **inputs,
    max_new_tokens=64,
    use_cache=True,
    temperature=0.1,  # Higher temperature increases randomness
    top_p=0.8,
    num_return_sequences=1
)

# Decode and clean up output
decoded_responses = tokenizer.batch_decode(outputs, skip_special_tokens=True)
cleaned_responses = [response.split("Response:\n")[-1].strip() for response in decoded_responses]
print(cleaned_responses[0])

### Instruction:
Given the following summary and function, determine if the function contains the described vulnerability.

### Input:
void vulnerable_function(char *input) {
              char buffer[10];
              strcpy(buffer, input);
        }, Is the function vulnerable? Yes or No. Explain your answer in one sentence.
Response: Yes, this function is vulnerable because it falls under the classification of: The software reads data past the end, or before the beginning, of the intended buffer. Extended Description  This typically occurs when the pointer or its index is incremented or decremented to a position beyond the bounds of the buffer or


Run multiple tests on the diversevul functions to see the preformance.

In [None]:
# Load your dataset
dataset = load_dataset('json', data_files='/content/drive/MyDrive/fine_tuning_dataset.json', split='train')

# Print the structure of the first sample
print(dataset[0])

{'instruction': 'Given the following summary and function, determine if the function contains the described vulnerability.', 'input': 'Does the following function have a vulnerability? \nFunction:\nstatic char *make_filename_safe(const char *filename TSRMLS_DC)\n{\n\tif (*filename && strncmp(filename, ":memory:", sizeof(":memory:")-1)) {\n\t\tchar *fullpath = expand_filepath(filename, NULL TSRMLS_CC);\n\n\t\tif (!fullpath) {\n\t\t\treturn NULL;\n\t\t}\n\n\t\tif (PG(safe_mode) && (!php_checkuid(fullpath, NULL, CHECKUID_CHECK_FILE_AND_DIR))) {\n\t\t\tefree(fullpath);\n\t\t\treturn NULL;\n\t\t}\n\n\t\tif (php_check_open_basedir(fullpath TSRMLS_CC)) {\n\t\t\tefree(fullpath);\n\t\t\treturn NULL;\n\t\t}\n\t\treturn fullpath;\n\t}\n\treturn estrdup(filename);\n}\nIs the function vulnerable? Yes or No. Explain your answer in one sentence.', 'output': 'Yes, this function is vulnerable because it falls under the classification of: Weaknesses in this category are related to the management of perm

In [None]:
# Load your dataset
dataset = load_dataset('json', data_files='/content/drive/MyDrive/fine_tuning_dataset.json', split='train')

# Filter the dataset for entries where the output does not start with "Yes"
non_vulnerable_samples = dataset.filter(lambda example: not example['output'].strip().startswith("Yes"))

# Select 5 random examples from the filtered non-vulnerable dataset
if len(non_vulnerable_samples) > 5:
    random_indices = random.sample(range(0, len(non_vulnerable_samples)), 5)
    samples = non_vulnerable_samples.select(random_indices)
else:
    samples = non_vulnerable_samples  # Use all available samples if less than 5

# Define the instruction
instruction = "Given the following summary and function, determine if the function contains the described vulnerability."

# Use the fine-tuned model to generate responses for each sampled function
for func in samples:
    # Extract the input part from the dataset
    function_code = func['input'].replace('\n', ' ')  # Format the function string to fit on one line if needed

    # Generate the prompt as per your Alpaca prompt style
    prompt = f"{instruction}\n{function_code}, Is the function vulnerable? Yes or No. Explain your answer in one sentence."

    # Generate the input tensor
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512).to('cuda')

    # Generate response
    outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True, temperature=0.1, top_p=0.8, num_return_sequences=1)

    # Decode and clean up output
    decoded_response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Print the results
    print(f"\nFunction: {function_code[:60]}...")  # Print part of the function to track
    print(f"Response: {decoded_response}")


Function: Does the following function have a vulnerability?  Function:...
Response: Given the following summary and function, determine if the function contains the described vulnerability.
Does the following function have a vulnerability?  Function: nautilus_file_set_permissions (NautilusFile *file,  			       guint32 new_permissions, 			       NautilusFileOperationCallback callback, 			       gpointer callback_data) { 	GFileInfo *info; 	GError *error;  	if (!nautilus_file_can_set_permissions (file)) { 		/* Claim that something changed even if the permission change failed. 		 * This makes it easier for some clients who see the "reverting" 		 * to the old permissions as "changing back". 		 */ 		nautilus_file_changed (file); 		error = g_error_new (G_IO_ERROR, G_IO_ERROR_PERMISSION_DENIED, 				     _("Not allowed to set permissions")); 		(* callback) (file, NULL, error, callback_data); 		g_error_free (error); 		return; 	} 			        	/* Test the permissions-haven't-changed case explicit

<a name="Save"></a>
### Saving, loading finetuned models
To save the final model as LoRA adapters, either use Huggingface's `push_to_hub` for an online save or `save_pretrained` for a local save.

**[NOTE]** This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!

In [None]:
model_directory = "/content/drive/MyDrive/Models"  # Change 'YourModelFolder' to your desired folder name

# Save the model
model.save_pretrained(model_directory)
tokenizer.save_pretrained(model_directory)

('/content/drive/MyDrive/Models/tokenizer_config.json',
 '/content/drive/MyDrive/Models/special_tokens_map.json',
 '/content/drive/MyDrive/Models/tokenizer.model',
 '/content/drive/MyDrive/Models/added_tokens.json')

Now if you want to load the LoRA adapters we just saved for inference, set `False` to `True`: