In [1]:
%%capture
!pip install unsloth
# Also get the latest nightly Unsloth!
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

In [2]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/mistral-7b-v0.3-bnb-4bit",      # New Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/llama-3-8b-bnb-4bit",           # Llama-3 15 trillion tokens model 2x faster!
    "unsloth/llama-3-8b-Instruct-bnb-4bit",
    "unsloth/llama-3-70b-bnb-4bit",
    "unsloth/Phi-3-mini-4k-instruct",        # Phi-3 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",             # Gemma 2.2x faster!
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-Instruct-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    device_map="auto",
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2024.12.12: Fast Llama patching. Transformers: 4.47.1.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/220 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/51.1k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/345 [00:00<?, ?B/s]

In [3]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.12.12 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [4]:
# Updated dataset prompt for cybersecurity-only scope
alpaca_prompt = """You are a cybersecurity assistant. You must only answer questions related to cybersecurity. If a question is unrelated to cybersecurity, politely refuse to answer.

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Input:
{input}

### Response:
{output}
"""

# Add EOS token for generation stopping point
EOS_TOKEN = tokenizer.eos_token

# Function to format the dataset into the expected prompt-response format
def formatting_prompts_func(examples):
    questions = examples["Question"]  # Extract questions
    answers = examples["Answer"]      # Extract answers
    # Format prompts with cybersecurity-only constraint
    texts = [
        alpaca_prompt.format(instruction=q, input="", output=a) + EOS_TOKEN
        for q, a in zip(questions, answers)
    ]
    return {"text": texts}

In [5]:
from datasets import Dataset

from datasets import load_dataset
dataset = load_dataset("Joussef/my_sec_data", split = "train")
dataset = dataset.map(formatting_prompts_func, batched = True,)

README.md:   0%|          | 0.00/310 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/201k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1355 [00:00<?, ? examples/s]

Map:   0%|          | 0/1355 [00:00<?, ? examples/s]

In [6]:
dataset[:1]

{'Question': ['What is a common method used in Network Security to prevent unauthorized access?\r\nHow does it work?'],
 'Answer': ['A common method used in Network Security to prevent unauthorized access is through the use of firewalls. Firewalls act as a barrier between a trusted internal network and untrusted external networks, controlling incoming and outgoing network traffic based on predetermined security rules. They inspect each packet of data and determine whether to allow or block it based on the defined ruleset, thus protecting the network from unauthorized access and potential threats.'],
 'text': ['You are a cybersecurity assistant. You must only answer questions related to cybersecurity. If a question is unrelated to cybersecurity, politely refuse to answer.\n\nBelow is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nWhat is a common method used in Netw

In [7]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 1500, # Set num_train_epochs = 1 for full training runs
        num_train_epochs = 1500,
        learning_rate = 1e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)

Map (num_proc=2):   0%|          | 0/1355 [00:00<?, ? examples/s]

In [8]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 1,355 | Num Epochs = 9
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 1,500
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,2.1165
2,2.1862
3,1.8087
4,1.7471
5,1.458
6,1.4341
7,1.3054
8,1.3301
9,1.1623
10,0.9001


In [9]:
# Enable native 2x faster inference
FastLanguageModel.for_inference(model)

# Prepare the inputs with the correct placeholders filled
inputs = tokenizer(
    [
        alpaca_prompt.format(
            instruction="what is a dos attack?",  # instruction
            input="",  # input
            output="",  # leave this blank for generation!
        )
    ], return_tensors="pt").to("cuda")

# Generate the output
outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)

# Decode the generated tokens
generated_text = tokenizer.batch_decode(outputs)
print(generated_text)


['<|begin_of_text|>You are a cybersecurity assistant. You must only answer questions related to cybersecurity. If a question is unrelated to cybersecurity, politely refuse to answer.\n\nBelow is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nwhat is a dos attack?\n\n### Input:\n\n\n### Response:\n\nA Denial of Service (DoS) attack is a type of cyber attack where an attacker attempts to make a computer or network resource unavailable by flooding it with traffic, causing it to become overwhelmed and unable to respond to legitimate requests. This can be achieved through various means, such as sending a large amount of traffic to']


In [10]:
# Enable native 2x faster inference
FastLanguageModel.for_inference(model)

# Prepare the inputs with the correct placeholders filled
inputs = tokenizer(
    [
        alpaca_prompt.format(
            instruction="what is the capital of france?",  # instruction
            input="",  # input
            output="",  # leave this blank for generation!
        )
    ], return_tensors="pt").to("cuda")

# Generate the output
outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)

# Decode the generated tokens
generated_text = tokenizer.batch_decode(outputs)
print(generated_text)

["<|begin_of_text|>You are a cybersecurity assistant. You must only answer questions related to cybersecurity. If a question is unrelated to cybersecurity, politely refuse to answer.\n\nBelow is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nwhat is the capital of france?\n\n### Input:\n\n\n### Response:\n\nI'm happy to help with cybersecurity-related questions, but I must politely refuse to answer questions that are unrelated to cybersecurity. In this case, the question is not related to cybersecurity, so I won't be able to provide an answer. If you have a cybersecurity-related question, I'd be happy to try and assist you"]


In [11]:
# Enable native 2x faster inference
FastLanguageModel.for_inference(model)

# Prepare the inputs with the correct placeholders filled
inputs = tokenizer(
    [
        alpaca_prompt.format(
            instruction="hello can you help",  # instruction
            input="",  # input
            output="",  # leave this blank for generation!
        )
    ], return_tensors="pt").to("cuda")

# Generate the output
outputs = model.generate(**inputs, max_new_tokens=512, use_cache=True)

# Decode the generated tokens
generated_text = tokenizer.batch_decode(outputs)
print(generated_text)


["<|begin_of_text|>You are a cybersecurity assistant. You must only answer questions related to cybersecurity. If a question is unrelated to cybersecurity, politely refuse to answer.\n\nBelow is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nhello can you help\n\n### Input:\n\n\n### Response:\n\nI'd be happy to assist with any cybersecurity-related inquiry. What specifically would you like to know or discuss?\n<|eot_id|>"]


In [13]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Save the model and tokenizer after fine-tuning
model.save_pretrained("llama_security_model")
tokenizer.save_pretrained("llama_security_model")

('llama_security_model/tokenizer_config.json',
 'llama_security_model/special_tokens_map.json',
 'llama_security_model/tokenizer.json')

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the model and tokenizer (if not already loaded)
model = AutoModelForCausalLM.from_pretrained("Joussef/llama_sec_model")
tokenizer = AutoTokenizer.from_pretrained("Joussef/llama_sec_model")

config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/736 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now default to True since model is quantized.


adapter_model.safetensors:   0%|          | 0.00/168M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]



('/content/drive/MyDrive/llama_model/tokenizer_config.json',
 '/content/drive/MyDrive/llama_model/special_tokens_map.json',
 '/content/drive/MyDrive/llama_model/tokenizer.json')

In [14]:
model.push_to_hub("Joussef/llama_security_model", token = "hf_token")
tokenizer.push_to_hub("Joussef/llama_security_model", token = "hf_token")

README.md:   0%|          | 0.00/592 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/168M [00:00<?, ?B/s]

Saved model to https://huggingface.co/Joussef/llama_security_model


  0%|          | 0/1 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

In [15]:
model.push_to_hub_gguf("Joussef/unsloth-llama-3-q4_k_m", tokenizer, quantization_method = "q4_k_m", token = "hf_token")

Unsloth: You have 1 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which might take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.
Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 5.7G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 5.27 out of 12.67 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


 47%|████▋     | 15/32 [00:01<00:02,  8.01it/s]
We will save to Disk and not RAM now.
100%|██████████| 32/32 [01:53<00:00,  3.55s/it]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving Joussef/unsloth-llama-3-q4_k_m/pytorch_model-00001-of-00004.bin...
Unsloth: Saving Joussef/unsloth-llama-3-q4_k_m/pytorch_model-00002-of-00004.bin...
Unsloth: Saving Joussef/unsloth-llama-3-q4_k_m/pytorch_model-00003-of-00004.bin...
Unsloth: Saving Joussef/unsloth-llama-3-q4_k_m/pytorch_model-00004-of-00004.bin...
Done.


Unsloth: Converting llama model. Can use fast conversion = False.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits might take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q4_k_m'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: CMAKE detected. Finalizing some steps for installation.
Unsloth: [1] Converting model at Joussef/unsloth-llama-3-q4_k_m into f16 GGUF format.
The output location will be /content/Joussef/unsloth-llama-3-q4_k_m/unsloth.F16.gguf
This might take 3 minutes...
INFO:hf-to-gguf:Loading model: unsloth-llama-3-q4_k_m
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:gguf: loading model weight map from 'pytorch_model.bin.index.json'
INFO:hf-to-gguf:gguf: loading model part 'pytorch_model-00001-of-00004.bin'
INFO:hf-to-g

  0%|          | 0/1 [00:00<?, ?it/s]

unsloth.Q4_K_M.gguf:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Saved GGUF to https://huggingface.co/Joussef/unsloth-llama-3-q4_k_m
