# 💡 LoRA Fine-Tuning for DGA Detection
An attempt to fine-tune `meta-llama/Llama-3.2-3B-Instruct` on dataset (`asobirov/dga-preprocessed`) using QLoRA and PEFT.

In [1]:
!nvidia-smi

Sat Apr 19 11:22:43 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off |   00000000:00:04.0 Off |                    0 |
| N/A   30C    P0             69W /  400W |       0MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [3]:
# Install required packages
!pip uninstall -y gcsfs bigframes -q
!pip install -U fsspec==2024.12.0 datasets transformers peft accelerate bitsandbytes huggingface_hub -q

[0m

In [12]:
# Log into Hugging Face
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [38]:
# Load dataset from Hugging Face
from datasets import load_dataset
hf_dataset = load_dataset("asobirov/dga-preprocessed")

In [47]:
dataset = hf_dataset.shuffle(seed=42)
dataset["train"] = hf_dataset["train"].select(range(100_000))
dataset["test"] = hf_dataset["test"].select(range(5000))

def normalize(example):
    label = str(example.get("label") or example.get("Type") or "").lower().strip()
    example["output"] = "Yes" if label in ["dga", "1", "yes"] else "No"
    return example

dataset = dataset.map(normalize)

In [48]:
# Load model + tokenizer + prepare for LoRA
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import prepare_model_for_kbit_training, get_peft_model, LoraConfig

model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    load_in_4bit=True,
    trust_remote_code=True,
)

model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"]
)

model = get_peft_model(model, lora_config)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


In [49]:
# Tokenize
def tokenize(batch):
    prompts = [
        f"### Instruction:\nDetermine if the domain '{domain}' is algorithmically generated (DGA).\n\n### Response:"
        for domain in batch["domain"]
    ]
    model_inputs = tokenizer(prompts, max_length=256, truncation=True, padding="max_length", return_tensors=None)
    model_inputs["labels"] = tokenizer(
        batch["output"],
        max_length=64,
        truncation=True,
        padding="max_length",
        return_tensors=None
    )["input_ids"]
    model_inputs["labels"] = [
        [(tok if tok != tokenizer.pad_token_id else -100) for tok in label]
        for label in model_inputs["labels"]
    ]
    return model_inputs


remove_columns = dataset["train"].column_names
tokenized_ds = dataset.map(tokenize, batched=True, remove_columns=remove_columns, num_proc=8)

Map (num_proc=8):   0%|          | 0/100000 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/5000 [00:00<?, ? examples/s]

In [50]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

training_args = TrainingArguments(
    output_dir="./dga-lora-output",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    warmup_steps=100,
    max_steps=1500,
    save_steps=500,
    save_total_limit=1,
    logging_steps=10,
    fp16=True,
    report_to="none"
)

collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    tokenizer=tokenizer,
    data_collator=collator,
)

trainer.train()

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


Step,Training Loss
10,4.1623
20,3.9656
30,3.5843
40,2.6383
50,1.8058
60,1.2454
70,1.2547
80,1.1911
90,1.1642
100,1.2234


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


Step,Training Loss
10,4.1623
20,3.9656
30,3.5843
40,2.6383
50,1.8058
60,1.2454
70,1.2547
80,1.1911
90,1.1642
100,1.2234


TrainOutput(global_step=1500, training_loss=1.1575657018025716, metrics={'train_runtime': 1815.3359, 'train_samples_per_second': 6.61, 'train_steps_per_second': 0.826, 'total_flos': 1.9151179481088e+16, 'train_loss': 1.1575657018025716, 'epoch': 0.12})

In [51]:
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm

def classify_domain(domain: str):
    prompt = f"### Instruction:\nDetermine if the domain '{domain}' is DGA-generated.\n\n### Response:\n"
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding="max_length", max_length=256).to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=5)
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True).lower()
    return "yes" in decoded

# Evaluate on 1000 examples
y_true, y_pred = [], []
for row in tqdm(dataset["test"].select(range(1000))):
    domain = row["domain"]
    label = row.get("label") or row.get("Type") or "benign"
    true = 1 if str(label).strip().lower() in ["dga", "1", "yes"] else 0
    pred = 1 if classify_domain(domain) else 0
    y_true.append(true)
    y_pred.append(pred)

print("✅ Accuracy:", round(accuracy_score(y_true, y_pred), 4))
print("🎯 F1 Score:", round(f1_score(y_true, y_pred), 4))


  return fn(*args, **kwargs)
100%|██████████| 1000/1000 [06:28<00:00,  2.58it/s]

✅ Accuracy: 0.249
🎯 F1 Score: 0.0





In [None]:
# 📤 Push model to Hugging Face
HF_REPO = "asobirov/dga-detector-lora"
model.push_to_hub(HF_REPO)
tokenizer.push_to_hub(HF_REPO)