In [1]:
!pip install transformers datasets accelerate bitsandbytes peft


Collecting bitsandbytes
  Downloading bitsandbytes-0.49.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.49.0-py3-none-manylinux_2_24_x86_64.whl (59.1 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m59.1/59.1 MB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.49.0


In [2]:
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig

)
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from google.colab import files

model_name = "google-t5/t5-small"

# ----------------------------
# 1. Quantization QLoRA (4-bit)
# ----------------------------
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,               # quantization 4-bit
    bnb_4bit_use_double_quant=True,  # double quantization (optimise RAM)
    bnb_4bit_quant_type="nf4",       # NF4 = meilleure pr√©cision
    bnb_4bit_compute_dtype="bfloat16"
)

# ----------------------------
# 2. Charger mod√®le en 4-bit
# ----------------------------
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

tokenizer = AutoTokenizer.from_pretrained(model_name)

# ----------------------------
# 3. Activer LoRA (adapters)
# R = 16 est standard
# Chaque couche gel√©e sauf matrices Q et V
# ----------------------------
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q","v","k","o","wi","wo"],
    #target_modules=["q", "v",],  # T5 layers qui supportent LoRA
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_2_SEQ_LM"
)

model = get_peft_model(model, lora_config)

print(model)  # (Optionnel) tu vois que seules q,v sont entra√Ænables

# ----------------------------
# 4. Pr√©parer dataset
# ----------------------------
#dataset = load_dataset("json", data_files="/kaggle/input/content/nmap_dataset.json")
uploaded = files.upload()  # S√©lectionne 'nmap_dataset_prof.json' depuis ton PC

# ------------------------------
# 2. Charger dataset
# ------------------------------
dataset_path = "/content/nmap_dataset.json"
dataset = load_dataset(
    "json",
    data_files=dataset_path,
    split="train"
)

print("Dataset charg√© :", dataset[0])

def encode(example):
    # Input
    inputs = tokenizer(
        example["input"],
        truncation=True,
        padding="max_length",
        max_length=128  # ‚Üê R√©duit de 256 √† 128
    )

    # Output
    with tokenizer.as_target_tokenizer():
        outputs = tokenizer(
            example["output"],
            truncation=True,
            padding="max_length",
            max_length=64  # ‚Üê R√©duit de 256 √† 64
        )

    inputs["labels"] = outputs["input_ids"]

    # CRITIQUE : Remplace padding par -100 (ignor√© dans la loss)
    inputs["labels"] = [
        (label if label != tokenizer.pad_token_id else -100)
        for label in inputs["labels"]
    ]

    return inputs

dataset = dataset.map(encode)

# ----------------------------
# 5. Data collator
# ----------------------------
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# ----------------------------
# 6. Training arguments
# ----------------------------
training_args = TrainingArguments(
    output_dir="./t5-nmap-qlora",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,            # conseill√© pour QLoRA
    num_train_epochs=15,
    fp16=False,
    bf16=True,                     # T5 + QLoRA = BF16
    logging_steps=50,
    save_steps=500,
    save_total_limit=2,
)

# ----------------------------
# 7. Trainer
# ----------------------------

splits = dataset.train_test_split(test_size=0.1)
train_dataset = splits["train"]
test_dataset  = splits["test"]

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)
# ----------------------------
# 8. Fine-tuning
# ----------------------------
trainer.train()



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

PeftModelForSeq2SeqLM(
  (base_model): LoraModel(
    (model): T5ForConditionalGeneration(
      (shared): Embedding(32128, 512)
      (encoder): T5Stack(
        (embed_tokens): Embedding(32128, 512)
        (block): ModuleList(
          (0): T5Block(
            (layer): ModuleList(
              (0): T5LayerSelfAttention(
                (SelfAttention): T5Attention(
                  (q): lora.Linear4bit(
                    (base_layer): Linear4bit(in_features=512, out_features=512, bias=False)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=512, out_features=16, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=16, out_features=512, bias=False)
                    )
                    (lora_embedding_A): ParameterDict()
     

Saving nmap_dataset.json to nmap_dataset.json


Generating train split: 0 examples [00:00, ? examples/s]

Dataset charg√© : {'input': 'Scan all ports Run default,vuln scripts on 192.168.0.0/16', 'output': 'nmap -p- --script default,vuln 192.168.0.0/16'}


Map:   0%|          | 0/1799 [00:00<?, ? examples/s]

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 3


[34m[1mwandb[0m: You chose "Don't visualize my results"


Step,Training Loss
50,4.3087
100,2.7557
150,2.0045
200,1.6816
250,1.5133
300,1.3291
350,1.2105
400,1.161
450,1.085
500,1.029


TrainOutput(global_step=1530, training_loss=1.1228978861391155, metrics={'train_runtime': 1084.8795, 'train_samples_per_second': 22.385, 'train_steps_per_second': 1.41, 'total_flos': 862029946552320.0, 'train_loss': 1.1228978861391155, 'epoch': 15.0})

In [7]:
# ----------------------------
# TEST IMM√âDIAT
# ----------------------------
print("\n" + "="*60)
print("üß™ TEST IMM√âDIAT APR√àS FINE-TUNING")
print("="*60 + "\n")

import torch

model.eval()

test_prompts = [
    "Scan all ports Run default,vuln scripts on 192.168.0.0/16",
    "Scan all ports on 192.168.1.20",
    "Do a ping scan and traceroute on 10.0.0.0/28",
    "Scan top ports with version detection and OS detection on 192.168.1.50",
    "Scan all TCP ports on the target host",
    "Launch the SMB brute-force script using a specific user list and disabling the default password attempts"
]

for prompt in test_prompts:
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=100,
            num_beams=4,
            do_sample=False
        )

    result = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"üìù Prompt : {prompt}")
    print(f"‚úÖ R√©sultat : {result}\n")


üß™ TEST IMM√âDIAT APR√àS FINE-TUNING

üìù Prompt : Scan all ports Run default,vuln scripts on 192.168.0.0/16
‚úÖ R√©sultat : nmap -p- --script default,vuln 192.168.0.0/16

üìù Prompt : Scan all ports on 192.168.1.20
‚úÖ R√©sultat : nmap -p- 192.168.1.20

üìù Prompt : Do a ping scan and traceroute on 10.0.0.0/28
‚úÖ R√©sultat : nmap -sn --traceroute 10.0.0.0/28

üìù Prompt : Scan top ports with version detection and OS detection on 192.168.1.50
‚úÖ R√©sultat : nmap -p- -sV -O 192.168.1.50

üìù Prompt : Scan all TCP ports on the target host
‚úÖ R√©sultat : nmap -p- -O target

üìù Prompt : Launch the SMB brute-force script using a specific user list and disabling the default password attempts
‚úÖ R√©sultat : nmap --script smb-brute --script-args user-list --script-args target



In [8]:
# ----------------------------
# 9. Sauvegarde mod√®le LoRA
# ----------------------------

output_dir = "/content/T5-qlora-nmap"

model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
import shutil


# -------- ZIP pour t√©l√©chargement --------
shutil.make_archive(output_dir, "zip", output_dir)
print("‚úÖ Fine-tuning T5-small avec QLoRA termin√© ! Mod√®le sauvegard√© dans :", output_dir)

‚úÖ Fine-tuning T5-small avec QLoRA termin√© ! Mod√®le sauvegard√© dans : /content/T5-qlora-nmap
