In [None]:
# ==========================================
# 1. INSTALLATION
# ==========================================
!pip install -q transformers datasets peft accelerate evaluate

import json
import torch
import random
from datasets import Dataset, concatenate_datasets
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)
from peft import LoraConfig, get_peft_model, TaskType, PeftModel

# Default to the CPU and switch to the GPU only when PyTorch can see one.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
print(f"Appareil : {device}")

# ==========================================
# 2. LOADING AND SYNTHETIC AUGMENTATION
# ==========================================
# Load the original dataset. The code further down concatenates it with a list
# and reads item["input"] / item["output"], so the file is expected to hold a
# JSON list of {"input": ..., "output": ...} records.
filename = "nmap_dataset.json"
try:
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's locale default encoding.
    with open(filename, "r", encoding="utf-8") as f:
        original_data = json.load(f)
except FileNotFoundError:
    # Print a short hint, then re-raise: nothing below can run without the data.
    print(f"ERREUR: {filename} manquant.")
    raise

# --- Teach the model the service <-> port mapping ---
# Synthetic examples are generated to force the association between a
# service name and its port number.

# Seed Python's global RNG so that the synthetic samples below (and every later
# `random` call in this notebook, e.g. the shuffle before the train/test split)
# are identical from one run to the next.
SEED = 42
random.seed(SEED)

common_services = {
    "SSH": "22", "HTTP": "80", "HTTPS": "443", "FTP": "21",
    "Telnet": "23", "SMTP": "25", "DNS": "53", "MySQL": "3306",
    "RDP": "3389", "SNMP": "161"
}

synthetic_data = []
ips = ["192.168.1.1", "10.0.0.5", "172.16.0.10", "localhost"]

# Loop-invariant pieces are built once instead of on every iteration.
service_names = list(common_services.keys())

# Phrase variations so the model generalises: (request template, command template).
templates = [
    ("Scan for {service} on {ip}", "nmap -p {port} {ip}"),
    ("Check {service} port on {ip}", "nmap -p {port} {ip}"),
    ("Scan {service} on {ip} with version detection", "nmap -p {port} -sV {ip}"),
    ("Target {ip} for {service}", "nmap -p {port} {ip}"),
]

# Generate 400 extra examples focused ONLY on ports.
for _ in range(400):
    service = random.choice(service_names)
    port = common_services[service]
    ip = random.choice(ips)

    inp_template, out_template = random.choice(templates)
    inp = inp_template.format(service=service, ip=ip)
    out = out_template.format(port=port, ip=ip)
    synthetic_data.append({"input": inp, "output": out})

# Merge the two sources: original + synthetic examples
print(f"Données originales : {len(original_data)}")
print(f"Données synthétiques ajoutées : {len(synthetic_data)}")

full_data = original_data + synthetic_data
# Shuffle in place so the two sources are interleaved
random.shuffle(full_data)

# Build the Dataset column by column from the merged records
dataset_dict = {
    column: [item[column] for item in full_data]
    for column in ("input", "output")
}
hf_dataset = Dataset.from_dict(dataset_dict)

# Small held-out share (5%): less test data, more training data
split_dataset = hf_dataset.train_test_split(test_size=0.05, seed=42)
train_dataset, eval_dataset = split_dataset["train"], split_dataset["test"]

# ==========================================
# 3. TOKENIZATION & MODEL
# ==========================================
# Task prefix prepended to every request before it is tokenized
prefix = "nmap conversion: "

# Checkpoint name shared by the tokenizer and the model loaded below
model_checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def preprocess_function(examples):
    """Tokenize a batch of request/command pairs for seq2seq training.

    Args:
        examples: a batch mapping with an "input" list (natural-language
            requests) and an "output" list (target nmap commands).

    Returns:
        The tokenizer output for the inputs, with an extra "labels" entry
        holding the token ids of the targets.

    Padding is deliberately NOT applied here. Padding the labels to
    ``max_length`` with the tokenizer's pad id makes the loss count every pad
    position as a real target. Leaving sequences unpadded lets the
    ``DataCollatorForSeq2Seq`` used by the trainer pad each batch dynamically
    and fill the label padding with -100, which the loss ignores.
    """
    # .lower() matters so that "SSH" and "ssh" are treated the same
    inputs = [prefix + doc.lower() for doc in examples["input"]]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True)
    # text_target= tokenizes the strings as decoder targets
    labels = tokenizer(text_target=examples["output"], max_length=128, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Run the same batched preprocessing over the train split, then the eval split
tokenized_train, tokenized_eval = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, eval_dataset)
)

# Load the pretrained seq2seq model for the chosen checkpoint
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# ==========================================
# 4. LoRA CONFIGURATION (INCREASED CAPACITY)
# ==========================================
# Adapter hyper-parameters gathered in one mapping before building the config
lora_settings = {
    "task_type": TaskType.SEQ_2_SEQ_LM,
    "r": 64,             # very high rank, chosen to help memorise the ports
    "lora_alpha": 128,   # strong alpha
    "lora_dropout": 0.05,
    "bias": "none",
    # Module names that receive adapters (author's intent: train the whole network)
    "target_modules": ["q", "v", "k", "o", "wi", "wo"],
}
lora_config = LoraConfig(**lora_settings)

# Wrap the model with the adapters and print the trainable-parameter share
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# ==========================================
# 5. TRAINING
# ==========================================
output_dir = "nmap_expert_model"

training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    learning_rate=5e-4,              # medium/high learning rate
    per_device_train_batch_size=16,  # larger batch for stability
    per_device_eval_batch_size=16,
    num_train_epochs=20,             # enough epochs to learn the ports by heart
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    # Keep only the most recent checkpoints (the best one is always retained
    # when load_best_model_at_end is set). Without a limit, one checkpoint per
    # epoch accumulates inside output_dir.
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    predict_with_generate=True,
    # Mixed precision is only usable on a GPU; enabling it unconditionally
    # makes the run fail on the CPU fallback.
    fp16=torch.cuda.is_available(),
    logging_steps=20,
    report_to="none"
)

# Pads inputs and labels per batch; label padding is filled with -100
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    # `processing_class` replaces the deprecated `tokenizer` argument
    processing_class=tokenizer,
    data_collator=data_collator,
)

print("Démarrage de l'entraînement avec connaissance des ports...")
trainer.train()

# Persist the trained adapter and the tokenizer next to the checkpoints
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# ==========================================
# 6. TEST AND DEMONSTRATION
# ==========================================
banner = "-" * 30
print(banner, "TEST DES SERVICES -> PORTS", banner, sep="\n")

# Rebuild the inference model from disk: fresh base weights plus the saved adapter
base_model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
peft_model = PeftModel.from_pretrained(base_model, output_dir)

# Move to the selected device and switch to evaluation mode
peft_model.to(device)
peft_model.eval()

def generate_command(text):
    """Translate a natural-language request into an nmap command string.

    The request goes through the same preprocessing as the training inputs
    in ``preprocess_function``: it is lowercased, the task prefix is
    prepended, and it is truncated to 128 tokens. No words are stripped from
    the request: the training inputs keep phrases such as "scan for" and
    "check", so removing them here would query the model with inputs it was
    not trained on.

    Args:
        text: the request, e.g. "scan for ssh on 10.0.0.1".

    Returns:
        The decoded text of the first returned sequence, with special tokens
        removed.
    """
    model_input = prefix + text.lower()

    inputs = tokenizer(
        model_input,
        return_tensors="pt",
        max_length=128,
        truncation=True,
    ).to(device)
    with torch.no_grad():
        outputs = peft_model.generate(
            **inputs,
            max_new_tokens=50,
            num_beams=3  # a few beams are enough for short port commands
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Key test requests, each annotated with the expected result
queries = [
    "scan for ssh on 10.0.0.1",          # should yield -p 22
    "check http on 192.168.1.5",         # should yield -p 80
    "scan mysql on localhost",           # should yield -p 3306
    "scan ports 80 and 443 on 10.0.0.5", # should handle the port list
]

separator = "-" * 20
for query in queries:
    print(f"Input : {query}")
    command = generate_command(query)
    print(f"Output: {command}")
    print(separator)

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hAppareil : cuda
Données originales : 1032
Données synthétiques ajoutées : 400


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Map:   0%|          | 0/1360 [00:00<?, ? examples/s]

Map:   0%|          | 0/72 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

trainable params: 8,650,752 || all params: 69,157,376 || trainable%: 12.5088


  trainer = Seq2SeqTrainer(


Démarrage de l'entraînement avec connaissance des ports...


Epoch,Training Loss,Validation Loss
1,0.2007,0.113003
2,0.08,0.053637
3,0.0547,0.036694
4,0.0441,0.031335
5,0.0382,0.027367
6,0.034,0.022968
7,0.0274,0.021779
8,0.0243,0.018289
9,0.0227,0.016942
10,0.0186,0.016635


------------------------------
TEST DES SERVICES -> PORTS
------------------------------
Input : scan for ssh on 10.0.0.1
Output: nmap -p 22 10.0.0.1
--------------------
Input : check http on 192.168.1.5
Output: nmap -p 80 192.168.1.5
--------------------
Input : scan mysql on localhost
Output: nmap -p 3306 localhost
--------------------
Input : scan ports 80 and 443 on 10.0.0.5
Output: nmap -p 443 10.0.0.5
--------------------


In [None]:
import shutil
import os
from google.colab import files

# 1. Names used for the export
# Folder produced by the training cell above
folder_to_zip = "nmap_expert_model"
output_filename = "mon_nouveau_modele_nmap"

# Report the problem when the folder is missing; otherwise zip it and download it
if not os.path.exists(folder_to_zip):
    print(f"❌ Erreur : Le dossier '{folder_to_zip}' n'existe pas. Avez-vous bien lancé l'entraînement ?")
else:
    print(f"✅ Dossier '{folder_to_zip}' trouvé.")
    print("⏳ Compression en cours (cela peut prendre quelques secondes)...")

    # 2. Build the ZIP archive from the folder's contents
    shutil.make_archive(base_name=output_filename, format='zip', root_dir=folder_to_zip)
    print("✅ Compression terminée.")

    # 3. Trigger the browser download of the archive
    print("⬇️ Téléchargement du fichier ZIP...")
    archive_name = output_filename + '.zip'
    files.download(archive_name)

✅ Dossier 'nmap_expert_model' trouvé.
⏳ Compression en cours (cela peut prendre quelques secondes)...
✅ Compression terminée.
⬇️ Téléchargement du fichier ZIP...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>