<a href="https://colab.research.google.com/github/Else-If-05/ai_project/blob/main/ia_vuln_project_draft.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

AI project test

In [None]:
# Installer les dépendances nécessaires
!pip install transformers datasets torch

In [None]:
!pip install --upgrade transformers


In [30]:
import requests
import json
from datetime import datetime, timedelta
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import torch
import pandas as pd
from datasets import Dataset


API_KEY = None  # pas de clé
API_URL = "https://services.nvd.nist.gov/rest/json/cves/2.0/"

def fetch_cves(start_date: datetime, end_date: datetime, api_key=None):
    all_vulns = []
    index = 0
    max_results = 500

    while True:
        params = {
            "lastModStartDate": start_date.isoformat(),
            "lastModEndDate": end_date.isoformat(),
            "startIndex": index,
            "resultsPerPage": max_results
        }

        headers = {"apiKey": api_key} if api_key else {}

        response = requests.get(API_URL, params=params, headers=headers, timeout=60)
        response.raise_for_status()
        data = response.json()

        vulns = data.get("vulnerabilities", [])
        all_vulns.extend(vulns)

        if len(vulns) < max_results:
            break
        index += len(vulns)

    return all_vulns


if __name__ == "__main__":
    now = datetime.utcnow()
    past = now - timedelta(days=1)  # les 24 dernières heures

    cves = fetch_cves(past, now, API_KEY)

    with open("nvd_cves.json", "w") as f:
        json.dump(cves, f, indent=2)

    print(f"{len(cves)} CVEs récupérées.")


755 CVEs récupérées.


In [31]:
def extract_severity_dataset(cves):
    dataset = []

    for cve in cves:
        try:
            # Cherche une description en anglais
            description = next(
                desc["value"]
                for desc in cve["cve"]["descriptions"]
                if desc["lang"] == "en"
            )

            # Essaie d'extraire la sévérité (version 3.1 de préférence)
            severity = None
            if "cvssMetricV31" in cve["cve"]["metrics"]:
                severity = cve["cve"]["metrics"]["cvssMetricV31"][0]["cvssData"]["baseSeverity"]
            elif "cvssMetricV30" in cve["cve"]["metrics"]:
                severity = cve["cve"]["metrics"]["cvssMetricV30"][0]["cvssData"]["baseSeverity"]
            elif "cvssMetricV2" in cve["cve"]["metrics"]:
                severity = cve["cve"]["metrics"]["cvssMetricV2"][0]["baseSeverity"]

            if severity and description:
                dataset.append({
                    "description": description,
                    "severity": severity.upper()
                })

        except Exception as e:
            # Tu peux log les erreurs ici si besoin
            continue

    return dataset
dataset = extract_severity_dataset(cves)
print(f"Dataset créé avec {len(dataset)} entrées.")


Dataset créé avec 707 entrées.


In [32]:
# Fetch des CVEs pour les dernières 24h
print("Récupération des CVEs...")
now = datetime.utcnow()
past = now - timedelta(days=1)  # les 24 dernières heures

cves = fetch_cves(past, now, API_KEY)
print(f"{len(cves)} CVEs récupérées.")
print(f"Exemple de CVE : {cves[0]}")


descriptions = []
severities = []

for cve in cves:
    try:
        descs = cve["cve"]["descriptions"]
        description_en = next((d["value"] for d in descs if d["lang"] == "en"), None)

        severity = None
        if "cvssMetricV31" in cve["cve"]["metrics"]:
            severity = cve["cve"]["metrics"]["cvssMetricV31"][0]["cvssData"]["baseSeverity"]
            severity = severity.upper()
        if description_en and severity:
            descriptions.append(description_en)
            severities.append(severity)

    except Exception as e:
        print("Erreur avec un CVE:", e)

print(f"-Nombre de CVEs avec description + sévérité : {len(descriptions)}")
print("-Exemple :")
print("Description :", descriptions[0])
print("Sévérité :", severities[0])


Récupération des CVEs...
755 CVEs récupérées.
Exemple de CVE : {'cve': {'id': 'CVE-2001-0827', 'sourceIdentifier': 'cve@mitre.org', 'published': '2001-12-06T05:00:00.000', 'lastModified': '2025-04-23T16:15:18.870', 'vulnStatus': 'Deferred', 'cveTags': [], 'descriptions': [{'lang': 'en', 'value': 'Cerberus FTP server 1.0 - 1.5 allows remote attackers to cause a denial of service (crash) via a large number of "PASV" requests.'}], 'metrics': {'cvssMetricV31': [{'source': '134c704f-9b21-4f2e-91b3-4a467353bcc0', 'type': 'Secondary', 'cvssData': {'version': '3.1', 'vectorString': 'CVSS:3.1/AV:N/AC:L/PR:N/UI:N/S:U/C:N/I:N/A:H', 'baseScore': 7.5, 'baseSeverity': 'HIGH', 'attackVector': 'NETWORK', 'attackComplexity': 'LOW', 'privilegesRequired': 'NONE', 'userInteraction': 'NONE', 'scope': 'UNCHANGED', 'confidentialityImpact': 'NONE', 'integrityImpact': 'NONE', 'availabilityImpact': 'HIGH'}, 'exploitabilityScore': 3.9, 'impactScore': 3.6}], 'cvssMetricV2': [{'source': 'nvd@nist.gov', 'type': 'Pr

In [33]:
# Reprenons ici
texts = descriptions
labels = severities

# Map des labels en entiers
label_map = {'LOW': 0, 'MEDIUM': 1, 'HIGH': 2, 'CRITICAL': 3}
int_labels = [label_map[label] for label in labels]

# Split
X_train, X_test, y_train, y_test = train_test_split(texts, int_labels, test_size=0.2, random_state=42)

# Dataset HuggingFace
train_dataset = Dataset.from_dict({'text': X_train, 'label': y_train})
test_dataset = Dataset.from_dict({'text': X_test, 'label': y_test})

# Tokenizer
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
    return tokenizer(example["text"], truncation=True, padding=True)

tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

# Modèle
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=4)

# Entraînement
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer
)

trainer.train()

Map:   0%|          | 0/546 [00:00<?, ? examples/s]

Map:   0%|          | 0/137 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'