In [1]:
import pandas
print("✅ Notebook PLUTON prêt à travailler")


✅ Notebook PLUTON prêt à travailler


In [2]:
import pandas as pd
from pathlib import Path

# 1️⃣ Chemin vers le CSV (doit être dans le dossier PLUTON)
csv_path = Path("capture_structured.csv")

# 2️⃣ Chargement des données
df = pd.read_csv(csv_path, low_memory=False)

# 3️⃣ Infos de base
print(f"✅ Données chargées : {df.shape[0]:,} lignes × {df.shape[1]} colonnes")
print(f"📦 Mémoire utilisée : {df.memory_usage(deep=True).sum() / 1024**2:.2f} Mo")

# 4️⃣ Aperçu des premières lignes
display(df.head(5))


✅ Données chargées : 1,166,610 lignes × 10 colonnes
📦 Mémoire utilisée : 305.13 Mo


Unnamed: 0,frame.number,frame.time_epoch,ip.src,ip.dst,udp.srcport,udp.dstport,sip.Method,sip.Status-Code,sip.Call-ID,frame.len
0,1,1749901000.0,172.19.0.3,172.19.0.2,5060,5060,INVITE,,1-1@172.19.0.3,547
1,2,1749901000.0,172.19.0.2,172.19.0.3,5060,5060,,180.0,1-1@172.19.0.3,337
2,3,1749901000.0,172.19.0.2,172.19.0.3,5060,5060,,200.0,1-1@172.19.0.3,492
3,4,1749901000.0,172.19.0.3,172.19.0.2,5060,5060,ACK,,1-1@172.19.0.3,391
4,5,1749901000.0,172.19.0.3,172.19.0.2,5060,5060,INVITE,,2-1@172.19.0.3,547


In [3]:
# 1️⃣ Valeurs manquantes
na_counts = df.isna().sum().sort_values(ascending=False)
print("🔍 Valeurs manquantes par colonne :")
print(na_counts[na_counts > 0])

# 2️⃣ Répartition des méthodes SIP
print("\n📊 Répartition des méthodes SIP (top 10) :")
print(df['sip.Method'].value_counts(dropna=False).head(10))

# 3️⃣ Répartition des codes SIP
print("\n📊 Répartition des codes SIP (top 10) :")
print(df['sip.Status-Code'].value_counts(dropna=False).head(10))


🔍 Valeurs manquantes par colonne :
sip.Method         767627
sip.Status-Code    398983
dtype: int64

📊 Répartition des méthodes SIP (top 10) :
sip.Method
NaN         767627
INVITE      144393
ACK         102284
BYE         102192
REGISTER     30071
OPTIONS      20043
Name: count, dtype: int64

📊 Répartition des codes SIP (top 10) :
sip.Status-Code
200.0    623235
NaN      398983
180.0    144392
Name: count, dtype: int64


In [4]:
# ✅ Remplir les NaN par des valeurs neutres
df['sip.Method'].fillna('NO_METHOD', inplace=True)
df['sip.Status-Code'].fillna(0, inplace=True)

# ✅ Vérification rapide après imputation
print(df['sip.Method'].value_counts().head(6))
print(df['sip.Status-Code'].value_counts().head(6))


sip.Method
NO_METHOD    767627
INVITE       144393
ACK          102284
BYE          102192
REGISTER      30071
OPTIONS       20043
Name: count, dtype: int64
sip.Status-Code
200.0    623235
0.0      398983
180.0    144392
Name: count, dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['sip.Method'].fillna('NO_METHOD', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['sip.Status-Code'].fillna(0, inplace=True)


In [5]:
session_features = df.groupby('sip.Call-ID').agg(
    session_duration=('frame.time_epoch', lambda x: x.max() - x.min()),
    nb_trames=('frame.number', 'count'),
    nb_invite=('sip.Method', lambda x: (x == 'INVITE').sum()),
    nb_ack=('sip.Method', lambda x: (x == 'ACK').sum()),
    nb_bye=('sip.Method', lambda x: (x == 'BYE').sum()),
    nb_register=('sip.Method', lambda x: (x == 'REGISTER').sum()),
    nb_options=('sip.Method', lambda x: (x == 'OPTIONS').sum()),
    nb_200=('sip.Status-Code', lambda x: (x == 200).sum()),
    nb_180=('sip.Status-Code', lambda x: (x == 180).sum()),
    total_bytes=('frame.len', 'sum'),
    unique_ips=('ip.src', pd.Series.nunique)
).reset_index()

print(f"✅ Sessions agrégées : {session_features.shape[0]}")
display(session_features.head(10))


✅ Sessions agrégées : 194507


Unnamed: 0,sip.Call-ID,session_duration,nb_trames,nb_invite,nb_ack,nb_bye,nb_register,nb_options,nb_200,nb_180,total_bytes,unique_ips
0,1-1@172.19.0.3,1.80998,6,1,1,1,0,0,2,1,2487,2
1,10-1@172.19.0.3,1.809435,6,1,1,1,0,0,2,1,2510,2
2,100-1@172.19.0.3,1.809905,6,1,1,1,0,0,2,1,2533,2
3,1000-1@172.19.0.3,1.80015,6,1,1,1,0,0,2,1,2556,2
4,10000-1@172.19.0.3,1.801057,6,1,1,1,0,0,2,1,2579,2
5,100000-1@172.19.0.3,1.800617,6,1,1,1,0,0,2,1,2602,2
6,100001-1@172.19.0.3,1.800423,6,1,1,1,0,0,2,1,2602,2
7,100002-1@172.19.0.3,1.810569,6,1,1,1,0,0,2,1,2602,2
8,100003-1@172.19.0.3,1.810056,6,1,1,1,0,0,2,1,2602,2
9,100004-1@172.19.0.3,1.799758,6,1,1,1,0,0,2,1,2602,2


In [6]:
def label_from_callid(call_id):
    if isinstance(call_id, str):
        if "optflood" in call_id:
            return "options_flood"
        elif "regflood" in call_id:
            return "register_flood"
        elif "flood-" in call_id:  # flood simple = INVITE flood
            return "invite_flood"
    return "normal"

session_features['attack_type'] = session_features['sip.Call-ID'].apply(label_from_callid)

# ✅ Voir la répartition des sessions labellisées
print(session_features['attack_type'].value_counts())


attack_type
normal            102284
invite_flood       42109
register_flood     30071
options_flood      20043
Name: count, dtype: int64


In [7]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# ✅ 1. Séparer features et label
X = session_features.drop(columns=['sip.Call-ID', 'attack_type'])
y = session_features['attack_type']

# ✅ 2. Encoder le label en numérique
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# ✅ 3. Division train/test
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded  # pour garder la même proportion normal/attaque
)

print("✅ Séparation terminée")
print("Taille train :", X_train.shape[0], "sessions")
print("Taille test :", X_test.shape[0], "sessions")
print("Classes :", list(label_encoder.classes_))


✅ Séparation terminée
Taille train : 155605 sessions
Taille test : 38902 sessions
Classes : ['invite_flood', 'normal', 'options_flood', 'register_flood']


In [8]:
import joblib   # ← ajoute l'import

# ✅ Sérialiser le DataFrame de sessions
session_features.to_pickle("session_features.pkl")

# ✅ Sérialiser l’encodeur de labels
joblib.dump(label_encoder, "pluton_label_encoder.pkl")   # sans espace dans le nom

print("✅ session_features.pkl et pluton_label_encoder.pkl enregistrés.")


✅ session_features.pkl et pluton_label_encoder.pkl enregistrés.
