Preparación y extracción de Datos

In [None]:
from scapy.all import rdpcap
import pandas as pd
from collections import defaultdict
import glob
import os

# === CONFIG ===
pcap_dir = "Dataset/onu/*_1.pcapng"   # ruta a los pcaps
dataset_out = "Dataset/dataset_all.csv"     #nombre de salida pcaps

# Función para mapear nombre de archivo a etiqueta
def get_label_from_filename(filename):
    name = os.path.basename(filename).lower()
    if "game" in name:
        return "game"
    elif "video" in name:
        return "video"
    elif "mail" in name:
        return "mail"
    elif "instant-message" in name:
        return "instant-message"
    elif "network-storage" in name:
        return "network-storage"
    elif "network-transmission" in name:
        return "network-transmission"
    elif "web-browsing" in name:
        return "web-browsing"
    else:
        return "unknown"

# Lista para el dataset final
all_data = []

# Procesar todos los PCAPs en la carpeta
for pcap_file in glob.glob(pcap_dir):
    app_label = get_label_from_filename(pcap_file)
    print(f"Procesando {pcap_file} → etiqueta: {app_label}")

    packets = rdpcap(pcap_file)
    flows = defaultdict(lambda: {"packets": 0, "bytes": 0, "times": []})

    for pkt in packets:
        if "IP" in pkt:
            src = pkt["IP"].src
            dst = pkt["IP"].dst
            proto = pkt["IP"].proto
            sport = pkt.sport if hasattr(pkt, "sport") else 0
            dport = pkt.dport if hasattr(pkt, "dport") else 0

            flow_id = f"{src}:{sport}-{dst}:{dport}-{proto}"

            flows[flow_id]["packets"] += 1
            flows[flow_id]["bytes"] += len(pkt)
            flows[flow_id]["times"].append(pkt.time)

    # Construir dataset parcial para este PCAP
    for fid, stats in flows.items():
        times = stats["times"]
        duration = max(times) - min(times) if len(times) > 1 else 0
        avg_pkt_size = stats["bytes"] / stats["packets"] if stats["packets"] > 0 else 0
        throughput = stats["bytes"] / duration if duration > 0 else stats["bytes"]

        all_data.append({
            "flow_id": fid,
            "packets": stats["packets"],
            "bytes": stats["bytes"],
            "duration": duration,
            "avg_pkt_size": avg_pkt_size,
            "throughput": throughput,
            "class": app_label
        })

# Convertir a DataFrame y guardar
df = pd.DataFrame(all_data)
print(df.head())
df.to_csv(dataset_out, index=False)
print(f"Dataset completo guardado en {dataset_out}")




📂 Procesando Dataset/onu\ONU_capture_game_1.pcapng → etiqueta: game
📂 Procesando Dataset/onu\ONU_capture_instant-message_1.pcapng → etiqueta: instant-message
📂 Procesando Dataset/onu\ONU_capture_mail-service_1.pcapng → etiqueta: mail
📂 Procesando Dataset/onu\ONU_capture_network-storage_1.pcapng → etiqueta: network-storage
📂 Procesando Dataset/onu\ONU_capture_network-transmission_1.pcapng → etiqueta: network-transmission
📂 Procesando Dataset/onu\ONU_capture_video_1.pcapng → etiqueta: video
📂 Procesando Dataset/onu\ONU_capture_web-browsing_1.pcapng → etiqueta: web-browsing
                                      flow_id  packets  bytes    duration  \
0  210.196.16.152:2103-152.185.184.31:2103-17       45   6480  220.210560   
1       61.139.26.122:60678-61.139.2.69:53-17       33   2988  296.189484   
2       61.139.2.69:53-61.139.26.122:60678-17       33   5934  296.190030   
3     61.139.26.122:44006-47.94.114.72:1443-6       29   3799   16.317012   
4     47.94.114.72:1443-61.139.26.122

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

# Cargar dataset
df = pd.read_csv("Dataset/dataset_all.csv")

# Estadísticas básicas antes del balanceo
print("Estadísticas de features originales:")
print(df[['packets','bytes','duration','avg_pkt_size','throughput']].describe())

# Codificar etiquetas
le = LabelEncoder()
df['class_encoded'] = le.fit_transform(df['class'])

# ===== Undersampling =====
min_size = df['class_encoded'].value_counts().min()

df_list = []
for cls in df['class_encoded'].unique():
    df_cls = df[df['class_encoded']==cls].sample(n=min_size, random_state=42)
    df_list.append(df_cls)

df_balanced = pd.concat(df_list).sample(frac=1, random_state=42)  # mezclar filas

# Separar X / y
features = ['packets','bytes','duration','avg_pkt_size','throughput']
X = df_balanced[features]
y = df_balanced['class_encoded']

# Escalar features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Guardar dataset preprocesado y balanceado
df_preprocessed = pd.DataFrame(X_scaled, columns=features)
df_preprocessed['class_encoded'] = y.values
df_preprocessed['class'] = le.inverse_transform(y.values)  # opcional, para referencia
df_preprocessed.to_csv("Dataset/dataset_balanced_scaled.csv", index=False)
print("✅ Dataset balanceado y escalado guardado en 'dataset_balanced_scaled.csv'")

# Separar train/test (opcional)
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.3, random_state=42, stratify=y
)

print("Distribución de clases en el train set:")
print(pd.Series(y_train).value_counts())


Estadísticas de features originales:
            packets         bytes      duration  avg_pkt_size    throughput
count  13022.000000  1.302200e+04  13022.000000  13022.000000  1.302200e+04
mean     105.529181  6.334817e+04     19.342542    217.475339  4.993246e+05
std      943.498048  7.776669e+05     51.582559    252.847231  1.532715e+07
min        1.000000  5.400000e+01      0.000000     43.000000  5.328268e-01
25%        1.000000  1.032500e+02      0.000000     74.000000  6.668191e+01
50%        5.000000  4.190000e+02      0.163692    117.000000  1.300000e+02
75%       11.000000  2.520500e+03     14.940699    257.700000  2.672351e+03
max    39237.000000  3.843130e+07    301.382720   1504.949151  5.050000e+08
✅ Dataset balanceado y escalado guardado en 'dataset_balanced_scaled.csv'
Distribución de clases en el train set:
class_encoded
1    350
0    350
5    349
6    349
4    349
3    349
2    349
Name: count, dtype: int64


In [6]:
import pandas as pd

dataset_file = "Dataset/dataset_balanced_scaled.csv"
df = pd.read_csv(dataset_file)

# === Mostrar las primeras filas ===
print("Primeras 5 filas del dataset:")
print(df.head())

# === Mostrar información general ===
print("\nInformación general del dataset:")
print(df.info())

# === Estadísticas básicas de las columnas numéricas ===
print("\nEstadísticas descriptivas:")
print(df.describe())

# === Mostrar conteo de clases ===
print("\nNúmero de flujos por clase:")
print(df['class'].value_counts())

Primeras 5 filas del dataset:
    packets     bytes  duration  avg_pkt_size  throughput  class_encoded  \
0 -0.084077 -0.073483 -0.375125     -0.601286   -0.031483              3   
1 -0.085132 -0.073616 -0.379994     -0.601286   -0.031512              6   
2 -0.085132 -0.073582 -0.379994     -0.535597   -0.031511              5   
3 -0.084077 -0.073483 -0.379508     -0.601286   -0.031185              3   
4 -0.085132 -0.073582 -0.379994     -0.535597   -0.031511              5   

             class  
0  network-storage  
1     web-browsing  
2            video  
3  network-storage  
4            video  

Información general del dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3493 entries, 0 to 3492
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   packets        3493 non-null   float64
 1   bytes          3493 non-null   float64
 2   duration       3493 non-null   float64
 3   avg_pkt_size   3493 no