Preparación y extracción de Datos

In [7]:
#importacion de librerias

from scapy.all import rdpcap, TCP, IP
import numpy as np
from collections import defaultdict
import pandas as pd
import glob
import os
import math


In [None]:
# Experiments with simple extraction of network-flow features from PCAP files

# Glob pattern of the PCAP files to process
pcap_pattern = "Dataset/onu/ONU_capture_game_[1-6].pcapng"  
# Output CSV file where the processed data is saved
# NOTE(review): the output recorded for this cell reports
# Dataset/clases1/game_simple.csv, so it predates this path - re-run to refresh.
output_csv = "Dataset/clases1/game.csv"
# Class label assigned to these flows
target_class = "game"

# Función que procesa un archivo PCAP y extrae características simples de cada flujo
def process_pcap(pcap_file, label):
    """Build one feature row per directional flow found in a capture file.

    A flow is identified by the string
    ``"<src>:<sport>-<dst>:<dport>-<proto>"`` taken from each IP packet, so
    the two directions of a conversation are counted as separate flows.
    Packets without an IP layer are ignored.

    Args:
        pcap_file: Path of the capture, handed to ``rdpcap``.
        label: Value copied into the ``"class"`` field of every row.

    Returns:
        A list of dicts, one per flow, with keys ``flow_id``, ``packets``,
        ``bytes``, ``duration``, ``avg_pkt_size``, ``syn_count``,
        ``ack_count``, ``fin_count``, ``rst_count`` and ``class``. Every
        numeric value is a float.
    """
    # TCP flag bit masks paired with the counter each one feeds.
    flag_counters = (
        ("syn_count", 0x02),
        ("ack_count", 0x10),
        ("fin_count", 0x01),
        ("rst_count", 0x04),
    )

    def new_flow():
        # Fresh accumulator for a flow seen for the first time.
        return {
            "packets": 0,
            "bytes": 0,
            "times": [],
            "syn_count": 0,
            "ack_count": 0,
            "fin_count": 0,
            "rst_count": 0,
            "pkt_sizes": [],
        }

    flows = defaultdict(new_flow)

    for pkt in rdpcap(pcap_file):
        if IP not in pkt:
            continue

        ip_layer = pkt[IP]
        # Packets that expose no port attributes fall back to port 0.
        src_port = getattr(pkt, "sport", 0)
        dst_port = getattr(pkt, "dport", 0)
        flow = flows[f"{ip_layer.src}:{src_port}-{ip_layer.dst}:{dst_port}-{ip_layer.proto}"]

        size = len(pkt)
        flow["packets"] += 1
        flow["bytes"] += size
        flow["times"].append(float(pkt.time))
        flow["pkt_sizes"].append(size)

        if TCP in pkt:
            tcp_flags = pkt[TCP].flags
            for counter, mask in flag_counters:
                if tcp_flags & mask:
                    flow[counter] += 1

    rows = []
    for flow_key, flow in flows.items():
        stamps = flow["times"]
        sizes = flow["pkt_sizes"]

        # Duration is the span between the first and last packet of the flow.
        if len(stamps) > 1:
            duration = float(max(stamps) - min(stamps))
        else:
            duration = 0.0
        mean_size = float(np.mean(sizes)) if sizes else 0.0

        row = {
            "flow_id": flow_key,
            "packets": float(flow["packets"]),
            "bytes": float(flow["bytes"]),
            "duration": duration,
            "avg_pkt_size": mean_size,
        }
        row.update((counter, float(flow[counter])) for counter, _ in flag_counters)
        row["class"] = label
        rows.append(row)
    return rows

# === MAIN PROCESS ===
all_data = []  # Accumulates the flow rows from every capture file
pcap_files = sorted(glob.glob(pcap_pattern))  # All PCAP files matching the pattern, in sorted order

# Process each PCAP file
for pcap_file in pcap_files:
    print(f"Procesando {pcap_file} → clase {target_class}")
    all_data.extend(process_pcap(pcap_file, target_class))  # Append this file's flows to the main list

# Save the processed data to a CSV file
os.makedirs(os.path.dirname(output_csv), exist_ok=True)  # Create the output directory if it does not exist
df = pd.DataFrame(all_data)  # Turn the list of dicts into a DataFrame
df.to_csv(output_csv, index=False)  # Write the CSV without the index column
print(f"✅ Guardado {len(df)} flujos en {output_csv}")


Procesando Dataset/onu\ONU_capture_game_1.pcapng → clase game
Procesando Dataset/onu\ONU_capture_game_2.pcapng → clase game
Procesando Dataset/onu\ONU_capture_game_3.pcapng → clase game
Procesando Dataset/onu\ONU_capture_game_4.pcapng → clase game
Procesando Dataset/onu\ONU_capture_game_5.pcapng → clase game
Procesando Dataset/onu\ONU_capture_game_6.pcapng → clase game
✅ Guardado 5357 flujos en Dataset/clases1/game_simple.csv


In [9]:
# Class: instant-message (flow-level extraction)

# Glob pattern of the captures, output CSV path and label for this class
pcap_pattern = "Dataset/onu/ONU_capture_instant-message_[1-3].pcapng"   
output_csv = "Dataset/clases1/instant-message.csv"
target_class = "instant-message"

def process_pcap(pcap_file, label):
    """Build one feature row per directional flow found in a capture file.

    A flow is identified by the string
    ``"<src>:<sport>-<dst>:<dport>-<proto>"`` taken from each IP packet, so
    the two directions of a conversation are counted as separate flows.
    Packets without an IP layer are ignored.

    Args:
        pcap_file: Path of the capture, handed to ``rdpcap``.
        label: Value copied into the ``"class"`` field of every row.

    Returns:
        A list of dicts, one per flow, with keys ``flow_id``, ``packets``,
        ``bytes``, ``duration``, ``avg_pkt_size``, ``syn_count``,
        ``ack_count``, ``fin_count``, ``rst_count`` and ``class``. Every
        numeric value is a float.
    """
    # TCP flag bit masks paired with the counter each one feeds.
    flag_counters = (
        ("syn_count", 0x02),
        ("ack_count", 0x10),
        ("fin_count", 0x01),
        ("rst_count", 0x04),
    )

    def new_flow():
        # Fresh accumulator for a flow seen for the first time.
        return {
            "packets": 0,
            "bytes": 0,
            "times": [],
            "syn_count": 0,
            "ack_count": 0,
            "fin_count": 0,
            "rst_count": 0,
            "pkt_sizes": [],
        }

    flows = defaultdict(new_flow)

    for pkt in rdpcap(pcap_file):
        if IP not in pkt:
            continue

        ip_layer = pkt[IP]
        # Packets that expose no port attributes fall back to port 0.
        src_port = getattr(pkt, "sport", 0)
        dst_port = getattr(pkt, "dport", 0)
        flow = flows[f"{ip_layer.src}:{src_port}-{ip_layer.dst}:{dst_port}-{ip_layer.proto}"]

        size = len(pkt)
        flow["packets"] += 1
        flow["bytes"] += size
        flow["times"].append(float(pkt.time))
        flow["pkt_sizes"].append(size)

        if TCP in pkt:
            tcp_flags = pkt[TCP].flags
            for counter, mask in flag_counters:
                if tcp_flags & mask:
                    flow[counter] += 1

    rows = []
    for flow_key, flow in flows.items():
        stamps = flow["times"]
        sizes = flow["pkt_sizes"]

        # Duration is the span between the first and last packet of the flow.
        if len(stamps) > 1:
            duration = float(max(stamps) - min(stamps))
        else:
            duration = 0.0
        mean_size = float(np.mean(sizes)) if sizes else 0.0

        row = {
            "flow_id": flow_key,
            "packets": float(flow["packets"]),
            "bytes": float(flow["bytes"]),
            "duration": duration,
            "avg_pkt_size": mean_size,
        }
        row.update((counter, float(flow[counter])) for counter, _ in flag_counters)
        row["class"] = label
        rows.append(row)
    return rows

# === PROCESS ===
# Collect the flow rows of every capture matching pcap_pattern (sorted order).
all_data = []
pcap_files = sorted(glob.glob(pcap_pattern))

for pcap_file in pcap_files:
    print(f"Procesando {pcap_file} → clase {target_class}")
    all_data.extend(process_pcap(pcap_file, target_class))

# Save the CSV, creating the output directory first if it does not exist
os.makedirs(os.path.dirname(output_csv), exist_ok=True)
df = pd.DataFrame(all_data)
df.to_csv(output_csv, index=False)
print(f"✅ Guardado {len(df)} flujos en {output_csv}")

Procesando Dataset/onu\ONU_capture_instant-message_1.pcapng → clase instant-message
Procesando Dataset/onu\ONU_capture_instant-message_2.pcapng → clase instant-message
Procesando Dataset/onu\ONU_capture_instant-message_3.pcapng → clase instant-message
✅ Guardado 4377 flujos en Dataset/clases1/instant-message.csv


In [10]:
# Class: mail (flow-level extraction)

# Glob pattern of the captures, output CSV path and label for this class
pcap_pattern = "Dataset/onu/ONU_capture_mail-service_[1-2].pcapng"  
output_csv = "Dataset/clases1/mail.csv"
target_class = "mail"

def process_pcap(pcap_file, label):
    """Build one feature row per directional flow found in a capture file.

    A flow is identified by the string
    ``"<src>:<sport>-<dst>:<dport>-<proto>"`` taken from each IP packet, so
    the two directions of a conversation are counted as separate flows.
    Packets without an IP layer are ignored.

    Args:
        pcap_file: Path of the capture, handed to ``rdpcap``.
        label: Value copied into the ``"class"`` field of every row.

    Returns:
        A list of dicts, one per flow, with keys ``flow_id``, ``packets``,
        ``bytes``, ``duration``, ``avg_pkt_size``, ``syn_count``,
        ``ack_count``, ``fin_count``, ``rst_count`` and ``class``. Every
        numeric value is a float.
    """
    # TCP flag bit masks paired with the counter each one feeds.
    flag_counters = (
        ("syn_count", 0x02),
        ("ack_count", 0x10),
        ("fin_count", 0x01),
        ("rst_count", 0x04),
    )

    def new_flow():
        # Fresh accumulator for a flow seen for the first time.
        return {
            "packets": 0,
            "bytes": 0,
            "times": [],
            "syn_count": 0,
            "ack_count": 0,
            "fin_count": 0,
            "rst_count": 0,
            "pkt_sizes": [],
        }

    flows = defaultdict(new_flow)

    for pkt in rdpcap(pcap_file):
        if IP not in pkt:
            continue

        ip_layer = pkt[IP]
        # Packets that expose no port attributes fall back to port 0.
        src_port = getattr(pkt, "sport", 0)
        dst_port = getattr(pkt, "dport", 0)
        flow = flows[f"{ip_layer.src}:{src_port}-{ip_layer.dst}:{dst_port}-{ip_layer.proto}"]

        size = len(pkt)
        flow["packets"] += 1
        flow["bytes"] += size
        flow["times"].append(float(pkt.time))
        flow["pkt_sizes"].append(size)

        if TCP in pkt:
            tcp_flags = pkt[TCP].flags
            for counter, mask in flag_counters:
                if tcp_flags & mask:
                    flow[counter] += 1

    rows = []
    for flow_key, flow in flows.items():
        stamps = flow["times"]
        sizes = flow["pkt_sizes"]

        # Duration is the span between the first and last packet of the flow.
        if len(stamps) > 1:
            duration = float(max(stamps) - min(stamps))
        else:
            duration = 0.0
        mean_size = float(np.mean(sizes)) if sizes else 0.0

        row = {
            "flow_id": flow_key,
            "packets": float(flow["packets"]),
            "bytes": float(flow["bytes"]),
            "duration": duration,
            "avg_pkt_size": mean_size,
        }
        row.update((counter, float(flow[counter])) for counter, _ in flag_counters)
        row["class"] = label
        rows.append(row)
    return rows

# === PROCESS ===
# Collect the flow rows of every capture matching pcap_pattern (sorted order).
all_data = []
pcap_files = sorted(glob.glob(pcap_pattern))

for pcap_file in pcap_files:
    print(f"Procesando {pcap_file} → clase {target_class}")
    all_data.extend(process_pcap(pcap_file, target_class))

# Save the CSV, creating the output directory first if it does not exist
os.makedirs(os.path.dirname(output_csv), exist_ok=True)
df = pd.DataFrame(all_data)
df.to_csv(output_csv, index=False)
print(f"✅ Guardado {len(df)} flujos en {output_csv}")

Procesando Dataset/onu\ONU_capture_mail-service_1.pcapng → clase mail
Procesando Dataset/onu\ONU_capture_mail-service_2.pcapng → clase mail
✅ Guardado 3045 flujos en Dataset/clases1/mail.csv


In [11]:
# Class: network-storage (flow-level extraction)

# Glob pattern of the captures, output CSV path and label for this class
pcap_pattern = "Dataset/onu/ONU_capture_network-storage_[1-3].pcapng" 
output_csv = "Dataset/clases1/network-storage.csv"
target_class = "network-storage"

def process_pcap(pcap_file, label):
    """Build one feature row per directional flow found in a capture file.

    A flow is identified by the string
    ``"<src>:<sport>-<dst>:<dport>-<proto>"`` taken from each IP packet, so
    the two directions of a conversation are counted as separate flows.
    Packets without an IP layer are ignored.

    Args:
        pcap_file: Path of the capture, handed to ``rdpcap``.
        label: Value copied into the ``"class"`` field of every row.

    Returns:
        A list of dicts, one per flow, with keys ``flow_id``, ``packets``,
        ``bytes``, ``duration``, ``avg_pkt_size``, ``syn_count``,
        ``ack_count``, ``fin_count``, ``rst_count`` and ``class``. Every
        numeric value is a float.
    """
    # TCP flag bit masks paired with the counter each one feeds.
    flag_counters = (
        ("syn_count", 0x02),
        ("ack_count", 0x10),
        ("fin_count", 0x01),
        ("rst_count", 0x04),
    )

    def new_flow():
        # Fresh accumulator for a flow seen for the first time.
        return {
            "packets": 0,
            "bytes": 0,
            "times": [],
            "syn_count": 0,
            "ack_count": 0,
            "fin_count": 0,
            "rst_count": 0,
            "pkt_sizes": [],
        }

    flows = defaultdict(new_flow)

    for pkt in rdpcap(pcap_file):
        if IP not in pkt:
            continue

        ip_layer = pkt[IP]
        # Packets that expose no port attributes fall back to port 0.
        src_port = getattr(pkt, "sport", 0)
        dst_port = getattr(pkt, "dport", 0)
        flow = flows[f"{ip_layer.src}:{src_port}-{ip_layer.dst}:{dst_port}-{ip_layer.proto}"]

        size = len(pkt)
        flow["packets"] += 1
        flow["bytes"] += size
        flow["times"].append(float(pkt.time))
        flow["pkt_sizes"].append(size)

        if TCP in pkt:
            tcp_flags = pkt[TCP].flags
            for counter, mask in flag_counters:
                if tcp_flags & mask:
                    flow[counter] += 1

    rows = []
    for flow_key, flow in flows.items():
        stamps = flow["times"]
        sizes = flow["pkt_sizes"]

        # Duration is the span between the first and last packet of the flow.
        if len(stamps) > 1:
            duration = float(max(stamps) - min(stamps))
        else:
            duration = 0.0
        mean_size = float(np.mean(sizes)) if sizes else 0.0

        row = {
            "flow_id": flow_key,
            "packets": float(flow["packets"]),
            "bytes": float(flow["bytes"]),
            "duration": duration,
            "avg_pkt_size": mean_size,
        }
        row.update((counter, float(flow[counter])) for counter, _ in flag_counters)
        row["class"] = label
        rows.append(row)
    return rows

# === PROCESS ===
# Collect the flow rows of every capture matching pcap_pattern (sorted order).
all_data = []
pcap_files = sorted(glob.glob(pcap_pattern))

for pcap_file in pcap_files:
    print(f"Procesando {pcap_file} → clase {target_class}")
    all_data.extend(process_pcap(pcap_file, target_class))

# Save the CSV, creating the output directory first if it does not exist
os.makedirs(os.path.dirname(output_csv), exist_ok=True)
df = pd.DataFrame(all_data)
df.to_csv(output_csv, index=False)
print(f"✅ Guardado {len(df)} flujos en {output_csv}")

Procesando Dataset/onu\ONU_capture_network-storage_1.pcapng → clase network-storage
Procesando Dataset/onu\ONU_capture_network-storage_2.pcapng → clase network-storage
Procesando Dataset/onu\ONU_capture_network-storage_3.pcapng → clase network-storage
✅ Guardado 4110 flujos en Dataset/clases1/network-storage.csv


In [13]:
# Class: video (flow-level extraction)

# Capture path (a single file, no wildcard), output CSV path and label for this class
pcap_pattern = "Dataset/onu/ONU_capture_video_1.pcapng" 
output_csv = "Dataset/clases1/video.csv"
target_class = "video"

def process_pcap(pcap_file, label):
    """Build one feature row per directional flow found in a capture file.

    A flow is identified by the string
    ``"<src>:<sport>-<dst>:<dport>-<proto>"`` taken from each IP packet, so
    the two directions of a conversation are counted as separate flows.
    Packets without an IP layer are ignored.

    Args:
        pcap_file: Path of the capture, handed to ``rdpcap``.
        label: Value copied into the ``"class"`` field of every row.

    Returns:
        A list of dicts, one per flow, with keys ``flow_id``, ``packets``,
        ``bytes``, ``duration``, ``avg_pkt_size``, ``syn_count``,
        ``ack_count``, ``fin_count``, ``rst_count`` and ``class``. Every
        numeric value is a float.
    """
    # TCP flag bit masks paired with the counter each one feeds.
    flag_counters = (
        ("syn_count", 0x02),
        ("ack_count", 0x10),
        ("fin_count", 0x01),
        ("rst_count", 0x04),
    )

    def new_flow():
        # Fresh accumulator for a flow seen for the first time.
        return {
            "packets": 0,
            "bytes": 0,
            "times": [],
            "syn_count": 0,
            "ack_count": 0,
            "fin_count": 0,
            "rst_count": 0,
            "pkt_sizes": [],
        }

    flows = defaultdict(new_flow)

    for pkt in rdpcap(pcap_file):
        if IP not in pkt:
            continue

        ip_layer = pkt[IP]
        # Packets that expose no port attributes fall back to port 0.
        src_port = getattr(pkt, "sport", 0)
        dst_port = getattr(pkt, "dport", 0)
        flow = flows[f"{ip_layer.src}:{src_port}-{ip_layer.dst}:{dst_port}-{ip_layer.proto}"]

        size = len(pkt)
        flow["packets"] += 1
        flow["bytes"] += size
        flow["times"].append(float(pkt.time))
        flow["pkt_sizes"].append(size)

        if TCP in pkt:
            tcp_flags = pkt[TCP].flags
            for counter, mask in flag_counters:
                if tcp_flags & mask:
                    flow[counter] += 1

    rows = []
    for flow_key, flow in flows.items():
        stamps = flow["times"]
        sizes = flow["pkt_sizes"]

        # Duration is the span between the first and last packet of the flow.
        if len(stamps) > 1:
            duration = float(max(stamps) - min(stamps))
        else:
            duration = 0.0
        mean_size = float(np.mean(sizes)) if sizes else 0.0

        row = {
            "flow_id": flow_key,
            "packets": float(flow["packets"]),
            "bytes": float(flow["bytes"]),
            "duration": duration,
            "avg_pkt_size": mean_size,
        }
        row.update((counter, float(flow[counter])) for counter, _ in flag_counters)
        row["class"] = label
        rows.append(row)
    return rows

# === PROCESS ===
# Collect the flow rows of every capture matching pcap_pattern (sorted order).
all_data = []
pcap_files = sorted(glob.glob(pcap_pattern))

for pcap_file in pcap_files:
    print(f"Procesando {pcap_file} → clase {target_class}")
    all_data.extend(process_pcap(pcap_file, target_class))

# Save the CSV, creating the output directory first if it does not exist
os.makedirs(os.path.dirname(output_csv), exist_ok=True)
df = pd.DataFrame(all_data)
df.to_csv(output_csv, index=False)
print(f"✅ Guardado {len(df)} flujos en {output_csv}")

Procesando Dataset/onu/ONU_capture_video_1.pcapng → clase video
✅ Guardado 3131 flujos en Dataset/clases1/video.csv


In [17]:
# Class: web-browsing (flow-level extraction)

# Glob pattern of the captures, output CSV path and label for this class
pcap_pattern = "Dataset/onu/ONU_capture_web-browsing_[1-2].pcapng" 
output_csv = "Dataset/clases1/web-browsing.csv"
target_class = "web-browsing"

def process_pcap(pcap_file, label):
    """Build one feature row per directional flow found in a capture file.

    A flow is identified by the string
    ``"<src>:<sport>-<dst>:<dport>-<proto>"`` taken from each IP packet, so
    the two directions of a conversation are counted as separate flows.
    Packets without an IP layer are ignored.

    Args:
        pcap_file: Path of the capture, handed to ``rdpcap``.
        label: Value copied into the ``"class"`` field of every row.

    Returns:
        A list of dicts, one per flow, with keys ``flow_id``, ``packets``,
        ``bytes``, ``duration``, ``avg_pkt_size``, ``syn_count``,
        ``ack_count``, ``fin_count``, ``rst_count`` and ``class``. Every
        numeric value is a float.
    """
    # TCP flag bit masks paired with the counter each one feeds.
    flag_counters = (
        ("syn_count", 0x02),
        ("ack_count", 0x10),
        ("fin_count", 0x01),
        ("rst_count", 0x04),
    )

    def new_flow():
        # Fresh accumulator for a flow seen for the first time.
        return {
            "packets": 0,
            "bytes": 0,
            "times": [],
            "syn_count": 0,
            "ack_count": 0,
            "fin_count": 0,
            "rst_count": 0,
            "pkt_sizes": [],
        }

    flows = defaultdict(new_flow)

    for pkt in rdpcap(pcap_file):
        if IP not in pkt:
            continue

        ip_layer = pkt[IP]
        # Packets that expose no port attributes fall back to port 0.
        src_port = getattr(pkt, "sport", 0)
        dst_port = getattr(pkt, "dport", 0)
        flow = flows[f"{ip_layer.src}:{src_port}-{ip_layer.dst}:{dst_port}-{ip_layer.proto}"]

        size = len(pkt)
        flow["packets"] += 1
        flow["bytes"] += size
        flow["times"].append(float(pkt.time))
        flow["pkt_sizes"].append(size)

        if TCP in pkt:
            tcp_flags = pkt[TCP].flags
            for counter, mask in flag_counters:
                if tcp_flags & mask:
                    flow[counter] += 1

    rows = []
    for flow_key, flow in flows.items():
        stamps = flow["times"]
        sizes = flow["pkt_sizes"]

        # Duration is the span between the first and last packet of the flow.
        if len(stamps) > 1:
            duration = float(max(stamps) - min(stamps))
        else:
            duration = 0.0
        mean_size = float(np.mean(sizes)) if sizes else 0.0

        row = {
            "flow_id": flow_key,
            "packets": float(flow["packets"]),
            "bytes": float(flow["bytes"]),
            "duration": duration,
            "avg_pkt_size": mean_size,
        }
        row.update((counter, float(flow[counter])) for counter, _ in flag_counters)
        row["class"] = label
        rows.append(row)
    return rows

# === PROCESS ===
# Collect the flow rows of every capture matching pcap_pattern (sorted order).
all_data = []
pcap_files = sorted(glob.glob(pcap_pattern))

for pcap_file in pcap_files:
    print(f"Procesando {pcap_file} → clase {target_class}")
    all_data.extend(process_pcap(pcap_file, target_class))

# Save the CSV, creating the output directory first if it does not exist
os.makedirs(os.path.dirname(output_csv), exist_ok=True)
df = pd.DataFrame(all_data)
df.to_csv(output_csv, index=False)
print(f"✅ Guardado {len(df)} flujos en {output_csv}")


Procesando Dataset/onu\ONU_capture_web-browsing_1.pcapng → clase web-browsing
Procesando Dataset/onu\ONU_capture_web-browsing_2.pcapng → clase web-browsing
✅ Guardado 4813 flujos en Dataset/clases1/web-browsing.csv


Extracción aplicando Ventana

In [None]:
# Class: game (windowed extraction)

pcap_pattern = "Dataset/onu/ONU_capture_game_[1-4].pcapng"  # processes captures 1 to 4
# NOTE(review): the output recorded for this cell reports Dataset/clases/game.csv,
# so it predates this path - re-run to refresh.
output_csv = "Dataset/clases2/game.csv"
target_class = "game"
window_size = 1.0  # seconds per window

# ==============================
# Función para extraer features por ventanas
# ==============================
def extract_features_from_pcap_windowed(pcap_file, window_size=1.0, label=None):
    """Summarise the IP traffic of one capture in fixed-length time windows.

    Windows are laid out back to back starting at the earliest packet
    timestamp in the capture; windows that received no IP packet are skipped.

    Args:
        pcap_file: Path of the capture, handed to ``rdpcap``.
        window_size: Window length in seconds; must be positive.
        label: Value stored in the ``"class"`` field. When ``None`` the
            notebook-level ``target_class`` is used, as before.

    Returns:
        A list with one dict per non-empty window, in chronological order,
        with keys ``window_start``, ``window_end``, ``packets``, ``bytes``,
        ``duration``, ``avg_pkt_size``, ``throughput``, ``pkt_rate`` and
        ``class``. ``packets`` and ``bytes`` are ints; the other numeric
        values are plain floats.

    Raises:
        ValueError: If ``window_size`` is not positive.
    """
    if window_size <= 0:
        raise ValueError("window_size must be a positive number of seconds")
    window_size = float(window_size)
    if label is None:
        label = target_class  # backward-compatible fallback to the cell's global

    packets = rdpcap(pcap_file)
    if len(packets) == 0:
        return []

    # Anchor on the earliest timestamp rather than packets[0]: a capture is
    # not guaranteed to be time-ordered, and packets older than the anchor
    # used to be dropped silently.
    start_time = min(pkt.time for pkt in packets)

    # Sparse map of window index -> packet sizes. Not pre-sizing the window
    # list removes the old ceil() off-by-one, which discarded a packet landing
    # exactly on the final boundary (including the only packet of a
    # single-packet capture).
    sizes_by_window = {}
    for pkt in packets:
        if IP in pkt:
            # Subtract before converting so the offset keeps the timestamp's
            # full precision; the float conversion keeps Decimal-typed values
            # (visible as "1.4E+2" in earlier outputs) out of the features.
            offset = float(pkt.time - start_time)
            sizes_by_window.setdefault(int(offset // window_size), []).append(len(pkt))

    origin = float(start_time)
    features = []
    for idx in sorted(sizes_by_window):
        sizes = sizes_by_window[idx]
        n_packets = len(sizes)
        total_bytes = sum(sizes)
        window_start = origin + idx * window_size

        features.append({
            "window_start": window_start,
            "window_end": window_start + window_size,
            "packets": n_packets,
            "bytes": total_bytes,
            # Every window has the nominal length; using it directly avoids
            # float round-off from subtracting two epoch-sized numbers.
            "duration": window_size,
            "avg_pkt_size": total_bytes / n_packets,
            "throughput": total_bytes / window_size,
            "pkt_rate": n_packets / window_size,
            "class": label,
        })

    return features

# ==============================
# Process every matching capture
# ==============================
# Sorted so the row order of the dataset is reproducible: glob.glob() returns
# its matches in arbitrary, filesystem-dependent order.
pcap_files = sorted(glob.glob(pcap_pattern))
print(f"🔎 Encontrados {len(pcap_files)} archivos para procesar")

data = []
for f in pcap_files:
    feats = extract_features_from_pcap_windowed(f, window_size)
    data.extend(feats)
    print(f"✅ Procesado {os.path.basename(f)} con {len(feats)} ventanas")

df = pd.DataFrame(data)

# ==============================
# Save the dataset
# ==============================
os.makedirs(os.path.dirname(output_csv), exist_ok=True)  # create the output directory if missing
df.to_csv(output_csv, index=False)

print(f"\n✅ Dataset guardado en {output_csv} con {len(df)} flujos")
print(df.head())


🔎 Encontrados 4 archivos para procesar


✅ Procesado ONU_capture_game_1.pcapng con 258 ventanas
✅ Procesado ONU_capture_game_2.pcapng con 295 ventanas
✅ Procesado ONU_capture_game_3.pcapng con 300 ventanas
✅ Procesado ONU_capture_game_4.pcapng con 301 ventanas

✅ Dataset guardado en Dataset/clases/game.csv con 1154 flujos
        window_start         window_end  packets   bytes  duration  \
0  1693900141.261672  1693900142.261672        1     144  1.000000   
1  1693900142.261672  1693900143.261672       18    5082  1.000000   
2  1693900143.261672  1693900144.261672      576  314467  1.000000   
3  1693900144.261672  1693900145.261672     1202  652591  1.000000   
4  1693900145.261672  1693900146.261672        2     140  1.000000   

   avg_pkt_size throughput pkt_rate class  
0    144.000000        144        1  game  
1    282.333333       5082       18  game  
2    545.949653     314467      576  game  
3    542.920965     652591     1202  game  
4     70.000000     1.4E+2        2  game  


In [None]:
# Class: instant-message (windowed extraction)

pcap_pattern = "Dataset/onu/ONU_capture_instant-message_1.pcapng"   
# NOTE(review): the output recorded for this cell reports
# Dataset/clases/instant-message.csv, so it predates this path - re-run to refresh.
output_csv = "Dataset/clases2/instant-message.csv"
target_class = "instant-message"
window_size = 1  # seconds per window

# ==============================
# Función para extraer features por ventanas
# ==============================
def extract_features_from_pcap_windowed(pcap_file, window_size=1.0, label=None):
    """Summarise the IP traffic of one capture in fixed-length time windows.

    Windows are laid out back to back starting at the earliest packet
    timestamp in the capture; windows that received no IP packet are skipped.

    Args:
        pcap_file: Path of the capture, handed to ``rdpcap``.
        window_size: Window length in seconds; must be positive.
        label: Value stored in the ``"class"`` field. When ``None`` the
            notebook-level ``target_class`` is used, as before.

    Returns:
        A list with one dict per non-empty window, in chronological order,
        with keys ``window_start``, ``window_end``, ``packets``, ``bytes``,
        ``duration``, ``avg_pkt_size``, ``throughput``, ``pkt_rate`` and
        ``class``. ``packets`` and ``bytes`` are ints; the other numeric
        values are plain floats.

    Raises:
        ValueError: If ``window_size`` is not positive.
    """
    if window_size <= 0:
        raise ValueError("window_size must be a positive number of seconds")
    window_size = float(window_size)
    if label is None:
        label = target_class  # backward-compatible fallback to the cell's global

    packets = rdpcap(pcap_file)
    if len(packets) == 0:
        return []

    # Anchor on the earliest timestamp rather than packets[0]: a capture is
    # not guaranteed to be time-ordered, and packets older than the anchor
    # used to be dropped silently.
    start_time = min(pkt.time for pkt in packets)

    # Sparse map of window index -> packet sizes. Not pre-sizing the window
    # list removes the old ceil() off-by-one, which discarded a packet landing
    # exactly on the final boundary (including the only packet of a
    # single-packet capture).
    sizes_by_window = {}
    for pkt in packets:
        if IP in pkt:
            # Subtract before converting so the offset keeps the timestamp's
            # full precision; the float conversion keeps Decimal-typed values
            # (visible as "9.406E+4" in earlier outputs) out of the features.
            offset = float(pkt.time - start_time)
            sizes_by_window.setdefault(int(offset // window_size), []).append(len(pkt))

    origin = float(start_time)
    features = []
    for idx in sorted(sizes_by_window):
        sizes = sizes_by_window[idx]
        n_packets = len(sizes)
        total_bytes = sum(sizes)
        window_start = origin + idx * window_size

        features.append({
            "window_start": window_start,
            "window_end": window_start + window_size,
            "packets": n_packets,
            "bytes": total_bytes,
            # Every window has the nominal length; using it directly avoids
            # float round-off from subtracting two epoch-sized numbers.
            "duration": window_size,
            "avg_pkt_size": total_bytes / n_packets,
            "throughput": total_bytes / window_size,
            "pkt_rate": n_packets / window_size,
            "class": label,
        })

    return features

# ==============================
# Process every matching capture
# ==============================
# Sorted so the row order of the dataset is reproducible: glob.glob() returns
# its matches in arbitrary, filesystem-dependent order.
pcap_files = sorted(glob.glob(pcap_pattern))
print(f"🔎 Encontrados {len(pcap_files)} archivos para procesar")

data = []
for f in pcap_files:
    feats = extract_features_from_pcap_windowed(f, window_size)
    data.extend(feats)
    print(f"✅ Procesado {os.path.basename(f)} con {len(feats)} ventanas")

df = pd.DataFrame(data)

# ==============================
# Save the dataset
# ==============================
os.makedirs(os.path.dirname(output_csv), exist_ok=True)  # create the output directory if missing
df.to_csv(output_csv, index=False)

print(f"\n✅ Dataset guardado en {output_csv} con {len(df)} flujos")
print(df.head())

🔎 Encontrados 1 archivos para procesar
✅ Procesado ONU_capture_instant-message_1.pcapng con 274 ventanas

✅ Dataset guardado en Dataset/clases/instant-message.csv con 274 flujos
        window_start         window_end  packets   bytes  duration  \
0  1693899429.966343  1693899430.966343      497  153868  1.000000   
1  1693899430.966343  1693899431.966343      134   86374  1.000000   
2  1693899431.966343  1693899432.966343      158   94060  1.000000   
3  1693899432.966343  1693899433.966343      301  112154  1.000000   
4  1693899433.966343  1693899434.966343      120   28347  1.000000   

   avg_pkt_size throughput pkt_rate            class  
0    309.593561     153868      497  instant-message  
1    644.582090      86374      134  instant-message  
2    595.316456   9.406E+4      158  instant-message  
3    372.604651     112154      301  instant-message  
4    236.225000      28347   1.2E+2  instant-message  


In [None]:
# Class: mail (windowed extraction)

pcap_pattern = "Dataset/onu/ONU_capture_mail-service_[1].pcapng"  
# NOTE(review): the output recorded for this cell reports Dataset/clases/mail.csv,
# so it predates this path - re-run to refresh.
output_csv = "Dataset/clases2/mail.csv"
target_class = "mail"
window_size = 1  # seconds per window


# ==============================
# Función para extraer features por ventanas
# ==============================
def extract_features_from_pcap_windowed(pcap_file, window_size=1.0, label=None):
    """Summarise the IP traffic of one capture in fixed-length time windows.

    Windows are laid out back to back starting at the earliest packet
    timestamp in the capture; windows that received no IP packet are skipped.

    Args:
        pcap_file: Path of the capture, handed to ``rdpcap``.
        window_size: Window length in seconds; must be positive.
        label: Value stored in the ``"class"`` field. When ``None`` the
            notebook-level ``target_class`` is used, as before.

    Returns:
        A list with one dict per non-empty window, in chronological order,
        with keys ``window_start``, ``window_end``, ``packets``, ``bytes``,
        ``duration``, ``avg_pkt_size``, ``throughput``, ``pkt_rate`` and
        ``class``. ``packets`` and ``bytes`` are ints; the other numeric
        values are plain floats.

    Raises:
        ValueError: If ``window_size`` is not positive.
    """
    if window_size <= 0:
        raise ValueError("window_size must be a positive number of seconds")
    window_size = float(window_size)
    if label is None:
        label = target_class  # backward-compatible fallback to the cell's global

    packets = rdpcap(pcap_file)
    if len(packets) == 0:
        return []

    # Anchor on the earliest timestamp rather than packets[0]: a capture is
    # not guaranteed to be time-ordered, and packets older than the anchor
    # used to be dropped silently.
    start_time = min(pkt.time for pkt in packets)

    # Sparse map of window index -> packet sizes. Not pre-sizing the window
    # list removes the old ceil() off-by-one, which discarded a packet landing
    # exactly on the final boundary (including the only packet of a
    # single-packet capture).
    sizes_by_window = {}
    for pkt in packets:
        if IP in pkt:
            # Subtract before converting so the offset keeps the timestamp's
            # full precision; the float conversion keeps Decimal-typed values
            # (visible as "2.52696E+6" in earlier outputs) out of the features.
            offset = float(pkt.time - start_time)
            sizes_by_window.setdefault(int(offset // window_size), []).append(len(pkt))

    origin = float(start_time)
    features = []
    for idx in sorted(sizes_by_window):
        sizes = sizes_by_window[idx]
        n_packets = len(sizes)
        total_bytes = sum(sizes)
        window_start = origin + idx * window_size

        features.append({
            "window_start": window_start,
            "window_end": window_start + window_size,
            "packets": n_packets,
            "bytes": total_bytes,
            # Every window has the nominal length; using it directly avoids
            # float round-off from subtracting two epoch-sized numbers.
            "duration": window_size,
            "avg_pkt_size": total_bytes / n_packets,
            "throughput": total_bytes / window_size,
            "pkt_rate": n_packets / window_size,
            "class": label,
        })

    return features

# ==============================
# Process every matching capture
# ==============================
# Sorted so the row order of the dataset is reproducible: glob.glob() returns
# its matches in arbitrary, filesystem-dependent order.
pcap_files = sorted(glob.glob(pcap_pattern))
print(f"🔎 Encontrados {len(pcap_files)} archivos para procesar")

data = []
for f in pcap_files:
    feats = extract_features_from_pcap_windowed(f, window_size)
    data.extend(feats)
    print(f"✅ Procesado {os.path.basename(f)} con {len(feats)} ventanas")

df = pd.DataFrame(data)

# ==============================
# Save the dataset
# ==============================
os.makedirs(os.path.dirname(output_csv), exist_ok=True)  # create the output directory if missing
df.to_csv(output_csv, index=False)

print(f"\n✅ Dataset guardado en {output_csv} con {len(df)} flujos")
print(df.head())

🔎 Encontrados 1 archivos para procesar
✅ Procesado ONU_capture_mail-service_1.pcapng con 301 ventanas

✅ Dataset guardado en Dataset/clases/mail.csv con 301 flujos
        window_start         window_end  packets    bytes  duration  \
0  1693898362.004763  1693898363.004763       50     7254  1.000000   
1  1693898363.004763  1693898364.004763     4420  2526960  1.000000   
2  1693898364.004763  1693898365.004763     3906  2300288  1.000000   
3  1693898365.004763  1693898366.004763      841   417849  1.000000   
4  1693898366.004763  1693898367.004763       26     6423  1.000000   

   avg_pkt_size  throughput pkt_rate class  
0    145.080000        7254     5E+1  mail  
1    571.710407  2.52696E+6  4.42E+3  mail  
2    588.911418     2300288     3906  mail  
3    496.847800      417849      841  mail  
4    247.038462        6423       26  mail  


In [None]:
# Class: network-storage (windowed extraction)

pcap_pattern = "Dataset/onu/ONU_capture_network-storage_[1-3].pcapng" 
# NOTE(review): the output recorded for this cell reports
# Dataset/clases/network-storage.csv, so it predates this path - re-run to refresh.
output_csv = "Dataset/clases2/network-storage.csv"
target_class = "network-storage"
window_size = 1  # seconds per window

# ==============================
# Función para extraer features por ventanas
# ==============================
def extract_features_from_pcap_windowed(pcap_file, window_size=1.0, label=None):
    """Summarise the IP traffic of one capture in fixed-length time windows.

    Windows are laid out back to back starting at the earliest packet
    timestamp in the capture; windows that received no IP packet are skipped.

    Args:
        pcap_file: Path of the capture, handed to ``rdpcap``.
        window_size: Window length in seconds; must be positive.
        label: Value stored in the ``"class"`` field. When ``None`` the
            notebook-level ``target_class`` is used, as before.

    Returns:
        A list with one dict per non-empty window, in chronological order,
        with keys ``window_start``, ``window_end``, ``packets``, ``bytes``,
        ``duration``, ``avg_pkt_size``, ``throughput``, ``pkt_rate`` and
        ``class``. ``packets`` and ``bytes`` are ints; the other numeric
        values are plain floats.

    Raises:
        ValueError: If ``window_size`` is not positive.
    """
    if window_size <= 0:
        raise ValueError("window_size must be a positive number of seconds")
    window_size = float(window_size)
    if label is None:
        label = target_class  # backward-compatible fallback to the cell's global

    packets = rdpcap(pcap_file)
    if len(packets) == 0:
        return []

    # Anchor on the earliest timestamp rather than packets[0]: a capture is
    # not guaranteed to be time-ordered, and packets older than the anchor
    # used to be dropped silently.
    start_time = min(pkt.time for pkt in packets)

    # Sparse map of window index -> packet sizes. Not pre-sizing the window
    # list removes the old ceil() off-by-one, which discarded a packet landing
    # exactly on the final boundary (including the only packet of a
    # single-packet capture).
    sizes_by_window = {}
    for pkt in packets:
        if IP in pkt:
            # Subtract before converting so the offset keeps the timestamp's
            # full precision; the float conversion yields plain-float features
            # instead of the timestamp's own numeric type.
            offset = float(pkt.time - start_time)
            sizes_by_window.setdefault(int(offset // window_size), []).append(len(pkt))

    origin = float(start_time)
    features = []
    for idx in sorted(sizes_by_window):
        sizes = sizes_by_window[idx]
        n_packets = len(sizes)
        total_bytes = sum(sizes)
        window_start = origin + idx * window_size

        features.append({
            "window_start": window_start,
            "window_end": window_start + window_size,
            "packets": n_packets,
            "bytes": total_bytes,
            # Every window has the nominal length; using it directly avoids
            # float round-off from subtracting two epoch-sized numbers.
            "duration": window_size,
            "avg_pkt_size": total_bytes / n_packets,
            "throughput": total_bytes / window_size,
            "pkt_rate": n_packets / window_size,
            "class": label,
        })

    return features

# ==============================
# Process every matching capture
# ==============================
# Sorted so the row order of the dataset is reproducible: glob.glob() returns
# its matches in arbitrary, filesystem-dependent order.
pcap_files = sorted(glob.glob(pcap_pattern))
print(f"🔎 Encontrados {len(pcap_files)} archivos para procesar")

data = []
for f in pcap_files:
    feats = extract_features_from_pcap_windowed(f, window_size)
    data.extend(feats)
    print(f"✅ Procesado {os.path.basename(f)} con {len(feats)} ventanas")

df = pd.DataFrame(data)

# ==============================
# Save the dataset
# ==============================
os.makedirs(os.path.dirname(output_csv), exist_ok=True)  # create the output directory if missing
df.to_csv(output_csv, index=False)

print(f"\n✅ Dataset guardado en {output_csv} con {len(df)} flujos")
print(df.head())

🔎 Encontrados 3 archivos para procesar
✅ Procesado ONU_capture_network-storage_1.pcapng con 303 ventanas
✅ Procesado ONU_capture_network-storage_2.pcapng con 279 ventanas
✅ Procesado ONU_capture_network-storage_3.pcapng con 302 ventanas

✅ Dataset guardado en Dataset/clases/network-storage.csv con 884 flujos
        window_start         window_end  packets  bytes  duration  \
0  1693896868.195161  1693896869.195161      265  94732  1.000000   
1  1693896869.195161  1693896870.195161      194  91852  1.000000   
2  1693896870.195161  1693896871.195161       12   4653  1.000000   
3  1693896871.195161  1693896872.195161       25   4638  1.000000   
4  1693896872.195161  1693896873.195161       60  18350  1.000000   

   avg_pkt_size throughput pkt_rate            class  
0    357.479245      94732      265  network-storage  
1    473.463918      91852      194  network-storage  
2    387.750000       4653       12  network-storage  
3    185.520000       4638       25  network-storage  


In [None]:
# Class: video (windowed extraction)

# Glob pattern of the captures, output CSV path and label for this class
pcap_pattern = "Dataset/onu/ONU_capture_video_[1-2].pcapng" 
output_csv = "Dataset/clases2/video.csv"
target_class = "video"
window_size = 1  # seconds per window

# ==============================
# Window-based feature extraction
# ==============================
def extract_features_from_pcap_windowed(pcap_file, window_size=1.0, label=None):
    """Aggregate the IP packets of one capture into fixed-length time windows.

    The capture is read with ``rdpcap``. Every packet that has an IP layer is
    assigned to window ``floor((t - t0) / window_size)``, where ``t0`` is the
    earliest timestamp found in the capture. Windows that receive no IP
    packet produce no row.

    Parameters
    ----------
    pcap_file : str
        Path of the capture file to read.
    window_size : float, default 1.0
        Window length, in the unit of the packet timestamps (the notebook
        configures it in seconds).
    label : str, optional
        Value stored under the ``"class"`` key. When omitted, the
        notebook-level ``target_class`` variable is used, which keeps
        existing two-argument calls working.

    Returns
    -------
    list of dict
        One dict per non-empty window, ordered by window index, with keys
        ``window_start``, ``window_end``, ``packets``, ``bytes``,
        ``duration``, ``avg_pkt_size``, ``throughput``, ``pkt_rate`` and
        ``class``. All time and rate values are plain Python floats.

    Raises
    ------
    ValueError
        If ``window_size`` is not positive.
    """
    if window_size <= 0:
        raise ValueError("window_size must be positive")
    if label is None:
        # Backward-compatible default: the label configured at cell level.
        label = target_class

    packets = rdpcap(pcap_file)
    if len(packets) == 0:
        return []

    # Use the earliest timestamp rather than packets[0] so a capture that is
    # not strictly time-ordered does not lose packets to negative indices.
    start_time = min(pkt.time for pkt in packets)
    window_len = float(window_size)

    # Group packet sizes by window index. Indexing by floor division (instead
    # of pre-allocating ceil(duration / window_size) buckets) keeps the last
    # packet when the capture length is an exact multiple of the window, and
    # keeps the only window of a capture whose packets share one timestamp.
    sizes_by_window = defaultdict(list)
    for pkt in packets:
        if IP in pkt:
            # Subtract in the timestamp's native type first, then convert:
            # timestamps may be decimal objects, and mixing them into the
            # arithmetic below would leave non-float values in the columns.
            offset = float(pkt.time - start_time)
            sizes_by_window[int(offset // window_len)].append(len(pkt))

    origin = float(start_time)
    features = []
    for idx in sorted(sizes_by_window):
        sizes = sizes_by_window[idx]
        window_start = origin + idx * window_len
        n_packets = len(sizes)
        total_bytes = sum(sizes)

        features.append({
            "window_start": window_start,
            "window_end": window_start + window_len,
            "packets": n_packets,
            "bytes": total_bytes,
            "duration": window_len,
            "avg_pkt_size": total_bytes / n_packets,
            "throughput": total_bytes / window_len,
            "pkt_rate": n_packets / window_len,
            "class": label
        })

    return features

# ==============================
# Process every capture file
# ==============================
# glob.glob gives no ordering guarantee; sorting makes the row order of the
# saved CSV reproducible between runs and machines.
pcap_files = sorted(glob.glob(pcap_pattern))
print(f"🔎 Encontrados {len(pcap_files)} archivos para procesar")

# Fail here instead of silently writing an empty CSV that would only break
# later, when the per-class files are read back and merged.
if not pcap_files:
    raise FileNotFoundError(f"No se encontraron archivos PCAP para el patrón: {pcap_pattern}")

data = []
for f in pcap_files:
    feats = extract_features_from_pcap_windowed(f, window_size)
    data.extend(feats)
    print(f"✅ Procesado {os.path.basename(f)} con {len(feats)} ventanas")

# One row per non-empty time window, all files concatenated.
df = pd.DataFrame(data)

# ==============================
# Save dataset
# ==============================
# os.makedirs("") raises, so only create the directory when the output path
# actually has a directory component.
out_dir = os.path.dirname(output_csv)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)
df.to_csv(output_csv, index=False)

# Rows are time windows, not flows, so the summary says "ventanas".
print(f"\n✅ Dataset guardado en {output_csv} con {len(df)} ventanas")
print(df.head())

🔎 Encontrados 2 archivos para procesar
✅ Procesado ONU_capture_video_1.pcapng con 302 ventanas
✅ Procesado ONU_capture_video_2.pcapng con 303 ventanas

✅ Dataset guardado en Dataset/clases/video.csv con 605 flujos
        window_start         window_end  packets   bytes  duration  \
0  1693897322.577452  1693897323.577452     1431  987801  1.000000   
1  1693897323.577452  1693897324.577452       46   16015  1.000000   
2  1693897324.577452  1693897325.577452       42   14051  1.000000   
3  1693897325.577452  1693897326.577452      492  151302  1.000000   
4  1693897326.577452  1693897327.577452      379  146994  1.000000   

   avg_pkt_size throughput pkt_rate  class  
0    690.287212     987801     1431  video  
1    348.152174      16015       46  video  
2    334.547619      14051       42  video  
3    307.524390     151302      492  video  
4    387.846966     146994      379  video  


In [None]:
# Class: web-browsing
# Configuration read by the rest of this cell: input capture files, output
# CSV path, label written to the "class" column and window length.
# NOTE(review): the stored output of this cell reports a save under
# "Dataset/clases/", not "Dataset/clases2/" — presumably stale; re-run to confirm.

pcap_pattern = "Dataset/onu/ONU_capture_web-browsing_[1-2].pcapng"
output_csv = "Dataset/clases2/web-browsing.csv"
target_class = "web-browsing"
window_size = 1  # seconds per window

# ==============================
# Window-based feature extraction
# ==============================
def extract_features_from_pcap_windowed(pcap_file, window_size=1.0, label=None):
    """Aggregate the IP packets of one capture into fixed-length time windows.

    The capture is read with ``rdpcap``. Every packet that has an IP layer is
    assigned to window ``floor((t - t0) / window_size)``, where ``t0`` is the
    earliest timestamp found in the capture. Windows that receive no IP
    packet produce no row.

    Parameters
    ----------
    pcap_file : str
        Path of the capture file to read.
    window_size : float, default 1.0
        Window length, in the unit of the packet timestamps (the notebook
        configures it in seconds).
    label : str, optional
        Value stored under the ``"class"`` key. When omitted, the
        notebook-level ``target_class`` variable is used, which keeps
        existing two-argument calls working.

    Returns
    -------
    list of dict
        One dict per non-empty window, ordered by window index, with keys
        ``window_start``, ``window_end``, ``packets``, ``bytes``,
        ``duration``, ``avg_pkt_size``, ``throughput``, ``pkt_rate`` and
        ``class``. All time and rate values are plain Python floats.

    Raises
    ------
    ValueError
        If ``window_size`` is not positive.
    """
    if window_size <= 0:
        raise ValueError("window_size must be positive")
    if label is None:
        # Backward-compatible default: the label configured at cell level.
        label = target_class

    packets = rdpcap(pcap_file)
    if len(packets) == 0:
        return []

    # Use the earliest timestamp rather than packets[0] so a capture that is
    # not strictly time-ordered does not lose packets to negative indices.
    start_time = min(pkt.time for pkt in packets)
    window_len = float(window_size)

    # Group packet sizes by window index. Indexing by floor division (instead
    # of pre-allocating ceil(duration / window_size) buckets) keeps the last
    # packet when the capture length is an exact multiple of the window, and
    # keeps the only window of a capture whose packets share one timestamp.
    sizes_by_window = defaultdict(list)
    for pkt in packets:
        if IP in pkt:
            # Subtract in the timestamp's native type first, then convert:
            # timestamps may be decimal objects, and mixing them into the
            # arithmetic below would leave non-float values in the columns.
            offset = float(pkt.time - start_time)
            sizes_by_window[int(offset // window_len)].append(len(pkt))

    origin = float(start_time)
    features = []
    for idx in sorted(sizes_by_window):
        sizes = sizes_by_window[idx]
        window_start = origin + idx * window_len
        n_packets = len(sizes)
        total_bytes = sum(sizes)

        features.append({
            "window_start": window_start,
            "window_end": window_start + window_len,
            "packets": n_packets,
            "bytes": total_bytes,
            "duration": window_len,
            "avg_pkt_size": total_bytes / n_packets,
            "throughput": total_bytes / window_len,
            "pkt_rate": n_packets / window_len,
            "class": label
        })

    return features

# ==============================
# Process every capture file
# ==============================
# glob.glob gives no ordering guarantee; sorting makes the row order of the
# saved CSV reproducible between runs and machines.
pcap_files = sorted(glob.glob(pcap_pattern))
print(f"🔎 Encontrados {len(pcap_files)} archivos para procesar")

# Fail here instead of silently writing an empty CSV that would only break
# later, when the per-class files are read back and merged.
if not pcap_files:
    raise FileNotFoundError(f"No se encontraron archivos PCAP para el patrón: {pcap_pattern}")

data = []
for f in pcap_files:
    feats = extract_features_from_pcap_windowed(f, window_size)
    data.extend(feats)
    print(f"✅ Procesado {os.path.basename(f)} con {len(feats)} ventanas")

# One row per non-empty time window, all files concatenated.
df = pd.DataFrame(data)

# ==============================
# Save dataset
# ==============================
# os.makedirs("") raises, so only create the directory when the output path
# actually has a directory component.
out_dir = os.path.dirname(output_csv)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)
df.to_csv(output_csv, index=False)

# Rows are time windows, not flows, so the summary says "ventanas".
print(f"\n✅ Dataset guardado en {output_csv} con {len(df)} ventanas")
print(df.head())

🔎 Encontrados 2 archivos para procesar
✅ Procesado ONU_capture_web-browsing_1.pcapng con 301 ventanas
✅ Procesado ONU_capture_web-browsing_2.pcapng con 295 ventanas

✅ Dataset guardado en Dataset/clases/web-browsing.csv con 596 flujos
        window_start         window_end  packets    bytes  duration  \
0  1693898020.251076  1693898021.251076       30     2308  1.000000   
1  1693898021.251076  1693898022.251076       14     7037  1.000000   
2  1693898022.251076  1693898023.251076     3004  1527584  1.000000   
3  1693898023.251076  1693898024.251076     1691   843539  1.000000   
4  1693898024.251076  1693898025.251076     1514   774217  1.000000   

   avg_pkt_size throughput pkt_rate         class  
0     76.933333       2308     3E+1  web-browsing  
1    502.642857       7037       14  web-browsing  
2    508.516644    1527584     3004  web-browsing  
3    498.840331     843539     1691  web-browsing  
4    511.371863     774217     1514  web-browsing  


Código general para agrupar los datos

In [18]:
import pandas as pd
import glob
import os

# ==============================
# Path configuration
# ==============================
clases_dir = "Dataset/clases1/*.csv"  # glob pattern matching the per-class CSVs
output_csv = "Dataset/dataset_all1.csv"  # final combined dataset

# ==============================
# Load every per-class CSV
# ==============================
# Sorted so the row order of the combined file does not depend on the
# filesystem's listing order.
csv_files = sorted(glob.glob(clases_dir))
print(f"Encontrados {len(csv_files)} archivos de clases")

# pd.concat raises an unhelpful ValueError on an empty list; report the real
# cause (nothing matched the pattern) instead.
if not csv_files:
    raise FileNotFoundError(f"No se encontraron archivos CSV para el patrón: {clases_dir}")

df_all = pd.concat([pd.read_csv(f) for f in csv_files], ignore_index=True)
print(f"Dataset combinado inicial: {df_all.shape[0]} filas")

# ==============================
# Save final dataset
# ==============================
# os.makedirs("") raises, so only create the directory when the output path
# actually has a directory component.
out_dir = os.path.dirname(output_csv)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)
df_all.to_csv(output_csv, index=False)

print(f"✅ Dataset combinado guardado en {output_csv}")



Encontrados 6 archivos de clases
Dataset combinado inicial: 24833 filas
✅ Dataset combinado guardado en Dataset/dataset_all1.csv


In [19]:
import pandas as pd
import glob
import os

# ==============================
# Path configuration
# ==============================
clases_dir = "Dataset/clases2/*.csv"  # glob pattern matching the per-class CSVs
output_csv = "Dataset/dataset_all2.csv"  # final combined dataset

# ==============================
# Load every per-class CSV
# ==============================
# Sorted so the row order of the combined file does not depend on the
# filesystem's listing order.
csv_files = sorted(glob.glob(clases_dir))
print(f"Encontrados {len(csv_files)} archivos de clases")

# pd.concat raises an unhelpful ValueError on an empty list; report the real
# cause (nothing matched the pattern) instead.
if not csv_files:
    raise FileNotFoundError(f"No se encontraron archivos CSV para el patrón: {clases_dir}")

df_all = pd.concat([pd.read_csv(f) for f in csv_files], ignore_index=True)
print(f"Dataset combinado inicial: {df_all.shape[0]} filas")

# ==============================
# Save final dataset
# ==============================
# os.makedirs("") raises, so only create the directory when the output path
# actually has a directory component.
out_dir = os.path.dirname(output_csv)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)
df_all.to_csv(output_csv, index=False)

print(f"✅ Dataset combinado guardado en {output_csv}")


Encontrados 6 archivos de clases
Dataset combinado inicial: 3814 filas
✅ Dataset combinado guardado en Dataset/dataset_all2.csv
