Preparación y extracción de Datos

In [24]:
# Importación de librerías

from scapy.all import rdpcap, TCP, IP
import pandas as pd
from collections import defaultdict
import numpy as np
import os
import glob

In [None]:

# Dataset for the "game" class

# Glob pattern selecting the capture files of this class.
pcap_pattern = "Dataset/onu/ONU_capture_game_[1-6].pcapng"
# CSV file that receives the flows extracted from those captures.
output_csv = "Dataset/clases/game.csv"
# Label stored in the "class" column of every extracted flow.
target_class = "game"

def process_pcap(pcap_file, label):
    """Build one feature row per unidirectional IP flow found in a capture.

    Packets are grouped under the key ``src:sport-dst:dport-proto``. Packets
    without an IP layer are skipped, and a packet that exposes no ``sport`` /
    ``dport`` attribute is keyed with port 0.

    Args:
        pcap_file: Path of the capture file handed to ``rdpcap``.
        label: Value stored in the ``class`` field of every returned row.

    Returns:
        A list of dicts, one per flow, holding the flow id, the numeric
        features (all converted to float) and the class label.
    """
    # Bit mask of each counted TCP flag and the counter it feeds.
    tcp_flag_counters = (
        (0x02, "syn_count"),
        (0x10, "ack_count"),
        (0x01, "fin_count"),
        (0x04, "rst_count"),
    )

    def _empty_flow():
        # Fresh accumulator for a flow seen for the first time.
        return {
            "packets": 0, "bytes": 0, "times": [],
            "syn_count": 0, "ack_count": 0, "fin_count": 0, "rst_count": 0,
            "pkt_sizes": [],
        }

    flows = defaultdict(_empty_flow)

    # --- Pass 1: accumulate raw per-flow statistics -----------------------
    for pkt in rdpcap(pcap_file):
        if IP not in pkt:
            continue

        ip_hdr = pkt[IP]
        sport = getattr(pkt, "sport", 0)
        dport = getattr(pkt, "dport", 0)
        key = f"{ip_hdr.src}:{sport}-{ip_hdr.dst}:{dport}-{ip_hdr.proto}"

        entry = flows[key]
        frame_len = len(pkt)
        entry["packets"] += 1
        entry["bytes"] += frame_len
        entry["times"].append(float(pkt.time))
        entry["pkt_sizes"].append(frame_len)

        if TCP in pkt:
            flags = pkt[TCP].flags
            for mask, counter in tcp_flag_counters:
                if flags & mask:
                    entry[counter] += 1

    # --- Pass 2: turn the accumulators into feature rows ------------------
    rows = []
    for key, entry in flows.items():
        n_packets = entry["packets"]
        total_bytes = float(entry["bytes"])
        stamps = entry["times"]
        sizes = entry["pkt_sizes"]

        # Time-based features need at least two timestamps.
        if len(stamps) > 1:
            duration = float(max(stamps) - min(stamps))
            interarrival_var = float(np.var(np.diff(sorted(stamps))))
        else:
            duration = 0.0
            interarrival_var = 0.0

        # Size-based features; "small" means a length below 100.
        if sizes:
            avg_pkt_size = float(np.mean(sizes))
            pkt_size_var = float(np.var(sizes))
            small_packet_ratio = (
                float(np.sum(np.array(sizes) < 100)) / float(len(sizes))
            )
        else:
            avg_pkt_size = 0.0
            pkt_size_var = 0.0
            small_packet_ratio = 0.0

        # With zero duration the byte count itself is used as throughput.
        if duration > 0:
            throughput = total_bytes / duration
        else:
            throughput = total_bytes

        if n_packets > 0:
            throughput_per_packet = throughput / float(n_packets)
        else:
            throughput_per_packet = 0.0

        rows.append({
            "flow_id": key,
            "packets": float(n_packets),
            "bytes": total_bytes,
            "duration": duration,
            "avg_pkt_size": avg_pkt_size,
            "throughput": throughput,
            "syn_count": float(entry["syn_count"]),
            "ack_count": float(entry["ack_count"]),
            "fin_count": float(entry["fin_count"]),
            "rst_count": float(entry["rst_count"]),
            "small_packet_ratio": small_packet_ratio,
            "pkt_size_var": pkt_size_var,
            "interarrival_var": interarrival_var,
            "throughput_per_packet": throughput_per_packet,
            "class": label,
        })
    return rows

# === PROCESS ===
# Extract the flows of every capture matching pcap_pattern and store them
# in output_csv, labelled with target_class.
all_data = []
pcap_files = sorted(glob.glob(pcap_pattern))

if not pcap_files:
    # Fail fast: otherwise an empty CSV is written and the class silently
    # disappears from the merged dataset.
    raise FileNotFoundError(f"No capture files match pattern: {pcap_pattern}")

for pcap_file in pcap_files:
    print(f"Procesando {pcap_file} → clase {target_class}")
    all_data.extend(process_pcap(pcap_file, target_class))

# Save CSV (create the target directory only when the path has one;
# os.makedirs("") would raise).
out_dir = os.path.dirname(output_csv)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)
df = pd.DataFrame(all_data)
df.to_csv(output_csv, index=False)
print(f"✅ Guardado {len(df)} flujos en {output_csv}")


Procesando Dataset/onu\ONU_capture_game_1.pcapng → clase game
Procesando Dataset/onu\ONU_capture_game_2.pcapng → clase game
Procesando Dataset/onu\ONU_capture_game_3.pcapng → clase game
Procesando Dataset/onu\ONU_capture_game_4.pcapng → clase game
Procesando Dataset/onu\ONU_capture_game_5.pcapng → clase game
Procesando Dataset/onu\ONU_capture_game_6.pcapng → clase game
✅ Guardado 5357 flujos en Dataset/clases/game.csv


In [None]:

# Dataset for the "instant-message" class

# Glob pattern selecting the capture files of this class.
pcap_pattern = "Dataset/onu/ONU_capture_instant-message_[1-3].pcapng"
# CSV file that receives the flows extracted from those captures.
output_csv = "Dataset/clases/instant-message.csv"
# Label stored in the "class" column of every extracted flow.
target_class = "instant-message"

def process_pcap(pcap_file, label):
    """Build one feature row per unidirectional IP flow found in a capture.

    Packets are grouped under the key ``src:sport-dst:dport-proto``. Packets
    without an IP layer are skipped, and a packet that exposes no ``sport`` /
    ``dport`` attribute is keyed with port 0.

    Args:
        pcap_file: Path of the capture file handed to ``rdpcap``.
        label: Value stored in the ``class`` field of every returned row.

    Returns:
        A list of dicts, one per flow, holding the flow id, the numeric
        features (all converted to float) and the class label.
    """
    # Bit mask of each counted TCP flag and the counter it feeds.
    tcp_flag_counters = (
        (0x02, "syn_count"),
        (0x10, "ack_count"),
        (0x01, "fin_count"),
        (0x04, "rst_count"),
    )

    def _empty_flow():
        # Fresh accumulator for a flow seen for the first time.
        return {
            "packets": 0, "bytes": 0, "times": [],
            "syn_count": 0, "ack_count": 0, "fin_count": 0, "rst_count": 0,
            "pkt_sizes": [],
        }

    flows = defaultdict(_empty_flow)

    # --- Pass 1: accumulate raw per-flow statistics -----------------------
    for pkt in rdpcap(pcap_file):
        if IP not in pkt:
            continue

        ip_hdr = pkt[IP]
        sport = getattr(pkt, "sport", 0)
        dport = getattr(pkt, "dport", 0)
        key = f"{ip_hdr.src}:{sport}-{ip_hdr.dst}:{dport}-{ip_hdr.proto}"

        entry = flows[key]
        frame_len = len(pkt)
        entry["packets"] += 1
        entry["bytes"] += frame_len
        entry["times"].append(float(pkt.time))
        entry["pkt_sizes"].append(frame_len)

        if TCP in pkt:
            flags = pkt[TCP].flags
            for mask, counter in tcp_flag_counters:
                if flags & mask:
                    entry[counter] += 1

    # --- Pass 2: turn the accumulators into feature rows ------------------
    rows = []
    for key, entry in flows.items():
        n_packets = entry["packets"]
        total_bytes = float(entry["bytes"])
        stamps = entry["times"]
        sizes = entry["pkt_sizes"]

        # Time-based features need at least two timestamps.
        if len(stamps) > 1:
            duration = float(max(stamps) - min(stamps))
            interarrival_var = float(np.var(np.diff(sorted(stamps))))
        else:
            duration = 0.0
            interarrival_var = 0.0

        # Size-based features; "small" means a length below 100.
        if sizes:
            avg_pkt_size = float(np.mean(sizes))
            pkt_size_var = float(np.var(sizes))
            small_packet_ratio = (
                float(np.sum(np.array(sizes) < 100)) / float(len(sizes))
            )
        else:
            avg_pkt_size = 0.0
            pkt_size_var = 0.0
            small_packet_ratio = 0.0

        # With zero duration the byte count itself is used as throughput.
        if duration > 0:
            throughput = total_bytes / duration
        else:
            throughput = total_bytes

        if n_packets > 0:
            throughput_per_packet = throughput / float(n_packets)
        else:
            throughput_per_packet = 0.0

        rows.append({
            "flow_id": key,
            "packets": float(n_packets),
            "bytes": total_bytes,
            "duration": duration,
            "avg_pkt_size": avg_pkt_size,
            "throughput": throughput,
            "syn_count": float(entry["syn_count"]),
            "ack_count": float(entry["ack_count"]),
            "fin_count": float(entry["fin_count"]),
            "rst_count": float(entry["rst_count"]),
            "small_packet_ratio": small_packet_ratio,
            "pkt_size_var": pkt_size_var,
            "interarrival_var": interarrival_var,
            "throughput_per_packet": throughput_per_packet,
            "class": label,
        })
    return rows

# === PROCESS ===
# Extract the flows of every capture matching pcap_pattern and store them
# in output_csv, labelled with target_class.
all_data = []
pcap_files = sorted(glob.glob(pcap_pattern))

if not pcap_files:
    # Fail fast: otherwise an empty CSV is written and the class silently
    # disappears from the merged dataset.
    raise FileNotFoundError(f"No capture files match pattern: {pcap_pattern}")

for pcap_file in pcap_files:
    print(f"Procesando {pcap_file} → clase {target_class}")
    all_data.extend(process_pcap(pcap_file, target_class))

# Save CSV (create the target directory only when the path has one;
# os.makedirs("") would raise).
out_dir = os.path.dirname(output_csv)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)
df = pd.DataFrame(all_data)
df.to_csv(output_csv, index=False)
print(f"✅ Guardado {len(df)} flujos en {output_csv}")

Procesando Dataset/onu\ONU_capture_instant-message_1.pcapng → clase instant-message
Procesando Dataset/onu\ONU_capture_instant-message_2.pcapng → clase instant-message
Procesando Dataset/onu\ONU_capture_instant-message_3.pcapng → clase instant-message
✅ Guardado 4377 flujos en Dataset/clases/instant-message.csv


In [None]:
# Dataset for the "mail" class

# Glob pattern selecting the capture files of this class.
pcap_pattern = "Dataset/onu/ONU_capture_mail-service_[1-2].pcapng"
# CSV file that receives the flows extracted from those captures.
output_csv = "Dataset/clases/mail.csv"
# Label stored in the "class" column of every extracted flow.
target_class = "mail"

def process_pcap(pcap_file, label):
    """Build one feature row per unidirectional IP flow found in a capture.

    Packets are grouped under the key ``src:sport-dst:dport-proto``. Packets
    without an IP layer are skipped, and a packet that exposes no ``sport`` /
    ``dport`` attribute is keyed with port 0.

    Args:
        pcap_file: Path of the capture file handed to ``rdpcap``.
        label: Value stored in the ``class`` field of every returned row.

    Returns:
        A list of dicts, one per flow, holding the flow id, the numeric
        features (all converted to float) and the class label.
    """
    # Bit mask of each counted TCP flag and the counter it feeds.
    tcp_flag_counters = (
        (0x02, "syn_count"),
        (0x10, "ack_count"),
        (0x01, "fin_count"),
        (0x04, "rst_count"),
    )

    def _empty_flow():
        # Fresh accumulator for a flow seen for the first time.
        return {
            "packets": 0, "bytes": 0, "times": [],
            "syn_count": 0, "ack_count": 0, "fin_count": 0, "rst_count": 0,
            "pkt_sizes": [],
        }

    flows = defaultdict(_empty_flow)

    # --- Pass 1: accumulate raw per-flow statistics -----------------------
    for pkt in rdpcap(pcap_file):
        if IP not in pkt:
            continue

        ip_hdr = pkt[IP]
        sport = getattr(pkt, "sport", 0)
        dport = getattr(pkt, "dport", 0)
        key = f"{ip_hdr.src}:{sport}-{ip_hdr.dst}:{dport}-{ip_hdr.proto}"

        entry = flows[key]
        frame_len = len(pkt)
        entry["packets"] += 1
        entry["bytes"] += frame_len
        entry["times"].append(float(pkt.time))
        entry["pkt_sizes"].append(frame_len)

        if TCP in pkt:
            flags = pkt[TCP].flags
            for mask, counter in tcp_flag_counters:
                if flags & mask:
                    entry[counter] += 1

    # --- Pass 2: turn the accumulators into feature rows ------------------
    rows = []
    for key, entry in flows.items():
        n_packets = entry["packets"]
        total_bytes = float(entry["bytes"])
        stamps = entry["times"]
        sizes = entry["pkt_sizes"]

        # Time-based features need at least two timestamps.
        if len(stamps) > 1:
            duration = float(max(stamps) - min(stamps))
            interarrival_var = float(np.var(np.diff(sorted(stamps))))
        else:
            duration = 0.0
            interarrival_var = 0.0

        # Size-based features; "small" means a length below 100.
        if sizes:
            avg_pkt_size = float(np.mean(sizes))
            pkt_size_var = float(np.var(sizes))
            small_packet_ratio = (
                float(np.sum(np.array(sizes) < 100)) / float(len(sizes))
            )
        else:
            avg_pkt_size = 0.0
            pkt_size_var = 0.0
            small_packet_ratio = 0.0

        # With zero duration the byte count itself is used as throughput.
        if duration > 0:
            throughput = total_bytes / duration
        else:
            throughput = total_bytes

        if n_packets > 0:
            throughput_per_packet = throughput / float(n_packets)
        else:
            throughput_per_packet = 0.0

        rows.append({
            "flow_id": key,
            "packets": float(n_packets),
            "bytes": total_bytes,
            "duration": duration,
            "avg_pkt_size": avg_pkt_size,
            "throughput": throughput,
            "syn_count": float(entry["syn_count"]),
            "ack_count": float(entry["ack_count"]),
            "fin_count": float(entry["fin_count"]),
            "rst_count": float(entry["rst_count"]),
            "small_packet_ratio": small_packet_ratio,
            "pkt_size_var": pkt_size_var,
            "interarrival_var": interarrival_var,
            "throughput_per_packet": throughput_per_packet,
            "class": label,
        })
    return rows

# === PROCESS ===
# Extract the flows of every capture matching pcap_pattern and store them
# in output_csv, labelled with target_class.
all_data = []
pcap_files = sorted(glob.glob(pcap_pattern))

if not pcap_files:
    # Fail fast: otherwise an empty CSV is written and the class silently
    # disappears from the merged dataset.
    raise FileNotFoundError(f"No capture files match pattern: {pcap_pattern}")

for pcap_file in pcap_files:
    print(f"Procesando {pcap_file} → clase {target_class}")
    all_data.extend(process_pcap(pcap_file, target_class))

# Save CSV (create the target directory only when the path has one;
# os.makedirs("") would raise).
out_dir = os.path.dirname(output_csv)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)
df = pd.DataFrame(all_data)
df.to_csv(output_csv, index=False)
print(f"✅ Guardado {len(df)} flujos en {output_csv}")

Procesando Dataset/onu\ONU_capture_mail-service_1.pcapng → clase mail
Procesando Dataset/onu\ONU_capture_mail-service_2.pcapng → clase mail
✅ Guardado 3045 flujos en Dataset/clases/mail.csv


In [None]:
# Dataset for the "network-storage" class

# Glob pattern selecting the capture files of this class.
pcap_pattern = "Dataset/onu/ONU_capture_network-storage_[1-3].pcapng"
# CSV file that receives the flows extracted from those captures.
output_csv = "Dataset/clases/network-storage.csv"
# Label stored in the "class" column of every extracted flow.
target_class = "network-storage"

def process_pcap(pcap_file, label):
    """Build one feature row per unidirectional IP flow found in a capture.

    Packets are grouped under the key ``src:sport-dst:dport-proto``. Packets
    without an IP layer are skipped, and a packet that exposes no ``sport`` /
    ``dport`` attribute is keyed with port 0.

    Args:
        pcap_file: Path of the capture file handed to ``rdpcap``.
        label: Value stored in the ``class`` field of every returned row.

    Returns:
        A list of dicts, one per flow, holding the flow id, the numeric
        features (all converted to float) and the class label.
    """
    # Bit mask of each counted TCP flag and the counter it feeds.
    tcp_flag_counters = (
        (0x02, "syn_count"),
        (0x10, "ack_count"),
        (0x01, "fin_count"),
        (0x04, "rst_count"),
    )

    def _empty_flow():
        # Fresh accumulator for a flow seen for the first time.
        return {
            "packets": 0, "bytes": 0, "times": [],
            "syn_count": 0, "ack_count": 0, "fin_count": 0, "rst_count": 0,
            "pkt_sizes": [],
        }

    flows = defaultdict(_empty_flow)

    # --- Pass 1: accumulate raw per-flow statistics -----------------------
    for pkt in rdpcap(pcap_file):
        if IP not in pkt:
            continue

        ip_hdr = pkt[IP]
        sport = getattr(pkt, "sport", 0)
        dport = getattr(pkt, "dport", 0)
        key = f"{ip_hdr.src}:{sport}-{ip_hdr.dst}:{dport}-{ip_hdr.proto}"

        entry = flows[key]
        frame_len = len(pkt)
        entry["packets"] += 1
        entry["bytes"] += frame_len
        entry["times"].append(float(pkt.time))
        entry["pkt_sizes"].append(frame_len)

        if TCP in pkt:
            flags = pkt[TCP].flags
            for mask, counter in tcp_flag_counters:
                if flags & mask:
                    entry[counter] += 1

    # --- Pass 2: turn the accumulators into feature rows ------------------
    rows = []
    for key, entry in flows.items():
        n_packets = entry["packets"]
        total_bytes = float(entry["bytes"])
        stamps = entry["times"]
        sizes = entry["pkt_sizes"]

        # Time-based features need at least two timestamps.
        if len(stamps) > 1:
            duration = float(max(stamps) - min(stamps))
            interarrival_var = float(np.var(np.diff(sorted(stamps))))
        else:
            duration = 0.0
            interarrival_var = 0.0

        # Size-based features; "small" means a length below 100.
        if sizes:
            avg_pkt_size = float(np.mean(sizes))
            pkt_size_var = float(np.var(sizes))
            small_packet_ratio = (
                float(np.sum(np.array(sizes) < 100)) / float(len(sizes))
            )
        else:
            avg_pkt_size = 0.0
            pkt_size_var = 0.0
            small_packet_ratio = 0.0

        # With zero duration the byte count itself is used as throughput.
        if duration > 0:
            throughput = total_bytes / duration
        else:
            throughput = total_bytes

        if n_packets > 0:
            throughput_per_packet = throughput / float(n_packets)
        else:
            throughput_per_packet = 0.0

        rows.append({
            "flow_id": key,
            "packets": float(n_packets),
            "bytes": total_bytes,
            "duration": duration,
            "avg_pkt_size": avg_pkt_size,
            "throughput": throughput,
            "syn_count": float(entry["syn_count"]),
            "ack_count": float(entry["ack_count"]),
            "fin_count": float(entry["fin_count"]),
            "rst_count": float(entry["rst_count"]),
            "small_packet_ratio": small_packet_ratio,
            "pkt_size_var": pkt_size_var,
            "interarrival_var": interarrival_var,
            "throughput_per_packet": throughput_per_packet,
            "class": label,
        })
    return rows

# === PROCESS ===
# Extract the flows of every capture matching pcap_pattern and store them
# in output_csv, labelled with target_class.
all_data = []
pcap_files = sorted(glob.glob(pcap_pattern))

if not pcap_files:
    # Fail fast: otherwise an empty CSV is written and the class silently
    # disappears from the merged dataset.
    raise FileNotFoundError(f"No capture files match pattern: {pcap_pattern}")

for pcap_file in pcap_files:
    print(f"Procesando {pcap_file} → clase {target_class}")
    all_data.extend(process_pcap(pcap_file, target_class))

# Save CSV (create the target directory only when the path has one;
# os.makedirs("") would raise).
out_dir = os.path.dirname(output_csv)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)
df = pd.DataFrame(all_data)
df.to_csv(output_csv, index=False)
print(f"✅ Guardado {len(df)} flujos en {output_csv}")

  pcap_pattern = "Dataset\onu\ONU_capture_network-storage_[1-3].pcapng"


Procesando Dataset\onu\ONU_capture_network-storage_1.pcapng → clase network-storage
Procesando Dataset\onu\ONU_capture_network-storage_2.pcapng → clase network-storage
Procesando Dataset\onu\ONU_capture_network-storage_3.pcapng → clase network-storage
✅ Guardado 4110 flujos en Dataset/clases/network-storage.csv


In [26]:
# Dataset for the "video" class

# Path of the single capture file of this class (used as a glob pattern).
pcap_pattern = "Dataset/onu/ONU_capture_video_1.pcapng"
# CSV file that receives the flows extracted from that capture.
output_csv = "Dataset/clases/video.csv"
# Label stored in the "class" column of every extracted flow.
target_class = "video"

def process_pcap(pcap_file, label):
    """Build one feature row per unidirectional IP flow found in a capture.

    Packets are grouped under the key ``src:sport-dst:dport-proto``. Packets
    without an IP layer are skipped, and a packet that exposes no ``sport`` /
    ``dport`` attribute is keyed with port 0.

    Args:
        pcap_file: Path of the capture file handed to ``rdpcap``.
        label: Value stored in the ``class`` field of every returned row.

    Returns:
        A list of dicts, one per flow, holding the flow id, the numeric
        features (all converted to float) and the class label.
    """
    # Bit mask of each counted TCP flag and the counter it feeds.
    tcp_flag_counters = (
        (0x02, "syn_count"),
        (0x10, "ack_count"),
        (0x01, "fin_count"),
        (0x04, "rst_count"),
    )

    def _empty_flow():
        # Fresh accumulator for a flow seen for the first time.
        return {
            "packets": 0, "bytes": 0, "times": [],
            "syn_count": 0, "ack_count": 0, "fin_count": 0, "rst_count": 0,
            "pkt_sizes": [],
        }

    flows = defaultdict(_empty_flow)

    # --- Pass 1: accumulate raw per-flow statistics -----------------------
    for pkt in rdpcap(pcap_file):
        if IP not in pkt:
            continue

        ip_hdr = pkt[IP]
        sport = getattr(pkt, "sport", 0)
        dport = getattr(pkt, "dport", 0)
        key = f"{ip_hdr.src}:{sport}-{ip_hdr.dst}:{dport}-{ip_hdr.proto}"

        entry = flows[key]
        frame_len = len(pkt)
        entry["packets"] += 1
        entry["bytes"] += frame_len
        entry["times"].append(float(pkt.time))
        entry["pkt_sizes"].append(frame_len)

        if TCP in pkt:
            flags = pkt[TCP].flags
            for mask, counter in tcp_flag_counters:
                if flags & mask:
                    entry[counter] += 1

    # --- Pass 2: turn the accumulators into feature rows ------------------
    rows = []
    for key, entry in flows.items():
        n_packets = entry["packets"]
        total_bytes = float(entry["bytes"])
        stamps = entry["times"]
        sizes = entry["pkt_sizes"]

        # Time-based features need at least two timestamps.
        if len(stamps) > 1:
            duration = float(max(stamps) - min(stamps))
            interarrival_var = float(np.var(np.diff(sorted(stamps))))
        else:
            duration = 0.0
            interarrival_var = 0.0

        # Size-based features; "small" means a length below 100.
        if sizes:
            avg_pkt_size = float(np.mean(sizes))
            pkt_size_var = float(np.var(sizes))
            small_packet_ratio = (
                float(np.sum(np.array(sizes) < 100)) / float(len(sizes))
            )
        else:
            avg_pkt_size = 0.0
            pkt_size_var = 0.0
            small_packet_ratio = 0.0

        # With zero duration the byte count itself is used as throughput.
        if duration > 0:
            throughput = total_bytes / duration
        else:
            throughput = total_bytes

        if n_packets > 0:
            throughput_per_packet = throughput / float(n_packets)
        else:
            throughput_per_packet = 0.0

        rows.append({
            "flow_id": key,
            "packets": float(n_packets),
            "bytes": total_bytes,
            "duration": duration,
            "avg_pkt_size": avg_pkt_size,
            "throughput": throughput,
            "syn_count": float(entry["syn_count"]),
            "ack_count": float(entry["ack_count"]),
            "fin_count": float(entry["fin_count"]),
            "rst_count": float(entry["rst_count"]),
            "small_packet_ratio": small_packet_ratio,
            "pkt_size_var": pkt_size_var,
            "interarrival_var": interarrival_var,
            "throughput_per_packet": throughput_per_packet,
            "class": label,
        })
    return rows

# === PROCESS ===
# Extract the flows of every capture matching pcap_pattern and store them
# in output_csv, labelled with target_class.
all_data = []
pcap_files = sorted(glob.glob(pcap_pattern))

if not pcap_files:
    # Fail fast: otherwise an empty CSV is written and the class silently
    # disappears from the merged dataset.
    raise FileNotFoundError(f"No capture files match pattern: {pcap_pattern}")

for pcap_file in pcap_files:
    print(f"Procesando {pcap_file} → clase {target_class}")
    all_data.extend(process_pcap(pcap_file, target_class))

# Save CSV (create the target directory only when the path has one;
# os.makedirs("") would raise).
out_dir = os.path.dirname(output_csv)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)
df = pd.DataFrame(all_data)
df.to_csv(output_csv, index=False)
print(f"✅ Guardado {len(df)} flujos en {output_csv}")

Procesando Dataset/onu/ONU_capture_video_1.pcapng → clase video
✅ Guardado 3131 flujos en Dataset/clases/video.csv


In [27]:
# Dataset for the "web-browsing" class

# Glob pattern selecting the capture files of this class.
pcap_pattern = "Dataset/onu/ONU_capture_web-browsing_[1-2].pcapng"
# CSV file that receives the flows extracted from those captures.
output_csv = "Dataset/clases/web-browsing.csv"
# Label stored in the "class" column of every extracted flow.
target_class = "web-browsing"

def process_pcap(pcap_file, label):
    """Build one feature row per unidirectional IP flow found in a capture.

    Packets are grouped under the key ``src:sport-dst:dport-proto``. Packets
    without an IP layer are skipped, and a packet that exposes no ``sport`` /
    ``dport`` attribute is keyed with port 0.

    Args:
        pcap_file: Path of the capture file handed to ``rdpcap``.
        label: Value stored in the ``class`` field of every returned row.

    Returns:
        A list of dicts, one per flow, holding the flow id, the numeric
        features (all converted to float) and the class label.
    """
    # Bit mask of each counted TCP flag and the counter it feeds.
    tcp_flag_counters = (
        (0x02, "syn_count"),
        (0x10, "ack_count"),
        (0x01, "fin_count"),
        (0x04, "rst_count"),
    )

    def _empty_flow():
        # Fresh accumulator for a flow seen for the first time.
        return {
            "packets": 0, "bytes": 0, "times": [],
            "syn_count": 0, "ack_count": 0, "fin_count": 0, "rst_count": 0,
            "pkt_sizes": [],
        }

    flows = defaultdict(_empty_flow)

    # --- Pass 1: accumulate raw per-flow statistics -----------------------
    for pkt in rdpcap(pcap_file):
        if IP not in pkt:
            continue

        ip_hdr = pkt[IP]
        sport = getattr(pkt, "sport", 0)
        dport = getattr(pkt, "dport", 0)
        key = f"{ip_hdr.src}:{sport}-{ip_hdr.dst}:{dport}-{ip_hdr.proto}"

        entry = flows[key]
        frame_len = len(pkt)
        entry["packets"] += 1
        entry["bytes"] += frame_len
        entry["times"].append(float(pkt.time))
        entry["pkt_sizes"].append(frame_len)

        if TCP in pkt:
            flags = pkt[TCP].flags
            for mask, counter in tcp_flag_counters:
                if flags & mask:
                    entry[counter] += 1

    # --- Pass 2: turn the accumulators into feature rows ------------------
    rows = []
    for key, entry in flows.items():
        n_packets = entry["packets"]
        total_bytes = float(entry["bytes"])
        stamps = entry["times"]
        sizes = entry["pkt_sizes"]

        # Time-based features need at least two timestamps.
        if len(stamps) > 1:
            duration = float(max(stamps) - min(stamps))
            interarrival_var = float(np.var(np.diff(sorted(stamps))))
        else:
            duration = 0.0
            interarrival_var = 0.0

        # Size-based features; "small" means a length below 100.
        if sizes:
            avg_pkt_size = float(np.mean(sizes))
            pkt_size_var = float(np.var(sizes))
            small_packet_ratio = (
                float(np.sum(np.array(sizes) < 100)) / float(len(sizes))
            )
        else:
            avg_pkt_size = 0.0
            pkt_size_var = 0.0
            small_packet_ratio = 0.0

        # With zero duration the byte count itself is used as throughput.
        if duration > 0:
            throughput = total_bytes / duration
        else:
            throughput = total_bytes

        if n_packets > 0:
            throughput_per_packet = throughput / float(n_packets)
        else:
            throughput_per_packet = 0.0

        rows.append({
            "flow_id": key,
            "packets": float(n_packets),
            "bytes": total_bytes,
            "duration": duration,
            "avg_pkt_size": avg_pkt_size,
            "throughput": throughput,
            "syn_count": float(entry["syn_count"]),
            "ack_count": float(entry["ack_count"]),
            "fin_count": float(entry["fin_count"]),
            "rst_count": float(entry["rst_count"]),
            "small_packet_ratio": small_packet_ratio,
            "pkt_size_var": pkt_size_var,
            "interarrival_var": interarrival_var,
            "throughput_per_packet": throughput_per_packet,
            "class": label,
        })
    return rows

# === PROCESS ===
# Extract the flows of every capture matching pcap_pattern and store them
# in output_csv, labelled with target_class.
all_data = []
pcap_files = sorted(glob.glob(pcap_pattern))

if not pcap_files:
    # Fail fast: otherwise an empty CSV is written and the class silently
    # disappears from the merged dataset.
    raise FileNotFoundError(f"No capture files match pattern: {pcap_pattern}")

for pcap_file in pcap_files:
    print(f"Procesando {pcap_file} → clase {target_class}")
    all_data.extend(process_pcap(pcap_file, target_class))

# Save CSV (create the target directory only when the path has one;
# os.makedirs("") would raise).
out_dir = os.path.dirname(output_csv)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)
df = pd.DataFrame(all_data)
df.to_csv(output_csv, index=False)
print(f"✅ Guardado {len(df)} flujos en {output_csv}")

Procesando Dataset/onu\ONU_capture_web-browsing_1.pcapng → clase web-browsing
Procesando Dataset/onu\ONU_capture_web-browsing_2.pcapng → clase web-browsing
✅ Guardado 4813 flujos en Dataset/clases/web-browsing.csv


In [30]:
import pandas as pd
import glob
import os

# Merge the per-class CSV files into one dataset and fold the "game" and
# "instant-message" labels into a single "interactive-app" class.

# Glob pattern of the per-class CSV files and path of the merged output.
clases_dir = "Dataset/clases/*.csv"
output_csv = "Dataset/dataset_all.csv"

# Load every class CSV. glob returns paths in arbitrary order, so sort them:
# the row order of the merged file feeds the seeded sampling done later and
# must be reproducible across machines.
csv_files = sorted(glob.glob(clases_dir))
print(f"Encontrados {len(csv_files)} archivos de clases")

if not csv_files:
    # pd.concat on an empty list fails with an unhelpful
    # "No objects to concatenate"; report the real cause instead.
    raise FileNotFoundError(f"No class CSV files match pattern: {clases_dir}")

# Join all of them into a single DataFrame
df_all = pd.concat([pd.read_csv(f) for f in csv_files], ignore_index=True)

# Merge classes that are hard to tell apart
df_all['class'] = df_all['class'].replace({
    'game': 'interactive-app',
    'instant-message': 'interactive-app'
})

# Save the unified dataset (create the directory only when the path has one;
# os.makedirs("") would raise).
out_dir = os.path.dirname(output_csv)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)
df_all.to_csv(output_csv, index=False)

print(f"✅ Dataset combinado guardado en {output_csv}")
print("Distribución de clases después de combinar:")
print(df_all['class'].value_counts())


Encontrados 6 archivos de clases
✅ Dataset combinado guardado en Dataset/dataset_all.csv
Distribución de clases después de combinar:
class
interactive-app    9734
web-browsing       4813
network-storage    4110
video              3131
mail               3045
Name: count, dtype: int64


In [31]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Balance the merged dataset by undersampling every class to the size of the
# smallest one, standardize the numeric features and save the result.

# Load dataset
df = pd.read_csv("Dataset/dataset_all.csv")

# -------------------------------
# Balance classes (undersampling)
# -------------------------------
min_class_size = df['class'].value_counts().min()
print(f"Tamaño de clase minoritaria: {min_class_size}")

# Draw min_class_size rows from each class and join them with ONE concat.
# Growing a DataFrame by concatenating onto an empty frame inside the loop
# copies the data on every iteration and relies on pandas' deprecated
# handling of empty frames in concat.
class_samples = [
    df[df['class'] == cls].sample(min_class_size, random_state=42)
    for cls in df['class'].unique()
]
balanced_df = pd.concat(class_samples)

# Shuffle the rows
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

print("\n=== Distribución de clases tras undersampling ===")
print(balanced_df['class'].value_counts())

# -------------------------------
# Scale numeric metrics
# -------------------------------
features = ['packets','bytes','duration','avg_pkt_size','throughput',
            'syn_count','ack_count','fin_count','rst_count',
            'small_packet_ratio','pkt_size_var','interarrival_var',
            'throughput_per_packet']

# NOTE(review): the scaler is fit on the whole balanced set. If a train/test
# split is made later, test-set statistics leak into training; confirm
# against the modelling code and fit on the training split only if so.
scaler = StandardScaler()
balanced_df[features] = scaler.fit_transform(balanced_df[features])

print("\n=== Primeras filas tras escalado ===")
print(balanced_df.head())

# -------------------------------
# Save balanced and scaled dataset
# -------------------------------
balanced_df.to_csv("Dataset/dataset_balanced_scaled.csv", index=False)
print("\nDataset balanceado y escalado guardado en 'dataset_balanced_scaled.csv'")


Tamaño de clase minoritaria: 3045

=== Distribución de clases tras undersampling ===
class
web-browsing       3045
video              3045
interactive-app    3045
network-storage    3045
mail               3045
Name: count, dtype: int64

=== Primeras filas tras escalado ===
                                       flow_id   packets     bytes  duration  \
0      124.192.86.248:61079-68.31.89.180:53-17 -0.065342 -0.042128 -0.402400   
1   61.139.26.122:41416-113.233.206.8:38191-17 -0.065342 -0.042129 -0.402400   
2     61.139.26.122:1433-218.88.239.56:62217-6 -0.064685 -0.042098 -0.392477   
3  111.173.128.39:28755-61.139.26.115:58433-17 -0.065342 -0.042072 -0.402400   
4   112.50.28.154:27912-61.139.26.115:59957-17 -0.065342 -0.042072 -0.402400   

   avg_pkt_size  throughput  syn_count  ack_count  fin_count  rst_count  \
0     -0.496475   -0.036964  -0.673749  -0.056137  -0.609463  -0.076601   
1     -0.504662   -0.036964  -0.673749  -0.056137  -0.609463  -0.076601   
2     -0.553785   -

In [34]:
import pandas as pd
import numpy as np
import os

# Derive ratio, rate and log features from the per-flow metrics and save the
# extended dataset.
import warnings

# === CONFIG ===
input_csv = "Dataset/dataset_balanced_scaled.csv"           # Input dataset
output_csv = "Dataset/dataset_all_features.csv" # New dataset with the additional features

# Load the input dataset
df = pd.read_csv(input_csv)

# Every real flow holds at least one packet, so a packet count below 1 means
# the columns were already standardized. Ratios and logs of z-scores are not
# meaningful (near-zero denominators, log1p of values <= -1 gives NaN/-inf),
# so make the problem visible instead of silently writing such columns.
if (df['packets'] < 1).any():
    warnings.warn(
        f"{input_csv} appears to contain standardized values "
        "(packet counts below 1); derived ratio/log features should be "
        "computed from the raw metrics before scaling.",
        RuntimeWarning,
    )

# --- NEW FEATURES ---

# TCP flag ratios
df['syn_ratio'] = df['syn_count'] / df['packets']
df['ack_ratio'] = df['ack_count'] / df['packets']
df['fin_ratio'] = df['fin_count'] / df['packets']
df['rst_ratio'] = df['rst_count'] / df['packets']
df['control_flag_ratio'] = (df['syn_count'] + df['fin_count'] + df['rst_count']) / df['packets']

# Throughput ratios
df['bytes_per_packet'] = df['bytes'] / df['packets']
df['throughput_per_avg_pkt'] = df['throughput'] / (df['avg_pkt_size'] + 1e-6)

# Packets and bytes per second (the small constant avoids division by zero
# for flows with zero duration)
df['pkt_per_sec'] = df['packets'] / (df['duration'] + 1e-6)
df['bytes_per_sec'] = df['bytes'] / (df['duration'] + 1e-6)

# Log transforms to reduce skew
df['log_bytes'] = np.log1p(df['bytes'])
df['log_throughput'] = np.log1p(df['throughput'])
df['log_avg_pkt_size'] = np.log1p(df['avg_pkt_size'])

# Optional: feature interactions
df['pkt_bytes_ratio'] = df['packets'] / (df['bytes'] + 1e-6)
df['small_pkt_ratio_per_pkt'] = df['small_packet_ratio'] / (df['packets'] + 1e-6)

# Save the new CSV (create the directory only when the path has one;
# os.makedirs("") would raise).
out_dir = os.path.dirname(output_csv)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)
df.to_csv(output_csv, index=False)

print(f"✅ Nuevo dataset guardado en {output_csv}")
print("Primeras filas del dataset:")
print(df.head())


✅ Nuevo dataset guardado en Dataset/dataset_all_features.csv
Primeras filas del dataset:
                                       flow_id   packets     bytes  duration  \
0      124.192.86.248:61079-68.31.89.180:53-17 -0.065342 -0.042128 -0.402400   
1   61.139.26.122:41416-113.233.206.8:38191-17 -0.065342 -0.042129 -0.402400   
2     61.139.26.122:1433-218.88.239.56:62217-6 -0.064685 -0.042098 -0.392477   
3  111.173.128.39:28755-61.139.26.115:58433-17 -0.065342 -0.042072 -0.402400   
4   112.50.28.154:27912-61.139.26.115:59957-17 -0.065342 -0.042072 -0.402400   

   avg_pkt_size  throughput  syn_count  ack_count  fin_count  rst_count  ...  \
0     -0.496475   -0.036964  -0.673749  -0.056137  -0.609463  -0.076601  ...   
1     -0.504662   -0.036964  -0.673749  -0.056137  -0.609463  -0.076601  ...   
2     -0.553785   -0.036953  -0.673749  -0.054309  -0.609463   0.327434  ...   
3     -0.132143   -0.036958  -0.673749  -0.056137  -0.609463  -0.076601  ...   
4     -0.132143   -0.036958  -