In [3]:
import scapy.all as scapy
from scapy.layers.inet import IP, TCP
import pandas as pd
import os
from datetime import datetime, timedelta, timezone
from collections import defaultdict

# Global Configuration for your timezone
LOCAL_OFFSET = -3  # GMT-3
LOCAL_TZ = timezone(timedelta(hours=LOCAL_OFFSET))

# Define your experiment window (Local time)
START_TIME = "2025-12-26 14:15:00"
END_TIME   = "2025-12-26 16:25:00"

class ExperimentAnalyzer:
    def __init__(self, directory, start_str, end_str):
        self.directory = directory
        self.start_ts = datetime.strptime(start_str, "%Y-%m-%d %H:%M:%S") \
                            .replace(tzinfo=LOCAL_TZ).timestamp()
        
        self.end_ts = datetime.strptime(end_str, "%Y-%m-%d %H:%M:%S") \
                            .replace(tzinfo=LOCAL_TZ).timestamp()

        print(f"Filtering for UTC Range: {datetime.fromtimestamp(self.start_ts, tz=timezone.utc)}")
        
        # State
        self.all_packet_data = []
        self.metrics = {"resets": 0, "retrans": 0, "zeros": 0}
        self.pending_reqs = {}
        self.flow_sequences = defaultdict(set)

    def run_batch(self):
        # Get all pcap files and sort them chronologically
        files = sorted([f for f in os.listdir(self.directory) if f.endswith(".pcap")])
        
        for filename in files:
            file_path = os.path.join(self.directory, filename)
            
            # Rough check: Parse timestamp from filename (e.g., experiment_capture_20251226_170306.pcap)
            try:
                file_start_str = "_".join(filename.split("_")[2:4]).replace(".pcap", "")
                file_start_ts = datetime.strptime(file_start_str, "%Y%m%d_%H%M%S").timestamp()
                
                # If file starts after our window, we can stop (since files are sorted)
                if file_start_ts > self.end_ts:
                    continue
            except Exception:
                pass # Fallback to processing if filename format differs

            print(f"Processing: {filename}...")
            self._process_file(file_path)

        self._print_final_report()

    def _process_file(self, file_path):
        with scapy.PcapReader(file_path) as reader:
            for pkt in reader:
                if not pkt.haslayer(TCP): continue
                
                # Fine-grained timestamp filter
                ts = float(pkt.time)
                if ts < self.start_ts: continue
                if ts > self.end_ts: break # Optimization: stop reading this file
                
                ip = pkt[IP]
                tcp = pkt[TCP]
                
                # Logic for Kafka Filtering
                if (ip.src == KAFKA_IP and tcp.sport == KAFKA_PORT) or \
                   (ip.dst == KAFKA_IP and tcp.dport == KAFKA_PORT):
                    self._extract_metrics(ip, tcp, ts)

    def _extract_metrics(self, ip, tcp, ts):
        # 1. TCP Health
        if tcp.flags & 0x04: self.metrics["resets"] += 1
        if tcp.window == 0: self.metrics["zeros"] += 1
        
        # 2. Retransmissions
        flow = (ip.src, ip.dst, tcp.sport, tcp.dport)
        if len(tcp.payload) > 0:
            if tcp.seq in self.flow_sequences[flow]:
                self.metrics["retrans"] += 1
            else:
                self.flow_sequences[flow].add(tcp.seq)

        # 3. Kafka Correlation ID Matching
        payload = bytes(tcp.payload)
        if len(payload) > 12:
            try:
                if tcp.dport == KAFKA_PORT: # Request
                    cid = int.from_bytes(payload[8:12], "big")
                    self.pending_reqs[cid] = ts
                elif tcp.sport == KAFKA_PORT: # Response
                    cid = int.from_bytes(payload[4:8], "big")
                    if cid in self.pending_reqs:
                        lat = (ts - self.pending_reqs[cid]) * 1000
                        self.all_packet_data.append({'type': 'kafka_lat', 'val': lat, 'ts': ts})
                        del self.pending_reqs[cid]
            except: pass

        # 4. Jitter (IAT from Broker)
        if ip.src == KAFKA_IP:
            self.all_packet_data.append({'type': 'iat', 'val': ts, 'ts': ts})

    def _print_final_report(self):
        df = pd.DataFrame(self.all_packet_data)
        print("\n" + "="*40)
        print(f"EXPERIMENT SUMMARY ({START_TIME} to {END_TIME})")
        print("="*40)
        print(f"Resets: {self.metrics['resets']} | Retrans: {self.metrics['retrans']} | ZeroWindows: {self.metrics['zeros']}")
        
        # 1. Get Inter-Arrival Times (IAT) in milliseconds
        iat_series = df[df['type'] == 'iat']['val'].diff().dropna() * 1000
        
        # 2. Filter out idle gaps (e.g., any gap > 1000ms is considered 'app idleness', not network jitter)
        # This prevents the 160s jitter you saw.
        network_iats = iat_series[iat_series < 1000]
        
        if not network_iats.empty:
            # 3. Calculate Average Jitter (Mean Absolute Deviation of successive IATs)
            # This is closer to how Wireshark calculates it.
            jitter = np.abs(network_iats.diff()).mean()
        else:
            jitter = 0

        print(f"Refined Network Jitter: {jitter:.4f} ms")

        # Kafka Latency Stats
        k_lat = df[df['type'] == 'kafka_lat']['val']
        if not k_lat.empty:
            print(f"Kafka Latency: Avg={k_lat.mean():.2f}ms, P95={k_lat.quantile(0.95):.2f}ms")

In [None]:
# Global Config
KAFKA_IP = "167.71.21.92"
KAFKA_PORT = 32289
DATA_DIR = "../../data/raw/experiment08/tcp_dump_cloud"

analyzer = ExperimentAnalyzer(DATA_DIR, START_TIME, END_TIME)
analyzer.run_batch()

In [10]:
# Global Config
KAFKA_IP = "172.16.208.242"
KAFKA_PORT = 31289
DATA_DIR = "../../data/raw/experiment08/tcp_dump_edge"

analyzer = ExperimentAnalyzer(DATA_DIR, START_TIME, END_TIME)
analyzer.run_batch()

Filtering for UTC Range: 2025-12-26 17:15:00+00:00
Processing: experiment_capture_20251226_133302.pcap...
Processing: experiment_capture_20251226_170234.pcap...
Processing: experiment_capture_20251226_171937.pcap...
Processing: experiment_capture_20251226_173550.pcap...
Processing: experiment_capture_20251226_175413.pcap...
Processing: experiment_capture_20251226_180427.pcap...
Processing: experiment_capture_20251226_182012.pcap...
Processing: experiment_capture_20251226_183534.pcap...
Processing: experiment_capture_20251226_185119.pcap...

EXPERIMENT SUMMARY (2025-12-26 14:15:00 to 2025-12-26 16:25:00)
Resets: 0 | Retrans: 0 | ZeroWindows: 0
Refined Network Jitter: 46.4070 ms
Kafka Latency: Avg=62.34ms, P95=93.85ms


In [None]:
class PcapToParquetExporter:
    def __init__(self, directory):
        self.directory = directory
        self.records = []
        self.pending_reqs = {} # corr_id -> timestamp
        self.syn_times = {}    # flow -> timestamp

    def process(self):
        files = sorted([f for f in os.listdir(self.directory) if f.endswith(".pcap")])
        
        for filename in files:
            path = os.path.join(self.directory, filename)
            print(f"Exporting: {filename}")
            
            with scapy.PcapReader(path) as reader:
                for pkt in reader:
                    if not pkt.haslayer(TCP) or not pkt.haslayer(IP):
                        continue
                        
                    ip, tcp = pkt[IP], pkt[TCP]
                    ts = float(pkt.time)
                    
                    # Filter for Kafka
                    is_to_kafka = (ip.dst == KAFKA_IP and tcp.dport == KAFKA_PORT)
                    is_from_kafka = (ip.src == KAFKA_IP and tcp.sport == KAFKA_PORT)
                    
                    if not (is_to_kafka or is_from_kafka):
                        continue

                    record = {
                        "timestamp": ts,
                        "src_ip": ip.src,
                        "dst_ip": ip.dst,
                        "actual_msg_count": 0,
                        "tcp_flags": int(tcp.flags),
                        "window_size": tcp.window,
                        "payload_len": len(tcp.payload),
                        "rtt_ms": None,
                        "kafka_lat_ms": None,
                        "is_retransmission": 0
                    }

                    # --- RTT Calculation (Network) ---
                    if tcp.flags & 0x02: # SYN
                        self.syn_times[(ip.src, tcp.sport)] = ts
                    elif tcp.flags & 0x12: # SYN-ACK
                        key = (ip.dst, tcp.dport)
                        if key in self.syn_times:
                            record["rtt_ms"] = (ts - self.syn_times[key]) * 1000
                            del self.syn_times[key]

                    # --- Kafka Latency Calculation (App) ---
                    payload = bytes(tcp.payload)
                    if len(payload) > 12:
                        try:
                            if is_to_kafka:
                                corr_id = int.from_bytes(payload[8:12], "big")
                                self.pending_reqs[corr_id] = ts
                            elif is_from_kafka:
                                corr_id = int.from_bytes(payload[4:8], "big")
                                if corr_id in self.pending_reqs:
                                    record["kafka_lat_ms"] = (ts - self.pending_reqs[corr_id]) * 1000
                                    del self.pending_reqs[corr_id]
                        except: pass

                    self.records.append(record)

        # Convert and Save
        df = pd.DataFrame(self.records)
        if not df.empty:
            # 2. Sort by timestamp to ensure the rolling window is accurate
            df = df.sort_values("timestamp")
            
            # 4. Save to Parquet
            df.to_parquet(OUTPUT_FILE, compression='snappy')
            print(f"Successfully saved {len(df)} records with throughput metrics to {OUTPUT_FILE}")

In [6]:
OUTPUT_FILE = "analyzed_network_metrics_cloud.parquet"
KAFKA_IP = "167.71.21.92"
KAFKA_PORT = 32289
DATA_DIR = "../../data/raw/experiment08/tcp_dump_cloud"
exporter = PcapToParquetExporter(DATA_DIR)
exporter.process()

Exporting: experiment_capture_20251226_170306.pcap
Exporting: experiment_capture_20251226_171043.pcap
Exporting: experiment_capture_20251226_172005.pcap
Exporting: experiment_capture_20251226_173533.pcap
Exporting: experiment_capture_20251226_175124.pcap
Exporting: experiment_capture_20251226_180530.pcap
Exporting: experiment_capture_20251226_181957.pcap
Exporting: experiment_capture_20251226_183605.pcap
Exporting: experiment_capture_20251226_185436.pcap
Successfully saved 304 records with throughput metrics to analyzed_network_metrics_cloud.parquet


In [7]:
OUTPUT_FILE = "analyzed_network_metrics_edge.parquet"
KAFKA_IP = "172.16.208.242"
KAFKA_PORT = 31289
DATA_DIR = "../../data/raw/experiment08/tcp_dump_edge"
exporter = PcapToParquetExporter(DATA_DIR)
exporter.process()

Exporting: experiment_capture_20251226_133302.pcap
Exporting: experiment_capture_20251226_170234.pcap
Exporting: experiment_capture_20251226_171937.pcap
Exporting: experiment_capture_20251226_173550.pcap
Exporting: experiment_capture_20251226_175413.pcap
Exporting: experiment_capture_20251226_180427.pcap
Exporting: experiment_capture_20251226_182012.pcap
Exporting: experiment_capture_20251226_183534.pcap
Exporting: experiment_capture_20251226_185119.pcap
Successfully saved 269 records with throughput metrics to analyzed_network_metrics_edge.parquet


In [14]:
df_cloud = pd.read_parquet("analyzed_network_metrics_cloud.parquet")

In [None]:
df_cloud

Unnamed: 0,timestamp,src_ip,dst_ip,actual_msg_count,tcp_flags,window_size,payload_len,rtt_ms,kafka_lat_ms,is_retransmission
0,1766769000.0,167.71.21.92,172.17.0.5,0,18,65535,0,,,0
1,1766769000.0,172.17.0.5,167.71.21.92,0,16,502,0,0.040054,,0
2,1766769000.0,172.17.0.5,167.71.21.92,0,24,502,66,,,0
3,1766769000.0,167.71.21.92,172.17.0.5,0,16,32768,0,,,0
4,1766769000.0,167.71.21.92,172.17.0.5,0,24,32768,460,,143.31007,0


In [17]:
df_edge = pd.read_parquet("analyzed_network_metrics_edge.parquet")

In [19]:
df_edge.head()

Unnamed: 0,timestamp,src_ip,dst_ip,actual_msg_count,tcp_flags,window_size,payload_len,rtt_ms,kafka_lat_ms,is_retransmission
0,1766756000.0,172.16.208.242,172.17.0.4,0,24,32768,1398,,,0
1,1766756000.0,172.17.0.4,172.16.208.242,0,16,589,0,,,0
2,1766756000.0,172.16.208.242,172.17.0.4,0,24,32768,699,,,0
3,1766756000.0,172.17.0.4,172.16.208.242,0,16,611,0,,,0
4,1766756000.0,172.17.0.4,172.16.208.242,0,17,611,0,,,0
