In [11]:
import os
import re
from typing import Union, Iterable
import pandas as pd

def parse_traceroute_log_file(file_path: str) -> pd.DataFrame:
    """
    Read a traceroute log and return its header lines as a DataFrame.

    A line counts as a header when it matches ``HEADER_RE``; the expected layout is:
      vp_name, country_code, src_ip, (lat, lon), user_id, trace_src, trace_dst, status

    Blank lines and lines that do not match (hop lines, etc.) are skipped.
    ``lat`` and ``lon`` are converted to float, ``user_id`` to int, and
    ``status`` is lower-cased.

    Parameters
    ----------
    file_path : str
        Path of the log file; it is opened as UTF-8 text.

    Returns
    -------
    pd.DataFrame
        One row per matched header line, with the columns vp_name, country,
        src_ip, lat, lon, user_id, trace_src, trace_dst, status (in that order).
    """
    column_order = [
        "vp_name", "country", "src_ip", "lat", "lon",
        "user_id", "trace_src", "trace_dst", "status",
    ]
    records = []

    with open(file_path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            text = raw_line.strip()
            # Blank lines are never handed to the regex.
            header = HEADER_RE.match(text) if text else None
            if header is None:
                continue

            fields = header.groupdict()
            # Captured groups are strings; coerce the typed ones in place.
            fields.update(
                lat=float(fields["lat"]),
                lon=float(fields["lon"]),
                user_id=int(fields["user_id"]),
                status=fields["status"].lower(),
            )
            records.append(fields)

    return pd.DataFrame.from_records(records, columns=column_order)

# Log to analyze; the "./" path is resolved against the kernel's working directory.
log_file = "./run1_trunc.log"
# Parse the log's header lines into a DataFrame (one row per matched header).
df = parse_traceroute_log_file(log_file)

In [10]:
# Display the parsed header rows (the cell's last expression is rendered as its output).
df

Unnamed: 0,user_id,status
0,40,error
1,41,error
2,42,error
3,43,error
4,44,error
...,...,...
955,113,completed
956,156,completed
957,353,completed
958,98,gaplimit


In [5]:
# Sanity check: how many user_ids do NOT have exactly two rows
# (counted via non-null vp_name values per user_id)?
uid_counts = df.groupby("user_id").count()
not_exactly_two = uid_counts["vp_name"] != 2
len(uid_counts.loc[not_exactly_two])

0

In [12]:
# Breakdown of successful traceroutes
# Work on a copy restricted to the three columns this breakdown needs.
df_ = df[["user_id", "country", "status"]].copy()


def _as_status_pair(status_values):
    """Return the statuses as a sorted tuple; a lone status is padded with None."""
    ordered = tuple(sorted(status_values))
    return (ordered[0], None) if len(ordered) == 1 else ordered


# One record per (user_id, country) group, carrying that group's sorted statuses.
group_counts = [
    {
        "user_id": uid,
        "country": cc,
        "status_pair": _as_status_pair(grp["status"].tolist()),
    }
    for (uid, cc), grp in df_.groupby(["user_id", "country"])
]

out = pd.DataFrame(group_counts)

# Count how many (user_id, country) groups fall into each (country, status_pair).
pair_counts = out.value_counts(["country", "status_pair"])
breakdown = (
    pair_counts
    .rename_axis(["country", "status_pair"])
    .reset_index(name="count")
)

In [14]:
# Show the breakdown ordered by status_pair so identical status combinations sit together.
breakdown.sort_values(by='status_pair')

Unnamed: 0,country,status_pair,count
1,BR,"(completed, completed)",44
2,ES,"(completed, completed)",39
3,IN,"(completed, completed)",37
10,CA,"(completed, completed)",20
15,US,"(completed, completed)",14
12,AU,"(completed, completed)",19
8,AU,"(completed, error)",24
0,US,"(completed, gaplimit)",46
5,IN,"(completed, gaplimit)",35
6,ES,"(completed, gaplimit)",35


In [None]:
# Analyze completed pairs

