In [None]:
!sudo apt-get update
!sudo apt-get install -y curl apt-transport-https ssl-cert ca-certificates gnupg lsb-release
!curl -1sLf 'https://dl.cloudsmith.io/public/wand/libwandio/cfg/setup/bash.deb.sh' | sudo -E bash
!echo "deb https://pkg.caida.org/os/$(lsb_release -si|awk '{print tolower($0)}') $(lsb_release -sc) main" | sudo tee /etc/apt/sources.list.d/caida.list
!sudo wget -O /etc/apt/trusted.gpg.d/caida.gpg https://pkg.caida.org/os/ubuntu/keyring.gpg
!sudo apt update; sudo apt-get install bgpstream

In [None]:
!pip install pybgpstream
!python3 -m pip install pybgpkit-parser
!python3 -m pip install pybgpkit
!pip install neo4j
!pip install pycountry

In [4]:
from google.colab import drive
drive.mount('/content/drive')
%cd drive/MyDrive/LLM4BGP

Mounted at /content/drive
/content/drive/MyDrive/LLM4BGP


In [93]:
from datetime import datetime, timedelta
import pybgpstream
from IPython.display import Markdown
import random
import random, json, bz2, urllib.request
from collections import defaultdict
from typing import Dict, List, Tuple
import json
import time
from pathlib import Path
import gzip
import requests
import pandas as pd

In [163]:
def collect_paths(asn, limit, project, collectors, record_type, hours,
                  from_time="2025-08-03 00:00:00",
                  until_time="2025-08-03 00:10:00 UTC"):
    """Collect up to ``limit`` distinct AS-path strings that match ``asn``.

    One BGPStream is opened per collector, in the order given. Paths are
    de-duplicated across all collectors of this call, and collection stops as
    soon as ``limit`` distinct paths have been gathered.

    Args:
        asn: AS number interpolated into the stream filter ``aspath _<asn>_``.
        limit: Maximum number of distinct paths to return. Values <= 0 yield
            an empty list without opening any stream.
        project: BGPStream project name (e.g. ``"routeviews"``).
        collectors: Iterable of collector names to query one after another.
        record_type: BGPStream record type (e.g. ``"updates"``).
        hours: Only used when ``from_time`` or ``until_time`` is ``None``: the
            window then ends now (UTC) and starts ``5 * hours`` hours earlier
            (the multiplier is kept from the original implementation).
        from_time: Start of the time window. Defaults to the fixed window the
            notebook was run with, so results are reproducible.
        until_time: End of the time window (same remark as ``from_time``).

    Returns:
        List of AS-path strings in first-seen order, at most ``limit`` long.
    """
    if limit <= 0:
        # Without this guard the first matching path would still be returned.
        return []

    if from_time is None or until_time is None:
        # Local import: the notebook's import cell only brings in datetime/timedelta.
        from datetime import timezone
        window_end = datetime.now(timezone.utc)
        window_start = window_end - timedelta(hours=5 * hours)
        if from_time is None:
            from_time = window_start.strftime("%Y-%m-%d %H:%M:%S UTC")
        if until_time is None:
            until_time = window_end.strftime("%Y-%m-%d %H:%M:%S UTC")

    seen = set()
    out = []
    for collector in collectors:
        stream = pybgpstream.BGPStream(
            project=project,
            collectors=[collector],
            record_type=record_type,
            from_time=from_time,
            until_time=until_time,
            filter=f"aspath _{asn}_",
        )
        for elem in stream:
            path = elem.fields.get("as-path")
            # Skip elements without an AS path and paths already collected.
            if not path or path in seen:
                continue
            seen.add(path)
            out.append(path)
            if len(out) >= limit:
                return out
    return out

In [164]:
def get_paths(target_asns=(174, 3356, 6810, 8551, 9121, 176, 4657),
              collectors=("route-views2", "route-views.eqix", "route-views.linx", "route-views.sg"),
              limit=15,
              updates_hours=3):
    """Collect AS paths from RouteViews update streams for several target ASNs.

    Calling ``get_paths()`` with no arguments uses the same ASNs, collectors
    and per-ASN limit as before.

    Args:
        target_asns: ASNs to query; ``collect_paths`` is called once per ASN.
        collectors: RouteViews collector names passed to ``collect_paths``.
        limit: Maximum number of paths requested per ASN.
        updates_hours: Forwarded to ``collect_paths`` as its ``hours`` argument.

    Returns:
        List of distinct AS-path strings in first-seen order. A path that
        traverses more than one target ASN is kept only once, so the dataset
        built from this list contains no duplicate samples.
    """
    paths = []
    seen = set()
    for asn in target_asns:
        asn_paths = collect_paths(asn, limit, "routeviews", collectors, "updates", updates_hours)
        for path in asn_paths:
            if path not in seen:
                seen.add(path)
                paths.append(path)
    return paths

In [165]:
def load_relationships(rel_file: str) -> Dict[Tuple[int, int], int]:
    """Parse a CAIDA AS-relationship text file into a direction-sensitive map.

    Each data line is ``<as1>|<as2>|<rel>`` optionally followed by further
    ``|``-separated fields (e.g. the source column of the as-rel2 format);
    only the first three fields are used. Lines starting with ``#`` and blank
    lines are ignored.

    Args:
        rel_file: Path to the UTF-8 encoded relationship file.

    Returns:
        Dict keyed by ``(a, b)``. For a line with ``rel == -1`` (``a`` is the
        provider of ``b``) it holds ``(a, b) -> -1`` and ``(b, a) -> +1``.
        Any other ``rel`` value is treated as peering and stored as ``0`` in
        both directions.

    Raises:
        ValueError: If a data line has fewer than three fields or a field is
            not an integer.
    """
    rel = {}
    with open(rel_file, "r", encoding="utf-8") as fh:
        for line_no, raw_line in enumerate(fh, start=1):
            line = raw_line.strip()
            if not line or line.startswith('#'):
                continue
            fields = line.split('|')
            if len(fields) < 3:
                raise ValueError(
                    f"{rel_file}:{line_no}: expected at least 3 '|'-separated fields, got {line!r}"
                )
            a, b, r = int(fields[0]), int(fields[1]), int(fields[2])
            if r == -1:                        # provider → customer
                rel[(a, b)], rel[(b, a)] = -1, +1
            else:                              # peer
                rel[(a, b)] = rel[(b, a)] = 0
    return rel

In [174]:
def is_valley_free(path, rel_map):
    """Check whether an AS path obeys the valley-free rule.

    A valley-free path is a run of uphill (customer → provider) links,
    followed by at most one peer link, followed by a run of downhill
    (provider → customer) links. Any of the three parts may be empty.

    Args:
        path: Sequence of AS numbers.
        rel_map: Dict mapping ``(a, b)`` to ``+1`` (``a`` is a customer of
            ``b``), ``0`` (peers) or ``-1`` (``a`` is a provider of ``b``),
            as produced by ``load_relationships``.

    Returns:
        ``True`` if no known link violates the rule, ``False`` otherwise.
        Links missing from ``rel_map`` are skipped, so a path with only
        unknown links (or fewer than two ASes) counts as valley-free.
    """
    UPHILL, FLAT, DOWNHILL = 0, 1, 2
    state = UPHILL
    for a, b in zip(path, path[1:]):
        rel = rel_map.get((a, b))
        if rel is None:
            # Unknown relationship: no evidence either way, ignore the link.
            continue
        if state == UPHILL:
            if rel == -1:
                state = DOWNHILL
            elif rel == 0:
                state = FLAT
            # rel == 1 keeps climbing.
        elif state == FLAT:
            # After the single allowed peer link only downhill links may
            # follow; a second peer link or an uphill link is a violation.
            if rel != -1:
                return False
            state = DOWNHILL
        else:  # DOWNHILL
            if rel != -1:
                return False
    return True

In [175]:
def generate_valley_free_dataset(paths, rel_map, total_paths=50):
    """Label up to ``total_paths`` randomly chosen AS paths as valley-free or not.

    Args:
        paths: List of AS-path strings (space-separated AS numbers). The list
            is not modified; a shuffled copy is used.
        rel_map: Relationship map passed to ``is_valley_free``.
        total_paths: Maximum number of labeled paths to produce.

    Returns:
        List of ``(path_str, is_valley_free)`` tuples. Paths containing a token
        that is not a plain integer (e.g. an AS_SET such as ``{1,2}``) are
        skipped and do not count towards ``total_paths``.
    """
    labeled_paths = []
    # Shuffle a copy so the caller's list keeps its order.
    shuffled = list(paths)
    random.shuffle(shuffled)
    count_vf = count_non_vf = skipped = 0

    for path_str in shuffled:
        if len(labeled_paths) >= total_paths:
            break
        try:
            path = [int(asn) for asn in path_str.split()]
        except ValueError:
            # Non-numeric token (such as an AS_SET): cannot be looked up in rel_map.
            skipped += 1
            continue
        vf = is_valley_free(path, rel_map)
        labeled_paths.append((path_str, vf))
        if vf:
            count_vf += 1
        else:
            count_non_vf += 1

    print(f"Generated {len(labeled_paths)} paths: {count_vf} Valley-Free, {count_non_vf} Non-Valley-Free")
    if skipped:
        print(f"Skipped {skipped} paths with non-numeric AS tokens")
    return labeled_paths

In [180]:
def filter_valley_free_only(labeled_paths):
    """Return the paths of all ``(path, is_valley_free)`` pairs whose flag is truthy.

    The input order is preserved and the flags are dropped from the result.
    """
    kept = []
    for entry in labeled_paths:
        candidate, flag = entry
        if flag:
            kept.append(candidate)
    return kept

In [183]:
def build_reverse_rel_map(rel_map):
    """Group a pairwise relationship map into a per-AS adjacency list.

    For every ``(a, b) -> rel`` entry of ``rel_map`` the tuple ``(b, rel)`` is
    appended to the list stored under ``a``. The result is a
    ``defaultdict(list)``.
    """
    adjacency = defaultdict(list)
    for pair, relation in rel_map.items():
        source_as, target_as = pair
        adjacency[source_as].append((target_as, relation))
    return adjacency

def build_non_valley_free_path(neighbor_map, max_len=5):
    """Build one synthetic AS path that violates the valley-free rule.

    Performs a random walk over ``neighbor_map`` while tracking the
    valley-free state (0 = uphill, 1 = flat, 2 = downhill). Every hop moves to
    a neighbor of the *current* AS, so consecutive ASes in the result are
    always adjacent in ``neighbor_map``. As soon as the current AS has a
    neighbor whose link breaks the rule (an uphill link after a peer link, or
    anything but a downhill link after a downhill link), that neighbor is
    appended and the path is returned. The violating link is therefore always
    the last one.

    Args:
        neighbor_map: Mapping ``asn -> [(neighbor_asn, rel), ...]`` where
            ``rel`` is ``+1`` (neighbor is a provider), ``0`` (peer) or ``-1``
            (neighbor is a customer), as built by ``build_reverse_rel_map``.
            It is not modified.
        max_len: Maximum number of ASes in the returned path.

    Returns:
        A list of AS numbers with 3 to ``max_len`` entries and no repeated AS,
        or ``None`` if no violating path was found within 100 attempts (always
        the case for an empty map or ``max_len < 3``).
    """
    # A violation needs two links, i.e. at least three ASes.
    if max_len < 3 or not neighbor_map:
        return None

    start_candidates = list(neighbor_map)  # built once, not once per attempt
    for _attempt in range(100):
        current = random.choice(start_candidates)
        path = [current]
        state = 0  # 0 = uphill, 1 = flat, 2 = downhill
        while len(path) < max_len:
            # Never revisit an AS: real AS paths are loop-free.
            options = [
                (nbr, rel)
                for nbr, rel in neighbor_map.get(current, ())
                if nbr not in path
            ]
            violating = [
                nbr
                for nbr, rel in options
                if (state == 1 and rel == 1) or (state == 2 and rel != -1)
            ]
            if violating:
                path.append(random.choice(violating))
                return path

            # No violation reachable from here: take a rule-conforming hop.
            if state == 0:
                valid = options
            else:
                valid = [(nbr, rel) for nbr, rel in options if rel == -1]
            if not valid:
                break  # dead end, start a new attempt
            nbr, rel = random.choice(valid)
            if rel == 0:
                state = 1
            elif rel == -1:
                state = 2
            path.append(nbr)
            current = nbr
    return None

def generate_synthetic_non_valley_free_paths(rel_map, total_paths=50, max_attempts=None):
    """Generate distinct synthetic AS paths that are not valley-free.

    Candidate paths come from ``build_non_valley_free_path`` and are kept only
    if ``is_valley_free`` confirms the violation and the path has not been
    produced before.

    Args:
        rel_map: Relationship map as produced by ``load_relationships``.
        total_paths: Number of paths requested.
        max_attempts: Upper bound on candidate generations. Defaults to
            ``1000 * total_paths``. The bound keeps the function from looping
            forever when the graph cannot yield enough distinct paths.

    Returns:
        List of space-separated AS-path strings. It may be shorter than
        ``total_paths`` if the attempt budget runs out (a message is printed),
        and is empty for ``total_paths <= 0`` or an empty ``rel_map``.
    """
    if total_paths <= 0:
        return []
    neighbor_map = build_reverse_rel_map(rel_map)
    if not neighbor_map:
        return []
    if max_attempts is None:
        max_attempts = 1000 * total_paths

    paths = []
    seen = set()
    for _ in range(max_attempts):
        if len(paths) >= total_paths:
            break
        p = build_non_valley_free_path(neighbor_map)
        if not p or is_valley_free(p, rel_map):
            continue
        path_str = " ".join(map(str, p))
        if path_str in seen:
            continue  # duplicates would skew the dataset
        seen.add(path_str)
        paths.append(path_str)

    if len(paths) < total_paths:
        print(f"Only {len(paths)} of {total_paths} synthetic non-valley-free paths could be generated")
    return paths

In [None]:
def label_paths(vf_paths, non_vf_paths):
    """Wrap paths into ``{"path": ..., "label": ...}`` records.

    Valley-free paths come first with label ``"VF"``, followed by the
    non-valley-free paths with label ``"non-VF"``. Input order is kept within
    each group.
    """
    vf_records = [{"path": entry, "label": "VF"} for entry in vf_paths]
    non_vf_records = [{"path": entry, "label": "non-VF"} for entry in non_vf_paths]
    return vf_records + non_vf_records

In [193]:
def create_vf_q_a_dataset(labeled_dataset, samples_per_label=25, output_path="vf_qas.json"):
    """Write a balanced valley-free question/answer dataset as JSON.

    Args:
        labeled_dataset: List of ``{"path": str, "label": "VF" | "non-VF"}``
            records (see ``label_paths``). Records with other labels are ignored.
        samples_per_label: Number of records to sample from each class.
        output_path: Destination JSON file (str or ``Path``).

    If either class has fewer than ``samples_per_label`` records, the sample
    size is reduced to the smaller class for *both* classes, so the dataset
    stays balanced instead of ``random.sample`` raising ``ValueError``.
    """
    # Split VF and non-VF paths
    vf_paths = [item for item in labeled_dataset if item["label"] == "VF"]
    non_vf_paths = [item for item in labeled_dataset if item["label"] == "non-VF"]

    # Clamp to what is available so both classes contribute equally.
    per_label = min(samples_per_label, len(vf_paths), len(non_vf_paths))
    if per_label < samples_per_label:
        print(
            f"Warning: requested {samples_per_label} paths per label but only "
            f"{len(vf_paths)} VF / {len(non_vf_paths)} non-VF available; using {per_label} per label"
        )

    sampled_vf = random.sample(vf_paths, per_label)
    sampled_non_vf = random.sample(non_vf_paths, per_label)
    combined = sampled_vf + sampled_non_vf
    random.shuffle(combined)

    # Generate Q&A
    vf_qas = []
    for item in combined:
        path = item["path"]
        label = item["label"]
        question = f"Is the following path valley-free: {path}?"
        answer = (
            f"The path {path} is valid and valley free"
            if label == "VF"
            else f"The path {path} is invalid and not a valley free path"
        )
        vf_qas.append({"question": question, "answer": answer})

    # Save to file
    vf_qas_file = Path(output_path)
    with vf_qas_file.open("w", encoding="utf-8") as f:
        json.dump(vf_qas, f, indent=2)

    print(f"Generated {len(vf_qas)} valley-free Q&A pairs → {vf_qas_file}")

In [179]:
def create_vf_inference_q_a_dataset(dataset_size, as_rel_data_path):
    """Run the full pipeline that produces the valley-free Q&A file.

    Steps, in order: load the relationship map from ``as_rel_data_path``,
    collect observed paths, label up to ``dataset_size`` of them, keep the
    valley-free ones, generate ``dataset_size`` synthetic non-valley-free
    paths, merge both groups into labeled records and hand them to
    ``create_vf_q_a_dataset``.
    """
    relationships = load_relationships(as_rel_data_path)
    observed_paths = get_paths()
    labeled_observed = generate_valley_free_dataset(
        observed_paths, relationships, total_paths=dataset_size
    )

    # Positive class: observed paths that passed the valley-free check.
    positives = filter_valley_free_only(labeled_observed)
    # Negative class: synthetic paths built to violate the rule.
    negatives = generate_synthetic_non_valley_free_paths(
        relationships, total_paths=dataset_size
    )

    records = label_paths(positives, negatives)
    create_vf_q_a_dataset(records)

Generated 50 paths: 46 Valley-Free, 4 Non-Valley-Free
