In [1]:
%%capture --no-stderr
# Install/upgrade the Python packages this notebook imports. Stdout is captured
# to keep the cell quiet; --no-stderr leaves pip errors and warnings visible.
%pip install -U langgraph langchain langchain-community langchain-openai openevals langsmith openai tiktoken pycountry

In [None]:
!sudo apt-get update
!sudo apt-get install -y curl apt-transport-https ssl-cert ca-certificates gnupg lsb-release
!curl -1sLf 'https://dl.cloudsmith.io/public/wand/libwandio/cfg/setup/bash.deb.sh' | sudo -E bash
!echo "deb https://pkg.caida.org/os/$(lsb_release -si|awk '{print tolower($0)}') $(lsb_release -sc) main" | sudo tee /etc/apt/sources.list.d/caida.list
!sudo wget -O /etc/apt/trusted.gpg.d/caida.gpg https://pkg.caida.org/os/ubuntu/keyring.gpg
!sudo apt update; sudo apt-get install bgpstream

In [None]:
!pip install pybgpstream
!python3 -m pip install pybgpkit-parser
!python3 -m pip install pybgpkit
!pip install neo4j
!pip install pycountry

In [4]:
from google.colab import drive
drive.mount('/content/drive')
%cd drive/MyDrive/LLM4BGP

Mounted at /content/drive
/content/drive/MyDrive/LLM4BGP


In [5]:
import getpass
import os
import json
import ast
import re
import yaml
import subprocess
from langchain_openai import ChatOpenAI
from pathlib import Path

In [6]:
# Prompt for the key with getpass so it is not echoed or saved in the notebook,
# then expose it to the OpenAI client through the environment.
os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

Enter your OpenAI API key: ··········


In [7]:
from langsmith import wrappers
from langsmith import Client
from evaluations.lang_smith_aux import *

In [8]:
# Prompt for the LangSmith key without echoing it; it is read back from the
# environment when the LangSmith Client is constructed.
os.environ["LANGSMITH_API_KEY"] = getpass.getpass("Enter your LangSmith API key: ")

Enter your LangSmith API key: ··········


In [9]:
# LangSmith settings are passed through environment variables.
os.environ.update(
    {
        'LANGSMITH_TRACING': 'true',
        'LANGSMITH_PROJECT': 'test',
        'LANGSMITH_ENDPOINT': 'https://api.smith.langchain.com',
    }
)

# Client used below to look up/create datasets and to run the evaluations.
client = Client(api_key=os.environ['LANGSMITH_API_KEY'])

In [10]:
# Chat model shared by every experiment in this notebook.
# NOTE(review): no temperature is set here, yet the recorded experiment names end
# in "temp-0.0" — presumably applied inside eval_prompt_engineering_on_openai; confirm.
model = ChatOpenAI(model_name="gpt-4.1")

**Datasets Update**

In [11]:
from datasets.datasets_preparation import *

In [None]:
# Number of Q&A pairs requested from each dataset generator below.
NUM_GENERATED_SAMPLES = 50
# Per the recorded output, writes the AS rank and AS cone Q&A files under datasets/caida/.
prepare_caida_datasets(NUM_GENERATED_SAMPLES)
# Per the recorded output, writes datasets/bogons/ipv4_bogons_qas.json.
create_ipv4_bogons_qas_dataset(NUM_GENERATED_SAMPLES)

Generated 50 rank Q&A pairs → datasets/caida/as_rank_qas.json, 2025-08-05 08:53:28.434997
Generated 50 cone Q&A pairs → datasets/caida/as_cone_qas.json, 2025-08-05 08:53:28.435148
Generated 50 IPv4 bogons Q&A pairs → datasets/bogons/ipv4_bogons_qas.json, 2025-08-05 08:53:29.658678


**Task 1 - AS Customer Cone Size**

In [12]:
# Prompt definitions for the AS customer cone task, relative to the project root.
# "knowledge_retreival" (sic) is kept verbatim because it is a runtime path.
PROMPT_FILE = Path("prompts") / "knowledge_retreival" / "as_cone_size_prompts.yaml"

In [13]:
# Parse the YAML prompt file; the result is indexed by prompt name in the cells below.
prompts = yaml.safe_load(PROMPT_FILE.read_text(encoding="utf-8"))

In [14]:
# LangSmith dataset name used to look up/create the dataset and to target every
# AS customer cone experiment in this section.
as_customer_cone_dataset_name = "AS Customer Cone Size Q&A dataset"

In [15]:
# Look the dataset up by name (filtered server-side) instead of scanning every
# dataset in the workspace; the explicit name check keeps the match exact.
as_customer_cone_dataset = next(
    (
        candidate
        for candidate in client.list_datasets(dataset_name=as_customer_cone_dataset_name)
        if candidate.name == as_customer_cone_dataset_name
    ),
    None,
)

if as_customer_cone_dataset is None:
    print(f"Dataset '{as_customer_cone_dataset_name}' not found.")
    # Read and convert the local Q&A file BEFORE creating the remote dataset.
    # If the file is missing or malformed, nothing is created; otherwise an empty
    # dataset would be left behind, found on the next run, and evaluated with
    # zero examples.
    as_cone_eval_dataset_path = 'datasets/caida/as_cone_qas.json'
    with open(as_cone_eval_dataset_path, 'r', encoding='utf-8') as f:
        asn_cone_q_a = json.load(f)
    examples = convert_to_examples(asn_cone_q_a)

    as_customer_cone_dataset = client.create_dataset(
        dataset_name=as_customer_cone_dataset_name,
        description="AS Customer Cone Size Q&A dataset in LangSmith.",
    )
    client.create_examples(dataset_id=as_customer_cone_dataset.id, examples=examples)
    print(f"Dataset '{as_customer_cone_dataset_name}' created.")

Direct Task Prompting (Zero-Shot)

In [16]:
# Zero-shot prompt entry from the loaded AS cone prompt file.
as_cone_zs_prompt = prompts["as_cone_zs_prompt"]

In [17]:
# Zero-shot run on the AS customer cone dataset; the recorded output shows the
# LangSmith results link for an experiment name starting with this prefix.
experiment = "zero-shot-prompting-as-customer-cone-size"
eval_prompt_engineering_on_openai(
    client,
    model,
    as_cone_zs_prompt,
    " ",  # NOTE(review): single-space string; its meaning is defined by the helper — confirm
    as_customer_cone_dataset_name,
    experiment,
    [],  # the bogon runs pass `summary_evaluators` in this slot; none here
)

View the evaluation results for experiment: 'zero-shot-prompting-as-customer-cone-size-temp-0.0-10478b65' at:
https://smith.langchain.com/o/d3cb8df4-aea8-404a-b7f7-3c7ee52670a1/datasets/e0e665a7-c8f7-4598-b84a-64a99f28f82c/compare?selectedSessions=5e8275f9-e493-4048-89f2-88655f414fda




0it [00:00, ?it/s]

Role-Based Prompting

In [None]:
# Role-based prompt entry from the loaded AS cone prompt file.
as_cone_role_based_prompt = prompts["as_cone_role_based_prompt"]

In [None]:
# Role-based run on the AS customer cone dataset.
experiment = "role-based-prompting-as-customer-cone-size"
eval_prompt_engineering_on_openai(
    client,
    model,
    as_cone_role_based_prompt,
    " ",  # NOTE(review): single-space string; its meaning is defined by the helper — confirm
    as_customer_cone_dataset_name,
    experiment,
    [],  # the bogon runs pass `summary_evaluators` in this slot; none here
)

View the evaluation results for experiment: 'role-based-prompting-as-customer-cone-size-temp-0.0-dea6da80' at:
https://smith.langchain.com/o/d3cb8df4-aea8-404a-b7f7-3c7ee52670a1/datasets/e0e665a7-c8f7-4598-b84a-64a99f28f82c/compare?selectedSessions=8c9c8861-ae3a-41b4-9425-b4582b158a01




0it [00:00, ?it/s]

Multi-Step Reasoning

In [None]:
# Multi-step reasoning prompt entry from the loaded AS cone prompt file.
as_cone_ms_reasoning_prompt = prompts["as_cone_ms_reasoning_prompt"]

In [None]:
# Multi-step reasoning run on the AS customer cone dataset.
experiment = "multi-step-prompting-as-customer-cone-size"
eval_prompt_engineering_on_openai(
    client,
    model,
    as_cone_ms_reasoning_prompt,
    " ",  # NOTE(review): single-space string; its meaning is defined by the helper — confirm
    as_customer_cone_dataset_name,
    experiment,
    [],  # the bogon runs pass `summary_evaluators` in this slot; none here
)

View the evaluation results for experiment: 'multi-step-prompting-as-customer-cone-size-temp-0.0-e55b6618' at:
https://smith.langchain.com/o/d3cb8df4-aea8-404a-b7f7-3c7ee52670a1/datasets/e0e665a7-c8f7-4598-b84a-64a99f28f82c/compare?selectedSessions=e1f8aeef-8802-4a27-9513-029debf90735




0it [00:00, ?it/s]

Few-Shot Learning

In [None]:
# Few-shot prompt entry from the loaded AS cone prompt file.
as_cone_fs_prompt = prompts["as_cone_fs_prompt"]

In [None]:
# Few-shot run on the AS customer cone dataset.
experiment = "few-shots-prompting-as-customer-cone-size"
eval_prompt_engineering_on_openai(
    client,
    model,
    as_cone_fs_prompt,
    " ",  # NOTE(review): single-space string; its meaning is defined by the helper — confirm
    as_customer_cone_dataset_name,
    experiment,
    [],  # the bogon runs pass `summary_evaluators` in this slot; none here
)

View the evaluation results for experiment: 'few-shots-prompting-as-customer-cone-size-temp-0.0-cc13df78' at:
https://smith.langchain.com/o/d3cb8df4-aea8-404a-b7f7-3c7ee52670a1/datasets/e0e665a7-c8f7-4598-b84a-64a99f28f82c/compare?selectedSessions=bc0a4883-8080-4264-b478-4f02165139b8




0it [00:00, ?it/s]

Chain-of-Thought (CoT) Prompting

In [None]:
# Chain-of-thought prompt entry from the loaded AS cone prompt file.
as_cone_cot_prompt = prompts["as_cone_cot_prompt"]

In [None]:
# Chain-of-thought run on the AS customer cone dataset.
experiment = "cot-prompting-as-customer-cone-size"
eval_prompt_engineering_on_openai(
    client,
    model,
    as_cone_cot_prompt,
    " ",  # NOTE(review): single-space string; its meaning is defined by the helper — confirm
    as_customer_cone_dataset_name,
    experiment,
    [],  # the bogon runs pass `summary_evaluators` in this slot; none here
)

View the evaluation results for experiment: 'cot-prompting-as-customer-cone-size-temp-0.0-1478cf24' at:
https://smith.langchain.com/o/d3cb8df4-aea8-404a-b7f7-3c7ee52670a1/datasets/e0e665a7-c8f7-4598-b84a-64a99f28f82c/compare?selectedSessions=586a0eda-6bbc-439c-b570-35d139279442




0it [00:00, ?it/s]

**Task 2 - AS Rank**

In [None]:
# Prompt definitions for the AS rank task, relative to the project root.
# "knowledge_retreival" (sic) is kept verbatim because it is a runtime path.
PROMPT_FILE = Path("prompts") / "knowledge_retreival" / "as_rank_prompts.yaml"

In [None]:
# Replace `prompts` with the AS rank prompt set parsed from the YAML file.
prompts = yaml.safe_load(PROMPT_FILE.read_text(encoding="utf-8"))

In [None]:
# LangSmith dataset name used to look up/create the dataset and to target every
# AS rank experiment in this section.
as_rank_dataset_name = "AS Rank Q&A dataset"

In [None]:
# Look the dataset up by name (filtered server-side) instead of scanning every
# dataset in the workspace; the explicit name check keeps the match exact.
as_rank_dataset = next(
    (
        candidate
        for candidate in client.list_datasets(dataset_name=as_rank_dataset_name)
        if candidate.name == as_rank_dataset_name
    ),
    None,
)

if as_rank_dataset is None:
    print(f"Dataset '{as_rank_dataset_name}' not found.")
    # Read and convert the local Q&A file BEFORE creating the remote dataset.
    # If the file is missing or malformed, nothing is created; otherwise an empty
    # dataset would be left behind, found on the next run, and evaluated with
    # zero examples.
    as_rank_eval_dataset_path = 'datasets/caida/as_rank_qas.json'
    with open(as_rank_eval_dataset_path, 'r', encoding='utf-8') as f:
        asn_rank_q_a = json.load(f)
    examples = convert_to_examples(asn_rank_q_a)

    as_rank_dataset = client.create_dataset(
        dataset_name=as_rank_dataset_name,
        description="AS Rank Q&A dataset in LangSmith.",
    )
    client.create_examples(dataset_id=as_rank_dataset.id, examples=examples)
    print(f"Dataset '{as_rank_dataset_name}' created.")

Dataset 'AS Rank Q&A dataset' not found.
Dataset 'AS Rank Q&A dataset' created.


Direct Task Prompting (Zero-Shot)

In [None]:
# Zero-shot prompt entry from the loaded AS rank prompt file.
as_rank_zs_prompt = prompts["as_rank_zs_prompt"]

In [None]:
# Zero-shot run on the AS rank dataset; the recorded output shows the LangSmith
# results link for an experiment name starting with this prefix.
experiment = "zero-shot-prompting-as-rank"
eval_prompt_engineering_on_openai(
    client,
    model,
    as_rank_zs_prompt,
    " ",  # NOTE(review): single-space string; its meaning is defined by the helper — confirm
    as_rank_dataset_name,
    experiment,
    [],  # the bogon runs pass `summary_evaluators` in this slot; none here
)

View the evaluation results for experiment: 'zero-shot-prompting-as-rank-temp-0.0-415a0348' at:
https://smith.langchain.com/o/d3cb8df4-aea8-404a-b7f7-3c7ee52670a1/datasets/fd4fc5aa-0241-41d7-8bc1-25b5ce0691de/compare?selectedSessions=c64b9525-7979-405e-b438-ab62b092f266




0it [00:00, ?it/s]

Role-Based Prompting

In [None]:
# Role-based prompt entry from the loaded AS rank prompt file.
as_rank_role_based_prompt = prompts["as_rank_role_based_prompt"]

In [None]:
# Role-based run on the AS rank dataset.
experiment = "role-based-prompting-as-rank"
eval_prompt_engineering_on_openai(
    client,
    model,
    as_rank_role_based_prompt,
    " ",  # NOTE(review): single-space string; its meaning is defined by the helper — confirm
    as_rank_dataset_name,
    experiment,
    [],  # the bogon runs pass `summary_evaluators` in this slot; none here
)

View the evaluation results for experiment: 'role-based-prompting-as-rank-temp-0.0-57e481a2' at:
https://smith.langchain.com/o/d3cb8df4-aea8-404a-b7f7-3c7ee52670a1/datasets/fd4fc5aa-0241-41d7-8bc1-25b5ce0691de/compare?selectedSessions=7a2a4f27-5777-41b5-aba9-554bf0d042c7




0it [00:00, ?it/s]

Multi-Step Reasoning

In [None]:
# Multi-step reasoning prompt entry from the loaded AS rank prompt file.
as_rank_multi_step_reasoning_prompt = prompts["as_rank_multi_step_reasoning_prompt"]

In [None]:
# Multi-step reasoning run on the AS rank dataset.
experiment = "multi-step-prompting-as-rank"
eval_prompt_engineering_on_openai(
    client,
    model,
    as_rank_multi_step_reasoning_prompt,
    " ",  # NOTE(review): single-space string; its meaning is defined by the helper — confirm
    as_rank_dataset_name,
    experiment,
    [],  # the bogon runs pass `summary_evaluators` in this slot; none here
)

View the evaluation results for experiment: 'multi-step-prompting-as-rank-temp-0.0-d65d2e20' at:
https://smith.langchain.com/o/d3cb8df4-aea8-404a-b7f7-3c7ee52670a1/datasets/fd4fc5aa-0241-41d7-8bc1-25b5ce0691de/compare?selectedSessions=cc93a385-8223-4d1a-adc2-dea77841e061




0it [00:00, ?it/s]

Few-Shot Learning

In [None]:
# Few-shot prompt entry from the loaded AS rank prompt file.
as_rank_few_shots_prompt = prompts["as_rank_few_shots_prompt"]

In [None]:
# Few-shot run on the AS rank dataset.
experiment = "few-shots-prompting-as-rank"
eval_prompt_engineering_on_openai(
    client,
    model,
    as_rank_few_shots_prompt,
    " ",  # NOTE(review): single-space string; its meaning is defined by the helper — confirm
    as_rank_dataset_name,
    experiment,
    [],  # the bogon runs pass `summary_evaluators` in this slot; none here
)

View the evaluation results for experiment: 'few-shots-prompting-as-rank-temp-0.0-89c0a127' at:
https://smith.langchain.com/o/d3cb8df4-aea8-404a-b7f7-3c7ee52670a1/datasets/fd4fc5aa-0241-41d7-8bc1-25b5ce0691de/compare?selectedSessions=ab84f98c-ee93-465b-8aad-1c5976880feb




0it [00:00, ?it/s]

Chain-of-Thought (CoT) Prompting

In [None]:
# Chain-of-thought prompt entry from the loaded AS rank prompt file.
as_rank_cot_prompt = prompts["as_rank_cot_prompt"]

In [None]:
# Chain-of-thought run on the AS rank dataset.
experiment = "cot-prompting-as-rank"
eval_prompt_engineering_on_openai(
    client,
    model,
    as_rank_cot_prompt,
    " ",  # NOTE(review): single-space string; its meaning is defined by the helper — confirm
    as_rank_dataset_name,
    experiment,
    [],  # the bogon runs pass `summary_evaluators` in this slot; none here
)

View the evaluation results for experiment: 'cot-prompting-as-rank-temp-0.0-74f8b82b' at:
https://smith.langchain.com/o/d3cb8df4-aea8-404a-b7f7-3c7ee52670a1/datasets/fd4fc5aa-0241-41d7-8bc1-25b5ce0691de/compare?selectedSessions=7a2cea00-b352-491d-aefa-52955f63586b




0it [00:00, ?it/s]

**Task 3 - Bogon classification**

In [None]:
from evaluations.bogons_eval_aux import *

In [None]:
# Prompt definitions for the bogon classification task, relative to the project root.
# "knowledge_retreival" (sic) is kept verbatim because it is a runtime path.
PROMPT_FILE = Path("prompts") / "knowledge_retreival" / "bogons_prompts.yaml"

In [None]:
# Replace `prompts` with the bogon prompt set parsed from the YAML file.
prompts = yaml.safe_load(PROMPT_FILE.read_text(encoding="utf-8"))

In [None]:
# LangSmith dataset name used to look up/create the dataset and to target every
# bogon classification experiment in this section.
bogons_dataset_name = "Bogon Prefix Classification Q&A dataset"

In [None]:
# Look the dataset up by name (filtered server-side) instead of scanning every
# dataset in the workspace; the explicit name check keeps the match exact.
bogons_dataset = next(
    (
        candidate
        for candidate in client.list_datasets(dataset_name=bogons_dataset_name)
        if candidate.name == bogons_dataset_name
    ),
    None,
)

if bogons_dataset is None:
    print(f"Dataset '{bogons_dataset_name}' not found.")
    # Read and convert the local Q&A file BEFORE creating the remote dataset.
    # If the file is missing or malformed, nothing is created; otherwise an empty
    # dataset would be left behind, found on the next run, and evaluated with
    # zero examples.
    bogons_dataset_path = 'datasets/bogons/ipv4_bogons_qas.json'
    with open(bogons_dataset_path, 'r', encoding='utf-8') as f:
        bogons_q_a = json.load(f)
    examples = convert_to_examples(bogons_q_a)

    bogons_dataset = client.create_dataset(
        dataset_name=bogons_dataset_name,
        description="Bogons Q&A dataset in LangSmith.",
    )
    client.create_examples(dataset_id=bogons_dataset.id, examples=examples)
    print(f"Dataset '{bogons_dataset_name}' created.")

Dataset 'Bogon Prefix Classification Q&A dataset' not found.


In [None]:
# Evaluators handed to every bogon experiment call in this section.
summary_evaluators = [
    bogon_precision_evaluator,
    bogon_recall_evaluator,
    bogon_f1_evaluator,
]

Direct Task Prompting (Zero-Shot)

In [None]:
# Direct-task (zero-shot) prompt entry from the loaded bogon prompt file.
zs_prompt_bogons = prompts["direct_task_prompt_bogons"]

In [None]:
# Zero-shot run on the bogon dataset; the recorded output shows the LangSmith
# results link for an experiment name starting with this prefix.
experiment = "zero-shot-prompting-bogons"
eval_prompt_engineering_on_openai(
    client,
    model,
    zs_prompt_bogons,
    " ",  # NOTE(review): single-space string; its meaning is defined by the helper — confirm
    bogons_dataset_name,
    experiment,
    summary_evaluators,  # bogon precision / recall / F1 evaluators
)

View the evaluation results for experiment: 'zero-shot-prompting-bogons-temp-0.0-0ca507ee' at:
https://smith.langchain.com/o/d3cb8df4-aea8-404a-b7f7-3c7ee52670a1/datasets/cf05012a-03fc-413f-8cc4-86551875678c/compare?selectedSessions=20870a3d-84d8-4c86-979a-9f43e4e42d5b




0it [00:00, ?it/s]

Role-Based Prompting

In [None]:
# Role-based prompt entry from the loaded bogon prompt file.
role_based_prompt_bogons = prompts["role_based_prompt_bogons"]

In [None]:
# Role-based run on the bogon dataset.
experiment = "role-based-prompting-bogons"
eval_prompt_engineering_on_openai(
    client,
    model,
    role_based_prompt_bogons,
    " ",  # NOTE(review): single-space string; its meaning is defined by the helper — confirm
    bogons_dataset_name,
    experiment,
    summary_evaluators,  # bogon precision / recall / F1 evaluators
)

View the evaluation results for experiment: 'role-based-prompting-bogons-temp-0.0-ad9f7a6b' at:
https://smith.langchain.com/o/d3cb8df4-aea8-404a-b7f7-3c7ee52670a1/datasets/cf05012a-03fc-413f-8cc4-86551875678c/compare?selectedSessions=1f91a5fb-6bea-49e4-94a7-00b6f9055196




0it [00:00, ?it/s]

Multi-Step Reasoning

In [None]:
# Multi-step reasoning prompt entry from the loaded bogon prompt file.
multi_step_reasoning_prompt_bogons = prompts["multi_step_reasoning_prompt_bogons"]

In [None]:
# Multi-step reasoning run on the bogon dataset.
experiment = "multi-step-prompting-bogons"
eval_prompt_engineering_on_openai(
    client,
    model,
    multi_step_reasoning_prompt_bogons,
    " ",  # NOTE(review): single-space string; its meaning is defined by the helper — confirm
    bogons_dataset_name,
    experiment,
    summary_evaluators,  # bogon precision / recall / F1 evaluators
)

View the evaluation results for experiment: 'multi-step-prompting-bogons-temp-0.0-930c2ad9' at:
https://smith.langchain.com/o/d3cb8df4-aea8-404a-b7f7-3c7ee52670a1/datasets/cf05012a-03fc-413f-8cc4-86551875678c/compare?selectedSessions=299dcddd-24c6-47c9-9726-982e53929b13




0it [00:00, ?it/s]

Few-Shot Learning

In [None]:
# Few-shot prompt entry from the loaded bogon prompt file.
few_shot_prompt_bogons = prompts["few_shot_prompt_bogons"]

In [None]:
# Few-shot run on the bogon dataset.
experiment = "few-shot-prompting-bogons"
eval_prompt_engineering_on_openai(
    client,
    model,
    few_shot_prompt_bogons,
    " ",  # NOTE(review): single-space string; its meaning is defined by the helper — confirm
    bogons_dataset_name,
    experiment,
    summary_evaluators,  # bogon precision / recall / F1 evaluators
)

View the evaluation results for experiment: 'few-shot-prompting-bogons-temp-0.0-2a70276a' at:
https://smith.langchain.com/o/d3cb8df4-aea8-404a-b7f7-3c7ee52670a1/datasets/cf05012a-03fc-413f-8cc4-86551875678c/compare?selectedSessions=9c803f94-feb0-4bc8-b34e-e9b3658efd58




0it [00:00, ?it/s]

Chain-of-Thought (CoT) Prompting

In [None]:
# Chain-of-thought prompt entry from the loaded bogon prompt file.
cot_prompt_bogons = prompts["cot_prompt_bogons"]

In [None]:
# Chain-of-thought run on the bogon dataset.
experiment = "cot-prompting-bogons"
eval_prompt_engineering_on_openai(
    client,
    model,
    cot_prompt_bogons,
    " ",  # NOTE(review): single-space string; its meaning is defined by the helper — confirm
    bogons_dataset_name,
    experiment,
    summary_evaluators,  # bogon precision / recall / F1 evaluators
)

View the evaluation results for experiment: 'cot-prompting-bogons-temp-0.0-78169181' at:
https://smith.langchain.com/o/d3cb8df4-aea8-404a-b7f7-3c7ee52670a1/datasets/cf05012a-03fc-413f-8cc4-86551875678c/compare?selectedSessions=900bd3d9-399f-4e2c-814b-f987227c4186




0it [00:00, ?it/s]