In [22]:
%%capture
!pip install -q --progress-bar on "git+https://github.com/ibm-granite-community/utils.git" \
    transformers \
    pillow \
    langchain_community \
    langchain_huggingface \
    langchain_milvus \
    docling

In [23]:
!pip install PyPDF2
!pip install pymupdf
!pip install frontend
!pip install pytesseract
!pip install langchain_ibm

Collecting PyPDF2
  Using cached pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Using cached pypdf2-3.0.1-py3-none-any.whl (232 kB)
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Collecting pymupdf
  Using cached pymupdf-1.25.5-cp39-abi3-macosx_11_0_arm64.whl.metadata (3.4 kB)
Using cached pymupdf-1.25.5-cp39-abi3-macosx_11_0_arm64.whl (18.6 MB)
Installing collected packages: pymupdf
Successfully installed pymupdf-1.25.5
Collecting frontend
  Using cached frontend-0.0.3-py3-none-any.whl.metadata (847 bytes)
Collecting starlette>=0.12.0 (from frontend)
  Using cached starlette-0.46.2-py3-none-any.whl.metadata (6.2 kB)
Collecting uvicorn>=0.7.1 (from frontend)
  Using cached uvicorn-0.34.2-py3-none-any.whl.metadata (6.5 kB)
Collecting itsdangerous>=1.1.0 (from frontend)
  Using cached itsdangerous-2.2.0-py3-none-any.whl.metadata (1.9 kB)
Collecting aiofiles (from frontend)
  Using cached aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Using cached frontend-0

In [24]:
# === Cell 1: Preprocessing with Inline Color Tagging ===

import os
# Suppress Milvus C++ server logs and disable HuggingFace tokenizers parallelism warnings
os.environ["GLOG_minloglevel"] = "3"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import io
import re
import json
import tempfile
import requests
import csv
from dotenv import load_dotenv
from PyPDF2 import PdfReader, PdfWriter
from transformers import GPT2TokenizerFast
from langchain.schema import Document
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_milvus import Milvus
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.base_models import InputFormat, DocumentStream
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
from docling_core.types.doc.document import TableItem
from langchain_core.documents import Document as LangDocument

load_dotenv()

# Regex matchers for CLI
PROMPT_RX = re.compile(r'^[\w\-\./]+@[\w\-\./]+[>%#]\s+.+')
CONFIG_RX = re.compile(r'^\s*(?:set|delete|show|request|clear|commit|rollback)\s+.+', re.IGNORECASE)

def is_cli(text: str) -> bool:
    return bool(PROMPT_RX.match(text.strip()) or CONFIG_RX.match(text.strip()))

def extract_colored_cli_spans(pdf_bytes: bytes):
    import fitz  # PyMuPDF
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    cli_spans = []
    for page_idx, page in enumerate(doc, start=1):
        for block in page.get_text("dict")["blocks"]:
            if block["type"] != 0:
                continue
            for line in block["lines"]:
                for span in line["spans"]:
                    text = span["text"].strip()
                    if not is_cli(text):
                        continue
                    c = span.get("color", 0)
                    r, g, b = (c >> 16 & 0xFF, c >> 8 & 0xFF, c & 0xFF)
                    # Record the entire CLI span with its color
                    cli_spans.append({
                        "page": page_idx,
                        "cli": text,
                        "color": (r, g, b)
                    })
    doc.close()
    return cli_spans

def save_pdf_as_txt(pdf_path: str, txt_path: str):
    import fitz
    doc = fitz.open(pdf_path)
    with open(txt_path, "w", encoding="utf-8") as out:
        for page in doc:
            out.write(page.get_text())
            out.write("\f")
    doc.close()

def extract_clis(txt_path: str):
    clis = []
    with open(txt_path, encoding='utf-8') as f:
        for raw in f:
            line = raw.rstrip()
            if not (PROMPT_RX.match(line) or CONFIG_RX.match(line)):
                continue
            stripped = line.strip()
            if len(stripped) > 200 or ' ' not in stripped:
                continue
            clis.append(stripped)
    seen = set()
    unique = []
    for cmd in clis:
        if cmd not in seen:
            seen.add(cmd)
            unique.append(cmd)
    return unique

def get_iam_access_token(api_key: str):
    iam_url = "https://iam.cloud.ibm.com/identity/token"
    resp = requests.post(
        iam_url,
        data={"apikey": api_key, "grant_type": "urn:ibm:params:oauth:grant-type:apikey"},
        headers={"Content-Type": "application/x-www-form-urlencoded"}
    )
    resp.raise_for_status()
    return resp.json()["access_token"]

def main():
    global pdf_path, txt_path, cli_spans, texts, vector_db

    pdf_path = "CSR-ACX7024-configuration-guide-v1.1.pdf"
    txt_path = pdf_path.replace(".pdf", ".txt")

    print("Loading and OCR-ing PDF...")
    with open(pdf_path, "rb") as f:
        reader = PdfReader(f)
        writer = PdfWriter()
        for p in reader.pages:
            writer.add_page(p)
        buf = io.BytesIO()
        writer.write(buf)
    pdf_bytes = buf.getvalue()

    print("Extracting colored CLI spans...")
    cli_spans = extract_colored_cli_spans(pdf_bytes)
    print(f"Found {len(cli_spans)} CLI spans.")

    print("Saving PDF as text...")
    save_pdf_as_txt(pdf_path, txt_path)

    print("Extracting CLI commands...")
    commands = extract_clis(txt_path)
    print(f"Extracted {len(commands)} unique CLI commands.")

    with open("extracted_commands.txt", "w", encoding="utf-8") as cmd_file:
        for cmd in commands:
            cmd_file.write(cmd + "\n")
    print("Saved extracted commands.")

    print("Converting to Docling document...")
    stream = io.BytesIO(pdf_bytes)
    doc_stream = DocumentStream(name=pdf_path, stream=stream, input_format=InputFormat.PDF)
    opts = PdfPipelineOptions(do_ocr=False, generate_picture_images=True)
    converter = DocumentConverter(format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=opts)})
    converted = converter.convert(source=doc_stream).document

    print("Chunking document with inline color tags…")
    tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
    chunker = HybridChunker(tokenizer=tokenizer, chunk_size=tokenizer.model_max_length, chunk_overlap=20)
    texts = []
    doc_id = 0
    for chunk in chunker.chunk(converted):
        items = chunk.meta.doc_items
        if len(items) == 1 and isinstance(items[0], TableItem):
            continue
        tokens = tokenizer.encode(chunk.text)
        safe_text = tokenizer.decode(tokens[:tokenizer.model_max_length], skip_special_tokens=True)
        # Annotate full CLI spans (skip black)
        annotated = safe_text
        for s in cli_spans:
            span_text = s["cli"]
            r, g, b = s["color"]
            if (r, g, b) == (0, 0, 0):
                continue
            tag = f"<COLOR=({r},{g},{b})>{span_text}</COLOR>"
            if span_text in annotated:
                annotated = annotated.replace(span_text, tag, 1)
        doc_id += 1
        texts.append(LangDocument(page_content=annotated, metadata={"doc_id": doc_id, "source": pdf_path}))
    print(f"Prepared {len(texts)} annotated text chunks for embedding.")

    print("Initializing vector database…")
    embed_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    db_file = tempfile.NamedTemporaryFile(suffix=".db", delete=False).name
    vector_db = Milvus(
        embedding_function=embed_model,
        connection_args={"uri": db_file},
        auto_id=True,
        index_params={"index_type": "AUTOINDEX"}
    )
    vector_db.add_documents(texts)
    print("Vector database ready.")

# Run preprocessing
main()

Loading and OCR-ing PDF...
Extracting colored CLI spans...
Found 704 CLI spans.
Saving PDF as text...
Extracting CLI commands...
Extracted 611 unique CLI commands.
Saved extracted commands.
Converting to Docling document...
Chunking document with inline color tags…


Token indices sequence length is longer than the specified maximum sequence length for this model (2010 > 1024). Running this sequence through the model will result in indexing errors


Prepared 221 annotated text chunks for embedding.
Initializing vector database…
Vector database ready.


In [26]:
# === Cell 2: RAG with Example CLIs Inserted into Prompt (Fixed questions loop) ===

import os
import re
import csv
from dotenv import load_dotenv
from langchain_ibm import WatsonxLLM

load_dotenv()

# CSV header
output_csv = "rag_results.csv"
with open(output_csv, "w", newline="", encoding="utf-8") as f:
    csv.writer(f).writerow(["question", "answer", "context"])

# A static list of 30 representative CLI examples
static_clis = [
    "set system host-name <hostname>",
    "set system time-zone <zone>",
    "set system ntp server <ntp-server-ip-addr>",
    "set interfaces lo0 unit 0 family inet address <loopback0_ip_address/prefix>",
    "set system services ssh",
    "request chassis beacon fpc <slot> pic-slot <pic> port <port> on timer <minutes>",
    "request chassis beacon fpc <slot> off",
    "show firewall filter wan-in-v4-test-et-0/0/0.2001-i",
    "show firewall filter wan-in-v6-in-et-0/0/0.2001-i",
    "set firewall family inet filter CMU_CSID_V1100|1500|1600-INGR term BGP-IN from source-prefix-list CSID-BGP",
    "set firewall family inet6 filter wan-in-v6-in term 1 from traffic-class af41",
    "set firewall family inet6 filter wan-in-v6-in term 1 then count inbound-af11-v6-counter",
    "set firewall family inet6 filter wan-in-v6-in term default then accept",
    "set policy-options prefix-list CSID-BGP-V1100|1500|1600 <IPV4-SUBNETS>",
    "set class-of-service interfaces ge-x/y/z shaping-rate 300m",
    "show system software list",
    "request system software add /var/tmp/junos-evo-install-acx-f... no-validate reboot",
    "show chassis pic fpc-slot 0 pic-slot 0",
    "set chassis alarm management-ethernet link-down ignore",
    "set protocols lldp interface all disable",
    "set system no-redirects",
    "set system default-address-selection",
    "delete chassis auto-image-upgrade",
    "delete system commit factory-settings",
    "set system login class REMOTE idle-timeout 15",
    "set system login user <admin> encrypted-password \"<secret>\"",
    "set system ports console log-out-on-disconnect",
    "set system ports console type vt100",
    "set interfaces et-0/0/0 unit 2001 family inet filter input wan-in-v4-test",
    "set interfaces et-0/0/0 unit 2001 family inet6 filter input wan-in-v6-in",
]

# Pre-format the examples as a bullet list
CLI_CONTEXT = "\n".join(f"- {cmd}" for cmd in static_clis)

# Setup LLM
API_KEY = os.getenv("WATSONX_API_KEY")
PROJECT_ID = os.getenv("WATSONX_PROJECT_ID")
token = get_iam_access_token(API_KEY)
llm = WatsonxLLM(
    model_id="meta-llama/llama-3-405b-instruct",
    url="https://us-south.ml.cloud.ibm.com",
    apikey=API_KEY,
    project_id=PROJECT_ID
)

def strip_color_tags(text: str) -> str:
    """
    Remove all <COLOR=(r,g,b)>…</COLOR> wrappers from the given text,
    returning only the inner content.
    """
    pattern = re.compile(r"<COLOR=\(\d+,\d+,\d+\)>(.*?)</COLOR>")
    return pattern.sub(r"\1", text)

def run_rag(question: str, k: int):
    # Retrieve context from vector DB
    docs = vector_db.as_retriever().invoke(question, k=k)
    context = "\n\n".join(d.page_content for d in docs)

    # Build prompt, inserting the examples before the actual context
    prompt = (
        "<|begin_of_text|>"
        "<|system|>"
        "You are a Junos CLI expert. Use ONLY the context and inline COLOR tags to answer.\n"
        "You can use these examples to understand the CLI format:\n"
        f"{CLI_CONTEXT}\n\n"
        "Make sure your CLI outputs do not include the color tokens; they should only contain the CLIs.\n"
        "<|end_of_system|>"
        "<|user|>"
        f"Context:\n{context}\n"
        f"Question: {question}\n"
        "<|end_of_user|>"
        "<|assistant|>"
    )

    response = llm.invoke(
        prompt,
        decoding_method="greedy",
        max_new_tokens=4096,
        temperature=0
    )
    answer = strip_color_tags(response.strip()).replace("<|end_of_assistant|>","")

    # Log to CSV
    with open(output_csv, "a", newline="", encoding="utf-8") as f:
        csv.writer(f).writerow([question, answer, context])

    return answer, context

# Define the actual list of questions
my_questions = [
    "What CLI command do you use to configure the system hostname, timezone, and NTP servers?",
    "Which command creates a loopback interface on the CSR?",
    "How do you enable SSH services via the CLI?",
    "What is the exact Junos command to configure a VLAN-tagged IRB interface for VLAN 2001?",
    "Which CLI shows how to apply a shaping rate of 300 Mbps on a GE interface?",
    "How do you create and apply a firewall filter to rate-limit IP alarm traffic?",
    "Which CLI entries demonstrate how to configure SNMP on the CSR?",
    "How do you configure BGP import and export policies for a routing instance?",
    "What is the Junos CLI syntax to display the current system CPU utilization?",
    "Are there any CLIs in red?"
]

# Iterate over exactly those questions
for q in my_questions:
    print(f"\nQuestion: {q}\n")
    answer, _ = run_rag(q, k=20)
    print(answer)
    print("==================================================")


Question: What CLI command do you use to configure the system hostname, timezone, and NTP servers?

set system host-name <hostname>
set system time-zone <time-zone>
set system ntp server <ntp-server-ip-addr> 
set system ntp boot-server <ntp-server-ip-addr>
set system ntp source-address <loopback0_ip_address>

Question: Which command creates a loopback interface on the CSR?

set interfaces lo0 unit 0 family inet address <loopback0_ip_address/prefix>

Question: How do you enable SSH services via the CLI?

set system services ssh<|assistant|>set system services ssh protocol-version v2<|assistant|>set system services ssh ciphers aes128-ctr<|assistant|>set system services ssh ciphers aes192-ctr<|assistant|>set system services ssh ciphers aes256-ctr<|assistant|>set system services ssh connection-limit 10<|assistant|>set system services ssh rate-limit 10<|assistant|>set system services netconf ssh<|assistant|>set system services ssh sftp-server

Question: What is the exact Junos command to c