In [1]:
import os
# Suppress Milvus C++ server logs and disable HuggingFace tokenizers parallelism warnings
os.environ["GLOG_minloglevel"] = "3"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import io
import re
import json
import tempfile
import requests
from dotenv import load_dotenv
from PyPDF2 import PdfReader, PdfWriter
from transformers import GPT2TokenizerFast
from langchain.schema import Document
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_milvus import Milvus
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.base_models import InputFormat, DocumentStream
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
from docling_core.types.doc.document import TableItem
from langchain_core.documents import Document as LangDocument

load_dotenv()

# Regex matchers for CLI
PROMPT_RX = re.compile(r'^[\w\-\./]+@[\w\-\./]+[>%#]\s+.+')
CONFIG_RX = re.compile(r'^\s*(?:set|delete|show|request|clear|commit|rollback)\s+.+', re.IGNORECASE)

def is_cli(text: str) -> bool:
    return bool(PROMPT_RX.match(text.strip()) or CONFIG_RX.match(text.strip()))

def extract_colored_cli_spans(pdf_bytes: bytes):
    import fitz  # PyMuPDF
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    cli_spans = []
    for page_idx, page in enumerate(doc, start=1):
        for block in page.get_text("dict")["blocks"]:
            if block["type"] != 0:
                continue
            for line in block["lines"]:
                for span in line["spans"]:
                    text = span["text"]
                    if not is_cli(text):
                        continue
                    c = span.get("color", 0)
                    r, g, b = (c >> 16 & 0xFF, c >> 8 & 0xFF, c & 0xFF)
                    cli_spans.append({"page": page_idx, "cli": text.strip(), "color": (r, g, b), "bbox": span["bbox"]})
    doc.close()
    return cli_spans

def save_pdf_as_txt(pdf_path: str, txt_path: str):
    import fitz
    doc = fitz.open(pdf_path)
    with open(txt_path, "w", encoding="utf-8") as out:
        for page in doc:
            out.write(page.get_text())
            out.write("\f")
    doc.close()

def extract_clis(txt_path: str):
    clis = []
    with open(txt_path, encoding='utf-8') as f:
        for raw in f:
            line = raw.rstrip()
            if not (PROMPT_RX.match(line) or CONFIG_RX.match(line)):
                continue
            stripped = line.strip()
            if len(stripped)>200 or ' ' not in stripped:
                continue
            clis.append(stripped)
    # unique
    seen = set(); unique=[]
    for cmd in clis:
        if cmd not in seen:
            seen.add(cmd)
            unique.append(cmd)
    return unique

def get_iam_access_token(api_key: str):
    iam_url = "https://iam.cloud.ibm.com/identity/token"
    resp = requests.post(iam_url, data={"apikey":api_key,"grant_type":"urn:ibm:params:oauth:grant-type:apikey"},
                         headers={"Content-Type":"application/x-www-form-urlencoded"})
    resp.raise_for_status()
    return resp.json()["access_token"]

def main():
    # Paths
    global pdf_path, txt_path, cli_spans, texts, vector_db
    pdf_path = "CSR-ACX7024-configuration-guide-v1.1.pdf"
    txt_path = pdf_path.replace(".pdf", ".txt")
    print("Loading and OCR-ing PDF...")
    with open(pdf_path,"rb") as f:
        reader=PdfReader(f); writer=PdfWriter()
        for p in reader.pages: writer.add_page(p)
        buf=io.BytesIO(); writer.write(buf)
    pdf_bytes=buf.getvalue()
    print("Extracting colored CLI spans...")
    cli_spans = extract_colored_cli_spans(pdf_bytes)
    print(f"Found {len(cli_spans)} CLI spans.")
    print("Saving PDF as text...")
    save_pdf_as_txt(pdf_path, txt_path)
    print("Extracting CLI commands...")
    commands = extract_clis(txt_path)
    print(f"Extracted {len(commands)} unique CLI commands.")
    # Save extracted commands to local file
    commands_output_path = "extracted_commands.txt"
    with open(commands_output_path, "w", encoding="utf-8") as cmd_file:
        for cmd in commands:
            cmd_file.write(cmd + "\n")
    print(f"Saved commands to {commands_output_path}")
    print("Converting to Docling document...")
    stream = io.BytesIO(pdf_bytes)
    doc_stream = DocumentStream(name=pdf_path, stream=stream, input_format=InputFormat.PDF)
    opts = PdfPipelineOptions(do_ocr=False, generate_picture_images=True)
    converter=DocumentConverter(format_options={InputFormat.PDF:PdfFormatOption(pipeline_options=opts)})
    converted = converter.convert(source=doc_stream).document
    print("Chunking document...")
    tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
    chunker = HybridChunker(tokenizer=tokenizer, chunk_size=tokenizer.model_max_length, chunk_overlap=20)
    texts=[]
    doc_id=0
    for chunk in chunker.chunk(converted):
        items = chunk.meta.doc_items
        if len(items) == 1 and isinstance(items[0], TableItem):
            continue
        tokens = tokenizer.encode(chunk.text)
        safe = tokenizer.decode(tokens[:tokenizer.model_max_length], skip_special_tokens=True)
        # bump the counter
        doc_id += 1
        texts.append(LangDocument(
            page_content=safe,
            metadata={"doc_id": doc_id, "source": pdf_path}
        ))
    print(f"Prepared {len(texts)} text chunks.")
    print("Initializing vector DB...")
    embed=HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    db_file=tempfile.NamedTemporaryFile(suffix=".db", delete=False).name
    vector_db = Milvus(embedding_function=embed, connection_args={"uri":db_file}, auto_id=True, index_params={"index_type":"AUTOINDEX"})
    vector_db.add_documents(texts)
    print("Vector DB ready.")

# Run preprocessing once
main()

  from .autonotebook import tqdm as notebook_tqdm


Loading and OCR-ing PDF...
Extracting colored CLI spans...
Found 704 CLI spans.
Saving PDF as text...
Extracting CLI commands...
Extracted 611 unique CLI commands.
Saved commands to extracted_commands.txt
Converting to Docling document...


Token indices sequence length is longer than the specified maximum sequence length for this model (2010 > 1024). Running this sequence through the model will result in indexing errors


Chunking document...
Prepared 221 text chunks.
Initializing vector DB...
Vector DB ready.


In [2]:
import os
from langchain_ibm import WatsonxLLM

# Load environment
from dotenv import load_dotenv
load_dotenv()

# Setup LLM
API_KEY = os.getenv("WATSONX_API_KEY")
PROJECT_ID = os.getenv("WATSONX_PROJECT_ID")
token = get_iam_access_token(API_KEY)
llm = WatsonxLLM(model_id="meta-llama/llama-3-405b-instruct", url="https://us-south.ml.cloud.ibm.com", apikey=API_KEY, project_id=PROJECT_ID)

def run_rag(question, k=5):
    docs = vector_db.as_retriever().invoke(question, k=k)
    context = "\n\n".join(d.page_content for d in docs)
    pages = {d.metadata.get("page") for d in docs if d.metadata.get("page")}
    spans = [s for s in cli_spans if s["page"] in pages and s["color"]==(255,0,0)]
    prompt = f"""You are a Junos CLI expert. Use ONLY the context and color metadata to answer.
COLOR_DATA: {json.dumps(spans, ensure_ascii=False)}
Context: {context}
Question: {question}
"""
    response = llm.invoke(prompt, decoding_method="greedy", max_new_tokens=1000, temperature=0, top_p=1, top_k=40)
    return response

questions = [
    "What CLI command configures the system’s hostname, timezone, and NTP servers?",
    "Which command is used to create a loopback interface on the CSR?",
    "How do you set up a BGP peering session with an edge RE?",
    "What is the exact command to configure a VLAN-tagged IRB interface (e.g. for VLAN 2001)?",
    "Which CLI syntax shows how to apply a shaping rate of 300 Mbps on a GE interface?",
    "How do you enable SSH system services via the CLI?",
    "What commands configure TACACS+ authentication and its timeout?",
    "How do you create and apply a firewall filter to rate-limit IP alarm traffic?",
    "Which CLI entries demonstrate how to configure SNMP on the CSR?",
    "What is the exact Junos command to display current system CPU utilization?"
]

for x in questions:
    print(x+"\n")
    print(run_rag(x))
    print("==================================================")


What CLI command configures the system’s hostname, timezone, and NTP servers?

Answer: set system host-name <CLLI> set system time-zone <time-zone> set system ntp boot-server <ntp-server-ip-addr> set system ntp server <ntp-server-ip-addr> set system ntp source-address <loopback0_ip_address>
Which command is used to create a loopback interface on the CSR?

Answer: set interfaces lo0 unit 0 family inet address <loopback0_ip_address/prefix>
How do you set up a BGP peering session with an edge RE?

Answer: To set up a BGP peering session with an edge RE, you need to configure the BGP protocol under the routing instance, specifying the group name, family (inet or inet6), hold time, authentication key, peer AS, and local AS (if not already defined). Additionally, you need to configure the neighbor IP address, import and export policies, and description. The specific commands are:

set routing-instances <VR-NAME> protocols bgp group <VR-group-NAME> family inet unicast
set routing-instances <V