In [None]:
%%capture
# Install the Granite community utils (straight from GitHub) together with the
# libraries imported by the preprocessing cell below. %%capture hides pip's output.
# NOTE(review): none of these packages is version-pinned, so a re-run may pull
# different releases than the ones this notebook was developed against.
!pip install -q --progress-bar on "git+https://github.com/ibm-granite-community/utils.git" \
    transformers \
    pillow \
    langchain_community \
    langchain_huggingface \
    langchain_milvus \
    docling

In [None]:
# Extra packages: PyPDF2 (page slicing), pymupdf (imported as `fitz`), langchain_ibm (WatsonxLLM).
# NOTE(review): `frontend` and `pytesseract` are not imported by any cell visible in this
# notebook (and OCR is disabled via do_ocr=False) — confirm they are still required.
# NOTE(review): `python-dotenv`, `requests` and `tqdm` are imported below but not installed
# explicitly here; presumably they arrive as transitive dependencies — verify.
!pip install PyPDF2
!pip install pymupdf
!pip install frontend
!pip install pytesseract
!pip install langchain_ibm

In [None]:
# === Cell 1: Preprocessing with Page Limit, Inline Color Tagging + Image → JSON ===

import os
# Suppress Milvus logs and disable HuggingFace tokenizer parallelism
# (both variables are set before the library imports further down this cell).
os.environ["GLOG_minloglevel"] = "3"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Input PDF, relative to the working directory. main() reads this global and
# derives the companion .txt path from it.
pdf_path = "pdf_files/CSR-ACX7024-configuration-guide-v1.1.pdf"

import io
import re
import json
import tempfile
import requests
import csv
import base64
from io import BytesIO
from PIL import Image
from dotenv import load_dotenv
from PyPDF2 import PdfReader, PdfWriter
from transformers import GPT2TokenizerFast
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_milvus import Milvus
from langchain.schema import Document
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.base_models import InputFormat, DocumentStream
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
from docling_core.types.doc.document import TableItem
from langchain_core.documents import Document as LangDocument
from tqdm import tqdm

# Load variables from a .env file into the process environment; the WATSONX_*
# settings are read with os.getenv() further down.
load_dotenv()

# — Configurable: only process the first N pages (None = all) —
# The consumers test this value for truthiness (`if max_pages and ...`), so 0
# behaves the same as None, i.e. "no limit".
max_pages = 5  # set to an int to limit pages, or None for all

# — Vision-Instruct Setup —

def get_iam_access_token(api_key: str, timeout: float = 30.0) -> str:
    """Exchange an IBM Cloud API key for an IAM access token.

    Args:
        api_key: IBM Cloud API key. Must be a non-empty string.
        timeout: Seconds to wait for the IAM endpoint before giving up.
            Without a timeout ``requests`` can block indefinitely.

    Returns:
        The ``access_token`` field of the IAM JSON response.

    Raises:
        ValueError: If ``api_key`` is empty or ``None`` (for example because
            the environment variable it was read from is not set).
        requests.HTTPError: If the IAM endpoint answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    # Fail fast with a readable message instead of posting "apikey": None and
    # getting an opaque HTTP error back from the IAM service.
    if not api_key:
        raise ValueError(
            "api_key is empty; check that the API key environment variable "
            "(e.g. WATSONX_API_KEY) is set."
        )

    resp = requests.post(
        "https://iam.cloud.ibm.com/identity/token",
        data={"apikey": api_key, "grant_type": "urn:ibm:params:oauth:grant-type:apikey"},
        headers={"Content-Type": "application/x-www-form-urlencoded"},
        timeout=timeout,
    )
    resp.raise_for_status()
    return resp.json()["access_token"]

# Credentials and model used for the image-description requests made in main().
# os.getenv() returns None when a variable is missing from the environment/.env.
VISION_API_KEY    = os.getenv("WATSONX_API_KEY")
VISION_PROJECT_ID = os.getenv("WATSONX_PROJECT_ID")
VISION_MODEL_ID   = "meta-llama/llama-3-2-90b-vision-instruct"
# Fetched once when this cell runs (a network call) and reused as the Bearer
# token for every vision request.
# NOTE(review): the token is never refreshed here — confirm its lifetime covers long runs.
VISION_TOKEN      = get_iam_access_token(VISION_API_KEY)

# — CLI Extraction Setup —

# Prompt form: "<name>@<name>" followed by one of > % # , whitespace and a command.
PROMPT_RX = re.compile(r'^[\w\-\./]+@[\w\-\./]+[>%#]\s+.+')
# Bare command form: starts with one of the listed verbs (case-insensitive) plus arguments.
CONFIG_RX = re.compile(r'^\s*(?:set|delete|show|request|clear|commit|rollback)\s+.+', re.IGNORECASE)

def is_cli(text: str) -> bool:
    """Tell whether ``text`` looks like a CLI line.

    Surrounding whitespace is ignored. The line qualifies when it matches
    either the prompt pattern (``PROMPT_RX``) or the command-verb pattern
    (``CONFIG_RX``).
    """
    candidate = text.strip()
    return any(rx.match(candidate) is not None for rx in (PROMPT_RX, CONFIG_RX))

def extract_colored_cli_spans(pdf_bytes: bytes):
    """Collect CLI-looking text spans that are rendered in a non-dark colour.

    Args:
        pdf_bytes: Raw bytes of the PDF to scan.

    Returns:
        A list of dicts ``{"page": int, "cli": str, "color": (r, g, b)}`` in
        reading order, one per span whose stripped text passes ``is_cli`` and
        whose colour is not near-black. Page numbers are 1-based.

    Scanning stops after ``max_pages`` pages when that global is truthy.
    """
    import fitz

    spans = []
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        for page_idx, page in enumerate(doc, start=1):
            if max_pages and page_idx > max_pages:
                break
            for block in page.get_text("dict")["blocks"]:
                # Only blocks of type 0 are inspected; those are the ones whose
                # "lines"/"spans" entries are read below.
                if block["type"] != 0:
                    continue
                for line in block["lines"]:
                    for span in line["spans"]:
                        txt = span["text"].strip()
                        if not is_cli(txt):
                            continue
                        # The colour is unpacked as a packed 0xRRGGBB integer.
                        c = span.get("color", 0)
                        r, g, b = (c >> 16) & 0xFF, (c >> 8) & 0xFF, c & 0xFF
                        # Skip (near-)black text: only highlighted commands are wanted.
                        if r < 50 and g < 50 and b < 50:
                            continue
                        spans.append({"page": page_idx, "cli": txt, "color": (r, g, b)})
    finally:
        # Release the document even if a page fails to parse.
        doc.close()
    return spans

def save_pdf_as_txt(pdf_path: str, txt_path: str):
    """Dump the plain text of a PDF to a UTF-8 text file.

    Args:
        pdf_path: Path of the PDF to read.
        txt_path: Path of the text file to create (overwritten if present).

    Each page's text is followed by a form-feed character (``\\f``) as a page
    separator. Only the first ``max_pages`` pages are written when that global
    is truthy.
    """
    import fitz

    doc = fitz.open(pdf_path)
    try:
        with open(txt_path, "w", encoding="utf-8") as out:
            for page_no, page in enumerate(doc, start=1):
                if max_pages and page_no > max_pages:
                    break
                out.write(page.get_text())
                out.write("\f")
    finally:
        # Close the PDF even when the output file cannot be opened or written.
        doc.close()

def extract_clis(txt_path: str):
    """Return the distinct CLI lines found in a UTF-8 text file.

    Every line is right-stripped and kept when ``is_cli`` accepts it.
    Duplicates are dropped; the first occurrence decides the position, so the
    result preserves the order in which commands appear in the file.
    """
    with open(txt_path, encoding="utf-8") as handle:
        trimmed = (raw.rstrip() for raw in handle)
        matches = [candidate for candidate in trimmed if is_cli(candidate)]
    # dict keys keep insertion order, which gives an order-preserving de-duplication.
    return list(dict.fromkeys(matches))

def _picture_to_jpeg_b64(img) -> str:
    """Return ``img`` downscaled to fit 1024x1024 as a base64-encoded JPEG string."""
    # thumbnail() resizes in place, so work on a copy and leave the caller's image untouched.
    img = img.copy()
    img.thumbnail((1024, 1024), Image.Resampling.LANCZOS)
    # Convert every mode other than RGB / greyscale (RGBA, LA, P, ...) so the
    # JPEG save below does not fail on modes the encoder cannot write.
    if img.mode not in ("RGB", "L"):
        img = img.convert("RGB")
    bio = BytesIO()
    img.save(bio, format="JPEG", quality=70)
    return base64.b64encode(bio.getvalue()).decode("utf-8")


def _neighbor_text(converted, body_refs, idx: int) -> str:
    """Return the stripped text of body item ``idx`` when it exists and is a text item, else ''."""
    if not 0 <= idx < len(body_refs):
        return ""
    cref = body_refs[idx].cref
    if not cref.startswith("#/texts/"):
        return ""
    return converted.texts[int(cref.split("/")[-1])].text.strip()


def _describe_picture(b64: str, context_text: str, ref_label: str) -> str:
    """Ask the vision model for a JSON description of one image.

    Returns the model's message content, or ``"{}"`` when the request or the
    response parsing fails (the failure is reported, not raised).
    """
    payload = {
        "messages": [
            {
                "role": "system",
                "content": "Respond ONLY in JSON matching the image schema, capturing any CLI lines."
            },
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": context_text},
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64}"}}
                ]
            }
        ],
        "project_id": VISION_PROJECT_ID,
        "model_id": VISION_MODEL_ID,
        "max_tokens": 2500,
        "temperature": 0.7,
        "top_p": 1
    }
    headers = {
        "Accept": "application/json",
        "Content-Type": "application/json",
        "Authorization": f"Bearer {VISION_TOKEN}"
    }
    url = "https://us-south.ml.cloud.ibm.com/ml/v1/text/chat?version=2023-05-29"

    try:
        resp = requests.post(url, headers=headers, json=payload, timeout=300)
        resp.raise_for_status()
        return resp.json()["choices"][0]["message"]["content"]
    except Exception as exc:
        # Best effort: keep going with an empty description, but say so instead
        # of hiding the failure. tqdm.write keeps the progress bar intact.
        tqdm.write(f"[warn] vision request failed for {ref_label}: {exc!r}; storing empty JSON")
        return "{}"


def main():
    """Preprocess ``pdf_path`` and build the vector store used by the RAG cell.

    Steps:
      1. Copy the first ``max_pages`` pages (all pages if falsy) into an
         in-memory PDF.
      2. Extract coloured CLI spans, dump the PDF text and write the distinct
         CLI lines to ``extracted_commands.txt``.
      3. Convert the in-memory PDF with Docling and request a JSON description
         of every top-level picture from the vision model, passing the text
         items directly above/below it as context.
      4. Chunk the document text, wrap known CLI spans in inline
         ``<COLOR=(r,g,b)>`` tags and index text chunks plus picture
         descriptions in a Milvus store backed by a temporary file.

    Publishes ``txt_path``, ``cli_spans``, ``texts`` and ``vector_db`` as
    module globals. Returns nothing.
    """
    global pdf_path, txt_path, cli_spans, texts, vector_db

    # splitext only swaps the real extension; str.replace(".pdf", ".txt") would
    # return the PDF path itself for a different extension (and the text dump
    # would then overwrite the PDF) or rewrite ".pdf" inside directory names.
    txt_path = os.path.splitext(pdf_path)[0] + ".txt"

    # --- Load PDF into bytes (respect max_pages) ---
    with open(pdf_path, "rb") as f:
        reader, writer = PdfReader(f), PdfWriter()
        pages = reader.pages[:max_pages] if max_pages else reader.pages
        for p in pages:
            writer.add_page(p)
        buf = io.BytesIO()
        writer.write(buf)
    pdf_bytes = buf.getvalue()

    # --- Extract CLI spans ---
    cli_spans = extract_colored_cli_spans(pdf_bytes)

    # --- Save text & extract raw CLIs ---
    save_pdf_as_txt(pdf_path, txt_path)
    commands = extract_clis(txt_path)
    # Explicit encoding: the commands come from a UTF-8 file and must not
    # depend on the platform's default codec.
    with open("extracted_commands.txt", "w", encoding="utf-8") as out:
        out.write("\n".join(commands))

    # --- Convert to Docling document (text + images) ---
    stream = io.BytesIO(pdf_bytes)
    doc_stream = DocumentStream(
        name=pdf_path, stream=stream, input_format=InputFormat.PDF
    )
    opts = PdfPipelineOptions(do_ocr=False, generate_picture_images=True)
    converter = DocumentConverter(
        format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=opts)}
    )
    converted = converter.convert(source=doc_stream).document

    # --- Shared doc_id counter ---
    doc_id = 0

    # --- Vision-Instruct on each image with progress bar ---
    # No per-item page filter is needed here: pdf_bytes already contains only
    # the first max_pages pages. The trailing number of a cref such as
    # "#/pictures/7" is an item index, not a page number, so comparing it with
    # max_pages would wrongly discard pictures and neighbouring text.
    pictures = []
    body_refs = converted.body.children
    for idx_ref, ref in enumerate(tqdm(body_refs, desc="Generating image JSON")):
        if not ref.cref.startswith("#/pictures/"):
            continue

        pic_idx = int(ref.cref.split("/")[-1])
        img = converted.pictures[pic_idx].get_image(converted)
        if img is None:
            continue

        # prepare JPEG + base64
        b64 = _picture_to_jpeg_b64(img)

        # neighboring text (position comes from enumerate, no list.index() scan)
        above = _neighbor_text(converted, body_refs, idx_ref - 1)
        below = _neighbor_text(converted, body_refs, idx_ref + 1)
        context_text = f"Text above:\n{above}\n\nText below:\n{below}"

        # call Watsonx vision-instruct
        generated = _describe_picture(b64, context_text, ref.cref)

        # increment and assign doc_id + source
        doc_id += 1
        pictures.append(
            Document(
                page_content=generated,
                metadata={
                    "doc_id": doc_id,
                    "source": pdf_path,
                    "type": "picture",
                    "ref": ref.cref,
                },
            )
        )

    # --- Chunk & annotate text with inline COLOR tags ---
    tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
    # NOTE(review): chunk_size / chunk_overlap are passed through unchanged —
    # confirm that the installed HybridChunker version honours them.
    chunker = HybridChunker(
        tokenizer=tokenizer, chunk_size=tokenizer.model_max_length, chunk_overlap=20
    )
    texts = []
    for chunk in chunker.chunk(converted):
        items = chunk.meta.doc_items
        # Chunks that consist of a single table item are left out of the index.
        if len(items) == 1 and isinstance(items[0], TableItem):
            continue
        # Truncate to the tokenizer's maximum length via an encode/decode round trip.
        safe = tokenizer.decode(
            tokenizer.encode(chunk.text)[:tokenizer.model_max_length],
            skip_special_tokens=True,
        )
        ann = safe
        for span in cli_spans:
            if span["cli"] in ann:
                r, g, b = span["color"]
                tag = f"<COLOR=({r},{g},{b})>{span['cli']}</COLOR>"
                # Only the first occurrence is wrapped for each recorded span.
                ann = ann.replace(span["cli"], tag, 1)

        # increment and assign doc_id + source
        doc_id += 1
        texts.append(
            LangDocument(
                page_content=ann,
                metadata={"doc_id": doc_id, "source": pdf_path},
            )
        )

    # --- Build vector store on text + image JSON ---
    embed = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    # delete=False keeps the temporary .db file on disk for the vector store to use.
    dbf = tempfile.NamedTemporaryFile(suffix=".db", delete=False).name
    vector_db = Milvus(
        embedding_function=embed,
        connection_args={"uri": dbf},
        auto_id=True,
        index_params={"index_type": "AUTOINDEX"},
        enable_dynamic_field=True
    )
    vector_db.add_documents(texts + pictures)

    print("Preprocessing complete; vector DB ready.")

# Execute preprocessing
# main() publishes txt_path, cli_spans, texts and vector_db as globals;
# the RAG cell below reads vector_db.
main()

  from .autonotebook import tqdm as notebook_tqdm
Generating image JSON: 100%|██████████| 39/39 [00:02<00:00, 15.17it/s]


Preprocessing complete; vector DB ready.


In [None]:
# === Cell 2: RAG with Annotated Context & Images ===

import os, csv
from dotenv import load_dotenv
from langchain_ibm import WatsonxLLM

load_dotenv()

# CSV header
# Mode "w" truncates the file, so re-running this cell discards earlier
# results; run_rag() later appends one row per question.
output_csv="rag_results.csv"
with open(output_csv,"w", newline="", encoding="utf-8") as f:
    csv.writer(f).writerow(["question","answer","context"])

# A static list of 30 representative CLI examples
static_clis = [
    "set system host-name <hostname>",
    "set system time-zone <zone>",
    "set system ntp server <ntp-server-ip-addr>",
    "set interfaces lo0 unit 0 family inet address <loopback0_ip_address/prefix>",
    "set system services ssh",
    "request chassis beacon fpc <slot> pic-slot <pic> port <port> on timer <minutes>",
    "request chassis beacon fpc <slot> off",
    "show firewall filter wan-in-v4-test-et-0/0/0.2001-i",
    "show firewall filter wan-in-v6-in-et-0/0/0.2001-i",
    "set firewall family inet filter CMU_CSID_V1100|1500|1600-INGR term BGP-IN from source-prefix-list CSID-BGP",
    "set firewall family inet6 filter wan-in-v6-in term 1 from traffic-class af41",
    "set firewall family inet6 filter wan-in-v6-in term 1 then count inbound-af11-v6-counter",
    "set firewall family inet6 filter wan-in-v6-in term default then accept",
    "set policy-options prefix-list CSID-BGP-V1100|1500|1600 <IPV4-SUBNETS>",
    "set class-of-service interfaces ge-x/y/z shaping-rate 300m",
    "show system software list",
    "request system software add /var/tmp/junos-evo-install-acx-f... no-validate reboot",
    "show chassis pic fpc-slot 0 pic-slot 0",
    "set chassis alarm management-ethernet link-down ignore",
    "set protocols lldp interface all disable",
    "set system no-redirects",
    "set system default-address-selection",
    "delete chassis auto-image-upgrade",
    "delete system commit factory-settings",
    "set system login class REMOTE idle-timeout 15",
    "set system login user <admin> encrypted-password \"<secret>\"",
    "set system ports console log-out-on-disconnect",
    "set system ports console type vt100",
    "set interfaces et-0/0/0 unit 2001 family inet filter input wan-in-v4-test",
    "set interfaces et-0/0/0 unit 2001 family inet6 filter input wan-in-v6-in",
]

# Pre-format the examples as a bullet list
CLI_CONTEXT = "\n".join(f"- {cmd}" for cmd in static_clis)

# Setup LLM
# Credentials come from the environment/.env (os.getenv returns None if unset).
API_KEY = os.getenv("WATSONX_API_KEY")
PROJECT_ID = os.getenv("WATSONX_PROJECT_ID")
# NOTE(review): `token` is not referenced by any code visible in this notebook
# (WatsonxLLM below is given the API key directly) — confirm whether this
# extra IAM round trip is still needed.
token = get_iam_access_token(API_KEY)
# Text-generation model used by run_rag() to answer the questions.
llm = WatsonxLLM(
    model_id="meta-llama/llama-3-405b-instruct",
    url="https://us-south.ml.cloud.ibm.com",
    apikey=API_KEY,
    project_id=PROJECT_ID
)


# Matches either an opening <COLOR=(r,g,b)> tag (optional spaces around the
# three integers) or a closing </COLOR> tag.
_COLOR_TAG_RX = re.compile(r"<COLOR=\(\s*\d+\s*,\s*\d+\s*,\s*\d+\s*\)>|</COLOR>")


def strip_color_tags(text: str) -> str:
    """Remove every ``<COLOR=(r,g,b)>`` / ``</COLOR>`` tag from ``text``.

    The content between tags is kept unchanged. Opening and closing tags are
    removed independently, so the result is identical to unwrapping balanced
    ``<COLOR=(r,g,b)>…</COLOR>`` pairs, and tags are additionally removed when
    they are unbalanced (an opening tag without its closing tag, or vice
    versa) or when the wrapped content spans several lines.

    Args:
        text: Text that may contain inline colour tags.

    Returns:
        ``text`` without any colour tags.
    """
    return _COLOR_TAG_RX.sub("", text)


def run_rag(question: str, k: int):
    """Answer ``question`` from the vector store and log the result to CSV.

    Args:
        question: Natural-language question to answer.
        k: Number of documents to retrieve as context.

    Returns:
        A tuple ``(answer, context)``: the model's answer with colour tags
        removed, and the retrieved context that was placed in the prompt.

    Side effects:
        Appends one ``[question, answer, context]`` row to ``output_csv``.
    """
    # Retrieve context from vector DB. k is configured on the retriever via
    # search_kwargs so that it is actually applied to the search, rather than
    # being passed as an extra keyword to invoke().
    retriever = vector_db.as_retriever(search_kwargs={"k": k})
    docs = retriever.invoke(question)
    context = "\n\n".join(d.page_content for d in docs)

    # Build prompt, inserting the examples before the actual context.
    # The role markers follow the Llama 3 instruct chat format
    # (<|start_header_id|>role<|end_header_id|> ... <|eot_id|>), matching the
    # meta-llama model configured for `llm`.
    prompt = (
        "<|begin_of_text|>"
        "<|start_header_id|>system<|end_header_id|>\n\n"
        "You are a Junos CLI expert. Use ONLY the context and inline COLOR tags to answer.\n"
        "You can use these examples to understand the CLI format:\n"
        f"{CLI_CONTEXT}\n\n"
        "Make sure your CLI outputs do not include the color tokens; they should only contain the CLIs. For example if you want to output: 'set firewall family inet filter wan-in-v4-test term 1 then count <COLOR=(0,176,80)>inbound', instead output: 'set firewall family inet filter wan-in-v4-test term 1 then count inbound'\n"
        "<|eot_id|>"
        "<|start_header_id|>user<|end_header_id|>\n\n"
        # Trailing newline keeps this note from running into the "Context:" label.
        "You will also see descriptions of images in JSON format. Understand that these represent content of images.\n"
        f"Context:\n{context}\n"
        f"Question: {question}\n"
        "<|eot_id|>"
        "<|start_header_id|>assistant<|end_header_id|>\n\n"
    )

    # Invoke the LLM
    response = llm.invoke(
        prompt,
        decoding_method="greedy",
        max_new_tokens=4096,
        temperature=0
    )
    # The model may echo colour tags from the context; remove them from the answer.
    answer = strip_color_tags(response.strip())

    # Log to CSV
    with open(output_csv, "a", newline="", encoding="utf-8") as f:
        csv.writer(f).writerow([question, answer, context])

    return answer, context

questions = [
    "What are the functions and labeling of each port on the ACX7024 front panel as shown in the port panel diagram?",
    "According to the dimension drawing, what are the exact width, depth, and height measurements of the ACX7024 chassis?",
    "How are the AC and DC power modules arranged in the ACX7024 chassis based on the power spec diagram?",
    "In 'WAN Connectivity Scenario #1,' what VLAN and loopback assignments are illustrated for a direct uplink to an RE?",
    "What airflow pattern is depicted in the cooling-system diagram, and how does 'airflow-out' operate?",
    "What logical separation of VRFs is shown in the VR design diagram, and which services map to each VR?",
    "Which transceiver types and connectors are depicted in the spare-parts photo, and how are they keyed or color-coded?",
    "How does the 100G, 200G, and 300G bundle configuration appear in the product-bundle pictogram, and what items are highlighted?",
    "In the 'Deployment: Physical Connection' schematic, how are uplinks and down-links shown between the CSR, RE, and RAN equipment?",
    "What steps are illustrated in the port-mirroring diagram in Appendix C, and how are the mirror and source ports connected?"
]

# Example usage
# Ask every question with 5 retrieved documents and print the answer; each call
# also appends a row to the results CSV. The returned context is discarded here.
for q in questions:
    print(f"\nQuestion: {q}\n")
    ans,_=run_rag(q, k=5)
    print(ans)
    print("==================================================")


Question: What are the functions and labeling of each port on the ACX7024 front panel as shown in the port panel diagram?

show chassis pic fpc-slot 0 pic-slot 0
show chassis pic fpc-slot 0 pic-slot 1
show chassis pic fpc-slot 0 pic-slot 2
show chassis pic fpc-slot 0 pic-slot 3
show chassis pic fpc-slot 0 pic-slot 4
show chassis pic fpc-slot 0 pic-slot 5
show chassis pic fpc-slot 0 pic-slot 6
show chassis pic fpc-slot 0 pic-slot 7
show chassis pic fpc-slot 0 pic-slot 8
show chassis pic fpc-slot 0 pic-slot 9
show chassis pic fpc-slot 0 pic-slot 10
show chassis pic fpc-slot 0 pic-slot 11
show chassis pic fpc-slot 0 pic-slot 12
show chassis pic fpc-slot 0 pic-slot 13
show chassis pic fpc-slot 0 pic-slot 14
show chassis pic fpc-slot 0 pic-slot 15
show chassis pic fpc-slot 0 pic-slot 16
show chassis pic fpc-slot 0 pic-slot 17
show chassis pic fpc-slot 0 pic-slot 18
show chassis pic fpc-slot 0 pic-slot 19
show chassis pic fpc-slot 0 pic-slot 20
show chassis pic fpc-slot 0 pic-slot 21
show c