In [2]:
import os

# Must be assigned before the ML imports below: an environment flag can only
# influence a library's import-time backend selection if it already exists when
# that library is first imported. Previously this ran after
# `sentence_transformers` had been imported.
# NOTE(review): confirm the installed transformers version honours
# TRANSFORMERS_NO_TF.
os.environ["TRANSFORMERS_NO_TF"] = "1"

import re
from pathlib import Path

from langchain.prompts import ChatPromptTemplate
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_core.embeddings import Embeddings
from langchain_core.output_parsers import StrOutputParser
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer


class STEmbeddings(Embeddings):
    """Adapter exposing a SentenceTransformer model through the `Embeddings` interface.

    Every vector is requested with ``normalize_embeddings=True`` and converted
    to plain Python lists via ``.tolist()``.
    """

    def __init__(self, model_name="sentence-transformers/all-MiniLM-L6-v2"):
        """Load the SentenceTransformer identified by ``model_name``."""
        self.model = SentenceTransformer(model_name)

    def _encode(self, texts):
        """Encode ``texts`` and return the result as nested lists."""
        encoded = self.model.encode(texts, normalize_embeddings=True)
        return encoded.tolist()

    def embed_documents(self, texts):
        """Return one embedding (list of floats) per entry of ``texts``."""
        return self._encode(texts)

    def embed_query(self, text):
        """Return the embedding of a single query string."""
        vectors = self._encode([text])
        return vectors[0]

# Single embedding adapter instance; it is passed to every Chroma store this
# notebook builds or reloads.
emb = STEmbeddings()

# My manuals
# Folder that holds the manuals. Set the MANUALS_DIR environment variable to
# run the notebook on another machine; the default keeps the original location.
MANUALS_DIR = Path(os.environ.get("MANUALS_DIR", r"C:\Users\HP\Downloads\manuals (PDF)"))
MANUAL_FILES = (
    "Honeywell.pdf",
    "Smart Touch Washing Machine.pdf",
    "Rittal.pdf",
    "Roborock Saros.pdf",
)
# Plain strings, as before; each file stem later becomes the `doc_name` metadata.
pdfs = [str(MANUALS_DIR / file_name) for file_name in MANUAL_FILES]

def guess_heading(text: str) -> str:
    """Return the first line of ``text`` that looks like a section heading.

    A stripped line qualifies when it is 5-80 characters long, does not end
    with a period, and either has at least half as many uppercase characters
    as alphabetic ones, or starts with "section <n>" (any case) or a dotted
    number such as "1.2". Returns "" when no line qualifies.
    """
    numbered = re.compile(r"^(section\s*\d+|\d+(?:\.\d+)+)\b", re.I)
    for line in text.splitlines():
        candidate = line.strip()
        # Too short, too long, or sentence-like lines are never headings.
        if not (5 <= len(candidate) <= 80) or candidate.endswith("."):
            continue
        # Floor the denominator at 1 so letter-free lines do not divide by zero.
        alpha_count = max(1, sum(1 for c in candidate if c.isalpha()))
        upper_count = sum(1 for c in candidate if c.isupper())
        if upper_count / alpha_count >= 0.5 or numbered.match(candidate):
            return candidate
    return ""

# Load every manual page by page and tag each page with the metadata that the
# citation and filtering helpers in this notebook read back later.
docs = []
for p in pdfs:
    name = Path(p).stem  # file name without extension, stored as doc_name
    pages = PyPDFLoader(p).load()
    for d in pages:
        d.metadata["doc_name"] = name
        pg = d.metadata.get("page", None)
        # Integer page values are shifted by +1 (presumably the loader's index
        # is 0-based — confirm); any other value is stored unchanged.
        d.metadata["page_number"] = (pg + 1) if isinstance(pg, int) else pg
        # One heading guess per whole page.
        d.metadata["section_heading"] = guess_heading(d.page_content)
        docs.append(d)

# Split the tagged pages into chunks (chunk_size=900, chunk_overlap=120).
splitter = RecursiveCharacterTextSplitter(chunk_size=900, chunk_overlap=120)
chunks = splitter.split_documents(docs)

# Embed the chunks into a Chroma store persisted under ./chroma_store.
# NOTE(review): no collection name is given and nothing checks for an existing
# store — presumably re-running this cell adds the same chunks again; confirm.
vs = Chroma.from_documents(chunks, emb, persist_directory="./chroma_store")
retriever = vs.as_retriever(search_kwargs={"k": 5})

# Prompt for the optional LLM path: answer only from the retrieved manual
# text, say so when the answer is missing, and cite doc_name / page_number.
# Template variables: {history}, {question}, {context}.
prompt = ChatPromptTemplate.from_template(
    "You are a helpful assistant. Answer strictly from the manuals in the context. "
    "If it is not in the manuals, say you don't know. "
    "Always include citations like (Doc: <doc_name>, p.<page_number>).\n\n"
    "Conversation so far:\n{history}\n\n"
    "Question: {question}\n\n"
    "Context:\n{context}"
)

chat_history = []


def history_text():
    """Return the six most recent transcript lines joined by newlines."""
    recent_lines = chat_history[-6:]
    return "\n".join(recent_lines)

def make_context(docs):
    """Render retrieved chunks as citation-headed blocks joined by newlines.

    Each block is a "(Doc: <doc_name>, p.<page_number>[, Sec: <heading>])"
    header line followed by the chunk text; the section part is omitted when
    the chunk has no heading.
    """
    def render(doc):
        meta = doc.metadata
        section = meta.get("section_heading") or ""
        suffix = ", Sec: " + section if section else ""
        header = f"(Doc: {meta.get('doc_name')}, p.{meta.get('page_number')}{suffix})"
        return f"{header}\n{doc.page_content}"

    return "\n".join(render(doc) for doc in docs)

def llm_fallback_answer(q, ctx_docs):
    """Build an extractive answer from the first retrieved chunk (no LLM).

    ``q`` is accepted for signature parity with the LLM path but is not used.
    Returns a fixed "don't know" sentence when ``ctx_docs`` is empty; otherwise
    up to four sentences of 41-180 characters from the first chunk as bullets
    (or the first 500 characters when none qualify), followed by a Sources
    line citing that chunk.
    """
    if not ctx_docs:
        return "I don't know; this isn’t in the manuals."

    top = ctx_docs[0]
    # Collapse all whitespace runs so sentence lengths are comparable.
    flat_text = " ".join(top.page_content.split())

    bullets = []
    for sentence in re.split(r'(?<=[.!?])\s+', flat_text):
        if len(bullets) == 4:
            break
        if 40 < len(sentence) <= 180:
            bullets.append(sentence)

    if bullets:
        body = "- " + "\n- ".join(bullets)
    else:
        body = flat_text[:500]

    source = f"(Doc: {top.metadata.get('doc_name')}, p.{top.metadata.get('page_number')})"
    return f"{body}\n\nSources: {source}"

LLM = None


def ask(question: str):
    """Answer ``question`` from the manuals, print the answer, and log the turn.

    Retrieval uses the module-level ``retriever`` with no per-manual filter.
    When ``LLM`` is set, the answer comes from ``prompt | LLM`` using the
    recent chat history; otherwise ``llm_fallback_answer`` extracts sentences
    from the top chunk. Returns None; the question and answer are appended to
    ``chat_history``.
    """
    # `invoke` is the supported retriever entry point; `get_relevant_documents`
    # is deprecated in current LangChain releases.
    ctx_docs = retriever.invoke(question)
    context = make_context(ctx_docs)
    if LLM is not None:
        chain = prompt | LLM | StrOutputParser()
        answer = chain.invoke({"history": history_text(), "question": question, "context": context})
    else:
        answer = llm_fallback_answer(question, ctx_docs)
    print(answer)
    # The list is mutated in place, so no `global` declaration is required.
    chat_history.append(f"User: {question}")
    chat_history.append(f"Assistant: {answer}")





incorrect startxref pointer(1)
parsing for Object Streams
incorrect startxref pointer(1)
parsing for Object Streams
incorrect startxref pointer(1)
parsing for Object Streams


In [3]:
# Single-manual factual question; ask() retrieves across all manuals unfiltered.
ask("How do I enter and navigate the Installer Setup (ISU) menu?")        # Honeywell


  ctx_docs = retriever.get_relevant_documents(question)


- Installer setup (ISU) 1 Press and hold CENTER and buttons for approximately 3 seconds to enter advanced menu.
- 3 Press Select to cycle through menu setup options.
- 4 Press or to change values or select from available options.
- 5 Press Select and confirm your settings or press Back to ignore changes and return to ISU menu screen to continue editing another setup option.

Sources: (Doc: Honeywell, p.13)


In [4]:
# Unfiltered retrieval; the trailing comment names the manual the question targets.
ask("List the safety precautions before installation.")                    # Rittal


- for measuring and control technology in indoor spaces.
- Should you have any special requirements, and for outdoor installation, the suitability of the product must be verified by Rittal.
- Because a few residual risks cannot be precluded, the following notes must be observed.
- Danger to life and health in case of non-observance of the safety instructions.

Sources: (Doc: Rittal, p.29)


In [5]:
# The extractive fallback lists sentences from the top chunk in document order,
# not by relevance; in the recorded run only the first bullet mentions WiFi.
ask("How do I reset Wi-Fi to factory defaults?")                           # Roborock Saros


- If you want to reconnect, reset the WiFi before proceeding.
- Instructions for Use Filling the Clean Water Tank Lift the clean water tank, open its lid, and then fill it with tap water.
- After filling, close the lid, lock the latch, and then put the clean water tank back to the dock.

Sources: (Doc: Roborock Saros, p.13)


In [6]:
# In the recorded run this washing-machine question was answered from a
# Roborock Saros chunk: ask() applies no per-manual filter.
ask("How often should I clean the pump/pump filter?")                      # Smart Touch Washing Machine


- Note: Do not touch the surface of the filter with hands, brushes, or hard objects to avoid potential damage.
- D6-3 —Allow at least 24 hours for the filter to dry thoroughly before reinstalling it.
- Mop Cloths * Clean as required and replace every 1-3 months.
- D7-1 —Remove the mop cloths from mop cloth mounts.

Sources: (Doc: Roborock Saros, p.23)


In [7]:
# Follow-up questions. With LLM unset, ask() answers through
# llm_fallback_answer, which reads only the top retrieved chunk and never
# consults chat_history, so earlier turns cannot influence these answers.
ask("Which options can I change in ISU?")
ask("How do I exit ISU and save changes?")

- Installer setup (ISU) 1 Press and hold CENTER and buttons for approximately 3 seconds to enter advanced menu.
- 3 Press Select to cycle through menu setup options.
- 4 Press or to change values or select from available options.
- 5 Press Select and confirm your settings or press Back to ignore changes and return to ISU menu screen to continue editing another setup option.

Sources: (Doc: Honeywell, p.13)
- Installer setup (ISU) 1 Press and hold CENTER and buttons for approximately 3 seconds to enter advanced menu.
- 3 Press Select to cycle through menu setup options.
- 4 Press or to change values or select from available options.
- 5 Press Select and confirm your settings or press Back to ignore changes and return to ISU menu screen to continue editing another setup option.

Sources: (Doc: Honeywell, p.13)


In [8]:
# Cross-manual
# Without an LLM there is no comparison step: the fallback quotes only the
# single top-ranked chunk.
ask("Compare the electrical installation safety guidance in Honeywell vs Rittal.")


- Service-HUB Indien Service HUB India RITTAL India Pvt.

Sources: (Doc: Rittal, p.31)


In [9]:
# Out-of-scope
# The fallback only declines when retrieval returns nothing; with LLM unset an
# unrelated question still gets sentences from whichever chunk ranks first.
ask("What will Egypt’s electricity tariffs be in 2026?")

- it fully charged, unplugged, and switched off.
- Recharge it at least every three months to avoid battery overdischarge.
- To transport the product, be sure to unplug the dock, switch off the robot, and empty water tanks.
- In addition, drain and rinse the cleaning solution cartridge, leave it air dry completely, and then reinstall it.

Sources: (Doc: Roborock Saros, p.35)


In [10]:
from langchain_chroma import Chroma  # <-- NEW

PERSIST_DIR = "./chroma_store"
COLLECTION  = "manuals"  # keep it explicit


def build_or_load_chroma(chunks, emb, persist_dir=PERSIST_DIR, collection=COLLECTION):
    """Return a Chroma store for ``collection``, reusing a persisted one if possible.

    When ``persist_dir`` exists, the collection is opened and reused if it
    reports at least one vector. If it is empty, or opening it raises, the
    store is built from ``chunks`` with ``emb`` and persisted to
    ``persist_dir``.
    """
    from pathlib import Path

    reusable = None
    if Path(persist_dir).exists():
        try:
            candidate = Chroma(
                persist_directory=persist_dir,
                embedding_function=emb,
                collection_name=collection,
            )
            # Without a `_collection` attribute the size cannot be checked, so
            # the store is treated as populated.
            vector_count = candidate._collection.count() if hasattr(candidate, "_collection") else 1
            if vector_count and vector_count > 0:
                reusable = candidate
        except Exception as err:
            print("Rebuilding Chroma store due to:", err)

    if reusable is not None:
        print(" Reloaded existing Chroma store")
        return reusable

    print("⚙️ Building Chroma store (first run)…")
    built = Chroma.from_documents(
        chunks, emb,
        persist_directory=persist_dir,
        collection_name=collection,
    )
    print(" Saved to", persist_dir)
    return built

# Rebind the module-level store and retriever so later cells query the
# explicitly named collection returned by build_or_load_chroma.
vs = build_or_load_chroma(chunks, emb)
retriever = vs.as_retriever(search_kwargs={"k": 5})


⚙️ Building Chroma store (first run)…
 Saved to ./chroma_store


In [11]:
# how many vectors are in the store?
n = vs._collection.count() if hasattr(vs, "_collection") else "?"
print("Vectors in store:", n)

# try a quick retrieval
# Use a dedicated name here: `docs` already holds the loaded PDF pages that
# `chunks` is split from, and reassigning it would silently break a re-run of
# the splitting step. `invoke` replaces the deprecated `get_relevant_documents`.
sample_hits = retriever.invoke("reset Wi-Fi to factory defaults")
for hit in sample_hits[:3]:
    print(hit.metadata.get("doc_name"), hit.metadata.get("page_number"), hit.metadata.get("section_heading"))


Vectors in store: 168
Roborock Saros 13 
Roborock Saros 18 
Roborock Saros 13 


In [12]:
def retrieve_filtered(query: str, k=5, manual: str | None = None, section: str | None = None):
    # metadata filter on manual via Chroma; section is a flexible substring match
    where = {"doc_name": manual} if manual else None
    pool_k = max(k * 5, 40) if (manual or section) else k
    docs = vs.similarity_search(query, k=pool_k, filter=where) if where else vs.similarity_search(query, k=pool_k)
    if section:
        docs = [d for d in docs if section.lower() in (d.metadata.get("section_heading") or "").lower()]
    return docs[:k]

def ask_filtered(question: str, manual: str | None = None, section: str | None = None, k: int = 5):
    ctx_docs = retrieve_filtered(question, k=k, manual=manual, section=section)
    context = make_context(ctx_docs)
    answer = llm_fallback_answer(question, ctx_docs) if LLM is None else (prompt | LLM | StrOutputParser()).invoke(
        {"history": "\n".join(chat_history[-6:]), "question": question, "context": context}
    )
    print(answer)
    chat_history.append(f"User: {question}")
    chat_history.append(f"Assistant: {answer}")


In [13]:
# Same question set as the unfiltered cells above, now routed through
# ask_filtered so the `manual` argument restricts retrieval to one document.
# 1) Direct factual (one per manual)
ask_filtered("How do I enter and navigate the Installer Setup (ISU) menu?", manual="Honeywell", k=6)
ask_filtered("List the safety precautions before installation.", manual="Rittal", k=8)
ask_filtered("How do I reset Wi-Fi to factory defaults?", manual="Roborock Saros", k=6)
ask_filtered("How often should I clean the pump/pump filter?", manual="Smart Touch Washing Machine", k=6)

# 2) Follow-ups (memory) – stay on Honeywell by passing manual once more
ask_filtered("Which options can I change in ISU?", manual="Honeywell", k=8)
ask_filtered("How do I exit ISU and save changes?", manual="Honeywell", k=8)

# 3) Cross-manual (no `manual` argument, so retrieval spans every document)
ask_filtered("Compare the electrical installation safety guidance in Honeywell vs Rittal.", k=8)

# 4) Out-of-scope
ask_filtered("What will Egypt’s electricity tariffs be in 2026?", k=6)


- Installer setup (ISU) 1 Press and hold CENTER and buttons for approximately 3 seconds to enter advanced menu.
- 3 Press Select to cycle through menu setup options.
- 4 Press or to change values or select from available options.
- 5 Press Select and confirm your settings or press Back to ignore changes and return to ISU menu screen to continue editing another setup option.

Sources: (Doc: Honeywell, p.13)
- for measuring and control technology in indoor spaces.
- Should you have any special requirements, and for outdoor installation, the suitability of the product must be verified by Rittal.
- Because a few residual risks cannot be precluded, the following notes must be observed.
- Danger to life and health in case of non-observance of the safety instructions.

Sources: (Doc: Rittal, p.29)
- If you want to reconnect, reset the WiFi before proceeding.
- Instructions for Use Filling the Clean Water Tank Lift the clean water tank, open its lid, and then fill it with tap water.
- After fi

### BONUS: persistence + metadata filtering + summarization


In [14]:

from pathlib import Path
from langchain_chroma import Chroma
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

# --- 1) Persist / reload Chroma so you don't rebuild every run ---
# Default directory and collection name used by build_or_load_chroma below.
PERSIST_DIR = "./chroma_store"
COLLECTION  = "manuals"

# --- OOS guard using relevance scores + light heuristics ---
# Lower-case substrings: looks_oos() treats a query containing any of them as
# in-domain. Matching is a plain substring test on the lower-cased query, so a
# query spelled "Wi-Fi" does not match the "wifi" hint.
DOMAIN_HINTS = (
    "thermostat","honeywell","t6","vx25","rittal","enclosure","earthing",
    "washing machine","detergent","filter","pump","lint",
    "roborock","saros","robot","wifi","reset"
)

def get_docs_with_scores(query: str, k: int, manual: str | None = None):
    """Use Chroma's scores; returns list[(doc, score in 0..1)]."""
    where = {"doc_name": manual} if manual else None
    pairs = vs.similarity_search_with_relevance_scores(query, k=k, filter=where)  # [(doc, score), ...]
    # Some versions may return distance, not similarity; if you see very small numbers (e.g. 0.01),
    # just lower thresholds below.
    return pairs

def looks_oos(query: str, pairs: list[tuple], lo: float = 0.32, mid: float = 0.25) -> bool:
    """Decide whether ``query`` is out of scope (OOS) for the indexed manuals.

    Args:
        query: The user's question.
        pairs: ``[(doc, score), ...]`` as returned by ``get_docs_with_scores``.
        lo: Minimum acceptable top score.
        mid: Minimum acceptable median score.

    Returns:
        True when there are no results, or when the top score is below ``lo``
        AND the median score is below ``mid`` AND the query contains none of
        the ``DOMAIN_HINTS`` substrings. False otherwise.
    """
    from statistics import median  # stdlib; imported locally so the cell stands alone

    if not pairs:
        return True

    scores = [float(score) for _, score in pairs]
    # A true median: for an even number of scores this averages the two middle
    # values instead of picking the upper one, which overstated the median and
    # made the guard less likely to trigger.
    scores_are_low = max(scores) < lo and median(scores) < mid

    query_lower = query.lower()
    has_domain_word = any(hint in query_lower for hint in DOMAIN_HINTS)

    # Low scores alone are not enough; a domain hint keeps the query in scope.
    return scores_are_low and not has_domain_word



def build_or_load_chroma(chunks, emb, persist_dir=PERSIST_DIR, collection=COLLECTION, rebuild: bool = False):
    """
    If a persisted Chroma DB exists, reload it; otherwise build and save.
    Set rebuild=True to force a fresh index.

    Args:
        chunks: Documents to index when a build is needed.
        emb: Embedding function handed to Chroma.
        persist_dir: Directory of the persisted store.
        collection: Collection name inside the store.
        rebuild: When True, skip the reload and drop any existing collection
            before indexing.

    Returns:
        The reloaded or newly built Chroma store.
    """
    store_exists = Path(persist_dir).exists()

    if store_exists and not rebuild:
        try:
            store = Chroma(persist_directory=persist_dir, embedding_function=emb, collection_name=collection)
            cnt = store._collection.count() if hasattr(store, "_collection") else 0
            if cnt and cnt > 0:
                print(f"Reloaded existing Chroma store ({cnt} vectors)")
                return store
        except Exception as e:
            print("Reload failed, rebuilding:", e)
    elif store_exists:
        # rebuild=True: clear the old collection first. from_documents adds to
        # whatever is already persisted under this name, so without this step a
        # "fresh" index would keep the old vectors alongside the new ones.
        try:
            Chroma(
                persist_directory=persist_dir,
                embedding_function=emb,
                collection_name=collection,
            ).delete_collection()
        except Exception as e:
            print("Could not clear existing collection, building anyway:", e)

    print("Building Chroma store…")
    store = Chroma.from_documents(chunks, emb, persist_directory=persist_dir, collection_name=collection)
    print("Saved to", persist_dir)
    return store

# Module-level store and retriever used by the helpers defined in this cell;
# the default rebuild=False reuses a persisted collection when one is found.
vs = build_or_load_chroma(chunks, emb)
retriever = vs.as_retriever(search_kwargs={"k": 5})


# --- 2) Metadata-based filtering (manual / section) ---
def retrieve_filtered(query: str, k: int = 5, manual: str | None = None, section: str | None = None):
    """
    - manual: exact match on metadata 'doc_name' (server-side filter in Chroma)
    - section: substring match on 'section_heading' (client-side for flexibility)
    """
    where = {"doc_name": manual} if manual else None
    pool_k = max(k * 5, 40) if (manual or section) else k  # retrieve wider pool if we plan to filter
    docs = vs.similarity_search(query, k=pool_k, filter=where) if where else vs.similarity_search(query, k=pool_k)
    if section:
        s = section.lower()
        docs = [d for d in docs if s in (d.metadata.get("section_heading") or "").lower()]
    return docs[:k]

def _format_citations(docs, max_refs=4):
    seen, cites = set(), []
    for d in docs:
        key = (d.metadata.get("doc_name"), d.metadata.get("page_number"))
        if key not in seen:
            seen.add(key); cites.append(f"(Doc: {key[0]}, p.{key[1]})")
        if len(cites) >= max_refs: break
    return ", ".join(cites)

def _make_context(docs):
    return "\n\n".join(
        f"(Doc: {d.metadata.get('doc_name')}, p.{d.metadata.get('page_number')}, Sec: {d.metadata.get('section_heading') or ''})\n{d.page_content}"
        for d in docs
    )

def ask_filtered(question: str, manual: str | None = None, section: str | None = None, k: int = 5):
    """
    Ask a question while restricting retrieval by manual and/or section metadata.
    Uses your global LLM if defined (LLM != None), otherwise a simple extractive fallback.
    """
    docs = retrieve_filtered(question, k=k, manual=manual, section=section)
    if not docs:
        print("I couldn't find matching content in the manuals.")
        return None

    context = _make_context(docs)
    cites   = _format_citations(docs)

    if "LLM" in globals() and LLM is not None:
        prompt = ChatPromptTemplate.from_template(
            "Answer strictly from the context. If not in context/manuals, say you don't know.\n"
            "Include citations like (Doc: X, p.Y).\n\n"
            "Question: {q}\n\nContext:\n{ctx}"
        )
        answer = (prompt | LLM | StrOutputParser()).invoke({"q": question, "ctx": context})
    else:
        import re
        text = " ".join(docs[0].page_content.split())
        sents = re.split(r'(?<=[.!?])\s+', text)
        bullets = [s for s in sents if 40 < len(s) <= 180][:5]
        answer = ("- " + "\n- ".join(bullets)) if bullets else text[:500]
        answer += f"\n\nSources: {cites}"

    print(answer)
    return answer


# --- 3) Summarization (manual/section aware) ---
def _compress_to_bullets(text: str, max_lines: int = 8) -> str:
    import re
    text = " ".join(text.split())
    sents = re.split(r'(?<=[.!?])\s+', text)
    keep = [s for s in sents if 40 < len(s) <= 180][:max_lines]
    return "- " + "\n- ".join(keep) if keep else text[:600]

def summarize(manual: str | None = None, section: str | None = None, query_hint: str | None = None, k: int = 12):
    """
    Summarize the most relevant chunks for a given manual/section (or both).
    - manual: filter to a single manual
    - section: restrict by section keyword
    - query_hint: optional term to steer retrieval (e.g., 'Maintenance', 'Safety')
    """
    q = query_hint or (section or "overview")
    docs = retrieve_filtered(q, k=k, manual=manual, section=section)
    if not docs:
        print("No matching context to summarize.")
        return None

    context_text = "\n\n".join(d.page_content for d in docs)

    if "LLM" in globals() and LLM is not None:
        sum_prompt = ChatPromptTemplate.from_template(
            "Summarize the key procedures and cautions in 5–8 concise bullets based strictly on the context.\n\n{ctx}"
        )
        out = (sum_prompt | LLM | StrOutputParser()).invoke({"ctx": context_text})
    else:
        out = _compress_to_bullets(context_text, max_lines=8)

    cites = _format_citations(docs)
    print(out + f"\n\nSources: {cites}")
    return out


Reloaded existing Chroma store (168 vectors)


In [15]:
# Restrict retrieval to one manual
ask_filtered("List the safety precautions before installation.", manual="Rittal", k=8)

# Restrict to a section within a manual
# (`section` is matched against the guessed per-page heading; in the recorded
# run no heading matched and the "couldn't find" notice was printed.)
ask_filtered("How do I reset Wi-Fi to factory defaults?", manual="Roborock Saros", section="Reset WiFi", k=8)

# Summarize a section
summarize(manual="Smart Touch Washing Machine", section="Maintenance", k=12)

# Summarize safety notes across manuals
# (last expression of the cell, so its return value is echoed after the print)
summarize(section="Safety", query_hint="safety warnings", k=12)


- for measuring and control technology in indoor spaces.
- Should you have any special requirements, and for outdoor installation, the suitability of the product must be verified by Rittal.
- Because a few residual risks cannot be precluded, the following notes must be observed.
- Danger to life and health in case of non-observance of the safety instructions.
- Risk of crushing during installation of mounting plate

Sources: (Doc: Rittal, p.29), (Doc: Rittal, p.2), (Doc: Rittal, p.5), (Doc: Rittal, p.25)
I couldn't find matching content in the manuals.
- which are hard to eliminate from laundry.
- In this case, do not re-rinse to eliminate these effects: it will not help at all.
- We suggest conducting a maintenance wash using a proprietary cleaner.
- If the problem persists or if you suspect a malfunction, immediately contact an Authorised Customer Service Centre.
- CLEANING AND MAINTENANCE Looking after your appliance correctly can extend its lifespan.
- Cleaning the appliance's exte

"- If the appliance is used in a manner inconsistent with this it may reduce the life of the appliance and may void the manufacturer's warranty.\n- manufacturer to the fullest extent permitted by law.\n- Children shall not play with the appliance.\n- Cleaning and user maintenance shall not be made by children without supervision.\n- Children should be supervised to ensure that they do not play with the appliance.\n- Children of less than 3 years should be kept away unless continuously supervised.\n- If the supply cord is damaged, it must be replaced by a special cord or assembly available from the manufacturer or its service agent.\n- WEEE subjected to specific treatments, in order to remove and dispose properly all pollutants, and recover and recycle all materials."