In [30]:
! pip install langchain_community tiktoken langchain-openai langchainhub chromadb langchain cohere



In [31]:
from PyPDF2 import PdfReader, PdfWriter

# Input and output file paths
input_pdf = "c5.pdf"
output_pdf = "output.pdf"

# Pages are zero-based in PyPDF2 (so page 8 is index 7, page 740 is index 739)
skip_start = 7      # page 8
skip_end = 738      # page 739 (excludes 740)

reader = PdfReader(input_pdf)
writer = PdfWriter()

for i in range(len(reader.pages)):
    if not (skip_start <= i <= skip_end):
        writer.add_page(reader.pages[i])

with open(output_pdf, "wb") as f:
    writer.write(f)

print(f"Saved new PDF to {output_pdf}")


Saved new PDF to output.pdf


In [32]:
input_pdf = "old.pdf"
reader = PdfReader(input_pdf)

# --- Extract page 1 ---
page1_writer = PdfWriter()
page1_writer.add_page(reader.pages[0])  # page 1 = index 0
with open("Artemis_v6.1_PIP_Front_Connectors_IOs.pdf", "wb") as f:
    page1_writer.write(f)

# --- Extract pages 2–7 ---
cre_writer = PdfWriter()
for i in range(1, 7):  # pages 2–7 → indices 1 to 6
    cre_writer.add_page(reader.pages[i])
with open("CRE_arguments.pdf", "wb") as f:
    cre_writer.write(f)

# --- Main document (everything from page 8 onwards) ---
main_writer = PdfWriter()
for i in range(7, len(reader.pages)):  # start from page 8 (index 7)
    main_writer.add_page(reader.pages[i])

with open("output.pdf", "wb") as f:
    main_writer.write(f)

print("✅ Done:")
print(" - Artemis_v6.1_PIP_Front_Connectors_IOs.pdf (page 1)")
print(" - CRE_arguments.pdf (pages 2–7)")
print(" - output.pdf (pages 8 and onwards)")


✅ Done:
 - Artemis_v6.1_PIP_Front_Connectors_IOs.pdf (page 1)
 - CRE_arguments.pdf (pages 2–7)
 - output.pdf (pages 8 and onwards)


In [33]:
import os
import re
from PyPDF2 import PdfReader

input_pdf = "output.pdf"
output_dir = "pages"
os.makedirs(output_dir, exist_ok=True)

# Read all text from PDF
reader = PdfReader(input_pdf)
full_text = ""
for page in reader.pages:
    text = page.extract_text()
    if text:
        full_text += text + "\n"

# Split by --- Parsing:
chunks = full_text.split("--- Parsing:")

for chunk in chunks:
    chunk = chunk.strip()
    if not chunk:
        continue

    # Look for the start of the URL
    url_match = re.search(r"https://sites\.google\.com/a/valeo\.com/castle-project/(.*?)---", chunk, re.S)
    if not url_match:
        continue

    # Extract the filename part (strip spaces/newlines)
    filename_part = url_match.group(1).strip().replace(" ", "").replace("\n", "")
    filename = filename_part.replace("/", "_") + ".txt"
    filepath = os.path.join(output_dir, filename)

    # Content is everything after the URL + --- marker
    content_start = url_match.end()
    content = chunk[content_start:].strip()

    with open(filepath, "w", encoding="utf-8") as f:
        f.write(content)

    print(f"Saved: {filepath}")


KeyboardInterrupt: 

In [None]:
from PyPDF2 import PdfReader

pdf_files = ["Artemis_v6.1_PIP_Front_Connectors_IOs.pdf", "CRE_arguments.pdf"]

for pdf_file in pdf_files:
    reader = PdfReader(pdf_file)
    text = ""
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:
            text += page_text + "\n"

    txt_file = pdf_file.replace(".pdf", ".txt")
    with open(txt_file, "w", encoding="utf-8") as f:
        f.write(text.strip())

    print(f"✅ Saved: {txt_file}")


✅ Saved: Artemis_v6.1_PIP_Front_Connectors_IOs.txt
✅ Saved: CRE_arguments.txt


In [None]:
import os

folder = "pages"

for filename in os.listdir(folder):
    if filename.endswith(".txt"):
        filepath = os.path.join(folder, filename)
        with open(filepath, "r", encoding="utf-8") as f:
            content = f.read().strip()
            word_count = len(content.split())

        if word_count < 20:
            os.remove(filepath)
            print(f"🗑️ Deleted {filename} (only {word_count} words)")
        else:
            print(f"✅ Kept {filename} ({word_count} words)")


🗑️ Deleted about-us.txt (only 2 words)
✅ Kept about-us_the-team.txt (313 words)
🗑️ Deleted additional-offerings.txt (only 8 words)
🗑️ Deleted additional-offerings_arxml-generation-porcshe-sv62.txt (only 6 words)
🗑️ Deleted additional-offerings_castle-lite_raspberry-pi-as-a-castle-pim.txt (only 6 words)
🗑️ Deleted additional-offerings_virtual-castle.txt (only 2 words)
🗑️ Deleted additional-offerings_virtual-castle_vpim-user-guide-castle-virtual-pim.txt (only 3 words)
✅ Kept Artemis_v6.1_PIP_Front_Connectors_IOs.txt (141 words)
✅ Kept CRE_arguments.txt (1432 words)
🗑️ Deleted general-topics.txt (only 3 words)
🗑️ Deleted general-topics_autosar.txt (only 1 words)
✅ Kept general-topics_autosar_autosar42containersecuredpdus.txt (740 words)
🗑️ Deleted general-topics_can.txt (only 1 words)
🗑️ Deleted general-topics_can_canbustimingsamplingandconfiguration.txt (only 13 words)
✅ Kept general-topics_castle-6-bip-ethernet-interface.txt (442 words)
✅ Kept general-topics_castle-programming-adaptor.t

In [None]:
import os



In [None]:
import os
from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

pages_folder = "pages"
summaries_folder = "summaries"
os.makedirs(summaries_folder, exist_ok=True)

# Load docs from pages folder
docs = []
for filename in os.listdir(pages_folder):
    if filename.endswith(".txt"):
        filepath = os.path.join(pages_folder, filename)
        with open(filepath, "r", encoding="utf-8") as f:
            content = f.read().strip()
        if content:
            docs.append(Document(page_content=content, metadata={"source": filename}))

print(f"✅ Loaded {len(docs)} documents from {pages_folder}")

# Define chain
chain = (
    {"doc": lambda x: x.page_content, "name": lambda x: x.metadata["source"]}
    | ChatPromptTemplate.from_template(
        "You are summarizing a document extracted from a file.\n\n"
        "File Name: {name}\n\n"
        "Document Content:\n{doc}\n\n"
        "Please provide a clear, concise summary."
    )
    | ChatOpenAI(model="gpt-4o-mini", max_retries=2)  # ✅ updated model + retries
    | StrOutputParser()
)

# Run summarization with lower concurrency
summaries = chain.batch(docs, {"max_concurrency": 2})

# Save each summary
for doc, summary in zip(docs, summaries):
    filename = doc.metadata["source"]
    summary_filename = filename.replace(".txt", ".summary.txt")
    summary_path = os.path.join(summaries_folder, summary_filename)
    with open(summary_path, "w", encoding="utf-8") as f:
        f.write(f"Source File: {filename}\n\nSummary:\n{summary}")
    print(f"📝 Saved summary: {summary_path}")


✅ Loaded 541 documents from pages
📝 Saved summary: summaries\about-us_the-team.summary.txt
📝 Saved summary: summaries\Artemis_v6.1_PIP_Front_Connectors_IOs.summary.txt
📝 Saved summary: summaries\CRE_arguments.summary.txt
📝 Saved summary: summaries\general-topics_autosar_autosar42containersecuredpdus.summary.txt
📝 Saved summary: summaries\general-topics_castle-6-bip-ethernet-interface.summary.txt
📝 Saved summary: summaries\general-topics_castle-programming-adaptor.summary.txt
📝 Saved summary: summaries\general-topics_ethernet_lwip-v6-neighbor-discovery.summary.txt
📝 Saved summary: summaries\general-topics_ethernet_simulate-nodes-from-arxml-in-canoe.summary.txt
📝 Saved summary: summaries\general-topics_ethernet_tcp.summary.txt
📝 Saved summary: summaries\general-topics_ethernet_tcpsegmentation.summary.txt
📝 Saved summary: summaries\general-topics_ethernet_tls-in-castle.summary.txt
📝 Saved summary: summaries\general-topics_ethernet_understandmaclayer.summary.txt
📝 Saved summary: summaries\

In [None]:
import os
from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

api_pages_folder = "api_pages"
api_summaries_folder = "api_summaries"
os.makedirs(api_summaries_folder, exist_ok=True)

# Load docs from pages folder
api_docs = []
for filename in os.listdir(api_pages_folder):
    if filename.endswith(".md"):
        filepath = os.path.join(api_pages_folder, filename)
        with open(filepath, "r", encoding="utf-8") as f:
            content = f.read().strip()
        if content:
            api_docs.append(Document(page_content=content, metadata={"source": filename}))

print(f"✅ Loaded {len(api_docs)} documents from {api_pages_folder}")

# Define chain
chain = (
    {"doc": lambda x: x.page_content, "name": lambda x: x.metadata["source"]}
    | ChatPromptTemplate.from_template(
        "You are summarizing a document extracted from a file.\n\n"
        "File Name: {name}\n\n"
        "Document Content:\n{doc}\n\n"
        "Please provide a clear, concise summary."
    )
    | ChatOpenAI(model="gpt-3.5-turbo", max_retries=0)
    | StrOutputParser()
)

# Run summarization
api_summaries = chain.batch(api_docs, {"max_concurrency": 5})

# Save each summary
for doc, summary in zip(api_docs, api_summaries):
    filename = doc.metadata["source"]
    summary_filename = filename.replace(".md", ".summary.txt")
    summary_path = os.path.join(api_summaries_folder, summary_filename)
    with open(summary_path, "w", encoding="utf-8") as f:
        f.write(f"Source File: {filename}\n\nSummary:\n{summary}")
    print(f"📝 Saved summary: {summary_path}")


✅ Loaded 240 documents from api_pages
📝 Saved summary: api_summaries\AdcControl.summary.txt
📝 Saved summary: api_summaries\AkSensors.summary.txt
📝 Saved summary: api_summaries\AngleDataSync.summary.txt
📝 Saved summary: api_summaries\Backlight_Feature.summary.txt
📝 Saved summary: api_summaries\Bap.summary.txt
📝 Saved summary: api_summaries\Base6Io.summary.txt
📝 Saved summary: api_summaries\BaseIo.summary.txt
📝 Saved summary: api_summaries\BaseSensors.summary.txt
📝 Saved summary: api_summaries\BipIo.summary.txt
📝 Saved summary: api_summaries\Calibrate.summary.txt
📝 Saved summary: api_summaries\CanBap.summary.txt
📝 Saved summary: api_summaries\CanDiag.summary.txt
📝 Saved summary: api_summaries\CanDiagSlave.summary.txt
📝 Saved summary: api_summaries\CanDriver.summary.txt
📝 Saved summary: api_summaries\CanXcp.summary.txt
📝 Saved summary: api_summaries\Clamp.summary.txt
📝 Saved summary: api_summaries\Climate.summary.txt
📝 Saved summary: api_summaries\ComLinSched_Tables.summary.txt
📝 Saved su

In [None]:
import os
import uuid
from langchain_core.documents import Document
from langchain.storage import LocalFileStore
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain.retrievers.multi_vector import MultiVectorRetriever


persist_dir = "chroma_db"
store_dir = "doc_store"

# Shared vectorstore + storage
vectorstore = Chroma(
    collection_name="summaries",
    embedding_function=OpenAIEmbeddings(),
    persist_directory=persist_dir,
)
store = LocalFileStore(store_dir)
id_key = "doc_id"

retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    byte_store=store,
    id_key=id_key,
)


def add_docs_to_store(docs, summaries, source_type: str):
    """
    Add a batch of documents + summaries to Chroma and LocalFileStore.

    Args:
        docs (list[Document]): Full documents
        summaries (list[str]): Summaries corresponding to docs
        source_type (str): Tag to distinguish ("api_summary", "wiki_summary", etc.)
    """
    # Stable IDs: use filename if available, else uuid
    doc_ids = [
        docs[i].metadata.get("source", str(uuid.uuid4()))
        for i in range(len(docs))
    ]

    # Build summary docs with metadata
    summary_docs = [
        Document(
            page_content=summaries[i],
            metadata={
                id_key: doc_ids[i],
                "source": docs[i].metadata.get("source", "unknown"),
                "source_type": source_type,
            },
        )
        for i in range(len(docs))
    ]

    # Add to vectorstore
    retriever.vectorstore.add_documents(summary_docs)

    # Store original docs
    retriever.docstore.mset(list(zip(doc_ids, docs)))

    # Persist changes
    vectorstore.persist()

    print(f"✅ Added {len(docs)} docs with source_type='{source_type}'")


# Example usage:
add_docs_to_store(docs, summaries, "api_summary")
add_docs_to_store(api_docs, api_summaries, "wiki_summary")


  vectorstore = Chroma(
  vectorstore.persist()


✅ Added 541 docs with source_type='api_summary'
✅ Added 240 docs with source_type='wiki_summary'


In [59]:
retriever_all = vectorstore.as_retriever(
    search_kwargs={"k": 5}
)


retriever_api = vectorstore.as_retriever(
    search_kwargs={"k": 1, "filter": {"source_type": "wiki_summary"}}
)

# ✅ Wiki-only retriever
retriever_wiki = vectorstore.as_retriever(
    search_kwargs={"k": 1, "filter": {"source_type": "api_summary"}}
)


In [47]:
results = retriever_api.invoke("_ComFlexray")
print(results[0].metadata)


{'source': 'FlexrayDiag.md', 'source_type': 'wiki_summary', 'doc_id': 'FlexrayDiag.md'}


In [48]:
# Let's say you got some results from retriever
results = retriever_api.invoke("flexray")

# Each result is a summary Document
summary_docawy = results[0]
print("Summary:", summary_docawy.page_content)
print("Metadata:", summary_docawy.metadata)

# Now fetch the full/original doc from store using the doc_id
doc_id = summary_docawy.metadata["doc_id"]
original_doc = retriever.docstore.mget([doc_id])[0]

print("✅ Original doc content:")
print(original_doc.page_content)
print("✅ Original doc metadata:")
print(original_doc.metadata)


Summary: The document outlines various functions and their descriptions related to Flexray Diagnosis, including calculating access keys, configuring timeouts, sending requests cyclically, modifying requests, and testing byte sequences. It also includes messages related to error handling and response verification.
Metadata: {'doc_id': 'FlexrayDiag.md', 'source': 'FlexrayDiag.md', 'source_type': 'wiki_summary'}
✅ Original doc content:
### Function: CalculateAccessKey

**Description:** Description: Purpose: Calculate a security access key based on a seed and other parameters. Example: *_ApiCalculateAccessKey(4, 4, 2, 2, 0x01, [0xDE, 0xAD, 0xBE, 0xEF]) Parameters:

**Parameters:**

| Name | Type | EncodedType | Default |
|---|---|---|---|
| SeedSize | int | uint32 |  |
| The seed size in bytes |  |  |  |
| KeySize | int | uint32 |  |
| The key size in bytes |  |  |  |
| SeedOffset | int | uint32 |  |
| The seed offset in diag response buffer |  |  |  |
| KeyOffset | int | uint32 |  |
| The

In [79]:
from langchain.prompts import ChatPromptTemplate

# Multi Query: Different Perspectives
template = """You are an AI language model assistant. Your task is to generate only one more 
different version of the given user question to retrieve relevant documents from a vector 
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search. 
Provide these alternative questions separated by newlines. Original question: {question}"""
prompt_perspectives = ChatPromptTemplate.from_template(template)

from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI

generate_queries = (
    prompt_perspectives 
    | ChatOpenAI(temperature=0, model="gpt-4.1") 
    | StrOutputParser() 
    | (lambda x: x.split("\n"))
)

In [70]:
from langchain.load import dumps, loads
from langchain_core.runnables import RunnableLambda

def get_unique_union(documents: list[list]):
    """ Unique union of retrieved docs """
    flattened_docs = [dumps(doc) for sublist in documents for doc in sublist]
    unique_docs = list(set(flattened_docs))
    return [loads(doc) for doc in unique_docs]

def add_original_query(inputs: dict):
    """Add original question back to generated queries"""
    question = inputs["question"]
    queries = generate_queries.invoke({"question": question})
    if question not in queries:
        queries = [question] + queries
    return {"queries": queries}

def search_with_queries(inputs: dict):
    all_results = []
    for q in inputs["queries"]:
        results = retriever_api.invoke(q)  # summaries with metadata
        print(f"Query: {q} -> {len(results)} results")
        print(f"Results: {[doc.metadata.get('source', 'unknown') for doc in results]}")
        all_results.append(results)
    return all_results

def swap_for_original(docs):
    """Replace summary docs with their original full docs from docstore"""
    originals = []
    for d in docs:
        doc_id = d.metadata.get("doc_id")
        if doc_id:
            orig = retriever.docstore.mget([doc_id])[0]
            if orig:  # fallback if not found
                originals.append(orig)
            else:
                originals.append(d)  # keep summary if original missing
        else:
            originals.append(d)  # keep summary if no id
    return originals

# Wrap with RunnableLambda
retrieval_chain = (
    RunnableLambda(add_original_query)
    | RunnableLambda(search_with_queries)
    | RunnableLambda(get_unique_union)
    | RunnableLambda(swap_for_original)  # 🔥 convert to original docs here
)


In [73]:
from operator import itemgetter
from langchain_openai import ChatOpenAI
from langchain_core.runnables import RunnablePassthrough

# RAG
template = """Answer the following question based on this context:

{context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

llm = ChatOpenAI(temperature=0, model="gpt-4.1")

def format_docs(docs):
    return "\n\n".join([d.page_content for d in docs])

final_rag_chain = (
    {
        "context": retrieval_chain | RunnableLambda(format_docs),
        "question": itemgetter("question"),
    }
    | prompt
    | llm
    | StrOutputParser()
)

final_rag_chain.invoke({"question":"how to start sending Lin diags? explain step by step."})

Query: how to start sending Lin diags? explain step by step. -> 1 results
Results: ['LinDiag.md']
Query: What are the step-by-step instructions for initiating LIN diagnostics communication? -> 1 results
Results: ['LinDiag.md']
Query:  -> 1 results
Results: ['ComPcEthernetPdus.md']
Query: Can you provide a detailed guide on how to begin sending LIN diagnostic messages? -> 1 results
Results: ['LinDiag.md']


'To **start sending LIN diagnostics (diags)**, you need to configure the diagnostic environment and send your first diagnostic request. Here’s a step-by-step guide based on the provided API/context:\n\n---\n\n## **Step 1: Configure the Diagnostic Environment**\n\n### 1. **Set the Default NAD (Node Address)**\nLIN diagnostics require a NAD (Node Address) to address the correct slave node.\n\n```plaintext\n*_ApiConfigureDefault_NAD(0x7E)\n```\n- Replace `0x7E` with the NAD of your target LIN slave.\n\n---\n\n### 2. **Enable the Diagnostic Module (Transport Protocol)**\nEnable the Transport Protocol (TP) layer for diagnostics.\n\n```plaintext\n*_ApiConfigureDiag(true)\n```\n\n---\n\n### 3. **(Optional) Set Default Timeouts**\nConfigure the default timeouts for diagnostic requests and pending responses.\n\n```plaintext\n*_ApiConfigureDefaultTimeout(1000, 5000)\n```\n- `1000` ms for normal response timeout (P2server)\n- `5000` ms for pending response timeout (P2*server)\n\n---\n\n### 4. **(

In [None]:
# Pipeline: wiki -> api -> synthesize
# Run this in the same notebook/session where you already have:
# - vectorstore (Chroma)
# - retriever (MultiVectorRetriever with docstore)
# - generate_queries (the query rewriter runnable)
# - ChatOpenAI, StrOutputParser, ChatPromptTemplate imported

from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
from langchain.load import dumps, loads
from langchain_core.runnables import RunnableLambda

# Create filtered retrievers (re-use your vectorstore variable)
# Adjust k as you like (docs returned per generated query)
retriever_wiki = vectorstore.as_retriever(search_kwargs={"k": 5, "filter": {"source_type": "api_summary"}})
retriever_api  = vectorstore.as_retriever(search_kwargs={"k": 3, "filter": {"source_type": "wiki_summary"}})

# LLM + prompt
llm = ChatOpenAI(temperature=0, model="gpt-4o-mini")   # use gpt-4.1 or change to preferred model
qa_prompt = ChatPromptTemplate.from_template(
    "Answer the following question based on this context:\n\n{context}\n\nQuestion: {question}"
)

# Helper: run multi-query retrieval using the generate_queries runnable
def multi_query_retrieve(question: str, retriever_obj, doc_store, max_queries=None):
    """
    Run the generate_queries runnable, run the retriever for each generated query,
    dedupe results, then fetch original docs from doc_store (using doc_id).
    Returns a list of original Documents (not summaries).
    """
    # 1) get generated queries (includes original)
    queries = generate_queries.invoke({"question": question})
    if question not in queries:
        queries = [question] + queries
    if max_queries:
        queries = queries[:max_queries]

    # 2) run retriever for each query (retriever_obj is vectorstore.as_retriever(...))
    all_results = []
    for q in queries:
        results = retriever_obj.invoke(q)  # returns summary Documents
        print(f"Query: {q} -> {len(results)} results")
        print(f"Results: {[doc.metadata.get('source','unknown') for doc in results]}")
        all_results.extend(results)

    # 3) dedupe preserving order (by serialized doc)
    seen = set()
    unique_summaries = []
    for s in all_results:
        key = dumps(s)
        if key not in seen:
            seen.add(key)
            unique_summaries.append(s)

    # 4) map summaries -> original docs using doc_id via doc_store.mget
    doc_ids = []
    for s in unique_summaries:
        doc_id = s.metadata.get("doc_id") or s.metadata.get("doc_id".upper()) or s.metadata.get("docid")
        if doc_id:
            doc_ids.append(doc_id)

    # preserve order and dedupe doc_ids
    seen_ids = set()
    unique_ids = []
    for i in doc_ids:
        if i not in seen_ids:
            seen_ids.add(i)
            unique_ids.append(i)

    originals = []
    if unique_ids:
        originals = doc_store.mget(unique_ids)  # returns list of Documents (same order)
        # some docstores return None for missing ids; filter
        originals = [d for d in originals if d is not None]

    # If docstore lookup returned none or empty, fallback to using the summaries themselves
    if not originals:
        # convert summary -> Document-like object (keeping page_content & metadata)
        originals = unique_summaries

    return originals


# Helper: format documents with optional truncation and include source header
def format_docs_with_source(docs, max_docs=3, max_chars=3000):
    """
    Return a single string that concatenates up to max_docs documents.
    Each doc is truncated to max_chars characters to avoid blowing up the prompt.
    """
    out = []
    for d in docs[:max_docs]:
        src = d.metadata.get("source", d.metadata.get("doc_id", "unknown"))
        text = d.page_content or ""
        if len(text) > max_chars:
            text = text[:max_chars] + "\n\n...[truncated]..."
        out.append(f"Source: {src}\n{text}")
    return "\n\n".join(out)


# Helper: ask the LLM to answer from docs (safe truncation applied)
def answer_from_docs(question: str, docs, max_docs=3, max_chars=3000):
    context = format_docs_with_source(docs, max_docs=max_docs, max_chars=max_chars)
    # run the QA prompt -> LLM -> string
    return (qa_prompt | llm | StrOutputParser()).invoke({"context": context, "question": question})


# Orchestration: wiki -> api -> synthesize
def wiki_then_api_pipeline(question: str,
                           doc_store = retriever.docstore,   # the MultiVectorRetriever.docstore you created earlier
                           wiki_retr = retriever_wiki,
                           api_retr  = retriever_api,
                           wiki_max_queries=3,
                           api_max_queries=2,
                           wiki_max_docs=3,
                           api_max_docs=3,
                           wiki_max_chars=100000,
                           api_max_chars=100000):
    """
    1) multi-query search in wiki_retr
    2) produce wiki_answer
    3) build combined query = original + wiki_answer
    4) multi-query search in api_retr with combined query
    5) produce api_answer
    6) synthesize final answer combining wiki+api answers
    """

    # Step A: wiki retrieval
    wiki_originals = multi_query_retrieve(question, wiki_retr, doc_store, max_queries=wiki_max_queries)

    # Step B: wiki answer
    wiki_answer = answer_from_docs(question, wiki_originals, max_docs=wiki_max_docs, max_chars=wiki_max_chars)

    # Step C: build combined query for API stage (use wiki_answer as context)
    combined_query = question + "\n\nWiki findings (short):\n" + wiki_answer

    # Step D: API retrieval using combined query
    api_originals = multi_query_retrieve(combined_query, api_retr, doc_store, max_queries=api_max_queries)

    # Step E: API answer (use original question as the question for consistency)
    api_answer = answer_from_docs(question, api_originals, max_docs=api_max_docs, max_chars=api_max_chars)

    # Step F: Final synthesis (mix wiki_answer + api_answer + original)
    final_context = (
        "WIKI ANSWER:\n" + wiki_answer.strip() + "\n\n"
        "API ANSWER:\n" + api_answer.strip() + "\n\n"
        "Original question:\n" + question
    )

    final_answer = (qa_prompt | llm | StrOutputParser()).invoke({"context": final_context, "question": question})

    return {
        "wiki_answer": wiki_answer,
        "api_answer": api_answer,
        "final_answer": final_answer,
        "wiki_docs": wiki_originals,
        "api_docs": api_originals,
    }


# Example usage:
res = wiki_then_api_pipeline("how to use castle installer?")
print("\n\n=== FINAL SYNTHESIZED ANSWER ===\n")
print(res["final_answer"])


Query: how to initialize flexray diag? explain step by step. -> 5 results
Results: ['general-topics_flexray_flexray-diag.txt', 'general-topics_flexray_flexrayonoscilloscope.txt', 'test-scripting_hardware-apis_networking-flexray_best-practices_enablecrcsqcforflexray.txt', 'tools_third-party-tools_adtf_extractasciifromdatusingadtf.txt', 'test-scripting_hardware-apis_networking-xcp_configurediagandxcptoworkonthesamemessagesviadynamiccom.txt']
Query: What are the step-by-step procedures to set up FlexRay diagnostics? -> 5 results
Results: ['general-topics_flexray_flexray-diag.txt', 'general-topics_flexray_flexrayonoscilloscope.txt', 'test-scripting_hardware-apis_networking-flexray_best-practices_enablecrcsqcforflexray.txt', 'tools_third-party-tools_adtf_extractasciifromdatusingadtf.txt', 'test-scripting_hardware-apis_networking-xcp_configurediagandxcptoworkonthesamemessagesviadynamiccom.txt']
Query: how to initialize flexray diag? explain step by step.

Wiki findings (short):
To initialize