## 1. Install Libraries and Setup Environment

In [None]:
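# If running for the first time, uncomment the next line to install the dependencies
# (%pip installs into the environment of the running kernel):
# %pip install sentence-transformers faiss-cpu PyMuPDF requests streamlit python-dotenv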


In [1]:
from sentence_transformers import SentenceTransformer
import fitz  
import os
import faiss
import pickle
import requests

  from .autonotebook import tqdm as notebook_tqdm


## 2. Load and Chunk Documents


![RAG Pipeline](images/indexing.png)

In [2]:

# Directory that load_documents() scans for PDF files.
DOCS_PATH = "data/documents/"
# Maximum number of characters in one chunk produced by chunk_text().
CHUNK_SIZE = 500
# Number of characters that consecutive chunks share; chunk_text() advances
# by CHUNK_SIZE - CHUNK_OVERLAP characters per chunk.
CHUNK_OVERLAP = 50

def load_documents(docs_path=None):
    """
    Load PDF documents from a directory, one record per non-empty page.

    Args:
        docs_path: Directory to scan for PDF files. Defaults to the
            module-level DOCS_PATH when omitted.

    Returns:
        A list of dicts with keys:
        - 'source': a string identifier (filename and 1-based page number)
        - 'text': the stripped text extracted from the page

    Raises:
        FileNotFoundError: If the directory does not exist (raised by os.listdir).
    """
    if docs_path is None:
        docs_path = DOCS_PATH

    docs = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the page
    # (and therefore chunk/index) order reproducible between runs.
    for filename in sorted(os.listdir(docs_path)):
        if not filename.lower().endswith(".pdf"):
            continue
        doc_path = os.path.join(docs_path, filename)
        # The context manager closes the PDF handle even if extraction fails.
        with fitz.open(doc_path) as doc:
            for page_no, page in enumerate(doc, start=1):
                text = page.get_text().strip()
                # Pages without extractable text (e.g. scanned images) are skipped.
                if text:
                    docs.append({"source": f"{filename} - Page {page_no}", "text": text})
    return docs

def chunk_text(docs, chunk_size=None, chunk_overlap=None):
    """
    Chunk each document into overlapping character windows and preserve the source info.

    Args:
        docs: Iterable of dicts with 'text' and 'source' keys.
        chunk_size: Maximum characters per chunk. Defaults to CHUNK_SIZE.
        chunk_overlap: Characters shared by consecutive chunks. Defaults to
            CHUNK_OVERLAP. Must satisfy 0 <= chunk_overlap < chunk_size.

    Returns:
        A list of chunks, each a dict with keys:
        - 'text': the text chunk
        - 'source': the source info from the parent document

    Raises:
        ValueError: If chunk_size is not positive or the overlap is not
            smaller than the chunk size (the window would never advance).
    """
    if chunk_size is None:
        chunk_size = CHUNK_SIZE
    if chunk_overlap is None:
        chunk_overlap = CHUNK_OVERLAP

    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= chunk_overlap < chunk_size:
        # A zero step makes range() raise a cryptic error and a negative step
        # silently yields no chunks at all, so reject both up front.
        raise ValueError(
            f"chunk_overlap must be in [0, chunk_size); got "
            f"chunk_overlap={chunk_overlap}, chunk_size={chunk_size}"
        )

    step = chunk_size - chunk_overlap
    chunks = []
    for doc in docs:
        text = doc["text"]
        source = doc["source"]
        for start in range(0, len(text), step):
            chunks.append({"text": text[start:start + chunk_size], "source": source})
    return chunks

# Run the two indexing stages; `documents` and `chunks` are notebook-level
# names, and `chunks` is passed to build_faiss_index() in a later cell.
documents = load_documents()
chunks = chunk_text(documents)
print(f"Loaded {len(documents)} pages and chunked into {len(chunks)} pieces.")


Loaded 63 pages and chunked into 568 pieces.


## 3. Build Embeddings and FAISS Index

In [3]:
# Model identifier handed to SentenceTransformer below.
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
# File that build_faiss_index() writes the FAISS index to and load_faiss_index() reads it from.
INDEX_PATH = "embeddings/index.faiss"
# File that holds the pickled chunk list saved alongside the index.
CHUNKS_PATH = "embeddings/chunks.pkl"
# Shared encoder: used for the chunks in build_faiss_index() and for the
# question in get_relevant_chunks().
model = SentenceTransformer(EMBEDDING_MODEL)

def build_faiss_index(chunks):
    """
    Build a FAISS index from the text of the chunks and persist it.

    Saves the vector index to INDEX_PATH and the chunk list (with sources)
    to CHUNKS_PATH; position i in the pickled list corresponds to vector i
    in the index.

    Args:
        chunks: Non-empty list of dicts with at least a 'text' key.

    Raises:
        ValueError: If `chunks` is empty (there is nothing to embed and the
            embedding dimension cannot be determined).
    """
    if not chunks:
        raise ValueError("Cannot build a FAISS index from an empty list of chunks.")

    # Encode using only the text
    embeddings = model.encode([chunk["text"] for chunk in chunks])
    dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(embeddings)

    # Make sure the output directories exist; on a fresh checkout they may
    # not, and both writes below would otherwise fail.
    for path in (CHUNKS_PATH, INDEX_PATH):
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)

    with open(CHUNKS_PATH, "wb") as f:
        pickle.dump(chunks, f)
    faiss.write_index(index, INDEX_PATH)
    print("✅ FAISS index created and saved.")

# Embed the chunks produced above and write the index and chunk list to disk.
build_faiss_index(chunks)


✅ FAISS index created and saved.


## 🔍 4. Retrieve Relevant Chunks

![RAG Pipeline](images/retriever.png)

In [4]:
def load_faiss_index():
    """
    Read the persisted FAISS index and its chunk list back from disk.

    Returns:
        A tuple (index, chunks): the index read from INDEX_PATH and the
        object unpickled from CHUNKS_PATH.
    """
    stored_index = faiss.read_index(INDEX_PATH)
    # pickle can execute arbitrary code on load; only open files this
    # notebook wrote itself.
    with open(CHUNKS_PATH, "rb") as chunk_file:
        stored_chunks = pickle.load(chunk_file)
    return stored_index, stored_chunks

def get_relevant_chunks(question, top_k=3):
    """
    Return the top_k most relevant chunks (with their source info) based on the question.

    Args:
        question: The query text to embed and search for.
        top_k: Maximum number of chunks to return.

    Returns:
        A list of chunk dicts ('text' and 'source'), nearest first. The list
        is shorter than top_k when the index holds fewer vectors, and empty
        when top_k is not positive or the index is empty.
    """
    index, chunks = load_faiss_index()
    if top_k <= 0 or index.ntotal == 0:
        return []

    q_embed = model.encode([question])
    # Never request more neighbours than the index contains.
    k = min(top_k, index.ntotal)
    _, indices = index.search(q_embed, k)
    # FAISS fills missing results with -1; used as a list index that would
    # silently wrap around to the last chunk, so drop anything out of range.
    return [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]

# Demo query; `question` and `relevant_chunks` are notebook-level names that
# the ad-hoc prompt cell further down reads.
question = "Tell me about Chain of Abstraction?"
relevant_chunks = get_relevant_chunks(question)
# Print every retrieved chunk under a "[source]:" header.
for idx, chunk in enumerate(relevant_chunks, start=1):
    print(f"\n[{chunk['source']}]:\n{chunk['text']}...")



[AI_Chain_of_Abstraction_1709881765.pdf - Page 1]:
the [city y2 is in -WikiSearch-> y3]. 
y1: Ralph Hefferline was a professor at 
Columbia University …
y2: Columbia University
y3: Columbia University is an Ivy League 
university in New York …
The answer is New York.
Mathematical Reasoning  
Wiki QA  
Figure 1: Overview of chain-of-abstraction reasoning
with tools. Given a domain question (green scroll), a
LLM is fine-tuned to first generate an abstract multi-step
reasoning chain (blue bubble), and then call external
tools to reify the chain wit...

[AI_Chain_of_Abstraction_1709881765.pdf - Page 1]:
Efficient Tool Use with Chain-of-Abstraction Reasoning
Silin Gao1,2∗, Jane Dwivedi-Yu2, Ping Yu2, Xiaoqing Ellen Tan2,
Ramakanth Pasunuru2, Olga Golovneva2, Koustuv Sinha2
Asli Celikyilmaz2, Antoine Bosselut1, Tianlu Wang2
1EPFL, 2FAIR @ Meta
1{silin.gao,antoine.bosselut}@epfl.ch
2{silingao,janeyu,pingyu,ellenxtan}@meta.com
2{rpasunuru,olggol,koustuvs,aslic,tianluwang}@meta.com
Abstract
To

## 💬 5. Query Local LLM (via Ollama)

![RAG Pipeline](images/RAG.png)

In [10]:

# Value sent as the "model" field of the request built in query_llm().
OLLAMA_MODEL = "mistral"
# Endpoint that query_llm() POSTs to (an Ollama server on localhost).
OLLAMA_URL = "http://localhost:11434/api/generate"

def query_llm(prompt, timeout=(10, 600)):
    """
    Send a prompt to the local Ollama server and return the generated text.

    Args:
        prompt: The full prompt text to send.
        timeout: Forwarded to requests.post; either a single number of
            seconds or a (connect, read) tuple. The read timeout is generous
            because the request is non-streaming, so nothing is received
            until generation has finished.

    Returns:
        The "response" field of the JSON reply. On any failure (server
        unreachable, HTTP error, timeout, malformed reply) a string starting
        with "Error" is returned instead of raising, so callers always get
        text back.
    """
    payload = {
        "model": OLLAMA_MODEL,
        "prompt": prompt,
        "stream": False
    }

    try:
        # Without a timeout requests waits forever if the server stalls.
        response = requests.post(OLLAMA_URL, json=payload, timeout=timeout)
        response.raise_for_status()
        return response.json()["response"]
    except requests.exceptions.RequestException as e:
        return f"Error connecting to Ollama: {e}"
    except (KeyError, ValueError) as e:
        # Reply was not JSON, or was JSON without a "response" field.
        return f"Error: unexpected response from Ollama: {e!r}"


In [19]:
# Ad-hoc prompt without an instruction template: the retrieved chunks, each
# under a "[source]:" header, followed by the question.
prompt = "\n\n".join(f"[{c['source']}]:\n{c['text']}" for c in relevant_chunks)
prompt += f"\n\nQuestion:\n{question}"
answer = query_llm(prompt)
print("\n💬 Answer:\n", answer)


💬 Answer:
  The "Chain of Abstraction" is a method used in artificial intelligence (AI) to enable large language models (LLMs) to perform general reasoning that aligns with human expectations. This approach decouples the general reasoning of LLMs from domain-specific knowledge obtained from external tools, making it more efficient and adaptable for various tasks.

In simpler terms, the Chain of Abstraction works by training an LLM to generate a series of abstract steps or placeholders, represented as "y1", "y2", and "y3" in this case. These placeholders can be filled with the appropriate domain-specific knowledge using external tools, allowing the model to answer complex questions effectively and accurately.

References:
1. Wei et al., 2021. Scalable Transfer Learning of Grounded Language Models. Advances in Neural Information Processing Systems (NeurIPS).
2. Wei et al., 2022. Chain-of-Thought Reasoning: Improving Efficiency and Coherence of Large Language Models through Decoupled Pla

## 6. Full RAG Pipeline Function

In [5]:
# NOTE(review): this cell repeats the configuration values already assigned
# in the earlier cells of this notebook and constructs a second
# SentenceTransformer, rebinding `model`. Keep the values in sync with the
# earlier cells if either copy is edited.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
INDEX_PATH = "embeddings/index.faiss"
CHUNKS_PATH = "embeddings/chunks.pkl"
DOCS_PATH = "data/documents/"
model = SentenceTransformer(EMBEDDING_MODEL)

In [6]:
def build_index():
    """
    Run the indexing stages end to end, printing progress along the way.

    Loads the pages with load_documents(), splits them with chunk_text()
    and hands the result to build_faiss_index(). Returns nothing.
    """
    print("📄 Loading documents...")
    pages = load_documents()

    print("✂️ Chunking text...")
    pieces = chunk_text(pages)

    print(f"🔢 Total chunks: {len(pieces)}")
    print("🧠 Building FAISS index...")
    build_faiss_index(pieces)

    print("✅ Index built and saved!")

In [7]:
# Prompt skeleton for the RAG pipeline. build_prompt() fills the {context}
# and {question} placeholders via str.format().
DEFAULT_TEMPLATE = """You are a helpful assistant. Use the following context to answer the question.
If you don't know the answer, just say so.

Context:
{context}

Question:
{question}
"""

def load_prompt_template():
    """Return the prompt template string (currently always DEFAULT_TEMPLATE)."""
    return DEFAULT_TEMPLATE

def build_prompt(chunks, question):
    """
    Render the prompt template with the retrieved chunks as context.

    Every chunk becomes a "[source]:" header line followed by its text; the
    rendered chunks are separated by blank lines and substituted for the
    template's {context} placeholder, and `question` for {question}.
    """
    context = "\n\n".join(
        f"[{chunk['source']}]:\n{chunk['text']}" for chunk in chunks
    )
    return load_prompt_template().format(context=context, question=question)

In [8]:
def run_rag(question):
    """
    Answer a question with the full retrieve -> prompt -> generate pipeline.

    Prints progress and a one-line preview of every retrieved chunk.

    Returns:
        A tuple (answer, chunks): the text returned by query_llm() and the
        list of chunks that were used as context.
    """
    print("🔍 Retrieving relevant chunks...")
    retrieved = get_relevant_chunks(question)

    print("\n Retrieved Chunks (showing sources):")
    for rank, item in enumerate(retrieved, start=1):
        # First 100 characters, with newlines flattened so the preview stays on one line.
        snippet = item['text'][:100].replace("\n", " ")
        print(f"   {rank}. [{item['source']}]: {snippet}...")

    print("\nBuilding prompt with context...")
    llm_prompt = build_prompt(retrieved, question)

    print("Querying local LLM...")
    return query_llm(llm_prompt), retrieved

In [14]:
# Run the full pipeline; this rebinds the notebook-level `answer` and
# `relevant_chunks` with the values returned by run_rag().
question = "Tell me about Chain of Abstraction?"
answer, relevant_chunks = run_rag(question)

🔍 Retrieving relevant chunks...

 Retrieved Chunks (showing sources):
   1. [AI_Chain_of_Abstraction_1709881765.pdf - Page 1]: the [city y2 is in -WikiSearch-> y3].  y1: Ralph Hefferline was a professor at  Columbia University ...
   2. [AI_Chain_of_Abstraction_1709881765.pdf - Page 1]: Efficient Tool Use with Chain-of-Abstraction Reasoning Silin Gao1,2∗, Jane Dwivedi-Yu2, Ping Yu2, Xi...
   3. [AI_Chain_of_Abstraction_1709881765.pdf - Page 3]:  et al., 2021). Different from above work, we focus on the planning of general chain-of-thought (Wei...

Building prompt with context...
Querying local LLM...


In [16]:
# Show the text returned by run_rag() in the previous cell.
print(answer)

 The "Chain of Abstraction" is a method proposed by the authors Silin Gao, Jane Dwivedi-Yu, Ping Yu, Xiaoqing Ellen Tan, Ramakanth Pasunuru, Olga Golovneva, Koustuv Sinha, Asli Celikyilmaz, Antoine Bosselut, and Tianlu Wang. This method aims to enhance the reasoning ability of large language models (LLMs) to align with human expectations.

The Chain of Abstraction decouples the general reasoning of LLMs from domain-specific knowledge obtained from external tools. In this method, an LLM is fine-tuned to generate multi-step reasoning chains with abstract placeholders like y1, y2, and y3. These placeholders are then filled with information from external tools that specialize in specific domains.

The goal of the Chain of Abstraction is to enable LLMs to perform faithful reasoning that takes into account both general knowledge and specialized domain knowledge. This helps improve the accuracy and relevance of the responses provided by these models, making them more useful for a wider range 