In [None]:
%pip install \
    langchain \
    langchain-openai \
    langchain-community \
    faiss-cpu \
    tiktoken

In [None]:
%pip install pypdf

In [None]:
import os
from typing import List

from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_core.vectorstores import InMemoryVectorStore
from langchain.tools import tool
from langchain.chat_models import init_chat_model
from langchain.agents import create_agent

In [None]:
# Resolve the OpenAI API key for whichever environment the notebook runs in.
try:
    # Google Colab: the key is stored in the notebook's secrets panel.
    from google.colab import userdata
except ImportError:
    # Local environment: expect OPENAI_API_KEY to be exported already.
    key = os.getenv("OPENAI_API_KEY")
else:
    key = userdata.get('OPENAI_API_KEY')

# os.environ only accepts strings, so leave it untouched when no key was
# found; main() then reports the missing key with a clear error message.
if key:
    os.environ["OPENAI_API_KEY"] = key

In [None]:
# -------------------------
# 1. INDEXING: load → split → store
# Only for .txt files.
# -------------------------

#def load_local_documents(data_path: str = "./data"):
#    """
#    Load all .txt files from the given folder as LangChain Documents.
#    Extend glob pattern or add more loaders for PDFs, etc.
#    """
#    loader = DirectoryLoader(
#        data_path,
#        glob="**/*.txt",        # change to e.g. "**/*" and add filters if needed
#        loader_cls=TextLoader,
#        show_progress=True,
#    )
#    docs = loader.load()
#    print(f"Loaded {len(docs)} document(s) from {data_path}")
#    return docs

In [None]:
# -------------------------
# 1. INDEXING: load → split → store
# For .txt, .md, .rst, .log and .pdf files.
# -------------------------

from pathlib import Path
from langchain_community.document_loaders import TextLoader, PyPDFLoader


def load_local_documents(data_path: str = "./data"):
    """
    Recursively load every supported document found under `data_path`.

    Plain-text files (.txt, .md, .rst, .log) are read with TextLoader as
    UTF-8 and .pdf files with PyPDFLoader. Files with any other extension are
    skipped with a message. A file that fails to load is reported and skipped
    so that one bad document does not abort the whole run.

    Args:
        data_path: Folder to scan (sub-folders included).

    Returns:
        A list with the documents produced by the loaders, each annotated
        with a `source_file` metadata entry holding the file name. The list
        may be empty when every file was unsupported or failed to load.

    Raises:
        RuntimeError: If `data_path` is not an existing folder, or if it
            contains no files whose name includes an extension.
    """
    folder = Path(data_path)
    if not folder.is_dir():
        raise RuntimeError(f"Folder not found: {data_path}")

    # rglob("*.*") also matches directories whose name contains a dot; keep
    # regular files only, sorted so the load order is reproducible.
    doc_paths = sorted(p for p in folder.rglob("*.*") if p.is_file())
    if not doc_paths:
        raise RuntimeError(f"No documents found in {data_path}")

    text_extensions = {".txt", ".md", ".rst", ".log"}
    loaded_docs = []

    for path in doc_paths:
        print(f"Processing {path}")
        ext = path.suffix.lower()

        try:
            if ext in text_extensions:
                # Markdown, reStructuredText and logs are treated as plain text.
                loader = TextLoader(str(path), encoding="utf-8")
            elif ext == ".pdf":
                loader = PyPDFLoader(str(path))
            else:
                print(f"Skipping unsupported file: {path.name}")
                continue

            docs = loader.load()

            # Record the originating file name so answers can cite it.
            for d in docs:
                d.metadata["source_file"] = path.name

            loaded_docs.extend(docs)
            print(f"Loaded {path.name} ({len(docs)} pages/chunks)")

        except Exception as e:
            # Deliberate best-effort: report the failure and keep going.
            print(f"Error loading {path.name}: {e}")

    print(f"\nTotal loaded documents: {len(loaded_docs)}")
    return loaded_docs


def build_vector_store(
    docs,
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
    embedding_model: str = "text-embedding-3-small",
):
    """
    Split documents into chunks and index them in an in-memory vector store.

    Args:
        docs: Documents to index, as returned by `load_local_documents`.
        chunk_size: `chunk_size` passed to RecursiveCharacterTextSplitter.
        chunk_overlap: `chunk_overlap` passed to RecursiveCharacterTextSplitter.
        embedding_model: Model name passed to OpenAIEmbeddings.

    Returns:
        The InMemoryVectorStore holding the embedded chunks.
    """
    # 1) Split into chunks (RAG-friendly). add_start_index is switched on as
    #    in the original configuration.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        add_start_index=True,
    )
    splits = splitter.split_documents(docs)
    print(f"Split into {len(splits)} chunk(s)")

    # 2) Embeddings + vector store
    embeddings = OpenAIEmbeddings(model=embedding_model)
    vector_store = InMemoryVectorStore(embeddings)
    vector_store.add_documents(splits)
    print("Indexed chunks in InMemoryVectorStore")

    return vector_store


In [None]:
# -------------------------
# 2. TOOL: retrieval function for RAG
# -------------------------

from langchain.tools import tool

def build_retrieve_tool(vector_store, k: int = 4):
    """
    Wrap the vector store in a LangChain tool.

    The tool returns context annotated with file name and page number so
    the agent can mention them in its answer.

    Args:
        vector_store: Object exposing `similarity_search(query, k=...)`.
        k: Number of chunks to retrieve per query.

    Returns:
        The decorated `retrieve_context` tool.
    """

    @tool(response_format="content_and_artifact")
    def retrieve_context(query: str):
        """
        Retrieve information from your local knowledge base to help answer a query.

        The returned text is annotated like:
        [DOC 1] file=policy_terms.txt, page=1
        ...
        """
        retrieved_docs = vector_store.similarity_search(query, k=k)
        if not retrieved_docs:
            # Give the model an explicit signal instead of an empty tool output.
            return "No relevant documents were found in the knowledge base.", retrieved_docs

        blocks = []
        for i, doc in enumerate(retrieved_docs, start=1):
            source_file = (
                doc.metadata.get("source_file")
                or doc.metadata.get("source")
                or "unknown_file"
            )
            page = doc.metadata.get("page")  # PyPDFLoader usually sets this
            if page is None:
                page_info = ""
            elif isinstance(page, int):
                # Integer page numbers are shifted by one for display
                # (presumably zero-based in the loader's metadata).
                page_info = f", page={page + 1}"
            else:
                page_info = f", page={page}"

            header = f"[DOC {i}] file={source_file}{page_info}"
            blocks.append(f"{header}\n{doc.page_content.strip()}")

        serialized = "\n\n".join(blocks)

        # 1) `serialized` -> text the model sees as tool output
        # 2) `retrieved_docs` -> kept as artifacts for richer tooling if needed
        return serialized, retrieved_docs

    return retrieve_context


In [None]:
# -------------------------
# 3. AGENT: model + tools using create_agent
# -------------------------

from langchain.chat_models import init_chat_model
from langchain.agents import create_agent

def build_rag_agent(retrieve_tool, model_name: str = "gpt-4.1-mini"):
    """
    Build an agent that can call the retrieve_context tool.

    The system prompt instructs the model to call the tool for anything that
    may depend on the local files and to cite the DOC labels, file names and
    pages that the tool output carries.

    Args:
        retrieve_tool: The tool produced by `build_retrieve_tool`.
        model_name: Chat model identifier handed to `init_chat_model`.

    Returns:
        The agent object returned by `create_agent`.
    """
    model = init_chat_model(model_name)
    tools = [retrieve_tool]

    system_prompt = """
You are a helpful assistant over a local knowledge base of documents.

- If the user asks anything that could depend on the local files,
  you MUST first call the `retrieve_context` tool.
- The tool returns context annotated with labels like:
  [DOC 1] file=claims_process.md, page=2
  [DOC 2] file=risk_assessment_guide.pdf, page=1

When you answer:
- Use these DOC labels to show where information comes from.
- Explicitly mention the document name (and page if available) in your answer.
  Example: "The SLA for claim review is 15 working days (source: DOC 1 - claims_process.md)."
- If you use multiple documents, you can cite them like:
  "(sources: DOC 1 - claims_process.md; DOC 3 - risk_assessment_guide.pdf, page 2)"
- If the question is clearly general knowledge and not about the files,
  you may answer directly, and say you are not using the local documents.

Always answer clearly and concisely.
""".strip()

    agent = create_agent(
        model=model,
        tools=tools,
        system_prompt=system_prompt,
    )
    return agent

In [None]:
# -------------------------
# 4. SIMPLE CLI LOOP
# -------------------------


def _message_text(content):
    """Flatten message content (a string or a list of parts) into plain text."""
    if isinstance(content, list):
        # Content may be a list of parts: dict parts contribute their "text"
        # entry, anything else is stringified.
        return "".join(
            part.get("text", "") if isinstance(part, dict) else str(part)
            for part in content
        )
    return content


def run_cli(agent):
    """
    Very simple REPL that sends each user message as a new 'thread'.

    Every question is sent on its own (no conversation history) through
    `agent.stream(..., stream_mode="values")`, and only the last message of
    the final streamed state is printed. Typing 'exit' or 'quit', or
    interrupting the prompt (EOF / KeyboardInterrupt), ends the loop.

    Args:
        agent: Object exposing `stream(payload, stream_mode=...)` that yields
            states containing a `messages` list.
    """
    print("\n--- RAG Agent ready! Ask questions about your documents. ---")
    print("Type 'exit' or 'quit' to stop.\n")

    while True:
        try:
            user_input = input("You: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupted prompt ends the session cleanly
            # instead of surfacing a traceback.
            print("\nBye!")
            break

        if not user_input:
            continue
        if user_input.lower() in {"exit", "quit"}:
            print("Bye!")
            break

        # We send the conversation as a list of messages;
        # here we just send the latest user message.
        events = agent.stream(
            {"messages": [{"role": "user", "content": user_input}]},
            stream_mode="values",
        )

        last_state = None
        for state in events:
            last_state = state  # we only care about the final state

        messages = last_state["messages"] if last_state else None
        if not messages:
            print("Agent: (no response)")
            continue

        # The last entry of `messages` is taken as the final AI message.
        text = _message_text(messages[-1].content)

        print(f"\nAgent: {text}\n")

In [None]:
# -------------------------
# 5. MAIN
# -------------------------


def main(data_path: str = "./data"):
    """
    Run the whole pipeline: load documents, index them, build the agent and
    start the interactive loop.

    Args:
        data_path: Folder containing the documents to index.

    Raises:
        RuntimeError: If OPENAI_API_KEY is missing or empty, or if no
            document could be loaded from `data_path`.
    """
    # An empty value is as unusable as a missing one.
    if not os.environ.get("OPENAI_API_KEY"):
        raise RuntimeError("Please set the OPENAI_API_KEY environment variable.")

    docs = load_local_documents(data_path)
    if not docs:
        raise RuntimeError(
            f"No supported documents (.txt, .md, .rst, .log, .pdf) could be "
            f"loaded from {data_path}. Add some documents before running."
        )

    vector_store = build_vector_store(docs)
    retrieve_tool = build_retrieve_tool(vector_store)
    agent = build_rag_agent(retrieve_tool)

    run_cli(agent)

In [None]:
# Run the full pipeline only when this code is executed as the main program.
if __name__ == "__main__":
    main()

### Try questions like:

“Summarize the key insurance terms from the documents.”

“What SLA penalties are mentioned?”

“What risk evaluation methods does the policy mandate?”

“Give me a combined summary of all documents.”