## Setting Up Environment

In [None]:
! pip install langchain_community tiktoken langchain-openai langchainhub chromadb langchain

In [None]:
import getpass
import os

# LangSmith tracing settings (not secrets).
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'

# Do not hardcode the key in the notebook. Keep a key that is already set in
# the environment; otherwise prompt for it without echoing the input.
if not os.environ.get('LANGCHAIN_API_KEY'):
    os.environ['LANGCHAIN_API_KEY'] = getpass.getpass('LangSmith API key: ')

In [None]:
import getpass

# Do not hardcode the key in the notebook. Keep a key that is already set in
# the environment; otherwise prompt for it without echoing the input.
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API key: ')

In [None]:
import bs4
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader, PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.prompts import PromptTemplate

## Document Loader

In [None]:
#### PERSISTENCE ####
PERSIST_DIR = "./chroma_db"
embedding_fn = OpenAIEmbeddings()

#### LOAD OR CREATE VECTORSTORE ####
# The constructor call is the same whether or not the directory already
# exists; only the status message differs.
print(
    "🔹 Loading existing Chroma DB..."
    if os.path.exists(PERSIST_DIR)
    else "🔹 Creating empty Chroma DB..."
)
vectorstore = Chroma(
    persist_directory=PERSIST_DIR,
    embedding_function=embedding_fn,
)

# Default retriever over everything in the store (no metadata filter).
retriever = vectorstore.as_retriever()

In [None]:
#### FUNCTION: Add New Web Docs ####
def add_new_documents_from_web(urls):
    """Load web pages, split them into chunks and add them to ``vectorstore``.

    Only elements with the CSS classes ``post-content``, ``post-title`` or
    ``post-header`` are parsed from each page.

    Args:
        urls: A single URL string or an iterable of URL strings.

    Returns:
        None. A status line is printed describing what was added.
    """
    # Accept a single URL as well as a collection of URLs.
    if isinstance(urls, str):
        urls = [urls]
    else:
        urls = list(urls)
    if not urls:
        print("⚠️ No URLs provided; nothing added.")
        return

    loader = WebBaseLoader(
        web_paths=urls,
        bs_kwargs=dict(
            parse_only=bs4.SoupStrainer(
                class_=("post-content", "post-title", "post-header")
            )
        ),
    )
    docs = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    splits = text_splitter.split_documents(docs)

    # Pages without the expected CSS classes can yield no text at all; do not
    # hand an empty batch to the vector store.
    if not splits:
        print(f"⚠️ No content extracted from {urls}; nothing added.")
        return

    for doc in splits:
        # "source" is overwritten below with the tag the filtered retrievers
        # match on, so keep whatever the loader put there (presumably the page
        # URL) under a separate key first.
        loader_source = doc.metadata.get("source")
        if loader_source and "url" not in doc.metadata:
            doc.metadata["url"] = loader_source
        doc.metadata["source"] = "web"

    vectorstore.add_documents(splits)
    # NOTE(review): persist() is deprecated in recent Chroma integrations and
    # may be missing entirely; call it only when the store still provides it.
    persist = getattr(vectorstore, "persist", None)
    if callable(persist):
        persist()
    print(f"✅ Added {len(splits)} new chunks from {urls}")

In [None]:
#### FUNCTION: Add New PDFs ####
def add_new_documents_from_pdf(file_paths):
    """Load PDF files, split them into chunks and add them to ``vectorstore``.

    Every chunk is tagged with ``source="pdf"`` and ``filename`` set to the
    base name of the file it came from.

    Args:
        file_paths: A single path (string or ``os.PathLike``) or an iterable
            of paths to PDF files.

    Returns:
        None. A status line is printed describing what was added.
    """
    # Accept a single path as well as a collection of paths.
    if isinstance(file_paths, (str, os.PathLike)):
        file_paths = [file_paths]
    else:
        file_paths = list(file_paths)
    if not file_paths:
        print("⚠️ No PDF paths provided; nothing added.")
        return

    # The splitter configuration does not depend on the file, so build it once.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

    all_splits = []
    for file_path in file_paths:
        docs = PyPDFLoader(file_path).load()
        splits = text_splitter.split_documents(docs)

        filename = os.path.basename(file_path)
        for doc in splits:
            # "source" is the tag the filtered retrievers match on.
            doc.metadata["source"] = "pdf"
            doc.metadata["filename"] = filename

        all_splits.extend(splits)

    # Do not hand an empty batch to the vector store.
    if not all_splits:
        print(f"⚠️ No content extracted from {len(file_paths)} PDF(s); nothing added.")
        return

    vectorstore.add_documents(all_splits)
    # NOTE(review): persist() is deprecated in recent Chroma integrations and
    # may be missing entirely; call it only when the store still provides it.
    persist = getattr(vectorstore, "persist", None)
    if callable(persist):
        persist()
    print(f"✅ Added {len(all_splits)} new chunks from {len(file_paths)} PDF(s)")


## Prompt Generation

In [None]:
#### RETRIEVAL + GENERATION ####
# Prompt with two placeholders: {context} (retrieved text) and {question}.
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=(
        "You are a helpful AI assistant. Use the provided context to answer the question.\n\n"
        "Context:\n{context}\n\n"
        "Question: {question}\n\n"
        "Answer:"
    ),
)

# Chat model shared by every chain in this notebook; temperature is set to 0.
llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo")

def format_docs(docs):
    """Return the ``page_content`` of every document joined by blank lines."""
    contents = [document.page_content for document in docs]
    return "\n\n".join(contents)

# Default chain (search all sources): the incoming question is passed through
# unchanged and is also sent to the unfiltered retriever, whose documents are
# flattened into the prompt's {context}.
rag_chain = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
} | prompt | llm | StrOutputParser()


In [None]:
#### FILTERED RETRIEVERS ####
# One retriever per "source" metadata tag written by the add_new_documents_* helpers.
pdf_retriever, web_retriever = (
    vectorstore.as_retriever(search_kwargs={"filter": {"source": tag}})
    for tag in ("pdf", "web")
)

def _build_rag_chain(source_retriever):
    """Compose retriever -> prompt -> LLM -> string parser for one retriever."""
    return (
        {"context": source_retriever | format_docs, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )


def ask_question(question, source="all"):
    """Answer ``question`` using documents from the selected source.

    Args:
        question: The question passed to the chain.
        source: ``"pdf"`` or ``"web"`` restricts retrieval to chunks tagged
            with that source. String values are matched case-insensitively
            and ignoring surrounding whitespace. Any other value searches all
            sources through ``rag_chain``.

    Returns:
        The result of invoking the selected chain, which ends in
        ``StrOutputParser``.
    """
    # Normalise so that e.g. "PDF" is not silently treated as "all".
    key = source.strip().lower() if isinstance(source, str) else source

    if key == "pdf":
        chain = _build_rag_chain(pdf_retriever)
    elif key == "web":
        chain = _build_rag_chain(web_retriever)
    else:
        chain = rag_chain

    return chain.invoke(question)

## Usage

In [None]:
#### USAGE ####

# Example: Add PDFs
# add_new_documents_from_pdf(["./docs/AI_research.pdf"])

# Example: Add a web page
# add_new_documents_from_web(["https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/"])

# Each entry is (heading to print, question, source passed to ask_question):
# first across ALL sources, then ONLY PDFs, then ONLY web docs.
demo_queries = [
    ("\n🔹 All Docs:\n", "Summarize everything about Task Decomposition", "all"),
    ("\n🔹 PDF Only:\n", "What does the AI research paper say about reinforcement learning?", "pdf"),
    ("\n🔹 Web Only:\n", "What does Lilian Weng say about prompt engineering?", "web"),
]

for heading, query, scope in demo_queries:
    print(heading, ask_question(query, source=scope))