In [None]:
! pip install langchain_community tiktoken langchain-google-genai langchainhub chromadb langchain

In [None]:
import os
from getpass import getpass

# LangSmith tracing: switch it on and point the client at the hosted endpoint.
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'

# Keep a key that is already exported in the environment; otherwise prompt for
# it so the secret is never stored in the notebook source or its outputs.
if not os.environ.get('LANGCHAIN_API_KEY'):
    os.environ['LANGCHAIN_API_KEY'] = getpass('LangSmith API key: ')

In [None]:
import os
from getpass import getpass

from langchain_google_genai import GoogleGenerativeAIEmbeddings
import google.generativeai as genai

#### CONFIG ####
# Reuse GOOGLE_API_KEY when it is already exported; otherwise ask for it
# interactively instead of overwriting it with a placeholder string.
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass("Gemini API key: ")

# Setup Gemini API
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

In [None]:
import bs4
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader, PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_core.runnables import RunnablePassthrough
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

In [None]:
# Directory that Chroma is pointed at for its persisted data
PERSIST_DIR = "./chroma_db"

#### EMBEDDINGS (Gemini) ####
# Embedding model handed to Chroma as its embedding function
embedding_fn = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

#### LOAD OR CREATE VECTORSTORE ####
vectorstore = Chroma(embedding_function=embedding_fn, persist_directory=PERSIST_DIR)

# Retriever built from the store with its default settings; used by ask_gemini
retriever = vectorstore.as_retriever()

In [None]:
#### FUNCTIONS ####
def _split_and_store(docs, source_label: str) -> None:
    """Chunk ``docs`` and add the chunks to the module-level ``vectorstore``.

    Shared tail of ``add_pdf`` and ``add_url`` so both sources are split,
    stored and reported the same way.

    Args:
        docs: Documents returned by a LangChain loader's ``load()``.
        source_label: Text appended to the status line, e.g. ``"PDF my.pdf"``.
    """
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    splits = text_splitter.split_documents(docs)
    if not splits:
        # Nothing to embed (e.g. the loader extracted no text). Skip the store
        # call rather than sending it an empty batch, which is presumably
        # rejected -- TODO confirm against the installed Chroma version.
        print(f"⚠️ No text extracted from {source_label}; nothing added")
        return
    vectorstore.add_documents(splits)
    # NOTE(review): newer Chroma releases reportedly persist automatically and
    # deprecate persist(); kept so older installs still flush to disk -- confirm.
    vectorstore.persist()
    print(f"✅ Added {len(splits)} chunks from {source_label}")


def add_pdf(path: str):
    """Add PDF from local disk to vectorstore.

    Args:
        path: Path of the PDF file passed to ``PyPDFLoader``.
    """
    docs = PyPDFLoader(path).load()
    _split_and_store(docs, f"PDF {path}")


def add_url(url: str, css_classes=("post-content", "post-title", "post-header")):
    """Add web page to vectorstore.

    Args:
        url: Address of the page to load.
        css_classes: CSS class name(s) used to restrict parsing to matching
            elements. The default keeps the original filter; pass ``None``
            (or an empty value) to parse the whole page, which is needed for
            sites that do not use these class names.
    """
    loader_kwargs = {"web_paths": (url,)}
    if css_classes:
        loader_kwargs["bs_kwargs"] = dict(
            parse_only=bs4.SoupStrainer(class_=css_classes)
        )
    docs = WebBaseLoader(**loader_kwargs).load()
    _split_and_store(docs, f"URL {url}")

#### PROMPT ####
# Template text with two placeholders: the retrieved context and the question.
_PROMPT_TEMPLATE = (
    "You are a helpful assistant. Use the context below to answer the question.\n\n"
    "Context:\n{context}\n\n"
    "Question: {question}\n\n"
    "Answer:"
)

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=_PROMPT_TEMPLATE,
)

def format_docs(docs):
    """Join the ``page_content`` of each document, separated by a blank line."""
    separator = "\n\n"
    page_texts = [document.page_content for document in docs]
    return separator.join(page_texts)

#### GEMINI ANSWER FUNCTION ####
def ask_gemini(question: str, model_name: str = "gemini-1.5-flash"):
    """Answer ``question`` with Gemini, grounded on retrieved context.

    Fetches documents for the question from the module-level ``retriever``,
    fills the module-level ``prompt`` with them and sends the result to Gemini.

    Args:
        question: The user's question; used both as the retrieval query and
            in the prompt.
        model_name: Gemini model identifier passed to ``genai.GenerativeModel``.
            Defaults to the model the notebook originally hard-coded.

    Returns:
        The ``text`` attribute of the Gemini response.
    """
    # invoke() is the Runnable entry point of retrievers; it replaces the
    # deprecated get_relevant_documents().
    context = format_docs(retriever.invoke(question))
    filled_prompt = prompt.format(context=context, question=question)

    model = genai.GenerativeModel(model_name)
    response = model.generate_content(filled_prompt)
    return response.text

#### USAGE ####
# Add docs (run once per source; re-adding the same source likely stores its chunks again as duplicates)
# add_pdf("myfile.pdf")
# add_url("https://lilianweng.github.io/posts/2023-06-23-agent/")

# Query the indexed documents and print Gemini's reply
answer = ask_gemini(question="What is Task Decomposition?")
print("\n🔹 Answer:\n", answer, sep=" ")