# Import Required Libraries
Import all necessary libraries, including dotenv, langchain modules, and the ingest_urls_to_chromadb function.

In [2]:
from dotenv import load_dotenv
import os
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from vectorstore import ingest_urls_to_chromadb

USER_AGENT environment variable not set, consider setting it to identify your requests.


# Load Environment Variables
Use the dotenv library to load environment variables, such as CHROMA_DB_DIR and OPENAI_API_KEY.

In [None]:
# Load key/value pairs from a local .env file into the process environment.
load_dotenv()

# Directory holding the persisted Chroma store; falls back to ./chroma_db
# when CHROMA_DB_DIR is not set.
persist_directory = os.environ.get("CHROMA_DB_DIR", "./chroma_db")

# Fail fast with setup guidance if the OpenAI key is missing or empty, rather
# than letting a later cell fail when the model is called.
openai_api_key = os.environ.get("OPENAI_API_KEY")
if not openai_api_key:
    raise ValueError(
        "OPENAI_API_KEY environment variable not set. "
        "If you're running this in Jupyter Notebook, ensure that the .env file is in "
        "the current working directory and that the notebook kernel was restarted "
        "after creating or modifying the .env file. "
        "You can also manually set the environment variable in a cell using "
        "os.environ['OPENAI_API_KEY'] = 'your-key'."
    )

# Define Helper Functions
Define `validate_prompt`, a helper that rejects empty or non-string prompts before they are passed to the retriever.

In [None]:
def validate_prompt(prompt: str) -> str:
    """Check that ``prompt`` is usable as a query and return it unchanged.

    Args:
        prompt: The user question to validate.

    Returns:
        The same ``prompt`` object that was passed in (no stripping is applied).

    Raises:
        ValueError: If ``prompt`` is not a string, is empty, or contains only
            whitespace.
    """
    # Test the type first so that .strip() is only called on real strings;
    # a whitespace-only prompt carries no question and is treated as empty.
    if not isinstance(prompt, str) or not prompt.strip():
        raise ValueError("Prompt must be a non-empty string.")
    return prompt

# Ingest URLs into ChromaDB
Call the ingest_urls_to_chromadb function to ingest URLs into the ChromaDB vector store.

In [None]:
# Pages to ingest into the vector store, in the order they are handed to
# ingest_urls_to_chromadb.
urls = [
    "https://www.nationalgeographic.com/latest-stories",
    "https://www.smithsonianmag.com/smart-news/",
    "https://www.bbc.com/news/science_and_environment",
    "https://www.nasa.gov/news/all-news/",
    "https://blog.khanacademy.org/",
    "https://www.history.com/this-day-in-history",
    "https://www.cdc.gov/media/index.html",
    "https://www.nih.gov/news-events/news-releases",
    "https://www.scientificamerican.com/section/news/",
    "https://www.unicef.org/press-releases",
]

# NOTE(review): presumably fetches, chunks and embeds each page into the store
# at persist_directory — confirm in vectorstore.py, including whether
# re-running this cell duplicates documents.
ingest_urls_to_chromadb(
    urls,
    persist_directory=persist_directory,
)
print("Ingestion complete.")

Created a chunk of size 1969, which is longer than the specified 1024
Created a chunk of size 1188, which is longer than the specified 1024
Created a chunk of size 1188, which is longer than the specified 1024


Ingestion complete.


  vectorstore.persist()


# Initialize Vector Store
Initialize the Chroma vector store with embeddings and the persist directory.

In [None]:
# Embedding model (library defaults) used to vectorise queries.
# NOTE(review): presumably this must match the embedding model used by
# ingest_urls_to_chromadb — confirm in vectorstore.py.
embeddings = FastEmbedEmbeddings()

# Open the Chroma store located at persist_directory with that embedding function.
vectorstore = Chroma(persist_directory=persist_directory, embedding_function=embeddings)

# Retrieve Relevant Documents
Use the vector store's retriever to fetch relevant documents based on the input prompt.

In [None]:
# Question to answer; the RAG cell further down reuses this variable.
prompt = "What is the latest news about Mars exploration?"
validate_prompt(prompt)

# Wrap the vector store in a retriever with its default search settings.
retriever = vectorstore.as_retriever()

# Retrievers implement the Runnable interface: invoke() is the supported entry
# point, whereas get_relevant_documents() is deprecated and emits a warning.
relevant_docs = retriever.invoke(prompt)
print(f"Found {len(relevant_docs)} relevant documents.")

# Prepare Prompt Template
Create a ChatPromptTemplate for the LLM, ensuring it includes the context and user prompt.

In [None]:
# Prompt for the answering model. Template variables: {context}, {age_group}, {prompt}.
chat_prompt = ChatPromptTemplate.from_messages([
    (
        "system",
        "You are a kind and knowledgeable teacher chatbot designed to help children learn.\n\n"
        "Your job is to provide safe, clear, and age-appropriate explanations.\n\n"
        "NEVER make up answers. You MUST ONLY use the information provided in the <Context> section.\n\n"
        "If the answer is not directly found in the context, say:\n"
        "\"I'm not sure about that based on what I have here.\"\n\n"
        "Always respond kindly, simply, and concisely.\n\n"
        # The instructions above refer to a "<Context> section", so the retrieved
        # text is wrapped in matching tags instead of a bare "CONTEXT:" heading.
        "<Context>\n{context}\n</Context>",
    ),
    (
        "human",
        # age_group is a label such as "Middle School [12-14]", not a number, so it
        # is phrased as an age group rather than "... years old".
        "The student's age group is: {age_group}.\n"
        "They asked this question:\n"
        "{prompt}",
    ),
])

# Run Retrieval-Augmented Generation (RAG) Chain
Invoke the RAG chain with the context, prompt, and age group, and handle the output.

In [None]:
# Audience label substituted into the {age_group} slot of chat_prompt.
age_group = "Middle School [12-14]"

# Chat model that writes the answer; the API key was read in the environment cell.
model = ChatOpenAI(
    model="gpt-4o-mini",
    api_key=openai_api_key
)

# "Stuff" chain: formats every retrieved document and inserts the combined text
# into the prompt's {context} variable before calling the model.
chain = create_stuff_documents_chain(
    llm=model,
    prompt=chat_prompt
)

# The chain expects `context` to be a list of Document objects and formats and
# joins them itself. A pre-joined string would be iterated character by
# character and fail when the chain reads `.page_content` from each item.
result = chain.invoke({
    "context": relevant_docs,
    "prompt": prompt,
    "age_group": age_group
})

# With its default output parser the chain returns plain text; anything else is
# reported as-is so an unexpected configuration is visible rather than hidden.
if isinstance(result, str):
    print("Summary:", result)
else:
    print("Unexpected result:", result)
