In [1]:
from typing import List
import chromadb
from chromadb.utils import embedding_functions

In [2]:


# --- 1. Initialize the ChromaDB Client and Collection ---
# This setup is run once when your agent starts up. Later cells rely on the
# `collection` global defined here.

# Define paths (must match the path used in the loading script)
DB_PATH = "../chroma_db_chunks" 
COLLECTION_NAME = "alphabet_10k_collection_chunks"

# Initialize the embedding function used for both loading and querying.
# It must match the model used when the data was indexed.
embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")

# Initialize the persistent client and get the specific collection object.
try:
    client = chromadb.PersistentClient(path=DB_PATH)
    # The 'collection' variable is now available globally in this module's scope
    collection = client.get_collection(name=COLLECTION_NAME, embedding_function=embedding_function)
except ValueError as e:
    # Fail fast: if we only printed here, `collection` would stay undefined and
    # every later cell would die with an unrelated NameError.
    # NOTE(review): some chromadb releases may signal a missing collection with
    # a different exception type; those already propagate uncaught — confirm
    # against the installed version.
    raise RuntimeError(
        f"Error loading collection: {e}. Make sure you ran the PDF loader script first."
    ) from e
else:
    # Only report success once both the client and the collection exist.
    print(f"ChromaDB collection '{COLLECTION_NAME}' loaded successfully.")


  from .autonotebook import tqdm as notebook_tqdm


ChromaDB collection 'alphabet_10k_collection_chunks' loaded successfully.


In [4]:
# Natural-language question to run against the indexed collection.
query_text = "According to the MD&A, how might the increasing proportion of revenues derived from non-advertising sources like Google Cloud and devices potentially impact Alphabet's overall operating margin, and why?"

# Fetch the three closest matches, returning only their text and metadata.
results = collection.query(
    include=["documents", "metadatas"],
    n_results=3,
    query_texts=[query_text],
)

# Leave the raw response as the cell's displayed value.
results

{'ids': [['alphabet-form-10-K-2024.pdf_p37_c1',
   'alphabet-form-10-K-2024.pdf_p31_c4',
   'alphabet-form-10-K-2024.pdf_p58_c5']],
 'embeddings': None,
 'documents': [['Table of Contents Alphabet Inc.\nFinancial Results\nRevenues\nThe following table presents revenues by type (in millions):\nYear Ended December 31,\n2023 2024\nGoogle Search & other $ 175,033 $ 198,084 \nYouTube ads 31,510 36,147 \nGoogle Network 31,312 30,359 \nGoogle advertising 237,855 264,590 \nGoogle subscriptions, platforms, and devices 34,688 40,340 \nGoogle Services total 272,543 304,930 \nGoogle Cloud 33,088 43,229 \nOther Bets 1,527 1,648 \nHedging gains (losses) 236 211 \nTotal revenues $ 307,394 $ 350,018 \nGoogle Services\nGoogle advertising revenues\nGoogle Search & other\nGoogle Search & other revenues increased $23.1 billion from 2023 to 2024. The overall growth was driven by interrelated factors including\nincreases in search queries resulting from growth in user adoption and usage on mobile devices; g

In [6]:
# Build one citation-tagged text block per retrieved chunk so the LLM can read
# the content and quote its source. Index [0] selects the hits for the single
# query that was submitted.
formatted_results = [
    f"Source: {meta['source']} (Page Number: {meta['page_number']})\nContent: {doc}\n---"
    for doc, meta in zip(results['documents'][0], results['metadatas'][0])
]

formatted_results

['Source: alphabet-form-10-K-2024.pdf (Page Number: 37)\nContent: Table of Contents Alphabet Inc.\nFinancial Results\nRevenues\nThe following table presents revenues by type (in millions):\nYear Ended December 31,\n2023 2024\nGoogle Search & other $ 175,033 $ 198,084 \nYouTube ads 31,510 36,147 \nGoogle Network 31,312 30,359 \nGoogle advertising 237,855 264,590 \nGoogle subscriptions, platforms, and devices 34,688 40,340 \nGoogle Services total 272,543 304,930 \nGoogle Cloud 33,088 43,229 \nOther Bets 1,527 1,648 \nHedging gains (losses) 236 211 \nTotal revenues $ 307,394 $ 350,018 \nGoogle Services\nGoogle advertising revenues\nGoogle Search & other\nGoogle Search & other revenues increased $23.1 billion from 2023 to 2024. The overall growth was driven by interrelated factors including\nincreases in search queries resulting from growth in user adoption and usage on mobile devices; growth in advertiser spending; and\nimprovements we have made in ad formats and delivery.\nYouTube ads\n-