In [11]:
!uv add -q langchain-community pypdf langchain-google-genai langchain-mongodb

I0000 00:00:1757991962.218566  497374 fork_posix.cc:71] Other threads are currently calling into gRPC, skipping fork() handlers


In [None]:
from langchain_community.document_loaders import PyPDFLoader

file_path = "./data/nke-10k-2023.pdf" # Path to the document
loader = PyPDFLoader(file_path) # Create a loader for the document (PyPDFLoader is a loader for PDF files)

docs = loader.load() # Load the document into a list of documents (each document is a page)

print(len(docs))

107


In [6]:
print(f"{docs[0].page_content[:200]}\n")
print(docs[0].metadata)

Table of Contents
UNITED STATES
SECURITIES AND EXCHANGE COMMISSION
Washington, D.C. 20549
FORM 10-K
(Mark One)
☑  ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(D) OF THE SECURITIES EXCHANGE ACT OF 1934
F

{'producer': 'EDGRpdf Service w/ EO.Pdf 22.0.40.0', 'creator': 'EDGAR Filing HTML Converter', 'creationdate': '2023-07-20T16:22:00-04:00', 'title': '0000320187-23-000039', 'author': 'EDGAR Online, a division of Donnelley Financial Solutions', 'subject': 'Form 10-K filed on 2023-07-20 for the period ending 2023-05-31', 'keywords': '0000320187-23-000039; ; 10-K', 'moddate': '2023-07-20T16:22:08-04:00', 'source': './data/nke-10k-2023.pdf', 'total_pages': 107, 'page': 0, 'page_label': '1'}


In [19]:
# Splitting

from langchain_text_splitters import RecursiveCharacterTextSplitter

# Create a text splitter, chunk_size is the size of the chunk, chunk_overlap is the overlap between chunks, 
# add_start_index is to add the start index of the chunk
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200, add_start_index=True)
all_splits = text_splitter.split_documents(docs)  # Split the document into chunks

len(all_splits)

257

In [9]:
# Embeddings

from langchain_google_genai import GoogleGenerativeAIEmbeddings

embeddings = GoogleGenerativeAIEmbeddings(model="models/gemini-embedding-001") # Create a Google Generative AI embeddings model

In [10]:
vector_1 = embeddings.embed_query(all_splits[0].page_content) # Embed the first chunk
vector_2 = embeddings.embed_query(all_splits[1].page_content) # Embed the second chunk

assert len(vector_1) == len(vector_2) # Assert that the length of the first vector is equal to the length of the second vector
print(f"Generated vectors of length {len(vector_1)}\n") # Print the length of the vectors
print(vector_1[:10]) # Print the first 10 elements of the vectors

Generated vectors of length 3072

[-0.0014180047437548637, 0.0006364254513755441, 0.0023240740410983562, -0.028028815984725952, -0.006018347572535276, 0.007032659370452166, 0.008973612450063229, -0.009234011173248291, -0.0018726892303675413, 0.02026733197271824]


In [None]:
import os
from langchain_mongodb import MongoDBAtlasVectorSearch
from pymongo import MongoClient
from dotenv import load_dotenv

load_dotenv()  # Load the environment variables

MONGODB_URI = os.getenv("MONGODB_URI")
MONGODB_DATABASE = os.getenv("MONGODB_DATABASE")
MONGODB_COLLECTION = os.getenv("MONGODB_COLLECTION")

client = MongoClient(MONGODB_URI) # Create a MongoDB client
db = client[MONGODB_DATABASE] # Create a database
collection = db[MONGODB_COLLECTION] # Create a collection

vector_store = MongoDBAtlasVectorSearch(
    embedding=embeddings, # Embeddings model
    collection=collection, # Collection to store the vectors
    index_name="default", # Index name to store the vectors
    relevance_score_fn="cosine", # Relevance score function
)


In [20]:
ids = vector_store.add_documents(documents=all_splits) # Add the documents to the vector store
print(ids)

GoogleGenerativeAIError: Error embedding content: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
]

In [None]:
# Sync query
results = vector_store.similarity_search(  # Search for the most similar documents (synchronous)
    "How many distribution centers does Nike have in the US?"
)

print(results)


[]


In [None]:
# Async query
results = await vector_store.asimilarity_search(
    "When was Nike incorporated?"
)  # Search for the most similar documents (asynchronous)

print(results[0])


In [None]:
# Return scores

# Note that providers implement different scores; the score here
# is a distance metric that varies inversely with similarity.

results = vector_store.similarity_search_with_score("What was Nike's revenue in 2023?")
doc, score = results[0]
print(f"Score: {score}\n")
print(doc)


In [None]:
# Return documents based on similarity to an embedded query

embedding = embeddings.embed_query("How were Nike's margins impacted in 2023?")

results = vector_store.similarity_search_by_vector(embedding)
print(results[0])

In [None]:
# Retrievers

# Vectorstores implement a similarity_search method
from langchain_core.documents import Document
from langchain_core.runnables import chain


@chain
def retriever(query: str) -> list[Document]:
    return vector_store.similarity_search(
        query, k=1
    )  # Search for the most similar documents


retriever.batch(  # Batch the queries
    [
        "How many distribution centers does Nike have in the US?",
        "When was Nike incorporated?",
    ],
)


[[], []]

In [None]:
# Vectorstores implement an as_retriever method

"""
VectorStoreRetriever supports search types of "similarity" (default), 
"mmr" (maximum marginal relevance), and "similarity_score_threshold". 
"""

retriever = vector_store.as_retriever(
    search_type="similarity",  # options: "similarity", "mmr", "similarity_score_threshold", "hybrid"
    search_kwargs={"k": 1},  # k is the number of documents to return
)  # search_type is the type of search to perform, search_kwargs is the arguments to pass to the search method

retriever.batch(
    [
        "How many distribution centers does Nike have in the US?",
        "When was Nike incorporated?",
    ],
)
