In [1]:
pip install langchain-community pypdf

Collecting pypdf
  Downloading pypdf-5.4.0-py3-none-any.whl.metadata (7.3 kB)
Downloading pypdf-5.4.0-py3-none-any.whl (302 kB)
Installing collected packages: pypdf
Successfully installed pypdf-5.4.0
Note: you may need to restart the kernel to use updated packages.


In [1]:
import os
import getpass

# Required
os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")
os.environ["LANGCHAIN_API_KEY"] = getpass.getpass("Enter your LangChain API key (if you have one): ")

# Optional but recommended for tracing/debugging
os.environ["LANGCHAIN_TRACING_V2"] = "true"

# Optional: if you're using Tavily for web search capabilities
os.environ["TAVILY_API_KEY"] = getpass.getpass("Enter your Tavily API key (if using): ")

Enter your OpenAI API key:  ········
Enter your LangChain API key (if you have one):  ········
Enter your Tavily API key (if using):  ········


In [11]:
from langchain_core.documents import Document

# Sample list of documents manually defined — simulating what you'd get from loading a file like a PDF
documents = [
    Document(
        page_content="Dogs are great companions, known for their loyalty and friendliness.",
        metadata={"source": "mammal-pets-doc"},  # metadata can track origin, type, tags, etc.
    ),
    Document(
        page_content="Cats are independent pets that often enjoy their own space.",
        metadata={"source": "mammal-pets-doc"},
    ),
]

In [15]:
from langchain_community.document_loaders import PyPDFLoader  # Import the PDF loader from LangChain

file_path = "/Users/daniel/Documents/Northwestern/MSDS-442 AI Agent Design & Development/Lab_4-1/source-code/nke-10k-2023.pdf"  # Full path to the Nike 10-K PDF file

loader = PyPDFLoader(file_path)  # Create a loader instance for that PDF file

docs = loader.load()  # Load the PDF into a list of Document objects (one per page)

print(len(docs))  # Print the number of pages loaded (i.e., number of Document objects)

107


In [17]:
print(f"{docs[0].page_content[:200]}\n")
print(docs[0].metadata)

Table of Contents
UNITED STATES
SECURITIES AND EXCHANGE COMMISSION
Washington, D.C. 20549
FORM 10-K
(Mark One)
☑  ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(D) OF THE SECURITIES EXCHANGE ACT OF 1934
F

{'producer': 'EDGRpdf Service w/ EO.Pdf 22.0.40.0', 'creator': 'EDGAR Filing HTML Converter', 'creationdate': '2023-07-20T16:22:00-04:00', 'title': '0000320187-23-000039', 'author': 'EDGAR Online, a division of Donnelley Financial Solutions', 'subject': 'Form 10-K filed on 2023-07-20 for the period ending 2023-05-31', 'keywords': '0000320187-23-000039; ; 10-K', 'moddate': '2023-07-20T16:22:08-04:00', 'source': '/Users/daniel/Documents/Northwestern/MSDS-442 AI Agent Design & Development/Lab_4-1/source-code/nke-10k-2023.pdf', 'total_pages': 107, 'page': 0, 'page_label': '1'}


In [19]:
from langchain_text_splitters import RecursiveCharacterTextSplitter  # Import the recommended text splitter

# Create a text splitter that breaks text into 1000-character chunks with 200-character overlap
# The overlap ensures that important context isn’t lost across chunk boundaries
# add_start_index=True stores where each chunk begins (useful for tracking location in the source)
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    add_start_index=True
)

# Apply the splitter to the list of Document objects (one per page)
# This creates a new list of smaller Document chunks
all_splits = text_splitter.split_documents(docs)

# Check how many chunks were created after splitting
len(all_splits)

516

In [21]:
from langchain_openai import OpenAIEmbeddings  # Import the OpenAI embeddings class from LangChain

# Initialize the embedding model using your OpenAI API key (already set earlier)
embeddings = OpenAIEmbeddings()

# Generate a vector for the content of the first split document
vector_1 = embeddings.embed_query(all_splits[0].page_content)

# Generate a vector for the second split document
vector_2 = embeddings.embed_query(all_splits[1].page_content)

# Ensure both vectors have the same number of dimensions (sanity check)
assert len(vector_1) == len(vector_2)

# Output the dimensionality of the vector and a preview of the first 10 values
print(f"Generated vectors of length {len(vector_1)}\n")
print(vector_1[:10])

Generated vectors of length 1536

[-0.00860656425356865, -0.03344116732478142, -0.009941618889570236, -0.0050745029002428055, 0.009079665876924992, 0.009442593902349472, -0.028230568394064903, -0.01646135002374649, 0.002953645773231983, -0.012832076288759708]


In [25]:
from langchain_chroma import Chroma  # Import Chroma, an in-memory vector store integration

# Initialize the vector store with the embedding model you created earlier
# This allows the store to embed any document or query using that model
vector_store = Chroma(embedding_function=embeddings)

# Add your pre-split Document chunks (all_splits) to the vector store
# This step embeds each chunk and stores it for future similarity search
# The returned 'ids' list contains unique identifiers for each stored chunk
ids = vector_store.add_documents(documents=all_splits)

In [27]:
# Perform a similarity search using the embedded question
# The query will be converted to a vector and compared against all stored document vectors
# The result will be a list of the most similar Document chunks (by default, the top 4)
results = vector_store.similarity_search(
    "How many distribution centers does Nike have in the US?"
)

# Print the most relevant result (the first document in the list)
# This will display both the content and metadata of the top-matching chunk
print(results[0])

page_content='direct to consumer operations sell products through the following number of retail stores in the United States:
U.S. RETAIL STORES NUMBER
NIKE Brand factory stores 213 
NIKE Brand in-line stores (including employee-only stores) 74 
Converse stores (including factory stores) 82 
TOTAL 369 
In the United States, NIKE has eight significant distribution centers. Refer to Item 2. Properties for further information.
2023 FORM 10-K 2' metadata={'author': 'EDGAR Online, a division of Donnelley Financial Solutions', 'creationdate': '2023-07-20T16:22:00-04:00', 'creator': 'EDGAR Filing HTML Converter', 'keywords': '0000320187-23-000039; ; 10-K', 'moddate': '2023-07-20T16:22:08-04:00', 'page': 4, 'page_label': '5', 'producer': 'EDGRpdf Service w/ EO.Pdf 22.0.40.0', 'source': '/Users/daniel/Documents/Northwestern/MSDS-442 AI Agent Design & Development/Lab_4-1/source-code/nke-10k-2023.pdf', 'start_index': 3125, 'subject': 'Form 10-K filed on 2023-07-20 for the period ending 2023-05-31

In [29]:
# Perform an asynchronous similarity search using a natural language question
# The query is embedded and compared against stored document vectors
# Note: 'await' means this line must be run inside an async context (like an async function or Jupyter cell with asyncio support)
results = await vector_store.asimilarity_search("When was Nike incorporated?")

# Print the most relevant document chunk from the results
# This will include both the page content and its associated metadata
print(results[0])

page_content='Table of Contents
PART I
ITEM 1. BUSINESS
GENERAL
NIKE, Inc. was incorporated in 1967 under the laws of the State of Oregon. As used in this Annual Report on Form 10-K (this "Annual Report"), the terms "we," "us," "our,"
"NIKE" and the "Company" refer to NIKE, Inc. and its predecessors, subsidiaries and affiliates, collectively, unless the context indicates otherwise.
Our principal business activity is the design, development and worldwide marketing and selling of athletic footwear, apparel, equipment, accessories and services. NIKE is
the largest seller of athletic footwear and apparel in the world. We sell our products through NIKE Direct operations, which are comprised of both NIKE-owned retail stores
and sales through our digital platforms (also referred to as "NIKE Brand Digital"), to retail accounts and to a mix of independent distributors, licensees and sales' metadata={'author': 'EDGAR Online, a division of Donnelley Financial Solutions', 'creationdate': '2023-07-

In [35]:
# Note that providers implement different scores; the score here
# is a distance metric that varies inversely with similarity.
# Lower score = higher similarity (more relevant match)
# Range (typical for cosine distance):
#	•	0.0 = perfect match
#	•	0.2–0.3 = very relevant
#	•	0.4–0.6 = somewhat relevant
#	•	> 0.6 = weak or marginal relevance

# Perform a similarity search and return both the matching documents and their scores
results = vector_store.similarity_search_with_score("What was Nike's revenue in 2023?")

# Unpack the first result into the document and its associated similarity score
doc, score = results[0]

# Print the similarity score — lower is better
print(f"Score: {score}\n")

# Print the document content and metadata for the top match
print(doc)

Score: 0.23853273689746857

page_content='Table of Contents
FISCAL 2023 NIKE BRAND REVENUE HIGHLIGHTSThe following tables present NIKE Brand revenues disaggregated by reportable operating segment, distribution channel and major product line:
FISCAL 2023 COMPARED TO FISCAL 2022
• NIKE, Inc. Revenues were $51.2 billion in fiscal 2023, which increased 10% and 16% compared to fiscal 2022 on a reported and currency-neutral basis, respectively.
The increase was due to higher revenues in North America, Europe, Middle East & Africa ("EMEA"), APLA and Greater China, which contributed approximately 7, 6,
2 and 1 percentage points to NIKE, Inc. Revenues, respectively.
• NIKE Brand revenues, which represented over 90% of NIKE, Inc. Revenues, increased 10% and 16% on a reported and currency-neutral basis, respectively. This
increase was primarily due to higher revenues in Men's, the Jordan Brand, Women's and Kids' which grew 17%, 35%,11% and 10%, respectively, on a wholesale
equivalent basis.' meta

In [37]:
# Create an embedding vector for the query manually (instead of passing the string directly to the search function)
# This is useful when you want to reuse the same embedding across different operations
embedding = embeddings.embed_query("How were Nike's margins impacted in 2023?")

# Perform a similarity search by directly providing the query vector
# This skips the embedding step inside the search function and uses the one you just generated
results = vector_store.similarity_search_by_vector(embedding)

# Print the most relevant document chunk returned by the search
print(results[0])

page_content='Table of Contents
GROSS MARGIN
FISCAL 2023 COMPARED TO FISCAL 2022
For fiscal 2023, our consolidated gross profit increased 4% to $22,292 million compared to $21,479 million for fiscal 2022. Gross margin decreased 250 basis points to
43.5% for fiscal 2023 compared to 46.0% for fiscal 2022 due to the following:
*Wholesale equivalent
The decrease in gross margin for fiscal 2023 was primarily due to:
• Higher NIKE Brand product costs, on a wholesale equivalent basis, primarily due to higher input costs and elevated inbound freight and logistics costs as well as
product mix;
• Lower margin in our NIKE Direct business, driven by higher promotional activity to liquidate inventory in the current period compared to lower promotional activity in
the prior period resulting from lower available inventory supply;
• Unfavorable changes in net foreign currency exchange rates, including hedges; and
• Lower off-price margin, on a wholesale equivalent basis.
This was partially offset by:'

In [39]:
from typing import List  # Used for type hinting: the function will return a list of Document objects

from langchain_core.documents import Document  # Import the Document class from LangChain
from langchain_core.runnables import chain     # Import the chain decorator to create a Runnable

# Define a simple retriever function and wrap it with @chain to make it a Runnable
# This function takes a query string and returns the top matching document using similarity search
@chain
def retriever(query: str) -> List[Document]:
    return vector_store.similarity_search(query, k=1)  # Retrieve 1 most relevant result

# Run the retriever in batch mode with multiple queries
# Each query is passed to the retriever and the top document is returned for each
retriever.batch(
    [
        "How many distribution centers does Nike have in the US?",
        "When was Nike incorporated?",
    ],
)

[[Document(id='bbc2aca2-22cb-4a86-ac65-ab1b0290c416', metadata={'author': 'EDGAR Online, a division of Donnelley Financial Solutions', 'creationdate': '2023-07-20T16:22:00-04:00', 'creator': 'EDGAR Filing HTML Converter', 'keywords': '0000320187-23-000039; ; 10-K', 'moddate': '2023-07-20T16:22:08-04:00', 'page': 4, 'page_label': '5', 'producer': 'EDGRpdf Service w/ EO.Pdf 22.0.40.0', 'source': '/Users/daniel/Documents/Northwestern/MSDS-442 AI Agent Design & Development/Lab_4-1/source-code/nke-10k-2023.pdf', 'start_index': 3125, 'subject': 'Form 10-K filed on 2023-07-20 for the period ending 2023-05-31', 'title': '0000320187-23-000039', 'total_pages': 107}, page_content='direct to consumer operations sell products through the following number of retail stores in the United States:\nU.S. RETAIL STORES NUMBER\nNIKE Brand factory stores 213 \nNIKE Brand in-line stores (including employee-only stores) 74 \nConverse stores (including factory stores) 82 \nTOTAL 369 \nIn the United States, NIK

In [41]:
# Create a retriever directly from the vector store
# 'as_retriever' returns a VectorStoreRetriever, which is a standard LangChain retriever interface
# 'search_type' specifies the search method (e.g., "similarity", "mmr")
# 'search_kwargs' defines parameters for the search — here, k=1 to return the top result only
retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 1},
)

# Use the retriever to perform batch retrieval for multiple queries
# Each query is embedded and the top-matching document is returned
retriever.batch(
    [
        "How many distribution centers does Nike have in the US?",
        "When was Nike incorporated?",
    ],
)

[[Document(id='bbc2aca2-22cb-4a86-ac65-ab1b0290c416', metadata={'author': 'EDGAR Online, a division of Donnelley Financial Solutions', 'creationdate': '2023-07-20T16:22:00-04:00', 'creator': 'EDGAR Filing HTML Converter', 'keywords': '0000320187-23-000039; ; 10-K', 'moddate': '2023-07-20T16:22:08-04:00', 'page': 4, 'page_label': '5', 'producer': 'EDGRpdf Service w/ EO.Pdf 22.0.40.0', 'source': '/Users/daniel/Documents/Northwestern/MSDS-442 AI Agent Design & Development/Lab_4-1/source-code/nke-10k-2023.pdf', 'start_index': 3125, 'subject': 'Form 10-K filed on 2023-07-20 for the period ending 2023-05-31', 'title': '0000320187-23-000039', 'total_pages': 107}, page_content='direct to consumer operations sell products through the following number of retail stores in the United States:\nU.S. RETAIL STORES NUMBER\nNIKE Brand factory stores 213 \nNIKE Brand in-line stores (including employee-only stores) 74 \nConverse stores (including factory stores) 82 \nTOTAL 369 \nIn the United States, NIK