In [2]:
# Install the PDF-loading dependencies into the running kernel's environment.
# The explicit %pip magic does not depend on IPython's automagic setting, and
# -qU matches the other install cells in this notebook (quiet, upgrade).
%pip install -qU langchain-community pypdf

Note: you may need to restart the kernel to use updated packages.


In [3]:
import getpass
import os

# Turn on LangSmith tracing and group this notebook's runs under one project.
os.environ["LANGSMITH_TRACING"] = "true"
os.environ["LANGSMITH_PROJECT"] = "semantic_search"

# Prompt only when no key is configured yet, so re-running the cell (or
# starting the kernel with the key already exported) neither asks again nor
# overwrites a valid key. getpass keeps the typed value out of the output.
if not os.environ.get("LANGSMITH_API_KEY"):
    os.environ["LANGSMITH_API_KEY"] = getpass.getpass("LangSmith API key: ")

 ········


In [4]:
from langchain_core.documents import Document

# Two small hand-written sample documents. Each one gets its own metadata dict
# carrying the same "source" tag.
documents = [
    Document(page_content=text, metadata={"source": "mammal-pets-doc"})
    for text in (
        "Dogs are great companions, known for their loyalty and friendliness.",
        "Cats are independent pets that often enjoy their own space.",
    )
]

In [5]:
from langchain_community.document_loaders import PyPDFLoader

# Sample PDF, addressed relative to the notebook's working directory.
file_path = "./example_data/nke-10k-2023.pdf"

# load() returns a list of Document objects. In the recorded run below the
# list has 107 entries, the same number as 'total_pages' in the metadata.
loader = PyPDFLoader(file_path=file_path)
docs = loader.load()

print(len(docs))

107


In [6]:
# Peek at the first loaded document: the opening 200 characters of its text,
# one blank line, then its metadata dict.
print(docs[0].page_content[:200], end="\n\n")
print(docs[0].metadata)

Table of Contents
UNITED STATES
SECURITIES AND EXCHANGE COMMISSION
Washington, D.C. 20549
FORM 10-K
(Mark One)
☑  ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(D) OF THE SECURITIES EXCHANGE ACT OF 1934
F

{'producer': 'EDGRpdf Service w/ EO.Pdf 22.0.40.0', 'creator': 'EDGAR Filing HTML Converter', 'creationdate': '2023-07-20T16:22:00-04:00', 'title': '0000320187-23-000039', 'author': 'EDGAR Online, a division of Donnelley Financial Solutions', 'subject': 'Form 10-K filed on 2023-07-20 for the period ending 2023-05-31', 'keywords': '0000320187-23-000039; ; 10-K', 'moddate': '2023-07-20T16:22:08-04:00', 'source': './example_data/nke-10k-2023.pdf', 'total_pages': 107, 'page': 0, 'page_label': '1'}


In [7]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunk every loaded document: chunk_size=1000 with chunk_overlap=200 between
# neighbouring chunks. add_start_index=True records where each chunk begins in
# its source text (visible as 'start_index' in the chunk metadata).
text_splitter = RecursiveCharacterTextSplitter(
    add_start_index=True,
    chunk_overlap=200,
    chunk_size=1000,
)
all_splits = text_splitter.split_documents(documents=docs)

len(all_splits)

516

In [8]:
# Install the Ollama integration into the running kernel's environment. The
# explicit %pip magic does not depend on IPython's automagic setting.
%pip install -qU langchain-ollama

Note: you may need to restart the kernel to use updated packages.


In [9]:
from langchain_ollama import OllamaEmbeddings

# OllamaEmbeddings client for the "llama3" model. It is used below both
# directly (embed_query) and as the vector store's embedding function.
# NOTE(review): the recorded searches further down return the same
# "INTERNATIONAL MARKETS" chunk for three unrelated questions. Presumably a
# dedicated embedding model would retrieve better; confirm before relying on it.
embeddings = OllamaEmbeddings(model="llama3")

In [10]:
# Embed the first two chunks, in order, as a quick sanity check of the model.
vector_1, vector_2 = (
    embeddings.embed_query(all_splits[i].page_content) for i in (0, 1)
)

# Both embeddings must have the same number of dimensions.
assert len(vector_2) == len(vector_1)
print(f"Generated vectors of length {len(vector_1)}", end="\n\n")
print(vector_1[:10])

Generated vectors of length 4096

[-0.0045971535, -0.028713051, -0.013401889, 0.0038865888, -0.005400139, -0.025057498, -0.006401032, -0.006304333, -0.034139734, -0.00071351713]


In [11]:
# Install the Chroma integration into the running kernel's environment. The
# explicit %pip magic does not depend on IPython's automagic setting.
%pip install -qU langchain-chroma

Note: you may need to restart the kernel to use updated packages.


In [12]:
from langchain_chroma import Chroma

# Chroma collection that embeds text with the `embeddings` object created
# earlier. persist_directory is where the data is saved locally; drop that
# argument if on-disk persistence is not needed.
vector_store = Chroma(
    persist_directory="./chroma/chroma_langchain_db",
    embedding_function=embeddings,
    collection_name="example_collection",
)

In [13]:
# The store persists to disk, so re-running this cell with auto-generated ids
# would append another copy of every chunk. Deterministic ids (source path plus
# chunk position) let a re-run overwrite the existing entries instead.
# Entries written by earlier runs under random ids are not removed by this.
split_ids = [
    f"{split.metadata.get('source', 'doc')}:{position}"
    for position, split in enumerate(all_splits)
]
ids = vector_store.add_documents(documents=all_splits, ids=split_ids)

In [14]:
# Plain-text query against the store; show only the top-ranked chunk.
results = vector_store.similarity_search(
    query="How many distribution centers does Nike have in the US?"
)

print(results[0])

page_content='Table of Contents
INTERNATIONAL MARKETS
For fiscal 2023, non-U.S. NIKE Brand and Converse sales accounted for approximately 57% of total revenues, compared to 60% and 61% for fiscal 2022 and fiscal 2021,
respectively. We sell our products to retail accounts through our own NIKE Direct operations and through a mix of independent distributors, licensees and sales
representatives around the world. We sell to thousands of retail accounts and ship products from 67 distribution centers outside of the United States. Refer to Item 2.
Properties for further information on distribution facilities outside of the United States. During fiscal 2023, NIKE's three largest customers outside of the United States
accounted for approximately 14% of total non-U.S. sales.
In addition to NIKE-owned and Converse-owned digital commerce platforms in over 40 countries, our NIKE Direct and Converse direct to consumer businesses operate
the following number of retail stores outside the United States:

In [15]:
results = await vector_store.asimilarity_search("When was Nike incorporated?")

print(results[0])

page_content='Table of Contents
INTERNATIONAL MARKETS
For fiscal 2023, non-U.S. NIKE Brand and Converse sales accounted for approximately 57% of total revenues, compared to 60% and 61% for fiscal 2022 and fiscal 2021,
respectively. We sell our products to retail accounts through our own NIKE Direct operations and through a mix of independent distributors, licensees and sales
representatives around the world. We sell to thousands of retail accounts and ship products from 67 distribution centers outside of the United States. Refer to Item 2.
Properties for further information on distribution facilities outside of the United States. During fiscal 2023, NIKE's three largest customers outside of the United States
accounted for approximately 14% of total non-U.S. sales.
In addition to NIKE-owned and Converse-owned digital commerce platforms in over 40 countries, our NIKE Direct and Converse direct to consumer businesses operate
the following number of retail stores outside the United States:

In [16]:
# Scores are provider-specific. The value returned here is a distance metric,
# so it moves inversely to similarity.
results = vector_store.similarity_search_with_score(
    query="What was Nike's revenue in 2023?"
)
doc, score = results[0]
print(f"Score: {score}", end="\n\n")
print(doc)

Score: 0.708548903465271

page_content='Table of Contents
INTERNATIONAL MARKETS
For fiscal 2023, non-U.S. NIKE Brand and Converse sales accounted for approximately 57% of total revenues, compared to 60% and 61% for fiscal 2022 and fiscal 2021,
respectively. We sell our products to retail accounts through our own NIKE Direct operations and through a mix of independent distributors, licensees and sales
representatives around the world. We sell to thousands of retail accounts and ship products from 67 distribution centers outside of the United States. Refer to Item 2.
Properties for further information on distribution facilities outside of the United States. During fiscal 2023, NIKE's three largest customers outside of the United States
accounted for approximately 14% of total non-U.S. sales.
In addition to NIKE-owned and Converse-owned digital commerce platforms in over 40 countries, our NIKE Direct and Converse direct to consumer businesses operate
the following number of retail stores 

In [17]:
# Embed the question ourselves, then search the store with the raw vector
# instead of the text.
embedding = embeddings.embed_query(text="How were Nike's margins impacted in 2023?")

results = vector_store.similarity_search_by_vector(embedding=embedding)
print(results[0])

page_content='In recent years, uncertain global and regional economic and political conditions have affected international trade and increased protectionist actions around the
world. These trends are affecting many global manufacturing and service sectors, and the footwear and apparel industries, as a whole, are not immune. Companies in our
industry are facing trade protectionism in many different regions, and, in nearly all cases, we are working together with industry groups to address trade issues and reduce
the impact to the industry, while observing applicable competition laws. Notwithstanding our efforts, protectionist measures have resulted in increases in the cost of our
products, and additional measures, if implemented, could adversely affect sales and/or profitability for NIKE, as well as the imported footwear and apparel industry as a
whole.' metadata={'creationdate': '2023-07-20T16:22:00-04:00', 'start_index': 2900, 'page': 6, 'author': 'EDGAR Online, a division of Donnelley