In [1]:
from langchain_core.documents import Document

In [11]:
import os
from langchain.chat_models import init_chat_model

# Fail fast when the OpenAI credential is missing (unset or empty), so the
# problem surfaces here rather than inside a later cell that calls the API.
if not os.environ.get("OPENAI_API_KEY"):
    # `raise` needs an exception class or instance; raising a bare string
    # produces "TypeError: exceptions must derive from BaseException" and
    # hides the intended message.
    raise RuntimeError("No OpenAI key found")

In [2]:
# Two hand-written sample documents that share the same "source" tag.
# Each Document receives its own freshly built metadata dict.
documents = [
    Document(page_content=text, metadata={"source": "mammal-pets-doc"})
    for text in (
        "Dogs are great companions, known for their loyalty and friendliness.",
        "Cats are independent pets that often enjoy their own space.",
    )
]

In [6]:
from langchain_community.document_loaders import PyPDFLoader

# Load the PDF at `file_path` (relative to the notebook's working directory)
# into `docs`, which later cells index and split.
file_path = "data/nike.pdf"
loader = PyPDFLoader(file_path)
docs = loader.load()
# The recorded run printed 107, matching the 'total_pages' value shown in the
# document metadata — presumably one Document per PDF page.
print(len(docs))

107


In [9]:
# Display the metadata dict of the second loaded document (index 1); the
# recorded output reports 'page': 1 and 'page_label': '2'.
docs[1].metadata

{'producer': 'EDGRpdf Service w/ EO.Pdf 22.0.40.0',
 'creator': 'EDGAR Filing HTML Converter',
 'creationdate': '2023-07-20T16:22:00-04:00',
 'title': '0000320187-23-000039',
 'author': 'EDGAR Online, a division of Donnelley Financial Solutions',
 'subject': 'Form 10-K filed on 2023-07-20 for the period ending 2023-05-31',
 'keywords': '0000320187-23-000039; ; 10-K',
 'moddate': '2023-07-20T16:22:08-04:00',
 'source': 'data/nike.pdf',
 'total_pages': 107,
 'page': 1,
 'page_label': '2'}

In [10]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter configuration: chunk size 1000 with an overlap of 200 between
# neighbouring chunks. With add_start_index=True the recorded chunk metadata
# carries a 'start_index' entry.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1_000,
    chunk_overlap=200,
    add_start_index=True,
)

# Split every loaded document into chunks; later cells embed and index these.
all_splits = text_splitter.split_documents(docs)

# Bare final expression so the notebook displays the number of chunks.
len(all_splits)

516

In [12]:
from langchain_openai import OpenAIEmbeddings

# Embedding client used for the sanity check below and handed to the vector
# store in a later cell.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# Embed the text of the first two chunks (one embed_query call each).
vector_1 = embeddings.embed_query(all_splits[0].page_content)
vector_2 = embeddings.embed_query(all_splits[1].page_content)

# Sanity check: both vectors must have the same length.
assert len(vector_1) == len(vector_2)
# The recorded run reported a length of 3072.
print(f"Generated vectors of length {len(vector_1)}\n")
# Show only the first ten components rather than the whole vector.
print(vector_1[:10])

Generated vectors of length 3072

[0.009274279698729515, -0.01592199318110943, 0.0002812223683577031, 0.006453848443925381, 0.0206247977912426, -0.039248403161764145, -0.007454445119947195, 0.041074492037296295, -0.008004773408174515, 0.059935737401247025]


In [16]:
# Display the metadata dict of the second chunk (index 1). The recorded output
# keeps the loader's keys and adds 'start_index' — presumably because the
# splitter was created with add_start_index=True.
all_splits[1].metadata

{'producer': 'EDGRpdf Service w/ EO.Pdf 22.0.40.0',
 'creator': 'EDGAR Filing HTML Converter',
 'creationdate': '2023-07-20T16:22:00-04:00',
 'title': '0000320187-23-000039',
 'author': 'EDGAR Online, a division of Donnelley Financial Solutions',
 'subject': 'Form 10-K filed on 2023-07-20 for the period ending 2023-05-31',
 'keywords': '0000320187-23-000039; ; 10-K',
 'moddate': '2023-07-20T16:22:08-04:00',
 'source': 'data/nike.pdf',
 'total_pages': 107,
 'page': 0,
 'page_label': '1',
 'start_index': 781}

In [17]:
from langchain_core.vectorstores import InMemoryVectorStore

# In-memory vector store that uses the embedding client created earlier.
vector_store = InMemoryVectorStore(embedding=embeddings)
# Index every chunk and keep whatever add_documents returns (presumably the
# stored documents' IDs).
ids = vector_store.add_documents(all_splits)

In [18]:
# Query the vector store with a natural-language question using the async
# search method. The top-level `await` needs an environment that allows it
# (e.g. a notebook cell); it is a syntax error in a plain Python script.
results = await vector_store.asimilarity_search("When was Nike incorporated?")

# Print the first returned document; in the recorded run its text contains
# "NIKE, Inc. was incorporated in 1967".
print(results[0])

page_content='Table of Contents
PART I
ITEM 1. BUSINESS
GENERAL
NIKE, Inc. was incorporated in 1967 under the laws of the State of Oregon. As used in this Annual Report on Form 10-K (this "Annual Report"), the terms "we," "us," "our,"
"NIKE" and the "Company" refer to NIKE, Inc. and its predecessors, subsidiaries and affiliates, collectively, unless the context indicates otherwise.
Our principal business activity is the design, development and worldwide marketing and selling of athletic footwear, apparel, equipment, accessories and services. NIKE is
the largest seller of athletic footwear and apparel in the world. We sell our products through NIKE Direct operations, which are comprised of both NIKE-owned retail stores
and sales through our digital platforms (also referred to as "NIKE Brand Digital"), to retail accounts and to a mix of independent distributors, licensees and sales' metadata={'producer': 'EDGRpdf Service w/ EO.Pdf 22.0.40.0', 'creator': 'EDGAR Filing HTML Converter', 'cr