In [25]:
# https://python.langchain.com/docs/integrations/document_loaders/pypdfloader/
# https://python.langchain.com/api_reference/text_splitters/character/langchain_text_splitters.character.RecursiveCharacterTextSplitter.html#recursivecharactertextsplitter
# https://python.langchain.com/docs/integrations/text_embedding/

from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters.character import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings

In [26]:
import getpass
import os

# Prompt for the OpenAI key only when the environment does not already hold a
# non-empty value, so the secret is never written into the notebook itself.
openai_key_present = bool(os.environ.get("OPENAI_API_KEY"))
if not openai_key_present:
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI: ")

In [27]:
# The "./" prefix makes this path relative to the current working directory.
pdf_path = "./tictactoe_user_stories.pdf"
loader = PyPDFLoader(pdf_path)

In [28]:
# Load the PDF into LangChain Document objects without splitting them here.
# `load()` replaces `load_and_split()`: the latter is documented as deprecated
# and would pre-chunk the text with a default splitter, so the explicit
# RecursiveCharacterTextSplitter pass later in this notebook would end up
# splitting already-split text.
pages = loader.load()

In [29]:
# Peek at the raw text of the first loaded document to sanity-check extraction.
first_document = pages[0]
first_document.page_content

'Emmerson-Miranda adding user stories bb4bded \xa0·\xa0 5 months ago\n62 lines (41 loc) · 3.67 KB\nThis document contains high-level user stories derived from the requirements\nanalysis for the Tic-Tac-Toe Web Application project. Each story follows the\nCucumber format and adheres to SCRUM guidelines. These user stories represent\nboth functional and non-functional requirements and are intended to provide a\nstructured framework for software developers to implement the application\nincrementally.\nAs a Google user, I want to log in using my Google account, so that my identity is\nauthenticated securely and my progress can be tracked.\nEmmerson-Miranda prompts\nCode Issues Pull requests Actions Projects Wiki Security\nmain\nprompts / user _ stories\n/ user _ stories _ generated.md\nHigh-Level User Stories for Tic-T ac-Toe\nWeb Application\nIntroduction\nUser Stories\n1. User Authentication\n2. User Data Collection\nPreview Code Blame Raw\nprompts/user_stories/user_stories_generated.md 

In [30]:
# Splitter settings gathered in one place so they are easy to tune:
# chunk_size caps the length of each chunk and chunk_overlap is how much
# neighbouring chunks share.
splitter_options = {"chunk_size": 400, "chunk_overlap": 50}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Re-split the loaded documents into smaller chunks for embedding.
chunks = text_splitter.split_documents(pages)

In [31]:
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# Only the text of each chunk is passed to the embedding call, in chunk order.
chunk_texts = [chunk.page_content for chunk in chunks]
doc_embeddings = embeddings.embed_documents(chunk_texts)

In [32]:
# How many embedding vectors came back from the embedding call.
embedding_count = len(doc_embeddings)
embedding_count

13

In [34]:
# Show the first ten components of the first embedding vector as a quick check.
first_embedding = doc_embeddings[0]
first_embedding[:10]

[0.017087478190660477,
 0.02316758967936039,
 -0.016313645988702774,
 0.009025410749018192,
 0.03213772550225258,
 0.05303119495511055,
 0.007750166580080986,
 0.0373966284096241,
 -0.002621554071083665,
 0.03967074677348137]