In [3]:
from langchain.document_loaders import PyPDFLoader

model = "llama3.2"
doc_path = "../samples/test_hardest.pdf"

In [4]:
loader = PyPDFLoader(file_path=doc_path)
data = loader.load()

In [5]:
data[0].page_content[:40]

''

## Document Loading (PDF example)

- Document Loading + extract text (image also): <a href="https://python.langchain.com/docs/how_to/document_loader_pdf/">More detail to extract in future</a>
- Document type: <a href="https://python.langchain.com/api_reference/core/documents/langchain_core.documents.base.Document.html">Document</a>



In [6]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

"""
Here, the RecursiveCharacterTextSplitter class is imported and an instance is created with specific parameters:

chunk_size=512: Each chunk will have a maximum of 512 characters.
chunk_overlap=64: Chunks will overlap by 64 characters to ensure context is preserved between chunks.
add_start_index=True: This might be used to keep track of the starting index of each chunk in the original text.
"""
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=64,
    add_start_index=True,
)

# Remember this will return a list of Document type (which main component for extract data)
all_splits = text_splitter.split_documents(data)

In [7]:
len(all_splits)

522

In [8]:
all_splits[0].page_content

"Explaining9typesofAPItesting 7Howisdatasentovertheinternet?WhatdoesthathavetodowiththeOSImodel?HowdoesTCP/IPfitintothis? 10Top5commonwaystoimproveAPIperformance 11Thereareover1,000engineeringblogs.Herearemytop9favorites: 15RESTAPIAuthenticationMethods 16LinuxBootProcessIllustrated 18Netflix'sTechStack 22WhatdoesACIDmean? 26Oauth2.0ExplainedWithSimpleTerms 28TheEvolvingLandscapeofAPIProtocolsin2023 30LinuxbootProcessExplained 32Explaining8PopularNetworkProtocolsin1Diagram. 34DataPipelinesOverview"

## Embeddings

In [9]:
from langchain_ollama import OllamaEmbeddings

# embeddings = OllamaEmbeddings(model="nomic-embed-text")
embeddings = OllamaEmbeddings(model="llama3.2:1b")

In [10]:
import time
vector_stores = []
start_time = time.time()

for split in all_splits:
    vector_stores.append(embeddings.embed_query(split.page_content))

end_time = time.time()
print(f"Execution time: {end_time - start_time} seconds")

Execution time: 133.44402623176575 seconds


In [11]:
len(vector_stores)

522

In [12]:
len(vector_stores[0])

2048

In [13]:
import sys
sys.getsizeof(vector_stores)

4792

In [14]:
# Add this new cell to convert bytes to megabytes
byte_size = 4792
mb_size = byte_size / 1048576
print(f"Byte size: {byte_size} bytes")
print(f"Size in megabytes: {mb_size} MB")

Byte size: 4792 bytes
Size in megabytes: 0.00457000732421875 MB


In [18]:
import chromadb

In [19]:
client = chromadb.HttpClient(host="localhost", port=8000)

In [22]:
test_collection = client.get_or_create_collection("test_collection")


AttributeError: 'Collection' object has no attribute 'add_vectors'