## DOC Loader

In [22]:
from langchain_community.document_loaders import PyPDFLoader
from dotenv import load_dotenv
# Read variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the OpenAI API key used by the
# embeddings cell further down — confirm.
load_dotenv()

# mode="single" makes the loader hand back the entire PDF as one Document
# (the recorded output shows a single Document for a 6-page file), so the
# splitter works on the full text instead of on per-page fragments.
file_path = "./azure-doc.pdf"
loader = PyPDFLoader(file_path=file_path, mode="single")
docs = loader.load()

# Sanity check on the number of loaded Documents, then preview the raw text.
print(len(docs))
docs[0].page_content

1


'1. Introduction \nMicrosoft Azure provides a broad set of GPU-accelerated virtual machines optimized for AI \ntraining, inference, graphics rendering, HPC, and data processing. GPU families on Azure \ninclude the NC, ND, NV , and NCads/NDas/Lsv3 variants, covering a wide range of NVIDIA \nGPUs such as A100, H100, L40S, A10, T4, and previous generations. \nAzure’s GPU portfolio is designed to meet different levels of performance, memory \nrequirements, scalability, and cost, making it suitable for LLM training, fine-tuning, high-\nthroughput inference, and visualization workloads. \n2. GPU VM Families on Azure \n2.1 NC Series – Compute-Optimized for AI Training \n• Best for: Deep learning training, scientific computing. \n• GPU models: NVIDIA A100 80GB, V100, K80. \n• Examples: \no Standard_NC24ads_A100_v4 – 1× A100 80GB. \no Standard_NC96ads_A100_v4 – 4× A100 80GB (multi-GPU node). \n• Use cases: \no Transformer/LLM training (up to 70B parameters). \no Computer vision training. \no Re

## DOC Splitter

In [23]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Section-based splitting.
#
# The only separator is a regex: a line break (optionally preceded by \r)
# that is immediately followed by digits, a dot and whitespace, i.e. a
# top-level numbered heading such as "2. ". Sub-headings like "2.1 " do not
# match because the dot is followed by a digit, not whitespace. The heading
# is matched by a lookahead, so its text stays in the chunk it introduces.
#
# chunk_size=1 does not yield one-character chunks here: the recorded output
# shows whole sections as chunks.
# NOTE(review): presumably the tiny size just prevents neighbouring sections
# from being merged back together — confirm against the splitter's docs.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1,
    chunk_overlap=0,
    is_separator_regex=True,
    separators=[r"\r?\n(?=\d+\.\s+)"],
)

all_splits = text_splitter.split_documents(docs)

# Number of section chunks produced from the document.
print(len(all_splits))

13


In [24]:
# Display every chunk (text and metadata) to inspect where the splitter cut
# the document.
all_splits

[Document(metadata={'producer': 'Microsoft® Word LTSC', 'creator': 'Microsoft® Word LTSC', 'creationdate': '2025-11-17T23:51:16-03:00', 'author': 'Emerson Faria', 'moddate': '2025-11-17T23:51:16-03:00', 'source': './azure-doc.pdf', 'total_pages': 6}, page_content='1. Introduction \nMicrosoft Azure provides a broad set of GPU-accelerated virtual machines optimized for AI \ntraining, inference, graphics rendering, HPC, and data processing. GPU families on Azure \ninclude the NC, ND, NV , and NCads/NDas/Lsv3 variants, covering a wide range of NVIDIA \nGPUs such as A100, H100, L40S, A10, T4, and previous generations. \nAzure’s GPU portfolio is designed to meet different levels of performance, memory \nrequirements, scalability, and cost, making it suitable for LLM training, fine-tuning, high-\nthroughput inference, and visualization workloads. '),
 Document(metadata={'producer': 'Microsoft® Word LTSC', 'creator': 'Microsoft® Word LTSC', 'creationdate': '2025-11-17T23:51:16-03:00', 'author'

## Embeddings and Vector store

In [25]:
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

# Embedding model used for both indexing and querying.
# To try the larger model, swap in model="text-embedding-3-large".
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Deterministic, position-based ids make this cell safe to re-run.
# Without explicit ids the store assigns fresh random ones on every call, so
# running the cell again against the same persist_directory would insert the
# same chunks a second time and similarity searches would return duplicates.
# With stable ids a re-run overwrites the existing entries instead.
# NOTE: entries written by earlier runs under random ids are not removed by
# this change; delete the persist directory once to start from a clean store.
chunk_ids = [
    f"{doc.metadata.get('source', 'doc')}-{position}"
    for position, doc in enumerate(all_splits)
]

vectorstore = Chroma.from_documents(
    documents=all_splits,          # section chunks produced by the splitter
    embedding=embeddings,          # OpenAIEmbeddings instance defined above
    ids=chunk_ids,                 # stable ids -> upsert instead of duplicate insert
    persist_directory="../app/data/vec_database/chroma_db",  # folder to persist locally
)

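### Metadata

Each stored chunk pairs its text with the metadata inherited from the PDF, in the following shape (values are illustrative):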

```JSON
{
    "content": "...",
    "metadata": {
        "creator": "Microsoft® Word LTSC",
        "moddate": "2025-11-15T12:10:12-03:00",
        "total_pages": 6,
        "creationdate": "2025-11-15T12:10:12-03:00",
        "source": "./azure-doc.pdf",
        "producer": "Microsoft® Word LTSC",
        "author": "Emerson Faria"
    }
}
```

In [21]:
# Retrieve the 4 chunks most relevant to the query, each paired with its
# relevance score. This calls the public method instead of the
# underscore-prefixed internal one: it is the supported entry point and also
# warns when a score falls outside the expected 0-1 range.
vectorstore.similarity_search_with_relevance_scores("Azure GPU Pricing Overview", k=4)

[(Document(metadata={'moddate': '2025-11-17T23:51:16-03:00', 'producer': 'Microsoft® Word LTSC', 'source': './azure-doc.pdf', 'total_pages': 6, 'author': 'Emerson Faria', 'creator': 'Microsoft® Word LTSC', 'creationdate': '2025-11-17T23:51:16-03:00'}, page_content='\n3. Azure GPU Pricing Overview \nAzure GPU pricing varies by: \n• Region \n• GPU type \n• VM family \n• CPU/RAM configuration \n• Networking capabilities \n• Purchase model (Pay-As-You-Go, Reserved Instances, Spot) \nA100 80GB \n• Standard_NC24ads_A100_v4: $3.40/hour \n• Standard_ND96asr_A100_v4: $28.50/hour (8 GPUs) \nL40S \n• Standard_NV24as_v4: $2.10/hour \nA10 \n• Standard_NVadsA10_v5: $1.40/hour \nT4 \n• Standard_NC4as_T4_v3: $0.85/hour '),
  0.6779065344218039),
 (Document(metadata={'source': './azure-doc.pdf', 'moddate': '2025-11-17T23:51:16-03:00', 'total_pages': 6, 'producer': 'Microsoft® Word LTSC', 'author': 'Emerson Faria', 'creator': 'Microsoft® Word LTSC', 'creationdate': '2025-11-17T23:51:16-03:00'}, page_con