# Setting up Environment

In [19]:
# Setting up API and environment
import os
import getpass
import asyncio
from dotenv import load_dotenv

# Setting environment
# Read the variables defined in the .env file at ENV_PATH via load_dotenv.
# NOTE(review): ENV_PATH is an absolute, machine-specific path; anyone else
# running this notebook has to point it at their own .env file.
ENV_PATH = "/Users/divyanshusinghania/Documents/Github/LangChain/.env"
load_dotenv(ENV_PATH)

# Use .get() rather than indexing: os.environ["OPENAI_API_KEY"] raises KeyError
# when the variable is absent, which is exactly the case the interactive prompt
# is meant to handle. An empty value is treated as missing as well.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI: ")


# Document Creator

In [3]:
# Here we define what a Document is in code
# Used for in-memory operations
from langchain_core.documents import Document

# Hand-written sample texts; each one becomes its own Document below, and all
# of them carry the same "source" tag in their metadata.
_PET_FACTS = (
    "Dogs are great companions, known for their loyalty and friendliness.",
    "Cats are independent pets that often enjoy their own space.",
)

documents = [
    Document(page_content=fact, metadata={"source": "mammal-pets-doc"})
    for fact in _PET_FACTS
]

# Bare expression so the notebook shows the list as the cell output.
documents

[Document(metadata={'source': 'mammal-pets-doc'}, page_content='Dogs are great companions, known for their loyalty and friendliness.'),
 Document(metadata={'source': 'mammal-pets-doc'}, page_content='Cats are independent pets that often enjoy their own space.')]

# Data Loaders in LangChain

In [None]:
# Document Loaders - used to connect to a data source and load its data
# - Types:
# - File (CSV, JSON, EXCEL) Based, Database, API Loader -> Structured Data Loader
# - File (TEXT, PDF PLUMBER, UNSTRUCTURED PDF), Web Pages -> Unstructured Text Loader
# - Cloud Based (S3, GCS, AZURE), Enterprise Knowledge Base -> Specialized and cloud
from langchain_community.document_loaders import PyPDFLoader

# Location of the PDF to ingest (an absolute path on the author's machine).
file_path = "/Users/divyanshusinghania/Documents/ML_Learning/Reaserch Papers/2503.11651v1.pdf"

# Build the loader for that file and read everything it yields into `docs`.
loader = PyPDFLoader(file_path)
docs = loader.load()

# Report how many Document entries were loaded.
num_docs = len(docs)
print(num_docs)

20


In [7]:
# Peek at the first loaded document: the first 200 characters of its text,
# a blank line, and then its metadata dictionary.
first_doc = docs[0]
preview = first_doc.page_content[:200]
print(f"{preview}\n")
print(first_doc.metadata)

VGGT: Visual Geometry Grounded Transformer
Jianyuan Wang1,2 Minghao Chen1,2 Nikita Karaev1,2 Andrea Vedaldi1,2
Christian Rupprecht1 David Novotny2
1Visual Geometry Group, University of Oxford 2Meta AI

{'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-03-17T01:12:06+00:00', 'author': '', 'keywords': '', 'moddate': '2025-03-17T01:12:06+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': '/Users/divyanshusinghania/Documents/ML_Learning/Reaserch Papers/2503.11651v1.pdf', 'total_pages': 20, 'page': 0, 'page_label': '1'}


# Chunking/Splitting

In [8]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Configure the splitter: chunks of up to 1000 characters with a 200-character
# overlap between neighbours, with add_start_index switched on.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    add_start_index=True,
)

# Apply it to every loaded document.
all_splits = text_splitter.split_documents(docs)

# Number of chunks produced (shown as the cell output).
len(all_splits)

119

# Embedding Model

Note: LangSmith does not trace embeddings by default. I will change this later; for now, embeddings and the vector store (when used without a retriever) are not being traced.

## OpenAI Paid

In [13]:
# We are using an OpenAI embedding model
# - "text-embedding-3-large" is $0.13 per million tokens
# - "text-embedding-3-small" is $0.02 per million tokens
from langchain_openai import OpenAIEmbeddings

# Name of the OpenAI embedding model used throughout this notebook.
# NOTE(review): the saved output of the next cell reports vectors of length
# 3072, which looks stale for this model (presumably produced by an earlier run
# with a different model) - re-run the cell to confirm.
OPENAI_EMBEDDING_MODEL = "text-embedding-3-small"
embed_openai = OpenAIEmbeddings(model=OPENAI_EMBEDDING_MODEL)

In [None]:
# Embed the text of the first two chunks with the OpenAI model.
Vector_OpenAI_1, Vector_OpenAI_2 = [
    embed_openai.embed_query(all_splits[idx].page_content) for idx in (0, 1)
]

# Sanity check: both embeddings must have the same number of components.
assert len(Vector_OpenAI_1) == len(Vector_OpenAI_2)

print(f"Generated vectors of length {len(Vector_OpenAI_1)}\n")
# Only the first ten components, to keep the output short.
print(Vector_OpenAI_1[:10])

Generated vectors of length 3072

[-0.00047509788419120014, 0.028241749852895737, -0.024183286353945732, -0.0052824439480900764, 0.026876045390963554, 0.020614415407180786, -0.00576559454202652, 0.03540525957942009, -0.024505387991666794, 0.021606484428048134]


## Ollama - Free (Local Serving, Laptop-Intensive)


### Setting Up Ollama Locally for LangChain

This note provides a quick revision guide to install and configure Ollama on your local machine so you can use the `OllamaEmbeddings` integration with LangChain.

---

### Prerequisites

- **Operating System:** macOS, Windows, or Linux (or WSL)
- **Python:** Version 3.11 or later
- **Ollama Installer:** Download from [Ollama's website](https://ollama.ai/)

---

### Installation Steps

#### 1. Download and Install Ollama

- **Download:** Visit [Ollama](https://ollama.ai/) and download the installer for your OS.
- **Install:** Follow the provided installation instructions.

#### 2. Start the Ollama Server

- **Open Terminal/Command Prompt**
- Run the command:
  ```bash
  ollama serve
  ```


In [14]:
from langchain_ollama import OllamaEmbeddings

# Name of the Ollama model used to produce embeddings.
# NOTE(review): llama3.2 appears to be a general-purpose chat model; a dedicated
# embedding model may be a better fit - confirm before relying on these vectors.
OLLAMA_EMBEDDING_MODEL = "llama3.2:latest"
embed_ollama = OllamaEmbeddings(model=OLLAMA_EMBEDDING_MODEL)

In [None]:
# Embed the text of the first two chunks with the Ollama model.
Vector_Ollama_1, Vector_Ollama_2 = [
    embed_ollama.embed_query(all_splits[idx].page_content) for idx in (0, 1)
]

# Sanity check: both embeddings must have the same number of components.
assert len(Vector_Ollama_1) == len(Vector_Ollama_2)

print(f"Generated vectors of length {len(Vector_Ollama_1)}\n")
# Only the first ten components, to keep the output short.
print(Vector_Ollama_1[:10])

Generated vectors of length 3072

[0.015058594, 0.0008505595, 0.015283242, 0.00901549, -0.021684092, -0.014709408, -0.03411613, 0.021666866, -0.0024111453, -0.0067152134]


## Hugging Face Integration (currently clashing with NumPy; will resolve later)

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings

# Name of the Hugging Face model used to produce embeddings.
HF_EMBEDDING_MODEL = "sentence-transformers/all-mpnet-base-v2"
embed_HF = HuggingFaceEmbeddings(model_name=HF_EMBEDDING_MODEL)

In [None]:
# Embed the text of the first two chunks with the Hugging Face model.
Vector_HF_1, Vector_HF_2 = [
    embed_HF.embed_query(all_splits[idx].page_content) for idx in (0, 1)
]

# Sanity check: both embeddings must have the same number of components.
assert len(Vector_HF_1) == len(Vector_HF_2)

print(f"Generated vectors of length {len(Vector_HF_1)}\n")
# Only the first ten components, to keep the output short.
print(Vector_HF_1[:10])

# Vector Store

Note: LangSmith does not trace embeddings by default. I will change this later; for now, embeddings and the vector store (when used without a retriever) are not being traced.

## In-Memory

In [20]:
# This is LangChain's own implementation of an in-memory vector store
from langchain_core.vectorstores import InMemoryVectorStore

# Just to save some cost on the OpenAI API: keep only the first two chunks.
# NOTE: this rebinds `all_splits`, so every cell executed after this one sees
# the two-element list rather than the full set of chunks.
all_splits = all_splits[:2]

# One in-memory store per embedding backend.
vector_store_OpenAI = InMemoryVectorStore(embed_openai)
vector_store_Ollama = InMemoryVectorStore(embed_ollama)

# Add the (truncated) chunks to each store and keep what add_documents returns.
ids_OpenAI = vector_store_OpenAI.add_documents(documents=all_splits)
ids_Ollama = vector_store_Ollama.add_documents(documents=all_splits)

In [21]:
from langsmith import utils
# Ask LangSmith whether tracing is enabled; the bare name on the last line
# makes the notebook display the answer as the cell output.
tracing_enabled = utils.tracing_is_enabled()
tracing_enabled

True