Source: https://python.langchain.com/docs/tutorials/retrievers/


In [None]:
# type: ignore
import os
from pathlib import Path

import dotenv
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents import Document
from langchain_mongodb import MongoDBAtlasVectorSearch
from langchain_ollama import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

Set the project path

In [None]:
# Locate the project root so that data and secret files can be addressed
# relative to it, independently of the directory the notebook runs from.
current_dir = Path(os.getcwd())

# Entries whose presence in a directory marks that directory as the root.
# (An alternative strategy is to match an ancestor by its directory name.)
ROOT_MARKERS = (".git", ".project-root")

# Walk upwards through every ancestor of the working directory and stop at the
# first one containing a marker. Each candidate `p` must be inspected itself;
# checking only `current_dir.parent` would miss roots more than one level up.
for p in current_dir.parents:
    if any((p / marker).exists() for marker in ROOT_MARKERS):
        root_dir = p
        print(root_dir)
        break
else:
    # The loop finished without `break`: no ancestor carries a marker.
    raise FileNotFoundError(
        "No root directory was found that contains a .git directory or a .project-root file"
    )

In [None]:
# Load the variables defined in <root>/.secrets/.env into the process
# environment.
f = root_dir / ".secrets" / ".env"
# Raise explicitly instead of asserting: assertions are removed when Python
# runs with -O, which would let a missing file pass unnoticed.
if not f.exists():
    raise FileNotFoundError(f"File not found: {f}")
dotenv.load_dotenv(f)

## <span style='color:Orange;'>LangChain Document Object</span>

LangChain implements a Document abstraction, which is intended to represent a unit of text and associated metadata. It has three attributes:

- `page_content`: a string representing the content;
- `metadata`: a dict containing arbitrary metadata;
- `id`: (optional) a string identifier for the document.

The `metadata` attribute can capture **information about the source** of the document, its **relationship to other documents**, and other information.

> _Note that an individual Document object often represents a chunk of a larger document._

In [None]:
# Hand-written sample texts; each becomes one Document tagged with the same
# "source" value.
pet_facts = (
    "Dogs are great companions, known for their loyalty and friendliness.",
    "Cats are independent pets that often enjoy their own space.",
)

# A fresh metadata dict is created per Document so the two do not share state.
documents = [
    Document(page_content=fact, metadata={"source": "mammal-pets-doc"})
    for fact in pet_facts
]

### <span style='color:Khaki;'>Document loaders</span>

DocumentLoaders load data into the standard LangChain Document format.

https://python.langchain.com/docs/integrations/document_loaders/


See this guide for more detail on PDF document loaders.

https://python.langchain.com/docs/how_to/document_loader_pdf/

In [None]:
# Load the sample PDF into a list of LangChain Documents.
file_path = root_dir / "data" / "vmd_sample.pdf"
# Fail with a descriptive error (not a bare assert, which -O strips) when the
# sample file is missing.
if not file_path.exists():
    raise FileNotFoundError(f"File not found: {file_path}")
loader = PyPDFLoader(file_path)

docs = loader.load()

print(f"{len(docs)=}")
if docs:
    # Preview the Document at index 37 when it exists; for shorter PDFs fall
    # back to the last Document instead of raising IndexError.
    preview_index = min(37, len(docs) - 1)
    print("CONTENT")
    print(f"{docs[preview_index].page_content[:200]}\n")
    print(docs[0].metadata)

#### <span style='color:LightGreen;'>Splitting Text</span>

Further splitting the PDF will help ensure that the meanings of relevant portions of the document are not "washed out" by surrounding text.

We will split our documents **into chunks of 1000 characters with 200 characters of overlap** between chunks. The overlap helps mitigate the possibility of separating a statement from important context related to it. We use the [`RecursiveCharacterTextSplitter`](https://python.langchain.com/docs/how_to/recursive_text_splitter/), which will recursively split the document using common separators like new lines until each chunk is the appropriate size. This is the recommended text splitter for generic text use cases.

In [None]:
# Chunking parameters, both measured in characters.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

# `add_start_index=True` preserves the character index where each split
# Document starts within the initial Document, as the metadata attribute
# "start_index".
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    add_start_index=True,
)
all_splits = text_splitter.split_documents(docs)

# Notebook display: number of chunks produced.
len(all_splits)

## <span style='color:Orange;'>Embeddings</span>

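Vector search is a common way to store and search over unstructured data (such as unstructured text). The idea is to store numeric vectors that are associated with the text. Given a query, we can embed it as a vector of the same dimension and use a vector similarity metric (such as cosine similarity) to identify related text.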


### <span style='color:Khaki;'>Installing Ollama</span>

- [Installing Ollama](https://github.com/ollama/ollama?tab=readme-ov-file#ollama)

- [Available models](https://ollama.com/search)

In [None]:
# Name of the Ollama model used to produce the embeddings.
EMBEDDING_MODEL_NAME = "llama3.2"
embeddings_model = OllamaEmbeddings(model=EMBEDDING_MODEL_NAME)

In [None]:
# Embedding example: embed the first two chunks and compare the vectors.
first_text = all_splits[0].page_content
vector_1 = embeddings_model.embed_query(first_text)
second_text = all_splits[1].page_content
vector_2 = embeddings_model.embed_query(second_text)

# Both vectors come from the same model, so their lengths are compared here.
vector_length = len(vector_1)
lengths_match = vector_length == len(vector_2)

print("Both embedding have same length:", lengths_match)
print(f"Generated vectors of length {vector_length}\n")
# Show only the first few components of the first vector.
print(vector_1[:5])

## <span style='color:Orange;'>Vector stores</span>

[Available DBs](https://python.langchain.com/docs/integrations/vectorstores/)

pip install "pymongo[srv]"

In [None]:
# https://www.mongodb.com/docs/manual/reference/connection-string/
# https://swethag04.medium.com/rag-using-mongodb-atlas-vector-search-and-langchain-cba57b67fe29

# Connection string is read from the environment.
uri = os.getenv("MONGODB_URI")
# Fail fast when the variable is missing or empty: MongoClient(None) would
# otherwise fall back to its default host instead of the intended deployment.
if not uri:
    raise RuntimeError("Environment variable MONGODB_URI is not set")

# Create a new client and connect to the server
client = MongoClient(uri, server_api=ServerApi("1"))

# Send a ping to confirm a successful connection. This is a best-effort check:
# a failure is reported by printing the error, and execution continues.
try:
    client.admin.command("ping")
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    print(e)

In [None]:
# https://www.mongodb.com/docs/atlas/atlas-vector-search/vector-search-type/

# Names of the database, the collection, and the vector search index.
DB_NAME = "langchain_demo"
COLLECTION_NAME = "emp-policy"
ATLAS_VECTOR_SEARCH_INDEX_NAME = "vector_index"

# Handles obtained from the client by name.
db = client[DB_NAME]
MONGODB_COLLECTION = db[COLLECTION_NAME]

In [None]:
# Vector store over the MongoDB collection, using the Ollama embedding model,
# the named search index, and "cosine" as the relevance score function.
vector_store = MongoDBAtlasVectorSearch(
    collection=MONGODB_COLLECTION,
    embedding=embeddings_model,
    relevance_score_fn="cosine",
    index_name=ATLAS_VECTOR_SEARCH_INDEX_NAME,
)