In [18]:
from pdf_rag.utils import rag_chain, process_pdfs_from_folder
import gradio as gr
import os
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_community.embeddings import OllamaEmbeddings
from langchain_ollama import ChatOllama
import ollama


In [19]:
import getpass

# Set the LANGSMITH_TRACING flag in this process's environment.
os.environ.update({"LANGSMITH_TRACING": "true"})

In [20]:
# Ask for the LangSmith API key only when it is not already present in the
# environment, so re-running the notebook (or running it with the key
# exported beforehand) does not prompt again. getpass keeps the key out of
# the notebook source and its saved output.
if not os.environ.get("LANGSMITH_API_KEY"):
    os.environ["LANGSMITH_API_KEY"] = getpass.getpass("LangSmith API key: ")

In [21]:
def get_embedding_function(model: str = "deepseek-r1:1.5b") -> OllamaEmbeddings:
    """Build the Ollama embedding client handed to the vector store.

    Args:
        model: Name of the Ollama model used to produce embeddings. The
            default preserves the notebook's existing choice.

    Returns:
        An ``OllamaEmbeddings`` instance configured for ``model``.
    """
    # NOTE(review): the default is a chat/reasoning model; a dedicated
    # embedding model (e.g. "nomic-embed-text") is likely a better fit for
    # retrieval — confirm before indexing the full corpus.
    return OllamaEmbeddings(model=model)

## Indexing

### Load documents

In [22]:
# Load the PDFs found under the Wikipedia data folder.
loader = PyPDFDirectoryLoader("data/wikipedia/")
docs = loader.load()

# Fail fast: a wrong or empty folder produces an empty list rather than an
# error, which would otherwise only surface later during splitting/indexing.
if not docs:
    raise FileNotFoundError("No PDF documents were loaded from data/wikipedia/")

### Split

In [23]:
# Chunk the loaded documents: 3000-character chunks, 150 characters of
# overlap between neighbours, and each chunk records its start offset in
# the source document.
text_splitter = RecursiveCharacterTextSplitter(
    add_start_index=True,
    chunk_overlap=150,
    chunk_size=3000,
)
all_splits = text_splitter.split_documents(documents=docs)

print(f"Split blog post into {len(all_splits)} sub-documents.")

Split blog post into 3148 sub-documents.


### Store

In [24]:
# persist_directory has to be given to the Chroma constructor for the
# collection to be written to disk; without it the store lives in memory only
# and the embeddings are lost when the kernel stops.
# NOTE(review): with a persistent store, re-adding the same chunks on a later
# run appends duplicates unless explicit ids are supplied — confirm desired.
vector_store = Chroma(
    embedding_function=get_embedding_function(),
    persist_directory="./chroma_langchain_db",
)

In [None]:
# Embed and index every chunk; the returned list holds the ids assigned to them.
# persist_directory is not an add_documents option (the extra keyword was
# silently ignored) — the on-disk location is a Chroma(...) constructor setting.
document_ids = vector_store.add_documents(documents=all_splits)

## Retrieval and generation

### Retrieve

In [None]:
# Ollama chat client for the deepseek-r1:1.5b model, sampling temperature 1.
llm = ChatOllama(temperature=1, model="deepseek-r1:1.5b")

### Generate