## Packages & Paths

In [3]:
import os
import numpy as np 

from typing import List, Tuple  
from pathlib import Path
from dotenv import load_dotenv

from langchain_community.document_loaders import PyPDFLoader            # loads PDFs page-by-page and stores page metadata
from langchain_text_splitters import RecursiveCharacterTextSplitter     # Text splitter for documents
from langchain_openai import ChatOpenAI, OpenAIEmbeddings               # LLM for answering & Embedding tool to turn text into vectors
from langchain_chroma import Chroma 

In [6]:
# All locations are derived from the directory the notebook is run from.
BASE_DIR = Path.cwd()
PDF_DIR = BASE_DIR.joinpath("data")          # folder holding the input PDFs
CHROMA_DIR = BASE_DIR.joinpath("chroma_db")  # folder for the Chroma DB

## Get data

In [5]:
# Gather every PDF in the data folder, sorted so the order is deterministic.
pdf_paths = list(PDF_DIR.glob("*.pdf"))
pdf_paths.sort()

# Print the files that were found.
for path in pdf_paths:
    print(path)

/Users/andrealunghini/Desktop/data_science/rag_lab_solita/data/foundations_of_data_science.pdf


## Step 1: Ingest & Parse

In [11]:
from langchain_core.documents import Document
import re


def clean_pdf_text(text: str) -> str:
    """Tidy the raw text of a PDF page so that chunks contain plain text.

    The clean-up steps run in this order:
      1. a newline, a single space and a newline are replaced by one space;
      2. runs of two or more newlines are reduced to exactly two;
      3. runs of spaces are reduced to a single space;
      4. leading and trailing whitespace is stripped.

    Args:
        text: Text to clean.

    Returns:
        The cleaned text.
    """
    # (pattern, replacement) pairs; the order matters and must be kept.
    substitutions = (
        (r'\n \n', ' '),
        (r'\n\n+', '\n\n'),
        (r' +', ' '),
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

def load_pdfs(paths: List[Path]) -> List[Document]:
    """Load PDFs page by page and return all pages as one flat list.

    For every page the ``source`` metadata entry is reduced to the bare file
    name and the page text is passed through ``clean_pdf_text``.

    Args:
        paths: Paths of the PDF files to load.

    Returns:
        The page documents of all files, in the order of ``paths``.
    """
    loaded_pages: List[Document] = []
    for pdf_path in paths:
        pages = PyPDFLoader(str(pdf_path)).load()
        for page in pages:
            # Fall back to the path being loaded when the loader set no source.
            source = page.metadata.get("source", pdf_path)
            page.metadata["source"] = Path(source).name
            page.page_content = clean_pdf_text(page.page_content)
        loaded_pages += pages
    return loaded_pages

# Load and clean every page of the PDFs listed in pdf_paths, then report the page count.
docs = load_pdfs(pdf_paths)
print(f"Loaded {len(docs)} page-documents.")

Loaded 479 page-documents.


## Step 2: Chunking & Embedding

### Chunking strategies
- Fixed-size chunking with overlap
- Semantic chunking: avoid splitting in the middle of a sentence or of a semantically important part of the text
- Recursive chunking: recursively splitting the text into chunks
- Document structure-based chunking: splitting along the inherent structure of the text
- LLM-based chunking: the text is given to an LLM, which generates the chunks

In [19]:
# Read settings from the local .env file; with override=True its values
# replace variables that are already set in the environment.
load_dotenv(override=True)

# Fail fast when no API key is configured.
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY not found. Create a .env file (copy from .env.example) and set OPENAI_API_KEY."
    )

# Model names may be overridden through the environment; otherwise these defaults apply.
CHAT_MODEL = os.environ.get("OPENAI_CHAT_MODEL", "gpt-4o-mini")
EMBED_MODEL = os.environ.get("OPENAI_EMBED_MODEL", "text-embedding-3-small")

# Only confirm that a key exists -- never print the key itself.
print("OPENAI_API_KEY found.")
print(f"Using chat model: {CHAT_MODEL}")
print(f"Using embed model: {EMBED_MODEL}")

OPENAI_API_KEY found.
Using chat model: gpt-4o-mini
Using embed model: text-embedding-3-small


In [38]:
from langchain_text_splitters import CharacterTextSplitter
# Baseline: the simplest (and least clever) chunking approach.
#
# Rule of thumb: we are betting that a 150-character overlap is long enough
# to cover the part of an important sentence that straddles two chunks.
splitter = CharacterTextSplitter(separator=" ", chunk_size=1500, chunk_overlap=150)

chunks = splitter.split_documents(docs)

# Foundations of Data Science: this book is a good candidate for document-structure-based chunking

## Step 3: Build Vector DB & Create Embeddings