In [1]:
from langchain_community.document_loaders.pdf import PyPDFDirectoryLoader
DATA_PATH = "data"
def load_documents():
    loader = PyPDFDirectoryLoader(DATA_PATH)
    documents = loader.load()
    return documents

In [2]:
documents = load_documents()
print(f"Loaded {len(documents)} documents.")
print(f"First document text: {documents[0]}")  # Print first 100 characters of the first document

Loaded 13 documents.
First document text: page_content='Ali Lazraq 
 Data Scientist 
 📧 lazraqali08@gmail.com |  Ali Lazraq  | 📍 Casablanca, Morocco |  📞   +212 623792204 
 EXPERIENCES 
 CIH BANK  Casablanca, Morocco 
 Data Science  Jan 2025 - Current 
 ●  Developed a full-scale churn prediction system using LightGBM, achieving 90.42% recall prioritizing client retention. 
 ●  Engineered end-to-end modular Python pipelines for data extraction via Dremio, preprocessing, feature engineering, and 
 model deployment. 
 ●  Designed interactive Streamlit dashboard for business stakeholders to visualize churn insights and intervention plans. 
 ●  Implemented SHAP for model interpretability and MLflow for rigorous experiment tracking and version control. 
 CAPSTONE PROJECT  Ifrane, Morocco 
 IoT-Based Fleet Management & Driver Monitoring  Sep 2024 - Dec 2024 
 ●  Built a real-time fleet management system integrating Teltonika FM5300, React, Spring Boot, and MySQL. 
 ●  Directed the integration

In [3]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document

def split_documents(documents: list[Document]):
    # Create a text splitter that splits documents into chunks of 800 characters with 80 characters overlap
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,           # Maximum size of each chunk
        chunk_overlap=80,         # Number of overlapping characters between chunks
        length_function=len,      # Function to measure the length of text (here, using len)
        is_separator_regex=False  # Indicates that the separator is not a regex
    )
    # Split the input documents into smaller chunks using the text splitter
    split_docs = text_splitter.split_documents(documents)
    # Return the list of split document chunks
    return split_docs

In [4]:
chunks = split_documents(documents)
print(f"Split into {len(chunks)} chunks.")
print(f"First chunk text: {chunks[0]}")  # Print first 100 characters of the first chunk

Split into 46 chunks.
First chunk text: page_content='Ali Lazraq 
 Data Scientist 
 📧 lazraqali08@gmail.com |  Ali Lazraq  | 📍 Casablanca, Morocco |  📞   +212 623792204 
 EXPERIENCES 
 CIH BANK  Casablanca, Morocco 
 Data Science  Jan 2025 - Current 
 ●  Developed a full-scale churn prediction system using LightGBM, achieving 90.42% recall prioritizing client retention. 
 ●  Engineered end-to-end modular Python pipelines for data extraction via Dremio, preprocessing, feature engineering, and 
 model deployment. 
 ●  Designed interactive Streamlit dashboard for business stakeholders to visualize churn insights and intervention plans. 
 ●  Implemented SHAP for model interpretability and MLflow for rigorous experiment tracking and version control. 
 CAPSTONE PROJECT  Ifrane, Morocco' metadata={'producer': '4-Heights™ PDF Library 3.4.0.6904 (http://www.pdf-tools.com)', 'creator': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.3

In [5]:
from langchain_community.embeddings.ollama import OllamaEmbeddings

def get_embedding_function():
    # Create an instance of the OllamaEmbeddings class with the model name "mistral"
    embedding_function = OllamaEmbeddings(model="mistral")
    # Print the model name used for embeddings
    print(f"Using model: {embedding_function.model}")
    # Return the embedding function
    return embedding_function
embedding_function = get_embedding_function()
print(f"Embedding function: {embedding_function}")  # Print the embedding function details

Using model: mistral
Embedding function: base_url='http://localhost:11434' model='mistral' embed_instruction='passage: ' query_instruction='query: ' mirostat=None mirostat_eta=None mirostat_tau=None num_ctx=None num_gpu=None num_thread=None repeat_last_n=None repeat_penalty=None temperature=None stop=None tfs_z=None top_k=None top_p=None show_progress=False headers=None model_kwargs=None


  embedding_function = OllamaEmbeddings(model="mistral")


In [6]:
def calculate_chunk_ids(chunks):

    # This will create IDs like "data/monopoly.pdf:6:2"
    # Page Source : Page Number : Chunk Index

    last_page_id = None
    current_chunk_index = 0

    for chunk in chunks:
        source = chunk.metadata.get("source")
        page = chunk.metadata.get("page")
        current_page_id = f"{source}:{page}"

        # If the page ID is the same as the last one, increment the index.
        if current_page_id == last_page_id:
            current_chunk_index += 1
        else:
            current_chunk_index = 0

        # Calculate the chunk ID.
        chunk_id = f"{current_page_id}:{current_chunk_index}"
        last_page_id = current_page_id

        # Add it to the page meta-data.
        chunk.metadata["id"] = chunk_id

    return chunks
chunks = calculate_chunk_ids(chunks)
print(f"First chunk ID: {chunks[0].metadata['id']}")  # Print the ID of the first chunk

First chunk ID: data\Ali_Lazraq___eng_CV (1).pdf:0:0


In [7]:
from langchain.vectorstores.chroma import Chroma

def add_to_chroma(chunks: list[Document]):

    CHROMA_PATH = "chroma"  # Path to the Chroma database
    
    # Load the existing database.
    db = Chroma(
        persist_directory=CHROMA_PATH, embedding_function=get_embedding_function()
    )

    # Calculate Page IDs.
    chunks_with_ids = calculate_chunk_ids(chunks)

    # Add or Update the documents.
    existing_items = db.get(include=[])  # IDs are always included by default
    existing_ids = set(existing_items["ids"])
    print(f"Number of existing documents in DB: {len(existing_ids)}")

    # Only add documents that don't exist in the DB.
    new_chunks = []
    for chunk in chunks_with_ids:
        if chunk.metadata["id"] not in existing_ids:
            new_chunks.append(chunk)

    if len(new_chunks):
        print(f"👉 Adding new documents: {len(new_chunks)}")
        new_chunk_ids = [chunk.metadata["id"] for chunk in new_chunks]
        db.add_documents(new_chunks, ids=new_chunk_ids)
        db.persist()
    else:
        print("✅ No new documents to add")

    # Return the database object
    return db
db = add_to_chroma(chunks)
print(f"Number of documents in DB: {len(db.get(include=[])['ids'])}")  # Print the number of documents in the database
print(f"First document ID in DB: {db.get(include=[])['ids'][0]}")  # Print the ID of the first document in the database

Using model: mistral


  db = Chroma(


Number of existing documents in DB: 40
👉 Adding new documents: 6
Number of documents in DB: 46
First document ID in DB: data\monopoly.pdf:0:0


  db.persist()


In [15]:
print(f"First document ID in DB: {db.get(include=[])['ids'][39]}") 

First document ID in DB: data\ticket_to_ride.pdf:3:4


In [16]:
db = add_to_chroma(chunks)

Using model: mistral
Number of existing documents in DB: 46
✅ No new documents to add
