# UseCase B : ingest pdf in a vectorDB & query 1/2

## Ingest pdf

### Context definition

Neo4j is used as Vector DB

In [1]:
import os
import re
import neo4j
from langchain.vectorstores.neo4j_vector import Neo4jVector
from langchain_text_splitters import MarkdownHeaderTextSplitter
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from langchain_core.documents import Document
from dotenv import load_dotenv
from datetime import datetime
import fitz  # PyMuPDF

def printDone():
    # Obtenir la date et l'heure actuelle
    maintenant = datetime.now()
    # Formater la date dans le format souhaité
    date_formatee = maintenant.strftime("DONE - %A %d %B à %Hh%M et %S secondes")
    # Retourner la date formatée
    return date_formatee
    
load_dotenv()

os.environ["OLLAMA_URL"] = "http://ollama:11434"
os.environ["embedding_model"] = "nomic-embed-text"

os.environ["NEO4J_URI"] = "neo4j://neo4j:7687"
os.environ["NEO4J_URI_BOLT"] = "bolt://neo4j:7687"
os.environ["NEO4J_USERNAME"] = "neo4j"
os.environ["NEO4J_PASSWORD"] = "strongPassword1"
os.environ["INDEX_NAME"] = "pdf_chunk"
os.environ["FULLTEXT_INDEX_NAME"] = "documentFullTextIndex"
os.environ["embbeded_dim"] = "1536"

os.environ["OLLAMA_MODELE_URL"] = "http://ollama:11434/v1/"
os.environ["LLM_MODEL"] = "llama3.2:3b"

pdf_path1 = "/home/jovyan/datasets/cvss-v31-user-guide_r1.pdf"
pdf_path2 = "/home/jovyan/datasets/ISO_SAE_21434_2021.pdf"

my_chunk_size = 600
my_overlap = 100

printDone()

'DONE - Wednesday 20 November à 16h14 et 54 secondes'

In [2]:
FULLTEXT_INDEX_NAME = os.environ["FULLTEXT_INDEX_NAME"]
URI =os.environ["NEO4J_URI"]
AUTH = (os.environ["NEO4J_USERNAME"], os.environ["NEO4J_PASSWORD"])

driver = neo4j.GraphDatabase.driver(URI, auth=AUTH)

def create_fulltext_index():
    query = """
    CREATE FULLTEXT INDEX documentFullTextIndex IF NOT EXISTS FOR (n:documents_import) ON EACH [n.file_name, n.info]
    """
    with driver.session() as session:
        session.run(query)

create_fulltext_index()

driver.close()

printDone()

### Strategy for text extraction & chunking

### Document-Based Chunking

Document-based chunking simplifies large documents by breaking them into smaller, easier-to-handle sections based on their structure or content. Unlike other chunking methods that cut text into pieces at specific points, document-based chunking divides documents into sections like paragraphs or chapters, depending on how they’re organized. For example, a Markdown file would be chunked differently than a Python file or a JSON file because chunking is based on the type of file, not the number of characters.

In [3]:
def create_documents(docPath, chunk_size, overlap):

    def pdf_to_markdown(pdf_path, chunk_size, overlap):
        # Ouvrir le fichier PDF
        pdf_document = fitz.open(pdf_path)
        
        # Initialiser la liste des chunks
        chunks = []
        
        # Parcourir chaque page
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)
            text = page.get_text("markdown")
            
            # Diviser le texte en chunks avec recouvrement
            start = 0
            while start < len(text):
                end = min(start + chunk_size, len(text))
                chunk = text[start:end]
                chunks.append(chunk)  # Ajouter chaque chunk à la liste des chunks
                start += chunk_size - overlap

        print("count of chunks created : " + str(len(chunks)) )
        
        # Retourner la liste des chunks
        return chunks
    
    pdf_markdown_chunks = pdf_to_markdown(docPath, chunk_size, overlap)
    file_name = os.path.basename(docPath)

    docs = []
    for doc in pdf_markdown_chunks:
        docs.append(Document(page_content=doc))
    
    for idx, doc in enumerate(docs):
        doc.metadata["file_name"] = file_name
        doc.metadata["chunk_id"] = idx

    return docs

printDone()

'DONE - Wednesday 20 November à 16h14 et 54 secondes'

#### Set Embeddings engin

In [4]:
from langchain_ollama import OllamaEmbeddings
from neo4j import GraphDatabase

embeddings_engin = OllamaEmbeddings(base_url=os.environ["OLLAMA_URL"], model=os.environ["embedding_model"])

printDone()

'DONE - Wednesday 20 November à 16h14 et 54 secondes'

#### Populate a Vector Index with PDF 1

In [5]:
chunks_markdown1 = create_documents(pdf_path1, my_chunk_size, my_overlap)

db = Neo4jVector.from_documents(chunks_markdown1, 
                                embeddings_engin, 
                                url=os.environ["NEO4J_URI"], 
                                username=os.environ["NEO4J_USERNAME"], 
                                password=os.environ["NEO4J_PASSWORD"],
                                database="neo4j",
                                index_name=os.environ["INDEX_NAME"], 
                                node_label="documents_import",
                                text_node_property="info",
                                embedding_node_property="vector",
                                create_id_index=True,
)

printDone()

count of chunks created : 116


'DONE - Wednesday 20 November à 16h16 et 46 secondes'

#### Add in existing Vector Index PDF 2

In [6]:
chunks_markdown2 = create_documents(pdf_path2, my_chunk_size, my_overlap)

db = Neo4jVector.from_documents(chunks_markdown2, 
                                embeddings_engin, 
                                url=os.environ["NEO4J_URI"], 
                                username=os.environ["NEO4J_USERNAME"], 
                                password=os.environ["NEO4J_PASSWORD"],
                                database="neo4j",
                                index_name=os.environ["INDEX_NAME"], 
                                node_label="documents_import",
                                text_node_property="info",
                                embedding_node_property="vector",
                                create_id_index=True,
)

printDone()

count of chunks created : 420


'DONE - Wednesday 20 November à 16h22 et 03 secondes'

In [7]:
# SHOW INDEXES YIELD name, type, entityType, labelsOrTypes, properties, state
# MATCH (n:documents_import) WITH n.file_name AS file_name, count(n) AS count RETURN file_name, count
# MATCH (n:documents_import) DETACH DELETE n
# DROP INDEX pdf_chunk

# 116 chunks, 116 in DB
# 420 chunks, 418 in DB (!)