# <span style='color:Tomato;'>Load Env Variables</span>

In [None]:
import dotenv
import utils

# Add the modules directory to the Python path if needed
# sys.path.append(os.path.abspath("./modules"))

# load variables into env
root_dir = utils.get_project_root()
f = root_dir / ".secrets" / ".env"
assert f.exists(), f"File not found: {f}"
dotenv.load_dotenv(f)


Root Directory: /DATA/Ali_Data/GraphRAG-Neo4j-VMD-NAMD


True

In [2]:
# from langchain_core.runnables import  RunnablePassthrough
# from langchain_core.prompts import ChatPromptTemplate
# from pydantic import BaseModel, Field
# from langchain_core.output_parsers import StrOutputParser
# from langchain_community.graphs import Neo4jGraph
# from langchain.text_splitter import RecursiveCharacterTextSplitter
# from langchain_community.chat_models import ChatOllama
# from langchain_experimental.graph_transformers import LLMGraphTransformer
# from neo4j import GraphDatabase
# from yfiles_jupyter_graphs import GraphWidget
# from langchain_community.vectorstores import Neo4jVector
# from langchain_community.document_loaders import TextLoader
# from langchain_community.vectorstores.neo4j_vector import remove_lucene_chars
# from langchain_ollama import OllamaEmbeddings
# from langchain_experimental.llms.ollama_functions import OllamaFunctions



# <span style='color:Tomato;'>Process PDFs</span>

We'll use Langchain `PyMuPDF4LLM` to load the PDF files into LangChain documents.

We'll also use LLM to convert images into a summery and extract its data.


## <span style='color:Orange;'>Initialization</span>

### <span style='color:Khaki;'>Basic Imports</span>

In [None]:
# import os
# import sys
import pickle
import pprint
from pathlib import Path

from IPython.display import Markdown, display

# from tqdm import tqdm
from tqdm.notebook import tqdm


### <span style='color:Khaki;'>initializing the graph database</span>

### <span style='color:Khaki;'>Loading PDF file as LangChain Document</span>

> Images will be extracted (to text) using a Multimodal LLM.

You can either use `load()` method to do it all at once in memory or inclemently do it using `lazy_load()`.

Since our docs are big, we'll use `lazy_load()` to also see the progress.

To save time, we will load the docs from a pickle file if previously processed, otherwise process them and save them as a pickle.

#### <span style='color:LightGreen;'>Custom Splitting Mode</span>

> By default, each page in the PDF is a (LangChain) Document!

When loading the PDF file you can split it in two different ways:
- By page `mode="page"`
- As a single text flow `mode="single"`. In other words, the whole PDF would be **one** LangChain Document. You can specify page delimiter to have the pages in the metadata


In [None]:
# pdf file
file_path = Path() / ".." / "data" / "pdfs" / "biopython.pdf"

file_path = file_path.resolve()
file_path = utils.fuzzy_find(file_path)

# create directory for pkl files
pkl_dir = file_path.parent.parent / "pkls"
pkl_dir.mkdir(exist_ok=True, parents=True)

print(f"\nfile_path = {file_path}")


File 'biopython' not found. Fuzzy Searching ...

file_path = /DATA/Ali_Data/GraphRAG-Neo4j-VMD-NAMD/data/pdfs/BioPython.pdf


#### <span style='color:LightGreen;'>How the Asynchronous Lazy Loading Loop</span>

This code demonstrates an asynchronous lazy loading pattern with a progress bar. Let me explain how it works:


##### <span style='color:SkyBlue;'>Key Components</span>

1. `alazy_load()` - An asynchronous generator that yields documents one by one
2. `async for` - Asynchronous iteration through the generator
3. `tqdm.tqdm()` - Progress bar visualization
4. Batching logic to process documents in chunks of 100

##### <span style='color:SkyBlue;'>How the Async Loop Works</span>

```python
async for doc in tqdm.tqdm(await loader.alazy_load()):
    # Process each document as it becomes available
```

The `await loader.alazy_load()` returns an asynchronous iterable. The `async for` loop then:

1. Asynchronously requests the next document
2. Waits for it to be retrieved without blocking the event loop
3. Updates the progress bar via `tqdm`
4. Processes the document once available

The batching logic (collecting 100 pages before processing) allows for more efficient operations on groups of documents rather than one at a time.

This pattern is especially useful when loading documents involves network requests or other I/O operations that would otherwise block execution.


#### <span style='color:LightGreen;'>Which LLM to use?</span>

- `gemma3:4b`: **biggest,** but provide a general understanding of the images.
- `granite3.2-vision`: **small,** and fine-tunned for data extraction from images in PDF docs.
- `moondream`: **smallest,** but only good for overall description of the image.

The Prompt used:
```
You are an assistant tasked with summarizing images for retrieval.

1. These summaries will be embedded and used to retrieve the raw image.
   Give a concise summary of the image that is well optimized for retrieval

2. extract all the text from the image. Do not exclude any content from the page.

Format answer in markdown without explanatory text and without markdown delimiter ``` at the beginning.
```

In [None]:
import fitz  # PyMuPDF
from langchain_community.document_loaders.parsers import LLMImageBlobParser
from langchain_ollama import ChatOllama
from langchain_pymupdf4llm import PyMuPDF4LLMLoader

# from langchain_ollama.llms import OllamaLLM
# Use ChatOllama instead of OllamaLLM for compatibility with LLMImageBlobParser


extract_images = True
file_name = file_path.stem.lower()

if extract_images:
    loader = PyMuPDF4LLMLoader(
        file_path,
        mode="page",
        extract_images=True,
        images_parser=LLMImageBlobParser(model=ChatOllama(model="granite3.2-vision", max_tokens=1024)),
    )
    file_name = f"w.img.{file_name}"
else:
    loader = PyMuPDF4LLMLoader(file_path, mode="page")


# Find the number of pages in the PDF
with fitz.open(file_path) as pdf_doc:
    pdf_len = len(pdf_doc)


print(f"{file_name=} -> {pdf_len} pages")


file_name='w.img.biopython' -> 445 pages


The nice thing about `lazy_load()`, is that we can stop processing any page and skip it if a problem happen.

In [None]:
loader.


TypeError: PyMuPDF4LLMLoader.load() takes 1 positional argument but 2 were given

In [None]:
if (pkl_dir / f"docs_{file_name}.pkl").exists():
    print("Loading docs from pickle")
    with open(pkl_dir / f"docs_{file_name}.pkl", "rb") as f:
        docs = pickle.load(f)
else:
    print("Loading docs from pdf. \nThis will take some time (~5 min)")

    # Option 1: loading small docs
    # docs = loader.load()

    # Option 2: Load documents asynchronously (almost 3x faster)
    # assert not extract_images, "Async loading not supported for image extraction"
    # docs = await loader.aload()

    # Option 3: lazy load with progress bar
    # todo: make this asynchronous
    pages = []
    docs = []
    for doc in tqdm(loader.lazy_load(), total=pdf_len, desc=f"Loading {file_name}"):
        pages.append(doc)
        # process the pages in chunks
        if len(pages) >= 100:
            docs.extend(pages)
            pages = []
    # Don't forget any remaining pages
    if pages:
        docs.extend(pages)

    # pickle save the docs
    with open(pkl_dir / f"docs_{file_name}.pkl", "wb") as f:
        pickle.dump(docs, f)

print(f"Loaded {file_name}: {len(docs)} documents")


Loading docs from pdf. 
This will take some time (~5 min)


  0%|          | 0/445 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
len(docs)


300

In [None]:
from time import sleep

for _ in tqdm(range(5), total=5, desc=f"Loading {file_name}"):
    sleep(0.1)


In [None]:
temp = docs[2]
display(Markdown(temp.page_content))
print('-'*50)
pprint.pp(temp.metadata)


In [None]:
# from langchain_neo4j import Neo4jGraph

# graph = Neo4jGraph()
