# <span style='color:Tomato;'>Load Env Variables</span>

In [None]:
import dotenv
import utils


# Add the modules directory to the Python path if needed
# sys.path.append(os.path.abspath("./modules"))

# Load variables from the project's secrets file into the process environment.
# The path gets a descriptive name (rather than a single letter) so it is not
# confused with, or overwritten by, short-lived file handles elsewhere in the
# notebook.
root_dir = utils.get_project_root()
env_path = root_dir / ".secrets" / ".env"
# Fail early with the offending path if the secrets file is missing.
assert env_path.exists(), f"File not found: {env_path}"
dotenv.load_dotenv(env_path)


In [None]:
# from langchain_core.runnables import  RunnablePassthrough
# from langchain_core.prompts import ChatPromptTemplate
# from pydantic import BaseModel, Field
# from langchain_core.output_parsers import StrOutputParser
# from langchain_community.graphs import Neo4jGraph
# from langchain.text_splitter import RecursiveCharacterTextSplitter
# from langchain_community.chat_models import ChatOllama
# from langchain_experimental.graph_transformers import LLMGraphTransformer
# from neo4j import GraphDatabase
# from yfiles_jupyter_graphs import GraphWidget
# from langchain_community.vectorstores import Neo4jVector
# from langchain_community.document_loaders import TextLoader
# from langchain_community.vectorstores.neo4j_vector import remove_lucene_chars
# from langchain_ollama import OllamaEmbeddings
# from langchain_experimental.llms.ollama_functions import OllamaFunctions



# <span style='color:Tomato;'>Process PDFs</span>

We'll use LangChain's PyMuPDF4LLM loader (the `langchain_pymupdf4llm` package).

## <span style='color:Orange;'>Initialization</span>

### <span style='color:Khaki;'>Basic Imports</span>

In [None]:
# import os
# import sys
from pathlib import Path
import pprint
import pickle
import tqdm
from IPython.display import Markdown, display


ModuleNotFoundError: No module named 'tqdm'

### <span style='color:Khaki;'>Initializing the graph database</span>

In [None]:
from langchain_neo4j import Neo4jGraph

# No connection arguments are passed, so the connection settings must come
# from elsewhere — presumably the NEO4J_* environment variables loaded from
# the .env file at the top of the notebook (TODO confirm against the .env file).
graph = Neo4jGraph()


### <span style='color:Khaki;'>Loading PDF file</span>

You can either use the `load()` method to do it all at once in memory, or do it incrementally using `lazy_load()`.

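Since our docs are big, loading takes a while. The cell below uses the asynchronous `aload()`; a `lazy_load()` variant that also shows the progress is kept there as a commented-out alternative.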

To save time, we will load the docs from a pickle file if previously processed, otherwise process them and save them as a pickle.

In [28]:
# Source PDF, addressed relative to the notebook's working directory and
# resolved to an absolute path.
file_path = Path("..", "data", "pdfs", "VMD_user_guid_2016.pdf").resolve()
assert file_path.exists(), f"File not found: {file_path}"

# Lower-cased file stem; used below to name the cached pickle for this PDF.
file_name = file_path.stem.lower()

# Pickle cache directory: a "pkls" folder that sits next to the "pdfs" folder.
pkl_dir = file_path.parents[1] / "pkls"
pkl_dir.mkdir(parents=True, exist_ok=True)

print(file_path, file_name, sep="\n")


/DATA/Ali_Data/GraphRAG-Neo4j-VMD-NAMD/data/pdfs/VMD_user_guid_2016.pdf
vmd_user_guid_2016


In [None]:
from langchain_pymupdf4llm import PyMuPDF4LLMLoader


loader = PyMuPDF4LLMLoader(file_path)


if (pkl_dir / f"docs_{file_name}.pkl").exists():
    print("Loading docs from pickle")
    with open(pkl_dir / f"docs_{file_name}.pkl", "rb") as f:
        docs = pickle.load(f)
else:
    print("Loading docs from pdf. \nThis will take some time (~5 min)")

    # Option 1: loading small docs
    # docs = loader.load()

    # Option 2: Load documents asynchronously (almost 3x faster)
    docs = await loader.aload()

    # Option 3: lazy load with progress bar
    # todo: make this asynchronous
    # pages = []
    # docs = []
    # for doc in tqdm.tqdm(loader.lazy_load()):
    #     pages.append(doc)
    #     # process the pages in chunks
    #     if len(pages) >= 100:
    #         docs.extend(pages)
    #         pages = []

    # pickle save the docs
    with open(pkl_dir / f"docs_{file_name}.pkl", "wb") as f:
        pickle.dump(docs, f)

print(f"Loaded {file_name}: {len(docs)} documents")


Loading docs from pdf. 
This will take some time (~5 min)
Loaded vmd_user_guid_2016: 265 documents
