# Data Extraction (Text-to-KG)

In [None]:
import os
import time
from typing import List, Dict, Any
from dotenv import load_dotenv
from langchain.globals import set_llm_cache
from langchain_community.cache import InMemoryCache
from langchain_core.documents.base import Document


'''
You need the following environment variables:

    1. NEO4J_URL (Default="bolt://localhost:7687"), 
    2. NEO4J_USERNAME (Default="neo4j"), 
    3. NEO4J_PASSWORD (Default=None), 
    4. NEO4J_DATABASE (Default="neo4j")
    (The above are for establishing a Neo4j Connection and can be passed as arguments as well)
    5. MODELS_CACHE_FOLDER (Default=None):- 
        I'm using SentenceTransformer model "all-MiniLM-L6-v2" for creating embeddings to calculate vector similarity.
        This is just the path to the folder to use as the cache folder
    6. GOOGLE_API_KEY (For ChatGoogleGenerativeAI), or the API-key environment variable required by whichever other LLM you want to use
    7. TESSDATA_PREFIX (Might need it for using PyTesseract)
'''
# Load the environment variables listed above from ../.env. This runs before
# the kgrag / LangChain imports below.
# NOTE(review): presumably those modules read the variables when imported or
# instantiated - confirm before reordering.
load_dotenv('../.env')

from kgrag.data_extraction import Text2KG
from kgrag.parse_pdf import PDFParserMarkdown, OCREngine

from langchain_google_genai import ChatGoogleGenerativeAI

# Register an in-memory cache as LangChain's global LLM cache (optional).
set_llm_cache(InMemoryCache()) # Set LLM Caching (Optional)

# Chat model handed to Text2KG below.
# Replace with llm you want to use
llm = ChatGoogleGenerativeAI(model="models/gemini-1.0-pro", temperature=0)

In [3]:
# Path of the PDF to process. Pick ONE of the options below.
# filepath = input("Please enter filepath of the file you want to process: ")
# filepath = "../SampleDocs/Leadership-Etsko-Schuitema.pdf" # ...or any other file you want to use
filepath = "/media/wali/D_Drive/Documents/Books/C++_Programming_Program_Design_Including_Data_Structure_D.S.Malik_5th_DS.pdf"


In [4]:

# Can also accept neo4j_url, neo4j_username, neo4j_password & neo4j_database arguments
# if the above mentioned environment variables have not been set

# Subject can be anything or nothing - the filename works well for most cases.
# Only the final extension is stripped (splitext), so dots inside the name
# survive: "..._D.S.Malik_5th_DS.pdf" yields "... D.S.Malik 5th DS" instead of
# being cut off at the first dot.
subject = (
    os.path.splitext(os.path.basename(filepath))[0]
    .replace('_', ' ')
    .replace('-', ' ')
)

text2kg = Text2KG(
    llm=llm,
    emb_model=None,  # None -> the default SentenceTransformer model is used; or pass any other embedding model
    disambiguate_nodes=False,
    link_nodes=True,
    node_vector_similarity_threshold=0.90,
    subject=subject,
    verbose=True,
)

In [None]:
'''
Parse or read the PDF file
Can use any parser as long as some conditions are observed in the output:
    1. Must be in langchain_core.documents.base.Document format
    2. Must contain the following keys in doc.metadata:
        1. page
        2. filename/filepath or source
'''

# Pages to read; set to None to read the whole file.
# Another range used while debugging: list(range(68, 70))
pages = list(range(42, 44))

# 3 OCR Options: PYTESSERACT, LLM, RAPIDOCR (LLM is most accurate)
parser = PDFParserMarkdown(
    pdf_path=filepath,
    pages=pages,  # Can pass a list of pages to read, useful for debugging
    ocr_engine=OCREngine.PYTESSERACT,
)

doc_dicts: List[Dict[str, Any]] = parser.process_pdf_document()


def _as_document(record: Dict[str, Any]) -> Document:
    """Wrap one parser record in a LangChain Document.

    The record's 'text' becomes the page content; its 'page_metadata' and
    'doc_metadata' dicts are merged, with 'doc_metadata' winning on key clashes.
    """
    return Document(
        page_content=record['text'],
        metadata={**record['page_metadata'], **record['doc_metadata']},
    )


docs: List[Document] = [_as_document(record) for record in doc_dicts]

In [None]:
# Preview the text of the last parsed document.
last_document = docs[-1]
print(last_document.page_content)

In [None]:
# Time the extraction run with a wall-clock timer. time.process_time() counts
# only CPU time of this process and excludes time spent sleeping/waiting, so it
# under-reports a run that blocks on external calls; perf_counter() measures
# the real elapsed time (in seconds).
start_time = time.perf_counter()
text2kg.process_documents(docs)
end_time = time.perf_counter()
print(f"Total Time: {end_time - start_time}")

In [None]:
# Scratch cell: prints a fixed string and does nothing else.
print("Hi")

# KG Search / Retrieval

In [None]:
import os
from dotenv import load_dotenv

# Load environment variables from ../.env before the imports below run.
load_dotenv("../.env")

from langchain_google_genai import ChatGoogleGenerativeAI

from kgrag.kg_search import KGSearch


# Chat model handed to KGSearch below as both ent_llm and cypher_llm.
llm = ChatGoogleGenerativeAI(model="models/gemini-1.0-pro", temperature=0)



In [None]:

# Build the retriever; the same chat model is passed for both entity
# extraction (ent_llm) and Cypher generation (cypher_llm).
kg_search = KGSearch(
    ent_llm=llm,
    cypher_llm=llm,
    cypher_examples_json="examples.json",
    fulltext_search_top_k=5,
    vector_search_top_k=5,
    vector_search_min_score=0.8,
)

# Example query: "How does relation extraction work?"
query = input("Enter your query: ")

docs_str = kg_search.retrieve_as_string(
    query,
    nresults=30,
    # Extract all entities (using ent_llm) in the input query and search using fulltext search
    use_fulltext_search=True,
    # Search for all entities/nodes in the query using vector search
    use_vector_search=True,
    # Use LLM (cypher_llm) to generate cypher - Uses examples from `cypher_examples_json` for guidance
    generate_cypher=False,
)
print(docs_str)
'''
OR get lists of strings separately using kg_search.retrieve - retrieve_as_string is only a wrapper for this function

rels, docs, gen_cypher_results = kg_search.retrieve(
    query, 
    nresults=30,
    use_fulltext_search=True, 
    use_vector_search=True,
    generate_cypher=False
)
rels: All the triples that the entities in the query are involved in 
     Empty list is returned if `use_fulltext_search=False` & `use_vector_search=False`
docs: All the documents that contain any entities mentioned in the query
     Empty list is returned if `use_fulltext_search=False` & `use_vector_search=False`
gen_cypher_results: List of string/JSON results from the generated cypher
     Empty list is returned if `generate_cypher=False`

If no entities are found using vector or fulltext search, a Cypher query is generated regardless of the value of `generate_cypher`,
and its results are returned in `gen_cypher_results`.
WARNING: Using `generate_cypher` is unreliable and error-prone. Needs more work and more examples from 'examples.json'
'''
