In [18]:
import os
from tqdm import tqdm
from neo4j import GraphDatabase
from langchain_groq import ChatGroq
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
from neo4j_graphrag.llm.base import LLMInterface
from neo4j_graphrag.experimental.pipeline.kg_builder import SimpleKGPipeline
from dotenv import load_dotenv 
from typing import Any, Optional
from neo4j_graphrag.llm.types import LLMResponse
from langchain_huggingface import HuggingFaceEmbeddings
from neo4j_graphrag.embeddings.sentence_transformers import SentenceTransformerEmbeddings 
import asyncio

In [16]:
# Load variables from a local .env file (python-dotenv) into the process
# environment so that the os.getenv(...) lookups in later cells can see them.
load_dotenv()
class GroqAdapter(LLMInterface):
    """Expose a LangChain chat model through neo4j-graphrag's ``LLMInterface``.

    The wrapped object only needs LangChain-style ``invoke`` / ``ainvoke``
    methods whose result has a ``.content`` attribute; that content is
    re-wrapped in the ``LLMResponse`` type the neo4j-graphrag pipeline expects.

    Args:
        llm: The LangChain chat model to delegate to (e.g. a ``ChatGroq``).
    """

    def __init__(self, llm: Any):
        # Initialise the base class so the attributes it defines (model_name,
        # model_params, ...) exist on the adapter. Fall back to the wrapped
        # object's class name when it exposes no ``model_name``.
        super().__init__(
            model_name=str(getattr(llm, "model_name", type(llm).__name__))
        )
        self.llm = llm

    @staticmethod
    def _build_prompt(
        input_text: str,
        message_history: Optional[Any],
        system_instruction: Optional[str],
    ) -> Any:
        """Return the payload handed to the wrapped model.

        With no history and no system instruction the plain string is passed
        through untouched. Otherwise a list of ``(role, content)`` tuples is
        built: system instruction first, then the history, then the user input.
        """
        if not message_history and not system_instruction:
            return input_text

        messages = []
        if system_instruction:
            messages.append(("system", system_instruction))
        # NOTE(review): assumes the history is either a list of
        # {"role": ..., "content": ...} mappings or an object exposing such a
        # list as ``.messages`` -- confirm against the installed
        # neo4j-graphrag version.
        history = getattr(message_history, "messages", message_history) or []
        for message in history:
            messages.append((message["role"], message["content"]))
        messages.append(("human", input_text))
        return messages

    def invoke(
        self,
        input_text: str,
        message_history: Optional[Any] = None,
        system_instruction: Optional[str] = None,
    ) -> LLMResponse:
        """Synchronously query the wrapped model.

        Args:
            input_text: The user prompt.
            message_history: Optional earlier conversation turns.
            system_instruction: Optional system prompt.

        Returns:
            An ``LLMResponse`` whose ``content`` is the model's reply.
        """
        prompt = self._build_prompt(input_text, message_history, system_instruction)
        response = self.llm.invoke(prompt)
        # Wrap the LangChain message in the neo4j-graphrag response type.
        return LLMResponse(content=response.content)

    async def ainvoke(
        self,
        input_text: str,
        message_history: Optional[Any] = None,
        system_instruction: Optional[str] = None,
    ) -> LLMResponse:
        """Asynchronous counterpart of :meth:`invoke` (same arguments and result)."""
        prompt = self._build_prompt(input_text, message_history, system_instruction)
        response = await self.llm.ainvoke(prompt)
        return LLMResponse(content=response.content)

In [None]:

path = r"C:\Users\Cengizhan\Desktop\CMPE492-Project-Rag-Pipeline\Documents\Ragas"
loader = DirectoryLoader(path, glob="**/*.pdf", loader_cls=PyPDFLoader)
docs = loader.load()
print(f"Loaded {len(docs)} document pages.")

URI = os.getenv("NEO4J_URI", "bolt://localhost:7687")
AUTH = (os.getenv("NEO4J_USERNAME", "neo4j"), os.getenv("NEO4J_PASSWORD", "password"))
GROQ_API_KEY = os.getenv("GROQ_API_KEY") 

driver = GraphDatabase.driver(URI, auth=AUTH)

groq_llm = ChatGroq(
    model_name="llama-3.1-8b-instant", 
    temperature=0,
    api_key=GROQ_API_KEY ,
    max_tokens=4096,
    model_kwargs={"response_format": {"type": "json_object"}} 
)
neo4j_embedder = SentenceTransformerEmbeddings(model="all-MiniLM-L6-v2")
llm_adapter = GroqAdapter(llm=groq_llm)

kg_pipeline = SimpleKGPipeline(
    llm=llm_adapter,
    driver=driver,
    from_pdf=False,
    embedder= neo4j_embedder,
    on_error="IGNORE" 
)
for doc in tqdm(docs, desc="Ingesting PDF Pages"):
    await kg_pipeline.run_async(text=doc.page_content)

Loaded 8 document pages.


Ingesting PDF Pages:  88%|████████▊ | 7/8 [06:21<00:57, 57.89s/it]ERROR:neo4j_graphrag.experimental.components.entity_relation_extractor:LLM response has improper format for chunk_index=0
Ingesting PDF Pages: 100%|██████████| 8/8 [07:06<00:00, 53.27s/it]


In [None]:
#KG Pipeline Tracker:
## run_async() takes the content of the page and calls the runner on it
## FixedSizeSplitter splits the page into chunks (by character count)
## TextChunkEmbedder creates an embedding for each chunk
## SchemaFromTextExtractor creates a schema with the LLM
### Prompt used for this: "You are a top-tier algorithm designed for extracting a labeled property graph schema in structured formats..."
## LLMEntityRelationExtractor: extracts the relationships and nodes from the text; it uses only the types and labels from the graph schema that SchemaFromTextExtractor provides
## GraphPruning compares every extracted node label and relationship type against the GraphSchema
### It identifies relationships whose start_node or end_node is missing from the list of extracted nodes.
## KGWriter: writes the nodes and relationships to Neo4j
## SinglePropertyExactMatchResolver: resolves entities that have the same label and exactly the same property value (the default property is "name").
### The LLM might create the same entity with different ids but identical properties

