In [1]:
import os
from tqdm import tqdm
from neo4j import GraphDatabase
from langchain_groq import ChatGroq
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
from neo4j_graphrag.llm.base import LLMInterface
from neo4j_graphrag.experimental.pipeline.kg_builder import SimpleKGPipeline
from dotenv import load_dotenv 
from typing import Any, Optional
from neo4j_graphrag.llm.types import LLMResponse
from langchain_huggingface import HuggingFaceEmbeddings
from neo4j_graphrag.embeddings.sentence_transformers import SentenceTransformerEmbeddings 
import asyncio

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from langchain_experimental.text_splitter import SemanticChunker
from neo4j_graphrag.experimental.components.text_splitters.langchain import LangChainTextSplitterAdapter

semantic_splitter = SemanticChunker(
    HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2"), 
    breakpoint_threshold_type="percentile" 
)

neo4j_semantic_splitter = LangChainTextSplitterAdapter(semantic_splitter)

In [3]:
load_dotenv()
class GroqAdapter(LLMInterface):
    def __init__(self, llm: Any):
        self.llm = llm

    def invoke(self, input_text: str, *args: Any, **kwargs: Any) -> LLMResponse:
        print("input_text: ", input_text)
        print("args: ",args)
        print("kwargs: ",kwargs)
        system_instruction = kwargs.get("system_instruction", "")
        
        if "json" not in system_instruction.lower():
            system_instruction += " Please provide the output in valid JSON format."

        if system_instruction:
            messages = [
                ("system", system_instruction),
                ("human", input_text)
            ]
            response = self.llm.invoke(messages)
        else:
            response = self.llm.invoke(input_text + " (Output in JSON)")
            
        return LLMResponse(content=response.content)

    async def ainvoke(self, input_text: str, *args: Any, **kwargs: Any) -> LLMResponse:
        system_instruction = kwargs.get("system_instruction")
        print("input_text: ", input_text)
        print("args: ",args)
        print("kwargs: ",kwargs)
        if system_instruction:
            messages = [("system", system_instruction), ("human", input_text)]
            response = await self.llm.ainvoke(messages)
        else:
            response = await self.llm.ainvoke(input_text)
        return LLMResponse(content=response.content)

In [7]:

#loader = DirectoryLoader(path, glob="**/*.pdf", loader_cls=PyPDFLoader)
#docs = loader.load()
#print(f"Loaded {len(docs)} document pages.")
path = 'C:/Users/Cengizhan/Desktop/CMPE492-Project-Rag-Pipeline/Documents/Ragas/ragas_2309.15217v2.pdf'
URI = os.getenv("NEO4J_URI", "bolt://localhost:7687")
AUTH = (os.getenv("NEO4J_USERNAME", "neo4j"), os.getenv("NEO4J_PASSWORD", "password"))
GROQ_API_KEY = os.getenv("GROQ_API_KEY") 

driver = GraphDatabase.driver(URI, auth=AUTH)

groq_llm = ChatGroq(
    model_name="meta-llama/llama-4-scout-17b-16e-instruct", 
    temperature=0,
    api_key=GROQ_API_KEY ,
    max_tokens=4096,
    model_kwargs={"response_format": {"type": "json_object"}} 
)
neo4j_embedder = SentenceTransformerEmbeddings(model="all-MiniLM-L6-v2")
llm_adapter = GroqAdapter(llm=groq_llm)

kg_pipeline = SimpleKGPipeline(
    llm=llm_adapter,
    driver=driver,
    from_pdf=True,
    embedder= neo4j_embedder,
    schema="FREE",
    text_splitter=neo4j_semantic_splitter,
    on_error="IGNORE" 
)


In [8]:
await kg_pipeline.run_async(file_path=path)

input_text:  
You are a top-tier algorithm designed for extracting
information in structured formats to build a knowledge graph.

Extract the entities (nodes) and specify their type from the following text.
Also extract the relationships between these nodes.

Return result as JSON using the following format:
{"nodes": [ {"id": "0", "label": "Person", "properties": {"name": "John"} }],
"relationships": [{"type": "KNOWS", "start_node_id": "0", "end_node_id": "1", "properties": {"since": "2024-08-01"} }] }

Use only the following node and relationship types (if provided):
{'node_types': (), 'relationship_types': (), 'patterns': (), 'constraints': (), 'additional_node_types': True, 'additional_relationship_types': True, 'additional_patterns': True}

Assign a unique ID (string) to each node, and reuse it to define relationships.
Do respect the source and target node types for relationship and
the relationship direction.

Make sure you adhere to the following rules to produce valid JSON obje

PipelineResult(run_id='d1a98c51-b2ba-4d82-b99a-9b20a4c9d1e0', result={'resolver': {'number_of_nodes_to_resolve': 271, 'number_of_created_nodes': 223}})

In [None]:
#KG Pipeline Tracker:
## run_async() take the content of the page  and  call runner on it
## FixedSizeSplitter to split the page into chunks (character count)
## TextChunkEmbedder to create embedding of each chunk
## SchemaFromTextExtractor to create schema with LLM
### prompt for this :""You are a top-tier algorithm designed for extracting a labeled property graph schema in structured formats......
##LLMEntityRelationExtractor : extract the relationships and nodes from the text, it uses the only types and labels from the graph schema that SchemaFromTextExtractor provides
## GraphPruning to compare every extracted node label and relationship type against the GraphSchema
### It identifies relationships where the start_node or end_node is missing from the list of extracted nodes.
## KGWriter : write the nodes and relationships to the neo4j
##SinglePropertyExactMatchResolver : Resolve entities with same label and exact same property (default is "name").
### LLM might create with different id but same properties



In [14]:
from neo4j_graphrag.indexes import create_vector_index

create_vector_index(
    driver, 
    name="chunk_vector", 
    label="Chunk", 
    embedding_property="embedding", 
    dimensions=384,  
    similarity_fn="cosine"
)

In [5]:
from neo4j_graphrag.retrievers import VectorCypherRetriever
from neo4j_graphrag.generation import GraphRAG

retriever = VectorCypherRetriever(
    driver=driver,
    index_name="chunk_vector",
    embedder=neo4j_embedder,
    retrieval_query="""
    MATCH (node)-[:NEXT_CHUNK|:FROM_CHUNK]-(neighbor)
    WITH node, collect(coalesce(neighbor.text, neighbor.name, labels(neighbor)[0])) AS info
    RETURN node.text AS text, 
           node.score AS score, 
           {related_info: info} AS metadata
    """
)

In [6]:
rag_system = GraphRAG(
    retriever=retriever, 
    llm=llm_adapter       
)

query = "What are the core components of the RAG pipeline mentioned in the document?"
response = rag_system.search(
    query_text=query, 
    retriever_config={"top_k": 1}
)

print(f"Answer: {response.answer}")



prompt:  Context:
<Record text='Ragas: Automated Evaluation of Retrieval Augmented Generation\nShahul Es†, Jithin James †, Luis Espinosa-Anke ∗♢, Steven Schockaert ∗\n†Exploding Gradients\n∗CardiffNLP, Cardiff University, United Kingdom\n♢AMPLYFI, United Kingdom\nshahules786@gmail.com,jamesjithin97@gmail.com\n{espinosa-ankel,schockaerts1}@cardiff.ac.uk\nAbstract\nWe introduce Ragas (Retrieval Augmented\nGeneration Assessment), a framework for\nreference-free evaluation of Retrieval Aug-\nmented Generation (RAG) pipelines. RAG\nsystems are composed of a retrieval and an\nLLM based generation module, and provide\nLLMs with knowledge from a reference textual\ndatabase, which enables them to act as a natu-\nral language layer between a user and textual\ndatabases, reducing the risk of hallucinations. Evaluating RAG architectures is, however, chal-\nlenging because there are several dimensions to\nconsider: the ability of the retrieval system to\nidentify relevant and focused context passag