In [1]:
import os
from groq import Groq
from dotenv import load_dotenv

# load_dotenv() runs first so that a local .env file (if present) can supply
# GROQ_API_KEY before it is read below.
load_dotenv()

# Module-level Groq client used by extract_triplets. os.getenv yields None when
# the variable is unset, and that value is handed to Groq unchanged.
client = Groq(api_key=os.getenv("GROQ_API_KEY"))

In [22]:
def extract_triplets(text):
    """Ask the Groq chat model for (subject, relation, object) triplets found in ``text``.

    Args:
        text: Passage to mine for relationships; it is interpolated into the
            user message.

    Returns:
        The message content of the first completion choice, returned unparsed.
        The request asks for a JSON object and the prompt's example uses a
        top-level ``"triplets"`` key, so callers are expected to decode it
        themselves (e.g. with ``json.loads``).
    """
    system_prompt = """
    You are an expert Knowledge Graph engineer. Your task is to extract relationships from text in the form of (subject, relation, object) triplets.
    
    GUIDELINES:
    1. Subjects and Objects should be concise nouns or entities (e.g., "Ragas", "LLM", "Evaluation Framework").
    2. Relations should be verbs or verb phrases representing the link (e.g., "is designed for", "measures", "improves").
    3. If a sentence is complex, break it into multiple simple triplets.
    4. Ensure the output is strictly valid JSON.
    
    EXAMPLE:
    Text: "Ragas uses LLMs to automate the evaluation of RAG pipelines."
    Output: {
      "triplets": [
        ["Ragas", "uses", "LLMs"],
        ["Ragas", "automates", "evaluation of RAG pipelines"]
      ]
    }
    """

    # Conversation sent to the model: fixed instructions first, then the passage.
    chat_messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Extract entities from this text: {text}"},
    ]

    response = client.chat.completions.create(
        model="llama-3.3-70b-versatile",
        messages=chat_messages,
        # Temperature 0: extraction for the knowledge graph should be deterministic.
        temperature=0,
        # JSON mode, so the reply comes back as a JSON object.
        response_format={"type": "json_object"},
    )

    first_choice = response.choices[0]
    return first_choice.message.content

# Quick manual check: run the extractor on a single sentence and print the raw
# JSON string it returns.
sample_text = (
    "Patient shows signs of Bradycardia which is often associated "
    "with the HCN4 gene mutation."
)
print(extract_triplets(sample_text))

{
  "triplets": [
       ["Patient", "shows", "signs of Bradycardia"],
       ["Bradycardia", "is associated with", "HCN4 gene mutation"]
   ]
}


In [23]:
import os
import json
from dotenv import load_dotenv
from groq import Groq
from langchain_experimental.text_splitter import SemanticChunker
from langchain_huggingface import HuggingFaceEmbeddings
from neo4j import GraphDatabase

In [24]:
# load_dotenv() runs first so that a local .env file (if present) can supply
# GROQ_API_KEY before it is read below.
load_dotenv()

# Groq client used by get_propositions (None is passed when the key is unset).
groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

# Embedding model handed to SemanticChunker in semantic_grouping.
embed_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

In [None]:
def get_propositions(text):
    """Decompose ``text`` into standalone atomic facts using the Groq chat model.

    Args:
        text: Passage to decompose; it is interpolated into the user message.

    Returns:
        list[str]: The non-blank string entries found under the
        ``"propositions"`` key of the model's JSON reply. A bare string under
        that key is treated as a single proposition. An empty list is returned
        (after printing a notice) when the reply is empty, is not valid JSON,
        is not a JSON object, or holds something other than a list/string
        under the key.
    """
    response = groq_client.chat.completions.create(
        model="llama-3.1-8b-instant",
        messages=[
            {
                "role": "system",
                "content": (
                    "Decompose the input text into standalone, atomic facts. "
                    "Return the result in JSON format with a single key called 'propositions' "
                    "which contains a list of strings. "
                    "Example: {'propositions': ['fact 1', 'fact 2']}"
                )
            },
            {
                "role": "user",
                "content": f"Decompose this text: {text}"
            }
        ],
        response_format={"type": "json_object"}
    )

    raw_reply = response.choices[0].message.content
    if not raw_reply:
        # json.loads(None) would raise TypeError; treat "no content" as "no facts".
        print("Skipping segment: model returned an empty reply")
        return []

    try:
        content = json.loads(raw_reply)
    except json.JSONDecodeError as exc:
        print(f"Skipping segment: reply is not valid JSON ({exc})")
        return []

    if not isinstance(content, dict):
        # Only a JSON object can carry the "propositions" key.
        print(f"Skipping segment: expected a JSON object, got {type(content).__name__}")
        return []

    propositions = content.get("propositions", [])
    if isinstance(propositions, str):
        # A single fact returned as a bare string instead of a one-item list.
        propositions = [propositions]
    elif not isinstance(propositions, list):
        print(f"Skipping segment: 'propositions' is not a list ({type(propositions).__name__})")
        return []

    # semantic_grouping joins these with " ", which fails on non-string items,
    # so keep only real, non-blank strings.
    return [item for item in propositions if isinstance(item, str) and item.strip()]

In [26]:
def semantic_grouping(propositions):
    """Regroup propositions into chunks using ``SemanticChunker``.

    The usable propositions are joined with single spaces into one text, which
    is then split by a ``SemanticChunker`` built on the module-level
    ``embed_model`` with the "percentile" breakpoint threshold type.

    Args:
        propositions: List of proposition strings. ``None``, non-string
            entries and blank strings are ignored.

    Returns:
        list[str]: ``page_content`` of every non-blank document the chunker
        produced, or ``[]`` when there is no usable input text.
    """
    usable = [p for p in (propositions or []) if isinstance(p, str) and p.strip()]
    if not usable:
        # Nothing to embed: avoid pushing an empty string through the chunker
        # (and, downstream, through the triplet extractor).
        return []

    # SemanticChunker expects a single string or list of docs
    chunker = SemanticChunker(embed_model, breakpoint_threshold_type="percentile")
    full_text = " ".join(usable)
    # This splits the text at points where the "meaning" shifts
    chunks = chunker.create_documents([full_text])
    return [chunk.page_content for chunk in chunks if chunk.page_content.strip()]

In [27]:
def save_to_neo4j(triplets):
    """Write (subject, relation, object) triplets to Neo4j.

    Connection settings are read from the ``NEO4J_URI``, ``NEO4J_USERNAME`` and
    ``NEO4J_PASSWORD`` environment variables. For each valid triplet the
    subject and object are MERGEd as ``:Entity`` nodes keyed by ``name`` and a
    relationship is merged between them through ``apoc.merge.relationship``
    (so the APOC procedures must be available on the server).

    The relationship type is the relation text with whitespace runs replaced
    by ``_`` and upper-cased, e.g. ``"is designed for"`` -> ``"IS_DESIGNED_FOR"``.

    Args:
        triplets: Iterable of 3-item lists/tuples. Items that are not 3-item
            sequences, or that contain a ``None``/blank part, are reported with
            a printed message and skipped. ``None`` or an empty list is a no-op
            and does not open a connection.

    Returns:
        None.
    """
    if not triplets:
        # Nothing to write: do not open a connection at all.
        return

    uri = os.getenv("NEO4J_URI")
    user = os.getenv("NEO4J_USERNAME")
    password = os.getenv("NEO4J_PASSWORD")

    merge_query = """
                    MERGE (a:Entity {name: $s})
                    MERGE (b:Entity {name: $o})
                    WITH a, b
                    CALL apoc.merge.relationship(a, $rel_type, {}, {}, b) YIELD rel
                    RETURN rel
                """

    # Context managers guarantee the session and the driver are closed even
    # when a query raises part-way through the batch.
    with GraphDatabase.driver(uri, auth=(user, password)) as driver:
        with driver.session() as session:
            for triplet in triplets:
                # it's a valid 3-item sequence or not
                if not (isinstance(triplet, (list, tuple)) and len(triplet) == 3):
                    print(f"Skipping malformed triplet: {triplet}")
                    continue

                if any(part is None for part in triplet):
                    # str(None) would otherwise create a node literally named "None".
                    print(f"Skipping malformed triplet: {triplet}")
                    continue

                subject, relation, obj = (str(part).strip() for part in triplet)

                # Convention: UPPER_SNAKE_CASE relationship types; split() also
                # collapses repeated whitespace so it cannot yield "__".
                rel_type = "_".join(relation.split()).upper()

                if not (subject and rel_type and obj):
                    # Blank names/types would create junk nodes or an invalid relationship type.
                    print(f"Skipping malformed triplet: {triplet}")
                    continue

                session.run(merge_query, s=subject, rel_type=rel_type, o=obj)

In [28]:
from langchain_community.document_loaders import PyPDFLoader, TextLoader, Docx2txtLoader
def load_local_file(file_path):
    """Read a local .pdf, .txt or .docx file and return its text as one string.

    Args:
        file_path: Path of the file to load. The loader is chosen from the
            lower-cased file extension.

    Returns:
        str: ``page_content`` of every loaded document, joined with a blank
        line (``"\\n\\n"``).

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        ValueError: If the extension is not .pdf, .txt or .docx.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"The file {file_path} was not found.")

    _, suffix = os.path.splitext(file_path)
    suffix = suffix.lower()

    # Factories are lazy so that only the loader actually needed gets built.
    loader_factories = {
        ".pdf": lambda: PyPDFLoader(file_path),
        ".txt": lambda: TextLoader(file_path, encoding="utf-8"),
        ".docx": lambda: Docx2txtLoader(file_path),
    }

    make_loader = loader_factories.get(suffix)
    if make_loader is None:
        raise ValueError(f"Unsupported file format: {suffix}")

    pages = make_loader().load()
    return "\n\n".join(page.page_content for page in pages)

In [29]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Coarse first-pass splitter applied to the raw document text before the
# LLM-based proposition extraction.
# NOTE(review): chunk_size / chunk_overlap are presumably counted in characters
# (the splitter's default length function) - confirm.
structural_splitter = RecursiveCharacterTextSplitter(
    # Listed from the coarsest boundary (blank line) down to a single space.
    separators=["\n\n", "\n", ".", " "],
    chunk_size=2000,
    chunk_overlap=200,
)

In [30]:
from tqdm import tqdm

# Document to ingest. Set the KG_SOURCE_FILE environment variable (e.g. in the
# .env file) to point at another document; the original author's local path is
# kept only as the fallback so existing runs behave as before.
target_file = os.getenv(
    "KG_SOURCE_FILE",
    r"C:\Users\Cengizhan\Desktop\CMPE492-Project-Rag-Pipeline\Documents\Ragas\ragas_2309.15217v2.pdf",
)
print(f"Loading file: {target_file}")
raw_content = load_local_file(target_file)

# Stage 1: coarse structural split of the raw text.
rough_chunks = structural_splitter.split_text(raw_content)

# Stage 2: one LLM call per coarse chunk to decompose it into atomic facts.
all_propositions = []

for segment in tqdm(rough_chunks, desc="Extracting Facts"):
    props = get_propositions(segment)
    all_propositions.extend(props)

# Stage 3: regroup the collected facts into semantically coherent chunks.
semantic_chunks = semantic_grouping(all_propositions)


Loading file: C:\Users\Cengizhan\Desktop\CMPE492-Project-Rag-Pipeline\Documents\Ragas\ragas_2309.15217v2.pdf


Extracting Facts: 100%|██████████| 22/22 [01:56<00:00,  5.27s/it]


In [33]:
semantic_chunks

['Ragas is a framework for reference-free evaluation of Retrieval Augmented Generation pipelines. RAG systems are composed of a retrieval and an LLM based generation module. RAG systems provide LLMs with knowledge from a reference textual database. RAG systems reduce the risk of hallucinations by acting as a natural language layer between a user and textual databases. Evaluating RAG architectures is challenging because of several dimensions to consider. The dimensions to consider in RAG architectures include the ability to identify relevant and focused context passages. The dimensions to consider in RAG architectures include the ability to exploit passages in a faithful way. The dimensions to consider in RAG architectures include the quality of the generation itself. Ragas provides a suite of metrics to evaluate different dimensions without relying on ground truth human annotations. A framework like Ragas can contribute to faster evaluation cycles of RAG architectures. RAG architecture

In [38]:
# For every semantic chunk: ask the LLM for triplets, decode the JSON reply and
# write the triplets to Neo4j. A chunk whose reply cannot be decoded is skipped
# (and counted) instead of aborting the whole ingestion.
skipped_chunks = 0

for chunk in tqdm(semantic_chunks, desc="Ingesting to Neo4j"):
    raw_response = extract_triplets(chunk)
    print(raw_response)

    try:
        parsed = json.loads(raw_response)
    except (TypeError, json.JSONDecodeError) as exc:
        # TypeError covers a None reply; JSONDecodeError covers malformed text.
        print(f"Skipping chunk: response is not valid JSON ({exc})")
        skipped_chunks += 1
        continue

    # The payload must be an object holding a list under "triplets".
    triplets = parsed.get("triplets", []) if isinstance(parsed, dict) else None
    if not isinstance(triplets, list):
        print("Skipping chunk: response has no 'triplets' list")
        skipped_chunks += 1
        continue

    save_to_neo4j(triplets)

print("\nKnowledge Graph constructed in Neo4j container.")
if skipped_chunks:
    print(f"{skipped_chunks} chunk(s) were skipped because their response could not be used.")

Ingesting to Neo4j:   0%|          | 0/18 [00:00<?, ?it/s]

{
  "triplets": [
       ["Ragas", "is", "framework"],
       ["Ragas", "provides", "reference-free evaluation"],
       ["RAG systems", "are composed of", "retrieval module"],
       ["RAG systems", "are composed of", "LLM based generation module"],
       ["RAG systems", "provide", "knowledge"],
       ["RAG systems", "reduce", "risk of hallucinations"],
       ["RAG architectures", "are challenging to evaluate", "because of several dimensions"],
       ["RAG architectures", "consider", "ability to identify relevant context passages"],
       ["RAG architectures", "consider", "ability to exploit passages"],
       ["RAG architectures", "consider", "quality of generation"],
       ["Ragas", "provides", "suite of metrics"],
       ["Ragas", "contributes to", "faster evaluation cycles"],
       ["RAG architectures", "are adopted quickly", "due to fast adoption of LLMs"],
       ["Language Models", "capture", "knowledge about the world"],
       ["Language Models", "answer", "questions"]

Ingesting to Neo4j:   6%|▌         | 1/18 [00:05<01:34,  5.55s/it]

{
  "triplets": [
       ["Ragas framework", "integrates with", "llama-index"],
       ["Ragas framework", "integrates with", "Langchain"],
       ["llama-index", "is used for", "building RAG solutions"],
       ["Langchain", "is used for", "building RAG solutions"],
       ["Ragas", "can be integrated into", "standard workflows"],
       ["LLMs", "are used for", "estimating faithfulness"],
       ["LLMs", "struggle with", "detecting hallucination"],
       ["standard prompting strategies", "are used for", "detecting hallucination"],
       ["external knowledge base", "is used for", "linking generated responses"],
       ["BARTScore", "estimates", "factuality"],
       ["BARTScore", "looks at", "conditional probabilities of generated text"],
       ["Kadavath et al.", "studied", "detecting hallucination"]
   ]
}


Ingesting to Neo4j:  11%|█         | 2/18 [00:06<00:44,  2.80s/it]

{
  "triplets": [
       ["LLMs", "provide", "well-calibrated probabilities"],
       ["LLMs", "convert", "answer validation into a multiple-choice question"],
       ["supervised classifier", "can be trained to predict", "whether an answer is true or false"],
       ["Azaria and Mitchell", "propose", "training a supervised classifier on the weights from one of the hidden layers of the LLM"],
       ["supervised classifier", "can predict", "whether a given statement is true or not"],
       ["supervised classifier", "uses", "weights from one of the hidden layers of the LLM"],
       ["approach", "performs well", "training a supervised classifier on the weights from one of the hidden layers of the LLM"],
       ["approach", "is unsuitable for", "systems that access LLMs through an API"]
   ]
}


Ingesting to Neo4j:  17%|█▋        | 3/18 [00:07<00:28,  1.90s/it]

{
  "triplets": [
       ["Methods", "are needed for", "models"],
       ["SelfCheckGPT", "addresses", "problem of token probabilities"],
       ["Factual answers", "are", "semantically similar"],
       ["Hallucinated answers", "are not", "semantically similar"],
       ["LLMs", "can be used for", "evaluation of text generation systems"],
       ["GPTScore", "uses", "prompt"],
       ["GPTScore", "scores", "passages"],
       ["GPTScore", "considers", "fluency"],
       ["GPTScore", "uses", "autoregressive LM"],
       ["Yuan et al.", "considered", "using prompts"]
   ]
}


Ingesting to Neo4j:  28%|██▊       | 5/18 [00:08<00:13,  1.05s/it]

{
  "triplets": [
       ["Yuan et al.", "evaluates", "aspects of generated text"],
       ["Yuan et al.", "performed evaluation in", "2021"]
   ]
}
{
  "triplets": [
       ["2021", "used", "BART"],
       ["ChatGPT", "evaluates", "given answer"],
       ["ChatGPT", "provides", "score"],
       ["ChatGPT", "provides", "rating"],
       ["Prompt", "is sensitive to", "design"],
       ["ChatGPT", "evaluates", "particular aspect"],
       ["LLMs", "select", "best answer"],
       ["LLMs", "compare", "performance"],
       ["LLMs", "require", "care"],
       ["Approaches", "evaluate", "generated text fragments"],
       ["BERTScore", "compares", "generated answers"],
       ["MoverScore", "compares", "generated answers"],
       ["BARTScore", "estimates", "precision and recall"],
       ["RAG systems", "retrieve", "context"],
       ["RAG systems", "generate", "answer"],
       ["Metrics", "are", "self-contained"],
       ["Metrics", "are", "reference-free"],
       ["Faithfulness", "refe

Ingesting to Neo4j:  39%|███▉      | 7/18 [00:10<00:11,  1.08s/it]

{
  "triplets": [
       ["LLM", "is used for", "faithfulness estimation"],
       ["Sentences", "are decomposed into", "shorter assertions"],
       ["Prompt", "is used to create", "statements"],
       ["gpt-3.5-turbo-16k model", "is used for", "prompts"],
       ["LLM", "determines inference of", "statements"],
       ["Verification function", "is carried out using", "prompt"],
       ["Context", "is used for", "verification"],
       ["Statements", "are verified against", "context"],
       ["Verification function", "provides", "explanation and verdict"]
   ]
}
{
  "triplets": [
       ["F", "is computed as", "faithfulness score"],
       ["F", "equals", "|V| / |S|"],
       ["V", "is", "number of statements"],
       ["V", "is supported by", "LLM"],
       ["F", "is calculated using", "V and S"]
   ]
}


Ingesting to Neo4j:  44%|████▍     | 8/18 [00:11<00:09,  1.05it/s]

{
  "triplets": [
       ["Answer relevance", "is assessed by", "Our assessment"],
       ["Our assessment", "penalises", "incomplete answers"],
       ["Our assessment", "penalises", "redundant information"],
       ["LLM", "generates", "potential questions"],
       ["Text-embedding-ada-002 model", "is obtained from", "OpenAI API"],
       ["Text-embedding-ada-002 model", "is used for", "question embeddings"],
       ["Similarity", "is calculated between", "original question and potential questions"]
   ]
}


Ingesting to Neo4j:  56%|█████▌    | 10/18 [00:12<00:06,  1.28it/s]

{
  "triplets": [
       ["LLM", "determines", "inference of Si"],
       ["LLM", "uses", "verification function v"],
       ["Verification function v", "takes", "Si and c(q)"],
       ["Verification step", "is carried out using", "prompt"],
       ["Prompt", "considers", "given context and statements"],
       ["Prompt", "determines", "support of statements by context"]
   ]
}


Ingesting to Neo4j:  61%|██████    | 11/18 [00:13<00:04,  1.45it/s]

{
  "triplets": [
       ["Text", "requires", "explanation"],
       ["Explanation", "is provided for", "statement"],
       ["Statement", "receives", "verdict"],
       ["Verdict", "is given as", "Yes/No"]
   ]
}
{
  "triplets": [
       ["text-embedding-ada-002 model", "is available from", "OpenAI API"],
       ["embeddings", "are obtained for", "questions"],
       ["similarity", "is calculated as", "cosine between embeddings"],
       ["answer relevance score", "is computed as", "sum of similarities"],
       ["answer relevance score", "evaluates", "generated answer"],
       ["context", "contains", "information"],
       ["context relevance metric", "penalises", "model"],
       ["language model", "extracts", "subset of sentences"],
       ["context relevance", "is computed as", "number of extracted sentences divided by total number of sentences"],
       ["WikiEval Dataset", "is proposed to evaluate", "framework"],
       ["WikiEval Dataset", "includes", "question-context-answer 

Ingesting to Neo4j:  72%|███████▏  | 13/18 [00:15<00:04,  1.22it/s]

{
  "triplets": [
       ["question", "must not contain", "links"],
       ["question", "must be framed from", "part"],
       ["part", "contains", "non-trivial information"]
   ]
}


Ingesting to Neo4j:  78%|███████▊  | 14/18 [00:15<00:02,  1.49it/s]

{
  "triplets": [
       ["model", "has training cutoff", "2022"],
       ["model used in experiment", "is different from", "model used in production"]
   ]
}


Ingesting to Neo4j:  83%|████████▎ | 15/18 [00:15<00:01,  1.81it/s]

{
  "triplets": [
       ["question", "should be of", "moderate difficulty"]
   ]
}
{
  "triplets": [
       ["Question", "must be understood by", "humans"],
       ["ChatGPT", "answers", "generated question"],
       ["Introductory section", "is used as", "context"],
       ["Prompt", "instructs to answer", "question"],
       ["Annotators", "annotate", "questions"],
       ["Annotators", "are fluent in", "English"],
       ["Annotators", "receive", "instructions"],
       ["Annotators", "agree on", "faithfulness"],
       ["Annotators", "agree on", "context relevance"],
       ["Disagreements", "are resolved by", "discussion"],
       ["ChatGPT", "answers question without", "additional context"],
       ["Human annotators", "judge", "faithfulness"],
       ["Human annotators", "compare", "incompletely answered questions"],
       ["Context", "is added with", "Wikipedia page"],
       ["Additional sentences", "are scraped from", "Wikipedia page"],
       ["Additional sentences", "are 

Ingesting to Neo4j:  89%|████████▉ | 16/18 [00:19<00:02,  1.40s/it]

{
  "triplets": [
       ["Ragas", "supports", "development of quality assessment framework"],
       ["Ragas", "is", "framework"],
       ["Ragas", "provides", "feedback to developers of RAG systems"],
       ["Ragas", "is easy to use", "use"],
       ["GPT Ranking", "is", "baseline"],
       ["Broken ranking system", "is", "reference"],
       ["WikiEval", "is", "dataset"],
       ["WikiEval", "contains", "human judgements"],
       ["Evaluation framework", "should assess", "faithfulness"],
       ["Evaluation framework", "should assess", "answer relevance"],
       ["Evaluation framework", "should assess", "context relevance"],
       ["Ragas", "predicts", "faithfulness"],
       ["Ragas", "predicts", "answer relevance"],
       ["Ragas", "predicts", "context relevance"],
       ["Amos Azaria", "published", "The internal state of an LLM knows when its lying"],
       ["Tom M. Mitchell", "published", "The internal state of an LLM knows when its lying"],
       ["Sebastian Borgeaud", 

Ingesting to Neo4j:  94%|█████████▍| 17/18 [00:22<00:01,  1.88s/it]

{
  "triplets": [
       ["Robert Oppenheimer", "was", "theoretical physicist"],
       ["J. Robert Oppenheimer", "was pivotal in developing", "first nuclear weapons"],
       ["Manhattan Project", "ushered in", "Atomic Age"],
       ["Cillian Murphy", "stars as", "Oppenheimer"],
       ["Emily Blunt", "stars as", "Katherine 'Kitty' Oppenheimer"],
       ["Christopher Nolan", "directed", "film Oppenheimer"],
       ["PSLV-C56 mission", "is scheduled to be launched on", "Sunday, 30 July 2023"],
       ["PSLV-C56 mission", "will be launched from", "Satish Dhawan Space Centre"],
       ["Satish Dhawan Space Centre", "is located in", "Sriharikota, Andhra Pradesh, India"],
       ["Chimnabai Clock Tower", "was completed in", "1896"],
       ["Chimnabai Clock Tower", "is situated in", "Raopura area of Vadodara, Gujarat, India"],
       ["Chimnabai Clock Tower", "was named in memory of", "Chimnabai I"],
       ["Chimnabai I", "was", "queen"],
       ["Chimnabai I", "was first wife of", "Sayaj

Ingesting to Neo4j: 100%|██████████| 18/18 [00:26<00:00,  1.47s/it]


Knowledge Graph constructed in Neo4j container.



