In [2]:
## Code from https://github.com/neo4j-product-examples/graphrag-python-examples/tree/main

!pip install fsspec langchain-text-splitters tiktoken openai python-dotenv numpy torch neo4j-graphrag

Collecting torch
  Using cached torch-2.5.1-cp311-none-macosx_11_0_arm64.whl.metadata (28 kB)
Collecting filelock (from torch)
  Using cached filelock-3.16.1-py3-none-any.whl.metadata (2.9 kB)
Collecting jinja2 (from torch)
  Using cached jinja2-3.1.5-py3-none-any.whl.metadata (2.6 kB)
Collecting sympy==1.13.1 (from torch)
  Using cached sympy-1.13.1-py3-none-any.whl.metadata (12 kB)
Collecting mpmath<1.4,>=1.1.0 (from sympy==1.13.1->torch)
  Using cached mpmath-1.3.0-py3-none-any.whl.metadata (8.6 kB)
Collecting MarkupSafe>=2.0 (from jinja2->torch)
  Using cached MarkupSafe-3.0.2-cp311-cp311-macosx_11_0_arm64.whl.metadata (4.0 kB)
Using cached torch-2.5.1-cp311-none-macosx_11_0_arm64.whl (63.9 MB)
Using cached sympy-1.13.1-py3-none-any.whl (6.2 MB)
Using cached filelock-3.16.1-py3-none-any.whl (16 kB)
Using cached jinja2-3.1.5-py3-none-any.whl (134 kB)
Using cached MarkupSafe-3.0.2-cp311-cp311-macosx_11_0_arm64.whl (12 kB)
Using cached mpmath-1.3.0-py3-none-any.whl (536 kB)
Installing

In [3]:
from dotenv import load_dotenv
import os

# Load the local .env file into the process environment; override=True lets
# values from the file replace variables that are already set. The OpenAI API
# key, if present in that file, is exported as a side effect.
load_dotenv('.env', override=True)

# Neo4j connection settings; each is None when the variable is not defined.
NEO4J_URI, NEO4J_USERNAME, NEO4J_PASSWORD = (
    os.getenv(env_key)
    for env_key in ('NEO4J_URI', 'NEO4J_USERNAME', 'NEO4J_PASSWORD')
)

# Not using a .env file? Uncomment the next line and paste the key instead.
# os.environ['OPENAI_API_KEY'] = 'copy_paste_the_openai_key_here'

In [4]:
import neo4j
from neo4j_graphrag.llm import OpenAILLM
from neo4j_graphrag.embeddings.openai import OpenAIEmbeddings

# Neo4j driver, authenticated with the credentials read from the environment.
neo4j_auth = (NEO4J_USERNAME, NEO4J_PASSWORD)
driver = neo4j.GraphDatabase.driver(NEO4J_URI, auth=neo4j_auth)

# Parameters forwarded to the chat model used for extraction.
extraction_model_params = {
    # JSON-object response format gives the best extraction results
    "response_format": {"type": "json_object"},
    # temperature 0 for more deterministic output
    "temperature": 0,
}
ex_llm = OpenAILLM(
    model_name="gpt-4o-mini",
    model_params=extraction_model_params,
)

# Text embedder (library defaults).
embedder = OpenAIEmbeddings()

In [5]:
# Extraction prompt handed to the KG pipeline.
# - `{schema}`, `{examples}` and `{text}` are placeholders; literal JSON braces
#   are doubled (`{{` / `}}`), i.e. the template is written for str.format-style
#   substitution (presumably performed by the pipeline - confirm against the
#   neo4j-graphrag version in use).
# - The wording targets the mortgage loan transcripts ingested by this
#   notebook. The previous text cast the model as a "medical researcher"
#   reading "papers", which did not match the input data.
prompt_template = '''
You are a mortgage lending analyst tasked with extracting information from mortgage loan conversation transcripts
and structuring it in a property graph to inform further lending and customer-service Q&A.

Extract the entities (nodes) and specify their type from the following Input text.
Also extract the relationships between these nodes. The relationship direction goes from the start node to the end node.


Return result as JSON using the following format:
{{"nodes": [ {{"id": "0", "label": "the type of entity", "properties": {{"name": "name of entity" }} }}],
  "relationships": [{{"type": "TYPE_OF_RELATIONSHIP", "start_node_id": "0", "end_node_id": "1", "properties": {{"details": "Description of the relationship"}} }}] }}

- Use only the information from the Input text. Do not add any additional information.
- If the input text is empty, return empty JSON.
- Make sure to create as many nodes and relationships as needed to offer rich context about the loan conversation for further analysis.
- An AI knowledge assistant must be able to read this graph and immediately understand the context to inform detailed questions about the loan and the customer.
- Multiple documents will be ingested from different sources and we are using this property graph to connect information, so make sure entity types are fairly general.

Use only the following nodes and relationships (if provided):
{schema}

Assign a unique ID (string) to each node, and reuse it to define relationships.
Do respect the source and target node types for relationship and
the relationship direction.

Do not return any additional information other than the JSON in it.

Examples:
{examples}

Input text:

{text}
'''

In [6]:
from neo4j_graphrag.experimental.components.text_splitters.fixed_size_splitter import FixedSizeSplitter
from neo4j_graphrag.experimental.pipeline.kg_builder import SimpleKGPipeline

# Splitter applied to the input text before extraction.
chunk_splitter = FixedSizeSplitter(chunk_size=500, chunk_overlap=100)

# Knowledge-graph builder fed with raw text (from_pdf=False).
# No `entities` / `relations` arguments are passed here; `node_labels` and
# `rel_types` could be supplied through them to constrain the extraction.
kg_builder_pdf = SimpleKGPipeline(
    llm=ex_llm,
    driver=driver,
    text_splitter=chunk_splitter,
    embedder=embedder,
    prompt_template=prompt_template,
    from_pdf=False,
)

In [15]:
# Read the transcripts into strings
with open("datasets/transcripts/mortgage_loan_1_transcript.txt", "r") as f:
    mortgage_loan_transcript_1 = ''.join(f.read().splitlines())

with open("datasets/transcripts/mortgage_loan_2_transcript.txt", "r") as f:
    mortgage_loan_transcript_2 = ''.join(f.read().splitlines())

with open("datasets/transcripts/mortgage_loan_3_transcript.txt", "r") as f:
    mortgage_loan_transcript_3 = ''.join(f.read().splitlines())

# Create a list of transcript texts
transcripts = [
    mortgage_loan_transcript_1,
    mortgage_loan_transcript_2,
    mortgage_loan_transcript_3,
]

# Process each transcript
for text in transcripts:
    pdf_result = await kg_builder_pdf.run_async(text=text)  # Pass the text directly
    print(f"Result: {pdf_result}")




Result: run_id='05dfe5bf-7b02-4a47-84d5-bac526012dcf' result={'resolver': {'number_of_nodes_to_resolve': 0, 'number_of_created_nodes': None}}




Result: run_id='2b863d91-dac2-4213-adb9-8f6bc43371c5' result={'resolver': {'number_of_nodes_to_resolve': 0, 'number_of_created_nodes': None}}




Result: run_id='1b1feea3-8613-4681-849a-3bb399cbc7b9' result={'resolver': {'number_of_nodes_to_resolve': 0, 'number_of_created_nodes': None}}
