In [1]:
from dotenv import load_dotenv
import os
import json
import textwrap
import warnings
warnings.filterwarnings('ignore')
from langchain_community.graphs import Neo4jGraph
from langchain_community.vectorstores import Neo4jVector
from langchain_ollama import OllamaEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQAWithSourcesChain
from langchain_ollama import ChatOllama


In [61]:
# Read the Neo4j connection settings from the environment
# (load_dotenv is called with override=True before the lookups).
load_dotenv(override=True)
NEO4J_URI, NEO4J_USERNAME, NEO4J_PASSWORD, NEO4J_DATABASE = (
    os.getenv(setting_name)
    for setting_name in (
        'NEO4J_URI',
        'NEO4J_USERNAME',
        'NEO4J_PASSWORD',
        'NEO4J_DATABASE',
    )
)

In [62]:
# Graph handle used by every kg.query(...) call in this notebook.
kg = Neo4jGraph(
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
)

In [63]:
# Identifiers for the vector index used in this notebook: the index name, the
# node label it covers, and the node properties that hold the chunk text and
# its embedding vector.
VECTOR_INDEX_NAME = 'form_10k_chunks'
VECTOR_NODE_LABEL = 'Chunk'
VECTOR_SOURCE_PROPERTY = 'text'
VECTOR_EMBEDDING_PROPERTY = 'textEmbedding'

In [64]:
first_file_name = "/home/ali/AI/deepLearning/2.KnowledgeGraph/data/0000950170-23-027948.json"

In [65]:
# Parse the filing inside a context manager so the file handle is closed
# as soon as the JSON has been read (a bare open() leaves it dangling).
with open(first_file_name) as first_file:
    first_file_as_object = json.load(first_file)

In [66]:
type(first_file_as_object)

dict

# Print Keys

In [67]:
# Show each top-level key of the parsed filing next to the type of its value.
for key in first_file_as_object:
    print(key, type(first_file_as_object[key]))

item1 <class 'str'>
item1a <class 'str'>
item7 <class 'str'>
item7a <class 'str'>
cik <class 'str'>
cusip6 <class 'str'>
cusip <class 'list'>
names <class 'list'>
source <class 'str'>


In [68]:
item1_text = first_file_as_object['item1']

In [69]:
item1_text[0:1500]

'>Item 1.  \nBusiness\n\n\nOverview\n\n\nNetApp, Inc. (NetApp, we, us or the Company) is a global cloud-led, data-centric software company. We were incorporated in 1992 and are headquartered in San Jose, California. Building on more than three decades of innovation, we give customers the freedom to manage applications and data across hybrid multicloud environments. Our portfolio of cloud services, and storage infrastructure, powered by intelligent data management software, enables applications to run faster, more reliably, and more securely, all at a lower cost.\n\n\nOur opportunity is defined by the durable megatrends of data-driven digital and cloud transformations. NetApp helps organizations meet the complexities created by rapid data and cloud growth, multi-cloud management, and the adoption of next-generation technologies, such as AI, Kubernetes, and modern databases. Our modern approach to hybrid, multicloud infrastructure and data management, which we term ‘evolved cloud’, provi

In [70]:
# Splitter shared by the chunking cells below.
# Sizes are measured with len(), i.e. in characters: chunks of up to 2000
# with a 200 overlap between neighbours.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 2000,
    chunk_overlap = 200,
    length_function = len,
    is_separator_regex = False,  # separators are not treated as regular expressions
)

In [71]:
item1_text_chunks = text_splitter.split_text(item1_text)

In [72]:
type(item1_text_chunks)

list

In [73]:
len(item1_text_chunks)

254

In [74]:
item1_text_chunks[0]

'>Item 1.  \nBusiness\n\n\nOverview\n\n\nNetApp, Inc. (NetApp, we, us or the Company) is a global cloud-led, data-centric software company. We were incorporated in 1992 and are headquartered in San Jose, California. Building on more than three decades of innovation, we give customers the freedom to manage applications and data across hybrid multicloud environments. Our portfolio of cloud services, and storage infrastructure, powered by intelligent data management software, enables applications to run faster, more reliably, and more securely, all at a lower cost.\n\n\nOur opportunity is defined by the durable megatrends of data-driven digital and cloud transformations. NetApp helps organizations meet the complexities created by rapid data and cloud growth, multi-cloud management, and the adoption of next-generation technologies, such as AI, Kubernetes, and modern databases. Our modern approach to hybrid, multicloud infrastructure and data management, which we term ‘evolved cloud’, provi

In [75]:
def split_form10k_data_from_file(file, max_chunks_per_item=20, splitter=None):
    """Split the text sections of one Form 10-K JSON file into chunk records.

    Args:
        file: Path to a JSON file containing the keys 'item1', 'item1a',
            'item7', 'item7a', 'names', 'cik', 'cusip6' and 'source'.
        max_chunks_per_item: Keep at most this many leading chunks of each
            section. Defaults to 20, the cap this notebook has always applied;
            pass None to keep every chunk.
        splitter: Object exposing ``split_text(text)`` that returns a list of
            strings. Defaults to the notebook-level ``text_splitter``.

    Returns:
        A list of dicts, one per retained chunk, with the keys 'text',
        'f10kItem', 'chunkSeqId', 'formId', 'chunkId', 'names', 'cik',
        'cusip6' and 'source'.
    """
    if splitter is None:
        splitter = text_splitter

    # Close the file handle as soon as the JSON has been parsed.
    with open(file) as form_file:
        file_as_object = json.load(form_file)

    # The form id is the file name without directory or extension. Unlike
    # slicing on rindex('/'), this also works for a bare file name.
    form_id = os.path.splitext(os.path.basename(file))[0]

    chunks_with_metadata = []
    for item in ['item1', 'item1a', 'item7', 'item7a']:
        print(f"Processing {item} from {file}")
        item_text_chunks = splitter.split_text(file_as_object[item])
        # Slicing with None keeps the full list.
        kept_chunks = item_text_chunks[:max_chunks_per_item]
        for chunk_seq_id, chunk in enumerate(kept_chunks):
            chunks_with_metadata.append({
                'text': chunk,
                'f10kItem': item,
                'chunkSeqId': chunk_seq_id,
                'formId': f'{form_id}',
                'chunkId': f'{form_id}-{item}-chunk{chunk_seq_id:04d}',
                'names': file_as_object['names'],
                'cik': file_as_object['cik'],
                'cusip6': file_as_object['cusip6'],
                'source': file_as_object['source'],
            })
        # Reports the number of chunks retained for this section.
        print(f'\tSplit into {len(kept_chunks)} chunks')
    return chunks_with_metadata

In [76]:
first_file_chunks = split_form10k_data_from_file(first_file_name)

Processing item1 from /home/ali/AI/deepLearning/2.KnowledgeGraph/data/0000950170-23-027948.json
	Split into 20 chunks
Processing item1a from /home/ali/AI/deepLearning/2.KnowledgeGraph/data/0000950170-23-027948.json
	Split into 1 chunks
Processing item7 from /home/ali/AI/deepLearning/2.KnowledgeGraph/data/0000950170-23-027948.json
	Split into 1 chunks
Processing item7a from /home/ali/AI/deepLearning/2.KnowledgeGraph/data/0000950170-23-027948.json
	Split into 1 chunks


In [77]:
print(first_file_chunks[0])

{'text': '>Item 1.  \nBusiness\n\n\nOverview\n\n\nNetApp, Inc. (NetApp, we, us or the Company) is a global cloud-led, data-centric software company. We were incorporated in 1992 and are headquartered in San Jose, California. Building on more than three decades of innovation, we give customers the freedom to manage applications and data across hybrid multicloud environments. Our portfolio of cloud services, and storage infrastructure, powered by intelligent data management software, enables applications to run faster, more reliably, and more securely, all at a lower cost.\n\n\nOur opportunity is defined by the durable megatrends of data-driven digital and cloud transformations. NetApp helps organizations meet the complexities created by rapid data and cloud growth, multi-cloud management, and the adoption of next-generation technologies, such as AI, Kubernetes, and modern databases. Our modern approach to hybrid, multicloud infrastructure and data management, which we term ‘evolved clou

In [78]:
# Cypher template that MERGEs a :Chunk node keyed by chunkId.
# All other properties are assigned under ON CREATE SET, so they are written
# only when the MERGE creates the node, not when it matches an existing one.
# Expects a single parameter, $chunkParam, carrying the fields referenced below.
merge_chunk_node_query = """
MERGE (mergedChunk: Chunk {chunkId: $chunkParam.chunkId})
    ON CREATE SET
        mergedChunk.names = $chunkParam.names,
        mergedChunk.formId = $chunkParam.formId,
        mergedChunk.cik = $chunkParam.cik,
        mergedChunk.cusip6 = $chunkParam.cusip6,
        mergedChunk.source = $chunkParam.source,
        mergedChunk.f10kItem = $chunkParam.f10kItem,
        mergedChunk.chunkSeqId = $chunkParam.chunkSeqId,
        mergedChunk.text = $chunkParam.text
RETURN mergedChunk
"""

In [79]:
kg.query(merge_chunk_node_query,
        params = {'chunkParam':first_file_chunks[0]})

[{'mergedChunk': {'formId': '0000950170-23-027948',
   'f10kItem': 'item1',
   'names': ['Netapp Inc', 'NETAPP INC'],
   'cik': '1002047',
   'cusip6': '64110D',
   'source': 'https://www.sec.gov/Archives/edgar/data/1002047/000095017023027948/0000950170-23-027948-index.htm',
   'text': '>Item 1.  \nBusiness\n\n\nOverview\n\n\nNetApp, Inc. (NetApp, we, us or the Company) is a global cloud-led, data-centric software company. We were incorporated in 1992 and are headquartered in San Jose, California. Building on more than three decades of innovation, we give customers the freedom to manage applications and data across hybrid multicloud environments. Our portfolio of cloud services, and storage infrastructure, powered by intelligent data management software, enables applications to run faster, more reliably, and more securely, all at a lower cost.\n\n\nOur opportunity is defined by the durable megatrends of data-driven digital and cloud transformations. NetApp helps organizations meet the 

In [80]:
kg.query("""
CREATE CONSTRAINT unique_chunk IF NOT EXISTS
    FOR (c:Chunk) REQUIRE c.chunkId IS UNIQUE
""")

[]

In [81]:
kg.query("SHOW INDEXES")

[{'id': 4,
  'name': 'form_10k_chunks',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'VECTOR',
  'entityType': 'NODE',
  'labelsOrTypes': ['Chunk'],
  'properties': ['textEmbedding'],
  'indexProvider': 'vector-2.0',
  'owningConstraint': None,
  'lastRead': neo4j.time.DateTime(2025, 9, 4, 8, 3, 30, 249000000, tzinfo=<UTC>),
  'readCount': 8},
 {'id': 0,
  'name': 'index_343aff4e',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'LOOKUP',
  'entityType': 'NODE',
  'labelsOrTypes': None,
  'properties': None,
  'indexProvider': 'token-lookup-1.0',
  'owningConstraint': None,
  'lastRead': neo4j.time.DateTime(2025, 9, 4, 10, 14, 17, 554000000, tzinfo=<UTC>),
  'readCount': 102},
 {'id': 1,
  'name': 'index_f7700477',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'LOOKUP',
  'entityType': 'RELATIONSHIP',
  'labelsOrTypes': None,
  'properties': None,
  'indexProvider': 'token-lookup-1.0',
  'owningConstraint': None,
  'lastRead': None,
  'readCo

In [82]:
# MERGE one :Chunk node per chunk record.
# node_count is the number of MERGE statements issued; a MERGE that matches an
# already existing node is counted too.
node_count = 0
for chunk in first_file_chunks:
    print(f"Creating `:Chunk` node for chunk ID {chunk['chunkId']}")
    kg.query(merge_chunk_node_query, params={'chunkParam': chunk})
    node_count += 1
# Fixed: the original message ran the word and the number together ("Created23 nodes").
print(f"Created {node_count} nodes")

Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0000
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0001
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0002
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0003
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0004
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0005
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0006
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0007
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0008
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0009
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0010
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0011
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0012
Creating `:Chunk` node for chunk ID 0000950170-23-0

In [83]:
kg.query("""
    MATCH (n)
    RETURN count(n) as nodeCount
    """)

[{'nodeCount': 23}]

In [84]:
# Create the vector index over :Chunk(textEmbedding); IF NOT EXISTS makes this
# cell safe to re-run. `vector.dimensions` has to equal the length of the stored
# embedding vectors: 1024 is used here, use 1536 for OpenAI embeddings instead.
kg.query("""
        CREATE VECTOR INDEX `form_10k_chunks` IF NOT EXISTS
        FOR (c:Chunk) ON (c.textEmbedding)
        OPTIONS { indexConfig: {
        `vector.dimensions`: 1024,
        `vector.similarity_function`: 'cosine'
        }}
""")

[]

In [85]:
kg.query("SHOW INDEXES")

[{'id': 4,
  'name': 'form_10k_chunks',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'VECTOR',
  'entityType': 'NODE',
  'labelsOrTypes': ['Chunk'],
  'properties': ['textEmbedding'],
  'indexProvider': 'vector-2.0',
  'owningConstraint': None,
  'lastRead': neo4j.time.DateTime(2025, 9, 4, 8, 3, 30, 249000000, tzinfo=<UTC>),
  'readCount': 8},
 {'id': 0,
  'name': 'index_343aff4e',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'LOOKUP',
  'entityType': 'NODE',
  'labelsOrTypes': None,
  'properties': None,
  'indexProvider': 'token-lookup-1.0',
  'owningConstraint': None,
  'lastRead': neo4j.time.DateTime(2025, 9, 4, 10, 14, 17, 554000000, tzinfo=<UTC>),
  'readCount': 102},
 {'id': 1,
  'name': 'index_f7700477',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'LOOKUP',
  'entityType': 'RELATIONSHIP',
  'labelsOrTypes': None,
  'properties': None,
  'indexProvider': 'token-lookup-1.0',
  'owningConstraint': None,
  'lastRead': None,
  'readCo

In [86]:
import requests

def get_embedding(
    text: str,
    model: str = "mxbai-embed-large",
    url: str = "http://localhost:11434/api/embeddings",
    timeout: float = 60.0,
):
    """Request an embedding vector for ``text`` from an Ollama HTTP endpoint.

    Args:
        text: The text to embed.
        model: Name of the embedding model sent in the request body.
        url: Embeddings endpoint to POST to.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely on an unresponsive server.

    Returns:
        The value of the "embedding" field of the JSON response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    response = requests.post(
        url,
        json={
            "model": model,
            "prompt": text,   # must be 'prompt' for this model
        },
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()["embedding"]


In [87]:
embedding = get_embedding("Graph databases and embeddings are cool!")
print(len(embedding), embedding[:5])  # prints length and first 5 values


1024 [-0.06415103375911713, 0.39690524339675903, -0.1491936445236206, -0.5620548129081726, 0.4960833787918091]


In [88]:
# 1. Get all chunks that need embeddings
chunks = kg.query("""
    MATCH (c:Chunk)
    WHERE c.textEmbedding IS NULL
    RETURN elementId(c) AS elementId, c.text AS text
""")


print(f"Found {len(chunks)} chunks without embeddings")


Found 23 chunks without embeddings


In [89]:
# 2. Embed each pending chunk and store the vector on the matching node.
set_embedding_query = """
        MATCH (c:Chunk) WHERE elementId(c) = $elementId
        SET c.textEmbedding = $embedding
    """
for chunk in chunks:
    kg.query(
        set_embedding_query,
        params={
            "elementId": chunk["elementId"],
            "embedding": get_embedding(chunk["text"]),
        },
    )


In [90]:
kg.refresh_schema()
print(kg.schema)

Node properties:
Chunk {chunkId: STRING, names: LIST, formId: STRING, cik: STRING, cusip6: STRING, source: STRING, f10kItem: STRING, chunkSeqId: INTEGER, text: STRING, textEmbedding: LIST}
Relationship properties:

The relationships:



# Similarity Search Function

In [91]:
def vector_index_search(query: str, top_k: int = 5, index_name: str = None):
    """Return the nodes of a Neo4j vector index that are closest to ``query``.

    Args:
        query: Free-text search string; it is embedded with ``get_embedding``.
        top_k: Number of nearest neighbours requested from the index.
        index_name: Vector index to search. Defaults to the notebook-level
            ``VECTOR_INDEX_NAME`` so the name is defined in one place only.

    Returns:
        The rows produced by ``kg.query``; each row has the keys 'text',
        'id' (the node's elementId) and 'score'.
    """
    if index_name is None:
        index_name = VECTOR_INDEX_NAME

    # 1. Get query embedding using Ollama
    query_emb = get_embedding(query)

    # 2. Run Neo4j vector index query
    results = kg.query("""
    CALL db.index.vector.queryNodes($index_name, $top_k, $embedding)
    YIELD node, score
    RETURN node.text AS text, elementId(node) AS id, score
    """, params={
        "index_name": index_name,
        "top_k": top_k,
        "embedding": query_emb
    })

    return results

In [92]:
# Example usage
hits = vector_index_search("hybrid multicloud infrastructure", top_k=3)
for h in hits:
    print(f"Score: {h['score']:.4f}\nID: {h['id']}\nText: {h['text'][:200]}...\n")

Score: 0.8356
ID: 4:49c8fee6-6fe4-4c42-a36f-f2b7cb44e0ec:1
Text: •
Flexibility and consistency: NetApp makes moving data and applications between environments seamless through a common storage foundation across on-premises and multicloud environments.


•
Cyber res...

Score: 0.8129
ID: 4:49c8fee6-6fe4-4c42-a36f-f2b7cb44e0ec:7
Text: Another cloud operations service is 
Instaclustr
, our platform that provides fully managed open-source databases, pipelines, and workflow applications delivered as a service. Instaclustr helps organi...

Score: 0.7977
ID: 4:49c8fee6-6fe4-4c42-a36f-f2b7cb44e0ec:14
Text: We compete with many companies in the markets we serve. Our hybrid cloud solutions primarily compete with legacy IT and storage vendors. Some offer a broad spectrum of products, solutions and services...



In [93]:
search_results = vector_index_search(
    'In a single sentece, tell me about Netapp.'
)

In [94]:
print(type(search_results))

<class 'list'>


In [95]:
print(search_results[0])

{'text': '>Item 1.  \nBusiness\n\n\nOverview\n\n\nNetApp, Inc. (NetApp, we, us or the Company) is a global cloud-led, data-centric software company. We were incorporated in 1992 and are headquartered in San Jose, California. Building on more than three decades of innovation, we give customers the freedom to manage applications and data across hybrid multicloud environments. Our portfolio of cloud services, and storage infrastructure, powered by intelligent data management software, enables applications to run faster, more reliably, and more securely, all at a lower cost.\n\n\nOur opportunity is defined by the durable megatrends of data-driven digital and cloud transformations. NetApp helps organizations meet the complexities created by rapid data and cloud growth, multi-cloud management, and the adoption of next-generation technologies, such as AI, Kubernetes, and modern databases. Our modern approach to hybrid, multicloud infrastructure and data management, which we term ‘evolved clou

# Set up a LangChain RAG workflow to chat with the form

In [96]:
embeddings = OllamaEmbeddings(model="mxbai-embed-large:latest")

In [97]:
# Wrap the existing :Chunk vector index as a LangChain vector store.
# The index name, label and property names come from the VECTOR_* constants
# defined near the top of the notebook rather than being repeated as literals,
# so they cannot drift apart.
neo4j_vector_store = Neo4jVector.from_existing_graph(
    embedding=embeddings,
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    index_name=VECTOR_INDEX_NAME,   # <-- must match the index name exactly
    node_label=VECTOR_NODE_LABEL,
    text_node_properties=[VECTOR_SOURCE_PROPERTY],
    embedding_node_property=VECTOR_EMBEDDING_PROPERTY,
)


In [98]:
retriever = neo4j_vector_store.as_retriever()

In [99]:
# Question-answering chain: the retriever supplies the chunks and the
# local qwen2.5 chat model writes the answer ("stuff" chain type).
qa_llm = ChatOllama(model = 'qwen2.5')
chain = RetrievalQAWithSourcesChain.from_chain_type(
    qa_llm,
    chain_type="stuff",
    retriever=retriever,
)

In [100]:
def prettychain(question: str, width: int = 60) -> None:
    """Ask the chain a question and print its answer wrapped to ``width`` columns.

    Args:
        question: The question passed to ``chain.invoke`` under the
            "question" key.
        width: Maximum line length used when wrapping the answer (default 60).

    Returns:
        None. The answer is printed, not returned.
    """
    response = chain.invoke(
        {"question": question},
        return_only_outputs=True,
    )
    print(textwrap.fill(response['answer'], width))

In [101]:
question = "What is Netapp's primary business?"

In [102]:
prettychain(question)

NetApp's primary business is focused on providing global
cloud-led, data-centric software solutions. Specifically,
they offer storage infrastructure and cloud services to help
organizations manage applications and data across hybrid
multicloud environments.


In [103]:
prettychain("Where is Netapp headquartered?")

NetApp is headquartered in San Jose, California.


In [104]:
prettychain("""
    Tell me about Netapp. 
    Limit your answer to a single sentence.
""")

NetApp, Inc. is a global cloud-led, data-centric software
company that provides solutions for hybrid multicloud
environments and offers a portfolio of cloud services and
storage infrastructure powered by intelligent data
management software.


In [105]:
prettychain("""
    Tell me about Apple. 
    Limit your answer to a single sentence.
""")

The given text does not provide any information about Apple.
Therefore, I do not have enough information to answer the
question.


In [106]:
prettychain("""
    Tell me about Apple. 
    Limit your answer to a single sentence.
    If you are unsure about the answer, say you don't know.
""")

I don't know about Apple based on the provided content.
