In [5]:
import os
from pyprojroot import here
from dotenv import load_dotenv
from pyprojroot import here
# Common data processing
import json
import textwrap

# Langchain
from langchain_community.graphs import Neo4jGraph
from langchain_community.vectorstores import Neo4jVector
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQAWithSourcesChain
from langchain_openai import ChatOpenAI


# Warning control
import warnings
warnings.filterwarnings("ignore")

In [6]:
NEO4J_URI = "bolt://localhost:7687"
NEO4J_USERNAME = "neo4j"
NEO4J_PASSWORD = "12345678"
NEO4J_DATABASE = 'neo4j'

graph = Neo4jGraph(url=NEO4J_URI, username=NEO4J_USERNAME, password=NEO4J_PASSWORD, database=NEO4J_DATABASE)

In [7]:
VECTOR_INDEX_NAME = 'form_10k_chunks'
VECTOR_NODE_LABEL = 'Chunk'
VECTOR_SOURCE_PROPERTY = 'text'
VECTOR_EMBEDDING_PROPERTY = 'textEmbedding'

In [8]:
first_file_name = here("data/form10k/0000950170-23-027948.json")
first_file_as_object = json.load(open(first_file_name))
print(type(first_file_as_object), "\n")
for k,v in first_file_as_object.items():
    print(k, type(v))

<class 'dict'> 

item1 <class 'str'>
item1a <class 'str'>
item7 <class 'str'>
item7a <class 'str'>
cik <class 'str'>
cusip6 <class 'str'>
cusip <class 'list'>
names <class 'list'>
source <class 'str'>


In [9]:
item1_text = first_file_as_object['item1']
print(item1_text[0:1500])

>Item 1.  
Business


Overview


NetApp, Inc. (NetApp, we, us or the Company) is a global cloud-led, data-centric software company. We were incorporated in 1992 and are headquartered in San Jose, California. Building on more than three decades of innovation, we give customers the freedom to manage applications and data across hybrid multicloud environments. Our portfolio of cloud services, and storage infrastructure, powered by intelligent data management software, enables applications to run faster, more reliably, and more securely, all at a lower cost.


Our opportunity is defined by the durable megatrends of data-driven digital and cloud transformations. NetApp helps organizations meet the complexities created by rapid data and cloud growth, multi-cloud management, and the adoption of next-generation technologies, such as AI, Kubernetes, and modern databases. Our modern approach to hybrid, multicloud infrastructure and data management, which we term ‘evolved cloud’, provides custome

In [10]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 2000,
    chunk_overlap  = 200,
    length_function = len,
    is_separator_regex = False,
)

In [11]:
item1_text_chunks = text_splitter.split_text(item1_text)
print(type(item1_text_chunks), "\n")
print("Number of chunks:", len(item1_text_chunks), "\n")
print(item1_text_chunks[0])

<class 'list'> 

Number of chunks: 254 

>Item 1.  
Business


Overview


NetApp, Inc. (NetApp, we, us or the Company) is a global cloud-led, data-centric software company. We were incorporated in 1992 and are headquartered in San Jose, California. Building on more than three decades of innovation, we give customers the freedom to manage applications and data across hybrid multicloud environments. Our portfolio of cloud services, and storage infrastructure, powered by intelligent data management software, enables applications to run faster, more reliably, and more securely, all at a lower cost.


Our opportunity is defined by the durable megatrends of data-driven digital and cloud transformations. NetApp helps organizations meet the complexities created by rapid data and cloud growth, multi-cloud management, and the adoption of next-generation technologies, such as AI, Kubernetes, and modern databases. Our modern approach to hybrid, multicloud infrastructure and data management, which 

In [12]:
def split_form10k_data_from_file(file):
    chunks_with_metadata = [] # use this to accumlate chunk records
    file_as_object = json.load(open(file)) # open the json file
    for item in ['item1','item1a','item7','item7a']: # pull these keys from the json
        print(f'Processing {item} from {file}') 
        item_text = file_as_object[item] # grab the text of the item
        item_text_chunks = text_splitter.split_text(item_text) # split the text into chunks
        chunk_seq_id = 0
        for chunk in item_text_chunks[:20]: # only take the first 20 chunks
            file_name_with_extension = os.path.basename(file)
            file_name_without_extension = os.path.splitext(file_name_with_extension)[0]
            form_id = file_name_without_extension
            # finally, construct a record with metadata and the chunk text
            chunks_with_metadata.append({
                'text': chunk, 
                # metadata from looping...
                'f10kItem': item,
                'chunkSeqId': chunk_seq_id,
                # constructed metadata...
                'formId': f'{form_id}', # pulled from the filename
                'chunkId': f'{form_id}-{item}-chunk{chunk_seq_id:04d}',
                # metadata from file...
                'names': file_as_object['names'],
                'cik': file_as_object['cik'],
                'cusip6': file_as_object['cusip6'],
                'source': file_as_object['source'],
            })
            chunk_seq_id += 1
        print(f'\tSplit into {chunk_seq_id} chunks')
    return chunks_with_metadata

In [13]:
first_file_chunks = split_form10k_data_from_file(first_file_name)
first_file_chunks[0]

Processing item1 from c:\Users\froozitalab\OneDrive - R.W. Tomlinson Ltd\Documents\Codes\Advanced-QA-and-RAG-Series\KnowledgeGraph-Q&A-and-RAG-with-Text\data\form10k\0000950170-23-027948.json
	Split into 20 chunks
Processing item1a from c:\Users\froozitalab\OneDrive - R.W. Tomlinson Ltd\Documents\Codes\Advanced-QA-and-RAG-Series\KnowledgeGraph-Q&A-and-RAG-with-Text\data\form10k\0000950170-23-027948.json
	Split into 1 chunks
Processing item7 from c:\Users\froozitalab\OneDrive - R.W. Tomlinson Ltd\Documents\Codes\Advanced-QA-and-RAG-Series\KnowledgeGraph-Q&A-and-RAG-with-Text\data\form10k\0000950170-23-027948.json
	Split into 1 chunks
Processing item7a from c:\Users\froozitalab\OneDrive - R.W. Tomlinson Ltd\Documents\Codes\Advanced-QA-and-RAG-Series\KnowledgeGraph-Q&A-and-RAG-with-Text\data\form10k\0000950170-23-027948.json
	Split into 1 chunks


{'text': '>Item 1.  \nBusiness\n\n\nOverview\n\n\nNetApp, Inc. (NetApp, we, us or the Company) is a global cloud-led, data-centric software company. We were incorporated in 1992 and are headquartered in San Jose, California. Building on more than three decades of innovation, we give customers the freedom to manage applications and data across hybrid multicloud environments. Our portfolio of cloud services, and storage infrastructure, powered by intelligent data management software, enables applications to run faster, more reliably, and more securely, all at a lower cost.\n\n\nOur opportunity is defined by the durable megatrends of data-driven digital and cloud transformations. NetApp helps organizations meet the complexities created by rapid data and cloud growth, multi-cloud management, and the adoption of next-generation technologies, such as AI, Kubernetes, and modern databases. Our modern approach to hybrid, multicloud infrastructure and data management, which we term ‘evolved clou

In [14]:
first_file_chunks[0]

{'text': '>Item 1.  \nBusiness\n\n\nOverview\n\n\nNetApp, Inc. (NetApp, we, us or the Company) is a global cloud-led, data-centric software company. We were incorporated in 1992 and are headquartered in San Jose, California. Building on more than three decades of innovation, we give customers the freedom to manage applications and data across hybrid multicloud environments. Our portfolio of cloud services, and storage infrastructure, powered by intelligent data management software, enables applications to run faster, more reliably, and more securely, all at a lower cost.\n\n\nOur opportunity is defined by the durable megatrends of data-driven digital and cloud transformations. NetApp helps organizations meet the complexities created by rapid data and cloud growth, multi-cloud management, and the adoption of next-generation technologies, such as AI, Kubernetes, and modern databases. Our modern approach to hybrid, multicloud infrastructure and data management, which we term ‘evolved clou

In [15]:
merge_chunk_node_query = """
MERGE(mergedChunk:Chunk {chunkId: $chunkParam.chunkId})
    ON CREATE SET 
        mergedChunk.names = $chunkParam.names,
        mergedChunk.formId = $chunkParam.formId, 
        mergedChunk.cik = $chunkParam.cik, 
        mergedChunk.cusip6 = $chunkParam.cusip6, 
        mergedChunk.source = $chunkParam.source, 
        mergedChunk.f10kItem = $chunkParam.f10kItem, 
        mergedChunk.chunkSeqId = $chunkParam.chunkSeqId, 
        mergedChunk.text = $chunkParam.text
RETURN mergedChunk
"""

In [16]:
graph.query(merge_chunk_node_query,
            params={'chunkParam':first_file_chunks[0]})

[{'mergedChunk': {'formId': '0000950170-23-027948',
   'f10kItem': 'item1',
   'names': ['Netapp Inc', 'NETAPP INC'],
   'textEmbedding': [-0.017197370529174805,
    -0.01842954009771347,
    0.016150688752532005,
    -0.026061037555336952,
    -0.007008789572864771,
    0.013189508579671383,
    -0.004262907430529594,
    -0.01307689119130373,
    -0.0029793980065733194,
    -0.03712406009435654,
    0.01584595814347267,
    0.013845340348780155,
    0.013779094442725182,
    -0.013560484163463116,
    0.002737601287662983,
    0.004494767170399427,
    0.01513050589710474,
    -0.0292010810226202,
    -0.014958267100155354,
    -0.021662326529622078,
    -0.03129444271326065,
    0.010466812178492546,
    0.00014118604303803295,
    -0.0064655751921236515,
    -0.015527979470789433,
    -0.010705296881496906,
    0.004054233431816101,
    -0.015289495699107647,
    0.01950271800160408,
    -0.004501391667872667,
    0.008558937348425388,
    -0.01992669142782688,
    -0.0080753443762

In [17]:
graph.query("""
CREATE CONSTRAINT unique_chunk IF NOT EXISTS 
    FOR (c:Chunk) REQUIRE c.chunkId IS UNIQUE
""")

[]

In [18]:
graph.query("SHOW INDEXES")

[{'id': 5,
  'name': 'form_10k_chunks',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'VECTOR',
  'entityType': 'NODE',
  'labelsOrTypes': ['Chunk'],
  'properties': ['textEmbedding'],
  'indexProvider': 'vector-1.0',
  'owningConstraint': None,
  'lastRead': neo4j.time.DateTime(2024, 5, 2, 19, 4, 22, 531000000, tzinfo=<UTC>),
  'readCount': 13},
 {'id': 1,
  'name': 'index_343aff4e',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'LOOKUP',
  'entityType': 'NODE',
  'labelsOrTypes': None,
  'properties': None,
  'indexProvider': 'token-lookup-1.0',
  'owningConstraint': None,
  'lastRead': neo4j.time.DateTime(2024, 5, 10, 16, 27, 52, 366000000, tzinfo=<UTC>),
  'readCount': 257},
 {'id': 2,
  'name': 'index_f7700477',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'LOOKUP',
  'entityType': 'RELATIONSHIP',
  'labelsOrTypes': None,
  'properties': None,
  'indexProvider': 'token-lookup-1.0',
  'owningConstraint': None,
  'lastRead': None,
  'rea

In [19]:
node_count = 0
for chunk in first_file_chunks:
    print(f"Creating `:Chunk` node for chunk ID {chunk['chunkId']}")
    graph.query(merge_chunk_node_query, 
            params={
                'chunkParam': chunk
            })
    node_count += 1
print(f"Created {node_count} nodes")

Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0000
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0001
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0002
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0003
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0004
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0005
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0006
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0007
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0008
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0009
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0010
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0011
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0012
Creating `:Chunk` node for chunk ID 0000950170-23-0

In [20]:
graph.refresh_schema()
print(graph.schema)

Node properties are the following:
Chunk {textEmbedding: LIST, f10kItem: STRING, chunkSeqId: INTEGER, text: STRING, cik: STRING, cusip6: STRING, names: LIST, formId: STRING, source: STRING, chunkId: STRING},Form {cik: STRING, cusip6: STRING, names: LIST, formId: STRING, source: STRING}
Relationship properties are the following:
SECTION {f10kItem: STRING}
The relationships are the following:
(:Chunk)-[:NEXT]->(:Chunk),(:Chunk)-[:PART_OF]->(:Form),(:Form)-[:SECTION]->(:Chunk)


In [21]:
graph.query("""
        MATCH (mergedChunk:Chunk {chunkSeqId: $value})
        RETURN mergedChunk
        ORDER BY rand()
        LIMIT 1
         """,
params={"value": 1})

[{'mergedChunk': {'formId': '0000950170-23-027948',
   'f10kItem': 'item1',
   'names': ['Netapp Inc', 'NETAPP INC'],
   'textEmbedding': [0.010223898105323315,
    -0.004598728381097317,
    0.010656083934009075,
    -0.010858670808374882,
    -0.023040911182761192,
    0.0052199955098330975,
    -0.012195746414363384,
    0.0023989693727344275,
    0.005544134881347418,
    -0.02984783984720707,
    0.024729136377573013,
    0.003852532245218754,
    0.005544134881347418,
    -0.011162552051246166,
    0.011257092468440533,
    0.006320718675851822,
    0.016922779381275177,
    0.0007259372505359352,
    -0.0161124300211668,
    -0.02984783984720707,
    -0.017449505627155304,
    0.011466432362794876,
    -0.005030914209783077,
    -0.026363341137766838,
    -0.014127076603472233,
    -0.011520455591380596,
    0.006749528460204601,
    -0.01990756392478943,
    0.01700381375849247,
    -0.004007848910987377,
    0.01830037124454975,
    -0.02005612663924694,
    -0.012432097457349

In [22]:
model_name = "gpt-35-turbo-1106"
azure_openai_api_key = os.environ["OPENAI_API_KEY"]
azure_openai_endpoint = os.environ["OPENAI_API_BASE"]

In [23]:
from openai import AzureOpenAI

client = AzureOpenAI(
  api_key = azure_openai_api_key,  
  api_version = "2023-07-01-preview",
  azure_endpoint = azure_openai_endpoint
)
def embed_text(text):
    response = client.embeddings.create(
    input = text,
    model= "text-embedding-ada-002"
    )
    return response.data[0].embedding

In [24]:
for i in range(len(first_file_chunks)):
    first_file_chunks[i]["embedding"] = embed_text(first_file_chunks[i]["text"])

In [25]:
graph.query("""
         MATCH (n)
         RETURN count(n) as nodeCount
         """)

[{'nodeCount': 24}]

**Create a vector index**

In [26]:
VECTOR_INDEX_NAME = 'form_10k_chunks'
VECTOR_NODE_LABEL = 'Chunk'
VECTOR_SOURCE_PROPERTY = 'text'
VECTOR_EMBEDDING_PROPERTY = 'textEmbedding'

In [27]:
graph.query("""
         CREATE VECTOR INDEX form_10k_chunks IF NOT EXISTS
          FOR (c:Chunk) ON (c.form_10k_chunks) 
          OPTIONS { indexConfig: {
            `vector.dimensions`: 1536,
            `vector.similarity_function`: 'cosine'    
         }}
""")

[]

In [28]:
graph.query("""
  SHOW VECTOR INDEXES
  """
)

[{'id': 5,
  'name': 'form_10k_chunks',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'VECTOR',
  'entityType': 'NODE',
  'labelsOrTypes': ['Chunk'],
  'properties': ['textEmbedding'],
  'indexProvider': 'vector-1.0',
  'owningConstraint': None,
  'lastRead': neo4j.time.DateTime(2024, 5, 2, 19, 4, 22, 531000000, tzinfo=<UTC>),
  'readCount': 13}]

In [29]:
chunkId_lst = []
for i in range(len(first_file_chunks)):
    chunkId_lst.append(first_file_chunks[i]['chunkId'])

len(set(chunkId_lst))

23

In [30]:
for chunk in first_file_chunks:
    chunk_id = chunk['chunkId']
    graph.query(f"MATCH (mergedChunk:Chunk {{chunkId: '{chunk_id}'}}) SET mergedChunk.textEmbedding = {chunk['embedding']}")

In [31]:
graph.refresh_schema()
print(graph.schema)

Node properties are the following:
Chunk {textEmbedding: LIST, f10kItem: STRING, chunkSeqId: INTEGER, text: STRING, cik: STRING, cusip6: STRING, names: LIST, formId: STRING, source: STRING, chunkId: STRING},Form {cik: STRING, cusip6: STRING, names: LIST, formId: STRING, source: STRING}
Relationship properties are the following:
SECTION {f10kItem: STRING}
The relationships are the following:
(:Chunk)-[:NEXT]->(:Chunk),(:Chunk)-[:PART_OF]->(:Form),(:Form)-[:SECTION]->(:Chunk)


**Verify**

In [32]:
graph.query("""
        MATCH (mergedChunk:Chunk {chunkSeqId: $value})
        RETURN mergedChunk
        ORDER BY rand()
        LIMIT 1
         """,
params={"value": 1})

[{'mergedChunk': {'formId': '0000950170-23-027948',
   'f10kItem': 'item1',
   'names': ['Netapp Inc', 'NETAPP INC'],
   'textEmbedding': [0.010223898105323315,
    -0.004598728381097317,
    0.010656083934009075,
    -0.010858670808374882,
    -0.023040911182761192,
    0.0052199955098330975,
    -0.012195746414363384,
    0.0023989693727344275,
    0.005544134881347418,
    -0.02984783984720707,
    0.024729136377573013,
    0.003852532245218754,
    0.005544134881347418,
    -0.011162552051246166,
    0.011257092468440533,
    0.006320718675851822,
    0.016922779381275177,
    0.0007259372505359352,
    -0.0161124300211668,
    -0.02984783984720707,
    -0.017449505627155304,
    0.011466432362794876,
    -0.005030914209783077,
    -0.026363341137766838,
    -0.014127076603472233,
    -0.011520455591380596,
    0.006749528460204601,
    -0.01990756392478943,
    0.01700381375849247,
    -0.004007848910987377,
    0.01830037124454975,
    -0.02005612663924694,
    -0.012432097457349

In [31]:
# Using openAI directly
# graph.query("""
#     MATCH (chunk:Chunk) WHERE chunk.textEmbedding IS NULL
#     WITH chunk, genai.vector.encode(
#       chunk.text, 
#       "OpenAI", 
#       {
#         token: $openAiApiKey, 
#         endpoint: $openAiEndpoint
#       }) AS vector
#     CALL db.create.setNodeVectorProperty(chunk, "textEmbedding", vector)
#     """, 
#     params={"openAiApiKey":OPENAI_API_KEY, "openAiEndpoint": OPENAI_ENDPOINT} )

In [33]:
graph.refresh_schema()
print(graph.schema)

Node properties are the following:
Chunk {textEmbedding: LIST, f10kItem: STRING, chunkSeqId: INTEGER, text: STRING, cik: STRING, cusip6: STRING, names: LIST, formId: STRING, source: STRING, chunkId: STRING},Form {cik: STRING, cusip6: STRING, names: LIST, formId: STRING, source: STRING}
Relationship properties are the following:
SECTION {f10kItem: STRING}
The relationships are the following:
(:Chunk)-[:NEXT]->(:Chunk),(:Chunk)-[:PART_OF]->(:Form),(:Form)-[:SECTION]->(:Chunk)


In [49]:
result = graph.query("""
    MATCH (c:Chunk) 
    WHERE c.text IS NOT NULL
    RETURN c.text, c.textEmbedding
    LIMIT 1
    """
)
result[0]['c.text']

'>Item 1.  \nBusiness\n\n\nOverview\n\n\nNetApp, Inc. (NetApp, we, us or the Company) is a global cloud-led, data-centric software company. We were incorporated in 1992 and are headquartered in San Jose, California. Building on more than three decades of innovation, we give customers the freedom to manage applications and data across hybrid multicloud environments. Our portfolio of cloud services, and storage infrastructure, powered by intelligent data management software, enables applications to run faster, more reliably, and more securely, all at a lower cost.\n\n\nOur opportunity is defined by the durable megatrends of data-driven digital and cloud transformations. NetApp helps organizations meet the complexities created by rapid data and cloud growth, multi-cloud management, and the adoption of next-generation technologies, such as AI, Kubernetes, and modern databases. Our modern approach to hybrid, multicloud infrastructure and data management, which we term ‘evolved cloud’, provi

In [50]:
result[0]['c.textEmbedding']

[-0.017197370529174805,
 -0.01842954009771347,
 0.016150688752532005,
 -0.026061037555336952,
 -0.007008789572864771,
 0.013189508579671383,
 -0.004262907430529594,
 -0.01307689119130373,
 -0.0029793980065733194,
 -0.03712406009435654,
 0.01584595814347267,
 0.013845340348780155,
 0.013779094442725182,
 -0.013560484163463116,
 0.002737601287662983,
 0.004494767170399427,
 0.01513050589710474,
 -0.0292010810226202,
 -0.014958267100155354,
 -0.021662326529622078,
 -0.03129444271326065,
 0.010466812178492546,
 0.00014118604303803295,
 -0.0064655751921236515,
 -0.015527979470789433,
 -0.010705296881496906,
 0.004054233431816101,
 -0.015289495699107647,
 0.01950271800160408,
 -0.004501391667872667,
 0.008558937348425388,
 -0.01992669142782688,
 -0.008075344376266003,
 -0.006584817543625832,
 -0.008101842366158962,
 0.004160226788371801,
 0.00257198722101748,
 -0.01588570699095726,
 0.0262597743421793,
 -0.011937465518712997,
 0.013752596452832222,
 -0.015488232485949993,
 -0.007730867248028

RAG

In [35]:
question = "'In a single sentence, tell me about Netapp."
question_embedding = embed_text(question)
question_embedding[:10]

[-0.001604900578968227,
 -0.0004043051740154624,
 0.03879358619451523,
 -0.01562519371509552,
 0.005936653818935156,
 -0.0038373058196157217,
 0.005309149157255888,
 -0.005992505233734846,
 -0.01264865417033434,
 -0.017057612538337708]

In [37]:
# def neo4j_vector_search(question):
#   """Search for similar nodes using the Neo4j vector index"""
#   vector_search_query = """
#     WITH genai.vector.encode(
#       $question, 
#       "OpenAI", 
#       {
#         token: $openAiApiKey,
#         endpoint: $openAiEndpoint
#       }) AS question_embedding
#     CALL db.index.vector.queryNodes($index_name, $top_k, question_embedding) yield node, score
#     RETURN score, node.text AS text
#   """
#   similar = graph.query(vector_search_query, 
#                      params={
#                       'question': question, 
#                       'openAiApiKey':OPENAI_API_KEY,
#                       'openAiEndpoint': OPENAI_ENDPOINT,
#                       'index_name':VECTOR_INDEX_NAME, 
#                       'top_k': 10})
#   return similar

In [45]:
results = graph.query("""
    with $question_embedding as question_embedding                  
    CALL db.index.vector.queryNodes(
                      'form_10k_chunks',
                      $top_k,
                      question_embedding
                      ) YIELD node AS chunk, score
    RETURN chunk.text, score
    """,
    params={
        "question_embedding": question_embedding,
        "top_k": 5
    })
results

[{'chunk.text': '>Item 1.  \nBusiness\n\n\nOverview\n\n\nNetApp, Inc. (NetApp, we, us or the Company) is a global cloud-led, data-centric software company. We were incorporated in 1992 and are headquartered in San Jose, California. Building on more than three decades of innovation, we give customers the freedom to manage applications and data across hybrid multicloud environments. Our portfolio of cloud services, and storage infrastructure, powered by intelligent data management software, enables applications to run faster, more reliably, and more securely, all at a lower cost.\n\n\nOur opportunity is defined by the durable megatrends of data-driven digital and cloud transformations. NetApp helps organizations meet the complexities created by rapid data and cloud growth, multi-cloud management, and the adoption of next-generation technologies, such as AI, Kubernetes, and modern databases. Our modern approach to hybrid, multicloud infrastructure and data management, which we term ‘evolv

In [51]:
cypher = """
  MATCH (anyChunk:Chunk) 
  WITH anyChunk LIMIT 1
  RETURN anyChunk { .names, .source, .formId, .cik, .cusip6 } as formInfo
"""
form_info_list = graph.query(cypher)

form_info_list


[{'formInfo': {'cik': '1002047',
   'source': 'https://www.sec.gov/Archives/edgar/data/1002047/000095017023027948/0000950170-23-027948-index.htm',
   'formId': '0000950170-23-027948',
   'names': ['Netapp Inc', 'NETAPP INC'],
   'cusip6': '64110D'}}]

In [53]:
form_info = form_info_list[0]['formInfo']
form_info

{'cik': '1002047',
 'source': 'https://www.sec.gov/Archives/edgar/data/1002047/000095017023027948/0000950170-23-027948-index.htm',
 'formId': '0000950170-23-027948',
 'names': ['Netapp Inc', 'NETAPP INC'],
 'cusip6': '64110D'}

In [55]:
cypher = """
    MERGE (f:Form {formId: $formInfoParam.formId })
      ON CREATE 
        SET f.names = $formInfoParam.names
        SET f.source = $formInfoParam.source
        SET f.cik = $formInfoParam.cik
        SET f.cusip6 = $formInfoParam.cusip6
"""

graph.query(cypher, params={'formInfoParam': form_info})

[]

In [57]:
graph.query("MATCH (f:Form) RETURN count(f) as formCount")

[{'formCount': 1}]

In [59]:
cypher = """
  MATCH (from_same_form:Chunk)
    WHERE from_same_form.formId = $formIdParam
  RETURN from_same_form {.formId, .f10kItem, .chunkId, .chunkSeqId } as chunkInfo
    LIMIT 10
"""

graph.query(cypher, params={'formIdParam': form_info['formId']})

[{'chunkInfo': {'formId': '0000950170-23-027948',
   'f10kItem': 'item1',
   'chunkId': '0000950170-23-027948-item1-chunk0000',
   'chunkSeqId': 0}},
 {'chunkInfo': {'formId': '0000950170-23-027948',
   'f10kItem': 'item1',
   'chunkId': '0000950170-23-027948-item1-chunk0001',
   'chunkSeqId': 1}},
 {'chunkInfo': {'formId': '0000950170-23-027948',
   'f10kItem': 'item1',
   'chunkId': '0000950170-23-027948-item1-chunk0002',
   'chunkSeqId': 2}},
 {'chunkInfo': {'formId': '0000950170-23-027948',
   'f10kItem': 'item1',
   'chunkId': '0000950170-23-027948-item1-chunk0003',
   'chunkSeqId': 3}},
 {'chunkInfo': {'formId': '0000950170-23-027948',
   'f10kItem': 'item1',
   'chunkId': '0000950170-23-027948-item1-chunk0004',
   'chunkSeqId': 4}},
 {'chunkInfo': {'formId': '0000950170-23-027948',
   'f10kItem': 'item1',
   'chunkId': '0000950170-23-027948-item1-chunk0005',
   'chunkSeqId': 5}},
 {'chunkInfo': {'formId': '0000950170-23-027948',
   'f10kItem': 'item1',
   'chunkId': '0000950170-

In [61]:
cypher = """
  MATCH (from_same_form:Chunk)
    WHERE from_same_form.formId = $formIdParam
  RETURN from_same_form {.formId, .f10kItem, .chunkId, .chunkSeqId } as chunkInfo 
    ORDER BY from_same_form.chunkSeqId ASC
    LIMIT 10
"""

graph.query(cypher, params={'formIdParam': form_info['formId']})

[{'chunkInfo': {'formId': '0000950170-23-027948',
   'f10kItem': 'item7a',
   'chunkId': '0000950170-23-027948-item7a-chunk0000',
   'chunkSeqId': 0}},
 {'chunkInfo': {'formId': '0000950170-23-027948',
   'f10kItem': 'item7',
   'chunkId': '0000950170-23-027948-item7-chunk0000',
   'chunkSeqId': 0}},
 {'chunkInfo': {'formId': '0000950170-23-027948',
   'f10kItem': 'item1a',
   'chunkId': '0000950170-23-027948-item1a-chunk0000',
   'chunkSeqId': 0}},
 {'chunkInfo': {'formId': '0000950170-23-027948',
   'f10kItem': 'item1',
   'chunkId': '0000950170-23-027948-item1-chunk0000',
   'chunkSeqId': 0}},
 {'chunkInfo': {'formId': '0000950170-23-027948',
   'f10kItem': 'item1',
   'chunkId': '0000950170-23-027948-item1-chunk0001',
   'chunkSeqId': 1}},
 {'chunkInfo': {'formId': '0000950170-23-027948',
   'f10kItem': 'item1',
   'chunkId': '0000950170-23-027948-item1-chunk0002',
   'chunkSeqId': 2}},
 {'chunkInfo': {'formId': '0000950170-23-027948',
   'f10kItem': 'item1',
   'chunkId': '0000950

In [62]:
cypher = """
  MATCH (from_same_section:Chunk)
  WHERE from_same_section.formId = $formIdParam
    AND from_same_section.f10kItem = $f10kItemParam
  WITH from_same_section { .formId, .f10kItem, .chunkId, .chunkSeqId } 
    ORDER BY from_same_section.chunkSeqId ASC
    LIMIT 10
  RETURN collect(from_same_section) // NEW!!!
"""

graph.query(cypher, params={'formIdParam': form_info['formId'], 
                         'f10kItemParam': 'item1'})


[{'collect(from_same_section)': [{'formId': '0000950170-23-027948',
    'f10kItem': 'item1',
    'chunkId': '0000950170-23-027948-item1-chunk0000',
    'chunkSeqId': 0},
   {'formId': '0000950170-23-027948',
    'f10kItem': 'item1',
    'chunkId': '0000950170-23-027948-item1-chunk0001',
    'chunkSeqId': 1},
   {'formId': '0000950170-23-027948',
    'f10kItem': 'item1',
    'chunkId': '0000950170-23-027948-item1-chunk0002',
    'chunkSeqId': 2},
   {'formId': '0000950170-23-027948',
    'f10kItem': 'item1',
    'chunkId': '0000950170-23-027948-item1-chunk0003',
    'chunkSeqId': 3},
   {'formId': '0000950170-23-027948',
    'f10kItem': 'item1',
    'chunkId': '0000950170-23-027948-item1-chunk0004',
    'chunkSeqId': 4},
   {'formId': '0000950170-23-027948',
    'f10kItem': 'item1',
    'chunkId': '0000950170-23-027948-item1-chunk0005',
    'chunkSeqId': 5},
   {'formId': '0000950170-23-027948',
    'f10kItem': 'item1',
    'chunkId': '0000950170-23-027948-item1-chunk0006',
    'chunkSe

In [63]:
cypher = """
  MATCH (from_same_section:Chunk)
  WHERE from_same_section.formId = $formIdParam
    AND from_same_section.f10kItem = $f10kItemParam
  WITH from_same_section
    ORDER BY from_same_section.chunkSeqId ASC
  WITH collect(from_same_section) as section_chunk_list
    CALL apoc.nodes.link(
        section_chunk_list, 
        "NEXT", 
        {avoidDuplicates: true}
    )  // NEW!!!
  RETURN size(section_chunk_list)
"""

graph.query(cypher, params={'formIdParam': form_info['formId'], 
                         'f10kItemParam': 'item1'})


[{'size(section_chunk_list)': 20}]

In [64]:
cypher = """
  MATCH (from_same_section:Chunk)
  WHERE from_same_section.formId = $formIdParam
    AND from_same_section.f10kItem = $f10kItemParam
  WITH from_same_section
    ORDER BY from_same_section.chunkSeqId ASC
  WITH collect(from_same_section) as section_chunk_list
    CALL apoc.nodes.link(
        section_chunk_list, 
        "NEXT", 
        {avoidDuplicates: true}
    )
  RETURN size(section_chunk_list)
"""
for form10kItemName in ['item1', 'item1a', 'item7', 'item7a']:
    graph.query(cypher, params={'formIdParam':form_info['formId'], 
                           'f10kItemParam': form10kItemName})

In [65]:
cypher = """
  MATCH (c:Chunk), (f:Form)
    WHERE c.formId = f.formId
  MERGE (c)-[newRelationship:PART_OF]->(f)
  RETURN count(newRelationship)
"""

graph.query(cypher)

[{'count(newRelationship)': 23}]

In [66]:
cypher = """
  MATCH (first:Chunk), (f:Form)
  WHERE first.formId = f.formId
    AND first.chunkSeqId = 0
  WITH first, f
    MERGE (f)-[r:SECTION {f10kItem: first.f10kItem}]->(first)
  RETURN count(r)
"""

graph.query(cypher)

[{'count(r)': 4}]

In [67]:
cypher = """
  MATCH (f:Form)-[r:SECTION]->(first:Chunk)
    WHERE f.formId = $formIdParam
        AND r.f10kItem = $f10kItemParam
  RETURN first.chunkId as chunkId, first.text as text
"""

first_chunk_info = graph.query(cypher, params={
    'formIdParam': form_info['formId'], 
    'f10kItemParam': 'item1'
})[0]

first_chunk_info

{'chunkId': '0000950170-23-027948-item1-chunk0000',
 'text': '>Item 1.  \nBusiness\n\n\nOverview\n\n\nNetApp, Inc. (NetApp, we, us or the Company) is a global cloud-led, data-centric software company. We were incorporated in 1992 and are headquartered in San Jose, California. Building on more than three decades of innovation, we give customers the freedom to manage applications and data across hybrid multicloud environments. Our portfolio of cloud services, and storage infrastructure, powered by intelligent data management software, enables applications to run faster, more reliably, and more securely, all at a lower cost.\n\n\nOur opportunity is defined by the durable megatrends of data-driven digital and cloud transformations. NetApp helps organizations meet the complexities created by rapid data and cloud growth, multi-cloud management, and the adoption of next-generation technologies, such as AI, Kubernetes, and modern databases. Our modern approach to hybrid, multicloud infrastruct

In [68]:
cypher = """
  MATCH (first:Chunk)-[:NEXT]->(nextChunk:Chunk)
    WHERE first.chunkId = $chunkIdParam
  RETURN nextChunk.chunkId as chunkId, nextChunk.text as text
"""

next_chunk_info = graph.query(cypher, params={
    'chunkIdParam': first_chunk_info['chunkId']
})[0]

next_chunk_info


{'chunkId': '0000950170-23-027948-item1-chunk0001',
 'text': "•\nFlexibility and consistency: NetApp makes moving data and applications between environments seamless through a common storage foundation across on-premises and multicloud environments.\n\n\n•\nCyber resilience: NetApp unifies monitoring, data protection, security, governance, and compliance for total cyber resilience - with consistency and automation across environments. \n\n\n•\nContinuous operations: NetApp uses AI-driven automation for continuous optimization to service applications and store stateless and stateful applications at the lowest possible costs.\n\n\n•\nSustainability: NetApp has industry-leading tools to audit consumption, locate waste, and set guardrails to stop overprovisioning.\n\n\nProduct, Solutions and Services Portfolio\n \n\n\nNetApp's portfolio of cloud services and storage infrastructure is powered by intelligent data management software. Our operations are organized into two segments: Hybrid Clo

In [69]:
print(first_chunk_info['chunkId'], next_chunk_info['chunkId'])

0000950170-23-027948-item1-chunk0000 0000950170-23-027948-item1-chunk0001


In [70]:
cypher = """
    MATCH (c1:Chunk)-[:NEXT]->(c2:Chunk)-[:NEXT]->(c3:Chunk) 
        WHERE c2.chunkId = $chunkIdParam
    RETURN c1.chunkId, c2.chunkId, c3.chunkId
    """

graph.query(cypher,
         params={'chunkIdParam': next_chunk_info['chunkId']})

[{'c1.chunkId': '0000950170-23-027948-item1-chunk0000',
  'c2.chunkId': '0000950170-23-027948-item1-chunk0001',
  'c3.chunkId': '0000950170-23-027948-item1-chunk0002'}]

In [71]:
cypher = """
    MATCH window = (c1:Chunk)-[:NEXT]->(c2:Chunk)-[:NEXT]->(c3:Chunk) 
        WHERE c1.chunkId = $chunkIdParam
    RETURN length(window) as windowPathLength
    """

graph.query(cypher,
         params={'chunkIdParam': next_chunk_info['chunkId']})

[{'windowPathLength': 2}]

In [73]:
cypher = """
    MATCH window=(c1:Chunk)-[:NEXT]->(c2:Chunk)-[:NEXT]->(c3:Chunk) 
        WHERE c2.chunkId = $chunkIdParam
    RETURN nodes(window) as chunkList
    """
# pull the chunk ID from the first 
graph.query(cypher,
         params={'chunkIdParam': first_chunk_info['chunkId']})


[]

In [74]:
cypher = """
  MATCH window=
      (:Chunk)-[:NEXT*0..1]->(c:Chunk)-[:NEXT*0..1]->(:Chunk) 
    WHERE c.chunkId = $chunkIdParam
  RETURN length(window)
  """

graph.query(cypher,
         params={'chunkIdParam': first_chunk_info['chunkId']})

[{'length(window)': 0}, {'length(window)': 1}]

In [75]:
cypher = """
  MATCH window=
      (:Chunk)-[:NEXT*0..1]->(c:Chunk)-[:NEXT*0..1]->(:Chunk)
    WHERE c.chunkId = $chunkIdParam
  WITH window as longestChunkWindow 
      ORDER BY length(window) DESC LIMIT 1
  RETURN length(longestChunkWindow)
  """

graph.query(cypher,
         params={'chunkIdParam': first_chunk_info['chunkId']})

[{'length(longestChunkWindow)': 1}]

In [76]:
retrieval_query_extra_text = """
WITH node, score, "Andreas knows Cypher. " as extraText
RETURN extraText + "\n" + node.text as text,
    score,
    node {.source} AS metadata
"""

In [80]:
graph.refresh_schema()
print(graph.schema)

Node properties are the following:
Chunk {textEmbedding: LIST, f10kItem: STRING, chunkSeqId: INTEGER, text: STRING, cik: STRING, cusip6: STRING, names: LIST, formId: STRING, source: STRING, chunkId: STRING},Form {cik: STRING, cusip6: STRING, names: LIST, formId: STRING, source: STRING}
Relationship properties are the following:
SECTION {f10kItem: STRING}
The relationships are the following:
(:Chunk)-[:NEXT]->(:Chunk),(:Chunk)-[:PART_OF]->(:Form),(:Form)-[:SECTION]->(:Chunk)


In [79]:
# vector_store_extra_text = Neo4jVector.from_existing_index(
#     embedding=OpenAIEmbeddings(),
#     url=NEO4J_URI,
#     username=NEO4J_USERNAME,
#     password=NEO4J_PASSWORD,
#     database="neo4j",
#     index_name=VECTOR_INDEX_NAME,
#     text_node_property=VECTOR_SOURCE_PROPERTY,
#     retrieval_query=retrieval_query_extra_text, # NEW !!!
# )

# # Create a retriever from the vector store
# retriever_extra_text = vector_store_extra_text.as_retriever()

# # Create a chatbot Question & Answer chain from the retriever
# chain_extra_text = RetrievalQAWithSourcesChain.from_chain_type(
#     ChatOpenAI(temperature=0), 
#     chain_type="stuff", 
#     retriever=retriever_extra_text
# )