In [1]:
from dotenv import load_dotenv
import os

# Common data processing
import json
import textwrap

# Langchain
from langchain_community.graphs import Neo4jGraph
from langchain_community.vectorstores import Neo4jVector
# from langchain_openai import OpenAIEmbeddings
from langchain.embeddings import OpenAIEmbeddings

from langchain.text_splitter import RecursiveCharacterTextSplitter

# from langchain.chains import RetrievalQAWithSourcesChain
# from langchain_openai import ChatOpenAI


# Warning control
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load from environment
load_dotenv('.env', override=True)
NEO4J_URI = os.getenv('NEO4J_URI')
NEO4J_USERNAME = os.getenv('NEO4J_USERNAME')
NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD')
NEO4J_DATABASE = os.getenv('NEO4J_DATABASE') or 'neo4j'
# OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
# Note the code below is unique to this course environment, and not a 
# standard part of Neo4j's integration with OpenAI. Remove if running 
# in your own environment.
# OPENAI_ENDPOINT = os.getenv('OPENAI_BASE_URL') + '/embeddings'

# Global constants
VECTOR_INDEX_NAME = 'form_10k_chunks'
VECTOR_NODE_LABEL = 'Chunk'
VECTOR_SOURCE_PROPERTY = 'text'
VECTOR_EMBEDDING_PROPERTY = 'textEmbedding'

In [3]:
first_file_name = "./data/form10k/0000950170-23-027948.json"
# Open the file in a context manager so the handle is closed as soon as the
# JSON has been parsed, and decode it as UTF-8 regardless of the platform
# default encoding.
with open(first_file_name, encoding="utf-8") as first_file:
    first_file_as_object = json.load(first_file)
type(first_file_as_object)

dict

In [4]:
for k,v in first_file_as_object.items():
    print(k, type(v))

item1 <class 'str'>
item1a <class 'str'>
item7 <class 'str'>
item7a <class 'str'>
cik <class 'str'>
cusip6 <class 'str'>
cusip <class 'list'>
names <class 'list'>
source <class 'str'>


In [5]:
item1_text = first_file_as_object['item1']

In [6]:
item1_text[0:1500]

'>Item 1.  \nBusiness\n\n\nOverview\n\n\nNetApp, Inc. (NetApp, we, us or the Company) is a global cloud-led, data-centric software company. We were incorporated in 1992 and are headquartered in San Jose, California. Building on more than three decades of innovation, we give customers the freedom to manage applications and data across hybrid multicloud environments. Our portfolio of cloud services, and storage infrastructure, powered by intelligent data management software, enables applications to run faster, more reliably, and more securely, all at a lower cost.\n\n\nOur opportunity is defined by the durable megatrends of data-driven digital and cloud transformations. NetApp helps organizations meet the complexities created by rapid data and cloud growth, multi-cloud management, and the adoption of next-generation technologies, such as AI, Kubernetes, and modern databases. Our modern approach to hybrid, multicloud infrastructure and data management, which we term ‘evolved cloud’, provi

In [7]:
# Splitter settings: chunk_size and chunk_overlap are measured with `len`,
# and separators are not interpreted as regular expressions.
splitter_options = dict(
    chunk_size=2000,
    chunk_overlap=200,
    length_function=len,
    is_separator_regex=False,
)
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

In [8]:
item1_text_chunks = text_splitter.split_text(item1_text)

In [9]:
type(item1_text_chunks)

list

In [10]:
len(item1_text_chunks)

254

In [11]:
item1_text_chunks[0]

'>Item 1.  \nBusiness\n\n\nOverview\n\n\nNetApp, Inc. (NetApp, we, us or the Company) is a global cloud-led, data-centric software company. We were incorporated in 1992 and are headquartered in San Jose, California. Building on more than three decades of innovation, we give customers the freedom to manage applications and data across hybrid multicloud environments. Our portfolio of cloud services, and storage infrastructure, powered by intelligent data management software, enables applications to run faster, more reliably, and more securely, all at a lower cost.\n\n\nOur opportunity is defined by the durable megatrends of data-driven digital and cloud transformations. NetApp helps organizations meet the complexities created by rapid data and cloud growth, multi-cloud management, and the adoption of next-generation technologies, such as AI, Kubernetes, and modern databases. Our modern approach to hybrid, multicloud infrastructure and data management, which we term ‘evolved cloud’, provi

In [12]:
def split_form10k_data_from_file(file, max_chunks_per_item=20):
    """Split the text items of one Form 10-K JSON file into chunk records.

    The items 'item1', 'item1a', 'item7' and 'item7a' are read from the JSON
    object, split with the module-level `text_splitter`, and each kept chunk
    is wrapped in a dict together with metadata taken from the file.

    Args:
        file: Path to the Form 10-K JSON file. The file name without its
            directory and extension is used as the form id.
        max_chunks_per_item: Maximum number of chunks kept per item, counted
            from the start of the item. ``None`` keeps every chunk.
            Defaults to 20.

    Returns:
        A list of dicts with the keys 'text', 'f10kItem', 'chunkSeqId',
        'formId', 'chunkId', 'names', 'cik', 'cusip6' and 'source'.

    Raises:
        KeyError: If the JSON object lacks one of the expected keys.
    """
    # Close the file handle deterministically instead of leaking it.
    with open(file, encoding='utf-8') as form_file:
        file_as_object = json.load(form_file)

    # The form id depends only on the path, so derive it once. os.path also
    # copes with paths that contain no directory separator.
    form_id = os.path.splitext(os.path.basename(file))[0]

    chunks_with_metadata = []  # accumulates one record per kept chunk
    for item in ['item1', 'item1a', 'item7', 'item7a']:  # keys pulled from the json
        print(f'Processing {item} from {file}')
        item_text = file_as_object[item]
        # Split the item and keep only the leading chunks.
        item_text_chunks = text_splitter.split_text(item_text)[:max_chunks_per_item]
        for chunk_seq_id, chunk in enumerate(item_text_chunks):
            chunks_with_metadata.append({
                'text': chunk,
                # metadata from looping...
                'f10kItem': item,
                'chunkSeqId': chunk_seq_id,
                # constructed metadata...
                'formId': form_id,  # pulled from the filename
                'chunkId': f'{form_id}-{item}-chunk{chunk_seq_id:04d}',
                # metadata from file...
                'names': file_as_object['names'],
                'cik': file_as_object['cik'],
                'cusip6': file_as_object['cusip6'],
                'source': file_as_object['source'],
            })
        print(f'\tSplit into {len(item_text_chunks)} chunks')
    return chunks_with_metadata

In [13]:
first_file_chunks = split_form10k_data_from_file(first_file_name)

Processing item1 from ./data/form10k/0000950170-23-027948.json
	Split into 20 chunks
Processing item1a from ./data/form10k/0000950170-23-027948.json
	Split into 1 chunks
Processing item7 from ./data/form10k/0000950170-23-027948.json
	Split into 1 chunks
Processing item7a from ./data/form10k/0000950170-23-027948.json
	Split into 1 chunks


In [14]:
first_file_chunks[0]

{'text': '>Item 1.  \nBusiness\n\n\nOverview\n\n\nNetApp, Inc. (NetApp, we, us or the Company) is a global cloud-led, data-centric software company. We were incorporated in 1992 and are headquartered in San Jose, California. Building on more than three decades of innovation, we give customers the freedom to manage applications and data across hybrid multicloud environments. Our portfolio of cloud services, and storage infrastructure, powered by intelligent data management software, enables applications to run faster, more reliably, and more securely, all at a lower cost.\n\n\nOur opportunity is defined by the durable megatrends of data-driven digital and cloud transformations. NetApp helps organizations meet the complexities created by rapid data and cloud growth, multi-cloud management, and the adoption of next-generation technologies, such as AI, Kubernetes, and modern databases. Our modern approach to hybrid, multicloud infrastructure and data management, which we term ‘evolved clou

In [15]:
type(first_file_chunks), len(first_file_chunks)

(list, 23)

In [16]:
first_file_chunks

[{'text': '>Item 1.  \nBusiness\n\n\nOverview\n\n\nNetApp, Inc. (NetApp, we, us or the Company) is a global cloud-led, data-centric software company. We were incorporated in 1992 and are headquartered in San Jose, California. Building on more than three decades of innovation, we give customers the freedom to manage applications and data across hybrid multicloud environments. Our portfolio of cloud services, and storage infrastructure, powered by intelligent data management software, enables applications to run faster, more reliably, and more securely, all at a lower cost.\n\n\nOur opportunity is defined by the durable megatrends of data-driven digital and cloud transformations. NetApp helps organizations meet the complexities created by rapid data and cloud growth, multi-cloud management, and the adoption of next-generation technologies, such as AI, Kubernetes, and modern databases. Our modern approach to hybrid, multicloud infrastructure and data management, which we term ‘evolved clo

Create graph nodes using text chunks

In [17]:
# Cypher statement that MERGEs a :Chunk node keyed on chunkId, read from the
# $chunkParam map parameter. All other properties are assigned in the
# ON CREATE branch only, so they are written when the node is first created
# and are left untouched when the MERGE matches an existing node.
merge_chunk_node_query = """
MERGE(mergedChunk:Chunk {chunkId: $chunkParam.chunkId})
    ON CREATE SET 
        mergedChunk.names = $chunkParam.names,
        mergedChunk.formId = $chunkParam.formId, 
        mergedChunk.cik = $chunkParam.cik, 
        mergedChunk.cusip6 = $chunkParam.cusip6, 
        mergedChunk.source = $chunkParam.source, 
        mergedChunk.f10kItem = $chunkParam.f10kItem, 
        mergedChunk.chunkSeqId = $chunkParam.chunkSeqId, 
        mergedChunk.text = $chunkParam.text
RETURN mergedChunk
"""

In [18]:
kg = Neo4jGraph(
    url=NEO4J_URI, username=NEO4J_USERNAME, password=NEO4J_PASSWORD, database=NEO4J_DATABASE
)

In [19]:
kg.query(merge_chunk_node_query, 
         params={'chunkParam':first_file_chunks[0]})

[{'mergedChunk': {'formId': '0000950170-23-027948',
   'f10kItem': 'item1',
   'names': ['Netapp Inc', 'NETAPP INC'],
   'cik': '1002047',
   'cusip6': '64110D',
   'source': 'https://www.sec.gov/Archives/edgar/data/1002047/000095017023027948/0000950170-23-027948-index.htm',
   'text': '>Item 1.  \nBusiness\n\n\nOverview\n\n\nNetApp, Inc. (NetApp, we, us or the Company) is a global cloud-led, data-centric software company. We were incorporated in 1992 and are headquartered in San Jose, California. Building on more than three decades of innovation, we give customers the freedom to manage applications and data across hybrid multicloud environments. Our portfolio of cloud services, and storage infrastructure, powered by intelligent data management software, enables applications to run faster, more reliably, and more securely, all at a lower cost.\n\n\nOur opportunity is defined by the durable megatrends of data-driven digital and cloud transformations. NetApp helps organizations meet the 

In [20]:
kg.query("""
CREATE CONSTRAINT unique_chunk IF NOT EXISTS 
    FOR (c:Chunk) REQUIRE c.chunkId IS UNIQUE
""")

[]

In [21]:
kg.query("SHOW INDEXES")

[{'id': 0,
  'name': 'index_343aff4e',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'LOOKUP',
  'entityType': 'NODE',
  'labelsOrTypes': None,
  'properties': None,
  'indexProvider': 'token-lookup-1.0',
  'owningConstraint': None,
  'lastRead': None,
  'readCount': 0},
 {'id': 1,
  'name': 'index_f7700477',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'LOOKUP',
  'entityType': 'RELATIONSHIP',
  'labelsOrTypes': None,
  'properties': None,
  'indexProvider': 'token-lookup-1.0',
  'owningConstraint': None,
  'lastRead': None,
  'readCount': 0},
 {'id': 2,
  'name': 'unique_chunk',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'RANGE',
  'entityType': 'NODE',
  'labelsOrTypes': ['Chunk'],
  'properties': ['chunkId'],
  'indexProvider': 'range-1.0',
  'owningConstraint': 'unique_chunk',
  'lastRead': None,
  'readCount': 0}]

In [22]:
# Issue one MERGE statement per chunk record and count the statements sent.
node_count = 0
for chunk in first_file_chunks:
    chunk_id = chunk['chunkId']
    print(f"Creating `:Chunk` node for chunk ID {chunk_id}")
    merge_params = {'chunkParam': chunk}
    kg.query(merge_chunk_node_query, params=merge_params)
    node_count += 1
print(f"Created {node_count} nodes")

Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0000
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0001
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0002
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0003
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0004
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0005
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0006
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0007
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0008
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0009
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0010
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0011
Creating `:Chunk` node for chunk ID 0000950170-23-027948-item1-chunk0012
Creating `:Chunk` node for chunk ID 0000950170-23-0

In [23]:
kg.query("""
         MATCH (n)
         RETURN count(n) as nodeCount
         """)

[{'nodeCount': 23}]

https://neo4j.com/docs/cypher-manual/current/indexes/semantic-indexes/vector-indexes/

Vector index `vector.dimensions`: an INTEGER between 1 and 4096, inclusive.



In [None]:
# Create the vector index on the `textEmbedding` property of :Chunk nodes,
# using cosine similarity; IF NOT EXISTS is part of the statement. The index
# name, label and property are the same values held in VECTOR_INDEX_NAME,
# VECTOR_NODE_LABEL and VECTOR_EMBEDDING_PROPERTY.
# `vector.dimensions` is set to 4096; the error recorded under this cell
# reports the accepted range as 1 to 4096 inclusive.
# NOTE(review): the dimension presumably has to equal the length of the
# vectors stored in `textEmbedding` -- confirm against the embedding model in use.
kg.query("""
         CREATE VECTOR INDEX `form_10k_chunks` IF NOT EXISTS
          FOR (c:Chunk) ON (c.textEmbedding) 
          OPTIONS { indexConfig: {
            `vector.dimensions`: 4096,
            `vector.similarity_function`: 'cosine'    
         }}
""")

DatabaseError: {code: Neo.DatabaseError.Statement.ExecutionFailed} {message: 'vector.dimensions' must be between 1 and 4096 inclusively}

In [58]:
kg.query("SHOW INDEXES")

[{'id': 4,
  'name': 'form_10k_chunks',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'VECTOR',
  'entityType': 'NODE',
  'labelsOrTypes': ['Chunk'],
  'properties': ['textEmbedding'],
  'indexProvider': 'vector-2.0',
  'owningConstraint': None,
  'lastRead': None,
  'readCount': 0},
 {'id': 0,
  'name': 'index_343aff4e',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'LOOKUP',
  'entityType': 'NODE',
  'labelsOrTypes': None,
  'properties': None,
  'indexProvider': 'token-lookup-1.0',
  'owningConstraint': None,
  'lastRead': neo4j.time.DateTime(2024, 11, 12, 6, 10, 12, 508000000, tzinfo=<UTC>),
  'readCount': 3},
 {'id': 1,
  'name': 'index_f7700477',
  'state': 'ONLINE',
  'populationPercent': 100.0,
  'type': 'LOOKUP',
  'entityType': 'RELATIONSHIP',
  'labelsOrTypes': None,
  'properties': None,
  'indexProvider': 'token-lookup-1.0',
  'owningConstraint': None,
  'lastRead': None,
  'readCount': 0},
 {'id': 2,
  'name': 'unique_chunk',
  'state': 'ONLI

In [3]:
import torch
from transformers import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')#.to(device)

def get_embeddings(text):
    """Embed text by mean-pooling the model's last hidden state.

    Uses the module-level `tokenizer` and `model`. Inputs are truncated and,
    when several texts are given, padded to a common length.

    Args:
        text: A string, or a list of strings, to embed.

    Returns:
        A NumPy array with one row per input text.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    hidden_states = outputs.last_hidden_state
    # padding=True adds pad tokens to the shorter texts of a batch. Weight
    # every position by the attention mask so those pad positions do not
    # dilute the average; for a single text the mask is all ones.
    token_mask = inputs['attention_mask'].unsqueeze(-1).to(hidden_states.dtype)
    token_counts = token_mask.sum(dim=1).clamp(min=1)  # guard against division by zero
    pooled = (hidden_states * token_mask).sum(dim=1) / token_counts
    return pooled.cpu().numpy()



In [4]:
get_embeddings('zxy1298')

array([[ 1.48059621e-01, -2.09049154e-02,  1.80991009e-01,
        -1.75338298e-01,  3.16612184e-01, -2.21977547e-01,
        -7.08250850e-02,  5.53206325e-01, -3.59156132e-02,
         1.43371254e-01,  1.58441916e-01, -4.08272445e-01,
        -2.23827049e-01,  7.61420846e-01, -1.25323102e-01,
         3.18867475e-01, -4.58233431e-02,  9.09725949e-02,
        -3.23135406e-01,  3.13324749e-01,  8.17095563e-02,
        -2.57612944e-01,  2.61395931e-01,  6.54649734e-01,
         3.40279311e-01, -2.69733965e-01,  4.23296029e-03,
        -4.40138653e-02,  6.21559285e-02, -2.52598356e-02,
         2.19459370e-01, -3.47875357e-02,  4.10643190e-01,
        -5.91001995e-02, -2.79651731e-01,  2.59392291e-01,
        -3.41561943e-01, -2.19262112e-02,  9.51049700e-02,
         1.26200244e-01, -2.25273728e-01, -5.20625293e-01,
        -1.67916074e-01, -3.51750292e-02,  1.10207610e-01,
        -2.99208939e-01,  6.88232249e-03,  6.62061274e-01,
         7.53848672e-01, -8.51377398e-02, -3.71098131e-0

Summary

- Meta-Llama-3-8B model gave me a score of 0.68
- BERT gave a score of 0.79
- Meta-Llama-3-8B model gave me a score of 0.68
- Llama-3.1-70B does not fit on a single GPU, so I am testing it on CPU. It generates embeddings with 8192 dimensions, which exceeds 4096, the maximum dimension of a Neo4j vector index. Confirmed: the Llama-3.1-70B embeddings have 8192 dimensions.

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').to(device)

def get_embeddings(text):
    """Embed text by mean-pooling the model's last hidden state.

    Uses the module-level `tokenizer`, `model` and `device`; the tokenized
    inputs are moved to `device` before the forward pass.

    Args:
        text: A string, or a list of strings, to embed.

    Returns:
        A NumPy array with one row per input text.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    hidden_states = outputs.last_hidden_state
    # padding=True adds pad tokens to the shorter texts of a batch. Weight
    # every position by the attention mask so those pad positions do not
    # dilute the average; for a single text the mask is all ones.
    token_mask = inputs['attention_mask'].unsqueeze(-1).to(hidden_states.dtype)
    token_counts = token_mask.sum(dim=1).clamp(min=1)  # guard against division by zero
    pooled = (hidden_states * token_mask).sum(dim=1) / token_counts
    return pooled.cpu().numpy()

email_handle_embeddings = [get_embeddings(email_handle)[0] for email_handle in emails_handles]

OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/nur-dev/llama-1.9B-kaz-instruct.
403 Client Error. (Request ID: Root=1-6732df2d-33a592627ed5471a669484c9;92023bf1-9479-4315-a7b2-cd432fbefca4)

Cannot access gated repo for url https://huggingface.co/nur-dev/llama-1.9B-kaz-instruct/resolve/main/config.json.
Access to model nur-dev/llama-1.9B-kaz-instruct is restricted and you are not in the authorized list. Visit https://huggingface.co/nur-dev/llama-1.9B-kaz-instruct to ask for access.

In [46]:
import torch

# Fall back to the CPU when no CUDA device is available so that later
# `.to(device)` calls do not fail on machines without a GPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# This prints the selected torch device, not the CUDA_VISIBLE_DEVICES variable.
print(f"Using device: {device}")

CUDA_VISIBLE_DEVICES set to: cuda:0


In [6]:
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
import torch

# Load the tokenizer and model from the local directory
# model_path = "/home/dan/my_research/my_local_llama/meta-llama/Meta-Llama-3-8B"#/original"  # Path to the directory
# model_path = "/home/dan/my_research/my_local_llama/meta-llama/Meta-Llama-3-8B"#/original"  # Path to the directory
model_path = "/home/dan/my_research/my_local_llama/meta-llama/Llama-3.1-70B"#/original"  # Path to the directory

# Load the tokenizer from the model directory
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Load the model from the model directory
model = AutoModel.from_pretrained(model_path)

# # Test tokenization (optional)
# text = "This is a test sentence."
# tokens = tokenizer(text)

# print("Tokens:", tokens)

# model = AutoModelForCausalLM.from_pretrained(model_path)
# model = AutoModelForCausalLM.from_pretrained(model_path)#, torch_dtype=torch.bfloat16).to("cuda")

def get_embeddings(text):
    """Embed text by mean-pooling the model's last hidden state.

    Uses the module-level `tokenizer` and `model`. The tokenizer's pad token
    is set to its EOS token on every call so that padding=True can be used.

    Args:
        text: A string, or a list of strings, to embed.

    Returns:
        A NumPy array with one row per input text.
    """
    # Reuse EOS as the pad token (alternatively, add a dedicated pad token
    # with add_special_tokens).
    tokenizer.pad_token = tokenizer.eos_token

    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    hidden_states = outputs.last_hidden_state
    # padding=True adds pad tokens to the shorter texts of a batch. Weight
    # every position by the attention mask so those pad positions do not
    # dilute the average; for a single text the mask is all ones.
    token_mask = inputs['attention_mask'].unsqueeze(-1).to(hidden_states.dtype)
    token_counts = token_mask.sum(dim=1).clamp(min=1)  # guard against division by zero
    pooled = (hidden_states * token_mask).sum(dim=1) / token_counts
    return pooled.cpu().numpy()


# emb = get_embeddings(text)
# emb.shape

Loading checkpoint shards: 100%|██████████| 30/30 [09:43<00:00, 19.47s/it]


In [34]:
chunks = kg.query("""
    MATCH (chunk:Chunk) 
    WHERE chunk.textEmbedding IS NULL 
    RETURN chunk.chunkId AS id, chunk.text AS text
""")

In [35]:
for c in chunks:
    print (c,)# c['text'], c['id'])
    

{'id': '0000950170-23-027948-item1-chunk0000', 'text': '>Item 1.  \nBusiness\n\n\nOverview\n\n\nNetApp, Inc. (NetApp, we, us or the Company) is a global cloud-led, data-centric software company. We were incorporated in 1992 and are headquartered in San Jose, California. Building on more than three decades of innovation, we give customers the freedom to manage applications and data across hybrid multicloud environments. Our portfolio of cloud services, and storage infrastructure, powered by intelligent data management software, enables applications to run faster, more reliably, and more securely, all at a lower cost.\n\n\nOur opportunity is defined by the durable megatrends of data-driven digital and cloud transformations. NetApp helps organizations meet the complexities created by rapid data and cloud growth, multi-cloud management, and the adoption of next-generation technologies, such as AI, Kubernetes, and modern databases. Our modern approach to hybrid, multicloud infrastructure an

In [36]:
# Step 2: Compute embeddings locally
# Build a chunk id -> embedding mapping, embedding one chunk text at a time.
chunk_embeddings = {}
for chunk in chunks:
    chunk_embeddings[chunk['id']] = get_embeddings(chunk['text'])

In [None]:
import pickle

# Save chunk_embeddings to a file
with open('chunk_embeddings_Llama_3_1_70B.pkl', 'wb') as file:
    pickle.dump(chunk_embeddings, file)


In [None]:
import pickle

# Load chunk_embeddings from the file
with open('chunk_embeddings_Llama_3_1_70B.pkl', 'rb') as file:
    chunk_embeddings_new = pickle.load(file)
chunk_embeddings_new

{'0000950170-23-027948-item1-chunk0000': array([[-0.5166815 , -0.08864088,  0.19969292, ...,  1.0027709 ,
         -1.2434479 , -0.5435967 ]], dtype=float32),
 '0000950170-23-027948-item1-chunk0001': array([[-0.20656812, -0.72979623,  0.4156664 , ...,  0.85177326,
         -1.8440838 , -0.9053261 ]], dtype=float32),
 '0000950170-23-027948-item1-chunk0002': array([[-0.6879011, -0.9348478,  0.5717283, ...,  1.1459522, -2.0685847,
         -0.8164457]], dtype=float32),
 '0000950170-23-027948-item1-chunk0003': array([[-0.45972177,  0.00981281,  0.57732713, ...,  0.7307635 ,
         -3.1174972 , -0.4132972 ]], dtype=float32),
 '0000950170-23-027948-item1-chunk0004': array([[-0.07823444, -0.17354007, -0.21544048, ...,  0.8408205 ,
         -2.853965  , -0.85123557]], dtype=float32),
 '0000950170-23-027948-item1-chunk0005': array([[-0.21783504, -0.22896345, -0.07046735, ...,  1.181591  ,
         -2.3846467 , -0.5290723 ]], dtype=float32),
 '0000950170-23-027948-item1-chunk0006': array([[-0.

In [37]:
chunk_embeddings['0000950170-23-027948-item1-chunk0000'].shape

(1, 8192)

In [39]:
chunk_embeddings['0000950170-23-027948-item1-chunk0000']

array([[-0.5166815 , -0.08864088,  0.19969292, ...,  1.0027709 ,
        -1.2434479 , -0.5435967 ]], dtype=float32)

In [59]:
import numpy as np

# Step 3: Update Neo4j with computed embeddings
# The statement is identical for every chunk, so it is defined once up front.
set_embedding_query = """
        MATCH (chunk:Chunk {chunkId: $chunk_id})  
        SET chunk.textEmbedding = $embedding  
    """
for chunk_id, embedding in chunk_embeddings.items():
    # Collapse the array to one dimension and convert it to a plain list
    # before sending it as a query parameter.
    flat_embedding = np.array(embedding).reshape(-1).tolist()
    kg.query(
        set_embedding_query,
        params={"chunk_id": chunk_id, "embedding": flat_embedding},
    )


In [60]:
kg.refresh_schema()
print(kg.schema)

Node properties:
Chunk {chunkId: STRING, names: LIST, formId: STRING, cik: STRING, cusip6: STRING, source: STRING, f10kItem: STRING, chunkSeqId: INTEGER, text: STRING, textEmbedding: LIST}
Relationship properties:

The relationships:



### Use similarity search to find relevant chunks

here,

So far I have used a local Llama 3 model to compute the embeddings in place of the OpenAI embeddings, and I have updated the KG database with these embeddings. Next, I need to perform the search using similarity.

In [None]:
def neo4j_vector_search(question):
  """Find chunks similar to `question`, encoding the question inside Neo4j.

  The question is encoded by `genai.vector.encode` with the "OpenAI"
  provider and the result is passed to `db.index.vector.queryNodes` on the
  index named by VECTOR_INDEX_NAME, asking for 10 results. The function
  reads the module-level names OPENAI_API_KEY and OPENAI_ENDPOINT, which
  must be defined before it is called.

  Args:
    question: Text to search for.

  Returns:
    The result of `kg.query`; the statement returns `score` and the matched
    node's `text` property for each hit.
  """
  vector_search_query = """
    WITH genai.vector.encode(
      $question, 
      "OpenAI", 
      {
        token: $openAiApiKey,
        endpoint: $openAiEndpoint
      }) AS question_embedding
    CALL db.index.vector.queryNodes($index_name, $top_k, question_embedding) yield node, score
    RETURN score, node.text AS text
  """
  search_params = {
      'question': question,
      'openAiApiKey': OPENAI_API_KEY,
      'openAiEndpoint': OPENAI_ENDPOINT,
      'index_name': VECTOR_INDEX_NAME,
      'top_k': 10,
  }
  return kg.query(vector_search_query, params=search_params)

In [61]:
def neo4j_vector_search(question, top_k=10):
    """Search for similar nodes in the Neo4j vector index using local embeddings.

    The question is embedded with the module-level `get_embeddings` and the
    resulting vector is passed to `db.index.vector.queryNodes` on the index
    named by VECTOR_INDEX_NAME.

    Args:
        question: Text to search for.
        top_k: Number of nearest neighbours to request. Defaults to 10.

    Returns:
        The result of `kg.query`; the statement returns `score` and the
        matched node's `text` property for each hit.
    """
    # get_embeddings returns a 2-D array; the procedure needs a flat vector.
    # Convert it to a plain list of Python floats, the same form used when
    # the embeddings are written to the graph, instead of handing a NumPy
    # array to the driver.
    question_embedding = get_embeddings(question).flatten().tolist()

    # Neo4j query without OpenAI API
    vector_search_query = """
      CALL db.index.vector.queryNodes($index_name, $top_k, $question_embedding) 
      YIELD node, score
      RETURN score, node.text AS text
    """

    similar = kg.query(vector_search_query,
                       params={
                           'question_embedding': question_embedding,
                           'index_name': VECTOR_INDEX_NAME,
                           'top_k': top_k
                       })

    return similar

In [62]:
search_results = neo4j_vector_search(
    'In a single sentence, tell me about Netapp.'
)

ClientError: {code: Neo.ClientError.Procedure.ProcedureCallFailed} {message: Failed to invoke procedure `db.index.vector.queryNodes`: Caused by: java.lang.IllegalArgumentException: Index query vector has 8192 dimensions, but indexed vectors have 11.}

In [298]:
search_results

[{'score': 0.6802825927734375,
  'text': '>Item 7A\n\n\n\xa0\n\n\nQuantitative and Qualitative Disclosures About Market Risk\n\n\n\xa0\n\n\n50'},
 {'score': 0.6581363677978516,
  'text': '8\n\n\n\n\n\xa0\n\n\nOur diversified customer base spans industry segments and vertical markets such as energy, financial services, government, technology, internet, life sciences, healthcare services, manufacturing, media, entertainment, animation, video postproduction and telecommunications. NetApp focuses primarily on the enterprise storage and data management, cloud storage and cloud operations markets. We design our products to meet the evolving requirements of a hybrid, multicloud world, driven by digital transformation and cloud initiatives.\n\n\nOur partnerships with the industry’s leading cloud, infrastructure, consulting, application, and reseller partners are created with one goal in mind: the success of our customers. Global enterprises, local businesses, and government installations look 

In [243]:
question_embedding = get_embeddings('In a single sentence, tell me about Netapp.')
# The printed length depends on which model get_embeddings currently wraps;
# the run recorded below printed (768,), not (4096,).
print(question_embedding.flatten().shape)


(768,)


In [152]:
# Assuming `kg` is your Neo4j driver session or connection
indexes = kg.query("CALL db.schema.visualization")

# Display the list of schema details, including indexes
for index in indexes:
    print(index)


{'nodes': [{'name': 'Chunk', 'indexes': ['textEmbedding'], 'constraints': ["Constraint( id=3, name='unique_chunk', type='UNIQUENESS', schema=(:Chunk {chunkId}), ownedIndex=2 )"]}], 'relationships': []}


In [65]:
# Assuming `kg` is your Neo4j connection
query = "MATCH (c:Chunk) RETURN c.textEmbedding LIMIT 1"
result = kg.query(query)

# Assuming the result is a list of dictionaries with the 'textEmbedding' key
embedding = result[0]['c.textEmbedding']

# Check the length of the embedding to see its dimensionality
print(len(embedding))  # This will print the number of dimensions in the embedding


8192
