# INDEXING

This is the notebook used to index all of the nodes of the graph with ada v2 (`text-embedding-ada-002`, through AzureOpenAIEmbeddings).

In [2]:
import os
from neo4j import GraphDatabase
from langchain_openai import AzureOpenAIEmbeddings
from langchain_community.vectorstores.neo4j_vector import Neo4jVector
import config.EnvLoader as el
from tqdm import tqdm

In [None]:
# Neo4j connection settings. URI and AUTH are reused by the
# GraphDatabase.driver calls further down; the same values are also exported
# as NEO4J_* environment variables (presumably so that Neo4jVector, which is
# called below without explicit credentials, can pick them up -- TODO confirm).
URI = "neo4j://localhost"
AUTH = ("neo4j", el.NEO4J_PWD)
os.environ.update(
    NEO4J_URI=URI,
    NEO4J_USERNAME=AUTH[0],
    NEO4J_PASSWORD=AUTH[1],
)

In [None]:
# Azure OpenAI embedding client used for every embedding computed in this
# notebook. The key and endpoint are read from the project's EnvLoader module.
azure_embedding_settings = {
    "model": "text-embedding-ada-002",
    "api_key": el.OPENAI_API_KEY,
    "azure_endpoint": el.AZURE_ENDPOINT,
    "openai_api_version": "2023-03-15-preview",
}
embeddings = AzureOpenAIEmbeddings(**azure_embedding_settings)

# Nodes

## Page and SubChapter

In [None]:
# To avoid rate limit errors, the function from_existing_graph was modified in this way.
#
# NOTE(review): `store`, `embedding`, `node_label`, `embedding_node_property`
# and `text_node_properties` are not defined anywhere in this notebook;
# presumably they are the locals of Neo4jVector.from_existing_graph that this
# loop was pasted into -- confirm before running it as a cell.

import datetime
import time

BATCH_SIZE = 1200          # maximum number of nodes fetched and embedded per iteration
MIN_INTERVAL_SECONDS = 60  # throttle when the previous batch finished less than this long ago
SLEEP_TARGET_SECONDS = 65  # when throttling, wait until this much time has elapsed

# Both queries are loop-invariant, so they are built once.
# Fetch nodes that have no embedding yet but at least one non-null text
# property, together with the concatenated "key:value" text to embed.
fetch_query = (
    f"MATCH (n:`{node_label}`) "
    f"WHERE n.{embedding_node_property} IS null "
    "AND any(k in $props WHERE n[k] IS NOT null) "
    f"RETURN elementId(n) AS id, reduce(str='',"
    "k IN $props | str + '\\n' + k + ':' + coalesce(n[k], '')) AS text "
    f"LIMIT {BATCH_SIZE}"
)
# Write each computed vector back onto the node it was computed for.
write_query = (
    "UNWIND $data AS row "
    f"MATCH (n:`{node_label}`) "
    "WHERE elementId(n) = row.id "
    f"CALL db.create.setVectorProperty(n, "
    f"'{embedding_node_property}', row.embedding) "
    "YIELD node RETURN count(*)"
)

finished_at = None  # completion time of the previous batch; None before the first one
while True:
    data = store.query(fetch_query, params={"props": text_node_properties})
    if not data:
        # Nothing left to embed: skip both the throttle wait and the API call.
        break

    if finished_at is not None:
        # total_seconds() is the full elapsed time; timedelta.seconds would
        # silently drop the days component.
        elapsed = (datetime.datetime.now() - finished_at).total_seconds()
        if elapsed < MIN_INTERVAL_SECONDS:
            time.sleep(SLEEP_TARGET_SECONDS - elapsed)

    text_embeddings = embedding.embed_documents([row["text"] for row in data])

    params = {
        "data": [
            {"id": row["id"], "embedding": vector}
            for row, vector in zip(data, text_embeddings)
        ]
    }
    store.query(write_query, params=params)

    # A short batch means every remaining node has now been embedded.
    if len(data) < BATCH_SIZE:
        break

    finished_at = datetime.datetime.now()

In [4]:
# Create the embeddings for the Page nodes: the "text" property is embedded
# into the "embedding" property. ("426m" in the original note is presumably
# the run time in minutes -- TODO confirm.)
vector_index = Neo4jVector.from_existing_graph(
    embeddings,
    node_label="Page",
    embedding_node_property="embedding",
    text_node_properties=["text"],
    search_type="vector",
)

In [5]:
# Create the embeddings for the SubChapter nodes: the "text" property is
# embedded into the "embedding" property. ("73m" in the original note is
# presumably the run time in minutes -- TODO confirm.)
vector_index = Neo4jVector.from_existing_graph(
    embeddings,
    node_label="SubChapter",
    embedding_node_property="embedding",
    text_node_properties=["text"],
    search_type="vector",
)

## Chunks

In [None]:
# To avoid rate limit errors, the function from_existing_graph was modified in this way.
#
# NOTE(review): `store`, `embedding`, `node_label`, `embedding_node_property`
# and `text_node_properties` are not defined anywhere in this notebook;
# presumably they are the locals of Neo4jVector.from_existing_graph that this
# loop was pasted into -- confirm before running it as a cell.

import datetime
import time

import tiktoken

BATCH_SIZE = 1200          # maximum number of nodes fetched per iteration
TOKEN_BUDGET = 200000      # maximum number of tokens sent to the embedding API per iteration
MIN_INTERVAL_SECONDS = 60  # throttle when the previous batch finished less than this long ago
SLEEP_TARGET_SECONDS = 65  # when throttling, wait until this much time has elapsed

# Count tokens with the tokenizer of the embedding model this notebook uses,
# so the budget is measured in the same units as the API's own accounting.
encoding = tiktoken.encoding_for_model("text-embedding-ada-002")

# Both queries are loop-invariant, so they are built once.
# Fetch nodes that have no embedding yet but at least one non-null text
# property, together with the concatenated "key:value" text to embed.
fetch_query = (
    f"MATCH (n:`{node_label}`) "
    f"WHERE n.{embedding_node_property} IS null "
    "AND any(k in $props WHERE n[k] IS NOT null) "
    f"RETURN elementId(n) AS id, reduce(str='',"
    "k IN $props | str + '\\n' + k + ':' + coalesce(n[k], '')) AS text "
    f"LIMIT {BATCH_SIZE}"
)
# Write each computed vector back onto the node it was computed for.
write_query = (
    "UNWIND $new_data AS row "
    f"MATCH (n:`{node_label}`) "
    "WHERE elementId(n) = row.id "
    f"CALL db.create.setVectorProperty(n, "
    f"'{embedding_node_property}', row.embedding) "
    "YIELD node RETURN count(*)"
)

finished_at = None  # completion time of the previous batch; None before the first one
while True:
    data = store.query(fetch_query, params={"props": text_node_properties})
    if not data:
        # Nothing left to embed: skip both the throttle wait and the API call.
        break

    # Keep the longest prefix of the fetched rows that fits the token budget.
    # The loop variable is deliberately not called `el`: at cell level that
    # would rebind the `config.EnvLoader` alias imported at the top.
    new_data = []
    total_tokens = 0
    for row in data:
        total_tokens += len(encoding.encode(row["text"]))
        # The first row is always kept, even when it exceeds the budget on
        # its own; otherwise nothing would be written and the same rows
        # would be fetched again forever.
        if total_tokens > TOKEN_BUDGET and new_data:
            break
        new_data.append(row)

    if finished_at is not None:
        # total_seconds() is the full elapsed time; timedelta.seconds would
        # silently drop the days component.
        elapsed = (datetime.datetime.now() - finished_at).total_seconds()
        if elapsed < MIN_INTERVAL_SECONDS:
            time.sleep(SLEEP_TARGET_SECONDS - elapsed)

    text_embeddings = embedding.embed_documents([row["text"] for row in new_data])

    params = {
        "new_data": [
            {"id": row["id"], "embedding": vector}
            for row, vector in zip(new_data, text_embeddings)
        ]
    }
    store.query(write_query, params=params)

    # Stop only when the fetch came back short AND none of it was deferred
    # to a later iteration by the token budget.
    if len(data) < BATCH_SIZE and len(data) == len(new_data):
        break

    finished_at = datetime.datetime.now()

In [4]:
# Create the embeddings for the Chunk nodes: the "text" property is embedded
# into the "embedding" property. ("203m" in the original note is presumably
# the run time in minutes -- TODO confirm.)
vector_index = Neo4jVector.from_existing_graph(
    embeddings,
    node_label="Chunk",
    embedding_node_property="embedding",
    text_node_properties=["text"],
    search_type="vector",
)

# Relationships
Embedding the relationships of the graph

In [4]:
# Fetch the distinct `text` values carried by the vector_rel relationships.
# rel_lst receives the returned records; each one is read through the
# 'r.text' key further down.
distinct_rel_text_query = """MATCH ()-[r:vector_rel]->()
        RETURN DISTINCT r.text;"""

with GraphDatabase.driver(URI, auth=AUTH) as driver:
    rel_lst, summary, keys = driver.execute_query(distinct_rel_text_query)

In [5]:
# Number of distinct relationship texts returned by the query above.
len(rel_lst)

7668

In [8]:
# Embed every distinct relationship text and store the vector on the
# relationships that carry that text.
relationship_embeddings = []

# The driver is opened once for the whole loop rather than once per
# relationship, which avoids thousands of needless connect/teardown cycles.
with GraphDatabase.driver(URI, auth=AUTH) as driver:
    for rel in tqdm(rel_lst):
        rel_text = rel['r.text']
        new_embedding = embeddings.embed_query(rel_text)
        relationship_embeddings.append(new_embedding)
        # Every vector_rel relationship whose text equals rel_text gets the
        # same embedding.
        result, summary, keys = driver.execute_query(
            """MATCH ()-[r:vector_rel {text: $text}]->()
            SET r.embedding = $embedding;""",
            text=rel_text, embedding=new_embedding
        )

100%|██████████| 7668/7668 [43:13<00:00,  2.96it/s] 


In [9]:
# Create the vector index over the `embedding` property of the vector_rel
# relationships. IF NOT EXISTS makes the cell safe to re-run: without it a
# second execution fails because the index already exists.
# `vector.dimensions` has to match the length of the stored vectors
# (1536 is presumably the output size of text-embedding-ada-002 -- confirm).
with GraphDatabase.driver(URI, auth=AUTH) as driver:
    result, summary, keys = driver.execute_query(
        """CREATE VECTOR INDEX vector_rel_index IF NOT EXISTS
        FOR ()-[r:vector_rel]-() ON (r.embedding)
        OPTIONS {indexConfig: {
        `vector.dimensions`: 1536,
        `vector.similarity_function`: 'cosine'
        }}"""
    )

# Queries
Embedding the questions of the NQ dataset.

In [4]:
# Collect the distinct `is_answer_of` values found on Chunk nodes; Chunk
# nodes without that property are ignored. Only the returned records are
# kept, the other two items of the result are discarded.
answered_questions_query = """MATCH (a:Chunk)
        WHERE a.is_answer_of IS NOT NULL
        RETURN DISTINCT a.is_answer_of"""

with GraphDatabase.driver(URI, auth=AUTH) as driver:
    questions, _, _ = driver.execute_query(answered_questions_query)

In [5]:
# Number of distinct `is_answer_of` values returned by the query above.
len(questions)

5223

In [7]:
# Build a mapping from each question string to its embedding vector,
# one embedding call per question.
embedded_questions = {}
for record in tqdm(questions):
    question = record["a.is_answer_of"]
    embedded_questions[question] = embeddings.embed_query(question)

100%|██████████| 5223/5223 [12:23<00:00,  7.03it/s]


In [8]:
import pickle

# Persist the question -> embedding mapping to embedded_questions.pkl in the
# current working directory.
with open('embedded_questions.pkl', 'wb') as pickle_file:
    pickle.Pickler(pickle_file).dump(embedded_questions)