In [12]:
from neo4j import GraphDatabase
import pandas as pd
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import openai
import os

In [13]:
# Load from environment
load_dotenv('.env', override=True)
NEO4J_URI = os.getenv('NEO4J_URI')
NEO4J_USERNAME = os.getenv('NEO4J_USERNAME')
NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD')
NEO4J_DATABASE = os.getenv('NEO4J_DATABASE')
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
OPENAI_ENDPOINT="https://api.openai.com/v1/embeddings"

In [14]:
openai.api_key = os.getenv('OPENAI_API_KEY')

In [15]:
def connect_to_neo4j(uri, user, password):
    with GraphDatabase.driver(uri, auth=(user,password)) as driver:
        driver.verify_connectivity()
        print("Connection estabilished.")
    return driver

In [16]:
driver = connect_to_neo4j(NEO4J_URI, NEO4J_USERNAME, NEO4J_PASSWORD)

Connection estabilished.


In [17]:
# Define function to extract post titles
def extract_post_titles(driver):
    query = """
    MATCH (post:Post)
    RETURN post.post_id AS post_id, post.post_title AS post_title
    """
    with driver.session(database=NEO4J_DATABASE) as session:
        result = session.run(query)
        data = [record.data() for record in result]
    return pd.DataFrame(data)

In [18]:
# Extract post titles from Neo4j
posts_df = extract_post_titles(driver)
# Replace None with empty string
posts_df.fillna('', inplace=True)

  with driver.session(database=NEO4J_DATABASE) as session:


In [19]:
# Load SentenceTransformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

In [20]:
# Create embeddings for post titles
posts_df['titleEmbedding'] = posts_df['post_title'].apply(lambda x: model.encode(x).tolist())

In [21]:
posts_df.head()

Unnamed: 0,post_id,post_title,titleEmbedding
0,kf0d1g,Please sign petition to get nerves in the clit...,"[0.02098042704164982, -0.025324901565909386, 0..."
1,dwup3z,I know this is apart of the standardized proce...,"[-0.0063416543416678905, 0.017067179083824158,..."
2,hamqgj,I'm sure every person here relates.,"[-0.04929392784833908, -0.026714123785495758, ..."
3,j8mxtl,🩸,"[-0.04159634932875633, -0.024681774899363518, ..."
4,mqc0v5,why is this so true?,"[0.07418786734342575, 0.006123346742242575, 0...."


In [23]:
# Define function to set embeddings in Neo4j
def set_embeddings(driver, posts_df):
    query = """
    UNWIND $rows AS row
    MATCH (post:Post {post_id: row.post_id})
    SET post.titleEmbedding = row.titleEmbedding
    """
    rows = posts_df.to_dict('records')
    with driver.session(database=NEO4J_DATABASE) as session:
        session.run(query, rows=rows)

In [24]:
# Set embeddings in Neo4j
set_embeddings(driver, posts_df)

  with driver.session(database=NEO4J_DATABASE) as session:


In [25]:
# Create vector index for embeddings in Neo4j
def create_vector_index(driver):
    query = """
    CREATE VECTOR INDEX post_title_embeddings IF NOT EXISTS
    FOR (post:Post) ON (post.titleEmbedding) 
    OPTIONS { indexConfig: {
      `vector.dimensions`: 384,
      `vector.similarity_function`: 'cosine'
    }}
    """
    with driver.session(database=NEO4J_DATABASE) as session:
        session.run(query)

In [26]:
# Create vector index
create_vector_index(driver)

  with driver.session(database=NEO4J_DATABASE) as session:


In [34]:
# Show vector indexes
def show_vector_indexes(driver):
    query = "SHOW VECTOR INDEXES"
    with driver.session(database=NEO4J_DATABASE) as session:
        result = session.run(query)
        for record in result:
            print(record)

In [36]:
# Show vector indexes
show_vector_indexes(driver)

3


  with driver.session(database=NEO4J_DATABASE) as session:


In [49]:
query = "MATCH (post:Post) WHERE post.post_title IS NOT NULL RETURN post.post_title, post.titleEmbedding LIMIT 1"
with driver.session(database=NEO4J_DATABASE) as session:
    result = session.run(query)
    for record in result:
        print(record['post.titleEmbedding'])
        print(len(record['post.titleEmbedding']))

[0.02098042704164982, -0.025324901565909386, 0.04823486506938934, -0.06533394753932953, -0.07412794232368469, -0.025691673159599304, 0.08220510929822922, -0.021601347252726555, -0.04532847926020622, 0.011480504646897316, 0.02284853532910347, -0.04405072331428528, -0.00994692463427782, -0.016345951706171036, 0.014048004522919655, 0.03758910670876503, 0.026783810928463936, -0.01320159062743187, 0.006532056722790003, -0.005929651670157909, -0.023744143545627594, 0.09942561388015747, 0.08153500407934189, -0.03458850458264351, -0.10618103295564651, -0.04806007817387581, -0.07097279280424118, -0.011757824569940567, -0.05288395285606384, -0.02929963916540146, -0.04277489706873894, 0.0014894299674779177, -0.033776599913835526, 0.02960771508514881, 0.08737408369779587, -0.03360888734459877, -0.03283962234854698, 0.007209123112261295, 0.09324287623167038, 0.07724897563457489, 0.0020719990134239197, -0.03259175643324852, -0.03763700649142265, 0.03321734815835953, 0.04336751252412796, 0.0734710544

  with driver.session(database=NEO4J_DATABASE) as session:


In [50]:
# Function to encode the question
def encode_question(question):
    return model.encode(question).tolist()

In [51]:
# Function to query Neo4j for relevant nodes
def query_neo4j(driver, question_embedding, top_k=5):
    query = """
    WITH $question_embedding AS question_embedding
    CALL db.index.vector.queryNodes(
        'post_title_embeddings', 
        $top_k, 
        question_embedding
        ) YIELD node AS post, score
    RETURN post.post_title AS title, post.selftext AS text, score
    """
    with driver.session(database=NEO4J_DATABASE) as session:
        result = session.run(query, question_embedding=question_embedding, top_k=top_k)
        return pd.DataFrame([record.data() for record in result])


In [52]:
question = "What movies are about love?"
question_embedding = encode_question(question)
print(question_embedding)
print(len(question_embedding))

[-0.09366146475076675, -0.007969941943883896, -0.0005426190327852964, 0.05199532210826874, 0.003305514343082905, 0.09475867450237274, 0.041249677538871765, -0.034284017980098724, 0.12188263237476349, -0.07566624879837036, -0.0044990405440330505, 0.013837050646543503, -0.033316709101200104, 0.05087319388985634, 0.015167927369475365, 0.010243657045066357, 0.030091140419244766, 0.017825603485107422, -0.009585381485521793, 0.039552152156829834, 0.0033921091817319393, -0.010152192786335945, -0.012711655348539352, 0.008278241381049156, -0.0808093398809433, -0.028548406437039375, 0.06044885143637657, 0.005360702518373728, -0.10775967687368393, 0.051103539764881134, 0.0473317988216877, 0.021148663014173508, -0.017969051375985146, 0.026637699455022812, -0.050484418869018555, 0.00881864596158266, -0.03764573484659195, 0.03274989128112793, 0.012460834346711636, -0.047001998871564865, -0.013321301899850368, -0.08895041793584824, 0.04674527049064636, 0.04115918278694153, 0.006522365380078554, -0.05

In [53]:
results = query_neo4j(driver, question_embedding, top_k=5)
print(results)

  with driver.session(database=NEO4J_DATABASE) as session:


ClientError: {code: Neo.ClientError.Procedure.ProcedureCallFailed} {message: Failed to invoke procedure `db.index.vector.queryNodes`: Caused by: java.lang.IllegalArgumentException: Index query vector has 384 dimensions, but indexed vectors have 1536.}