In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from neo4j import GraphDatabase
from openai import OpenAI

In [2]:
# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\farha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\farha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
import os
from dotenv import load_dotenv
# Load from environment
load_dotenv('.env', override=True)
NEO4J_URI = os.getenv('NEO4J_URI')
NEO4J_USERNAME = os.getenv('NEO4J_USERNAME')
NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD')
NEO4J_DATABASE = os.getenv('NEO4J_DATABASE')
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

In [4]:
def connect_to_neo4j(uri, user, password):
    with GraphDatabase.driver(uri, auth=(user,password)) as driver:
        driver.verify_connectivity()
        print("Connection estabilished.")
    return driver

In [5]:
driver = connect_to_neo4j(NEO4J_URI, NEO4J_USERNAME, NEO4J_PASSWORD)

Connection estabilished.


In [6]:
def extract_data(driver, query):
    with driver.session() as session:
        result = session.run(query)
        data = [record.data() for record in result]
    return pd.DataFrame(data)

In [7]:
query_posts = """
MATCH (p:Post)
RETURN p.post_id AS post_id, p.post_title AS post_title, p.selftext AS selftext
"""

In [8]:
query_comments = """
MATCH (c:Comment)
RETURN c.comment AS comment,c.post_id AS post_id
"""

In [9]:
posts_df = extract_data(driver, query_posts)
comments_df = extract_data(driver, query_comments)

  with driver.session() as session:


In [10]:
posts_df.fillna('', inplace=True)
comments_df.fillna('',inplace=True)

In [11]:
posts_df.head()

Unnamed: 0,post_id,post_title,selftext
0,kf0d1g,Please sign petition to get nerves in the clit...,"Hey! I spoke with them on the phone, and they ..."
1,dwup3z,I know this is apart of the standardized proce...,
2,hamqgj,I'm sure every person here relates.,
3,j8mxtl,🩸,
4,mqc0v5,why is this so true?,


In [12]:
comments_df.head()

Unnamed: 0,comment,post_id
0,Oh no. Oh no NO. I’m so sorry. Signed sealed d...,kf0d1g
1,Why are vaginas still taboo??? I don’t get it.,kf0d1g
2,*Has this been cross posted to r/twoxchromosom...,kf0d1g
3,"How does their excuse that it ""doesn't fit the...",kf0d1g
4,Signed and donated 5000 shares. You are a saint!,kf0d1g


In [13]:
openai=OpenAI()
def create_openai_embeddings(texts):
    embeddings = []
    for text in texts:
        response = openai.embeddings.create(
            input=text,
            model="text-embedding-ada-002"
        )
        embedding = response.data[0].embedding
        embeddings.append(embedding)
    return embeddings

In [14]:
def preprocess_text(text):
    # Tokenize text
    tokens = word_tokenize(text.lower())
    # Remove punctuation and stop words
    tokens = [word for word in tokens if word.isalnum() and word not in stopwords.words('english')]
    return ' '.join(tokens)

In [15]:
posts_df['clean_post_title']=posts_df['post_title'].apply(preprocess_text)
# posts_df['clean_self_text']=posts_df['selftext'].apply(preprocess_text)
# comments_df['clean_comment']=comments_df['comment'].apply(preprocess_text)

In [16]:
posts_df.head()

Unnamed: 0,post_id,post_title,selftext,clean_post_title
0,kf0d1g,Please sign petition to get nerves in the clit...,"Hey! I spoke with them on the phone, and they ...",please sign petition get nerves clitoris inclu...
1,dwup3z,I know this is apart of the standardized proce...,,know apart standardized procedure still kind f...
2,hamqgj,I'm sure every person here relates.,,sure every person relates
3,j8mxtl,🩸,,
4,mqc0v5,why is this so true?,,true


In [17]:
comments_df.head()

Unnamed: 0,comment,post_id
0,Oh no. Oh no NO. I’m so sorry. Signed sealed d...,kf0d1g
1,Why are vaginas still taboo??? I don’t get it.,kf0d1g
2,*Has this been cross posted to r/twoxchromosom...,kf0d1g
3,"How does their excuse that it ""doesn't fit the...",kf0d1g
4,Signed and donated 5000 shares. You are a saint!,kf0d1g


In [18]:
posts_df['titleEmbedding'] = create_openai_embeddings(posts_df['post_title'].tolist())

In [19]:
def set_embeddings(driver, posts_df):
    query = """
    UNWIND $rows AS row
    MATCH (post:Post {post_id: row.post_id})
    SET post.titleEmbedding = row.titleEmbedding
    """
    rows = posts_df.to_dict('records')
    with driver.session(database=NEO4J_DATABASE) as session:
        session.run(query, rows=rows)

set_embeddings(driver, posts_df)

  with driver.session(database=NEO4J_DATABASE) as session:


In [20]:
def create_vector_index(driver):
    query = """
    CREATE VECTOR INDEX post_title_embeddings IF NOT EXISTS
    FOR (post:Post) ON (post.titleEmbedding) 
    OPTIONS { indexConfig: {
      `vector.dimensions`: 1536,
      `vector.similarity_function`: 'cosine'
    }}
    """
    with driver.session(database=NEO4J_DATABASE) as session:
        session.run(query)

create_vector_index(driver)

  with driver.session(database=NEO4J_DATABASE) as session:
Failed to read from defunct connection IPv4Address(('localhost', 7687)) (ResolvedIPv4Address(('127.0.0.1', 7687)))


SessionExpired: Failed to read from defunct connection IPv4Address(('localhost', 7687)) (ResolvedIPv4Address(('127.0.0.1', 7687)))

In [None]:
def encode_question_openai(question):
    response = openai.embeddings.create(
        input=question,
        model='text-embedding-ada-002'
    )
    return response.data[0].embedding

In [None]:
def query_neo4j(driver, question_embedding, top_k=5):
    query = """
    WITH $question_embedding AS question_embedding
    CALL db.index.vector.queryNodes(
        'post_title_embeddings', 
        $top_k, 
        question_embedding
        ) YIELD node AS post, score
    RETURN post.post_title AS title, post.selftext AS text, score
    """
    with driver.session(database=NEO4J_DATABASE) as session:
        result = session.run(query, question_embedding=question_embedding, top_k=top_k)
        return pd.DataFrame([record.data() for record in result])

In [None]:
question = "Which posts are about clitoris?"

In [None]:
question_embedding = encode_question_openai(question)

In [None]:
results = query_neo4j(driver, question_embedding, top_k=5)

In [None]:
print(results)