In [13]:
import pandas as pd
import numpy as np
from py2neo import Graph, Node, Relationship

In [14]:
# Load the full set of dependency triples. The printed sample further down
# shows the columns Subreddit, Word1, Dependency, Word2 and MHlabels.
df = pd.read_csv(filepath_or_buffer="./data/GraphData.csv")

In [38]:
# Draw a reproducible random subset of `df` so the graph stays small enough
# to load and inspect quickly. `num_rows` is the requested sample size.
num_rows = 500

# Cap the request at the number of available rows: DataFrame.sample raises a
# ValueError when asked for more rows than exist (sampling without
# replacement). With enough rows the result is identical to sampling
# `num_rows` directly.
# To sample by fraction instead: df.sample(frac=<fraction>, random_state=42)
sample_size = min(num_rows, len(df))
sampled_df = df.sample(n=sample_size, random_state=42)
print(sampled_df.head(5))

# Persist the sample (without the index column) so the exact subset that was
# loaded into the graph can be inspected or reused later.
sampled_df.to_csv("./data/GraphData_sampled.csv", index=False)


     Subreddit   Word1 Dependency     Word2  \
3163       BPD    said      nsubj     swore   
1893       BPD   ended      nsubj  marriage   
3690       BPD     saw      nsubj   periods   
1703       BPD  quotes      nsubj     https   
5210       BPD  period      nsubj    people   

                                               MHlabels  
3163                                                 {}  
1893                                                 {}  
3690                                                 {}  
1703  {'NEURO-DEVELOPMENTAL DISORDERS': ['autism'], ...  
5210                                                 {}  


In [39]:
from dotenv import load_dotenv
import os

# Neo4j credentials are read from the environment (populated from a local
# .env file by load_dotenv) under the keys N4J_USER and N4J_PW. Never write
# the actual values into this notebook, not even in a comment.
load_dotenv()

client_id = os.getenv('N4J_USER')
client_secret = os.getenv('N4J_PW')
assert client_id is not None and client_secret is not None, (
    "N4J_USER and N4J_PW must be set (e.g. in a .env file next to this notebook)"
)

graph = Graph(uri="bolt://localhost:7687", auth=(client_id, client_secret))

# WARNING: this wipes every node and relationship in the target database, so
# each run of this cell rebuilds the graph from scratch.
graph.delete_all()

# Cache of nodes created in this run, keyed by (label, name). Including the
# label in the key keeps a Word and a Subreddit that share the same text
# (e.g. the word "depression" and the subreddit "depression") as two
# distinct nodes instead of silently reusing the wrong one.
existing_nodes = {}


def _get_or_create_node(tx, cache, label, name):
    """Return the cached node for (label, name), creating it in `tx` if needed.

    Parameters:
        tx: open py2neo transaction in which a new node is created.
        cache: dict mapping (label, name) to an already created Node.
        label: node label, e.g. "Subreddit" or "Word".
        name: value stored in the node's `name` property.

    Returns:
        The Node for (label, name); the same object on every call with the
        same arguments and cache.
    """
    key = (label, name)
    node = cache.get(key)
    # Explicit None check so the lookup does not depend on the truthiness of
    # a py2neo Node object.
    if node is None:
        node = Node(label, name=name)
        tx.create(node)
        cache[key] = node
    return node


tx = graph.begin()
try:
    for row in sampled_df.itertuples():
        subreddit_node = _get_or_create_node(tx, existing_nodes, "Subreddit", row.Subreddit)
        word2_node = _get_or_create_node(tx, existing_nodes, "Word", row.Word2)

        # The relationship type is the text of Word1; its direction depends
        # on the 'Dependency' column.
        if row.Dependency == 'dobj':
            # 'dobj': (Subreddit) -[Word1]-> (Word2)
            relationship = Relationship(subreddit_node, row.Word1, word2_node)
        elif row.Dependency == 'nsubj':
            # 'nsubj': (Word2) -[Word1]-> (Subreddit)
            relationship = Relationship(word2_node, row.Word1, subreddit_node)
        else:
            # Any other dependency type gets no relationship. The two nodes
            # created above for this row still remain in the graph.
            continue

        tx.create(relationship)
except Exception:
    # Do not leave a half-built transaction open on the server if anything
    # in the loop fails; undo it and surface the original error.
    graph.rollback(tx)
    raise
else:
    graph.commit(tx)

In [37]:
# Sanity check after loading: fetch up to ten Subreddit nodes from the graph
# and show them as a list of dicts (one dict per returned record).
query = "MATCH (n:Subreddit) RETURN n LIMIT 10"
subreddit_cursor = graph.run(query)
subreddit_cursor.data()

# To list every node in the graph instead, run:
# query = "MATCH (n) return n"
# graph.run(query).data()

[{'n': Node('Subreddit', name='Anxiety')},
 {'n': Node('Subreddit', name='BPD')},
 {'n': Node('Subreddit', name='depression')},
 {'n': Node('Subreddit', name='schizophrenia')},
 {'n': Node('Subreddit', name='mentalillness')},
 {'n': Node('Subreddit', name='bipolar')}]