# Graph Building Notebook

## Method

* The graph data produced by the post_feature_engineering step is loaded
* Nodes - every subreddit and every Word2, where Word2 is a noun/pronoun and each subreddit represents a mental health disorder
* For every entry: if the node already exists, reuse it; otherwise create a new one
* Word2 (the noun) is compared with the subreddit names - if it matches one, that subreddit node is used instead of a new word node, so a subreddit can point to itself or to another subreddit node
* Word1 (the verb) is the relationship between the nodes (subreddit and Word2)
* The direction of the relationship is determined by the dependency label of the sentence - dobj or nsubj





In [1]:
import pandas as pd
import numpy as np
from py2neo import Graph, Node, Relationship
import ast
import os
from dotenv import load_dotenv
from os import getenv

In [2]:
# Load the graph data CSV into a DataFrame; the cells below read it as `df`.
graph_data_path = "./data/GraphData.csv"
df = pd.read_csv(graph_data_path)

In [5]:
# Draw a reproducible sample of the graph data for quick experiments.
# To sample by percentage instead of by row count, use:
#   sampled_df = df.sample(frac=0.001, random_state=42)
num_rows = 500

# DataFrame.sample (without replacement) raises ValueError when n exceeds the
# number of rows, so cap the request at the size of the frame.
sampled_df = df.sample(n=min(num_rows, len(df)), random_state=42)
print(sampled_df.head(5))

# Save the sample next to the full data set, without the index column.
sampled_df.to_csv("./data/GraphData_sampled.csv", index=False)



        Subreddit     Word1 Dependency     Word2  \
6193596   Anxiety   knowing       dobj       day   
1973406       BPD   boosted       dobj  finances   
5840048   Anxiety    taking       dobj     carbs   
6722662   Anxiety  universe      nsubj     death   
2766017       BPD     heard      nsubj      help   

                                                  MHlabels  
6193596  {'SYMPTOMS': ['anxiety', 'panic attack'], 'SLE...  
1973406  {'ANXIETY DISORDERS': ['anxiety'], 'DEPRESSIVE...  
5840048  {'SYMPTOMS': ['anxiety'], 'ANXIETY DISORDERS':...  
6722662                                                 {}  
2766017                            {'SYMPTOMS': ['anger']}  


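From spaCy's dependency labels: https://github.com/clir/clearnlp-guidelines/blob/master/md/specifications/dependency_labels.md

* nsubj - nominal subject (the verb's action is performed by the nominal subject)
* dobj - direct object (the verb's action is performed on the direct object)

This is used to determine the direction of the relationship: for dobj the relationship goes from the subreddit to Word2, and for nsubj it goes from Word2 to the subreddit.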

In [15]:
# Build the Neo4j graph: one node per subreddit and per Word2, and one
# relationship (typed by Word1) per row of the data frame.

# Read the Neo4j credentials from the environment (optionally populated from a
# local .env file) so that no password is stored in the notebook.
load_dotenv()
neo4j_user = os.getenv('N4J_USER')
neo4j_password = os.getenv('N4J_PW')
if not neo4j_user or not neo4j_password:
    raise RuntimeError(
        "Neo4j credentials not found: set N4J_USER and N4J_PW "
        "(for example in a .env file) before running this cell."
    )

graph = Graph(uri="bolt://localhost:7687", auth=(neo4j_user, neo4j_password))

# Frame to load into the graph; swap in `sampled_df` for a quick trial run.
source_df = df
# source_df = sampled_df

# Wipe the database before loading.
graph.delete_all()
existing_nodes = {}  # lowercase name -> Node created during this run
tx = graph.begin()
# NOTE(review): every create below goes into this single transaction; for the
# full data set it may be worth committing in batches - confirm against Neo4j
# memory limits.

# Create every subreddit node before touching the rows. Subreddit and word
# nodes share `existing_nodes`, so without this pass a Word2 that is seen
# before its namesake subreddit would be created as a "Word" node and later
# be reused as that subreddit's node, making the labels depend on row order.
for subreddit_name in source_df["Subreddit"].dropna().str.lower().unique():
    subreddit_node = Node("Mental_Health_Disorder", name=subreddit_name)
    tx.create(subreddit_node)
    existing_nodes[subreddit_name] = subreddit_node

for row in source_df.itertuples():
    # Every subreddit was registered in the pass above.
    subreddit_node = existing_nodes[row.Subreddit.lower()]

    # Reuse the node for Word2 if one exists. A Word2 that equals a subreddit
    # name (this row's or another row's) resolves to that subreddit node.
    word2_lowercase = row.Word2.lower()
    word2_node = existing_nodes.get(word2_lowercase)
    if word2_node is None:
        word2_node = Node("Word", name=word2_lowercase)
        tx.create(word2_node)
        existing_nodes[word2_lowercase] = word2_node

    # MHlabels holds a dict literal as text; its entries become the
    # properties of the relationship.
    mh_labels_dict = ast.literal_eval(row.MHlabels)

    # The dependency label decides the direction of the relationship.
    if row.Dependency == 'dobj':
        # Direct object: Subreddit -[Word1]-> Word2
        relationship = Relationship(subreddit_node, row.Word1, word2_node, **mh_labels_dict)
    elif row.Dependency == 'nsubj':
        # Nominal subject: Word2 -[Word1]-> Subreddit
        relationship = Relationship(word2_node, row.Word1, subreddit_node, **mh_labels_dict)
    else:
        # Any other dependency label produces no relationship.
        continue

    tx.create(relationship)

graph.commit(tx)


In [None]:
# Query to return all nodes in the graph
query = "MATCH (n) return n"
graph.run(query).data()

[{'n': Node('Mental_Health_Disorder', name='Anxiety')},
 {'n': Node('Word', name='day')},
 {'n': Node('Mental_Health_Disorder', name='BPD')},
 {'n': Node('Word', name='finances')},
 {'n': Node('Word', name='carbs')},
 {'n': Node('Word', name='death')},
 {'n': Node('Word', name='help')},
 {'n': Node('Word', name='way')},
 {'n': Node('Mental_Health_Disorder', name='depression')},
 {'n': Node('Word', name='freak')},
 {'n': Node('Mental_Health_Disorder', name='schizophrenia')},
 {'n': Node('Mental_Health_Disorder', name='mentalillness')},
 {'n': Node('Word', name='reality')},
 {'n': Node('Word', name='ups')},
 {'n': Node('Word', name='audience')},
 {'n': Node('Word', name='hate')},
 {'n': Node('Word', name='wishes')},
 {'n': Node('Word', name='birthday')},
 {'n': Node('Word', name='want')},
 {'n': Node('Word', name='deal')},
 {'n': Node('Word', name='freaking')},
 {'n': Node('Word', name='stop')},
 {'n': Node('Mental_Health_Disorder', name='bipolar')},
 {'n': Node('Word', name='organizatio