# Graph Building Notebook

Author: Christian Spiteri Gauci

## Method

* The graph data produced by the post_feature_engineering step is loaded
* Nodes - every subreddit and every word2, where word2 is a noun/pronoun and each subreddit represents a mental health disorder
* Every entry is processed in turn: if a node already exists it is reused, otherwise a new one is created
* Word2 (the noun) is compared with the subreddit names - if it matches one, that subreddit node is used, so the relationship points from a subreddit node to itself or to another subreddit node
* Word1 (the verb) is the relationship between the nodes (subreddit and word2)
* The direction of the relationship is determined by the dependency label of the sentence - dobj or nsubj





In [17]:
import pandas as pd
import numpy as np
from py2neo import Graph, Node, Relationship
import ast
import os
from dotenv import load_dotenv
from os import getenv
from concurrent.futures import ThreadPoolExecutor
import gc
import concurrent.futures
from tqdm import tqdm

In [18]:
# Load the graph data written by the post-feature-engineering step.
graph_data_path = "./data/GraphData.csv"
df = pd.read_csv(graph_data_path)

In [19]:
# Number of rows to load into the graph while prototyping.
# (To sample a fraction instead, use df.sample(frac=..., random_state=42).)
num_rows = 500

# Clamp the sample size: DataFrame.sample raises when asked for more rows than
# the frame holds, so a dataset smaller than num_rows is simply used whole.
# The fixed random_state keeps the sample reproducible between runs.
sampled_df = df.sample(n=min(num_rows, len(df)), random_state=42)

# Keep a copy of the sample on disk next to the full dataset.
sampled_df.to_csv("./data/GraphData_sampled.csv", index=False)

Dependency labels as used by spaCy: https://github.com/clir/clearnlp-guidelines/blob/master/md/specifications/dependency_labels.md

* nsubj - nominal subject (the verb's action is performed by the nominal subject)
* dobj - direct object (the verb's action is performed on the direct object)

The dependency label is used to determine the direction of the relationship.

In [22]:
# Read the Neo4j credentials from the environment (.env file) so that no
# password is stored in the notebook itself.
load_dotenv()
client_id = os.getenv('N4J_USER')
client_secret = os.getenv('N4J_PW')
if not client_id or not client_secret:
    raise RuntimeError("N4J_USER and N4J_PW must be set (e.g. in a .env file)")

graph = Graph(uri="bolt://localhost:7687", auth=(client_id, client_secret))

# Empty the database first so that re-running this cell does not add the
# sample on top of a previous load.
graph.delete_all()

# Every node created so far, keyed by its lower-cased name. Subreddit and word
# nodes share this cache, so a word that equals a subreddit name already in
# the cache resolves to that subreddit's node.
existing_nodes = {}
tx = graph.begin()

try:
    for row in tqdm(sampled_df.itertuples(), total=len(sampled_df), desc="Processing Rows"):
        # Reuse the Subreddit node if it was already created, otherwise create it.
        subreddit_name = row.Subreddit.lower()
        subreddit_node = existing_nodes.get(subreddit_name)
        if subreddit_node is None:
            subreddit_node = Node("Mental_Health_Disorder", name=subreddit_name)
            tx.create(subreddit_node)
            existing_nodes[subreddit_name] = subreddit_node

        # Resolve the Word2 node: the row's own subreddit node when the names
        # match, an already-known node when cached, otherwise a new Word node.
        word2_lowercase = row.Word2.lower()
        if word2_lowercase == subreddit_name:
            word2_node = subreddit_node
        else:
            word2_node = existing_nodes.get(word2_lowercase)
            if word2_node is None:
                word2_node = Node("Word", name=word2_lowercase)
                tx.create(word2_node)
                existing_nodes[word2_lowercase] = word2_node

        # The dependency label decides the direction of the relationship:
        # 'dobj' -> Subreddit to Word2, 'nsubj' -> Word2 to Subreddit.
        # Rows with any other label contribute nodes but no relationship.
        if row.Dependency == 'dobj':
            start_node, end_node = subreddit_node, word2_node
        elif row.Dependency == 'nsubj':
            start_node, end_node = word2_node, subreddit_node
        else:
            continue

        # MHlabels holds a dict literal; its entries become relationship
        # properties. It is parsed only for rows that produce a relationship.
        mh_labels_dict = ast.literal_eval(row.MHlabels)
        relationship = Relationship(start_node, row.Word1, end_node, **mh_labels_dict)
        tx.create(relationship)
except Exception:
    # Do not leave a half-built transaction open when a row fails.
    graph.rollback(tx)
    raise
else:
    graph.commit(tx)


Processing Rows:   0%|          | 0/500 [00:00<?, ?it/s]

Processing Rows: 100%|██████████| 500/500 [00:00<00:00, 605.46it/s]


In [24]:
# Read the Neo4j credentials from the environment (.env file) so that no
# password is stored in the notebook itself.
load_dotenv()
client_id = os.getenv('N4J_USER')
client_secret = os.getenv('N4J_PW')
if not client_id or not client_secret:
    raise RuntimeError("N4J_USER and N4J_PW must be set (e.g. in a .env file)")

graph = Graph(uri="bolt://localhost:7687", auth=(client_id, client_secret))

# Empty the database before the full load so earlier runs are not duplicated.
graph.delete_all()

# Function to process a batch of rows
def process_batch(graph, existing_nodes, batch):
    """Write one batch of rows to the graph inside a single transaction.

    For every row a subreddit node and a Word2 node are looked up in
    ``existing_nodes`` (keyed by lower-cased name) and created when missing.
    A relationship typed by ``row.Word1`` is then created between them:
    subreddit -> word for ``'dobj'`` rows, word -> subreddit for ``'nsubj'``
    rows. Rows with any other dependency label add nodes but no relationship.

    Args:
        graph: Graph handle providing ``begin``, ``commit`` and ``rollback``.
        existing_nodes: Mutable dict used as a name -> node cache; updated in
            place with every node this call creates.
        batch: Object exposing ``itertuples()`` whose rows carry the
            attributes ``Subreddit``, ``Word1``, ``Word2``, ``Dependency`` and
            ``MHlabels`` (a string holding a dict literal).

    Raises:
        Exception: Any error raised while processing a row is re-raised after
            the transaction has been rolled back and the cache entries added
            by this call have been removed.

    NOTE(review): ``existing_nodes`` is accessed without locking and may hold
    nodes created in another, not yet committed transaction - confirm this is
    safe before calling from several threads at once.
    """
    tx = graph.begin()
    # Keys added by this call, so they can be withdrawn from the cache if the
    # transaction is rolled back and the nodes never reach the database.
    added_keys = []

    try:
        for row in batch.itertuples():
            # Reuse the Subreddit node if it is cached, otherwise create it.
            subreddit_name = row.Subreddit.lower()
            subreddit_node = existing_nodes.get(subreddit_name)
            if subreddit_node is None:
                subreddit_node = Node("Mental_Health_Disorder", name=subreddit_name)
                tx.create(subreddit_node)
                existing_nodes[subreddit_name] = subreddit_node
                added_keys.append(subreddit_name)

            # Resolve the Word2 node: the row's own subreddit node when the
            # names match, a cached node when known, otherwise a new Word node.
            word2_lowercase = row.Word2.lower()
            if word2_lowercase == subreddit_name:
                word2_node = subreddit_node
            else:
                word2_node = existing_nodes.get(word2_lowercase)
                if word2_node is None:
                    word2_node = Node("Word", name=word2_lowercase)
                    tx.create(word2_node)
                    existing_nodes[word2_lowercase] = word2_node
                    added_keys.append(word2_lowercase)

            # The dependency label decides the direction of the relationship.
            if row.Dependency == 'dobj':
                start_node, end_node = subreddit_node, word2_node
            elif row.Dependency == 'nsubj':
                start_node, end_node = word2_node, subreddit_node
            else:
                continue

            # Parsed only for rows that actually produce a relationship; the
            # dict entries become the relationship's properties.
            mh_labels_dict = ast.literal_eval(row.MHlabels)
            tx.create(Relationship(start_node, row.Word1, end_node, **mh_labels_dict))
    except Exception:
        # Forget nodes that will never be committed, then undo the batch.
        for key in added_keys:
            existing_nodes.pop(key, None)
        graph.rollback(tx)
        raise

    graph.commit(tx)


# The graph connection opened at the top of this cell is reused here; it is
# not re-opened with a password written into the notebook.
existing_nodes = {}
num_workers = 6
# Keep the chunk size at 1 or more: a frame shorter than num_workers would
# otherwise give a step of 0, which range() rejects.
batch_size = max(1, len(df) // num_workers)

# NOTE(review): existing_nodes is shared by all worker threads without locking,
# and each worker runs its own transaction - confirm py2neo tolerates nodes
# from one uncommitted transaction being referenced in another.
with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
    for i in tqdm(range(0, len(df), batch_size), desc="Processing Batches"):
        batch = df.iloc[i:i+batch_size]

        # Split the chunk into one contiguous slice per worker by position,
        # skipping empty slices when the chunk has fewer rows than workers.
        sub_batches = [
            batch.iloc[positions]
            for positions in np.array_split(np.arange(len(batch)), num_workers)
            if len(positions)
        ]
        futures = [
            executor.submit(process_batch, graph, existing_nodes, sub_batch)
            for sub_batch in sub_batches
        ]

        # Finish the whole chunk before starting the next one; result()
        # re-raises any exception a worker hit instead of silently dropping it.
        for future in concurrent.futures.as_completed(futures):
            future.result()

Processing Batches:   0%|          | 0/7 [00:00<?, ?it/s]

In [None]:
# Fetch every node currently stored in the graph and display the records.
query = "MATCH (n) return n"
cursor = graph.run(query)
cursor.data()