# Create DB for all triples

In [2]:
import pandas as pd
from tqdm.notebook import tqdm
import os
from sqlalchemy import create_engine

engine = create_engine('sqlite:///Download/main.db')

In [22]:
PATH = "Download/Cleaned Data/"

for file in os.listdir(PATH):
    
    df = pd.read_csv(os.path.join(PATH,file))

    df.to_sql(os.path.splitext(file)[0], con=engine, if_exists='replace')

In [24]:
engine.table_names()

['Confession',
 'Confessions',
 'CreepyPasta',
 'NoSleep',
 'Self',
 'ShortScaryStories',
 'SpanishText',
 'StoriesAboutKevin',
 'TIFU']

In [27]:
df = pd.read_sql_table('NoSleep', 'sqlite:///Download/main.db')  
# df['selftext'].to_list()
df

Unnamed: 0,index,created_utc,id,score,selftext,subreddit,title,title_language,selftext_language
0,0,1269397810,bhef5,24,Hey guys! Thanks for checking out /r/NoSleep! ...,nosleep,Welcome to /r/NoSleep!,nl,en
1,1,1269458850,bhq4w,68,This is copypasta from an earlier post... no w...,nosleep,The only possible supernatural experience I've...,en,en
2,2,1269476233,bhtlo,19,Reposting at the request of another. This was ...,nosleep,The Longhorn,en,en
3,3,1269479929,bhu80,29,"So, after looking at some other subreddits, I ...",nosleep,/r/NoSleep Rules,af,en
4,4,1269481302,bhufy,18,This is copy and pasted from the thread that s...,nosleep,My dad's story,cy,en
...,...,...,...,...,...,...,...,...,...
173274,173274,1591452441,gxrm8v,1,"""Any sign of 'em yet?"" \n\nI continued staring...",nosleep,I'm tasked with killing nameless things out in...,en,en
173275,173275,1591452568,gxrnj7,1,They say the devil is in the details. Well th...,nosleep,"I picked up a hitchhiker by mistake, now he's ...",en,en
173276,173276,1591453707,gxrytp,1,It is hard for me to talk about my old friend ...,nosleep,My Best Friend Saw Bugs Under His Skin,de,en
173277,173277,1591454493,gxs6jf,1,This is the only rule of our household. If you...,nosleep,Do NOT open your eyes. (The Beginning),en,en


In [30]:
import spacy
nlp = spacy.load('en_core_web_lg')

from openie import StanfordOpenIE
import truecase
from tqdm.notebook import tqdm

with StanfordOpenIE() as client:
    for table in tqdm(engine.table_names()):
        df = pd.read_sql_table(table, 'sqlite:///Download/main.db').head(1000)
        df_li = []
        for story_id, story in tqdm(zip(df['id'].to_list(), df['selftext'].to_list()), total = len(df['id'].to_list())):
            doc = nlp(story)
            for sent in doc.sents:
                original_text = sent.text
                truecase_text = truecase.get_true_case(original_text)
                text = truecase_text[0].lower() + truecase_text[1:]
                for triple in client.annotate(text):
                    triple['id'] = story_id
                    triple['subreddit'] = table
                    triple['original sent'] = text
                    df_li.append(triple)
            
        kg_df = pd.DataFrame(df_li)
        kg_df.to_sql(f"{table}_kg", con=engine, if_exists='replace')

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

Starting server with command: java -Xmx8G -cp /home/anthony/stanfordnlp_resources/stanford-corenlp-full-2018-10-05/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 60000 -threads 5 -maxCharLength 100000 -quiet True -serverProperties corenlp_server-b7156e21bcdc44a4.props -preload openie


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/144 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

In [2]:
engine.table_names()

['Confession',
 'Confession_kg',
 'Confessions',
 'Confessions_kg',
 'CreepyPasta',
 'CreepyPasta_kg',
 'NoSleep',
 'NoSleep_kg',
 'Self',
 'Self_kg',
 'ShortScaryStories',
 'ShortScaryStories_kg',
 'SpanishText',
 'SpanishText_kg',
 'StoriesAboutKevin',
 'StoriesAboutKevin_kg',
 'TIFU',
 'TIFU_kg']

In [3]:
pd.DataFrame(engine.execute("SELECT * FROM NoSleep_kg LIMIT 5").fetchall())

Unnamed: 0,0,1,2,3,4,5,6
0,0,i,would really like,see,bhef5,NoSleep,i would really like to see this get off the gr...
1,1,i,would like,see,bhef5,NoSleep,i would really like to see this get off the gr...
2,2,I,love,scared,bhef5,NoSleep,* I love being scared
3,3,I,being,scared,bhef5,NoSleep,* I love being scared
4,4,you,being,scared,bhef5,NoSleep,* you love being scared


# Generate knowledge graph and support queries using Neo4j¶

In [1]:
from py2neo import Graph
from py2neo import Node, Relationship

In [6]:
df = pd.DataFrame(engine.execute("SELECT * FROM NoSleep_kg").fetchall())
df

Unnamed: 0,0,1,2,3,4,5,6
0,0,i,would really like,see,bhef5,NoSleep,i would really like to see this get off the gr...
1,1,i,would like,see,bhef5,NoSleep,i would really like to see this get off the gr...
2,2,I,love,scared,bhef5,NoSleep,* I love being scared
3,3,I,being,scared,bhef5,NoSleep,* I love being scared
4,4,you,being,scared,bhef5,NoSleep,* you love being scared
...,...,...,...,...,...,...,...
100552,100552,it,was,Freaky,fo0mi,NoSleep,"but, it was a completely Freaky experience tha..."
100553,100553,that,has,my ghost story,fo0mi,NoSleep,that's my ghost story ...
100554,100554,anyone,ever see,her,fo0mi,NoSleep,anyone ever see her in King's wharf?
100555,100555,King,in,wharf,fo0mi,NoSleep,anyone ever see her in King's wharf?


In [19]:
len(df)

100557

In [None]:
from tqdm.notebook import tqdm
# Connect to Neo4j using corresponding <port:7687> and <password>
graph = Graph("bolt://localhost:7687", user="neo4j", password="Neo4j")
# graph.delete_all()

entities = {}

# Parse the entities and build the knowledge graph in Neo4j Database
for index, row in tqdm(df.iterrows(), total=len(df)):
    index, sub, rel, obj, story_id, subreddit, original_sent = row
    
    sub_node = graph.nodes.match(entities.get(sub, "Object"), name=sub).first()
    obj_node = graph.nodes.match("Object", name=obj).first()
    if not sub_node:
        sub_node = Node(entities.get(sub, "Object"),name = sub)
    if not obj_node:
        obj_node = Node(entities.get(obj, "Object"),name = obj)
    relation = Relationship.type(rel)    
    graph.merge(relation(sub_node, obj_node), entities.get(sub, "Object"), "name" )

  0%|          | 0/100557 [00:00<?, ?it/s]