Suppose that you would like to implement a tool that supports message writing, suggesting the next word while you are typing. Moreover, suppose that you would like the tool to learn from you or from a specific set of documents. Such a tool could be useful not only for providing message-writing assistance, but also for supporting spell checking, extracting common phrases, summarizing, and so on.

# Load Data

In [None]:
import os

from py2neo import Graph

# Connect to database
url = "bolt://localhost:7687"
username = os.getenv("NEO4J_USER")
password = os.getenv("NEO4J_PASSWORD")
database = "masc"

graph = Graph(url, auth=(username, password), name=database)

In [None]:
add_constraint = "CREATE CONSTRAINT FOR (w:Word) REQUIRE w.value IS UNIQUE;"
graph.run(add_constraint)

In [None]:
# doesn't work here use desktop
load_data_1 = """
LOAD CSV FROM "file:///masc_sentences.tsv" AS line FIELDTERMINATOR '\t'
CALL {
    WITH line
    WITH line[6] as sentence
    WITH split(sentence, " ") as words
    FOREACH ( idx IN range(0,size(words)-2) |
    MERGE (w1:Word {value:apoc.text.clean(words[idx])})
    MERGE (w2:Word {value:apoc.text.clean(words[idx+1])})
    MERGE (w1)-[r:NEXT]->(w2)
      ON CREATE SET r.weight = 1
      ON MATCH SET r.weight = r.weight + 1)
} IN TRANSACTIONS OF 500 ROWS
"""
# without cleaning
# Added 170037 labels, created 170037 nodes, set 2553105 properties, created 929137 relationships, completed after 261495 ms.
# with cleaning
# Added 97209 labels, created 97209 nodes, set 2480277 properties, created 806397 relationships, completed after 229257 ms.


# New importing query that uses the sentence identifier
# The word nodes are unique, so if you have millions of sentences, this schema will create supernodes—that is,
# nodes with millions of relationships coming in, going out, or both.
load_data = """
LOAD CSV FROM "file:///masc_sentences.tsv" AS line FIELDTERMINATOR '\t'
CALL {
    WITH line
    WITH line[6] as sentence, line[2] as sentenceId
    WITH split(sentence," ") as words, sentenceId
    FOREACH ( idx IN range(0,size(words)-2) |
    MERGE (w1:Word {value:apoc.text.clean(words[idx])})
    MERGE (w2:Word {value:apoc.text.clean(words[idx+1])})
    CREATE (w1)-[r:NEXT {sentence: sentenceId}]->(w2))
} IN TRANSACTIONS OF 500 ROWS
"""
# Added 97209 labels, created 97209 nodes, set 2480277 properties, created 2383068 relationships, completed after 93746 ms.
delete = """
CALL apoc.periodic.iterate(
"MATCH (p:Word) RETURN p",
"DETACH DELETE p", {batchSize:500})
"""
# Had to add masc_sentences.tsv to the DB server
graph.run(load_data)

# Analysis

In [None]:
query = """
MATCH (w:Word {value: "how"})-[e:NEXT]->(w2:Word)
RETURN w2.value as next, e.weight as frequency
ORDER BY frequency desc
LIMIT 3
"""

res = graph.run(query).to_data_frame()
res

In [None]:
query = """
MATCH (w:Word)-[e:NEXT]->(w2:Word)
RETURN apoc.text.clean(w2.value) as next, e.weight as frequency
ORDER BY frequency desc
LIMIT 100
"""
res = graph.run(query).to_data_frame()
res

### sentence

In [None]:
query = """
MATCH (w2:Word {value: "know"})-[r:NEXT]->(w3:Word {value: "how"})-[e:NEXT]-> (w4:Word)
WHERE r.sentence = e.sentence
RETURN w4.value as next, count(DISTINCT r) as frequency
ORDER BY frequency desc
LIMIT 3
"""
query = """
MATCH (w1:Word {value: "you"})-[a:NEXT]->(w2:Word {value: "know"})-[r:NEXT]->(w3:Word {value: "how"})-[e:NEXT]->(w4:Word)
WHERE a.sentence = r.sentence AND r.sentence = e.sentence
RETURN w4.value as next, count(DISTINCT r) as frequency
ORDER BY frequency desc
LIMIT 3
"""
res = graph.run(query).to_data_frame()
res

Simple Spacy Example

In [None]:



def tokenize(text):
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)
    i = 1
    for sentence in doc.sents:
        print("-------- Sentence ", i, "-----------")
        i += 1
        for token in sentence:
            print(token.idx, "-", token.text, "-", token.lemma_, "-", token.tag_)


sentence = "Marie Curie received the Nobel Prize in Physic in 1903. She became the first woman to win the prize."
tokenize(sentence)

In [None]:
import stanza


# stanza.download("en")

def tokenize(text):
    nlp = stanza.Pipeline()
    # models_dir='stanfordnlp_resources')  # This sets up a default neural pipeline in English
    doc = nlp(text)
    i = 1
    for sentence in doc.sentences:
        print("--------Sentence ", i, "-----------")
        i += 1
        for token in sentence.tokens:
            # print(token.pretty_print())
            print(token.id, "-", token.text, "-", token.words[0].lemma)


tokenize("Barack Obama was born in Hawaii.  He was elected president in 2008.")

In [None]:
# Clean up memory
import torch

import gc


def report_gpu():
    print(torch.cuda.list_gpu_processes())
    gc.collect()
    torch.cuda.empty_cache()


report_gpu()

# Dataframe Ingestion Version

In [28]:

from neo4j import GraphDatabase
import pandas as pd
import spacy
import os

uri = "bolt://localhost:7687"
user = os.getenv("NEO4J_USER")
password = os.getenv("NEO4J_PASSWORD")

driver = GraphDatabase.driver(uri, auth=(user, password))


def get_session(database):
    return driver.session(database=database)


text = "Marie Curie received the Nobel Prize in Physic in 1903. She became the first woman to win the prize."
df = pd.DataFrame([text], columns=["raw_text"])


def execute_query(query, params):
    results = []
    with get_session("spacy") as session:
        for items in session.run(query, params):
            item = items["result"]
            results.append(item)

    return results


def create_annotated_text(nlp_json, id=1):
    query = """MERGE (ann:AnnotatedText {id: $id, nlp_json: $nlp_json})
        RETURN id(ann) as result
    """
    params = {"id": id, "nlp_json": nlp_json}
    results = execute_query(query, params)
    return results[0]


nlp = spacy.load("en_core_web_sm")


def create_graph_object(row):
    raw_text = row["raw_text"]
    for doc in nlp.pipe([raw_text], disable=["ner"]):
        row["nlp_json"] = str(doc.to_json())
        annotated_text = create_annotated_text(row["nlp_json"])
        i = 1
        for sentence in doc.sents:
            print("-------- Sentence ", i, "-----------")
            print(annotated_text)
            print(sentence.text)
            # self.store_sentence(sentence, annotated_text, text_id, i, store_tag)
            i += 1

    return row


df = df.apply(create_graph_object, axis=1)

Executing query:
 MERGE (ann:AnnotatedText {id: $id, nlp_json: $nlp_json})
        RETURN id(ann) as result
    
with params:  {'id': 1, 'nlp_json': "{'text': 'Marie Curie received the Nobel Prize in Physic in 1903. She became the first woman to win the prize.', 'sents': [{'start': 0, 'end': 55}, {'start': 56, 'end': 100}], 'tokens': [{'id': 0, 'start': 0, 'end': 5, 'tag': 'NNP', 'pos': 'PROPN', 'morph': 'Number=Sing', 'lemma': 'Marie', 'dep': 'compound', 'head': 1}, {'id': 1, 'start': 6, 'end': 11, 'tag': 'NNP', 'pos': 'PROPN', 'morph': 'Number=Sing', 'lemma': 'Curie', 'dep': 'nsubj', 'head': 2}, {'id': 2, 'start': 12, 'end': 20, 'tag': 'VBD', 'pos': 'VERB', 'morph': 'Tense=Past|VerbForm=Fin', 'lemma': 'receive', 'dep': 'ROOT', 'head': 2}, {'id': 3, 'start': 21, 'end': 24, 'tag': 'DT', 'pos': 'DET', 'morph': 'Definite=Def|PronType=Art', 'lemma': 'the', 'dep': 'det', 'head': 5}, {'id': 4, 'start': 25, 'end': 30, 'tag': 'NNP', 'pos': 'PROPN', 'morph': 'Number=Sing', 'lemma': 'Nobel', 'd