Suppose that you would like to implement a tool that supports message writing, suggesting the next word while you are typing. Moreover, suppose that you would like the tool to learn from you or from a specific set of documents. Such a tool could be useful not only for providing message-writing assistance, but also for supporting spell checking, extracting common phrases, summarizing, and so on.

# Load Data

In [None]:
import os

from py2neo import Graph

# Connection settings for the local Neo4j server. The credentials come from
# the environment (NEO4J_USER / NEO4J_PASSWORD) rather than being stored in
# the notebook; each one is None when its variable is not set.
database = "masc"
url = "bolt://localhost:7687"
password = os.environ.get("NEO4J_PASSWORD")
username = os.environ.get("NEO4J_USER")

# Handle used by every later cell to send Cypher to the "masc" database.
graph = Graph(url, name=database, auth=(username, password))

In [None]:
# Uniqueness constraint on Word.value: guarantees one node per distinct word.
# IF NOT EXISTS makes the cell safe to re-run: without it, Neo4j raises an
# error when an equivalent constraint is already present.
add_constraint = (
    "CREATE CONSTRAINT IF NOT EXISTS FOR (w:Word) REQUIRE w.value IS UNIQUE;"
)
graph.run(add_constraint)

In [None]:
# First import variant (weighted word pairs). It is defined here for reference
# but is NOT executed by this cell; only `load_data` is run at the bottom.
# Original author's note: "doesn't work here, use desktop" -- presumably the
# query had to be run from Neo4j Desktop instead of this notebook (TODO confirm
# the reason).
#
# For every row of the TSV file the query:
#   * takes column index 6 as the sentence text and splits it on single spaces;
#   * for each pair of consecutive tokens, MERGEs one Word node per token
#     (after apoc.text.clean) and MERGEs a single NEXT relationship between
#     them, setting weight = 1 on creation and incrementing it on every later
#     occurrence of the same pair.
# Rows are committed in batches of 500.
# NOTE: this is a regular (non-raw) Python string, so '\t' reaches Cypher as a
# literal tab character.
load_data_1 = """
LOAD CSV FROM "file:///masc_sentences.tsv" AS line FIELDTERMINATOR '\t'
CALL {
    WITH line
    WITH line[6] as sentence
    WITH split(sentence, " ") as words
    FOREACH ( idx IN range(0,size(words)-2) |
    MERGE (w1:Word {value:apoc.text.clean(words[idx])})
    MERGE (w2:Word {value:apoc.text.clean(words[idx+1])})
    MERGE (w1)-[r:NEXT]->(w2)
      ON CREATE SET r.weight = 1
      ON MATCH SET r.weight = r.weight + 1)
} IN TRANSACTIONS OF 500 ROWS
"""
# Import statistics recorded by the author for the weighted variant.
# Without cleaning (tokens used as-is):
# Added 170037 labels, created 170037 nodes, set 2553105 properties, created 929137 relationships, completed after 261495 ms.
# With cleaning (tokens passed through apoc.text.clean, as in the query above):
# Added 97209 labels, created 97209 nodes, set 2480277 properties, created 806397 relationships, completed after 229257 ms.


# Second import variant: keeps the sentence identifier on every relationship.
# Same tokenisation as above, but column index 2 is read as the sentence id and
# a new NEXT relationship carrying {sentence: sentenceId} is CREATEd for every
# occurrence of a word pair. No `weight` property is set by this variant.
# The word nodes are unique, so if you have millions of sentences, this schema
# will create supernodes -- that is, nodes with millions of relationships
# coming in, going out, or both.
load_data = """
LOAD CSV FROM "file:///masc_sentences.tsv" AS line FIELDTERMINATOR '\t'
CALL {
    WITH line
    WITH line[6] as sentence, line[2] as sentenceId
    WITH split(sentence," ") as words, sentenceId
    FOREACH ( idx IN range(0,size(words)-2) |
    MERGE (w1:Word {value:apoc.text.clean(words[idx])})
    MERGE (w2:Word {value:apoc.text.clean(words[idx+1])})
    CREATE (w1)-[r:NEXT {sentence: sentenceId}]->(w2))
} IN TRANSACTIONS OF 500 ROWS
"""
# Import statistics recorded by the author for the per-sentence variant:
# Added 97209 labels, created 97209 nodes, set 2480277 properties, created 2383068 relationships, completed after 93746 ms.

# Clean-up query: removes every Word node together with its relationships,
# in batches of 500 via apoc.periodic.iterate. Defined for convenience only;
# it is not executed in this cell.
delete = """
CALL apoc.periodic.iterate(
"MATCH (p:Word) RETURN p",
"DETACH DELETE p", {batchSize:500})
"""
# The file:/// URL is resolved by the database server, so masc_sentences.tsv
# had to be copied to the server first (presumably into its import directory
# -- TODO confirm).
# Only the per-sentence variant is actually loaded.
graph.run(load_data)

# Analysis

In [None]:
# Top three words that most often follow "how".
#
# The frequency is computed as sum(coalesce(e.weight, 1)) so that the query
# works with both import schemas:
#   * weighted schema (`load_data_1`): one NEXT relationship per word pair,
#     so the sum is exactly that relationship's `weight`;
#   * per-sentence schema (`load_data`, the one actually loaded): one NEXT
#     relationship per occurrence and no `weight` property, so each
#     relationship counts as 1. Reading `e.weight` directly would return
#     null for every row in this schema.
query = """
MATCH (w:Word {value: "how"})-[e:NEXT]->(w2:Word)
RETURN w2.value as next, sum(coalesce(e.weight, 1)) as frequency
ORDER BY frequency desc
LIMIT 3
"""

res = graph.run(query).to_data_frame()
res

In [None]:
# The 100 most frequent (word -> next word) pairs in the whole graph; only the
# second word of each pair is returned, alongside the pair's frequency.
#
# Frequencies are aggregated per (w, w2) pair with sum(coalesce(e.weight, 1)):
# with the weighted schema there is a single relationship per pair, so the
# result equals its `weight`; with the per-sentence schema (no `weight`
# property) every relationship counts once. Reading `e.weight` directly
# yields null for every row in the per-sentence schema, which makes the
# ordering meaningless.
query = """
MATCH (w:Word)-[e:NEXT]->(w2:Word)
WITH w, w2, sum(coalesce(e.weight, 1)) as frequency
RETURN apoc.text.clean(w2.value) as next, frequency
ORDER BY frequency desc
LIMIT 100
"""
res = graph.run(query).to_data_frame()
res

### Sentence-aware next-word prediction

In [None]:
# Sentence-aware suggestions: the relationships of the matched path must all
# carry the same `sentence` identifier, so only word sequences observed inside
# a single sentence are counted. The frequency is the number of distinct
# "know" -> "how" relationships that satisfy the pattern.
# NOTE(review): relationships store no word position, so a shared sentence id
# does not by itself prove the words were contiguous -- confirm this is
# acceptable.

# Two words of context: "know how" -> ?
two_word_query = """
MATCH (w2:Word {value: "know"})-[r:NEXT]->(w3:Word {value: "how"})-[e:NEXT]-> (w4:Word)
WHERE r.sentence = e.sentence
RETURN w4.value as next, count(DISTINCT r) as frequency
ORDER BY frequency desc
LIMIT 3
"""

# Three words of context: "you know how" -> ?
three_word_query = """
MATCH (w1:Word {value: "you"})-[a:NEXT]->(w2:Word {value: "know"})-[r:NEXT]->(w3:Word {value: "how"})-[e:NEXT]->(w4:Word)
WHERE a.sentence = r.sentence AND r.sentence = e.sentence
RETURN w4.value as next, count(DISTINCT r) as frequency
ORDER BY frequency desc
LIMIT 3
"""

# Previously both strings were assigned to `query` in turn, so the two-word
# variant was silently discarded. Select the variant to run explicitly; the
# three-word query is the one that was effectively executed.
query = three_word_query
res = graph.run(query).to_data_frame()
res