## Exploration of full-text indexing

In [1]:
import set_path      # Importing this module will add the project's home directory to sys.path

Added 'D:\Docs\- MY CODE\Brain Annex\BA-Win7' to sys.path


In [2]:
import os
import sys
import getpass

from neoaccess import NeoAccess

from BrainAnnex.modules.neo_schema.neo_schema import NeoSchema
from BrainAnnex.modules.full_text_indexing.full_text_indexing import FullTextIndexing
from BrainAnnex.modules.media_manager.media_manager import MediaManager

# Connect to the database

In [3]:
# Optionally save your credentials here.  NOTE: if you run the next cell, its prompts will overwrite both of these values
host = ""      # EXAMPLES:  bolt://123.456.789.012   OR   neo4j://localhost
password = ""

In [4]:
# Interactive entry of the connection settings: ask for the host (without its port) and for the password.
# Both `host` and `password` get (re)assigned here, regardless of any value they had before
print("To create a database connection, enter the host IP, but leave out the port number: (EXAMPLES:  bolt://1.2.3.4  OR  neo4j://localhost )\n")

# Port 7687 is always appended to whatever gets typed in.  EXAMPLE of final host value:  "bolt://123.456.789.012:7687"
host = input("Enter host IP WITHOUT the port number.  EXAMPLE: bolt://123.456.789.012 ") + ":7687"

password = getpass.getpass("Enter the database password:")

# Confirmation of the settings that will be used; the password is shown masked
print(f"\n=> Will be using: host='{host}', username='neo4j', password=**********")

To create a database connection, enter the host IP, but leave out the port number: (EXAMPLES:  bolt://1.2.3.4  OR  neo4j://localhost )



Enter host IP WITHOUT the port number.  EXAMPLE: bolt://123.456.789.012  bolt://123.456.789.012
Enter the database password: ········



=> Will be using: host='bolt://123.456.789.012:7687', username='neo4j', password=**********


In [4]:
db = NeoAccess(host=host,
               credentials=("neo4j", password), debug=False)   # Notice the debug option being OFF

Attempting to connect to Neo4j database


In [5]:
print("Version of the Neo4j driver: ", db.version())

Version of the Neo4j driver:  4.4.11


# Explorations

In [None]:
# Verify that the database is empty
q = "MATCH (n) RETURN COUNT(n) AS number_nodes"

db.query(q, single_cell="number_nodes")

In [7]:
# Count the number of Notes
q = "MATCH (n: Notes) RETURN COUNT(n) AS number_notes_items"

db.query(q, single_cell="number_notes_items")

484

In [None]:
q = "MATCH (n:Notes) RETURN n LIMIT 3"

In [7]:
db.query(q)

[]

In [8]:
match = db.match(labels="Notes")

In [9]:
# Retrieve up to 3 of the nodes selected by `match` (i.e. nodes labeled `Notes`)
db.get_nodes(match, limit=3)

[]

In [8]:
NeoSchema.db = db
FullTextIndexing.db = db

In [9]:
MediaManager.set_media_folder("D:/tmp/transfer/")

In [10]:
import os

path = "D:/tmp/transfer"     # Use forward slashes even on Windows!

In [16]:
file_list = os.listdir(path)

print(len(file_list))
#print(os.listdir(path))

484


In [17]:
# Locate files that lack a database record: for each file in the folder, count the `Notes` nodes
# whose `basename` matches the file's name (minus its extension) and whose `suffix` is 'htm'.
# The base name is passed as a query parameter (data binding) rather than being spliced into the Cypher text,
# so that file names containing quotes or other special characters can neither break nor alter the query
q = "MATCH (n:Notes) WHERE n.basename = $basename AND n.suffix = 'htm' RETURN COUNT(n) AS number_nodes"

for filename in file_list:
    basename = os.path.splitext(filename)[0]    # The extension isn't needed: the query always looks for suffix 'htm'
    n = db.query(q, data_binding={"basename": basename}, single_cell="number_nodes")
    if n == 0:
        print(f"Notes record for file `{filename}`  NOT FOUND!")

In [11]:
file_list[0:2]

['notes-1000.htm', 'notes-1001.htm']

In [None]:
# Index the content of all the files, except for the first one in the list (skipped by the `[1:]` slice).
# For each file: look up the internal database ID of its `Notes` node, extract the file's unique "good" words,
# and pass them to the indexer under that ID.
# The base name is passed as a query parameter (data binding) rather than being spliced into the Cypher text,
# so that file names containing quotes or other special characters can neither break nor alter the query
q = "MATCH (n:Notes) WHERE n.basename = $basename AND n.suffix = 'htm' RETURN ID(n) AS node_int_id"

for i, filename in enumerate(file_list[1:], start=1):
    print(f"\n {i} -------------------------\n", filename)
    basename = os.path.splitext(filename)[0]
    node_int_id = db.query(q, data_binding={"basename": basename}, single_cell="node_int_id")
    print("    node_int_id: ", node_int_id)

    if node_int_id is None:
        # Presumably db.query() returns None for `single_cell` when no record is found - TODO confirm.
        # Without a node ID there's nothing to attach the index to, so skip the file rather than index against None
        print(f"    SKIPPED: no `Notes` database record found for `{filename}`")
        continue

    file_contents = MediaManager.get_from_file(filename)
    word_list = FullTextIndexing.extract_unique_good_words(file_contents)
    print(word_list)
    FullTextIndexing.new_indexing(content_item_id = node_int_id, unique_words = word_list)

In [16]:
q = "MATCH (n:Notes) WHERE n.basename='notes-1001' AND n.suffix='htm' RETURN COUNT(n) AS number_nodes"

db.query(q, single_cell="number_nodes")

1

In [17]:
q = "MATCH (n:Notes) WHERE n.basename='notes-1000' AND n.suffix='htm' RETURN COUNT(n) AS number_nodes"

db.query(q, single_cell="number_nodes")

0

In [20]:
for filename in file_list[0:2]:
    print("\n", filename)
    file_contents = MediaManager.get_from_file(filename)
    print(file_contents)
    word_list = FullTextIndexing.extract_unique_good_words(file_contents)
    print(word_list)


 notes-1000.htm
<p><em>From <a href="https://docs.oracle.com/en-us/iaas/Content/Compute/Tasks/accessinginstance.htm">https://docs.oracle.com/en-us/iaas/Content/Compute/Tasks/accessinginstance.htm</a>:</em></p>

<h3>Connect to the Linux instance using a .ppk private key file</h3>

<p>If the instance uses a key pair that you created using PuTTY Key Generator, use the following procedure.</p>

<ol>
	<li>Open PuTTY.</li>
	<li>
	<p>In the <strong>Category</strong> pane, select <strong>Session</strong> and enter the following:</p>

	<ul>
		<li>
		<p><strong>Host Name (or IP address):</strong></p>

		<p><code><var>&lt;username&gt;</var>@<var>&lt;public-ip-address&gt;</var></code></p>

		<p><var>&lt;username&gt;</var> is the default username for the instance. For Oracle Linux and CentOS images, the default username is <code>opc</code>. For Ubuntu images, the default username is <code>ubuntu</code>.</p>

		<p><var>&lt;public-ip-address&gt;</var> is your instance public IP&nbsp;address that you

In [28]:
# Read one file, extract its unique "good" words, create a new `Content Item` node named after the file,
# and pass the words to the indexer under that node
filename = "notes-2637.htm"
file_contents = MediaManager.get_from_file(filename)
print(file_contents)

word_list = FullTextIndexing.extract_unique_good_words(file_contents)
print(word_list)    # Explicit print: a bare expression in mid-cell isn't displayed (only a cell's last expression is)

content_item_id = NeoSchema.create_data_node(class_node="Content Item", properties = {"name": filename})
print("New Content Item node created: ", content_item_id)

FullTextIndexing.new_indexing(content_item_id = content_item_id, unique_words = word_list)

<p>As a PLATFORM for SYSTEMS BIOLOGY, the open-source project LIFE123 (https://life123.science) offers a growing set of tools to wrestle the complications of numerical solutions to complex dynamical systems, such as cycles of biochemical reactions and diffusion.<br>
<br>
* ONLY 1/2 OF THE BATTLE, is to offer functionality to set up and simulate webs of reactions, optionally with spatial components and diffusion (plus - upcoming - membranes...)<br>
<br>
* The OTHER 1/2 OF THE BATTLE is a battery of tools, to tame numerical instabilities, to visualize &amp; analyze the results, to provide helpful diagnostics, and to integrate high-performance computing: a collaboration of software engineering, data science, numerical methods, biochemistry &amp; biology, among other fields.<br>
<br>
=&gt; Pleased to announce version Beta 24 of this collaborative OPEN-SOURCE project.<br>
<br>
Explore it all with just a few lines of Python code in convenient Jupyter notebooks! (optionally in a 1-click hoste

In [31]:
# For each of the first 2 files in the list: show its contents, extract its unique "good" words,
# create a new `Content Item` node named after the file, and pass the words to the indexer under that node.
# NOTE: a new `Content Item` node gets created every time this cell is run
for filename in file_list[0:2]:
    print("\n-----\n", filename)
    file_contents = MediaManager.get_from_file(filename)
    print(file_contents)

    word_list = FullTextIndexing.extract_unique_good_words(file_contents)
    print(word_list)    # Explicit print: a bare expression inside a loop body is never displayed

    content_item_id = NeoSchema.create_data_node(class_node="Content Item", properties = {"name": filename})
    print("New Content Item node created: ", content_item_id)
    FullTextIndexing.new_indexing(content_item_id = content_item_id, unique_words = word_list)


 notes-1000.htm
<p><em>From <a href="https://docs.oracle.com/en-us/iaas/Content/Compute/Tasks/accessinginstance.htm">https://docs.oracle.com/en-us/iaas/Content/Compute/Tasks/accessinginstance.htm</a>:</em></p>

<h3>Connect to the Linux instance using a .ppk private key file</h3>

<p>If the instance uses a key pair that you created using PuTTY Key Generator, use the following procedure.</p>

<ol>
	<li>Open PuTTY.</li>
	<li>
	<p>In the <strong>Category</strong> pane, select <strong>Session</strong> and enter the following:</p>

	<ul>
		<li>
		<p><strong>Host Name (or IP address):</strong></p>

		<p><code><var>&lt;username&gt;</var>@<var>&lt;public-ip-address&gt;</var></code></p>

		<p><var>&lt;username&gt;</var> is the default username for the instance. For Oracle Linux and CentOS images, the default username is <code>opc</code>. For Ubuntu images, the default username is <code>ubuntu</code>.</p>

		<p><var>&lt;public-ip-address&gt;</var> is your instance public IP&nbsp;address that you

In [22]:
MediaManager.MEDIA_FOLDER

'D:/tmp/transfer/'

In [23]:
MediaManager.get_from_file("notes-1000.htm")

'<p><em>From <a href="https://docs.oracle.com/en-us/iaas/Content/Compute/Tasks/accessinginstance.htm">https://docs.oracle.com/en-us/iaas/Content/Compute/Tasks/accessinginstance.htm</a>:</em></p>\n\n<h3>Connect to the Linux instance using a .ppk private key file</h3>\n\n<p>If the instance uses a key pair that you created using PuTTY Key Generator, use the following procedure.</p>\n\n<ol>\n\t<li>Open PuTTY.</li>\n\t<li>\n\t<p>In the <strong>Category</strong> pane, select <strong>Session</strong> and enter the following:</p>\n\n\t<ul>\n\t\t<li>\n\t\t<p><strong>Host Name (or IP address):</strong></p>\n\n\t\t<p><code><var>&lt;username&gt;</var>@<var>&lt;public-ip-address&gt;</var></code></p>\n\n\t\t<p><var>&lt;username&gt;</var> is the default username for the instance. For Oracle Linux and CentOS images, the default username is <code>opc</code>. For Ubuntu images, the default username is <code>ubuntu</code>.</p>\n\n\t\t<p><var>&lt;public-ip-address&gt;</var> is your instance public IP&nbsp

In [13]:
FullTextIndexing.initialize_schema()

In [16]:
filename = "test1.txt"
file_contents = MediaManager.get_from_file(filename)
file_contents

'hello to the world !!! ?  Welcome to learning how she cooks with potatoes...'

In [17]:
word_list = FullTextIndexing.extract_unique_good_words(file_contents)
word_list

['hello', 'world', 'welcome', 'learning', 'cooks', 'potatoes']

In [18]:
content_item_id = NeoSchema.create_data_node(class_node="Content Item", properties = {"name": filename})
content_item_id

25

In [19]:
FullTextIndexing.new_indexing(content_item_id = content_item_id, unique_words = word_list)

In [20]:
filename = "test2.htm"
file_contents = MediaManager.get_from_file(filename)
file_contents

"<p>Let's make a <i>much better world</i>, shall we?  What do you say to that enticing prospect?</p>\n\n<p>Starting on a small scale &ndash; we&rsquo;ll learn cooking a potato well.</p>"

In [21]:
word_list = FullTextIndexing.extract_unique_good_words(file_contents)
word_list

['world', 'say', 'enticing', 'prospect', 'scale', 'learn', 'cooking', 'potato']

In [22]:
content_item_id = NeoSchema.create_data_node(class_node="Content Item", properties = {"name": filename})
content_item_id

0

In [23]:
FullTextIndexing.new_indexing(content_item_id = content_item_id, unique_words = word_list)

In [30]:
def search_word(word :str):
    """
    Look up the names of the `Content Item` nodes that are indexed under any `Word` node
    whose name contains the given string.
    The search string is lower-cased by the query before the comparison, and the comparison
    is a substring match (Cypher `CONTAINS`), not a whole-word match:
    for example, a `Word` named "learning" is matched by a search for "Learn".

    The search string is passed to the database as a query parameter (data binding),
    rather than being spliced into the Cypher text; as a result, strings containing quotes
    or other special characters can neither break nor alter the query.

    :param word:    The string to search for; it may be a whole word or a fragment of one
    :return:        Whatever db.query() returns for the `content_name` column:
                        the `name` values of the matching `Content Item` nodes, one per match found.
                        EXAMPLE: ['test2.htm', 'test1.txt']
    """
    q = '''MATCH (w:Word)-[:occurs]->(:Indexer)<-[:has_index]-(ci:`Content Item`)
         WHERE w.name CONTAINS toLower($word)
         RETURN ci.name AS content_name
         '''
    return db.query(q, data_binding={"word": word}, single_column="content_name")

In [31]:
search_word("hello")

['test1.txt']

In [32]:
search_word("world")

['test2.htm', 'test1.txt']

In [33]:
search_word("POTATO")

['test2.htm', 'test1.txt']

In [34]:
search_word("POTATOES")

['test1.txt']

In [35]:
search_word("Learn")

['test2.htm', 'test1.txt']

In [37]:
search_word("Learning")

['test1.txt']