# Exploratory Notebook
Contains a whole bunch of code where I tested various things out, left as a reference

In [1]:
# Setup
from db_funcs import get_all_content, get_db_con
import dotenv
from utils import Sentence_Extractor
from neo4j_funcs import Neo4j_Driver
from utils import jupyter_print_paragraph
from EntityRelationshipExtractor import ER_Extractor
from sentence_transformers import SentenceTransformer
import numpy as np
import random

# Load variables from the .env file before any of the helpers are constructed.
dotenv.load_dotenv()

# Shared helper objects reused by the cells further down the notebook.
neo4j = Neo4j_Driver()
sentence_extractor = Sentence_Extractor()
er_extractor = ER_Extractor()

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt_tab to /home/chris/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
Device set to use cuda:0


# Dataset Visualization
Provides summary statistics on the length of each content block in the dataset

In [2]:
# Fetch every content block and record the length of each one.
with get_db_con().cursor() as cur:
    content_ids, all_content = get_all_content(cur)
content_lengths = list(map(len, all_content))

# Draw the random sample up front: at most 20 lengths, fewer if the dataset is smaller.
sample_size = min(20, len(content_lengths))
length_sample = random.sample(content_lengths, sample_size)

# Aggregate figures first, then the sample on its own paragraph.
print(f"Average length: {np.mean(content_lengths):.2f}")
print(f"Maximum length: {max(content_lengths)}")
print(f"Minimum length: {min(content_lengths)}")
print(f"Standard deviation: {np.std(content_lengths):.2f}")
print(f"\nSample of lengths: {length_sample}")

Average length: 802.61
Maximum length: 4063
Minimum length: 169
Standard deviation: 359.58

Sample of lengths: [522, 1034, 552, 539, 856, 803, 981, 626, 696, 665, 280, 1207, 673, 797, 895, 977, 550, 610, 872, 1459]


# Example Entity Relationship Extraction

In [8]:
# Smallest possible demo: one hand-written sentence in, extracted triplets out.
sentence = "bob is married to sue"
triplets = er_extractor.extract_ERs([sentence])
print(triplets)


[triplet(head='Bob', type='spouse', tail='Sue'), triplet(head='Sue', type='spouse', tail='Bob')]


# Sentence vs Paragraph Entity Relationship Extraction

In [4]:
# Compare the relationships found when a paragraph is fed to the extractor one
# sentence at a time against those found when it is fed in as a single input.
with get_db_con().cursor() as cur:
    content_ids, all_content = get_all_content(cur)
random_paragraph = random.choice(all_content)
sentences = sentence_extractor.get_sentences(random_paragraph)

# One extractor call per sentence, flattened into a single list of triplets.
individual_triplets = [
    triplet
    for sentence in sentences
    for triplet in er_extractor.extract_ERs([sentence])
]

# One extractor call for the paragraph as a whole.
paragraph_triplets = er_extractor.extract_ERs([random_paragraph])

# Report: the paragraph itself, then both result sets in the same format.
print("Random Paragraph:\n")
jupyter_print_paragraph(random_paragraph)
for heading, extracted in (
    ("\nEntity relationships extracted from sentences individually:", individual_triplets),
    ("\nEntity relationships extracted from the entire paragraph:", paragraph_triplets),
):
    print(heading)
    for triplet in extracted:
        print(f"{triplet.head} -- {triplet.type} -> {triplet.tail}")

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Random Paragraph:

Warsaw, especially its city centre (Śródmieście), is home not only to many natio
nal institutions and government agencies, but also to many domestic and internat
ional companies. In 2006, 304,016 companies were registered in the city. Warsaw'
s ever-growing business community has been noticed globally, regionally, and nat
ionally. MasterCard Emerging Market Index has noted Warsaw's economic strength a
nd commercial center. Moreover, Warsaw was ranked as the 7th greatest emerging m
arket. Foreign investors' financial participation in the city's development was 
estimated in 2002 at over 650 million euro. Warsaw produces 12% of Poland's nati
onal income, which in 2008 was 305.1% of the Polish average, per capita (or 160%
 of the European Union average). The GDP per capita in Warsaw amounted to PLN 94
 000 in 2008 (c. EUR 23 800, USD 33 000). Total nominal GDP of the city in 2010 
amounted to 191.766 billion PLN, 111696 PLN per capita, which was 301,1 % of Pol
ish avera

# Basic Semantic Embedding Example
Demonstrates how sentence encoding works and what kind of values this system outputs

In [5]:
# Sentence-embedding model used for this demo.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Candidate passages, plus a single query to rank them against.
test_passages = [
    "A long time ago in a galaxy far far away",
    "There was a space ship floating with anakin on it",
    "A farmer milks his cows",
    "they actually were on his land",
    "When a phone falls down into a drain, something interesting happens",
]
test_query = [
    "Star wars begins on episode 4 with a jedi who must travel to a different planet"
]

# Encode both sides into embedding vectors.
passage_embeddings = model.encode(test_passages)
query_embeddings = model.encode(test_query)

# (Cosine) similarity of the query against every passage, scaled to a percentage.
scores = model.similarity(query_embeddings, passage_embeddings) * 100

# There is only one query, so the first row holds one score per passage.
query_scores = scores.tolist()[0]

# Show the query followed by each passage and its similarity.
print(test_query[0])
for passage, score in zip(test_passages, query_scores):
    print(f"{round(score, 2)}% - {passage}")

Star wars begins on episode 4 with a jedi who must travel to a different planet
22.31% - A long time ago in a galaxy far far away
35.85% - There was a space ship floating with anakin on it
9.22% - A farmer milks his cows
5.74% - they actually were on his land
-0.87% - When a phone falls down into a drain, something interesting happens


# Test Neo4j Content ID merge
To track which content blocks are referenced in the neo4j database, we need to make sure merges are being handled by the insert relationship function correctly.
In order to do so, two nodes are added from documents 1 and 2. If everything goes according to plan, both nodes will have doc1 and doc2 listed in the content id field of their properties. It is worth noting that this is being run against the actual neo4j database, so the nodes need to be removed after the test has been performed. Please run the insert, inspect to make sure it was handled correctly, and then remove.

In [6]:
# The same John -KNOWS-> Jane relationship is inserted once per content id;
# only the content id differs between the two calls.
relationship = dict(
    subject="John",
    subject_type="Person",
    predicate="KNOWS",
    object="Jane",
    object_type="Person",
)

# `result` ends up holding the return value of the second ("doc2") insert.
for content_id in ("doc1", "doc2"):
    result = neo4j.insert_relationship(**relationship, content_id=content_id)

In [7]:
# Clean up: remove the two test nodes ("John" and "Jane") created by the merge
# test inserts so they do not linger in the database.
# NOTE(review): removal is by name only -- presumably this would also match any
# pre-existing node with the same name; confirm before running on shared data.
neo4j.remove_node_by_name("John")
# Kept as a bare trailing expression so the notebook shows its return value as
# the cell output.
neo4j.remove_node_by_name("Jane")


True