# Exploratory Notebook
A collection of experiments in which I tested various ideas; kept as a reference.

In [None]:
# Setup
from db_funcs import get_all_content, get_db_con
import dotenv
from utils import Sentence_Extractor
from neo4j_funcs import Neo4j_Driver
from utils import jupyter_print_paragraph
from EntityRelationshipExtractor import ER_Extractor

# Load variables from a .env file into the process environment.
# NOTE(review): presumably the DB / Neo4j helpers read their connection settings
# from the environment, so this likely has to run before they are constructed — confirm.
dotenv.load_dotenv()

# Helper instances created once for the whole notebook.
# sentence_extractor and er_extractor are used by the sentence-vs-paragraph
# extraction comparison further down.
neo4j = Neo4j_Driver()
sentence_extractor = Sentence_Extractor()
er_extractor=ER_Extractor()

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt_tab to /home/chris/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
Device set to use cuda:0


# Dataset Visualization
Provides summary statistics on the lengths of the content samples.

In [2]:
import numpy as np
import random
# Fetch every stored content item and summarise the distribution of item lengths
# (len() of each item — character counts if the items are strings; TODO confirm).
# NOTE(review): the connection returned by get_db_con() is not closed here;
# confirm whether db_funcs manages its lifetime.
with get_db_con().cursor() as cur:
    content_ids, all_content = get_all_content(cur)
content_lengths = [len(content) for content in all_content]

if content_lengths:
    print(f"Average length: {np.mean(content_lengths):.2f}")
    print(f"Maximum length: {max(content_lengths)}")
    print(f"Minimum length: {min(content_lengths)}")
    print(f"Standard deviation: {np.std(content_lengths):.2f}")
    # Show at most 20 randomly chosen lengths (fewer when the dataset is smaller).
    print(f"\nSample of lengths: {random.sample(content_lengths, min(20, len(content_lengths)))}")
else:
    # Without this guard an empty dataset yields nan statistics and then an
    # opaque ValueError from max() on an empty list.
    print("No content returned from the database; nothing to summarise.")

Average length: 802.61
Maximum length: 4063
Minimum length: 169
Standard deviation: 359.58

Sample of lengths: [1576, 338, 771, 815, 1585, 1008, 565, 531, 599, 517, 407, 845, 545, 534, 609, 1049, 545, 576, 520, 531]


# Sentence vs Paragraph Entity Relationship Extraction

In [3]:
# Compare entity-relationship extraction granularity on one random paragraph:
# sentence-by-sentence versus the whole paragraph in a single call.
def print_triplets(heading, triplets):
    """Print `heading`, then one `head -- type -> tail` line per triplet."""
    print(heading)
    for triplet in triplets:
        print(f"{triplet.head} -- {triplet.type} -> {triplet.tail}")


with get_db_con().cursor() as cur:
    content_ids, all_content = get_all_content(cur)

# Pick the paragraph under test and split it into sentences
random_paragraph = random.choice(all_content)
sentences = sentence_extractor.get_sentences(random_paragraph)

# One extractor call per sentence, results flattened into a single list
individual_triplets = [
    triplet
    for sentence in sentences
    for triplet in er_extractor.extract_ERs([sentence])
]

# One extractor call for the paragraph as a whole
paragraph_triplets = er_extractor.extract_ERs([random_paragraph])

# Report the paragraph followed by both sets of extracted relationships
print("Random Paragraph:\n")
jupyter_print_paragraph(random_paragraph)
print_triplets(
    "\nEntity relationships extracted from sentences individually:",
    individual_triplets,
)
print_triplets(
    "\nEntity relationships extracted from the entire paragraph:",
    paragraph_triplets,
)

Random Paragraph:

Since all modern ctenophores except the beroids have cydippid-like larvae, it ha
s widely been assumed that their last common ancestor also resembled cydippids, 
having an egg-shaped body and a pair of retractable tentacles. Richard Harbison'
s purely morphological analysis in 1985 concluded that the cydippids are not mon
ophyletic, in other words do not contain all and only the descendants of a singl
e common ancestor that was itself a cydippid. Instead he found that various cydi
ppid families were more similar to members of other ctenophore orders than to ot
her cydippids. He also suggested that the last common ancestor of modern ctenoph
ores was either cydippid-like or beroid-like. A molecular phylogeny analysis in 
2001, using 26 species, including 4 recently discovered ones, confirmed that the
 cydippids are not monophyletic and concluded that the last common ancestor of m
odern ctenophores was cydippid-like. It also found that the genetic differences 
between t

# Basic Sentence Encoding Example
Demonstrates how sentence encoding works and what kind of values the model outputs

In [None]:
from sentence_transformers import SentenceTransformer
# Instantiate the embedding model used for this demo
model = SentenceTransformer("all-MiniLM-L6-v2")

# A single query, scored against each candidate passage below
test_query = [
    "Star wars begins on episode 4 with a jedi who must travel to a different planet"
]
test_passages = [
    "A long time ago in a galaxy far far away",
    "There was a space ship floating with anakin on it",
    "A farmer milks his cows",
    "they actually were on his land",
    "When a phone falls down into a drain, something interesting happens",
]

# Encode passages and query into embedding vectors
passage_embeddings = model.encode(test_passages)
query_embeddings = model.encode(test_query)

# (Cosine) similarity of the query against every passage, scaled to a percentage
scores = model.similarity(query_embeddings, passage_embeddings) * 100

# There is only one query, so only the first row of scores is reported
query_scores = scores.tolist()[0]

# Echo the query, then each passage prefixed by its similarity to the query
print(test_query[0])
for passage, score in zip(test_passages, query_scores):
    print(f"{round(score, 2)}% - {passage}")