In [1]:
from preprocessing import (
    preprocess_kjv,
    get_word_types_with_tf_idf,
)
from extract_relations import (
    get_directed_relations,
    order_directed_relations,
)
from ontology_algorithm import (
    construct_ontology_hierarchy,
    print_hierarchy_tree_from_ontology,
)

## Preprocessing

We will preprocess words from the kjv Bible using tf-idf and extract the top n words.

In [32]:
kjv_path = "/Users/zebo/Documents/Freelancing/upwork/Peter_J_Worth_Jr/NLP/hierarchical_clustering/data/t_kjv.csv"
kjv_bible_df, genesis_df = preprocess_kjv(
    path_to_kjv=kjv_path,
    get_book="Genesis",
)


Genesis_parts = [0, 11, 26, 37, 51]

# Specify the number of chapters to use.
last_chapter = 20
# Specify the number of top words to use.
n = 30

selected_chapter_verses = []
for chapter_index in range(1, last_chapter + 1):
    chapter = genesis_df[genesis_df["chapter"] == chapter_index]
    chapter_verses = ""
    for verse in chapter["text"].values:
        chapter_verses = chapter_verses + " " + verse
        if "the LORD" in verse and "the LORD God" not in verse:
            # replace "the LORD" with "God"
            verse = verse.replace("the LORD", "God")  # TODO change?
    selected_chapter_verses.append(chapter_verses)

tf_idf_pre_filtering = get_word_types_with_tf_idf(
    selected_chapter_verses,
    "tf_idf",
    skip_stopwords=True,
    include_verbs=False,
    include_determiners=False,
    include_pronouns=False,
    include_adverbs=False,
    include_numbers=False,
)

top_n_words = tf_idf_pre_filtering.head(n)["word"].values

# Extract relations from the corpus

In [23]:
# Create a list of all verses of the corpus.
all_verses = []
for chapter_index in range(1, last_chapter + 1):
    Chapter = genesis_df[genesis_df["chapter"] == chapter_index]
    chapter_verses = []
    for verse in Chapter["text"].values:
        if "the LORD" in verse and "the LORD God" not in verse:
            # replace "the LORD" with "God"
            # optional, better clarity
            verse = verse.replace("the LORD", "God")
        chapter_verses.append(verse)
    all_verses.extend(chapter_verses)

Example of spacy dependency trees. These we parse in the `get_directed_relations` method from `extract_relations.py`:

In [24]:
import spacy

nlp = spacy.load("en_core_web_lg")
doc_1 = nlp(all_verses[50])
spacy.displacy.render(doc_1, style="dep", jupyter=True)

In [29]:
directed_relations = get_directed_relations(
    top_n_words=top_n_words,
    all_verses=all_verses,
    verbose=True,
)


 1  sentences in verse  0
sentence:  In the beginning God created the heaven and the earth.
ents:  []
subject:  God
object:  beginning
object:  heaven
Adding relation: ' god ' -> ' earth '

 2  sentences in verse  1
sentence:  And the earth was without form, and void; and darkness was upon the face of the deep.
ents:  []
subject:  earth
object:  form
subject:  darkness
object:  face
object:  deep
sentence:  And the Spirit of God moved upon the face of the waters.
ents:  []
subject:  Spirit
object:  face
object:  waters

 1  sentences in verse  2
sentence:  And God said, Let there be light: and there was light.
ents:  []
subject:  God
Using conjunct subject:  God

 1  sentences in verse  3
sentence:  And God saw the light, that it was good: and God divided the light from the darkness.
ents:  []
subject:  God
object:  light
Adding relation: ' god ' -> ' light '
subject:  it
subject:  God
object:  light
Adding relation: ' god ' -> ' light '
object:  darkness

 2  sentences in verse  4
se

In [30]:
ordered_directed_relations = order_directed_relations(
    directed_relations=directed_relations,
    tf_idf_pre_filtering=tf_idf_pre_filtering,
    order_by="product",
    include_ordering_wrt_occurences=True,
    verbose=True,
)
ordered_directed_relations

             relation  occurances
0         (god, noah)           8
1        (god, earth)           7
2     (waters, earth)           5
3       (god, garden)           3
4         (god, cain)           3
5       (earth, kind)           3
6        (god, light)           2
7    (god, firmament)           2
8         (god, tree)           2
9         (god, adam)           2
10    (waters, month)           2
11        (noah, ark)           2
12  (sons, daughters)           2
13       (cain, abel)           2
14       (ark, month)           2
15     (terah, abram)           2
16       (enoch, god)           2
17      (god, waters)           1
18        (god, kind)           1
19       (god, thing)           1
20       (god, flesh)           1
21       (god, woman)           1
22        (god, abel)           1
23         (god, ark)           1
24        (god, sons)           1
25      (waters, ark)           1
26       (noah, sons)           1
27     (noah, waters)           1
28      (noah,

[('god', 'noah'),
 ('god', 'earth'),
 ('waters', 'earth'),
 ('god', 'garden'),
 ('god', 'cain'),
 ('earth', 'kind'),
 ('god', 'light'),
 ('god', 'firmament'),
 ('god', 'tree'),
 ('god', 'adam'),
 ('waters', 'month'),
 ('noah', 'ark'),
 ('sons', 'daughters'),
 ('cain', 'abel'),
 ('ark', 'month'),
 ('terah', 'abram'),
 ('enoch', 'god'),
 ('god', 'waters'),
 ('god', 'kind'),
 ('god', 'thing'),
 ('god', 'flesh'),
 ('god', 'woman'),
 ('god', 'abel'),
 ('god', 'ark'),
 ('god', 'sons'),
 ('waters', 'ark'),
 ('noah', 'sons'),
 ('noah', 'waters'),
 ('noah', 'years'),
 ('flesh', 'earth'),
 ('flesh', 'thing'),
 ('terah', 'nahor'),
 ('terah', 'haran')]

## Construct the ontology hierarchy

In [31]:
ontology_hierarchy, words_with_parents = construct_ontology_hierarchy(
    ordered_directed_relations=ordered_directed_relations,
)
print_hierarchy_tree_from_ontology(
    ontological_hierarchy=ontology_hierarchy,
    words_with_parents=words_with_parents,
)

terah
├── abram
├── nahor
└── haran
enoch
└── god
    ├── noah
    │   ├── ark
    │   └── years
    ├── earth
    │   └── kind
    ├── garden
    ├── cain
    │   └── abel
    ├── light
    ├── firmament
    ├── tree
    ├── adam
    ├── waters
    │   └── month
    ├── thing
    ├── flesh
    ├── woman
    └── sons
        └── daughters
