In [1]:
from preprocessing import (
    preprocess_kjv,
    get_word_types_with_tf_idf,
)
from extract_relations import (
    get_directed_relations,
    order_directed_relations,
)
from ontology_algorithm import (
    construct_ontology_hierarchy,
    print_hierarchy_tree_from_ontology,
)

## Preprocessing

We will preprocess words from the kjv Bible using tf-idf and extract the top n words.

In [2]:
kjv_path = "/Users/zebo/Documents/Freelancing/upwork/Peter_J_Worth_Jr/NLP/hierarchical_clustering/data/t_kjv.csv"
kjv_bible_df, genesis_df = preprocess_kjv(
    path_to_kjv=kjv_path,
    get_book="Genesis",
)

# Specify the number of chapters to use.
last_chapter = 50
# Specify the number of top words to use.
n = 50

selected_chapter_verses = []
for chapter_index in range(1, last_chapter + 1):
    chapter = genesis_df[genesis_df["chapter"] == chapter_index]
    chapter_verses = ""
    for verse in chapter["text"].values:
        chapter_verses = chapter_verses + " " + verse
        if "the LORD" in verse and "the LORD God" not in verse:
            # replace "the LORD" with "God"
            verse = verse.replace("the LORD", "God")  # TODO change?
    selected_chapter_verses.append(chapter_verses)

tf_idf_pre_filtering = get_word_types_with_tf_idf(
    selected_chapter_verses,
    "tf_idf",
    skip_stopwords=True,
    include_verbs=False,
    include_determiners=False,
    include_pronouns=False,
    include_adverbs=False,
    include_numbers=False,
)

top_n_words = tf_idf_pre_filtering.head(n)["word"].values

Excluding words with the following word types: {'VBZ', 'CD', 'RBS', 'WP', 'VBG', 'RBR', 'WP$', 'PRP', 'PRP$', 'PDT', 'VBD', 'VB', 'WDT', 'VBN', 'RB', 'WRB', 'DT', 'VBP'}


# Extract relations from the corpus

In [3]:
# Create a list of all verses of the corpus.
all_verses = []
for chapter_index in range(1, last_chapter + 1):
    Chapter = genesis_df[genesis_df["chapter"] == chapter_index]
    chapter_verses = []
    for verse in Chapter["text"].values:
        if "the LORD" in verse and "the LORD God" not in verse:
            # replace "the LORD" with "God"
            verse = verse.replace("the LORD", "God")  # TODO change?
        chapter_verses.append(verse)
    all_verses.extend(chapter_verses)

Example of spacy dependency trees. These we parse in the `get_directed_relations` method from `extract_relations.py`:

In [10]:
import spacy

nlp = spacy.load("en_core_web_lg")
doc_1 = nlp(all_verses[50])
spacy.displacy.render(doc_1, style="dep", jupyter=True)

In [6]:
directed_relations = get_directed_relations(
    top_n_words=top_n_words,
    all_verses=all_verses,
    verbose=True,
)


 1  sentences in verse  0
sentence:  In the beginning God created the heaven and the earth.
ents:  []
subject:  God
object:  beginning
object:  heaven

 2  sentences in verse  1
sentence:  And the earth was without form, and void; and darkness was upon the face of the deep.
ents:  []
sentence:  And the Spirit of God moved upon the face of the waters.
ents:  []

 1  sentences in verse  2
sentence:  And God said, Let there be light: and there was light.
ents:  []

 1  sentences in verse  3
sentence:  And God saw the light, that it was good: and God divided the light from the darkness.
ents:  []
subject:  God
object:  light
subject:  God
object:  light
object:  darkness

 2  sentences in verse  4
sentence:  And God called the light Day, and the darkness he called Night.
ents:  []
subject:  God
object:  Day
sentence:  And the evening and the morning were the first day.
ents:  [the first day]

 1  sentences in verse  5
sentence:  And God said, Let there be a firmament in the midst of the w

In [7]:
ordered_directed_relations = order_directed_relations(
    directed_relations=directed_relations,
    tf_idf_pre_filtering=tf_idf_pre_filtering,
)
ordered_directed_relations

[('joseph', 'egypt'),
 ('joseph', 'brethren'),
 ('joseph', 'cattle'),
 ('joseph', 'dream'),
 ('joseph', 'name'),
 ('joseph', 'father'),
 ('joseph', 'servants'),
 ('joseph', 'money'),
 ('joseph', 'house'),
 ('joseph', 'pharaoh'),
 ('jacob', 'years'),
 ('jacob', 'name'),
 ('jacob', 'house'),
 ('jacob', 'esau'),
 ('jacob', 'father'),
 ('jacob', 'wife'),
 ('jacob', 'daughter'),
 ('jacob', 'pharaoh'),
 ('jacob', 'laban'),
 ('jacob', 'rachel'),
 ('jacob', 'daughters'),
 ('abraham', 'name'),
 ('abraham', 'isaac'),
 ('abraham', 'wife'),
 ('abraham', 'abimelech'),
 ('abraham', 'sarah'),
 ('abraham', 'sons'),
 ('abraham', 'son'),
 ('pharaoh', 'dream'),
 ('pharaoh', 'name'),
 ('pharaoh', 'abram'),
 ('esau', 'daughters'),
 ('esau', 'father'),
 ('esau', 'daughter'),
 ('esau', 'son'),
 ('abram', 'name'),
 ('abram', 'wife'),
 ('master', 'wife'),
 ('isaac', 'jacob'),
 ('isaac', 'esau'),
 ('isaac', 'wife'),
 ('isaac', 'rebekah'),
 ('sons', 'egypt'),
 ('sons', 'daughters'),
 ('sons', 'father'),
 ('sons'

## Construct the ontology hierarchy

In [8]:
ontology_hierarchy, words_with_parents = construct_ontology_hierarchy(
    ordered_directed_relations=ordered_directed_relations,
)
print_hierarchy_tree_from_ontology(
    ontological_hierarchy=ontology_hierarchy,
    words_with_parents=words_with_parents,
)

god
├── covenant
├── sodom
├── city
├── noah
│   └── ark
├── servant
│   └── camels
├── abraham
│   ├── isaac
│   │   ├── jacob
│   │   │   ├── years
│   │   │   ├── esau
│   │   │   ├── wife
│   │   │   ├── daughter
│   │   │   ├── laban
│   │   │   │   └── leah
│   │   │   ├── rachel
│   │   │   │   └── joseph
│   │   │   │       ├── egypt
│   │   │   │       ├── brethren
│   │   │   │       ├── cattle
│   │   │   │       ├── dream
│   │   │   │       ├── name
│   │   │   │       ├── father
│   │   │   │       ├── servants
│   │   │   │       ├── money
│   │   │   │       ├── house
│   │   │   │       └── pharaoh
│   │   │   │           └── abram
│   │   │   └── daughters
│   │   └── rebekah
│   │       └── brother
│   ├── abimelech
│   ├── sarah
│   │   └── master
│   ├── sons
│   └── son
├── earth
│   ├── seed
│   └── kind
└── cain
