In [1]:
import os

import spacy
from spacy import displacy

from preprocessing import (
    preprocess_kjv,
    get_word_types_with_tf_idf,
)
from extract_relations import (
    get_directed_relations,
    order_directed_relations,
)
from ontology_algorithm import (
    construct_ontology_hierarchy,
    print_hierarchy_tree_from_ontology,
)

## Preprocessing

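We preprocess the text of the King James Version (KJV) Bible, treating each chapter of the chosen books as one document, compute per-word statistics (term frequency, document frequency, tf-idf), and keep the top `n` words. Note that the ranking used below is term frequency (`"tf"`), not the tf-idf score.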

In [2]:
# --- Configuration ----------------------------------------------------------
# Location of the KJV verse table (CSV). The default is the original author's
# local path; set the KJV_CSV_PATH environment variable to run this notebook on
# another machine without editing the cell.
kjv_path = os.environ.get(
    "KJV_CSV_PATH",
    "/Users/zebo/Documents/Freelancing/upwork/Peter_J_Worth_Jr/NLP/hierarchical_clustering/data/t_kjv.csv",
)

# Number of top-ranked words to keep.
n = 70

# Candidate corpora; exactly one is assigned to `chosen_books` further down.
DISCIPLES_BOOKS = [
    "Hebrews",
    "1 John",
    "Galatians",
    "2 Timothy",
    "Titus",
    "2 Corinthians",
    "Philippians",
    "1 Peter",
    "1 Corinthians",
    "James",
    "Colossians",
    "Ephesians",
    "2 John",
    "3 John",
    "Romans",
    "1 Timothy",
    "2 Peter",
    "Jude",
    "1 Thessalonians",
    "Philemon",
    "2 Thessalonians",
]
EVANGELISTS_BOOKS = [
    "Matthew",
    "Luke",
    # "Acts",
    # "Revelation",
    "John",
    "Mark",
]

# --- Load the corpus --------------------------------------------------------
# The resulting frame is indexed below by its "book", "chapter" and "text"
# columns (one row per verse, judging by how the rows are joined per chapter).
kjv_bible_df = preprocess_kjv(
    path_to_kjv=kjv_path,
)

# Get book column unique values.
all_books = kjv_bible_df["book"].unique()

# Specify the books to use as corpus. A positional slice of `all_books`
# (e.g. all_books[39:40]) can be used instead of a named list.
chosen_books = EVANGELISTS_BOOKS

# A misspelled book name would otherwise just match zero rows and silently
# shrink the corpus, so fail loudly instead.
known_books = set(all_books)
missing_books = [book for book in chosen_books if book not in known_books]
if missing_books:
    raise ValueError(f"Books not found in the KJV table: {missing_books}")

print("Chosen books: ", chosen_books)

# --- One document per chapter -----------------------------------------------
# Chapters are visited book by book in the order given by `chosen_books`, and
# within a book in order of first appearance in the table.
text_per_chapter = []
for book in chosen_books:
    book_df = kjv_bible_df[kjv_bible_df["book"] == book]
    for chapter in book_df["chapter"].unique():
        chapter_verses = book_df.loc[book_df["chapter"] == chapter, "text"]
        text_per_chapter.append(" ".join(chapter_verses.values))

# --- Word statistics --------------------------------------------------------
# NOTE(review): the second argument is "tf" even though the result is named
# `tf_idf_pre_filtering`; the displayed table is sorted by the tf column, so
# "top n" here presumably means highest term frequency -- confirm this is
# intended. The name is kept because later cells refer to it.
tf_idf_pre_filtering = get_word_types_with_tf_idf(
    text_per_chapter,
    "tf",
    skip_stopwords=True,
    include_verbs=False,
    include_determiners=False,
    include_pronouns=False,
    include_adverbs=False,
    include_numbers=False,
)

# Keep the first `n` words of the ranked table and display them.
top_n_words = tf_idf_pre_filtering.head(n)["word"].values
top_n_words

Chosen books:  ['Matthew', 'Luke', 'John', 'Mark']
Excluding words with the following word types: {'RBR', 'VBP', 'VBN', 'DT', 'WP', 'WRB', 'PDT', 'RB', 'RBS', 'VBG', 'VBD', 'WP$', 'VBZ', 'PRP', 'CD', 'PRP$', 'WDT', 'VB'}


array(['unto', 'shall', 'jesus', 'man', 'son', 'god', 'things', 'thy',
       'father', 'lord', 'disciples', 'day', 'men', 'many', 'house',
       'kingdom', 'people', 'world', 'upon', 'great', 'john', 'good',
       'peter', 'may', 'might', 'among', 'days', 'way', 'hand', 'jews',
       'would', 'life', 'name', 'pharisees', 'mother', 'time', 'word',
       'children', 'city', 'jerusalem', 'dead', 'certain', 'master',
       'chief', 'spirit', 'temple', 'multitude', 'hour', 'priests',
       'simon', 'bread', 'galilee', 'place', 'whole', 'christ', 'servant',
       'scribes', 'woman', 'nothing', 'brother', 'earth', 'thine', 'king',
       'prophet', 'hands', 'pilate', 'light', 'sea', 'mary', 'wife'],
      dtype=object)

In [3]:
# Preview the first 30 rows of the ranked word-statistics table.
tf_idf_pre_filtering.iloc[:30]

Unnamed: 0,word,word_type,tc,tf,dc,idf,tf_idf
0,unto,"{'JJ': 561, 'NN': 18, 'IN': 780, 'NNP': 3, 'RP...",1469,0.017483,89,0.0,0.0
1,shall,"{'MD': 900, 'NNP': 4, 'PERSON': 1}",904,0.010759,86,0.034289,0.000369
4,jesus,"{'NNP': 624, 'PERSON': 569, 'ORGANIZATION': 9,...",625,0.007438,79,0.119189,0.000887
6,man,"{'NN': 474, 'NNP': 6, 'PERSON': 6}",480,0.005713,83,0.069796,0.000399
9,son,"{'NN': 171, 'NNP': 153, 'PERSON': 14, 'ORGANIZ...",324,0.003856,75,0.171148,0.00066
10,god,"{'NNP': 317, 'PERSON': 43, 'GPE': 56, 'ORGANIZ...",318,0.003785,75,0.171148,0.000648
15,things,{'NNS': 273},273,0.003249,79,0.119189,0.000387
16,thy,"{'JJ': 215, 'NN': 22, 'VB': 12, 'VBN': 1, 'NNP...",268,0.00319,64,0.329753,0.001052
17,father,"{'NN': 84, 'NNP': 172, 'PERSON': 39, 'RB': 5, ...",262,0.003118,65,0.314249,0.00098
19,lord,"{'NNP': 210, 'ORGANIZATION': 92, 'GPE': 87, 'N...",250,0.002975,69,0.25453,0.000757


In [4]:
# Show the statistics row for the word "cross".
is_cross = tf_idf_pre_filtering["word"].eq("cross")
tf_idf_pre_filtering.loc[is_cross]

Unnamed: 0,word,word_type,tc,tf,dc,idf,tf_idf
416,cross,{'NN': 17},17,0.000202,10,2.186051,0.000442


## Extract relations from the corpus

In [5]:
# Flatten the chosen books into a single list of verse texts. Verses are
# collected book by book (in `chosen_books` order) and, within each book,
# chapter by chapter in order of first appearance.
all_verses = []
for book in chosen_books:
    book_df = kjv_bible_df[kjv_bible_df["book"] == book]
    for chapter in book_df["chapter"].unique():
        chapter_df = book_df[book_df["chapter"] == chapter]
        all_verses.extend(chapter_df["text"].values)

In [6]:
# Sanity check: render the dependency parse of one sample verse.
# `spacy` and `displacy` are imported in the notebook's import cell; `displacy`
# is imported explicitly (the form used in spaCy's documentation) rather than
# reached as an attribute of the `spacy` package.

# Position of the sample verse in `all_verses`.
# NOTE(review): `all_verses` is built in `chosen_books` order, so this index
# points at a different verse (or is out of range) when the corpus changes.
SAMPLE_VERSE_INDEX = 2239

nlp = spacy.load("en_core_web_lg")
doc_1 = nlp(all_verses[SAMPLE_VERSE_INDEX])
displacy.render(doc_1, style="dep", jupyter=True)

In [7]:
# Extract directed relations between the top-n words from every verse.
# With verbose=True the call logs each verse it processes (sentence, entities,
# subjects/objects), which makes the cell output very long.
directed_relations = get_directed_relations(
    all_verses=all_verses,
    top_n_words=top_n_words,
    verbose=True,
)


 1  sentences in verse  0
sentence:  The book of the generation of Jesus Christ, the son of David, the son of Abraham.
ents:  [Jesus Christ, David, Abraham]

 1  sentences in verse  1
sentence:  Abraham begat Isaac; and Isaac begat Jacob; and Jacob begat Judas and his brethren;
ents:  [Abraham begat Isaac, Isaac begat Jacob, Jacob begat Judas]
subject:  Abraham
object:  Isaac
subject:  Isaac
object:  Jacob
subject:  Jacob
object:  Judas

 1  sentences in verse  2
sentence:  And Judas begat Phares and Zara of Thamar; and Phares begat Esrom; and Esrom begat Aram;
ents:  [Zara of Thamar, Phares begat Esrom, Esrom begat Aram]
subject:  Judas
object:  Phares
object:  Thamar

 1  sentences in verse  3
sentence:  And Aram begat Aminadab; and Aminadab begat Naasson; and Naasson begat Salmon;
ents:  [Aram begat Aminadab, Aminadab begat Naasson, Naasson]

 1  sentences in verse  4
sentence:  And Salmon begat Booz of Rachab; and Booz begat Obed of Ruth; and Obed begat Jesse;
ents:  [Salmon begat

In [12]:
# Order the extracted relations with the help of the word-statistics table.
# NOTE(review): order_by="tf" presumably selects the term-frequency column as
# the ordering criterion -- confirm against `order_directed_relations`.
ordered_directed_relations = order_directed_relations(
    tf_idf_pre_filtering=tf_idf_pre_filtering,
    directed_relations=directed_relations,
    order_by="tf",
)

# Display the resulting list of word pairs.
ordered_directed_relations

[('pilate', 'jesus'),
 ('jews', 'priests'),
 ('jews', 'jerusalem'),
 ('jews', 'jesus'),
 ('peter', 'lord'),
 ('peter', 'word'),
 ('john', 'disciples'),
 ('john', 'christ'),
 ('mary', 'things'),
 ('father', 'spirit'),
 ('father', 'things'),
 ('father', 'hands'),
 ('father', 'house'),
 ('father', 'bread'),
 ('father', 'son'),
 ('father', 'life'),
 ('father', 'name'),
 ('king', 'man'),
 ('king', 'hand'),
 ('people', 'light'),
 ('people', 'things'),
 ('people', 'day'),
 ('brother', 'wife'),
 ('simon', 'sea'),
 ('simon', 'peter'),
 ('jesus', 'mother'),
 ('jesus', 'things'),
 ('jesus', 'word'),
 ('jesus', 'john'),
 ('jesus', 'disciples'),
 ('jesus', 'spirit'),
 ('jesus', 'multitude'),
 ('jesus', 'people'),
 ('jesus', 'days'),
 ('jesus', 'hand'),
 ('jesus', 'woman'),
 ('jesus', 'jerusalem'),
 ('jesus', 'god'),
 ('jesus', 'bread'),
 ('jesus', 'sea'),
 ('jesus', 'peter'),
 ('jesus', 'nothing'),
 ('jesus', 'temple'),
 ('woman', 'house'),
 ('pharisees', 'things'),
 ('pharisees', 'god'),
 ('priest

## Construct the ontology hierarchy

In [13]:
# Build the hierarchy from the ordered relations; the call returns the
# hierarchy itself together with a second value, `words_with_parents`, that
# the printing helper needs as well.
ontology_hierarchy, words_with_parents = construct_ontology_hierarchy(
    ordered_directed_relations=ordered_directed_relations
)

# Render the hierarchy as a text tree in the cell output.
print_hierarchy_tree_from_ontology(
    words_with_parents=words_with_parents,
    ontological_hierarchy=ontology_hierarchy,
)

king
├── man
│   ├── many
│   ├── men
│   └── jews
│       ├── priests
│       │   └── scribes
│       └── jerusalem
└── hand
simon
├── sea
└── peter
    ├── lord
    │   ├── city
    │   └── hour
    └── word
        └── place
pilate
└── jesus
    ├── mother
    ├── john
    │   ├── disciples
    │   └── christ
    ├── multitude
    │   └── way
    ├── people
    │   ├── light
    │   └── day
    ├── days
    ├── woman
    ├── god
    │   ├── father
    │   │   ├── spirit
    │   │   ├── hands
    │   │   ├── house
    │   │   ├── bread
    │   │   ├── son
    │   │   │   └── earth
    │   │   ├── life
    │   │   └── name
    │   ├── time
    │   └── world
    ├── nothing
    └── temple
mary
└── things
brother
└── wife
