### NOTE TO SELF

Search for main characters throughout the whole book first, then do a reference reassignment to top 50-70 most common characters

In [None]:
import os 
import json

from get_main_char import get_main_char

bookname = 'worm'

ner_coref_data_dir = os.path.join('output', bookname)
characters_data_dir = os.path.join('temp_files', bookname)

get_main_char(ner_coref_data_dir, characters_data_dir)

with open(os.path.join(characters_data_dir, "main_characters.json"), 'r') as file:
    main_characters_data = json.load(file)

In [None]:
main_characters_data = dict(sorted(main_characters_data.items(), key = lambda x: x[1]["count"], reverse=True))

count = 0
for name, det in main_characters_data.items():
    if count == 200: 
        break

    print(name)
    count += 1

In [None]:
from consolidate_main_char import consolidate_main_char

consolidate_main_char(characters_data_dir)

In [None]:
from get_relevant_sentences import get_relevant_sentences_in_book

text_dir = os.path.join("text", bookname)

get_relevant_sentences_in_book(
    ner_coref_data_dir,
    text_dir,
    characters_data_dir
)

In [None]:
import json
import os

import numpy as np
import pandas as pd
from textblob import TextBlob 
from tqdm import tqdm


def collate_relations(
    ner_coref_data_dir: str,
    characters_data_dir: str
) -> None:
    main_characters_aliases_file_path = os.path.join(characters_data_dir, 'main_characters_aliases.json')

    if not os.path.exists(main_characters_aliases_file_path):
        raise FileNotFoundError('Missing main_characters_aliases.json in given directory!')

    with open(main_characters_aliases_file_path, 'r') as file: 
        main_char_list = json.load(file)

    num_chars = len(main_char_list)
    num_chapters = len(os.listdir(ner_coref_data_dir))

    relations_arr = np.zeros((num_chars, num_chars, 2, num_chapters))
    relations_arr_avg = np.zeros((num_chars, num_chars))

    for chapter_num, chapter in tqdm(enumerate(os.listdir(ner_coref_data_dir)), total=num_chapters):
        relevant_sentences_file_path = os.path.join(ner_coref_data_dir, chapter, 'relevant_sentences.csv')
        df = pd.read_csv(relevant_sentences_file_path)
        
        for idx, row in df.iterrows():
            char_list = json.loads(row['characters'])
            if len(char_list) < 2:
                continue

            b = TextBlob(row['words'])
            sentiment = b.sentiment[0]

            for char1 in char_list:
                for char2 in char_list:
                    if char1 == char2:
                        continue

                    relations_arr[char1][char2][0][chapter_num] += sentiment**2
                    relations_arr[char1][char2][1][chapter_num] += 1

    character_relations_file_path = os.path.join(characters_data_dir, 'character-relations.npy')
    np.save(character_relations_file_path, relations_arr)

    print("Completed Sentiment Analysis! Computing averages...")

    for char1 in range(relations_arr_avg.shape[0]):
        for char2 in range(relations_arr_avg.shape[1]):
            if char1 == char2: 
                continue

            count = np.sum(relations_arr[char1][char2][1])

            if not count: 
                continue

            relations_arr_avg[char1][char2] = np.sum(relations_arr[char1][char2][0] * relations_arr[char1][char2][1])/count

    character_relations_avg_file_path = os.path.join(characters_data_dir, 'character-relations_avg.npy')
    np.save(character_relations_avg_file_path, relations_arr_avg)

    print("Completed!")


collate_relations(ner_coref_data_dir, characters_data_dir)

100%|██████████| 304/304 [00:10<00:00, 30.12it/s]

Completed Sentiment Analysis! Computing averages...
Completed!





In [None]:
relevant_sentences_file_path = os.path.join(ner_coref_data_dir, 'Part-304-Interlude__End', 'relevant_sentences.csv')

df = pd.read_csv(relevant_sentences_file_path)
row = df.iloc[4]
print(len(json.loads(row['characters'])))
print(row['words'])

1
“I’ve killed before.


In [48]:
character_relations_avg_file_path = os.path.join(characters_data_dir, 'character-relations_avg.npy')

relations_avg_arr = np.load(character_relations_avg_file_path)

In [50]:
main_characters_aliases_file_path = os.path.join(characters_data_dir, 'main_characters_aliases.json')

with open(main_characters_aliases_file_path, 'r') as file: 
    main_char_list = json.load(file)

for i in range(len(main_char_list)):
    print(main_char_list[i], relations_avg_arr[1][i])


['NARRATOR', 'Taylor', 'Taylor Hebert', 'Ms. Hebert', 'Skitter', 'Weaver'] 0.6802271848141603
['Tattletale', 'Lisa'] 0.0
['Grue', 'Brian'] 0.1924605308981884
['Bitch', 'Rachel', 'Rachel Lindt'] 0.208446564142903
['Krouse', 'Francis', 'Trickster'] 0.25602252918608437
['Coil', 'Thomas Calvert', 'Thomas', 'Calvert', 'Director Calvert', 'Commander Calvert'] 0.3073401569893443
['Lung', 'Kenta'] 0.10481878557399392
['Noelle', 'Echidna'] 0.21771046723888987
['Imp', 'Aisha'] 0.2446792563348697
['Regent', 'Alec'] 0.12677195800393445
['Jack', 'Jack Slash'] 0.1490146896076246
['Miss Militia', 'Hannah', 'Hana'] 0.46715924370504053
['Scion', 'the Warrior', 'The golden man', 'the golden man'] 0.20377839468897244
['Chevalier'] 0.16952900319789221
['Bonesaw', 'Riley'] 0.1865560990744677
['Defiant', 'Armsmaster', 'Collin', 'Colin'] 0.3934233644337365
['Amy', 'Amy Dallon', 'Panacea', 'Amelia', 'Ames'] 0.3496088215948188
['Golem', 'Theo', 'Theodore Anders', 'Theodore'] 0.031875
['Weld'] 0.217675438596491