In [2]:
import json
from collections import Counter

def process_book_file(filename):
  """Load a JSON document from disk and return the parsed object.

  Args:
    filename: Path to a JSON file (in this notebook, a ``.book`` file).

  Returns:
    Whatever ``json.load`` produces for the file's contents.

  Raises:
    OSError: if the file cannot be opened.
    json.JSONDecodeError: if the contents are not valid JSON.
  """
  # Decode explicitly as UTF-8 instead of relying on the platform's locale
  # encoding, which would garble or reject non-ASCII names (e.g. the curly
  # apostrophe in "Rand al’Thor") on systems whose default is not UTF-8.
  with open(filename, 'r', encoding='utf-8') as file:
    return json.load(file)

In [3]:
# Parse the novel's `.book` JSON file into an in-memory Python object.
book_info = process_book_file('./booknlp/output/the_eye_of_the_world.book')

In [4]:
def get_counter_from_dependency_list(dep_list):
    """Count how often each word occurs in a list of dependency tokens.

    Args:
        dep_list: Iterable of dicts, each carrying the token's word under
            the ``"w"`` key.

    Returns:
        collections.Counter mapping word -> number of occurrences.

    Raises:
        KeyError: if a token has no ``"w"`` key.
    """
    # Only the word is needed for counting. The token's "i" field used to be
    # read into an unused local, which made tokens lacking it fail needlessly.
    return Counter(token["w"] for token in dep_list)

def create_character_data(data, printTop):
    """Summarise every named character found in a parsed ``.book`` structure.

    Only characters with at least one proper-name mention are included.

    Args:
        data: Dict with a ``"characters"`` list. Each character dict is read
            for the keys ``"id"``, ``"count"``, ``"g"``, ``"mentions"``
            (with a ``"proper"`` list) and the dependency lists ``"agent"``,
            ``"patient"``, ``"poss"`` and ``"mod"``.
        printTop: How many of the most frequent words to keep per dependency
            list (passed to ``Counter.most_common``; ``None`` keeps all,
            ``0`` keeps none).

    Returns:
        Dict keyed by character id. Each value holds ``id``, ``count``,
        ``max_proper_mention``, ``referential_gender`` and the four lists
        ``possList``, ``agentList``, ``patientList``, ``modList`` of
        ``(count, word)`` tuples.
    """

    def _top_terms(dep_list):
        # (count, word) pairs for the printTop most frequent words in dep_list.
        ranked = get_counter_from_dependency_list(dep_list).most_common(printTop)
        return [(occurrences, word) for word, occurrences in ranked]

    character_data = {}
    for character in data["characters"]:
        proper_mentions = character["mentions"]["proper"]
        # Skip characters that are never referred to by a proper name.
        if len(proper_mentions) == 0:
            continue

        # Reset for every character. Previously this name was only assigned
        # when gender info existed, so a character without it either raised
        # UnboundLocalError or silently inherited the previous character's
        # gender.
        referential_gender = "unknown"
        gender_info = character["g"]
        if gender_info is not None and gender_info != "unknown":
            referential_gender = gender_info["argmax"]

        character_id = character["id"]
        character_data[character_id] = {
            "id": character_id,
            "count": character["count"],
            # "n" of the first proper mention; presumably the most frequent
            # name form — TODO confirm the list is sorted by frequency.
            "max_proper_mention": proper_mentions[0]["n"],
            "referential_gender": referential_gender,
            "possList": _top_terms(character["poss"]),
            "agentList": _top_terms(character["agent"]),
            "patientList": _top_terms(character["patient"]),
            "modList": _top_terms(character["mod"]),
        }

    return character_data

In [5]:
# Build the per-character summary dict, keyed by character id.
# NOTE(review): printTop=0 makes every Counter.most_common(0) call return an
# empty list, so possList/agentList/patientList/modList are all empty here —
# confirm that only id/count/name/gender are needed downstream.
character_data = create_character_data(book_info, 0)

In [7]:
import pandas as pd

# Read the tab-separated `.entities` file into a DataFrame.
df_entities = pd.read_table(
    './booknlp/output/the_eye_of_the_world.entities',
    sep='\t',
)

In [8]:
# For every coreference id, collect the distinct surface strings of entity rows
# tagged cat == 'PER' and prop == 'PROP', then drop repeated entries.
character_set = (
    df_entities
    .loc[lambda frame: (frame['cat'] == 'PER') & (frame['prop'] == 'PROP')]
    .groupby('COREF')['text']
    .unique()
    .drop_duplicates()
)
  

In [9]:
# Keep only coreference ids that have at least 4 distinct surface forms.
filtered_character_set = character_set[character_set.map(len) >= 4]

# Invert the grouping into a flat lookup: surface form -> coreference id.
# If the same surface form occurs under several ids, the last one wins.
character_dict = {
    alias: coref_id
    for coref_id, aliases in filtered_character_set.items()
    for alias in aliases
}

In [10]:
# Show the surface-form -> coreference-id mapping as this cell's output.
character_dict

{'Dragon': 482,
 'the Dragon': 482,
 'The Dragon': 482,
 'the Dragon Reborn': 482,
 'the false Dragon': 482,
 'Dragon Reborn': 482,
 'The Dragon Reborn': 482,
 'Kinslayer': 483,
 'Lews Therin': 483,
 'Lews Therin Kinslayer': 483,
 'the Kinslayer': 483,
 'Rand al’Thor': 494,
 'Rand': 494,
 'Abruptly Rand': 494,
 'Even Rand': 494,
 'Reluctantly Rand': 494,
 'Tiredly Rand': 494,
 'Hastily Rand': 494,
 'Cautiously Rand': 494,
 'Abruptly': 494,
 'Painfully Rand': 494,
 'Casually Rand': 494,
 'Master Rand': 494,
 'Mat Cauthon': 495,
 'Mat': 495,
 'Abruptly Mat': 495,
 'Only Mat': 495,
 'Even Mat': 495,
 'the old Mat': 495,
 'Softly Mat': 495,
 'Hastily Mat': 495,
 'Cauthon': 495,
 'Egwene': 497,
 'Only Egwene': 497,
 'Shadow': 497,
 'the Shadow': 497,
 'Shadow Egwene': 497,
 'The Shadow': 497,
 'Pretty Egwene': 497,
 'Emond': 498,
 'the Emond': 498,
 'The Emond': 498,
 'the Emond ’s Fielders': 498,
 'the Light': 500,
 'The Light': 500,
 'Light': 500,
 'the Children of the Light': 500,
 'the 

In [11]:
import pickle

# Serialise the surface-form -> coreference-id mapping to `character_dict.pkl`
# (relative path, so it lands in the current working directory).
# NOTE: only unpickle this file from a trusted location — pickle.load can
# execute arbitrary code.
with open('character_dict.pkl', 'wb') as f:
  pickle.dump(character_dict, f)