In [None]:
import csv
import json
import os

import pandas as pd


book = "worm"
chapter = "Part-16-Insinuation_2_9.txt"

chapter_dir = os.path.join("output", book, chapter)
chapter_text_file = os.path.join("text", book, chapter)

tokens_df = pd.read_csv(os.path.join(chapter_dir, f"{chapter}.tokens"), delimiter='\t', quoting=csv.QUOTE_NONE)
entities_df = pd.read_csv(os.path.join(chapter_dir, f"{chapter}.entities"), delimiter='\t', quoting=csv.QUOTE_NONE)
quotes_df = pd.read_csv(os.path.join(chapter_dir, f"{chapter}.quotes"), delimiter='\t', quoting=csv.QUOTE_NONE) 

with open(os.path.join(chapter_dir, f"{chapter}.book"), "r") as f:
    book_json = json.load(f)

with open(chapter_text_file, 'r', encoding='utf-8') as f:
    chapter_text = f.read()

### NOTE TO SELF

Search for main characters throughout the whole book first, then do a reference reassignment to top 50-70 most common characters

In [None]:
main_characters = {}

for character in book_json['characters']:
    if character['mentions']['proper']:
        name = character['mentions']['proper'][0]['n']
    elif character['mentions']['common']:
        if character['count'] < 5: 
            continue

        name = character['mentions']['common'][0]['n']
    else:
        if character['count'] < 5: 
            continue

        name = character['mentions']['pronoun'][0]['n']
        if name == 'I': 
            name = "NARRATOR"
    
    main_characters[character['id']] = (name, character['count'])

In [116]:
sentence_info = {
    "words": [],
    "start_token_id": [],
    "end_token_id": [],
    "speaker": [],
    "characters": []
}

num_paragraphs = int(tokens_df.iloc[-1]["paragraph_ID"])
curr_entities_row_idx = 0
entities_row = entities_df.iloc[curr_entities_row_idx]
curr_quotes_row_idx = 0
quotes_row = quotes_df.iloc[curr_quotes_row_idx]

main_characters_coref = main_characters.keys()
    

for paragraph_num in range(num_paragraphs+1):

    paragraph_df = tokens_df[tokens_df["paragraph_ID"] == paragraph_num]

    # Ensure paragraphs are also different
    start_sentence_id = int(paragraph_df.iloc[0]["sentence_ID"])
    end_sentence_id = int(paragraph_df.iloc[-1]["sentence_ID"])

    for sentence_id in range(start_sentence_id, end_sentence_id+1):
        sentence_df = paragraph_df[paragraph_df["sentence_ID"] == sentence_id]

        byte_onset = int(sentence_df.iloc[0]["byte_onset"])
        byte_offset = int(sentence_df.iloc[-1]["byte_offset"])
        sentence = chapter_text[byte_onset:byte_offset]

        start_token_id = int(sentence_df.iloc[0]["token_ID_within_document"])
        end_token_id = int(sentence_df.iloc[-1]["token_ID_within_document"])

        characters = set()
        speaker = []
        
        while entities_row is not None and entities_row['end_token'] <= end_token_id:
            if entities_row['start_token'] >= start_token_id and entities_row['COREF'] in main_characters_coref:
                characters.add(int(entities_row['COREF']))
            
            curr_entities_row_idx += 1
            if curr_entities_row_idx >= len(entities_df): 
                entities_row = None
                break

            entities_row = entities_df.iloc[curr_entities_row_idx]


        while quotes_row is not None and ((
            quotes_row['quote_end'] > start_token_id and quotes_row['quote_end'] <= end_token_id
        ) or (
            quotes_row['quote_start'] >= start_token_id and quotes_row['quote_start'] < end_token_id
        )):
            speaker.append(int(quotes_row['char_id']))

            if quotes_row['quote_end'] > end_token_id:
                break
            
            curr_quotes_row_idx += 1
            if curr_quotes_row_idx >= len(quotes_df):
                quotes_row = None
                break
            
            quotes_row = quotes_df.iloc[curr_quotes_row_idx]


        if not speaker: 
            speaker = None 

        if speaker is None and len(characters) < 2: 
            continue

        sentence_info['words'].append(sentence)
        sentence_info['start_token_id'].append(start_token_id)
        sentence_info['end_token_id'].append(end_token_id)
        sentence_info['characters'].append(list(characters))
        sentence_info['speaker'].append(speaker)

In [117]:
sentences_df = pd.DataFrame.from_dict(sentence_info)
sentences_df

Unnamed: 0,words,start_token_id,end_token_id,speaker,characters
0,"As Brian and I returned to the loft, I felt mo...",2,18,,"[0, 1]"
1,It wasn’t just that I was going to be around B...,19,43,,"[0, 2, 3, 4]"
2,"As she saw me, she scowled, but didn’t say any...",151,164,,"[0, 2]"
3,"Alec grinned as I came back, but I couldn’t de...",165,191,,"[0, 4]"
4,I didn’t know him well enough to guess either ...,192,203,,"[0, 4]"
...,...,...,...,...,...
83,“Better than ever.,2273,2277,[0],[]
84,"I kind of made some friends,” I said.",2278,2288,[0],[0]
85,"My attention caught by the gun, I only barely ...",2337,2352,,"[0, 18]"
86,“What are they like?”,2353,2359,[0],[]


In [None]:
for character in book_json['characters']:
    if character['id'] != 42:
        continue

    print(character)