In [26]:
import spacy
import yaml
import csv
from pprint import pprint as print
from scipy.stats import wasserstein_distance
import numpy as np
import copy
import pandas as pd
import os

# Load the "en_core_web_sm" spaCy pipeline once at module level; load_data()
# calls it to parse the text of each book.
nlp = spacy.load("en_core_web_sm")
# Tip: spacy.explain('GPE') returns a description of a spaCy label abbreviation.

In [27]:
def add_to_dict(doc, dict):
    """Record the start offset of every PERSON entity found in ``doc``.

    Args:
        doc: Parsed document exposing ``ents``; each entity provides
            ``text``, ``label_`` and ``start_char``.
        dict: Mapping of entity text to a list of start offsets. It is
            updated in place.

    Returns:
        The same mapping object that was passed in.
    """
    people = (entity for entity in doc.ents if entity.label_ == 'PERSON')
    for person in people:
        # First sighting of a name creates its list; later ones extend it.
        dict.setdefault(person.text, []).append(person.start_char)
    return dict

In [28]:
def load_data(doc):
    """Read a book from disk and parse it with spaCy in two chunks.

    The text is split roughly in half so that each chunk fits spaCy's
    capacity. The split point is moved forward from the exact midpoint to
    the next whitespace character so that no word (and therefore no
    character name) is cut in two at the seam.

    Args:
        doc: Path of the text file to read.

    Returns:
        A tuple ``(doc1, doc2)`` holding the parsed first and second chunk.
        Concatenating the two chunk texts gives back the whole file.
    """
    with open(doc) as f:
        data = f.read()

    # Start at the midpoint and advance to the next whitespace. If there is
    # none, the whole text ends up in the first chunk and the second is empty.
    split_at = len(data) // 2
    while split_at < len(data) and not data[split_at].isspace():
        split_at += 1

    doc1 = nlp(data[:split_at])
    doc2 = nlp(data[split_at:])

    return doc1, doc2

In [29]:
def get_most_common_names(dict):
    """Return the (at most) ten names with the most recorded positions.

    Args:
        dict: Mapping of name to a list of positions.

    Returns:
        A new dict, ordered from most to fewest positions, containing up to
        ten entries. Names with equally long lists keep their original
        relative order. The position lists are the same objects as in the
        input mapping.
    """
    # Sorting on the negated length gives a stable descending order.
    by_frequency = sorted(dict.items(), key=lambda pair: -len(pair[1]))
    return {name: positions for name, positions in by_frequency[:10]}

In [30]:
def save_indexes(book, dict):
    """Write the position index of a book to ``./indexes/<book>.csv``.

    One line is written per entry, formatted as ``"<name>, <positions>"``,
    where ``<positions>`` is the ``%s`` rendering of the stored value.

    Args:
        book: Base name of the output file (without extension).
        dict: Mapping of name to its positions.
    """
    # Create the output folder on first use instead of failing with
    # FileNotFoundError after the book has already been processed.
    os.makedirs('./indexes', exist_ok=True)

    with open(f'./indexes/{book}.csv', 'w') as f:
        for name, positions in dict.items():
            f.write("%s, %s\n" % (name, positions))

In [35]:
#TODO: REFACTOR?
def distance(character1, character2, novel_length, t):
    """Wasserstein distance between two characters' appearance positions.

    The longer position list is first thinned to the length of the shorter
    one: going through the shorter list in order, each position is greedily
    paired with its nearest not-yet-used position from the longer list
    (the earliest candidate wins ties). Both equally long lists are then
    divided by ``novel_length``, raised to the power ``1 + t`` and passed
    to ``scipy.stats.wasserstein_distance``.

    Neither input list is modified.

    Args:
        character1: Positions of the first character.
        character2: Positions of the second character.
        novel_length: Value the positions are divided by.
        t: Exponent offset; ``0`` leaves the normalised positions unchanged.

    Returns:
        The Wasserstein distance between the two transformed position sets.
    """
    # The longer list supplies the candidates, the shorter one the targets.
    if len(character1) < len(character2):
        candidates, targets = character2, character1
    else:
        candidates, targets = character1, character2

    # A set keeps the "already matched" lookup O(1) inside the inner loop.
    used = set()
    matched = []
    # An unused candidate always exists, because candidates is at least as
    # long as targets and each target consumes exactly one candidate.
    nearest = None

    for target in targets:
        best_gap = float('inf')
        for j, candidate in enumerate(candidates):
            if j in used:
                continue
            gap = abs(candidate - target)
            if gap < best_gap:
                best_gap = gap
                nearest = j
        matched.append(candidates[nearest])
        used.add(nearest)

    matched.sort()

    # Normalise, then warp with the exponent.
    exponent = 1 + t
    matched_scaled = [(position / novel_length) ** exponent for position in matched]
    targets_scaled = [(position / novel_length) ** exponent for position in targets]

    return wasserstein_distance(matched_scaled, targets_scaled)

In [32]:
def calculate_distances(appearances, novel_length):
    """Build pairwise character distance matrices for three values of ``t``.

    The matrices are sized from ``len(appearances)`` rather than a fixed
    10, so a book with fewer detected characters no longer raises
    ``IndexError``.

    Args:
        appearances: One list of positions per character.
        novel_length: Passed through to ``distance`` for normalisation.

    Returns:
        A tuple ``(distances_t0, distances_tminus, distances_tplus)`` of
        square NumPy arrays computed with ``t = 0``, ``t = -0.1`` and
        ``t = 0.1`` respectively. Entry ``[i][j]`` is the distance between
        ``appearances[i]`` and ``appearances[j]``.
    """
    n = len(appearances)
    distances_t0 = np.zeros((n, n))
    distances_tminus = np.zeros((n, n))
    distances_tplus = np.zeros((n, n))

    # TODO: avoid recomputing both halves of the matrix.
    # NOTE(review): distance() only swaps its arguments when the first list
    # is shorter, so equal-length pairs may not be exactly symmetric; verify
    # before mirroring entries.
    for i in range(n):
        for j in range(n):
            first, second = appearances[i], appearances[j]
            distances_t0[i][j] = distance(first, second, novel_length, 0)
            distances_tminus[i][j] = distance(first, second, novel_length, -0.1)
            distances_tplus[i][j] = distance(first, second, novel_length, 0.1)

    return distances_t0, distances_tminus, distances_tplus

In [33]:
def save_distances(book, distances_t0, distances_tplus, distances_tminus, characters):
    """Write the three distance matrices of a book as labelled CSV files.

    Files are written to ``./distances/`` as ``<book>_t0.csv``,
    ``<book>_tminus1.csv`` and ``<book>_tplus1.csv``. Rows and columns are
    labelled with ``characters``.

    Args:
        book: Base name used for the output files.
        distances_t0: Square matrix for ``t0``.
        distances_tplus: Square matrix written to the ``_tplus1`` file.
        distances_tminus: Square matrix written to the ``_tminus1`` file.
        characters: Labels for both axes, in matrix order.
    """
    # Create the output folder on first use instead of failing in to_csv.
    os.makedirs("./distances", exist_ok=True)

    outputs = (
        ("_t0.csv", distances_t0),
        ("_tminus1.csv", distances_tminus),
        ("_tplus1.csv", distances_tplus),
    )
    for suffix, matrix in outputs:
        frame = pd.DataFrame(matrix, index=characters, columns=characters)
        frame.to_csv("./distances/" + book + suffix)

In [37]:
# Process every file in the books folder: find the most frequently mentioned
# PERSON names, compute pairwise distances between their appearance
# positions, and save both the distances and the position index.
folder_path = './books/'

for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)
    if not os.path.isfile(file_path):
        continue

    # Drop the extension (e.g. ".txt") to get the name used for output files.
    book = os.path.splitext(filename)[0]

    # Skip books that were already processed.
    if os.path.exists(f"./indexes/{book}.csv"):
        continue

    doc1, doc2 = load_data(file_path)

    # Entity offsets are relative to the start of each half. Shift the second
    # half by the length of the first so all positions refer to the whole book.
    second_half_offset = len(doc1.text)
    name_positions = add_to_dict(doc1, {})
    for name, positions in add_to_dict(doc2, {}).items():
        name_positions.setdefault(name, []).extend(
            position + second_half_offset for position in positions
        )

    names_dict = get_most_common_names(name_positions)
    characters = list(names_dict.keys())
    appearances = list(names_dict.values())

    # Positions are character offsets, so normalise by the character count;
    # len(doc) would count tokens instead.
    novel_length = len(doc1.text) + len(doc2.text)

    distances_t0, distances_tminus, distances_tplus = calculate_distances(appearances, novel_length)
    save_distances(book, distances_t0, distances_tplus, distances_tminus, characters)

    # Written last: the index file marks the book as processed (see the check
    # above), so it must only appear once the distances were saved.
    save_indexes(book, names_dict)