In [None]:
# Idea: entity recognition -> coreference resolution -> emotion analysis between entities

In [4]:
import ast
import os
import stanza
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle

from divide_chapters import _PREPROCESSED_PATH

In [5]:
# Load every preprocessed chapter into one frame: one row per chapter file,
# with the book taken from the directory name.
chapter_records = []

for book in os.scandir("../text_data/preprocessed"):
    # File names are expected to start with "<number>_"; order chapters numerically.
    chapters = sorted(os.listdir(book.path), key=lambda x: int(x.split('_')[0]))
    for chapter in chapters:
        with open(os.path.join(book.path, chapter), 'r') as chpt_file:
            chapter_records.append({
                "book": book.name,
                "chapter": chapter,
                "text": chpt_file.read(),
            })

# Build the frame once instead of enlarging it one row at a time.
text_df = pd.DataFrame(chapter_records, columns=["book", "chapter", "text"])

# regex=False: '.txt' must be matched literally. Interpreted as a regular
# expression the '.' is a wildcard, and the default of `regex` differs between
# pandas versions.
text_df.chapter = text_df.chapter.str.replace('.txt', '', regex=False)

  text_df.chapter = text_df.chapter.str.replace('.txt', '')


In [6]:
# English Stanza pipeline limited to the tokenizer and the NER tagger.
ner = stanza.Pipeline(lang='en', processors='tokenize,ner')

INFO:stanza:Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| ner       | ontonotes |

INFO:stanza:Use device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: ner
INFO:stanza:Done loading processors!


In [14]:
# Run the NER pipeline on every chapter text and keep the result per row.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, so the
# 'ner' column may never have been populated in that session.
text_df['ner'] = text_df.text.apply(ner)

KeyboardInterrupt: 

In [6]:
# Dump the chapter frame to CSV in the notebook's working directory.
# NOTE(review): any non-scalar column is presumably written as its string
# representation only — confirm before reading this file back.
text_df.to_csv("ner_dataset.csv")

In [117]:
from pathlib import Path

# Persist each chapter's NER result as "<chapter>.pickle" under ../ner_pickle.
# NOTE(review): files are keyed by chapter name only; two books sharing a
# chapter name would overwrite each other — verify names are unique.
Path("../ner_pickle").mkdir(parents=True, exist_ok=True)

# Series.items() is used because Series.iteritems() was removed in pandas 2.0.
for idx, item in text_df.ner.items():
    file_name = os.path.join("../ner_pickle", text_df.chapter[idx] + ".pickle")
    # Opening with "wb" creates the file, so no separate touch() is needed.
    with open(file_name, "wb") as file:
        pickle.dump(item, file)

AttributeError: 'DataFrame' object has no attribute 'ner'

In [28]:
# Reload the pickled per-chapter NER results into a frame.
ner_path = "../ner_pickle"
chapters = os.listdir(ner_path)
# File names are expected to start with "<number>_"; order them numerically.
chapters.sort(key=lambda x: int(x.split('_')[0]))

ner_records = []
for chapter in chapters:
    with open(os.path.join(ner_path, chapter), 'br') as chpt_file:
        # SECURITY: pickle.load can execute arbitrary code; only load files
        # written by this notebook.
        chpt_ner = pickle.load(chpt_file)
    ner_records.append({"chapter": chapter, "ner": chpt_ner})

# Build the frame once instead of enlarging it one row at a time.
df_ner = pd.DataFrame(ner_records, columns=["chapter", "ner"])

# regex=False: strip the literal '.pickle' text; as a regular expression the
# '.' would match any character.
df_ner.chapter = df_ner.chapter.str.replace('.pickle', '', regex=False)

  df_ner.chapter = df_ner.chapter.str.replace('.pickle', '')


In [29]:
# Attach the book name to each NER result by joining on the chapter name.
# NOTE(review): the join key is the chapter name alone — presumably unique
# across books; verify, otherwise rows are duplicated.
data = pd.merge(df_ner, text_df[["chapter", "book"]], left_on="chapter", right_on="chapter")
# Index by (book, chapter) and sort with a custom key, which pandas applies to
# each index level separately. If the first label of a level has "red" before
# its first underscore, that level is ordered by the text after the last
# underscore; otherwise by the integer before the first underscore.
# NOTE(review): the branch is chosen from the first label only and the else
# branch calls int() — looks like this assumes book names start with "red_";
# confirm against the directory names.
data = data.set_index(["book", "chapter"]).sort_index(key=lambda x: x.map(lambda y: y.split('_')[-1]) if x[0].split('_')[0] == "red" else x.map(lambda y: int(y.split('_')[0])))

In [30]:
def _entity_pairs(res):
    """Return (text, type) for every entity of every sentence in one NER result."""
    pairs = []
    for sent in res.sentences:
        for ent in sent.ents:
            pairs.append((ent.text, ent.type))
    return pairs


data_ner = data.ner.apply(_entity_pairs)

In [31]:
# Flatten the per-chapter entity lists into one list of (text, type) pairs.
data_comb = [pair for chapter_pairs in data_ner for pair in chapter_pairs]

In [32]:
# One row per entity mention; the tuples become the default integer columns 0 and 1.
ents_df = pd.DataFrame(data_comb)

In [41]:
# Number of distinct (column 0, column 1) combinations among the mentions.
ents_df.groupby([0,1]).size().index.get_level_values(0).size

2239

In [43]:
# Entity types to keep; rows whose column 1 is not listed are dropped.
filter_list = ['ORG', 'PERSON', 'LOC', 'NORP', 'GPE', 'PRODUCT', 'FAC', 'WORK_OF_ART', 'EVENT']
is_kept_type = ents_df[1].isin(filter_list)
# Mention count per distinct (column 0, column 1) combination.
ents_filtered = ents_df[is_kept_type].groupby([0, 1]).size()
ents_filtered.shape

(1499,)

In [44]:
from Levenshtein import distance as lev

In [45]:
# Unique entity strings, indexed by themselves.
entities = ents_filtered.index.get_level_values(0).unique().to_series()


def _similarity_row(name):
    """Map every entity to its normalised Levenshtein similarity with `name`."""
    return {
        other: 1 - (lev(name, other) / max(len(name), len(other)))
        for other in entities
    }


# All-pairs comparison: one dict of similarities per entity.
ent_lev = entities.map(_similarity_row)

In [46]:
# Expand the per-entity similarity dicts into a square frame:
# rows are indexed like ent_lev, columns come from the dict keys.
ent_lev_matrix = pd.DataFrame([*ent_lev], index=ent_lev.index)

In [47]:
# Keep only near-duplicate pairs: zero out similarities below 0.8 and the
# exact matches (similarity 1, which includes each name paired with itself).
plot_df = ent_lev_matrix.copy()
plot_df[plot_df < .8] = 0
plot_df[plot_df == 1] = 0
# Disabled: would drop rows/columns that contain no remaining non-zero entry.
# plot_df = plot_df.loc[(plot_df != 0).any(axis=1), (plot_df != 0).any(axis=0)]

In [49]:
# Group each entity with the names that stayed non-zero in its column,
# collecting every distinct group once.
sim_sets = []

for name, scores in plot_df.items():
    group = set(scores[scores > 0].index.to_list()) | {name}
    if group in sim_sets:
        continue
    sim_sets.append(group)

print(len(sim_sets))
sim_sets

999


[{'A Gold'},
 {'A Solar System', 'the Solar System', '—a Solar System'},
 {'Academians'},
 {'Academy'},
 {'Achilles'},
 {'Adam'},
 {'Adjudicators'},
 {'Adonis'},
 {'Adriatus'},
 {'Adrius'},
 {'Adrius au Augustus'},
 {'Against their greed And Down in the vale Hear'},
 {'Agea'},
 {'Agoge'},
 {'Agrippina'},
 {'Agrippina au Julii'},
 {'Ai'},
 {'Aja'},
 {'Aja au Grimmus'},
 {'Ajax Minor'},
 {'Alcibiades'},
 {'Alexander'},
 {'Alfrún'},
 {'Alia Snowsparrow'},
 {'Allmother'},
 {'Allmother Death'},
 {'Alone from the Abyss'},
 {'American', 'Americans'},
 {'An Iron Rain', 'an Iron Rain'},
 {'An Olympic Knight',
  'Olympic Knight',
  'an Olympic Knight',
  'my Olympic Knights'},
 {'Andromeda', 'Andromedus'},
 {'Andromeda Galaxy', 'the Andromeda Galaxy'},
 {'Antonia'},
 {'Antonia au Severus'},
 {'Antonia au Severus-Julii'},
 {'Antonius'},
 {'Apollo'},
 {'Apollo Castle'},
 {'Apollonian'},
 {'Apollo’s'},
 {'Apple'},
 {'ArchGovenor Nero au Augustus', 'ArchGovernor Nero au Augustus'},
 {'ArchGovernor',

In [1]:
from allennlp.predictors.predictor import Predictor
import allennlp_models.tagging



In [3]:
# Load the SpanBERT coreference model from the public AllenNLP archive URL.
# NOTE(review): presumably downloads the archive on first use; the recorded run
# of this cell was interrupted.
predictor = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2021.03.10.tar.gz")

KeyboardInterrupt: 

In [25]:
# Coreference prediction per chapter; each row is written as soon as it is
# ready, so rows finished before an interruption stay in the frame.
span_predictions = pd.DataFrame(columns=["book", "chapter", "top_spans", "antecedent_indices", "predicted_antecedents", "document", "clusters"])
# TODO: really slow => use batches
for idx, (book, chapter, text) in text_df[["book", "chapter", "text"]].iterrows():
    prediction = predictor.predict(text)
    row = {"book": book, "chapter": chapter}
    # Copy the prediction fields in the same order as the frame's columns.
    for field in ("top_spans", "antecedent_indices", "predicted_antecedents", "document", "clusters"):
        row[field] = prediction[field]
    span_predictions.loc[idx] = pd.Series(row)

  num_effective_segments = (seq_lengths + self._max_length - 1) // self._max_length
  num_effective_segments = (seq_lengths + self._max_length - 1) // self._max_length


KeyboardInterrupt: 

In [63]:
# Cache the predictions as CSV; the loading cell parses the list-valued
# columns back with ast.literal_eval.
span_predictions.to_csv("spanbert_coref.csv")

In [60]:
import ast

# Reload the cached predictions; the list-valued columns were stored as text
# and are parsed back into Python objects.
span_predictions = pd.read_csv("spanbert_coref.csv", index_col=0)
for list_column in ("top_spans", "antecedent_indices", "predicted_antecedents", "document", "clusters"):
    span_predictions[list_column] = span_predictions[list_column].apply(ast.literal_eval)

In [135]:
# One row per coreference cluster: the chapter it belongs to, the distinct
# surface strings of its mentions and the distinct (start, end) token spans.
cluster_records = []

# Columns are read by name, so the loop does not depend on column order.
for _, prediction in span_predictions.iterrows():
    document = prediction["document"]
    for cluster in prediction["clusters"]:
        # Sorted so the stored lists are reproducible across kernel restarts.
        occurrences = sorted({(entity[0], entity[1]) for entity in cluster})
        # Span ends are inclusive, hence the +1 when slicing the token list.
        entity_names = sorted({' '.join(document[start:end + 1]) for start, end in occurrences})
        cluster_records.append({
            "chapter": prediction["chapter"],
            "entity_names": entity_names,
            "occurrences": occurrences,
        })

# Build the frame once; appending rows through .loc is quadratic in the number
# of clusters.
entity_dict = pd.DataFrame(cluster_records, columns=["chapter", "entity_names", "occurrences"])


In [136]:
# Wrap the similarity sets in a one-column frame so each set gets an integer index.
sim_sets_df = pd.DataFrame({"sim_set": sim_sets})
sim_sets_df

Unnamed: 0,sim_set
0,{A Gold}
1,"{the Solar System, —a Solar System, A Solar Sy..."
2,{Academians}
3,{Academy}
4,{Achilles}
...,...
994,{the green canyon}
995,{the green plains}
996,{the north Metas}
997,{the twelve Olympic Knights}


In [137]:
def find_entity(list1, list2):
    """Return True when the two collections share at least one element."""
    return not set(list1).isdisjoint(list2)

def _matching_sim_set_ids(names):
    """Return the sim_sets_df index values whose set shares a name with `names`."""
    hit_mask = sim_sets_df.sim_set.apply(lambda candidates: find_entity(names, candidates))
    return sim_sets_df.sim_set[hit_mask].index.values


# Look up, for every coreference cluster, the normalised entity sets it overlaps.
# index: cluster row in entity_dict -> value: array of sim_sets_df indices
entity_map = entity_dict.entity_names.apply(_matching_sim_set_ids)

In [138]:
# Keep only the clusters that matched at least one similarity set.
entity_hits = entity_map[entity_map.apply(len) != 0]

In [139]:
# Replace each list of matched set indices by one canonical entity name.
# Only the first matched similarity set is used. Its lexicographically smallest
# member is taken so the label is stable across kernel restarts; picking the
# "first" element of a set depends on string hash randomisation.
entity_hits = entity_hits.apply(lambda x: min(sim_sets_df.sim_set[x[0]]))

In [141]:
# Invert the mapping: for every canonical entity name, list the entity_hits
# index values (cluster rows) that were mapped to it.
entity_hits = pd.DataFrame({"entity": entity_hits.to_list(), "span_idx": entity_hits.index.to_list()}).groupby("entity")["span_idx"].apply(list)

In [143]:
def join_occur(idx_list, entity_table=None):
    """Collect the occurrences of several clusters, grouped by chapter.

    Args:
        idx_list: row labels of the clusters to combine.
        entity_table: frame with `chapter` and `occurrences` columns; defaults
            to the notebook-level `entity_dict`.

    Returns:
        dict mapping each chapter to the list of all occurrences of the given
        clusters in that chapter.
    """
    if entity_table is None:
        entity_table = entity_dict
    occur = {}
    for idx in idx_list:
        chapter = entity_table.chapter.loc[idx]
        # Several clusters can fall in the same chapter: merge their
        # occurrences instead of letting the last cluster overwrite the rest.
        occur.setdefault(chapter, []).extend(entity_table.occurrences.loc[idx])
    return occur

def get_chapters(idx_list):
    """Return the chapter of each cluster row in `idx_list`, in the same order."""
    return [entity_dict.chapter.loc[idx] for idx in idx_list]

# One row per canonical entity: its cluster rows, their occurrences grouped by
# chapter, and the chapter of each cluster.
entity_hits_df = entity_hits.to_frame().assign(
    occurrences=entity_hits.apply(join_occur),
    chapters=entity_hits.apply(get_chapters),
)

In [145]:
# Boolean entity x chapter matrix: True where the chapter name appears in the
# entity's `chapters` list.
t = pd.DataFrame(entity_hits_df.chapters.apply(lambda x: text_df.chapter.apply(lambda y: y in x)))
# Label the columns with the chapter names, in text_df order.
t.columns = text_df.chapter
# Append the matrix and drop the helper columns it was derived from.
# NOTE(review): this rebinds entity_hits_df, so re-running the cell on its own
# will presumably fail because "chapters"/"span_idx" are already gone.
entity_hits_df = entity_hits_df.join(t).drop(["chapters", "span_idx"], axis=1)

In [95]:
# Chapter-level co-occurrence graph: one edge per ordered entity pair that
# shares at least one chapter.
chapter_hits = entity_hits_df.drop("occurrences", axis=1)
# mask = chapter_hits.copy()
# chapter_hits[mask] = 1
# chapter_hits[~mask] = 0
entity_cross_matrix = pd.DataFrame(columns=["source", "target", "weight"])
for idx in chapter_hits.index.to_list():
    for idy in chapter_hits.index.to_list():
        if idx == idy:
            continue
        # weight = number of chapters in which both entities are flagged.
        weight = sum(chapter_hits.loc[idy] & chapter_hits.loc[idx])
        # Skip pairs without a shared chapter, and pairs whose target already
        # appears as `source` in more than 15 recorded rows.
        # NOTE(review): the cap looks like a way to thin the graph — confirm intent.
        if weight < 1 or entity_cross_matrix.source.isin([idy]).sum() > 15:
            continue
        entity_cross_matrix.loc[entity_cross_matrix.shape[0]] = pd.Series({
            "source": idx,
            "target": idy,
            "weight": weight
        })
entity_cross_matrix.to_csv("../entity_hits_1.csv", index=False)
# chapter_hits.to_csv("../entity_hits.csv")

In [96]:
# (number of edges, 3 columns)
entity_cross_matrix.shape

(6093, 3)

In [165]:
# Expand the per-entity {chapter: occurrences} dicts into one column per
# chapter; chapters without an entry for an entity become False.
chpt_occur_df = pd.DataFrame(entity_hits_df.occurrences.apply(pd.Series)).fillna(False)

In [187]:
# Sliding-window parameters, measured in positions of the coref `document` lists.
window_size = 10  # 0=whole chapter, else #words
# NOTE(review): nothing visible in this notebook special-cases window_size == 0;
# with 0 the window test below can never match — confirm before using it.
window_stride = 8
# Length of the longest document; the window loop stops once it reaches this.
max_window_size = span_predictions.document.apply(len).max()

# Start position of the current window; read by in_window and advanced by the loop.
cur_window = 0

def in_window(locations, start=None, size=None) -> bool:
    """Tell whether any occurrence starts inside the window [start, start + size).

    Args:
        locations: iterable of (start, end) positions; a falsy value (empty
            list, or the False fill value) means "no occurrence".
        start: first position of the window; defaults to the notebook-level
            `cur_window`.
        size: window length; defaults to the notebook-level `window_size`.

    Returns:
        True if at least one occurrence begins inside the window.
    """
    if start is None:
        start = cur_window
    if size is None:
        size = window_size
    if not locations:
        return False
    # Half-open interval: the lower bound is inclusive, so an occurrence that
    # starts exactly at the window start (e.g. position 0) is counted.
    return any(start <= loc[0] < start + size for loc in locations)

# One boolean entity x chapter frame per window start, keyed "window_<start>".
# in_window reads the global cur_window, so the loop must advance that variable.
window_df = {}
while cur_window < max_window_size:
    window_key = f"window_{cur_window}"
    window_df[window_key] = chpt_occur_df.applymap(in_window)
    cur_window += window_stride
# window_df = pd.DataFrame(window_df)

In [183]:
# Inspect the flags of the first window (start position 0).
window_df['window_0']

Unnamed: 0_level_0,8_Alliance,36_Lord_of_War,42_Death_of_a_Gold,34_The_Northwoods,7_The_Afterbirth,9_The_Darkness,11_Red,27_Jelly_Beans,33_A_Dance,45_Helldivers,...,43_The_Sea,36_A_Second_Test,17_What_the_Storm_Brings,44_The_Poet,2_The_Township,9_The_Lie,14_Andromedus,4_The_Gift,49_Why_We_Sing,16_The_Game
entity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Achilles,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
Adrius,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
Adrius au Augustus,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
Agea,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
Agrippina,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
the Triumph Mask,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
the Valles Marineris,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
the Victory Armor,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
the green canyon,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [192]:
# Every ordered pair of distinct entities, so both (a, b) and (b, a) appear.
entity_list = [
    (idx, idy)
    for idx in chapter_hits.index.to_list()
    for idy in chapter_hits.index.to_list()
    if idx != idy
]
len(entity_list)

90902

In [199]:
# NOTE(review): entity_cross_list is not used in any cell visible here.
entity_cross_list = []

# Put all window frames side by side: one column per (window, chapter).
# The chapter names therefore repeat as column labels.
t = pd.concat(list(window_df.values()), axis=1)
t.head()

Unnamed: 0_level_0,8_Alliance,36_Lord_of_War,42_Death_of_a_Gold,34_The_Northwoods,7_The_Afterbirth,9_The_Darkness,11_Red,27_Jelly_Beans,33_A_Dance,45_Helldivers,...,43_The_Sea,36_A_Second_Test,17_What_the_Storm_Brings,44_The_Poet,2_The_Township,9_The_Lie,14_Andromedus,4_The_Gift,49_Why_We_Sing,16_The_Game
entity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Achilles,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
Adrius,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
Adrius au Augustus,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
Agea,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
Agrippina,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [200]:
# Replace the column labels with consecutive integers.
t.columns = list(range(t.shape[1]))

In [203]:
# Window-level co-occurrence graph: weight = number of (window, chapter)
# columns of `t` in which both entities are flagged.
columns = list(range(t.shape[1]))  # kept from the original cell; not used below
edge_records = []
seen_sources = set()
for idx, idy in entity_list:
    # entity_list holds both (a, b) and (b, a). Skipping targets that were
    # already used as a source keeps one edge per unordered pair. A set lookup
    # replaces the scan of the growing frame for every pair.
    if idy in seen_sources:
        continue

    # No weight filter here: a count of booleans is never negative, so the
    # former `weight < 0` guard could not trigger. Thresholding is done when
    # the edges are exported.
    weight = int((t.loc[idy] & t.loc[idx]).sum())
    edge_records.append({
        "source": idx,
        "target": idy,
        "weight": weight
    })
    seen_sources.add(idx)

# Build the frame once instead of appending one row per pair.
entity_cross_matrix = pd.DataFrame(edge_records, columns=["source", "target", "weight"])
# entity_cross_matrix.to_csv("../entity_hits_%i.csv" % window_size, index=False)

In [204]:
# Preview the first edges; zero-weight pairs are still present at this stage.
entity_cross_matrix.head()

Unnamed: 0,source,target,weight
0,Achilles,Adrius,1
1,Achilles,Adrius au Augustus,0
2,Achilles,Agea,0
3,Achilles,Agrippina,0
4,Achilles,Agrippina au Julii,0


In [208]:
# Export only the edges whose weight exceeds 10.
entity_cross_matrix[entity_cross_matrix.weight > 10].to_csv("../entity_hits_final_2.csv", index=False)

In [222]:
# NOTE(review): combined_hits_df is not assigned in any cell visible in this
# notebook — presumably it came from a deleted cell; this line will fail on a
# fresh kernel until its definition is restored.
combined_hits_df.to_csv("../combined_hits.csv")