In [1]:
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
import networkx as nx

In [2]:
!python3 -m spacy download en_core_web_sm

Defaulting to user installation because normal site-packages is not writeable
Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [3]:
# Load spacy English languague model
NER = spacy.load("en_core_web_sm")

text = open('./data/burial-rites.txt').read()
doc = NER(text)

Load character names

In [22]:
character_df = pd.read_csv("characters.csv")

Find and filter all entities in book for characters only

In [23]:
sent_entity_df = []

# Loop through sentences, store named entity list for each sentence
for sent in doc.sents:
    entity_list = [ent.text for ent in sent.ents]
    sent_entity_df.append({"sentence": sent, "entities": entity_list})
    
sent_entity_df = pd.DataFrame(sent_entity_df)

In [6]:
sent_entity_df # Entities in every sentence

Unnamed: 0,sentence,entities
0,"(, About, Burial, Rites, \n)",[Burial Rites]
1,"(In, northern, Iceland, ,, 1829, ,, Agnes, Mag...","[Iceland, 1829, Agnes Magnúsdóttir, two]"
2,"(Agnes, is, sent, to, wait, out, the, time, le...","[Agnes, Jón Jónsson, two]"
3,"(Horrified, to, have, a, \n, convicted, murder...",[Agnes]
4,"(Only, \n, Tóti, ,, the, young, assistant, rev...","[Tóti, Agnes]"
5,"(As, the, \n, summer, months, fall, away, to, ...","[the\nsummer months fall, winter, Agnes]"
6,"(And, as, the, days, to, her, execution, draw,...","[the days, Burial Rites]"
7,"(In, beautiful, ,, cut, -, glass, prose, ,, \n...","[Hannah Kent, Iceland, one]"
8,"(‘, Burial, Rites, is, an, accomplished, gem, ...",[]
9,"(GERALDINE, BROOKS, \n, ‘, So, gripping, I, wa...",[BROOKS\n‘]


In [7]:
# Function to filter out non-character entities
def filter_entity(ent_list, character_df):
    return [ent for ent in ent_list 
            if ent in list(character_df.alias) 
            or ent in list(character_df.character_firstname)]

In [21]:
sent_entity_df['character_entities'] = sent_entity_df['entities'].apply(lambda x: filter_entity(x, character_df))

# Filter out sentences that don't have any character entities
sent_entity_df_filtered = sent_entity_df[sent_entity_df['character_entities'].map(len) > 0]

In [9]:
# Take only first name of characters
sent_entity_df_filtered['character_entities'] = sent_entity_df_filtered['character_entities'].apply(lambda x: [item.split()[0] 
                                                                                                               for item in x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sent_entity_df_filtered['character_entities'] = sent_entity_df_filtered['character_entities'].apply(lambda x: [item.split()[0]


In [10]:
pd.reset_option('^display.', silent=True)
sent_entity_df_filtered

Unnamed: 0,sentence,entities,character_entities
2,"(Agnes, is, sent, to, wait, out, the, time, le...","[Agnes, Jón Jónsson, two]",[Agnes]
3,"(Horrified, to, have, a, \n, convicted, murder...",[Agnes],[Agnes]
4,"(Only, \n, Tóti, ,, the, young, assistant, rev...","[Tóti, Agnes]","[Tóti, Agnes]"
5,"(As, the, \n, summer, months, fall, away, to, ...","[the\nsummer months fall, winter, Agnes]",[Agnes]
43,"(The, Land, Court, judge, convicted, Pétur, on...","[Land Court, Pétur, 5th, February last year, f...",[Pétur]
...,...,...,...
8504,"(All, place, names, used, in, this, novel, are...","[Agnes, this day]",[Agnes]
8505,"(Many, known, and, established, facts, about, ...",[Agnes],[Agnes]
8506,"(The, \n, family, at, the, farm, of, Kornsá, d...","[Kornsá, Agnes, Stóra-Borg, Agnes, Þórvarður J...","[Agnes, Agnes]"
8507,"(The, nature, of, their, relationship, ,, incl...","[first, Agnes]",[Agnes]


Create relationships

In [11]:
window_size = 5
relationships = []

for i in range(sent_entity_df_filtered.index[-1]):
    end_i = min(i+5, sent_entity_df_filtered.index[-1])
    char_list = sum((sent_entity_df_filtered.loc[i: end_i].character_entities), [])
    
    # Remove duplicated characters that are next to each other
    char_unique = [char_list[i] for i in range(len(char_list)) 
                   if (i==0) or char_list[i] != char_list[i-1]]
    
    if len(char_unique) > 1:
        for idx, a in enumerate(char_unique[:-1]):
            b = char_unique[idx + 1]
            relationships.append({"source": a, "target": b})

In [12]:
relationship_df = pd.DataFrame(relationships)

In [13]:
pd.set_option('display.max_rows', None)
relationship_df

Unnamed: 0,source,target
0,Agnes,Tóti
1,Tóti,Agnes
2,Agnes,Tóti
3,Tóti,Agnes
4,Agnes,Tóti
5,Tóti,Agnes
6,Agnes,Tóti
7,Tóti,Agnes
8,Tóti,Agnes
9,Pétur,Natan


In [14]:
# Sort the cases with a->b and b->a
relationship_df = pd.DataFrame(np.sort(relationship_df.values, axis = 1), columns = relationship_df.columns)
relationship_df

Unnamed: 0,source,target
0,Agnes,Tóti
1,Agnes,Tóti
2,Agnes,Tóti
3,Agnes,Tóti
4,Agnes,Tóti
5,Agnes,Tóti
6,Agnes,Tóti
7,Agnes,Tóti
8,Agnes,Tóti
9,Natan,Pétur


In [15]:
relationship_df["value"] = 1
relationship_df = relationship_df.groupby(["source","target"], sort=False, as_index=False).sum()

In [20]:
# Create a graph from a pandas dataframe
G = nx.from_pandas_edgelist(relationship_df, 
                            source = "source", 
                            target = "target", 
                            edge_attr = "value", 
                            create_using = nx.Graph())

In [19]:
# %%
from pyvis.network import Network
net = Network(notebook = True, width="1000px", height="700px", bgcolor='#222222', font_color='white')

node_degree = dict(G.degree)

#Setting up node size attribute
nx.set_node_attributes(G, node_degree, 'size')

net.from_nx(G)
net.show("br.html")

br.html


Most important characters in Burial Rites

In [18]:
# Degree centrality
degree_dict = nx.degree_centrality(G)
degree_dict

{'Agnes': 0.8684210526315789,
 'Tóti': 0.7105263157894737,
 'Natan': 0.6578947368421052,
 'Pétur': 0.23684210526315788,
 'Jón': 0.47368421052631576,
 'Margrét': 0.6052631578947368,
 'Steina': 0.3421052631578947,
 'Lauga': 0.3421052631578947,
 'Kristín': 0.2631578947368421,
 'Ingibjörg': 0.21052631578947367,
 'Jóas': 0.23684210526315788,
 'Inga': 0.21052631578947367,
 'Páll': 0.13157894736842105,
 'Róslín': 0.2894736842105263,
 'Fridrik': 0.39473684210526316,
 'Gudmundur': 0.2894736842105263,
 'Björn': 0.21052631578947367,
 'Sigga': 0.42105263157894735,
 'Thorvardur': 0.18421052631578946,
 'Sigrídur': 0.10526315789473684,
 'Haukur': 0.05263157894736842,
 'Gudrún': 0.15789473684210525,
 'Dagga': 0.10526315789473684,
 'Bjarni': 0.18421052631578946,
 'Helga': 0.10526315789473684,
 'Magnús': 0.13157894736842105,
 'Ingveldur': 0.07894736842105263,
 'Kjartan': 0.23684210526315788,
 'Gudbjörg': 0.15789473684210525,
 'Ragnar': 0.05263157894736842,
 'Rósa': 0.23684210526315788,
 'Karitas': 0.210