In [1]:
import os
import re
from pathlib import Path

import community as community_louvain
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import spacy
from pyvis.network import Network
from spacy import displacy

In [None]:
# Load spaCy's small English pipeline. The model has to be installed ahead of
# time (e.g. `python -m spacy download en_core_web_sm`) for this load to work.
NER = spacy.load("en_core_web_sm")

### Get the Books

In [None]:
# Books are expected in a "books" folder inside the current working directory
cwd = Path.cwd()
bookdir = cwd / 'books'

# Recursively collect every regular file under bookdir. The result is sorted
# because glob order depends on the filesystem, and later cells pick
# booklist[0]; sorting makes that choice reproducible.
p = bookdir.glob('**/*')
booklist = sorted(x for x in p if x.is_file())

print(booklist)

In [None]:
# Work on the first book only
book = booklist[0]
# Read inside a context manager so the file handle is closed again
# (the previous open(book).read() left it open).
with open(book) as book_file:
    book_text = book_file.read()
# Run the spaCy pipeline over the whole text; the cells below iterate over
# the sentences and entities of the resulting document.
book_doc = NER(book_text)

In [None]:
# Render the named entities found in the first 200 tokens of the book inline
# in the notebook, using displacy's entity ("ent") style
displacy.render(book_doc[0:200], style="ent", jupyter=True)

### Import Character List and Clean Up

In [None]:
# Load the character table from CSV. The cleanup cell below reads its
# 'character' column, so that column must be present in the file.
characters = pd.read_csv('./characters/characters.csv')
characters

In [None]:
# Remove bracketed annotations from each name. The pattern is a raw string so
# "\(" and "\)" reach the regex engine without Python's invalid-escape
# warning; strip() then drops the whitespace left behind where the brackets
# were, which would otherwise stop the full name from matching an entity.
# The vectorised .str methods also leave missing values untouched instead of
# raising like the per-row lambda did.
characters['character'] = (
    characters['character']
    .str.replace(r"[\(].*?[\)]", "", regex=True)
    .str.strip()
)
# If name is split, ie Geralt of Rivia, get only first part
characters['character_firstname'] = characters['character'].str.split(' ', n=1).str[0]
# Make exception for "Two Tusks", because first name of "Two" causes problems
characters['character_firstname'] = characters['character_firstname'].replace(['Two'], 'Two Tusks')
characters.tail(30)

### Get Named Entity List Per Sentence

In [None]:
# One record per sentence of the parsed book: the sentence itself plus the
# text of every named entity found inside it.
sentence_entities = pd.DataFrame(
    [
        {"sentence": sent, "entities": [entity.text for entity in sent.ents]}
        for sent in book_doc.sents
    ]
)

In [None]:
# Inspect the per-sentence entity table
sentence_entities

### Filter entities down to known characters

In [None]:
def filter_entity(entity_list, characters):
    """Keep only the entities that name a known character.

    Args:
        entity_list: Iterable of entity strings to check.
        characters: DataFrame with 'character' and 'character_firstname'
            columns holding the accepted full names and first names.

    Returns:
        A list with the entries of ``entity_list`` found in either column,
        in their original order (duplicates are kept).
    """
    # Build the lookup once per call; the previous version rebuilt both
    # column lists for every single entity.
    known_names = set(characters.character) | set(characters.character_firstname)
    return [ent for ent in entity_list if ent in known_names]

In [None]:
# Quick sanity check of the filter on a hand-written list of candidate entities
filter_entity(["Geralt", "Thu", "Ciri", "Nenneke", "Two"], characters)

### Apply filter function to dataframe

In [None]:
# For every sentence keep only the entities that are known characters
sentence_entities['character_entities'] = sentence_entities['entities'].apply(lambda x: filter_entity(x, characters))

# Drop sentences that mention no character. The explicit copy makes the
# result an independent frame, so overwriting its columns later does not
# trigger SettingWithCopyWarning or depend on sentence_entities.
sentence_entities_filtered = sentence_entities[sentence_entities['character_entities'].map(len) > 0].copy()

In [None]:
# Reduce every matched name to the character's first name so that a full name
# and its first name end up as the same graph node. Names already listed in
# characters['character_firstname'] are kept whole: blindly taking the first
# word would turn the deliberate "Two Tusks" entry back into "Two".
known_first_names = set(characters['character_firstname'])


def _to_first_name(name):
    """Return `name` unchanged if it is a listed first name, else its first word."""
    if name in known_first_names:
        return name
    words = name.split()
    return words[0] if words else name


# assign() builds a new frame instead of writing into what may be a slice of
# sentence_entities, which avoids SettingWithCopyWarning.
sentence_entities_filtered = sentence_entities_filtered.assign(
    character_entities=sentence_entities_filtered['character_entities'].apply(
        lambda names: [_to_first_name(name) for name in names]
    )
)


In [None]:
# Inspect the sentences that mention at least one character
sentence_entities_filtered

### Create rolling windows throughout the text and do analysis

In [None]:
# Number of sentence labels to look ahead from each window start. Label
# slicing with .loc is inclusive at both ends, so a window covers the labels
# i .. i + window_size.
window_size = 5
relationships = []

# The filtered frame keeps the row labels of sentence_entities, so windows
# are measured in sentences of the full text. An empty frame has no last
# label and simply yields no relationships instead of raising IndexError.
last_index = sentence_entities_filtered.index[-1] if len(sentence_entities_filtered) else 0

for i in range(last_index):
    # Use window_size here; the bound used to be hard-coded as i+5 and
    # silently ignored the setting above.
    end_index = min(i + window_size, last_index)
    window = sentence_entities_filtered.loc[i:end_index].character_entities
    # Flatten the per-sentence lists in a single pass (sum(lists, []) copies
    # the accumulated list again for every sentence).
    character_list = [name for names in window for name in names]

    # Collapse runs of the same character so that a name repeated back to
    # back is not paired with itself.
    unique_characters = [name for pos, name in enumerate(character_list)
                         if pos == 0 or name != character_list[pos - 1]]

    # Link each character to the next one mentioned within the window
    for a, b in zip(unique_characters, unique_characters[1:]):
        relationships.append({"source": a, "target": b})

In [None]:
# Turn the list of {"source", "target"} records collected by the rolling
# window into a DataFrame, one row per record
relationships = pd.DataFrame(relationships)
relationships

### Aggregate the relationships

In [None]:
# Sort the values within each row (np.sort along axis=1) so that a pair is
# always stored in the same order: (a, b) and (b, a) become the same
# source/target combination before they are counted.

relationships = pd.DataFrame(np.sort(relationships.values, axis=1), columns = relationships.columns)
relationships

In [None]:
# Give every row a weight of 1, then add the weights up per
# (source, target) pair to get how often each pair occurs
relationships = (
    relationships
    .assign(value=1)
    .groupby(["source", "target"], sort=False, as_index=False)
    .sum()
)

In [None]:
# Inspect the aggregated pair counts
relationships

### Part 3: Graph visualization with networkx

In [None]:
# Build an undirected graph from the aggregated pairs: each row becomes an
# edge between its 'source' and 'target', and the row's 'value' is stored as
# an attribute on that edge.

G = nx.from_pandas_edgelist(relationships,
                            source="source",
                            target="target",
                            edge_attr="value",
                            create_using=nx.Graph())

In [None]:
# Static overview plot; node positions come from the Kamada-Kawai layout
plt.figure(figsize=(10,10))
pos = nx.kamada_kawai_layout(G)

# NOTE(review): edge_cmap is passed without an edge_color sequence — confirm
# whether the edges were meant to be coloured by their 'value'.
nx.draw(G, with_labels=True, node_color='skyblue', edge_cmap=plt.cm.Blues, pos=pos)
plt.show()

### Use PyVis instead

In [None]:
# Network is imported with the other dependencies at the top of the notebook
net = Network(notebook=True, width="1000px", height="700px", bgcolor="#222222", font_color="white")

# Store each node's degree in G as its "size" attribute so better-connected
# characters are drawn larger
node_degree = dict(G.degree)
nx.set_node_attributes(G, node_degree, "size")

# Hand the networkx graph (including the "size" attribute) to PyVis and
# write the interactive view to witcher.html
net.from_nx(G)
net.show("witcher.html")

### More fun analysis, attempting to find most important characters

In [None]:
# Degree centrality of every node in G, as a {node: score} mapping
degree_dict = nx.degree_centrality(G)
degree_dict

In [None]:
# One row per node, with its degree centrality in the 'centrality' column
degrees = pd.DataFrame.from_dict(degree_dict, orient="index", columns=['centrality'])
# Plot top 10
plt.style.use('dark_background') #For dark backgrounds
# head(10) takes ten rows; the earlier [0:9] slice stopped after nine.
degrees.sort_values('centrality', ascending=False).head(10).plot(kind="bar")

In [None]:
# Closeness centrality of every node, as a {node: score} mapping
closeness_dict = nx.closeness_centrality(G)
closeness = pd.DataFrame.from_dict(closeness_dict, orient='index', columns=['centrality'])
# Plot the top 10; head(10) takes ten rows, the earlier [0:9] slice stopped after nine
closeness.sort_values('centrality', ascending=False).head(10).plot(kind="bar")

In [None]:
# Betweenness centrality of every node, as a {node: score} mapping
betweenness_dict = nx.betweenness_centrality(G)
betweenness = pd.DataFrame.from_dict(betweenness_dict, orient='index', columns=['centrality'])
# Plot the top 10; head(10) takes ten rows, the earlier [0:9] slice stopped after nine
betweenness.sort_values('centrality', ascending=False).head(10).plot(kind="bar")

In [None]:
# Store each centrality score on the graph's nodes under its own attribute name

for attribute_name, scores in (
    ('degree_centrality', degree_dict),
    ('closeness_centrality', closeness_dict),
    ('betweenness_centrality', betweenness_dict),
):
    nx.set_node_attributes(G, scores, attribute_name)

#### Community Detection

In [None]:
# Louvain community detection is randomised; a fixed random_state yields the
# same partition on every run so the results are reproducible.
# NOTE(review): the edge counts are stored under 'value', while best_partition
# reads the 'weight' attribute by default, so the counts are not used here —
# pass weight='value' if they should influence the partition.
communities = community_louvain.best_partition(G, random_state=42)

In [None]:
# Inspect the partition returned by best_partition
communities

In [None]:
# Store each node's community on the graph under the 'group' attribute
# (presumably so PyVis colours nodes by community — confirm)
nx.set_node_attributes(G, communities, 'group')

In [None]:
# Second PyVis view, built from G after the 'group' attribute was added, and
# written to witcher_communities.html
com_net = Network(notebook = True, width="1000px", height="700px", bgcolor='#00000F', font_color='white')
com_net.from_nx(G)
com_net.show("witcher_communities.html")

### Misc

### Find this "two"

In [None]:
# Re-display the character table while looking for the "Two" entry
characters

In [None]:
# Characters whose name contains "two", ignoring case
find_two_chars = characters.loc[characters['character'].str.contains('Two', case=False)]
find_two_chars

In [None]:
# Relationship rows where "two" (ignoring case) appears on the source or on
# the target side. Only the source-side matches are displayed by this cell.
find_two_src = relationships.loc[relationships['source'].str.contains('Two', case=False)]
find_two_tar = relationships.loc[relationships['target'].str.contains('Two', case=False)]
find_two_src

In [None]:
# Print every entry directly inside bookdir (not recursive)
for p in Path(bookdir).iterdir():
    print(p)

In [None]:
# Only the .txt files directly inside bookdir. Entries are full paths rather
# than bare file names so they can be opened from any working directory, and
# the list is sorted because os.listdir returns names in arbitrary order.
booklist = sorted(bookdir / x for x in os.listdir(bookdir) if x.endswith(".txt"))
booklist

In [None]:
# Pick the first entry of booklist again, then display the text read earlier
# (book_text is not re-read from `book` in this cell)
book = booklist[0]
book_text