In [2]:
import os

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import spacy
from flair.data import Sentence
from flair.models import SequenceTagger
from spacy import displacy

In [32]:
def load_book_ner(book_dir,book_num,model_name="en_core_web_sm",encoding="utf-8"):
    """Run a spaCy pipeline over one book taken from ``book_dir``.

    Parameters
    ----------
    book_dir : str or path-like
        Directory that holds the book ``.txt`` files.
    book_num : int
        Position of the wanted book in the name-sorted list of ``.txt``
        files found directly inside ``book_dir``.
    model_name : str, optional
        Name of the spaCy pipeline to load (default ``"en_core_web_sm"``).
    encoding : str, optional
        Text encoding used to read the book file (default ``"utf-8"``).

    Returns
    -------
    The object produced by calling the loaded pipeline on the book text.

    Raises
    ------
    IndexError
        If ``book_num`` does not address one of the files found.
    """
    try:
        NER = spacy.load(model_name)
    except OSError:
        # spaCy reports a missing model package as OSError: download it once
        # and retry, instead of re-downloading on every call.
        spacy.cli.download(model_name)
        NER = spacy.load(model_name)

    # scandir yields entries in arbitrary order, so sort to make book_num
    # address the same file on every run; the context manager closes the
    # directory handle.
    with os.scandir(book_dir) as entries:
        book_paths = sorted(
            entry.path for entry in entries
            if entry.is_file() and entry.name.endswith('.txt')
        )

    try:
        book_path = book_paths[book_num]
    except IndexError:
        raise IndexError(
            f"book_num={book_num} is out of range: "
            f"found {len(book_paths)} .txt file(s) in {book_dir!r}"
        ) from None

    # Read inside a context manager so the file handle is always released.
    with open(book_path, encoding=encoding) as book_file:
        book_text = book_file.read()

    return NER(book_text)

In [16]:
def filter_entity(ent_list,character_df):
    """Keep only the entities that match a known character name.

    Parameters
    ----------
    ent_list : list
        Entity strings to screen.
    character_df : pandas.DataFrame
        Table with a ``'Character'`` column and a ``'character_firstname'``
        column; an entity is kept when it equals a value in either one.

    Returns
    -------
    list
        The matching entities, in their original order (duplicates kept).
    """
    # Build the lookup once; the columns do not change while we scan ent_list.
    known_names = set(character_df['Character']) | set(character_df['character_firstname'])
    return [ent for ent in ent_list if ent in known_names]

In [60]:
def sent_entity_df(book_doc,character_df):
    """Tabulate, per sentence, the named entities that are known characters.

    Parameters
    ----------
    book_doc
        Parsed document exposing ``.sents``; each sentence exposes ``.ents``
        whose items have a ``.text`` attribute.
    character_df : pandas.DataFrame
        Character table handed to ``filter_entity``.

    Returns
    -------
    pandas.DataFrame
        Columns ``'sentence'``, ``'entites'`` (spelling kept so existing
        consumers of the frame keep working) and ``'character_entities'``.
        Only sentences with at least one character entity are returned; the
        index keeps each sentence's position in the document.
    """
    rows = [
        {'sentence': sent, 'entites': [ent.text for ent in sent.ents]}
        for sent in book_doc.sents
    ]

    # Naming the columns keeps them present even when the document has no
    # sentences, so the column lookup below cannot raise KeyError.
    frame = pd.DataFrame(rows, columns=['sentence', 'entites'])
    frame['character_entities'] = frame['entites'].apply(
        lambda ents: filter_entity(ents, character_df)
    )

    has_characters = frame['character_entities'].map(len) > 0
    return frame[has_characters]

In [79]:
def relationship_df(window,sent_entity_df):
    """Build a weighted, undirected edge list of character co-occurrences.

    For every integer start label from 0 up to (but excluding) the last index
    label, the rows whose labels lie in ``[start, start + window]`` (capped at
    the last label) are pooled. Consecutive repeats of a character are
    collapsed and each remaining adjacent pair becomes one edge.

    Parameters
    ----------
    window : int
        How far past the start label the pooled slice extends.
    sent_entity_df : pandas.DataFrame
        Frame with a ``'character_entities'`` column of lists, indexed by
        integer labels in increasing order.

    Returns
    -------
    pandas.DataFrame
        Columns ``'source'``, ``'target'`` and ``'value'``; each pair is
        ordered alphabetically and ``'value'`` counts its occurrences. The
        frame is empty (same columns) when no pair is found.
    """
    result_columns = ["source", "target", "value"]
    if len(sent_entity_df) == 0:
        return pd.DataFrame(columns=result_columns)

    last_label = sent_entity_df.index[-1]
    pairs = []
    for start in range(last_label):
        # Honour the caller's window instead of a hard-coded 5.
        stop = min(start + window, last_label)
        # .loc slices by label and includes both end points.
        pooled = [
            name
            for names in sent_entity_df.loc[start:stop, "character_entities"]
            for name in names
        ]

        # Collapse runs of the same character that sit next to each other.
        distinct_run = [
            name for pos, name in enumerate(pooled)
            if pos == 0 or name != pooled[pos - 1]
        ]
        pairs.extend(
            {'source': a, 'target': b}
            for a, b in zip(distinct_run, distinct_run[1:])
        )

    if not pairs:
        return pd.DataFrame(columns=result_columns)

    edges = pd.DataFrame(pairs)
    # Sort within each row so a->b and b->a count as the same edge.
    edges = pd.DataFrame(np.sort(edges.values, axis = 1), columns = edges.columns)
    edges["value"] = 1
    return edges.groupby(["source","target"], sort=False, as_index=False).sum()


In [111]:
def show_relationships(relationship_df,output_file="lotr.html"):
    """Render the character network as an interactive pyvis graph.

    Parameters
    ----------
    relationship_df : pandas.DataFrame
        Edge list with ``'source'``, ``'target'`` and ``'value'`` columns.
    output_file : str, optional
        Name of the HTML file passed to pyvis ``Network.show``
        (default ``"lotr.html"``).

    Returns
    -------
    Whatever ``Network.show`` returns for ``output_file``.
    """
    from pyvis.network import Network
    import community.community_louvain as community_louvain
    import networkx as nx

    # Undirected graph; each edge carries its co-occurrence count as 'value'.
    G = nx.from_pandas_edgelist(relationship_df,
                                source = "source",
                                target = "target",
                                edge_attr = "value",
                                create_using = nx.Graph())

    # Store each node's Louvain partition id under the 'group' attribute.
    communities = community_louvain.best_partition(G)
    nx.set_node_attributes(G, communities, 'group')

    com_net = Network(notebook = True, width="1000px", height="700px", bgcolor='#222222', font_color='white')
    com_net.from_nx(G)
    return com_net.show(output_file)


In [None]:
## Testing the functions

In [50]:
# Character table; filter_entity matches entities against its
# 'Character' and 'character_firstname' columns.
char=pd.read_csv('Characters')

In [54]:
# NOTE(review): `book` is not assigned in any visible cell — presumably the
# document returned by load_book_ner(...); TODO define it before this cell so
# the notebook survives Restart & Run All.
sents=sent_entity_df(book,char)

In [87]:
# Build the weighted edge list with a window of 5.
trial=relationship_df(5,sents)

In [112]:
# Draw the character network from the edge table built above.
show_relationships(trial)