# Named Entity Recognition


In [5]:
import spacy_transformers
import spacy 
from nltk import sent_tokenize

In [6]:
#!python -m spacy download en_core_web_trf

# Load Model 

"en_core_web_trf" model helps with NER's 

In [7]:
def load_model():
    nlp = spacy.load("en_core_web_trf")
    return nlp

In [8]:
nlp_model = load_model()

# Load DataSet

In [9]:
import os 
import sys
import pathlib
folder_path = pathlib.Path().parent.resolve()
sys.path.append(os.path.join(folder_path, '../'))
from utils import load_subtitles_dataset

In [10]:
dataset_path = "../data/Subtitles/"
df = load_subtitles_dataset(dataset_path)

In [11]:
df.head()

Unnamed: 0,episode,script
0,1,"Gold Roger, the King of the Pirates,\n attaine..."
1,2,"Gold Roger, the King of the Pirates,\n attaine..."
2,3,"Gold Roger, the King of the Pirates,\n attaine..."
3,4,"Gold Roger, the King of the Pirates,\n attaine..."
4,5,"Gold Roger, the King of the Pirates,\n attaine..."


In [12]:
sample_script = df.iloc[2]['script']

In [13]:
sample_script

'Gold Roger, the King of the Pirates,\n attained everything this world has to offer.\n The words he uttered just before his death drove people to the seas.\n My treasure? If you want it, you can have it!\n Find it! I left everything this world has to offer there!\n And so men head for the Grand Line in pursuit of their dreams!\n The world has truly entered a Great Pirate Era!\n We\'re going to gather up all our dreams\n and set out in search of something to find\n ONE PIECE!\n Compasses only cause delays\n Delirious with fever, I take the helm\n If the dusty treasure map has been verified, it\'s not a legend!\n When it comes to personal storms,\n simply ride aboard someone else\'s biorhythm\n and pretend it isn\'t there!\n We\'re going to gather up all our dreams\n and set out in search of something to find\n A coin in my pocket, and do you wanna be my friend?\n We are, We are on the cruise!\n We are!\n Fire!\n What are you--?!\n Luffy-san!\n Straw Hat?!\n That doesn\'t work!\n I told 

In [14]:
sentences = sent_tokenize(sample_script)


In [15]:
sentences = sentences[30:200]

In [16]:
sentence = ".".join(sentences)

# Run Model 

In [17]:
doc = nlp_model(sentence)

In [18]:
doc.ents


(Luffy,
 the One Piece!.Here,
 three,
 three,
 Three,
 one,
 Straw Hat,
 the Gum-Gum Fruit,
 Zoro,
 "Axe-Hand,
 Morgan,
 Coby,
 Straw Hat!.Check,
 Navy,
 "Axe-Hand" Morgan!.--Dad!.Kill,
 Morgan,
 the Grand Line!.I,
 Buggy,
 Buggy the Immortal,
 three weeks!.Huh?.You're)

In [19]:
for entity in doc.ents:
    print(entity, entity.label_)

Luffy PERSON
the One Piece!.Here WORK_OF_ART
three CARDINAL
three CARDINAL
Three CARDINAL
one CARDINAL
Straw Hat PERSON
the Gum-Gum Fruit PRODUCT
Zoro PERSON
"Axe-Hand PERSON
Morgan PERSON
Coby PERSON
Straw Hat!.Check PERSON
Navy ORG
"Axe-Hand" Morgan!.--Dad!.Kill PERSON
Morgan PERSON
the Grand Line!.I FAC
Buggy PERSON
Buggy the Immortal PERSON
three weeks!.Huh?.You're DATE


In [26]:
def get_ners_inference(script):
    script_senetence = sent_tokenize(script)

    ner_output = [] 
    for sentence in script_senetence:
        doc = nlp_model(sentence)
        ners = set() 
        for entity in doc.ents:
            if entity.label_ =="PERSON":
                full_name = entity.text
                first_name = entity.text.split(" ")[0]
                first_name = first_name.strip()
                ners.add(first_name)
        ner_output.append(ners)

    return ner_output

In [27]:
df1 = df.head(10)

In [28]:
df1['ners'] = df1['script'].apply(get_ners_inference)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['ners'] = df1['script'].apply(get_ners_inference)


In [29]:
df1['ners'][0]

[{'Gold'},
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 {'Luffy'},
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 {'Coby'},
 set(),
 set(),
 {'Alvida-sama'},
 set(),
 set(),
 {'Alvida-sama'},
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 {'Coby'},
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 {'Alvida'},
 set(),
 {'Coby'},
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 {'Monkey'},
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 {'Coby'},

# Character Network

In [24]:
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
from pyvis.network import Network

In [36]:
def generate_character_network(df):
    windows = 10 
    entity_relationship = [] 

    for row in df['ners']:
        previous_entities_in_window = [] 

        for sentence in row:
            previous_entities_in_window.append(list(sentence))
            previous_entities_in_window = previous_entities_in_window[-windows:]

            #Flatten 2D into 1D list 
            previous_entities_flattened = sum(previous_entities_in_window, [])
            
            for entity in sentence:
                for entity_in_window in previous_entities_flattened:
                    if entity != entity_in_window:
                        entity_relationship.append(sorted([entity, entity_in_window]))

    
    relationship_df = pd.DataFrame({'value': entity_relationship})
    relationship_df['source'] = relationship_df['value'].apply(lambda x: x[0])
    relationship_df['target'] = relationship_df['value'].apply(lambda x: x[1])
    relationship_df = relationship_df.groupby(['source', 'target']).count().reset_index()
    relationship_df = relationship_df.sort_values('value', ascending=False)

    return relationship_df




In [37]:
relationship_df = generate_character_network(df1)

In [38]:
relationship_df

Unnamed: 0,source,target,value
96,Luffy,Zoro,13
37,Buggy,Shanks,9
90,Luffy,Shanks,8
88,Luffy,Morgan,7
86,Luffy,Luffy-san!,7
...,...,...,...
25,Big,Zoro,1
23,Berry,Makino-san,1
21,Beast,Mohji,1
55,Carrot,Pepper,1


In [39]:
relationship_df[relationship_df['value']!=1]

Unnamed: 0,source,target,value
96,Luffy,Zoro,13
37,Buggy,Shanks,9
90,Luffy,Shanks,8
88,Luffy,Morgan,7
86,Luffy,Luffy-san!,7
...,...,...,...
4,"""Axe-Hand""",Zoro,2
22,Beast,Tamer,2
36,Buggy,Roronoa,2
24,Big,Luffy,2


In [40]:
#For Cleaner Purpose
relationship_df = relationship_df.head(90)

In [41]:
G = nx.from_pandas_edgelist(
    relationship_df, 
    source='source', 
    target='target', 
    edge_attr='value',
    create_using=nx.Graph()
)

net = Network(notebook=True, width="1000px", height="700px", bgcolor="#222222", font_color="white", cdn_resources="remote")
node_degree = dict(G.degree)

nx.set_node_attributes(G, node_degree, 'size')
net.from_nx(G)
net.show("luffy_demo.html")

luffy_demo.html
