Named Entit recognition

In [1]:
import spacy
from nltk import sent_tokenize

Load model

In [2]:
def load_model():
    nlp = spacy.load("en_core_web_trf")
    return nlp

In [3]:
nlp_model = load_model()

  from .autonotebook import tqdm as notebook_tqdm


load Dataset

In [4]:
import os 
import sys
import pathlib
folder_path = pathlib.Path().parent.resolve()
sys.path.append(os.path.join(folder_path, '../'))
from utils import load_subtitles_datasetnew

In [5]:
dataset_path = "../data/Subtitles/"
df = load_subtitles_datasetnew(dataset_path)

In [6]:
df.head()

Unnamed: 0,episode,start,end,text
0,Naruto Season 1 - 01.ass,5950,11990,"A long time ago, a powerful demon fox\Nappeare..."
1,Naruto Season 1 - 01.ass,12990,13990,"With its powerful tails,"
2,Naruto Season 1 - 01.ass,13990,17120,it could smash mountains\Nand create tidal waves.
3,Naruto Season 1 - 01.ass,17790,22580,A band of Ninjas rose to\Ndefend their village...
4,Naruto Season 1 - 01.ass,22990,25620,We have to wait until\Nthe Fourth Hokage gets ...


In [7]:
sample_script = df.iloc[0]['text']
sample_script

'A long time ago, a powerful demon fox\\Nappeared with nine tails.'

In [8]:
all_text = " ".join(df["text"].tolist())

In [9]:
all_text



In [10]:
sentences = sent_tokenize(all_text)

In [11]:
sentence = ".".join(sentences)

In [12]:
sentence



In [14]:
sentences = sentences[60:90]

Run Model

In [16]:
doc = nlp_model(" ".join(sentences))

In [17]:
doc.ents

(Fourth,
 nine,
 Leaf,
 Ninja Academy,
 Tomorrow,
 Iruka Sensei.,
 Mizuki Sensei,
 three,
 Naruto,
 just one)

In [18]:
for entity in doc.ents:
    print(entity, entity.label_)

Fourth ORDINAL
nine CARDINAL
Leaf PRODUCT
Ninja Academy ORG
Tomorrow DATE
Iruka Sensei. PERSON
Mizuki Sensei PERSON
three CARDINAL
Naruto PERSON
just one CARDINAL


In [19]:
def get_ners_inference(script):
    script_sentences = sent_tokenize(script)
    ners = set()
    
    for sentence in script_sentences:
        doc = nlp_model(sentence)
        for entity in doc.ents:
            if entity.label_ == "PERSON":
                first_name = entity.text.split(" ")[0].strip()
                ners.add(first_name)

    return list(ners)


In [20]:
df = df.head(10)

In [21]:
df

Unnamed: 0,episode,start,end,text
0,Naruto Season 1 - 01.ass,5950,11990,"A long time ago, a powerful demon fox\Nappeare..."
1,Naruto Season 1 - 01.ass,12990,13990,"With its powerful tails,"
2,Naruto Season 1 - 01.ass,13990,17120,it could smash mountains\Nand create tidal waves.
3,Naruto Season 1 - 01.ass,17790,22580,A band of Ninjas rose to\Ndefend their village...
4,Naruto Season 1 - 01.ass,22990,25620,We have to wait until\Nthe Fourth Hokage gets ...
5,Naruto Season 1 - 01.ass,25870,28080,We can't let it get any closer\Nto our village!
6,Naruto Season 1 - 01.ass,31990,38660,One great Ninja was able to\Nimprison the mons...
7,Naruto Season 1 - 01.ass,38990,40990,but died in the process.
8,Naruto Season 1 - 01.ass,41330,47160,This Ninja was known as…\Nthe Fourth Hokage.
9,Naruto Season 1 - 01.ass,64580,65660,Naruto!


In [22]:
df['ners'] = df['text'].apply(get_ners_inference)

In [23]:
df

Unnamed: 0,episode,start,end,text,ners
0,Naruto Season 1 - 01.ass,5950,11990,"A long time ago, a powerful demon fox\Nappeare...",[]
1,Naruto Season 1 - 01.ass,12990,13990,"With its powerful tails,",[]
2,Naruto Season 1 - 01.ass,13990,17120,it could smash mountains\Nand create tidal waves.,[]
3,Naruto Season 1 - 01.ass,17790,22580,A band of Ninjas rose to\Ndefend their village...,[]
4,Naruto Season 1 - 01.ass,22990,25620,We have to wait until\Nthe Fourth Hokage gets ...,[]
5,Naruto Season 1 - 01.ass,25870,28080,We can't let it get any closer\Nto our village!,[]
6,Naruto Season 1 - 01.ass,31990,38660,One great Ninja was able to\Nimprison the mons...,[]
7,Naruto Season 1 - 01.ass,38990,40990,but died in the process.,[]
8,Naruto Season 1 - 01.ass,41330,47160,This Ninja was known as…\Nthe Fourth Hokage.,[]
9,Naruto Season 1 - 01.ass,64580,65660,Naruto!,[Naruto]


Character Network

In [24]:
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
from pyvis.network import Network

In [25]:
def generate_character_network(df):

    windows=10
    entity_relationship = []

    for row in df['ners']:
        previous_entities_in_window = []

        for sentence in row:
            previous_entities_in_window.append(list(sentence))
            previous_entities_in_window = previous_entities_in_window[-windows:]

            # Flatten 2D List into 1D List
            previous_entities_flattened = sum(previous_entities_in_window, [])

            for entity in sentence:
                for entity_in_window in previous_entities_flattened:
                    if entity != entity_in_window:
                        entity_relationship.append(sorted([entity, entity_in_window]))
    
    relationship_df = pd.DataFrame({'value': entity_relationship})
    relationship_df['source'] = relationship_df['value'].apply(lambda x: x[0])
    relationship_df['target'] = relationship_df['value'].apply(lambda x: x[1])
    relationship_df = relationship_df.groupby(['source', 'target']).count().reset_index()
    relationship_df = relationship_df.sort_values('value', ascending=False)

    return relationship_df

In [26]:
relationship_df = generate_character_network(df)

In [27]:
relationship_df

Unnamed: 0,source,target,value
0,N,a,2
1,N,o,2
2,N,r,2
3,N,t,2
4,N,u,2
5,a,o,2
6,a,r,2
7,a,t,2
8,a,u,2
9,o,r,2


In [28]:
relationship_df = relationship_df.sort_values('value', ascending=False)
relationship_df = relationship_df.head(200)

In [29]:
G = nx.from_pandas_edgelist(
    relationship_df, 
    source='source', 
    target='target', 
    edge_attr='value',
    create_using=nx.Graph()
)

net = Network(notebook=True, width="1000px", height="700px", bgcolor="#222222", font_color="white", cdn_resources="remote")
node_degree = dict(G.degree)

nx.set_node_attributes(G, node_degree, 'size')
net.from_nx(G)
net.show("naruto.html")

naruto.html
