# I. Import libraries

In [1]:
import spacy
from nltk import sent_tokenize

# import subprocess
import os
import sys
import pathlib
# Path() is the current working directory and the parent of "." is still ".",
# so folder_path resolves to the absolute path of the directory the kernel runs in.
folder_path = pathlib.Path().parent.resolve()
# Put that directory's parent on sys.path so the `utils` import just below resolves
# (assumes `utils` lives one level above the working directory — verify against the repo layout).
sys.path.append(os.path.join(folder_path,'../'))
from utils import load_subtitles_dataset

import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
from pyvis.network import Network

In [2]:
# !python -m spacy download en_core_web_trf

# II. Load models

In [3]:
def load_model(model_name='en_core_web_trf'):
    """Load a spaCy pipeline and return it.

    Args:
        model_name: Name of the installed spaCy pipeline to load. Defaults to
            'en_core_web_trf', the model this notebook downloads and uses.

    Returns:
        Whatever ``spacy.load(model_name)`` returns (the loaded pipeline).
    """
    return spacy.load(model_name)

In [4]:
# Load the pipeline once; generate_ner_inference further down reads it as the global `spacy_model`.
spacy_model = load_model()

  from .autonotebook import tqdm as notebook_tqdm


# III. Load Dataset

In [5]:
# Subtitle folder, given relative to the directory the kernel runs in.
dataset_path = "../data/Subtitles"

In [6]:
# load_subtitles_dataset comes from the project's `utils` module; the preview below
# shows the resulting frame has `episode` and `script` columns.
df = load_subtitles_dataset(dataset_path)

In [7]:
df.head()

Unnamed: 0,episode,script
0,94,We are Fighting Dreamers aiming high\n Fightin...
1,80,We are Fighting Dreamers aiming high\n Fightin...
2,32,"Press down hard on the gas\n That’s right, the..."
3,185,"Rock away your existence,\n Shouting that you ..."
4,191,"Rock away your existence,\n Shouting that you ..."


In [8]:
# Take the script of the first row as a sample to try the NER pipeline on.
sample_script = df.iloc[0]['script']
sample_script

'We are Fighting Dreamers aiming high\n Fighting Dreamers don\'t care what people think about them\n Fighting Dreamers follow what they believe\n Oli Oli Oli Oh! Just go my way\n Right here right now (Bang) Hit it straight like a line drive!\n Right here right now (Burn)\n Down a difficult road filled with endless struggles\n Where do you think you are going following someone else\'s map?\n An insightful crow comes along to tear up the map\n Now open your eyes and take a look at the truth (Yeah!)\n There\'s nothing to lose, so let\'s GO!!!\n We are Fighting Dreamers aiming high\n Fighting Dreamers don\'t care what people think about them\n Fighting Dreamers follow what they believe\n Oli Oli Oli Oh! Just go my way\n Right here right now (Bang) Hit it straight like a line drive!\n Right here right now (Burn) We\'re gonna do it and do our best!\n Right here right now (Bang) Hit it straight like a line drive!\n Right here right now (Burn) We\'re gonna do it and do our best! BANG!\n My bod

In [9]:
# Split the sample script into sentences with NLTK.
# Note: despite its singular name, `sentence` is a list of sentences at this point.
sentence = sent_tokenize(sample_script)

In [10]:
# Keep a 30-sentence excerpt (indices 60-89) so the model run below stays small.
sentences = sentence[60:90]

In [11]:
# Rebinds `sentence` from a list to one joined string.
# NOTE: the tokenized sentences keep their own closing punctuation, so joining with "."
# produces doubled marks such as ".." and "?." (visible in the output of the next cell).
sentence = ".".join(sentences)

In [12]:
sentence

'To think I’d already have my handicap spotted..You are such a disgraceful guy as always…\n Dang it..It kind of ticks me off when the Pervy Sage is mocked..Summoning…Jutsu!.The Nine-Tailed Fox kid from the Chunin Exam..I wonder if I should’ve killed him then, after all..For the sake of the Akatsuki organization,\n I acknowledged his achievement… and let him slide by..But the only ones who can break a Five-Pronged Spell\n are the Sannin members and the Third Hokage..Which means Jiraiya may have already taught it to Naruto..Perhaps..If he’s able to control the Nine-Tailed Fox’s power…\n Oh…\n Th-This is…\n It doesn’t quite look to be so..He isn’t endowed with Shinobi ability to begin with, so…\n Why?.He’s still got a long way to go, sure enough..Why?!.Hello..Hey, Gamatatsu..Why did you come out?.Oh, Brother Gamakichi..It’s the first time I’ve been Summoned..I-I wonder if I can do my best?.Idiot..Get some snacks or something and hide..What?.I can get snacks?.Goody-goody..Here I go..I’ll t

# IV. Run model

In [13]:
# Run the spaCy pipeline over the joined excerpt; the recognised entities are read from doc.ents below.
doc = spacy_model(sentence)

In [14]:
doc.ents

(the Pervy Sage,
 the Chunin Exam,
 Akatsuki,
 Five,
 Sannin,
 Third,
 Jiraiya,
 Naruto,
 Shinobi,
 Gamatatsu,
 Gamakichi,
 first,
 Jiraiya,
 Jiraiya,
 Sannin,
 Naruto)

In [15]:
# Show each entity's text next to its label, to see which labels mark characters
# (PERSON in the output below) and which are noise for this task (CARDINAL, ORDINAL, ...).
for entity in doc.ents:
    print(entity.text, entity.label_)

the Pervy Sage PERSON
the Chunin Exam EVENT
Akatsuki ORG
Five CARDINAL
Sannin NORP
Third ORDINAL
Jiraiya PERSON
Naruto PERSON
Shinobi NORP
Gamatatsu PERSON
Gamakichi PERSON
first ORDINAL
Jiraiya PERSON
Jiraiya PERSON
Sannin NORP
Naruto PERSON


In [16]:
def generate_ner_inference(script):
    """Collect the first names of PERSON entities, sentence by sentence.

    The script is split with ``sent_tokenize`` and each sentence is run
    through the module-level ``spacy_model``. Only entities labelled
    ``PERSON`` are kept, and each is reduced to its first word.

    Args:
        script: Text to analyse (the notebook passes one value of the
            ``script`` column at a time).

    Returns:
        A list with one ``set`` of first names per sentence, in sentence
        order. Sentences without a PERSON entity contribute an empty set.
    """
    output = []
    for sentence in sent_tokenize(script):
        doc = spacy_model(sentence)
        names = set()
        for entity in doc.ents:
            if entity.label_ != "PERSON":
                continue
            # split() without an argument splits on any run of whitespace, so a
            # leading space or a subtitle line break inside the entity text can
            # neither produce an empty name nor leave "\n" inside the first name.
            words = entity.text.split()
            if words:
                names.add(words[0])
        output.append(names)
    return output

# Example: after extracting each entity's text and label, keep only the entities labelled PERSON to get the character names.
# E.g. {Kakashi, Jiraiya, ...} -> a set, so every name appears only once per sentence
# -> Then reduce each name to its first word
# E.g. {Kakashi, Jiraiya, ...}
# -> Next, append this set to the output list (one set per sentence)
# -> [{Kakashi, Jiraiya, ...}, ...]

NOTE: Remember to handle character aliases, for example "Pervy Sage" = Jiraiya.

In [17]:
# Runs the spaCy model on every sentence of every script (see generate_ner_inference),
# so this is the expensive cell of the notebook.
df['ners'] = df['script'].apply(generate_ner_inference)

In [18]:
df

Unnamed: 0,episode,script,ners
0,94,We are Fighting Dreamers aiming high\n Fightin...,"[{Oli}, {}, {}, {}, {}, {Oli}, {}, {Burn}, {},..."
1,80,We are Fighting Dreamers aiming high\n Fightin...,"[{Oli}, {}, {}, {}, {}, {Oli}, {}, {Burn}, {},..."
2,32,"Press down hard on the gas\n That’s right, the...","[{}, {}, {}, {}, {}, {}, {Lee}, {}, {}, {}, {}..."
3,185,"Rock away your existence,\n Shouting that you ...","[{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {..."
4,191,"Rock away your existence,\n Shouting that you ...","[{}, {}, {}, {}, {}, {}, {}, {}, {}, {Hinata},..."
...,...,...,...
213,154,I want to try and gather the unrestrained wind...,"[{}, {}, {}, {The}, {}, {}, {}, {}, {}, {my}, ..."
214,188,"Rock away your existence,\n Shouting that you ...","[{}, {}, {}, {}, {}, {}, {}, {Choji}, {}, {}, ..."
215,99,We are Fighting Dreamers aiming high\n Fightin...,"[{Oli}, {}, {}, {}, {}, {Oli}, {}, {Burn}, {},..."
216,103,We are Fighting Dreamers aiming high\n Fightin...,"[{Oli}, {}, {}, {}, {}, {Oli}, {}, {Burn}, {},..."


In [19]:
def generate_character_network(df, window_size=10):
    """Count how often pairs of names occur near each other.

    For every episode, a sliding window holds the names found in the most
    recent ``window_size`` sentences (the current one included, empty
    sentences included). Each name in the current sentence is paired with
    every different name in that window, and the pairs are counted over all
    episodes.

    Note on the counts: two names in the same sentence are recorded twice
    (once from each side), and a name that occurs in several sentences of the
    window is paired once per occurrence.

    Args:
        df: DataFrame with a ``ners`` column; each value is an iterable of
            per-sentence name collections, as produced by
            ``generate_ner_inference``.
        window_size: Number of most recent sentences to pair against.
            Defaults to 10.

    Returns:
        A new DataFrame with columns ``source``, ``target`` and ``value``
        (the pair count), sorted by ``value`` in descending order. Within a
        row, ``source`` sorts before ``target``. The input frame is not
        modified.

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError("window_size must be at least 1")

    pairs = []
    for episode in df['ners']:
        # Names of the most recent sentences, oldest first; reset per episode
        # so names never pair across episode boundaries.
        window = []
        for sentence in episode:
            window.append(list(sentence))
            if len(window) > window_size:
                window.pop(0)
            nearby = [name for names in window for name in names]

            for entity1 in sentence:
                for entity2 in nearby:
                    if entity1 != entity2:
                        # Sorting makes (A, B) and (B, A) the same edge.
                        pairs.append(tuple(sorted((entity1, entity2))))

    if not pairs:
        return pd.DataFrame(columns=['source', 'target', 'value'])

    edges = pd.DataFrame(pairs, columns=['source', 'target'])
    edges = edges.groupby(['source', 'target']).size().reset_index(name='value')
    return edges.sort_values('value', ascending=False)

In [20]:
# NOTE: this rebinds `df` from the subtitles frame to the edge list, so the cell is not
# idempotent: running it a second time passes a frame without a `ners` column.
df = generate_character_network(df)

In [21]:
df

Unnamed: 0,source,target,value
2895,Naruto,Sasuke,891
3342,Sakura,Sasuke,527
2888,Naruto,Sakura,462
1517,Hinata,Naruto,345
3147,Orochimaru,Sasuke,280
...,...,...,...
1848,Ino,Shadow,1
683,Chishima,Tenten,1
2670,Lee,Tsunada,1
1849,Ino,Sharingan,1


In [22]:
# Keep only the 200 heaviest edges: order by pair count, then truncate.
df = df.sort_values('value', ascending=False).head(200)

In [23]:
# Build an undirected graph from the edge list; each edge carries its pair count as `value`.
G = nx.from_pandas_edgelist(
    df,
    source='source',
    target='target',
    edge_attr='value',
    create_using=nx.Graph()
)
# Store every node's degree (number of distinct neighbours) as its `size` attribute.
# Presumably pyvis uses `size` to scale the drawn nodes — confirm against the pyvis docs.
node_degree = dict(G.degree)
nx.set_node_attributes(G,node_degree,'size')
# Render the graph as an interactive page with pyvis; the result is written to naruto.html.
net = Network(notebook=True, width='1000px', height='700px', bgcolor="#222222", font_color="white", cdn_resources="remote")
net.from_nx(G) 
net.show("naruto.html")

naruto.html
