In [1]:
from glob import glob

# Gather the subtitle files of both supported formats from the data folder.
ass_paths = glob("../data/Subtitles/*.ass")
srt_paths = glob("../data/Subtitles/*.srt")

# Merge both listings and order them lexicographically by path.
subtitles_paths = [*ass_paths, *srt_paths]
subtitles_paths.sort()



In [None]:
# Display the first five collected subtitle paths.
subtitles_paths[:5]

In [None]:
import os
import re

# Parallel lists: scripts[i] is the dialogue text of episode episode_num[i].
scripts = []
episode_num = []

for path in subtitles_paths:
    # "utf-8-sig" decodes plain UTF-8 as usual but also drops a leading BOM,
    # which would otherwise keep the first .srt counter line from matching ^\d+$.
    with open(path, "r", encoding="utf-8-sig", errors="ignore") as file:
        lines = file.readlines()

    # Compare the extension case-insensitively so that a file with e.g. an
    # upper-case extension can never fall through and reuse a stale `script`.
    extension = os.path.splitext(path)[1].lower()

    if extension == ".ass":
        # Keep only the event lines rather than assuming a fixed-length header;
        # the header size is not guaranteed to be the same in every .ass file.
        rows = []
        for line in lines:
            if not line.startswith("Dialogue:"):
                continue
            # The spoken text is everything after the ninth comma. The text
            # itself may contain commas, so split at most nine times.
            fields = line.rstrip("\r\n").split(",", 9)
            if len(fields) == 10:
                # "\N" is the in-line line-break marker; flatten it to a space.
                rows.append(fields[9].replace("\\N", " "))
        script = " ".join(rows)

    elif extension == ".srt":
        # Basic .srt parsing: drop blank lines, subtitle counters and timestamps.
        stripped = [line.strip() for line in lines if line.strip()]
        rows = [
            line
            for line in stripped
            if not re.match(r"^\d+$", line)
            and not re.match(r"^\d{2}:\d{2}:\d{2},\d{3}", line)
        ]
        script = " ".join(rows)

    else:
        print(f"Skipping unsupported subtitle format: {path}")
        continue

    # The episode number is taken to be the first run of digits in the file
    # name. os.path.basename handles both "/" and the platform's own separator.
    filename = os.path.basename(path)
    match = re.search(r"\d+", filename)
    if match:
        episode = int(match.group())
        scripts.append(script)
        episode_num.append(episode)
    else:
        print(f"⚠️ Couldn't find episode number in filename: {filename}")

In [None]:
# Install pandas into the environment of the running kernel.
%pip install pandas

In [None]:
import pandas as pd

# One row per parsed subtitle file: its episode number and its full script text.
df = pd.DataFrame({"episode": episode_num, "script": scripts})

In [None]:
# Display the first rows of the episode/script table.
df.head()

In [None]:
# Install spaCy into the environment of the running kernel.
%pip install spacy

In [None]:
!python -m spacy download en_core_web_sm


In [None]:
import spacy

# Load the "en_core_web_trf" pipeline. That model package must already be
# installed in the kernel's environment for this call to succeed.
nlp = spacy.load("en_core_web_trf")

In [None]:
# Run the pipeline on a short sample sentence.
doc = nlp("Mark went to Germany")

In [None]:
# Print every entity found in the sample document next to its label.
for entity in doc.ents:
    print(entity.text, entity.label_)

In [None]:
# Install NLTK into the environment of the running kernel.
%pip install nltk

In [None]:
from nltk import sent_tokenize


def get_ners(script):
    """Extract the person names mentioned in each sentence of a script.

    Args:
        script: Text of one script as a single string.

    Returns:
        A list with one entry per sentence, in sentence order. Each entry is
        a sorted list of the unique first names of the PERSON entities found
        in that sentence (empty when the sentence mentions nobody).
    """
    # NOTE(review): sent_tokenize presumably needs NLTK's sentence tokenizer
    # data to be downloaded beforehand - confirm for the installed version.
    script_sentences = sent_tokenize(script)

    ner_output = []

    for sentence in script_sentences:
        doc = nlp(sentence)
        ners = set()
        for ent in doc.ents:
            if ent.label_ == "PERSON":
                # Keep only the first word so a full name and the bare first
                # name are counted as the same character.
                first_name = ent.text.split(" ")[0]
                ners.add(first_name)
        # Sorted so the result does not depend on set iteration order.
        ner_output.append(sorted(ners))
    return ner_output

In [None]:
# Add a "ners" column holding the per-sentence name lists returned by
# get_ners for every script.
df["ners"] = df["script"].apply(get_ners)

In [None]:
# Number of most recent sentences (including the current one) in which two
# names must both appear to be recorded as related.
window = 10
# Every recorded co-occurrence as an alphabetically sorted [name_a, name_b] pair.
entity_relationship = []

for row in df["ners"]:
    # Name lists of the most recent sentences of this script.
    previous_entities_in_window = []

    for sentence in row:
        previous_entities_in_window.append(sentence)
        # Keep only the last `window` sentences.
        previous_entities_in_window = previous_entities_in_window[-window:]

        # Flatten the per-sentence lists into a single list of names.
        previous_entities_flattened = sum(previous_entities_in_window, [])

        for entity in sentence:
            for entity_in_window in previous_entities_flattened:
                if entity != entity_in_window:
                    # Sort the pair so (a, b) and (b, a) are the same edge.
                    entity_rel = sorted([entity, entity_in_window])
                    entity_relationship.append(entity_rel)

In [None]:
# One row per recorded pair; the "value" column holds the [name_a, name_b] list.
relationship_df = pd.DataFrame({"value": entity_relationship})

In [None]:
# Split each pair stored in "value" into separate "source" and "target" columns.
relationship_df = relationship_df.assign(
    source=relationship_df["value"].apply(lambda pair: pair[0]),
    target=relationship_df["value"].apply(lambda pair: pair[1]),
)

In [None]:
# Collapse duplicate pairs: after count(), "value" holds how many times each
# (source, target) pair was recorded.
relationship_df = relationship_df.groupby(["source", "target"]).count().reset_index()

In [None]:
# Order the pairs from most to least frequent and display the top rows.
relationship_df = relationship_df.sort_values("value", ascending=False)
relationship_df.head()

In [None]:
# Keep only the first 200 rows of the pair table.
relationship_df = relationship_df.head(200)

In [None]:
import networkx as nx

# Build an undirected graph with one edge per row of relationship_df; the
# "value" column is attached to each edge as an attribute.
G = nx.from_pandas_edgelist(
    relationship_df, "source", "target", edge_attr="value", create_using=nx.Graph()
)

In [None]:
import matplotlib.pyplot as plt

# Static drawing of the network on a 10x10 figure.
plt.figure(figsize=(10, 10))
# Node positions come from the Kamada-Kawai layout.
pos = nx.kamada_kawai_layout(G)
# NOTE(review): no edge_color is passed, so edge_cmap probably has no visible
# effect here - confirm against the networkx drawing documentation.
nx.draw(G, with_labels=True, node_color="skyblue", edge_cmap=plt.cm.Blues, pos=pos)
plt.show()

In [None]:
from pyvis.network import Network

# Store every node's degree under the "size" node attribute before the graph
# is handed over to pyvis.
node_degree = dict(G.degree)
nx.set_node_attributes(G, node_degree, "size")

# Interactive, notebook-embedded view with a dark theme.
net = Network(
    notebook=True,
    width="1000px",
    height="700px",
    bgcolor="#222222",
    font_color="white",
)
net.from_nx(G)

# Write the interactive graph to an HTML file and show it.
net.show("naruto.html")