In [1]:
from glob import glob
subtitles_paths = sorted(glob("../data/subtitles/*.ass"))

In [2]:
subtitles_paths[:5]

['../data/subtitles\\Naruto Season 1 - 01.ass',
 '../data/subtitles\\Naruto Season 1 - 02.ass',
 '../data/subtitles\\Naruto Season 1 - 03.ass',
 '../data/subtitles\\Naruto Season 1 - 04.ass',
 '../data/subtitles\\Naruto Season 1 - 05.ass']

In [3]:
scripts = []
episode_num = []
for path in subtitles_paths:
    
    with open(path, "r", encoding="utf-8") as f:
        lines = f.readlines()
        lines = lines[27:]
        
        rows = [",".join(line.split(',')[9:]) for line in lines]
    
    rows = [line.replace("\\N", "") for line in rows]
    script = " ".join(rows)
    
    episode = int(path.split('-')[1].split('.')[0].strip())
    
    scripts.append(script)
    episode_num.append(episode)

In [4]:
import pandas as pd
df = pd.DataFrame.from_dict({'episode': episode_num, 'script': scripts})

In [5]:
df.head()

Unnamed: 0,episode,script
0,1,"A long time ago, a powerful demon foxappeared ..."
1,2,"C'mon!\n Running like a fugitive,\n Being chas..."
2,3,"C'mon!\n Running like a fugitive,\n Being chas..."
3,4,"C'mon!\n Running like a fugitive,\n Being chas..."
4,5,"C'mon!\n Running like a fugitive,\n Being chas..."


In [6]:
print(df.columns)


Index(['episode', 'script'], dtype='object')


In [7]:
import spacy
spacy.require_gpu()
nlp = spacy.load('en_core_web_trf')

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  model.load_state_dict(torch.load(filelike, map_location=device))


In [8]:
import nltk
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\gabri\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [9]:
doc = nlp("Mark went to Germany")

for ent in doc.ents:
    print(ent.text, ent.label_)

Mark PERSON
Germany GPE


In [10]:
from nltk import sent_tokenize

def get_ners(script):
    # Debug: cek input
    if not isinstance(script, str):
        print("Error: script bukan string.")
        return []
    
    # Debug: Cek apakah script kosong
    if not script.strip():
        print("Warning: script kosong.")
        return []
    
    # Coba tokenisasi kalimat
    try:
        script_sentences = sent_tokenize(script)
        print(f"Jumlah kalimat: {len(script_sentences)}")
    except Exception as e:
        print(f"Error dalam sent_tokenize: {e}")
        return []
    
    ner_output = []
    
    for i, sentence in enumerate(script_sentences):
        print(f"Memproses kalimat ke-{i+1}: {sentence}")
        
        try:
            # Pastikan 'nlp' telah diinisialisasi
            doc = nlp(sentence)
            ners = set()
            
            for ent in doc.ents:
                if ent.label_ == 'PERSON':
                    full_name = ent.text
                    first_name = full_name.split(' ')[0]
                    ners.add(first_name)
            
            ner_output.append(list(ners))
            print(f"Nama-nama yang ditemukan di kalimat ke-{i+1}: {list(ners)}")
        
        except Exception as e:
            print(f"Error saat memproses NER untuk kalimat ke-{i+1}: {e}")
            ner_output.append([])  # Tambahkan daftar kosong jika terjadi error pada kalimat ini
    
    return ner_output


In [16]:
df['ners'] = df['script'].apply(get_ners)

Jumlah kalimat: 279
Memproses kalimat ke-1: A long time ago, a powerful demon foxappeared with nine tails.
Nama-nama yang ditemukan di kalimat ke-1: []
Memproses kalimat ke-2: With its powerful tails,
 it could smash mountainsand create tidal waves.
Nama-nama yang ditemukan di kalimat ke-2: []
Memproses kalimat ke-3: A band of Ninjas rose todefend their village from attack.
Nama-nama yang ditemukan di kalimat ke-3: []
Memproses kalimat ke-4: We have to wait untilthe Fourth Hokage gets here!
Nama-nama yang ditemukan di kalimat ke-4: []
Memproses kalimat ke-5: We can't let it get any closerto our village!
Nama-nama yang ditemukan di kalimat ke-5: []
Memproses kalimat ke-6: One great Ninja was able toimprison the monster,
 but died in the process.
Nama-nama yang ditemukan di kalimat ke-6: []
Memproses kalimat ke-7: This Ninja was known as…the Fourth Hokage.
Nama-nama yang ditemukan di kalimat ke-7: ['Ninja']
Memproses kalimat ke-8: Naruto!
Nama-nama yang ditemukan di kalimat ke-8: ['Narut

In [17]:
print(df.columns)

Index(['episode', 'script', 'ners'], dtype='object')


In [18]:
window = 10
entity_relationship = []

for row in df['ners']:
    previous_entities_in_window = []
    
    for sentence in row:
        previous_entities_in_window.append(sentence)
        previous_entities_in_window = previous_entities_in_window[-10:]
        
        previous_entities_flattend = sum(previous_entities_in_window, [])
        
        for entity in sentence:
            for entity_in_window in previous_entities_flattend:
                if entity != entity_in_window:
                    entity_rel = sorted([entity, entity_in_window])
                    entity_relationship.append(entity_rel)

In [19]:
relationship_df = pd.DataFrame({'value':entity_relationship})


In [20]:
relationship_df['source'] = relationship_df['value'].apply(lambda x: x[0])
relationship_df['target'] = relationship_df['value'].apply(lambda x: x[1])

In [21]:
relationship_df = relationship_df.groupby(['source','target']).count().reset_index()

In [22]:
relationship_df = relationship_df.sort_values('value',ascending=False)
relationship_df.head()

Unnamed: 0,source,target,value
3691,Naruto,Sasuke,803
4477,Sakura,Sasuke,492
3682,Naruto,Sakura,447
1848,Hinata,Naruto,326
153,Akamaru,Kiba,237


In [23]:
# Menyimpan ke dalam format CSV
relationship_df.to_csv("entity_relationships.csv", index=False)