In [1]:
from transformers import pipeline
from nltk import sent_tokenize # tokenize sequence: Separate text -> multiple sentences
import nltk
import torch
from glob import glob #get all files path we want
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# sent_tokenize relies on NLTK's pre-trained 'punkt_tab' sentence tokenizer data.
# Look for it locally first and only go to the network when it is missing, so
# re-running the notebook offline (or behind a proxy that breaks SSL) does not
# trigger a failing download for data that is already installed.
try:
    nltk.data.find('tokenizers/punkt_tab')
except LookupError:
    nltk.download('punkt_tab')

[nltk_data] Error loading punkt_tab: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1028)>


False

# I. Loading Model

In [3]:
# Checkpoint name handed to the zero-shot classification pipeline.
model_name = 'facebook/bart-large-mnli'

# Use device index 0 when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 0
else:
    device = 'cpu'

In [4]:
def load_model(device):
    """Create the zero-shot classification pipeline used for theme scoring.

    Args:
        device: Device argument forwarded unchanged to ``pipeline``.

    Returns:
        The object returned by ``pipeline`` for the
        'zero-shot-classification' task, built from the notebook-level
        ``model_name`` checkpoint.
    """
    pipeline_kwargs = {'model': model_name, 'device': device}
    return pipeline('zero-shot-classification', **pipeline_kwargs)

In [None]:
theme_classifier = load_model(device)

In [None]:
# Candidate labels that every passage is scored against.
theme_list = [
    'friendship',
    'hope',
    'sacrifice',
    'battle',
    'self development',
    'betrayal',
    'love',
    'dialogue',
]

In [None]:
# Smoke test: score one hand-written sentence against every theme in theme_list.
# NOTE(review): 'job' is presumably a typo for 'jab'; it is left untouched here
# so the recorded output below still corresponds to this exact input.
theme_classifier(
    'I gave him a right hook then a left job',
    theme_list,
    multi_label=True
)

{'sequence': 'I gave him a right hook then a left job',
 'labels': ['battle',
  'self development',
  'sacrifice',
  'betrayal',
  'hope',
  'dialogue',
  'friendship',
  'love'],
 'scores': [0.8370532393455505,
  0.6021759510040283,
  0.11033692210912704,
  0.045420192182064056,
  0.038481198251247406,
  0.03672140836715698,
  0.005136602092534304,
  0.0022709285840392113]}

# II. Loading Dataset

In [None]:
# Collect the paths of all .ass subtitle files in the Subtitles folder.
# NOTE(review): `files` is not referenced by any later cell visible here.
files = glob('../data/Subtitles/*.ass')

In [None]:
def load_subtitles_data(dataset_path, header_lines=27):
    """Load every ``.ass`` subtitle file in a folder into one DataFrame.

    For each file the first ``header_lines`` lines are dropped, the text after
    the ninth comma of every remaining line is kept, ``\\N`` markers are
    replaced by spaces, and the lines are joined with single spaces into one
    script string. The episode number is parsed from the file name.

    Args:
        dataset_path: Folder containing the ``.ass`` files.
        header_lines: Number of leading lines to skip in each file. Defaults
            to 27, the value that used to be hard-coded.

    Returns:
        pandas.DataFrame with the columns ``episode`` (int) and ``script``
        (str), one row per file, ordered by ascending episode number.

    Raises:
        ValueError: If a file name does not end in ``-<integer>.ass``.
    """
    subtitle_paths = glob(dataset_path + '/*.ass')

    records = []
    for path in subtitle_paths:
        with open(path, 'r', encoding='utf-8') as file:
            lines = file.readlines()[header_lines:]

        # Drop the first nine comma-separated fields; re-join the remainder so
        # commas that belong to the spoken text itself are preserved.
        lines = [','.join(line.split(',')[9:]) for line in lines]
        # '\N' is presumably the subtitle format's in-line break marker;
        # flatten it to a space so the script reads as running text.
        lines = [line.replace('\\N', ' ') for line in lines]
        script = ' '.join(lines)

        # The episode number is whatever sits between the last '-' in the path
        # and the following '.', e.g. "... - 12.ass" -> 12.
        episode = int(path.split('-')[-1].split('.')[0].strip())

        records.append((episode, script))

    # glob yields paths in arbitrary, platform-dependent order; sort by episode
    # so the resulting frame is identical from run to run.
    records.sort(key=lambda record: record[0])

    return pd.DataFrame.from_dict({
        'episode': [episode for episode, _ in records],
        'script': [script for _, script in records],
    })
     

In [None]:
# Relative path of the folder that holds the .ass subtitle files.
dataset_path = '../data/Subtitles/'

# One row per subtitle file: the parsed episode number and the full script text.
df = load_subtitles_data(dataset_path)

In [None]:
df.head(20)

Unnamed: 0,episode,script
0,1,"A long time ago, a powerful demon fox appeared..."
1,2,"C'mon!\n Running like a fugitive,\n Being chas..."
2,3,"C'mon!\n Running like a fugitive,\n Being chas..."
3,4,"C'mon!\n Running like a fugitive,\n Being chas..."
4,5,"C'mon!\n Running like a fugitive,\n Being chas..."
5,6,"C'mon!\n Running like a fugitive,\n Being chas..."
6,7,"C'mon!\n Running like a fugitive,\n Being chas..."
7,8,"C'mon!\n Running like a fugitive,\n Being chas..."
8,9,"C'mon!\n Running like a fugitive,\n Being chas..."
9,12,"C'mon!\n Running like a fugitive,\n Being chas..."


# III. Run Model


In [None]:
# Extract the script text of episode 1. Calling str() on the filtered Series
# would return the Series' printed representation (index label plus the
# Name/dtype footer, possibly with shortened text) instead of the script
# itself, so take the underlying cell value.
script = df[df['episode'] == 1]['script'].values[0]

In [None]:
# Take the script of episode 1 as a plain string, then split it into sentences.
script = df[df['episode'] == 1]['script'].values[0]
script_sentences = sent_tokenize(script)

In [None]:
# Group the sentences into consecutive chunks of `sentence_batch_size` and glue
# each chunk back together into a single space-separated string.
sentence_batch_size = 20
script_batches = [
    ' '.join(script_sentences[start:start + sentence_batch_size])
    for start in range(0, len(script_sentences), sentence_batch_size)
]

In [None]:
# Classify only the first two batches (slice [:2]) against every theme.
theme_output = theme_classifier(
    script_batches[:2],
    theme_list,
    multi_label=True
)

In [None]:
theme_output

[{'sequence': "A long time ago, a powerful demon fox appeared with nine tails. With its powerful tails,\n it could smash mountains and create tidal waves. A band of Ninjas rose to defend their village from attack. We have to wait until the Fourth Hokage gets here! We can't let it get any closer to our village! One great Ninja was able to imprison the monster,\n but died in the process. This Ninja was known as… the Fourth Hokage. Naruto! Why did you do such a thing?! You're really gonna get it this time! I don't care! You know your problem? You can't do the things I do! Only I can do this! I'm better than all of you! Believe it! There's a problem, sir! Lord Hokage! What is it? Did that Naruto do something again?",
  'labels': ['dialogue',
   'betrayal',
   'battle',
   'sacrifice',
   'self development',
   'hope',
   'friendship',
   'love'],
  'scores': [0.9800742268562317,
   0.9396904110908508,
   0.8546884059906006,
   0.7349812984466553,
   0.7284976243972778,
   0.199097707867622

In [None]:
# Regroup the per-batch results into {theme label: [score from each batch]}.
themes = {}
for output in theme_output:
    for label, score in zip(output['labels'], output['scores']):
        themes.setdefault(label, []).append(score)

In [None]:
themes

{'dialogue': [0.9800742268562317, 0.9370126724243164],
 'betrayal': [0.9396904110908508, 0.6457259654998779],
 'battle': [0.8546884059906006, 0.6581317186355591],
 'sacrifice': [0.7349812984466553, 0.6258842349052429],
 'self development': [0.7284976243972778, 0.8678209185600281],
 'hope': [0.19909770786762238, 0.20423933863639832],
 'friendship': [0.059223175048828125, 0.086033396422863],
 'love': [0.040261924266815186, 0.02802056074142456]}

In [None]:
def get_themes_inference(script, sentence_batch_size=20, max_batches=2):
    """Score a script against every theme in ``theme_list``.

    The script is split into sentences, the sentences are grouped into
    space-joined batches of ``sentence_batch_size``, the leading batches are
    classified with ``theme_classifier`` (``multi_label=True``), and the
    scores each theme received are averaged across those batches.

    Args:
        script: Full script text of one episode.
        sentence_batch_size: Number of sentences joined into one classifier
            input. Defaults to 20, the value that used to be hard-coded.
        max_batches: How many leading batches to classify. Defaults to 2,
            matching the slice that used to be hard-coded; pass ``None`` to
            classify every batch of the script.

    Returns:
        dict mapping each theme label to its mean score over the classified
        batches. An empty dict is returned when there is nothing to classify
        (for example an empty script).
    """
    script_sentences = sent_tokenize(script)

    # Batch sentences: consecutive, non-overlapping chunks joined by spaces.
    script_batches = [
        ' '.join(script_sentences[start:start + sentence_batch_size])
        for start in range(0, len(script_sentences), sentence_batch_size)
    ]
    # A slice bound of None keeps every batch.
    script_batches = script_batches[:max_batches]

    # Nothing to score: skip the classifier call instead of handing it an
    # empty list of sequences.
    if not script_batches:
        return {}

    # Run model
    theme_output = theme_classifier(
        script_batches,
        theme_list,
        multi_label=True
    )

    # Wrangle output: collect the score of every theme from every batch.
    themes = {}
    for output in theme_output:
        for label, score in zip(output['labels'], output['scores']):
            themes.setdefault(label, []).append(score)

    # Collapse each theme's per-batch scores into their mean.
    return {label: np.mean(np.array(scores)) for label, scores in themes.items()}

In [None]:
# Preview the first two rows of the subtitles frame.
# NOTE(review): the output recorded below this cell is a theme -> mean score
# dict rather than a DataFrame preview, so it looks stale; re-run to refresh.
df.head(2)

{'dialogue': 0.958543449640274,
 'betrayal': 0.7927081882953644,
 'battle': 0.7564100623130798,
 'sacrifice': 0.6804327666759491,
 'self development': 0.798159271478653,
 'hope': 0.20166852325201035,
 'friendship': 0.07262828573584557,
 'love': 0.03414124250411987}