# Introduction
This notebook contains the following components:


1.   Code to extract data from the Critical Role wiki about Recaps, Summaries and Timeline events from campaigns 1 and 2.
2.   Pre-processing functions to manipulate the data extracted in point 1, consisting of making the main group explicit + coreference resolution
3.   Events extraction using VerbAtlas




# Environment

In [None]:
# -------------  general 
import os
import sys
import subprocess
import numpy                                    as np
import pandas                                   as pd
from numpy.linalg       import norm
from google.colab       import files
from google.colab       import drive
from tqdm.notebook      import tqdm
import re 
import json
import torch                                     as T
import math
import copy
import time
import csv
import pickle
from collections        import defaultdict

# ------------- GT extraction

# handle HTML tree
import bs4
from bs4                import BeautifulSoup
from bs4 import Tag

# handle http request/response
import requests
from urllib.parse       import unquote

#-------------- Spacy (restart runtime for correct installation)

from spacy.cli.download import download         as spacy_download
PIPELINES_TYPE = ["en_core_web_sm","en_core_web_md","en_core_web_lg","en_core_web_trf"]
SIZE_IDX = 3
spacy_download(PIPELINES_TYPE[SIZE_IDX])    # the default used 
subprocess.run(['pip','install', 'spacy-transformers'])
import spacy_transformers
import spacy
spacy.prefer_gpu()
from spacy.matcher      import Matcher

# ------------- Coreference Resolution

import torchtext                                 as TT
subprocess.run(['pip', 'install', 'fastcoref'])
from fastcoref          import spacy_component                # custom spacy plug-in
import difflib                                   as dl

# ------------- Unification

subprocess.run(['pip', 'install', 'fuzzywuzzy'])
from fuzzywuzzy         import fuzz

import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('stopwords')
nltk.download('punkt')
from nltk                 import word_tokenize, pos_tag, ne_chunk
from nltk.corpus          import stopwords
from nltk.tree            import Tree
from nltk.tokenize        import WhitespaceTokenizer
from nltk.tokenize        import sent_tokenize
stopWords = set(stopwords.words('english'))

# pytorch device
device = T.device("cuda:0" if T.cuda.is_available() else "cpu")

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_trf')


## Mount drive

In [None]:
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


## Utilities & Paths 

In [None]:
"""
              choose here the arc & campaing number for GT
              count start from zero (covered first & second arc in the data folder of the project)
"""
# Zero-based selectors: arc_n indexes eps_arc1/eps_arc2, campaign_n is 0 (campaign one) or 1 (campaign two).
arc_n = 0; campaign_n  = 0


"""
              Paths definition
"""

# Google Drive locations (the drive is mounted at /content/drive)
PATH_PROJECT_FOLDER = "/content/drive/MyDrive/Colab Notebooks/NUANS/project_bisazza_casadei/"
PATH_DATA_FOLDER = PATH_PROJECT_FOLDER + "data/"

# Campaign-dependent folders: campaign two uses the "2"/"_C2" suffixed directories.
if campaign_n == 0:
  PATH_GT = PATH_DATA_FOLDER + "GT/"
  # Leo's links
  PATH_RECAP = PATH_DATA_FOLDER + "recap/"
  PATH_SUMMARIES = PATH_DATA_FOLDER + "summaries/"
  PATH_TIMELINE = PATH_DATA_FOLDER + "timeline/"

elif campaign_n == 1:
  PATH_GT = PATH_DATA_FOLDER + "GT2/"
  # Leo's links
  PATH_RECAP = PATH_DATA_FOLDER + "recap_C2/"
  PATH_SUMMARIES = PATH_DATA_FOLDER + "summaries_C2/"
  PATH_TIMELINE = PATH_DATA_FOLDER + "timeline_C2/"
else:
  # Only the two campaigns above are supported.
  raise ValueError('Wrong selection for the campaign !')

# Ground-truth sub-folders (characters, events, roles) and the pre-processing folder.
PATH_GT_C = PATH_GT + "characters/"
PATH_GT_E = PATH_GT + "events/"
PATH_GT_R = PATH_GT + "roles/"
PATH_PPR = PATH_DATA_FOLDER + "ppr/"

# character extraction: https paths definition (all campaigns, all arcs)
HTTPS_LINK_EPISODES = "https://criticalrole.fandom.com/wiki/List_of_episodes"

# timeline's event extraction wiki link
# HTTPS_LINK_TIMELINE = "https://criticalrole.fandom.com/wiki/Time_Line"
HTTPS_LINK_TIMELINE = "https://web.archive.org/web/20230201215313/https://criticalrole.fandom.com/wiki/Timeline"  # Web Archive snapshot, used because the live timeline page has since been changed

# characters' roles extraction
LINK_CR1 = 'https://criticalrole.fandom.com/wiki/Main_Characters'  # all main characters, guest characters, most important NPCs for each campaign
LINK_CR2 = 'https://criticalrole.fandom.com/wiki/Category:Non-player_characters' # all NPCs
LINK_CR3 = 'https://criticalrole.fandom.com/wiki/Category:Allies' # all allies
LINK_CR4 = 'https://criticalrole.fandom.com/wiki/Category:Antagonists' # all antagonists

# (first_episode, last_episode) tuples, inclusive, for each arc of campaign one and campaign two
eps_arc1 = [(1,23),(24,38),(39,83),(84,99),(100,115)]
eps_arc2 = [(1,25),(26,47),(48,69),(70,91),(92,112),(113,141)]

# when True, load_soup reads the pre-fetched (pickled) HTML pages from the data folder
LOAD = True

# NOTE(review): LINK_LIST_EPISODES holds the same URL as HTTPS_LINK_EPISODES above.
LINK_LIST_EPISODES = "https://criticalrole.fandom.com/wiki/List_of_episodes"
LINK_ORIGIN_EPISODE = "https://criticalrole.fandom.com/wiki/The_Story_of_Vox_Machina"

# Root path used for drive access.
ROOT_PATH = "/content/drive/MyDrive/Colab Notebooks/NUANS/project_bisazza_casadei/"
# Same folder with the space backslash-escaped, for splicing into shell commands.
# Raw string: "\ " is not a valid escape sequence in a normal string literal.
ROOT_PATH_OS = r"/content/drive/MyDrive/Colab\ Notebooks/NUANS/project_bisazza_casadei/"

# Every path below exists in two flavours: the plain one (Python file APIs)
# and the *_OS one (shell-escaped, derived from ROOT_PATH_OS).
DATA_PATH = os.path.join(ROOT_PATH, "data/")
DATA_PATH_OS = os.path.join(ROOT_PATH_OS, "data/")

# --- recaps (campaign one, then campaign two "_C2")
RECAP_PATH = os.path.join(DATA_PATH, "recap/")
RECAP_PATH_OS = os.path.join(DATA_PATH_OS, "recap/")
RECAP_COREF_PATH = os.path.join(DATA_PATH, "recap/recap-coref-substitued/")
RECAP__COREF_PATH_OS = os.path.join(DATA_PATH_OS, "recap/recap-coref-substitued/")
RECAP_PATH_C2 = os.path.join(DATA_PATH, "recap_C2/")
RECAP_PATH_OS_C2 = os.path.join(DATA_PATH_OS, "recap_C2/")
RECAP_COREF_PATH_C2 = os.path.join(DATA_PATH, "recap_C2/recap-coref-substitued/")
RECAP__COREF_PATH_OS_C2 = os.path.join(DATA_PATH_OS, "recap_C2/recap-coref-substitued/")
ARC1_PATH = os.path.join(RECAP_PATH, "arc1.tsv")
ARC1_PATH_OS = os.path.join(RECAP_PATH_OS, "arc1.tsv")

# --- timeline
TL_PATH = os.path.join(DATA_PATH, "timeline/")
TL_PATH_OS = os.path.join(DATA_PATH_OS, "timeline/")
TL_COREF_PATH = os.path.join(DATA_PATH, "timeline/timeline-coref-substitued/")
TL_COREF_PATH_OS = os.path.join(DATA_PATH_OS, "timeline/timeline-coref-substitued/")
TL_PATH_C2 = os.path.join(DATA_PATH, "timeline_C2/")
TL_PATH_OS_C2 = os.path.join(DATA_PATH_OS, "timeline_C2/")
TL_COREF_PATH_C2 = os.path.join(DATA_PATH, "timeline_C2/timeline-coref-substitued/")
TL_COREF_PATH_OS_C2 = os.path.join(DATA_PATH_OS, "timeline_C2/timeline-coref-substitued/")

# --- extracted events
TL_events_PATH = os.path.join(DATA_PATH, "events/")
TL_events_PATH_OS = os.path.join(DATA_PATH_OS, "events/")
TL_events_PATH_C2 = os.path.join(DATA_PATH, "events_C2/")
TL_events_PATH_OS_C2 = os.path.join(DATA_PATH_OS, "events_C2/")

# --- summaries
SUM_PATH = os.path.join(DATA_PATH, "summaries/")
SUM_PATH_OS = os.path.join(DATA_PATH_OS, "summaries/")
SUM_COREF_PATH = os.path.join(DATA_PATH, "summaries/summaries-coref-substitued/")
SUM_COREF_PATH_OS = os.path.join(DATA_PATH_OS, "summaries/summaries-coref-substitued/")
SUM_PATH_C2 = os.path.join(DATA_PATH, "summaries_C2/")
SUM_PATH_OS_C2 = os.path.join(DATA_PATH_OS, "summaries_C2/")
SUM_COREF_PATH_C2 = os.path.join(DATA_PATH, "summaries_C2/summaries-coref-substitued/")
SUM_COREF_PATH_OS_C2 = os.path.join(DATA_PATH_OS, "summaries_C2/summaries-coref-substitued/")
SUM_ARC1_PATH = os.path.join(SUM_PATH, "sum_arc1.tsv")
SUM_ARC1_PATH_OS = os.path.join(SUM_PATH_OS, "sum_arc1.tsv")

# --- pre-processing output
PPR_PATH = os.path.join(DATA_PATH, "ppr/")
PPR_PATH_OS = os.path.join(DATA_PATH_OS, "ppr/")

In [None]:
# General functions definition part 1

def saveTSV(data, path, name, download = True):
  """Write `data` to `path + name + ".tsv"` as a tab-separated UTF-16 file.

  The rows are written without header and without index column.
  When `download` is true the written file is also passed to `files.download`.
  """
  target = path + name + ".tsv"
  frame = pd.DataFrame(data)
  frame.to_csv(
      path_or_buf=target,
      sep="\t",
      header=False,
      index=False,
      encoding='utf-16',
  )
  if download:
    files.download(target)

def readTSV(path, name, encoding=None):
  """Load the header-less TSV file `path + name + ".tsv"` as a list of rows.

  Args:
    path: folder prefix, concatenated as-is with `name`.
    name: file name without the ".tsv" extension.
    encoding: text encoding forwarded to pandas; None keeps pandas' default.
      Pass 'utf-16' to read files produced by saveTSV, which writes UTF-16.

  Returns:
    A list of lists, one inner list per row of the file.
  """
  df = pd.read_csv(path + name + '.tsv', sep='\t', header=None, encoding=encoding)
  return df.to_numpy().tolist()

def saveJSON(data, path, name, download = True):
  """Serialize `data` to `path + name + ".json"` with a 4-space indent.

  A progress line is printed before writing. When `download` is true the
  written file is also passed to `files.download`.
  """
  target = path + name + '.json'
  print(f"- saving JSON file: {name}.json, at: {path} ...")
  with open(target, 'w') as out_file:
    json.dump(data, out_file, indent=4)
  if download:
    files.download(target)
    
def loadJSON(path,name, need_ext = True, verbose = True):
  """Read a JSON file and return the decoded content.

  `name` gets the ".json" suffix appended unless `need_ext` is false, in
  which case it is used verbatim. With `verbose` a progress line is printed.
  """
  if verbose:
    print(f"- loading JSON file: {name}.json, at: {path} ...")
  full_name = name + '.json' if need_ext else name
  with open(path + full_name, 'r') as in_file:
    return json.load(in_file)

def get_soup(url, timeout=30):
  """Download `url` and return its HTML parsed as a BeautifulSoup object.

  The response text is first normalized by bs_preprocess and then parsed
  with the 'html.parser' backend.

  Args:
    url: address of the page to fetch.
    timeout: seconds to wait for the server before requests raises a
      timeout error; without it a stalled connection blocks forever.
  """
  # Send a GET request to the URL
  response = requests.get(url, timeout=timeout)

  html_text = bs_preprocess(response.text)
  # Parse the HTML of the response
  return BeautifulSoup(html_text, 'html.parser')

def bs_preprocess(html):
    """Strip layout whitespace from an HTML string before parsing.

    Leading/trailing whitespace of every line is removed, the remaining
    newlines become single spaces, and whitespace directly before a '<' or
    directly after a '>' is dropped.

    Args:
        html: raw HTML text.

    Returns:
        The compacted HTML text.
    """
    # Raw strings: "\s" is not a valid escape in a normal string literal.
    edges = re.compile(r'(^[\s]+)|([\s]+$)', re.MULTILINE)
    html = re.sub(edges, '', html)        # remove leading and trailing whitespaces
    html = re.sub(r'\n', ' ', html)       # newlines -> spaces, so adjacent words stay separated
    html = re.sub(r'[\s]+<', '<', html)   # remove whitespaces before opening tags
    html = re.sub(r'>[\s]+', '>', html)   # remove whitespaces after closing tags
    return html

def get_arc(episode, campaign = campaign_n):
  """Return the 1-based arc number that contains `episode`.

  Campaign 0 uses the ranges in eps_arc1, any other value those in eps_arc2.
  Range bounds are inclusive. None is returned when no range matches.
  """
  arc_ranges = eps_arc1 if campaign == 0 else eps_arc2
  for position, (first_ep, last_ep) in enumerate(arc_ranges):
    if first_ep <= episode <= last_ep:
      return position + 1
  return None

""" Create Folder at path """
def mkdir_cmd(path):
  # `path` is spliced into the shell command verbatim; the caller in this file
  # passes the *_OS path variants, whose spaces are backslash-escaped.
  cmd_mkdir_page_url = "mkdir "+ path
  try:
    # IPython/Colab shell escape: only valid inside a notebook, not in plain Python.
    !{cmd_mkdir_page_url}
  except:
    # NOTE(review): presumably a failing `mkdir` only prints its own error and
    # does not raise, so this branch may never run — confirm.
    print("failed to create data dir")


def print_table(table):
  """Print one aligned line per (key, value) pair of `table`, prefixed by its index."""
  for position, (key, value) in enumerate(table.items()):
    label = str(position) + ")"
    print(f"{label:<4} K: {str(key):<30} V: {str(value):<30}")

def print_list(list):
  """Print every element of `list` on its own line, prefixed by its index."""
  for position, item in enumerate(list):
    label = str(position) + ")"
    print(f"{label:<4} {str(item):<30}")



def create_directories():
  """Create the project's data directories on the mounted drive.

  Creates the data folder, its "summaries" and pre-processing folders, and
  the ground-truth folder of the selected campaign (PATH_GT) together with
  its characters / events / roles sub-folders.

  Every directory is created only if missing, so the function can be re-run
  safely and also repairs a ground-truth folder whose sub-folders are absent.
  """
  # The data folder itself is created implicitly as the parent of "summaries".
  os.makedirs(os.path.join(PATH_DATA_FOLDER, "summaries"), exist_ok=True)

  # directory for pre-processing
  os.makedirs(PATH_PPR, exist_ok=True)

  # PATH_GT already points at "GT/" or "GT2/" depending on campaign_n.
  for gt_dir in (PATH_GT, PATH_GT_C, PATH_GT_E, PATH_GT_R):
    os.makedirs(gt_dir, exist_ok=True)

In [None]:
create_directories()

In [None]:
# General functions definition part 2

def check_ids_in_jsons(json_files):
    """Report ids that occur more than once inside the same sentence.

    Args:
        json_files: iterable of (json_content, name) pairs. Each content is a
            dict with a 'sentences' list; every sentence has an 'elements'
            list whose items carry a list of ids under 'id'.

    Every duplicate is printed (not raised). The final "all unique" message
    is printed only when no duplicate was found.
    """
    found_duplicate = False

    for json_file, name in json_files:
        for sentence in json_file['sentences']:
            # ids only have to be unique within a single sentence
            seen_ids = set()
            for element in sentence['elements']:
                for token_id in element['id']:
                    if token_id in seen_ids:
                        found_duplicate = True
                        print(ValueError(f"Duplicate sentence id {token_id} found in JSON files {name}"))
                    seen_ids.add(token_id)

    if not found_duplicate:
        print("All sentence ids are unique")

def adjust_ids(json_files, path):
    """Renumber element ids sequentially inside every sentence and save each file.

    Args:
        json_files: iterable of (json_content, name) pairs, with the same
            structure check_ids_in_jsons expects.
        path: folder the adjusted files are written to (via saveJSON).

    Each element gets one id per whitespace-separated word of its 'token'
    (a single id when the token is missing or not a string). Numbering
    restarts from 0 for every sentence. The contents are modified in place.
    """
    for json_file, name in json_files:
        for sentence in json_file['sentences']:
            start_id = 0
            for element in sentence['elements']:
                try:
                    len_ids = len(element['token'].split())
                except (KeyError, AttributeError):
                    len_ids = 1

                element['id'] = list(range(start_id, start_id + len_ids))

                # advance by the number of ids just assigned, so the next
                # element continues the sequence without gaps or overlaps
                start_id += len_ids

        # saveJSON expects (data, path, name)
        saveJSON(json_file, path, name, download = False)

    print("IDs have been adjusted successfully.")


def write_recap_to_tsv(file, Recap, num):
  """Append one tab-separated row `[num, Recap]` to the file at `file`.

  The file is opened with newline='' as the csv module requires, so the
  writer alone controls line endings on every platform.
  """
  with open(file, "a", newline='') as tsv:
      tsv_writer = csv.writer(tsv, delimiter='\t')
      tsv_writer.writerow([num, Recap])

def get_ep_num(link):
    """
    Return the episode number attached to `link`, or None when there is none.

    The number is the text of the first following sibling named "small",
    with its first and last character removed.
    """
    for sibling in link.next_siblings:
        if sibling.name != "small":
            continue
        label = sibling.get_text()
        return label[1:-1]
    return None

def get_recap_url(link):
    """
    Given a link, returns the full wiki URL for its (site-relative) target.

    Returns None when the link has no 'href' attribute or when the href
    already starts with "https".
    """
    href = link.get('href')
    # anchors without href used to crash on the slice below
    if href is None or href.startswith("https"):
        return None
    return "https://criticalrole.fandom.com" + href

def get_recap_heading(soup):
    """
    Return the parent of the recap anchor span, or None if the page has none.

    The span with id 'Recap' is preferred; 'Previously_on_Critical_Role' is
    the fallback.
    """
    for span_id in ('Recap', 'Previously_on_Critical_Role'):
        span = soup.find('span', {'id': span_id})
        if span is not None:
            return span.parent
    return None

def get_part_1_id(soup, recap_heading):
    """
    Return the anchor id of the section listed right after the recap in the
    page's navigation list.

    Args:
        soup: parsed episode page.
        recap_heading: unused; kept for interface compatibility.

    Returns:
        The href of the navigation entry following the recap entry, without
        its leading '#'. An empty string when the recap entry has no
        following entry. None (after printing a notice) when the page has no
        recap span or no navigation link pointing to it.
    """
    navigation_recap = None
    navigation_recap_check = None
    for target in ('#Recap', '#Previously_on_Critical_Role'):
        if soup.find('span', {'id': target[1:]}) is not None:
            navigation_recap = soup.find('a', {'href': target})
            navigation_recap_check = target
            break

    if navigation_recap is None:
        print("No navigation_recap")
        return None

    navigation_part_1 = navigation_recap.parent
    next_nav = False
    part_1_id = ''
    while navigation_part_1:
        if navigation_part_1.name == 'li':
            anchor = navigation_part_1.a
            # a list item without a link cannot be a navigation entry
            href = anchor.get('href') if anchor is not None else None
            if href is not None:
                if next_nav:
                    part_1_id = href
                    next_nav = False
                if href == navigation_recap_check:
                    next_nav = True
        navigation_part_1 = navigation_part_1.next_sibling
    return part_1_id[1:]

def get_part_i_heading(soup, part_1_id):
    """
    Return the parent of the span whose id is `part_1_id`.
    """
    anchor_span = soup.find('span', {'id': part_1_id})
    return anchor_span.parent

def get_filtered_siblings(recap_heading, part_i_heading):
    """
    Return every sibling following `recap_heading`, leaving out any sibling
    equal to `part_i_heading`.
    """
    kept = []
    for node in recap_heading.next_siblings:
        if node != part_i_heading:
            kept.append(node)
    return kept

def get_p_elements(sibling, part_i_heading):
    """
    Walk the sibling chain from `sibling` and collect the children of every
    node named 'p', stopping at `part_i_heading` or at the end of the chain.
    """
    collected = []
    node = sibling
    while node:
        if node.name == 'p':
            for child in node:
                collected.append(child)
        elif node == part_i_heading:
            return collected
        node = node.next_sibling
    return collected

def load_links_soups(PATH_GT_R,PATH_GT_C,arc_n = (0,)):
  """Load the character lists and pre-fetch the wiki page of every character.

  Args:
    PATH_GT_R: folder containing 'all_NPCs.json'.
    PATH_GT_C: folder containing the 'gt_characters_arc<N>.json' files.
    arc_n: iterable of zero-based arc indexes to load.

  Returns:
    (roles_array_list, allNPCs_array): the first holds one list per arc of
    [character, soup] pairs, the second one [npc, soup] pair per NPC. The
    soup is the string 'empty' for characters without a link.

  Raises:
    ValueError: on any failure while loading a file or fetching a page; the
      original exception is attached as the cause.
  """
  roles_array_list = []
  for arc in arc_n:
    try:
      tmp = loadJSON(PATH_GT_C, 'gt_characters_arc' + str(arc + 1))
      # pair every character with its pre-loaded soup
      roles_array = []
      for elem in tqdm(tmp):
        for item in tmp[elem]:
          if item['link-u'] != 'empty':
            roles_array.append([item, get_soup(item['link-u'])])
          else:
            roles_array.append([item, 'empty'])
    except Exception as e:
      raise ValueError(e) from e
    roles_array_list.append(roles_array)

  try:
    tmp = loadJSON(PATH_GT_R, 'all_NPCs')
    # pair every NPC with its pre-loaded soup
    allNPCs_array = []
    for elem in tqdm(tmp):
      if elem['link'] != 'empty':
        allNPCs_array.append([elem, get_soup(elem['link'])])
      else:
        allNPCs_array.append([elem, 'empty'])
  except FileNotFoundError as e:
    raise ValueError("Complete list of NPC file not found") from e
  except Exception as e:
    # not a missing file (e.g. a failed page download): report the real cause
    raise ValueError(f"Failed to load the complete list of NPC: {e}") from e
  return roles_array_list, allNPCs_array

def _load_pickle_stream(path):
  """Return every object stored back-to-back in the pickle file at `path`."""
  items = []
  # NOTE: pickle can execute arbitrary code while loading; only use it on
  # files produced by this project.
  with open(path, 'rb') as f:
    while True:
      try:
        items.append(pickle.load(f))
      except EOFError:
        break
  return items


def load_soup(arc_n=0):
  """Load the pre-fetched character pages from the pickle files in the data folder.

  Args:
    arc_n: a zero-based arc index or an iterable of them. (The default used
      to be rejected because a bare int is not iterable.)

  Returns:
    (roles_array_list, allNPCs_array): one list of stored objects per
    requested arc, and the list of objects stored for all NPCs. Both are
    empty when LOAD is False.
  """
  arcs = [arc_n] if isinstance(arc_n, int) else arc_n
  roles_array_list = []
  allNPCs_array = []
  if not LOAD:
    return roles_array_list, allNPCs_array

  for arc in arcs:
    roles_array_list.append(
        _load_pickle_stream(DATA_PATH + 'recap/roles_array_' + str(arc + 1) + '.pkl'))

  allNPCs_array = _load_pickle_stream(DATA_PATH + 'recap/allNPCs_array.pkl')
  return roles_array_list, allNPCs_array

# Dataset extraction

## Shared functions

In [None]:
# Definition of shared functions for the data extraction

def clean_name(text):
  """Clean a character name scraped from the wiki.

  Removes parenthesized notes and bracketed reference markers such as
  "[1]", collapses repeated spaces, fixes the spacing before commas, turns
  a backslash-escaped apostrophe into a plain one and strips the result.
  """
  # Raw strings: "\(" and "\[" are not valid escapes in normal string literals.
  text = re.sub(r'\(.*?\)', '', text)  # remove parentheses and their content
  text = re.sub(r'\[\w*\]', '', text)  # remove numbered link trace
  text = re.sub(r' +', ' ', text)      # remove extra spaces
  text = text.replace(" , ", ", ")     # correct spacing for commas
  # The previous pattern "\'" was just "'", making the substitution a no-op.
  text = text.replace("\\'", "'")      # remove the backslash of \'
  return text.strip()


"""
      Function that inserts the other names used in the episodes to identify a character.
      This information is added to the character dictionary.
      If no extra names are found in the character page, the 'empty' value is inserted.
"""

def has_AKA(tag):
  """Tell whether `tag` carries the attribute data-source="AKA"."""
  if not tag.has_attr('data-source'):
    return False
  return tag.attrs['data-source'] == 'AKA'

def insertOtherNames(soup, character):
  """Store the alternative names found on a character page under character['aka'].

  Args:
    soup: parsed character page.
    character: dict describing the character; modified in place.

  character['aka'] becomes a list of cleaned names, or the string 'empty'
  when the page has no element with data-source="AKA".

  Raises:
    ValueError: when the AKA container matches none of the handled layouts.
  """
  
  aka_elem = soup.find_all(has_AKA)
  aka = 'empty' # initially empty value

  if not(aka_elem == []):  #first look if the <div> for aka exists otherwise insert 'empty' as value
    div = aka_elem[0]
    aka = []
    # the second child of the AKA element holds the value(s)
    div = list(div.children)[1]
    elems = list(div.children)
    # NOTE(review): elems[0] raises IndexError if the value container has no children — confirm this cannot happen.

    if(type(elems[0]) != bs4.element.NavigableString or len(elems)>1):
      elem = elems[0]
      if elem.name == 'ul':
        # one name per list item; the pieces of an item are joined with single spaces
        for li in list(elem.children):
          text  = ''
          # NOTE(review): `.string` is presumably None for a child with several children, which would make the concatenation fail — confirm.
          for elem in list(li.children):
            if text == '' or text[-1] == ' ':
              text += elem.string
            else: 
              text += ' '+ elem.string
          text = clean_name(text)
          aka.append(text)

      else:   # not a list but composed of more than one html element
        text = ''
        for child in elems:
          # a <br> closes the current name and starts the next one
          if child.name == 'br':
            aka.append(text)
            text = ''
            continue
          if text == '' or text[-1] == ' ':
            text += child.string
          else: 
            text += ' '+ child.string
          text = clean_name(text)
        # the last name has no trailing <br>
        aka.append(text)

    elif(div.string != None):  # just one aka case (directly contained in the second div)
       aka.append(clean_name(div.string))

    else:
      raise ValueError(f"element diffrent from ul: {div.name}")

  character['aka'] = aka
  return 

## Episodes' links  retrieving

In [None]:
"""
      Retrieve list of episodes for the arc_n -1 
"""
def get_episodes(arc_n):
  """Return a dict mapping episode numbers (e.g. '1x01') to their wiki URLs.

  The episodes are read from the "wikitable" tables of the page at
  HTTPS_LINK_EPISODES, for the zero-based arc `arc_n` of the campaign
  selected by the global campaign_n.

  Raises:
    AssertionError: when the first/last episode retrieved do not match the
      bounds declared in eps_arc1 / eps_arc2.
  """

  # aux function used to scan the tables in the page 
  # Adds to `links_eps` one entry per link of `table_C` whose href does not start with "https".
  # (`arc` is not used inside the function.)
  def scan_table(arc, table_C, links_eps):
    EP_NUM = None
    for link in table_C.find_all('a'):
      if link.get('href')[:5] != "https":
        href=link.get('href')
        url = "https://criticalrole.fandom.com" + href

        # get episode number 
        # (text of the following <small> sibling(s), without first and last character)
        ep_siblings  = list(link.next_siblings)
        for ep_n in ep_siblings:
          if ep_n.name == "small":
              EP_NUM=ep_n.get_text()[1:-1]

        # NOTE(review): EP_NUM is not reset per link, so a link without a <small>
        # sibling is stored under the previous episode number (or None).
        links_eps[EP_NUM] = url


  soup_C = get_soup(HTTPS_LINK_EPISODES) # soup valid for all the campaigns
  links_eps = {}

  tables_C = soup_C.find_all("table", {"class": "wikitable"})
  
  if campaign_n == 1:

    # start looking right after the tables of campaign one (one per arc)
    arc = len(eps_arc1)
    table_C = tables_C[arc]
    
    # 1) search for the table with the first episode, to know when to start storing data

    tmp = {}
    scan_table(arc, table_C, tmp)
    # keys look like '<campaign>x<episode>': keep the episode part
    first_ep = int(list(tmp.keys())[0].split('x')[1]) # is 1

    while(first_ep != eps_arc2[arc_n][0]):
      tmp = {}
      arc += 1
      table_C = tables_C[arc]
      scan_table(arc, table_C, tmp)
      first_ep = int(list(tmp.keys())[0].split('x')[1]) 

    # 2) start storing data and stop when last episode of the current arc is reached

    scan_table(arc, table_C, links_eps)
    last_ep = int(list(links_eps.keys())[-1].split('x')[1])

    while(last_ep != eps_arc2[arc_n][1]):
      arc += 1
      table_C = tables_C[arc]
      scan_table(arc, table_C, links_eps)
      last_ep = int(list(links_eps.keys())[-1].split('x')[1])

  else:
    # any other campaign value: the table index is taken to be the arc index
    arc = arc_n
    table_C = tables_C[arc]
    scan_table(arc, table_C, links_eps)
  
  # verify correctness on first and last episode retrieved
  episodes = list(links_eps.keys())
  start_ep = int(episodes[0].split('x')[1])
  end_ep = int(episodes[-1].split('x')[1])

  if campaign_n == 0:
    assert start_ep == eps_arc1[arc_n][0]
    assert end_ep   == eps_arc1[arc_n][1]
  elif campaign_n == 1:
    assert start_ep == eps_arc2[arc_n][0]
    assert end_ep   == eps_arc2[arc_n][1]

  print(f"link extracted: {len(links_eps.items())}")
  return links_eps

links_eps = get_episodes(arc_n)
print_table(links_eps)

link extracted: 23
0)   K: 1x01                           V: https://criticalrole.fandom.com/wiki/Arrival_at_Kraghammer
1)   K: 1x02                           V: https://criticalrole.fandom.com/wiki/Into_the_Greyspine_Mines
2)   K: 1x03                           V: https://criticalrole.fandom.com/wiki/Strange_Bedfellows
3)   K: 1x04                           V: https://criticalrole.fandom.com/wiki/Attack_on_the_Duergar_Warcamp
4)   K: 1x05                           V: https://criticalrole.fandom.com/wiki/The_Trick_about_Falling
5)   K: 1x06                           V: https://criticalrole.fandom.com/wiki/Breaching_the_Emberhold
6)   K: 1x07                           V: https://criticalrole.fandom.com/wiki/The_Throne_Room
7)   K: 1x08                           V: https://criticalrole.fandom.com/wiki/Glass_and_Bone
8)   K: 1x09                           V: https://criticalrole.fandom.com/wiki/Yug%27Voril_Uncovered
9)   K: 1x10                           V: https://criticalrole.fandom.com

## All NPCs

In [None]:
def extraction_allNPC(verbose = False, includesAka = True):
  """Extract the whole list of NPCs from the wiki category pages.

  Starts at LINK_CR2 and follows the "next page" button until there is none.

  Args:
    verbose: print the pagination links, the visited character links and
      every extracted character.
    includesAka: also download every character page and add its
      alternative names under the 'aka' key (see insertOtherNames).

  Returns:
    A list of dicts with the keys 'name', 'link', 'type' ('npc') and, when
    `includesAka` is set, 'aka'. 'link' is the string 'empty' when the list
    entry has no usable href.
  """
  NPC_characters = []
  link_page = LINK_CR2
  while link_page is not None:
    soup = get_soup(link_page)
    div_alphaList = soup.find_all(class_= 'category-page__members')[0]

    # get next link page (None ends the loop after this page)
    next_buttons = soup.find_all(class_='category-page__pagination-next wds-button wds-is-secondary')
    link_page = next_buttons[0].get('href') if next_buttons else None

    if verbose: print(f'next link is:{link_page}')

    for div_letter in list(div_alphaList.children):
      # skip the page header: it has no second child holding a list
      try:
        ul_elem = list(div_letter.children)[1]
      except (AttributeError, IndexError):
        continue

      for li in list(ul_elem.children):
        a_elem = list(li.children)[1]

        try:
          href = a_elem.get('href')
        except AttributeError:
          href = None
        # a missing href used to produce a "...comNone" link instead of 'empty'
        link = ('https://criticalrole.fandom.com' + str(href)) if href else "empty"

        character_name = clean_name(a_elem.text)
        npc_character = {'name': character_name,'link':link, 'type':'npc'}

        # includes aka from the character
        if includesAka:
          if link != 'empty':
            soup_character = get_soup(link)
            if verbose: print(link)
            insertOtherNames(soup_character,npc_character)
          else:
            npc_character['aka'] = 'empty'

        if verbose: print(npc_character)
        NPC_characters.append(npc_character)

  return NPC_characters

allNPCs  = extraction_allNPC (verbose = True)
print_list(allNPCs)

# Save
saveJSON(allNPCs, PATH_GT_R, 'all_NPCs', download = False)

next link is:https://criticalrole.fandom.com/wiki/Category:Non-player_characters?from=Dispater
https://criticalrole.fandom.com/wiki/Abban
{'name': 'Abban', 'link': 'https://criticalrole.fandom.com/wiki/Abban', 'type': 'npc', 'aka': 'empty'}
https://criticalrole.fandom.com/wiki/Abdar
{'name': 'Abdar', 'link': 'https://criticalrole.fandom.com/wiki/Abdar', 'type': 'npc', 'aka': 'empty'}
https://criticalrole.fandom.com/wiki/Abrianna_Mirimm
{'name': 'Abrianna Mirimm', 'link': 'https://criticalrole.fandom.com/wiki/Abrianna_Mirimm', 'type': 'npc', 'aka': 'empty'}
https://criticalrole.fandom.com/wiki/Acek_Orattim
{'name': 'Acek Orattim', 'link': 'https://criticalrole.fandom.com/wiki/Acek_Orattim', 'type': 'npc', 'aka': 'empty'}
https://criticalrole.fandom.com/wiki/Adeen_Tasithar
{'name': 'Adeen Tasithar', 'link': 'https://criticalrole.fandom.com/wiki/Adeen_Tasithar', 'type': 'npc', 'aka': 'empty'}
https://criticalrole.fandom.com/wiki/Adelaide_Bluebutton
{'name': 'Adelaide Bluebutton', 'link': 

## Characters' role

### Main

In [None]:
"""
      Extraction of main characters
"""

# modify this function if you want to extract from a different campaign
def extract_main(verbose=True, only_gallery = True):
  """Return the main characters of the campaign selected by campaign_n.

  Args:
    verbose: print every extracted character.
    only_gallery: when False, the links of the paragraph following the
      campaign heading are processed too (skipping pages that belong to
      the cast category), before the gallery.

  Returns:
    A list of dicts with the keys 'name', 'link' and 'type' ('main').
  """
  # id of the heading span of the campaign section on the page
  # NOTE(review): name_campaign stays undefined for any other campaign_n value.
  if campaign_n == 0:
    name_campaign = 'Campaign_One:_Vox_Machina'
  elif campaign_n == 1:
    name_campaign = 'Campaign_Two:_The_Mighty_Nein'

  main_characters = []
  soup = get_soup(LINK_CR1)

  # extraction of main characters
  span_section = soup.find_all(id = name_campaign)[0]
  h2_main = span_section.parent
  # second and third sibling after the heading: used below as paragraph and gallery
  elem_main = list(h2_main.next_siblings)[1:3]

  # extracting from the paragraph
  if not(only_gallery):
    p_main = elem_main[0]

    for child in list(p_main.children):
      # plain text between the links is ignored
      if type(child) is bs4.element.NavigableString: continue

      # NOTE(review): a missing href gives str(None), i.e. a "...comNone" link, not "empty".
      try:
        link = 'https://criticalrole.fandom.com' + str(child.get('href'))
      except:
        link = "empty"
      
      character_name = clean_name(child.text)
      main_character = {'name':character_name,'link':link, 'type':'main'}

      # check if is a pg or a member of the cast, in this case skip the pg
      soup_pg = get_soup(link)
      a_cast = soup_pg.find_all(title ='Category:Cast')
      if a_cast != []:
        continue

      if verbose: print(main_character)
      main_characters.append(main_character)

  # extracting from the gallery
  gallery_main = elem_main[1]
  div_children = list(gallery_main.children)
  for div_child in div_children:
    # second child of a gallery item is used as caption; the link is three levels of first children below it
    div_caption = list(div_child.children)[1]
    a_caption = next(next(next(div_caption.children).children).children)
    try:
      link = 'https://criticalrole.fandom.com' + str(a_caption.get('href'))
    except:
      link = "empty"
    character_name = clean_name(a_caption.text)
    main_character = {'name':character_name,'link':link, 'type':'main'}
    if verbose: print(main_character)
    main_characters.append(main_character)

  return main_characters

main_characters = extract_main(verbose = False,  only_gallery = False)

print_list(main_characters)

0)   {'name': 'Trinket', 'link': 'https://criticalrole.fandom.com/wiki/Trinket', 'type': 'main'}
1)   {'name': 'Doty', 'link': 'https://criticalrole.fandom.com/wiki/Doty', 'type': 'main'}
2)   {'name': 'Grog Strongjaw', 'link': 'https://criticalrole.fandom.com/wiki/Grog_Strongjaw', 'type': 'main'}
3)   {'name': 'Keyleth', 'link': 'https://criticalrole.fandom.com/wiki/Keyleth', 'type': 'main'}
4)   {'name': 'Percival de Rolo', 'link': 'https://criticalrole.fandom.com/wiki/Percival_de_Rolo', 'type': 'main'}
5)   {'name': 'Pike Trickfoot', 'link': 'https://criticalrole.fandom.com/wiki/Pike_Trickfoot', 'type': 'main'}
6)   {'name': 'Scanlan Shorthalt', 'link': 'https://criticalrole.fandom.com/wiki/Scanlan_Shorthalt', 'type': 'main'}
7)   {'name': 'Taryon Darrington', 'link': 'https://criticalrole.fandom.com/wiki/Taryon_Darrington', 'type': 'main'}
8)   {'name': 'Tiberius Stormwind', 'link': 'https://criticalrole.fandom.com/wiki/Tiberius_Stormwind', 'type': 'main'}
9)   {'name': "Vax'ildan"

In [None]:
# modify this function if you want to extract from a different campaign
def extract_guest(verbose=True):
  """Extract the guest (secondary) player characters of the selected campaign.

  Reads the gallery that follows the "Guest player characters" heading of
  the page at LINK_CR1, for the campaign selected by campaign_n.

  Args:
    verbose: print every extracted character.

  Returns:
    A list of dicts with the keys 'name', 'link' and 'type' ('guest').
    'link' is the string 'empty' when the caption has no usable href.

  Raises:
    ValueError: for an unsupported campaign_n, or when the element after
      the heading is not the expected gallery <div>.
  """
  if campaign_n == 0:
    id_span = 'Guest_player_characters_of_Campaign_One'
  elif campaign_n == 1:
    id_span = 'Guest_player_characters_of_Campaign_Two'
  else:
    raise ValueError('Wrong selection for the campaign !')

  guest_characters = []
  soup = get_soup(LINK_CR1)

  span_section = soup.find_all(id = id_span)[0]
  h3_guest = span_section.parent
  gallery_elem = list(h3_guest.next_siblings)[1]
  if gallery_elem.name != 'div':
    raise ValueError("Problem with the extraction of the gallery element from HTML")

  # get element from the gallery
  for div_child in list(gallery_elem.children):
    div_caption = list(div_child.children)[1]
    center_elem = list(div_caption.children)[0]
    a_caption = next(center_elem.children)

    try:
      href = a_caption.get('href')
    except AttributeError:
      href = None
    # a missing href used to produce a "...comNone" link instead of 'empty'
    link = ('https://criticalrole.fandom.com' + str(href)) if href else "empty"

    character_name = clean_name(a_caption.text)
    guest_character = {'name':character_name,'link':link, 'type':'guest'}

    if verbose: print(guest_character)
    guest_characters.append(guest_character)

  return guest_characters

guest_characters = extract_guest(verbose =False)

print_list(guest_characters)

0)   {'name': 'Arkhan', 'link': 'https://criticalrole.fandom.com/wiki/Arkhan', 'type': 'guest'}
1)   {'name': 'Garthok', 'link': 'https://criticalrole.fandom.com/wiki/Garthok', 'type': 'guest'}
2)   {'name': 'Gern Blanston', 'link': 'https://criticalrole.fandom.com/wiki/Gern_Blanston', 'type': 'guest'}
3)   {'name': 'Kashaw Vesh', 'link': 'https://criticalrole.fandom.com/wiki/Kashaw_Vesh', 'type': 'guest'}
4)   {'name': 'Kerrek', 'link': 'https://criticalrole.fandom.com/wiki/Kerrek', 'type': 'guest'}
5)   {'name': 'Lillith Daturai', 'link': 'https://criticalrole.fandom.com/wiki/Lillith_Daturai', 'type': 'guest'}
6)   {'name': 'Lionel Gayheart', 'link': 'https://criticalrole.fandom.com/wiki/Lionel_Gayheart', 'type': 'guest'}
7)   {'name': 'Lyra', 'link': 'https://criticalrole.fandom.com/wiki/Lyra', 'type': 'guest'}
8)   {'name': 'Shale', 'link': 'https://criticalrole.fandom.com/wiki/Shale', 'type': 'guest'}
9)   {'name': 'Sprigg', 'link': 'https://criticalrole.fandom.com/wiki/Sprigg', '

### NPC

In [None]:
"""
      Extraction of NPC (the most importants) for the first campaign
"""
# modify this function if you want to extract from a different campaign
def extract_npc(verbose=True):
  """
      Scrape the "Notable NPCs" gallery from the page at LINK_CR1.

      The section is selected through the global `campaign_n`
      (0 -> 'Notable_NPCs_of_Campaign_One', 1 -> 'Notable_NPCs_of_Campaign_Two').

      Args:
        verbose: when True, print every record as soon as it is built.

      Returns:
        list of dicts with the keys 'name', 'link' and 'type' ('type' is always 'npc').
        'link' is the absolute wiki URL, or "empty" when the caption has no usable href.

      Raises:
        ValueError: if `campaign_n` is not 0 or 1, or if the element found after the
        section heading is not a <div>.
  """
  section_ids = {0: 'Notable_NPCs_of_Campaign_One', 1: 'Notable_NPCs_of_Campaign_Two'}
  if campaign_n not in section_ids:
    # fail with a clear message instead of an unbound-variable error further down
    raise ValueError(f"No NPC section is known for campaign_n = {campaign_n}")

  soup = get_soup(LINK_CR1)

  # the <span> with the section id lives inside the heading; the gallery follows the heading
  span_section = soup.find_all(id = section_ids[campaign_n])[0]
  h3_npc = span_section.parent

  gallery_elem = list(h3_npc.next_siblings)[1]
  if gallery_elem.name != 'div':
    raise ValueError("Problem with the extraction of the gallery element from HTML")

  npc_characters = []
  for div_child in list(gallery_elem.children):
    # gallery item -> caption <div> (2nd child) -> first child -> first node (the link)
    div_caption = list(div_child.children)[1]
    center_elem = list(div_caption.children)[0]
    a_caption = next(center_elem.children)

    try:
      href = a_caption.get('href')
    except AttributeError:
      # the caption is plain text, not a tag
      href = None

    # a missing href must give the "empty" sentinel, not the string '...comNone'
    if href:
      link = 'https://criticalrole.fandom.com' + str(href)
    else:
      link = "empty"

    npc_character = {'name': clean_name(a_caption.text), 'link': link, 'type': 'npc'}

    if verbose: print(npc_character)
    npc_characters.append(npc_character)

  return npc_characters

# run the NPC extraction without the per-character printing
npc_characters = extract_npc(verbose =False)

# display the extracted NPC records
print_list(npc_characters)

0)   {'name': 'Allura Vysoren', 'link': 'https://criticalrole.fandom.com/wiki/Allura_Vysoren', 'type': 'npc'}
1)   {'name': 'Artagan', 'link': 'https://criticalrole.fandom.com/wiki/Artagan', 'type': 'npc'}
2)   {'name': 'Cassandra de Rolo', 'link': 'https://criticalrole.fandom.com/wiki/Cassandra_de_Rolo', 'type': 'npc'}
3)   {'name': 'Groon', 'link': 'https://criticalrole.fandom.com/wiki/Groon', 'type': 'npc'}
4)   {'name': 'Jarett Howarth', 'link': 'https://criticalrole.fandom.com/wiki/Jarett_Howarth', 'type': 'npc'}
5)   {'name': "J'mon Sa Ord", 'link': 'https://criticalrole.fandom.com/wiki/J%27mon_Sa_Ord', 'type': 'npc'}
6)   {'name': 'Kaylie', 'link': 'https://criticalrole.fandom.com/wiki/Kaylie', 'type': 'npc'}
7)   {'name': 'Kima of Vord', 'link': 'https://criticalrole.fandom.com/wiki/Kima_of_Vord', 'type': 'npc'}
8)   {'name': 'Nahla', 'link': 'https://criticalrole.fandom.com/wiki/Nahla', 'type': 'npc'}
9)   {'name': 'Osysa', 'link': 'https://criticalrole.fandom.com/wiki/Osysa',

### Ally

In [None]:
"""
      Extraction of allies
"""

def extraction_allies(verbose = False):
  """
      Collect the characters listed in the category page at LINK_CR3.

      Every <li> of every letter group inside the first element with class
      'category-page__members' is turned into a record.

      Args:
        verbose: when True, print every record as soon as it is built.

      Returns:
        list of dicts with the keys 'name', 'link' and 'type' ('type' is always 'ally').
  """
  base_url = 'https://criticalrole.fandom.com'
  page = get_soup(LINK_CR3)
  members_box = page.find_all(class_= 'category-page__members')[0]

  collected = []
  for letter_group in list(members_box.children):
    # the list of names is the second child of the letter group
    entries = list(letter_group.children)[1]

    for entry in list(entries.children):
      # the link is the second child of the list item
      anchor = list(entry.children)[1]

      try:
        url = base_url + str(anchor.get('href'))
      except:
        url = "empty"

      record = {'name': clean_name(anchor.text), 'link': url, 'type': 'ally'}

      if verbose:
        print(record)
      collected.append(record)

  return collected

# run the ally extraction without the per-character printing
ally_characters  = extraction_allies (verbose = False)

# display the extracted ally records
print_list(ally_characters)

0)   {'name': 'Adella', 'link': 'https://criticalrole.fandom.com/wiki/Adella', 'type': 'ally'}
1)   {'name': 'Allura Vysoren', 'link': 'https://criticalrole.fandom.com/wiki/Allura_Vysoren', 'type': 'ally'}
2)   {'name': 'Artagan', 'link': 'https://criticalrole.fandom.com/wiki/Artagan', 'type': 'ally'}
3)   {'name': 'Assum Emring', 'link': 'https://criticalrole.fandom.com/wiki/Assum_Emring', 'type': 'ally'}
4)   {'name': 'Bertrand Bell', 'link': 'https://criticalrole.fandom.com/wiki/Bertrand_Bell', 'type': 'ally'}
5)   {'name': 'Calliope Clay', 'link': 'https://criticalrole.fandom.com/wiki/Calliope_Clay', 'type': 'ally'}
6)   {'name': 'Cassandra de Rolo', 'link': 'https://criticalrole.fandom.com/wiki/Cassandra_de_Rolo', 'type': 'ally'}
7)   {'name': 'Cerkonos', 'link': 'https://criticalrole.fandom.com/wiki/Cerkonos', 'type': 'ally'}
8)   {'name': 'Clarabelle Clay', 'link': 'https://criticalrole.fandom.com/wiki/Clarabelle_Clay', 'type': 'ally'}
9)   {'name': 'Clarota', 'link': 'https://c

### Antagonist

In [None]:
"""
      Extraction of antagonist
"""

def extraction_antagonists(verbose = False):
  """
      Collect the antagonists from the category page at LINK_CR4.

      The entry for the Chroma Conclave group is skipped in the main listing; its
      members are read afterwards from the 'category-page__trending-pages' list of
      the group's own category page and appended at the end.

      Args:
        verbose: when True, print every record as soon as it is built.

      Returns:
        list of dicts with the keys 'name', 'link' and 'type' ('type' is always 'antagonist').
  """
  base_url = 'https://criticalrole.fandom.com'
  chroma_labels = ['Category:Chroma Conclave','Chroma Conclave']
  chroma_url = 'https://criticalrole.fandom.com/wiki/Category:Chroma_Conclave'
  collected = []

  def absolute_link(anchor):
    # absolute URL of the anchor, "empty" when it cannot be read
    try:
      return base_url + str(anchor.get('href'))
    except:
      return "empty"

  def store(raw_name, url):
    # build the record, optionally print it, and add it to the result
    record = {'name':clean_name(raw_name),'link':url, 'type':'antagonist'}
    if verbose: print(record)
    collected.append(record)

  # ---- alphabetical listing of the category page
  page = get_soup(LINK_CR4)
  members_box = page.find_all(class_= 'category-page__members')[0]
  for letter_group in list(members_box.children):
    entries = list(letter_group.children)[1]
    for entry in list(entries.children):
      anchor = list(entry.children)[1]

      # the group itself is not a character, its members are added below
      if anchor.string in chroma_labels:
        continue

      url = absolute_link(anchor)
      store(anchor.text, url)

  # ---- members of the Chroma Conclave
  chroma_page = get_soup(chroma_url)
  trending = chroma_page.find_all(class_ ='category-page__trending-pages')[0]
  for item in list(trending.children)[1:]:
    anchor = list(item.children)[0]
    url = absolute_link(anchor)

    # the name is the second child (caption) of the element right after the anchor
    figure = anchor.next_element
    caption = list(figure.children)[1]
    store(caption.string, url)

  return collected

# run the antagonist extraction without the per-character printing
antagonist_characters = extraction_antagonists(verbose =False)

# display the extracted antagonist records
print_list(antagonist_characters)

0)   {'name': 'Algar Dyomin', 'link': 'https://criticalrole.fandom.com/wiki/Algar_Dyomin', 'type': 'antagonist'}
1)   {'name': 'Anna Ripley', 'link': 'https://criticalrole.fandom.com/wiki/Anna_Ripley', 'type': 'antagonist'}
2)   {'name': 'Avantika', 'link': 'https://criticalrole.fandom.com/wiki/Avantika', 'type': 'antagonist'}
3)   {'name': 'Briarwoods', 'link': 'https://criticalrole.fandom.com/wiki/Briarwoods', 'type': 'antagonist'}
4)   {'name': 'Brimscythe', 'link': 'https://criticalrole.fandom.com/wiki/Brimscythe', 'type': 'antagonist'}
5)   {'name': 'Byron Anders', 'link': 'https://criticalrole.fandom.com/wiki/Byron_Anders', 'type': 'antagonist'}
6)   {'name': 'Clarota', 'link': 'https://criticalrole.fandom.com/wiki/Clarota', 'type': 'antagonist'}
7)   {'name': 'Clasp', 'link': 'https://criticalrole.fandom.com/wiki/Clasp', 'type': 'antagonist'}
8)   {'name': "Conan O'Brien", 'link': 'https://criticalrole.fandom.com/wiki/Conan_O%27Brien_(character)', 'type': 'antagonist'}
9)   {'na

### Merge and filter data

In [None]:
"""
      Merge all the data from character extraction
"""
# single flat list with the records of every category, in this fixed order;
# the same character may appear more than once with a different 'type'
role_characters  = [*main_characters,*guest_characters,*npc_characters,*ally_characters,*antagonist_characters]


# aux functions for filtering

def has_data_source_first(tag):
  """ Tag filter: True only for tags whose 'data-source' attribute is exactly 'First'. """
  if not tag.has_attr('data-source'):
    return False
  return tag.attrs['data-source'] == 'First'

def has_data_source_Last(tag):
  """ Tag filter: True only for tags whose 'data-source' attribute is exactly 'Last'. """
  if not tag.has_attr('data-source'):
    return False
  return tag.attrs['data-source'] == 'Last'

def get_match(elem,matching):
  """
      Apply `matching` to `elem` and to all its descendants.

      Args:
        elem: root node; its descendants are reached through the `children` attribute.
        matching: callable taking a node and returning a result, or None for "no match".

      Returns:
        list with every non-None result, in pre-order (a node before its children).
  """
  result = []
  aux_get_match(elem,matching,result)
  return result

def aux_get_match(elem,matching, result):
  """
      Recursive worker of get_match: appends the non-None results to `result`.

      A node without iterable `children` is treated as a leaf, and a `matching` call
      that raises is treated as "no match".
  """
  try:
    children = list(elem.children)
  except Exception:
    # no usable `children` on this node: nothing to descend into
    children = []

  try:
    res = matching(elem)
  except Exception:
    # e.g. a regex applied to a node that has no text
    res = None

  if res is not None:
    result.append(res)

  for child in children:
    aux_get_match(child, matching, result)

"""
      Filter function for characters of the chosen arc
"""
def filterCharArc(verbose = False):
  """
      Filter the global list `role_characters`, in place, keeping the characters of the selected arc.

      Step 1 downloads the page of every character, looks for the elements whose
      'data-source' attribute is 'First' / 'Last' and reads from them the appearance
      code "(<campaign>x<episode>)". A character is removed when the 'First' element is
      missing, or when both elements exist but no code can be read from either.
      insertOtherNames() is called for every character that is kept.

      Step 2 compares the codes with the globals `campaign_n` and `arc_n` (both 0-based,
      hence the "+1") and removes the characters that fall outside the selected arc.
      The campaign codes 'S' and 's' mark special episodes.

      Args:
        verbose: when True, print a trace of every decision.

      Returns:
        None.
  """
  if verbose: print(f"Getting data from the PGs' pages.\nStarting with {len(role_characters)} characters\n")

  # raw string: '\(' , '\w' and '\d' are regex escapes, not Python string escapes
  code_pattern = re.compile(r'^\(\wx\d*\)$')

  def parse_appearance(container):
    """ (campaign, episode) strings read from the second child of `container`, or 'empty' """
    div = list(container.children)[1]
    found = get_match(div, lambda e: code_pattern.search(e.string))
    if not found:
      return 'empty'
    parts = found[0].group().replace('(','').replace(')','').split('x')
    return (parts[0], parts[1])

  def is_special(appearance):
    """ True when the campaign code marks a special episode (upper or lower case) """
    return appearance != 'empty' and appearance[0] in ('S', 's')

  def position(appearance, fallback):
    """ (campaign, arc) of an appearance; `fallback` for both when it is unknown or special """
    if appearance == 'empty' or is_special(appearance):
      return fallback, fallback
    # NOTE(review): a campaign code that is neither a digit nor 'S'/'s' still raises ValueError here
    campaign = int(appearance[0])
    if campaign in (1, 2):
      return campaign, get_arc(int(appearance[1]))
    # later campaigns: it is enough to know that they come after the selected arc
    return campaign, math.inf

  def drop(pg, reason):
    """ remove `pg` from role_characters, tolerating a character that is already gone """
    try:
      role_characters.remove(pg)
    except ValueError:
      pass
    if verbose: print(f"removing character: {pg['name']} {reason}")

  results_page = []
  to_remove = []
  count_removed = 0

  # ---------------------------------- (step 1) ----------------------------------------------------

  for character in role_characters:
    if verbose: print(character)

    soup = get_soup(character['link'])

    try:
      first_elem = soup.find_all(has_data_source_first)[0]
    except Exception:
      first_elem = None
    try:
      last_elem =  soup.find_all(has_data_source_Last)[0]
    except Exception:
      last_elem = None

    # without the first-appearance element the character cannot be placed: remove it
    if first_elem is None:
      to_remove.append(character)
      count_removed +=1
      if verbose: print(f"--> Not found first appearance for the character: {character['name']}, removing from the data...")
      continue

    result_Page = {'char': character}
    result_Page['first'] = parse_appearance(first_elem)
    result_Page['last'] = 'empty' if last_elem is None else parse_appearance(last_elem)

    # a missing last-appearance element is accepted as it is; when both elements exist,
    # at least one of the two codes must be readable
    keep = (last_elem is None) or result_Page['first'] != 'empty' or result_Page['last'] != 'empty'

    if keep:
      if verbose: print(f"--> inserting character: {character['name']}")
      results_page.append(result_Page)
      # includes other name for the character
      insertOtherNames(soup,character)
    else:
      to_remove.append(character)
      count_removed +=1
      if verbose: print(f"--> found incompatible first and last appearance for the character: {character['name']}, removing from the data...")

  # removal is postponed so that the list is not modified while it is being iterated
  for rem in to_remove:
    role_characters.remove(rem)

  if verbose:
    print(f'removed {count_removed} characters for lack of information')
    print(f'Current characters number: {len(role_characters)}')

  # ---------------------------------- (step 2) ----------------------------------------------------

  # now analyze the data retrieved (results_page) to filter for the characters in the right arc
  count_removed = 0
  for data in results_page:
    pg = data['char']
    first = data['first']
    last = data['last']

    # [case 1]: both first and last appearance in a special episode (not added to the tally)
    if is_special(first) and is_special(last):
      drop(pg, "for first and last appearance in 'S'")
      continue

    # unknown/special first appearance -> the very beginning; unknown/special last -> never ends
    f_c, f_a = position(first, 1)
    l_c, l_a = position(last, math.inf)

    # [case 2]: first appearance in a campaign after the selected one
    if f_c > campaign_n +1:
      drop(pg, "for first appearance not in correct campaign")
      count_removed += 1
      continue

    # [case 3]: last appearance in a campaign before the selected one
    if l_c < campaign_n +1:
      drop(pg, "for last appearance not in correct campaign")
      count_removed += 1
      continue

    # [case 4]: first appearance in the selected campaign but in an arc after arc_n+1
    if f_a > arc_n+1 and (f_c == campaign_n+1):
      drop(pg, f"for first appearance in next arcs of: {arc_n +1}")
      count_removed += 1
      continue

    # [case 5]: last appearance in the selected campaign but in an arc before arc_n+1
    if l_c == campaign_n +1 and l_a < arc_n+1:
      drop(pg, f"for last appearance not previous arcs of: {arc_n +1}")
      count_removed += 1
      continue

  print(f"removed: {count_removed}")
  return

# remove the characters that are not in the selected arc (arc_n + 1);
# the kept entries are also enriched with the other names used for their identification, if present
filterCharArc(verbose =True)

# show the filtered list
print('\nExtracterd roles:')
print_list(role_characters)

# save on drive, one file per arc, without triggering a download
saveJSON(role_characters, PATH_GT_R, 'gt_roles_arc'+str(arc_n+1), download = False)

Getting data from the PGs' pages.
Starting with 186 characters

{'name': 'Trinket', 'link': 'https://criticalrole.fandom.com/wiki/Trinket', 'type': 'main'}
--> inserting character: Trinket
{'name': 'Doty', 'link': 'https://criticalrole.fandom.com/wiki/Doty', 'type': 'main'}
--> inserting character: Doty
{'name': 'Grog Strongjaw', 'link': 'https://criticalrole.fandom.com/wiki/Grog_Strongjaw', 'type': 'main'}
--> inserting character: Grog Strongjaw
{'name': 'Keyleth', 'link': 'https://criticalrole.fandom.com/wiki/Keyleth', 'type': 'main'}
--> inserting character: Keyleth
{'name': 'Percival de Rolo', 'link': 'https://criticalrole.fandom.com/wiki/Percival_de_Rolo', 'type': 'main'}
--> inserting character: Percival de Rolo
{'name': 'Pike Trickfoot', 'link': 'https://criticalrole.fandom.com/wiki/Pike_Trickfoot', 'type': 'main'}
--> inserting character: Pike Trickfoot
{'name': 'Scanlan Shorthalt', 'link': 'https://criticalrole.fandom.com/wiki/Scanlan_Shorthalt', 'type': 'main'}
--> inserting 

## Characters for episode

In [None]:
"""
      Extract characters from the episode page with the layout using single column
"""

# --- aux function n°1 to handle the case of list in a list
def nestedList_extraction(elem, characters_ep, typology):
  """
      Append to `characters_ep` one record for every entry of a nested <ul> list.

      Args:
        elem: the nested <ul> element.
        characters_ep: list of character records, extended in place.
        typology: id of the section the list belongs to; the party section ids
                  ('Vox_Machina', 'The_Mighty_Nein', 'Player_Characters') become "main".

      Returns:
        None.
  """
  for entries in list(elem.children): # scan rows of the list

    tags_children = list(entries.children)
    # crossed out text: skip this entry only, the rest of the list is still valid
    if tags_children[0].name == 's': continue

    # analyze children tags since both plain text + link text present
    links = []
    text = ''
    for tag_children in tags_children: # scan elements of the entry

      if tag_children.name == 'ul':
        # deeper nesting: same section, so the typology is passed down
        nestedList_extraction(tag_children, characters_ep, typology)
        continue

      if type(tag_children) is bs4.element.NavigableString: # plain text
        text += (str(tag_children) + " ")

      else:
        try:
          text += tag_children.text
        except AttributeError:
          pass

        try:
          links.append("https://criticalrole.fandom.com" + tag_children['href'])
        except (KeyError, TypeError, AttributeError):
          # node without an href (not a link)
          pass

    # ---- one record per entry, built once the whole entry has been read

    # divide text in name + extra:
    # 2 patterns for extra, right side after colon or in round brackets
    extra = "empty"
    if ':' in text:
      splitted_text = text.split(':')
      text = splitted_text[0]
      extra = clean_name(splitted_text[1].strip())

    matched = re.search(r'\(.*\)',text)
    if matched is not None:
      if extra != "empty":
        extra = (matched.group() + " " + extra).strip()
      else:
        extra = (matched.group()).strip()
      text = text.replace(matched.group(),'')

    if typology in ["Vox_Machina",'The_Mighty_Nein', 'Player_Characters']:
      typology = "main"
    if len(links) == 0:
      links = "empty"

    # definition of the character table
    character = {'name':clean_name(text),'type':typology,'links':links,'extra':extra}
    characters_ep.append(character)

  return

# --- aux function n° 2: Insert the entity that represent the main group 

def insert_GroupEntity(characters_ep, ep_link, verbose = False):
  """
      Insert, at the head of `characters_ep`, the entry that represents the main group.

      The entry's 'name' is the comma-separated list of the 'main' characters of the
      episode, while the group name is stored under 'aka'. The group link is read from
      the heading of the party section in the episode page. Any failure leaves
      `characters_ep` untouched.

      Args:
        characters_ep: list of character records of the episode, modified in place.
        ep_link: URL of the episode page.
        verbose: not used.

      Returns:
        None.
  """
  # id of the party section heading for the selected campaign
  if campaign_n == 0:
    id_span = 'Vox_Machina'
  elif campaign_n == 1:
    id_span = 'The_Mighty_Nein'

  try:
    page = get_soup(ep_link)

    if campaign_n != 1:
      heading_span = page.find_all(id = id_span)[0]
    else:
      # campaign 2 pages may use the generic section id instead of the group one
      try:
        heading_span = page.find_all(id = id_span)[0]
      except:
        heading_span = page.find_all(id = 'Player_Characters')[0]

    anchor = heading_span.contents[0].contents[0]
    group_link = "https://criticalrole.fandom.com" + anchor['href']

    if campaign_n == 0:
      group_name = anchor['title']  #i.e. Vox Machina
    elif campaign_n == 1:
      # fixed, since an inconsistent name is often used in the a tag
      group_name = 'The Mighty Nein'

    # string with all the names of the party in this episode
    members = ''
    for entry in characters_ep:
      if entry['type'] != 'main':
        continue
      separator = ', ' if members != '' else ''
      members += separator + entry['name']

    characters_ep.insert(0, {
        'name': members,
        'type': 'empty',
        'links': group_link,
        'extra': 'empty',
        'role': 'empty',
        'aka': [group_name],
        'link-u': group_link,
    })
  except:
    # no needed to insert the additional entry since no members are present (like for special episode)
    pass

  return

def extract_characters_layout1(ep_link, verbose = False):
  """
      Extract the featured characters from an episode page that uses the single column layout.

      Starting from the heading that contains the element with id "Featured_Characters",
      the following siblings are scanned: every <ul> is a list of characters, whose
      typology is the id found in the element that precedes the list. The scan stops at
      the "Inventory" or "Quotations" section, or when the siblings are over.

      Args:
        ep_link: URL of the episode page.
        verbose: when True, print every record as soon as it is built.

      Returns:
        list of dicts with the keys 'name', 'type', 'links', 'extra' (plus 'role' for the
        main characters). The list is empty when no <ul> is found among the siblings.
  """
  characters_ep = []
  soup_ep = get_soup(ep_link)
  span_charaters_ep = soup_ep.find_all(id = "Featured_Characters")

  h2_characters = span_charaters_ep[0].parent
  sibling = h2_characters.next_sibling

  # stop also when the siblings are over: not every page has the closing sections
  while sibling is not None:

    try:
      first_child = list(sibling.children)[0]
      first_child_attrs = first_child.attrs
    except Exception: # skip found element not of interest (text node, empty tag, ...)
      sibling = sibling.next_sibling
      continue

    # exit condition: no more characters info in the page
    if 'id' in first_child_attrs:
      if first_child['id'] == "Inventory" or first_child['id'] == "Quotations": break

    # skip condition (not a list of names)
    if sibling.name != "ul":
      sibling = sibling.next_sibling
      continue

    # extract type: id of the heading before the list, ignoring the figures in between
    try:
      previous = sibling.previous_sibling
      while previous.name == 'figure':
        previous = previous.previous_sibling

      typology = list(previous.children)[0]['id']
    except Exception:
      typology = "empty"

    # extract characters info
    for elem in list(sibling.children):

      tags_children = list(elem.children)
      if tags_children[0].name == 's': continue # crossed out text, remove from pgs

      # analyze children tags since both plain text + link text present
      links = []
      text = ''
      for tag_children in tags_children:

        if tag_children.name == 'ul':
          nestedList_extraction(tag_children, characters_ep, typology)
          continue

        if type(tag_children) is bs4.element.NavigableString: # plain text
          text += (str(tag_children) + " ")

        else: # is a tag element
          try:
            text += tag_children.text
          except AttributeError:
            pass
          try:
            links.append("https://criticalrole.fandom.com" + tag_children['href'])
          except (KeyError, TypeError, AttributeError):
            # node without an href (not a link)
            pass

      # divide text in name + extra:
      # 2 patterns for extra, right side after colon or in round brackets
      extra = "empty"
      if ':' in text:
        splitted_text = text.split(':')
        text = splitted_text[0]
        extra = clean_name(splitted_text[1].strip())

      matched = re.search(r'\(.*\)',text)
      if matched is not None:
        if extra != "empty":
          extra = (matched.group() + " " + extra).strip()
        else:
          extra = (matched.group()).strip()
        text = text.replace(matched.group(),'')

      if typology in ["Vox_Machina",'The_Mighty_Nein', 'Player_Characters']:
        typology = "main"
      if len(links) == 0:
        links = "empty"

      # definition of the character table
      character = {'name':clean_name(text),'type':typology,'links':links,'extra':extra}
      if typology == "main":
        character['role'] = ["main"]
      characters_ep.append(character)

      if verbose:
        print_table(character)
        print()

    # next sibling for the while loop
    sibling = sibling.next_sibling

  return characters_ep


"""
      Extract characters from the episode page with the layout using double column
"""
def extract_characters_layout2(ep_link, verbose = False):
  """
      Extract the featured characters from an episode page that uses the double column layout.

      The element that follows the "Featured_Characters" heading is expected to hold the
      columns: its first two children are scanned, and every <ul> found in them is a list
      of characters whose typology is the id found in the element that precedes the list.

      Args:
        ep_link: URL of the episode page.
        verbose: when True, print every record as soon as it is built.

      Returns:
        list of dicts with the keys 'name', 'type', 'links', 'extra' (plus 'role' for the
        main characters).
  """
  characters_ep = []
  soup_ep = get_soup(ep_link)
  span_charaters_ep = soup_ep.find_all(id = "Featured_Characters")
  h2_characters = span_charaters_ep[0].parent
  sibling = h2_characters.next_sibling

  divs_children = list(sibling.children)[:2]
  for div in divs_children:
    for child in list(div.children):

      # skip condition (not a list of names)
      if child.name != "ul": continue

      # extract type
      try:
        typology = list(child.previous_sibling.children)[0]['id']
      except Exception:
        typology = "empty"

      # extract characters info
      for elem in list(child.children):
        tags_children = list(elem.children)
        if tags_children[0].name == 's': continue # crossed out text, remove from pgs

        # analyze children tags since both plain text + link text present
        links = []
        text = ''
        for tag_children in tags_children:

          if tag_children.name == 'ul':
            nestedList_extraction(tag_children, characters_ep, typology)
            continue

          if type(tag_children) is bs4.element.NavigableString: # plain text
            text += (str(tag_children) + " ")

          else: # is a tag element
            if tag_children.name == "sup": continue # <sup> elements are not part of the name
            try:
              text += tag_children.text
            except AttributeError:
              pass
            try:
              links.append("https://criticalrole.fandom.com" + tag_children['href'])
            except (KeyError, TypeError, AttributeError):
              # node without an href (not a link)
              pass

        # divide text in name + extra:
        # 2 patterns for extra, right side after colon or in round brackets
        extra = "empty"
        if ':' in text:
          # the text after the colon is the extra, as in the single column layout
          splitted_text = text.split(':')
          text = splitted_text[0]
          extra = clean_name(splitted_text[1].strip())

        matched = re.search(r'\(.*\)',text)
        if matched is not None:
          if extra != "empty":
            extra = (matched.group() + " " + extra).strip()
          else:
            extra = (matched.group()).strip()
          text = text.replace(matched.group(),'')

        if typology in ["Vox_Machina",'The_Mighty_Nein', 'Player_Characters']:
          typology = "main"
        if len(links) == 0:
          links = "empty"

        # definition of the character table
        character = {'name':clean_name(text),'type':typology,'links':links,'extra':extra}
        if typology == "main":
          character['role'] = ["main"]

        characters_ep.append(character)

        if verbose:
          print_table(character)
          print()

  return characters_ep



"""
      Fuction to launch the character extraction in each episode of the selected arc.
"""
def launchExtractionC(verbose = False):
  """
      Run the character extraction on every episode of the selected arc (`arc_n`).

      The single column parser is tried first; when it finds nothing the double
      column parser is used (e.g. episode 1x18). The entry for the main group is
      then inserted in the episode list.

      Args:
        verbose: when True, print the progress and the final table.

      Returns:
        dict: episode number -> list of character records of that episode.

      Raises:
        ValueError: when neither parser finds a character in an episode page.
  """
  per_episode = {}

  # episode number -> episode link
  episode_links = get_episodes(arc_n)

  for ep_number, ep_link in tqdm(episode_links.items()):

    if verbose: print(f"extracting characters from: {ep_link}\n")
    found = extract_characters_layout1(ep_link, verbose = verbose)

    if not found:
      # the web page is designed with a different layout, try the second parser
      if verbose: print(f"trying again for episode {ep_number}, it contains a different format")
      found = extract_characters_layout2(ep_link, verbose = verbose)

      if not found:
        raise ValueError(f"Problem in the extraction of the characters, check the web page format for {ep_number}")

    insert_GroupEntity(found, ep_link)
    per_episode[ep_number] = found

  if verbose: print_table(per_episode)
  return per_episode

In [None]:
# Load the role files and pre-load the soup of every character page that has a link
# (the soups are used by includeRoles as an additional equality criterion)

# NOTE(review): the `except Exception` below also catches errors raised by get_soup while
# the pages are downloaded, so "file not found" may hide a different failure.
try:
  tmp = loadJSON(PATH_GT_R,'gt_roles_arc'+str(arc_n +1))

  # list of [record, soup] pairs; the soup is 'empty' when the record has no link
  roles_array = []
  for elem in tqdm(tmp):
    if not(elem['link']=='empty'):
      roles_array.append([elem,get_soup(elem['link'])])
    else:
      roles_array.append([elem,'empty'])                      
except Exception as e :
  raise ValueError("Characters' role file not found")

try:
  tmp = loadJSON(PATH_GT_R,'all_NPCs')
  
  # same [record, soup] layout as above, for the records of the 'all_NPCs' file
  allNPCs_array = []
  for elem in tqdm(tmp):
    if not(elem['link']=='empty'):
      allNPCs_array.append([elem,get_soup(elem['link'])])
    else:
      allNPCs_array.append([elem,'empty'])
except Exception as e :
  raise ValueError("Complete list of NPC file not found")

- loading JSON file: gt_roles_arc1.json, at: /content/drive/MyDrive/Colab Notebooks/NUANS/project_bisazza_casadei/data/GT/roles/ ...


  0%|          | 0/64 [00:00<?, ?it/s]

- loading JSON file: all_NPCs.json, at: /content/drive/MyDrive/Colab Notebooks/NUANS/project_bisazza_casadei/data/GT/roles/ ...


  0%|          | 0/1035 [00:00<?, ?it/s]

In [None]:
"""
      Function that takes the information from the characters' role extraction and included directly data
      into characters for episode file
"""
def includeRoles(gt_c, verbose  =True):
  """
      Enrich, in place, the character records of every episode with the data of the role files.

      Every record is compared with the entries of the globals `roles_array` and
      `allNPCs_array` (lists of [record, soup] pairs). Two characters are considered
      the same when they share the link, the name or one of the 'aka' names, or when
      their pages (soups) are equal. On a match the 'type' of the entry is added to
      the record's 'role' list; 'name', 'aka' and 'link-u' are taken from the first
      source that matches, with the role file having priority on the NPC file.
      Records without any match keep 'empty' for 'role', 'aka' and 'link-u', unless
      these keys were already present.

      Args:
        gt_c: dict episode -> list of character records, as built by launchExtractionC.
        verbose: when True, print every record before and after the unification.

      Returns:
        None.
  """

  # --------- aux functions used to understand correspondence of characters between files
  def check_match(compare_attribute, item):
    """ match on link, then on name, then on one of the alternative names """
    if 'link' in compare_attribute and item['link'] == compare_attribute['link']:
      return True
    wanted = compare_attribute['name'].strip().lower()
    if item['name'].strip().lower() == wanted:
      return True
    return any(wanted == alias.strip().lower() for alias in item['aka'])

  def check_match2(soup_character_ep, soup_file):
    """ match on the page content; never matches when one of the two pages is missing """
    if (soup_character_ep != 'empty') and (soup_file != 'empty'):
      return soup_character_ep == soup_file
    return False

  def add_role(character, role):
    """ a character can have more than one role: add `role` only if not present yet """
    if character['role'] == 'empty':
      character['role'] = []
    if role not in character['role']:
      character['role'].append(role)

  def adopt_identity(character, item):
    """ use name, alternative names and link of the file entry for the unification """
    character['aka'] = item['aka']
    character['name'] = item['name']
    character['link-u'] = item['link']
  # ---------

  # pages downloaded so far, so that a character page is fetched once and not once per episode
  soup_cache = {}

  for ep, characters in tqdm(gt_c.items()):
    if verbose: print(f'Episode: {ep}')

    for character in characters:
      if verbose: print(f"Character-before: {character}")

      # first step: get the correct attribute for the matching of the characters in other files
      own_name = character['name'].strip().lower()
      compare_attribute = {}

      # NOTE(review): the group entry stores 'links' as a single string, so the loop below
      # walks its characters; confirm whether that entry should take part in the link search.
      if character['links'] != 'empty':   # try the comparison with link
        name_parts = [part.strip().lower() for part in character['name'].split(' ')]
        for link in character['links']:
          # the link is accepted when its last path segment contains a part of the name
          name_link = unquote(link).split('/')[-1].strip().lower()
          if any(part in name_link for part in name_parts):
            compare_attribute['link'] = link
            break

      # the name is always available, the link only when one has been accepted
      compare_attribute['name'] = own_name

      if verbose: print(f"Compare attribute: {compare_attribute}")

      # second step find merging with the file with the roles
      merged  = False

      # initialize with empty value; the group entry already has 'role', 'aka' and 'link-u'
      character.setdefault('role', 'empty')
      character.setdefault('aka', 'empty')
      character.setdefault('link-u', 'empty') # link for unification

      # get soup from the character of this iteration
      soup_character_ep = 'empty'
      if 'link' in compare_attribute:
        own_link = compare_attribute['link']
        if own_link in soup_cache:
          soup_character_ep = soup_cache[own_link]
        else:
          try:
            soup_character_ep = get_soup(own_link)
            soup_cache[own_link] = soup_character_ep
          except Exception:
            # page not reachable: fall back to the comparison on link and names only
            soup_character_ep = 'empty'

      # ----------------------------------[Unification on Roles]------------------------------------
      for item, soup in roles_array:
        # the soup comparison is evaluated only when the cheaper one fails
        if check_match(compare_attribute, item) or check_match2(soup_character_ep, soup):
          merged = True
          add_role(character, item['type'])
          adopt_identity(character, item)

      # ----------------------------------[Unification on NPCs]-------------------------------------

      # third step: look if it's part of the whole NPC list; the role is always added,
      # the identity only if no match has been found in the previous step
      for item, soup in allNPCs_array:
        if check_match(compare_attribute, item) or check_match2(soup_character_ep, soup):
          add_role(character, item['type'])
          if not merged:
            merged = True
            adopt_identity(character, item)

      if verbose: print(f"Character-after: {character}\n")
  return

In [None]:
# launch the extraction for every episode of the arc, then add role / aka / link-u to each
# character (includeRoles modifies characters_ep in place)
characters_ep = launchExtractionC(verbose = False)
includeRoles(characters_ep, verbose = False)

# show the enriched table (episode -> characters)
print_table(characters_ep)

# Save in drive folder, one file per arc, without triggering a download
saveJSON(characters_ep, PATH_GT_C, 'gt_characters_arc' + str(arc_n+1), download = False)

link extracted: 23


  0%|          | 0/23 [00:00<?, ?it/s]

  0%|          | 0/23 [00:00<?, ?it/s]

0)   K: 1x01                           V: [{'name': "Keyleth, Percival de Rolo, Scanlan Shorthalt, Tiberius Stormwind, Grog Strongjaw, Pike Trickfoot, Trinket, Vax'ildan, Vex'ahlia", 'type': 'empty', 'links': 'https://criticalrole.fandom.com/wiki/Vox_Machina', 'extra': 'empty', 'role': 'empty', 'aka': ['Vox Machina'], 'link-u': 'https://criticalrole.fandom.com/wiki/Vox_Machina'}, {'name': 'Keyleth', 'type': 'main', 'links': ['https://criticalrole.fandom.com/wiki/Keyleth'], 'extra': 'empty', 'role': ['main'], 'aka': ['Kiki', 'Minxie', 'Majesty or Highness', 'Antlers', 'Chief T', 'Tempest', 'Lady Keyleth'], 'link-u': 'https://criticalrole.fandom.com/wiki/Keyleth'}, {'name': 'Percival de Rolo', 'type': 'main', 'links': ['https://criticalrole.fandom.com/wiki/Percival_de_Rolo'], 'extra': 'empty', 'role': ['main'], 'aka': ['Percival de Rolo', 'Percy', 'No Mercy Percy', 'Freddy', 'Whitey', 'Four-Eyes'], 'link-u': 'https://criticalrole.fandom.com/wiki/Percival_de_Rolo'}, {'name': 'Scanlan Shor

In [None]:
# How to load the json file: the loaded object is simply used as a dictionary
gt_c = loadJSON(PATH_GT_C,'gt_characters_arc' + str(arc_n+1))

# NOTE: besides the keys listed in the string below, the saved records also carry
# 'role', 'aka' and 'link-u' (added by includeRoles / insert_GroupEntity)
"""
gt_c structure:
KEY = episode number -> VALUE = ARRAY Characters
ARRAY Characters, each dictionary with the following keys: 'name', 'type', 'links','extra'
"""


# show the loaded table
print_table(gt_c)

- loading JSON file: gt_characters_arc1.json, at: /content/drive/MyDrive/Colab Notebooks/NUANS/project_bisazza_casadei/data/GT/characters/ ...
0)   K: 1x01                           V: [{'name': "Keyleth, Percival de Rolo, Scanlan Shorthalt, Tiberius Stormwind, Grog Strongjaw, Pike Trickfoot, Trinket, Vax'ildan, Vex'ahlia", 'type': 'empty', 'links': 'https://criticalrole.fandom.com/wiki/Vox_Machina', 'extra': 'empty', 'role': 'empty', 'aka': ['Vox Machina'], 'link-u': 'https://criticalrole.fandom.com/wiki/Vox_Machina'}, {'name': 'Keyleth', 'type': 'main', 'links': ['https://criticalrole.fandom.com/wiki/Keyleth'], 'extra': 'empty', 'role': ['main'], 'aka': ['Kiki', 'Minxie', 'Majesty or Highness', 'Antlers', 'Chief T', 'Tempest', 'Lady Keyleth'], 'link-u': 'https://criticalrole.fandom.com/wiki/Keyleth'}, {'name': 'Percival de Rolo', 'type': 'main', 'links': ['https://criticalrole.fandom.com/wiki/Percival_de_Rolo'], 'extra': 'empty', 'role': ['main'], 'aka': ['Percival de Rolo', 'Percy',

## Timeline's events

In [None]:
"""
      Extract events of the right arc from the timeline 
"""

# Aux functions 
"check if navigable string elem has the episode number"
def is_formatEp(elem):
  """Tell whether `elem` is a text node holding an episode marker like "(1x05)".

  Only nodes whose type is exactly bs4.element.NavigableString are inspected;
  anything else (tags, NavigableString subclasses) gives False.

  Returns:
    True when the whole node text is "(<digits>x<digits>)", False otherwise.
  """
  # exact type comparison kept on purpose: subclasses of NavigableString
  # were rejected by the original check and still are
  if type(elem) != bs4.element.NavigableString:
    return False

  # Raw string: '\(' and '\d' are invalid escape sequences in a plain literal.
  # At least one digit is required on each side of the 'x', because extract_ep
  # converts both parts with int() and would fail on an empty one.
  return re.search(r'^\(\d+x\d+\)$', elem.string) is not None

""" get episode, campaing, and arc from  text of the format: (Campaign)x(Episode) """
def extract_ep(text, verbose = False):
  """Parse an episode marker of the form "(<campaign>x<episode>)".

  The arc is looked up in the module-level tables `eps_arc1` / `eps_arc2`,
  whose entries are read as (first_episode, last_episode) bounds, both
  inclusive; arcs are numbered from 1 following the table order.

  Args:
    text: marker text such as "(1x05)"; round brackets are removed first.
    verbose: when True, print the parsed campaign, episode and arc.

  Returns:
    The tuple (episode, arc, campaign), all integers.

  Raises:
    ValueError: when a part is not an integer, or when the episode is not
      inside any range of the selected table (this used to surface as an
      UnboundLocalError on `arc`).
    IndexError: when `text` has no 'x' separator.
  """
  parts = text.replace('(','').replace(')','').split('x')
  campaign = int(parts[0])
  episode = int(parts[1])

  # NOTE(review): the table choice mixes the campaign parsed from the text
  # (tested against 0) with the module-level campaign_n (tested against 1).
  # In practice eps_arc2 is used only when campaign_n == 1 and the parsed
  # campaign is not 0. Behaviour is kept as it was; confirm which of the two
  # values was meant to drive the selection.
  if campaign != 0 and campaign_n == 1:
    eps_arc = eps_arc2
  else:
    eps_arc = eps_arc1

  arc = None
  for i,ep_arc in enumerate(eps_arc):
    # no break: as before, the last matching range wins
    if ep_arc[0] <= episode <= ep_arc[1]:
      arc = i+1

  if arc is None:
    raise ValueError(f'episode {episode} of campaign {campaign} is outside every known arc range')

  if verbose: print(f'found event from campaign n°{campaign}, episode n°{episode}, and arc n°{arc}')
  return episode, arc, campaign

""" get tag or return False if is NavigableString type"""
def check_tag(elem, tag):
  """Return True only when `elem` is not a plain text node and is named `tag`.

  A node whose type is exactly bs4.element.NavigableString always gives False;
  for anything else the result is the comparison of `elem.name` with `tag`.
  """
  is_text_node = type(elem) == bs4.element.NavigableString
  return (not is_text_node) and elem.name == tag

""" aux function used to handle the case of a <ul> tag list inside another list"""
def insertInfoList(elem, events_info, prev_text):
  """Turn a nested <ul> into events, appending one dict per item to `events_info`.

  For every child of `elem` (treated as a list item) the text of its children
  is joined with blanks and prefixed with `prev_text`; the hrefs of its direct
  <a> children are resolved to absolute links. Items with no text are skipped.

  Args:
    elem: the nested <ul> element.
    events_info: list of events, mutated in place; each new entry is
      {'text': <str>, 'links': <list of str, or the string 'empty'>}.
    prev_text: text of the enclosing list item, used as prefix.

  Returns:
    None.
  """
  def join_text(acc, piece):
    # put a blank between acc and piece unless acc is empty or already ends with one
    if acc == '' or acc[-1] == ' ':
      return acc + piece
    return acc + ' ' + piece

  for li in list(elem.children):
    text = ''
    links = []
    # distinct loop name: the original re-used `elem`, shadowing the parameter
    for child in list(li.children):
      # `.string` is None when a tag has several (or no) children; fall back
      # to its whole text instead of failing on `str + None`
      piece = child.string
      if piece is None:
        piece = child.get_text()
      text = join_text(text, piece)

      if child.name == 'a':
        # NOTE(review): `web_archive` is not a parameter here, so it is read
        # from the module namespace; extract_eventTL does not forward its own
        # `web_archive` argument. Confirm a module-level value exists.
        if web_archive:
          tmp_link =  child['href'].split('https')[1]
          tmp_link = 'https' + tmp_link
          links.append(tmp_link)
        else:
          links.append('https://criticalrole.fandom.com' + child['href'])

    if links == []: links = 'empty'

    if text != '':
      # The separator depends on how prev_text ends (the original looked at
      # `text`, gluing the two parts together whenever text ended with a blank).
      new_text = join_text(prev_text, text)
      # these two prints are unconditional, as in the original
      print(f" text from the list tag -->   {new_text}")
      print(f"links from the list tag -->   {links}")
      event_info = {'text':new_text, 'links':links}
      events_info.append(event_info)
  return


# ------------ extraction function

def extract_eventTL(verbose = False, web_archive = True):
  """Scrape the wiki timeline table and return the events of the selected arc.

  The page at HTTPS_LINK_TIMELINE is fetched through get_soup(). The campaign
  section is chosen from the module-level `campaign_n` (0 or 1) and rows are
  filtered with the module-level `arc_n`: arc_n + 1 is compared with the arc
  number returned by extract_ep. Rows of earlier arcs are skipped and the
  scan stops at the first row belonging to a later arc.

  Args:
    verbose: print the text and links extracted from every row.
    web_archive: when True each href is rewritten as 'https' followed by the
      piece after its first 'https' (split('https')[1]); when False the href
      is prefixed with 'https://criticalrole.fandom.com'.

  Returns:
    A list with one entry per (row, episode) pair:
    {'episode': <int>, 'description': [{'text': <str>, 'links': <list or 'empty'>}, ...]}.
    Episodes found in the same row share the same 'description' list object.

  Raises:
    ValueError: if `campaign_n` is neither 0 nor 1 (previously an
      UnboundLocalError on `id_tl`).
  """
  def resolve_link(a_tag):
    # absolute URL for the href of an <a> tag (see `web_archive` above)
    if web_archive:
      return 'https' + a_tag['href'].split('https')[1]
    return 'https://criticalrole.fandom.com' + a_tag['href']

  def append_piece(acc, piece):
    # put a blank between acc and piece unless acc is empty or already ends
    # with one; raises TypeError when piece is None, like `acc += None` did
    if acc == '' or acc[-1] == ' ':
      return acc + piece
    return acc + ' ' + piece

  # validate the campaign before touching the network
  if campaign_n == 0:
    id_tl = 'Campaign_One:_Vox_Machina'
  elif campaign_n == 1:
    id_tl = 'Campaign_Two:_The_Mighty_Nein'
  else:
    raise ValueError(f'unsupported campaign_n: {campaign_n!r} (expected 0 or 1)')

  events_TL = []

  # get HTML from HTTP response and build the BeautifulSoup object
  soup = get_soup(HTTPS_LINK_TIMELINE)

  # element carrying the id of the selected campaign section
  span_TL = soup.find_all(id= id_tl)[0]

  # reach the table: the number of siblings to skip after the heading differs
  # between the two campaign sections
  h3_TL = span_TL.parent
  if campaign_n == 0:
    div_table_TL = next(next(h3_TL.next_siblings).next_siblings)
  else:
    div_table_TL = next(h3_TL.next_siblings)
  tbody_TL = next(next(div_table_TL.children).children)

  # the first row is dropped (presumably the table header)
  rows_table = list(tbody_TL.children)[1:]

  # loop over rows
  for i,row_table in enumerate(rows_table):

    # retrieve the 2 columns
    row_table_col1 = next(row_table.children)
    row_table_col2 = row_table_col1.next_sibling

    # ---------------------------------- get episode from 1st column ------------------------------
    try:
      p_col1 = list(row_table_col1.children)[2]
    except IndexError:
      # shorter cell: fall back to its second child
      p_col1 = list(row_table_col1.children)[1]
    elem = p_col1

    eps_tmp = []
    # reset for every row, so a row without episode marker can neither reuse
    # the arc of the previous row nor hit an unbound name on the first row
    arc_tmp = None
    while (not(check_tag(elem,'td'))):

      elem = elem.next_element
      if elem is None:
        # end of the document reached without meeting a <td>
        break
      if is_formatEp(elem):
        ep_tmp, arc_tmp, _ = extract_ep(elem.string, verbose)
        if not (ep_tmp in eps_tmp):
          eps_tmp.append(ep_tmp)

    # no episode marker in this row: there is nothing to attach events to
    if arc_tmp is None:
      continue

    # skip if this row describe episode of a previous arc
    if arc_n +1 > arc_tmp:
      continue

    # ----------------------------------get events from the 2nd column -----------------------------

    # build the list for the events info
    events_info = []

    # first couple of variables for event not in a list
    links = []
    text_notlist = ''

    # ----------- first part event extraction,  look for string element or <a> marker --------------

    for elem in list(row_table_col2.children):

      # list element are handled in the below part of this code, here just html text and <a>
      if check_tag(elem,'ul'):
        continue

      try:
        text_notlist = append_piece(text_notlist, elem.string)

        if type(elem)!= bs4.element.NavigableString:
          if elem.name == 'a':
            links.append(resolve_link(elem))
      except Exception:
        # special cases (e.g. `.string` is None for a tag with several
        # children): read the direct children of the element instead.
        # Kept broad on purpose, this is a best-effort fallback.
        for b_elem in list(elem.children):
          text_notlist = append_piece(text_notlist, b_elem.string)

          if type(b_elem)!= bs4.element.NavigableString:
            if b_elem.name == 'a':
              links.append(resolve_link(b_elem))

    if links == []: links = 'empty'
    if text_notlist != '':
      if verbose: print(f" text from the non-list tag -->   {text_notlist}")
      if verbose: print(f"links from the non-list tag -->   {links}")
      event_info = {'text':text_notlist, 'links':links}
      events_info.append(event_info)

    # ----------- second part event extraction, look for a possible list in the row: <ul> tag ------

    # restart from the first child of <td> since the list can be both first or
    # next child; only the first <ul> of the cell is considered
    ul_event = None
    for elem in list(row_table_col2.children):
      if check_tag(elem,'ul'):
        ul_event = elem
        break

    if ul_event is not None:
      # scan the list
      for elem_list in list(ul_event.children):
        # initialize the info variables
        text = ''
        links = []
        for elem in list(elem_list.children):

          # a list inside another list: its items are stored by insertInfoList,
          # prefixed with the text collected so far for this item
          # NOTE(review): `web_archive` and `verbose` are not forwarded;
          # insertInfoList reads `web_archive` from the module namespace and
          # prints unconditionally.
          if check_tag(elem,'ul'):
            insertInfoList(elem, events_info, text)
            text = ''
            continue

          text = append_piece(text, elem.string)
          if elem.name == 'a':
            links.append(resolve_link(elem))

        if links == []: links = 'empty'

        if text != '':
          if verbose: print(f" text from the list tag -->   {text}")
          if verbose: print(f"links from the list tag -->   {links}")
          event_info = {'text':text, 'links':links}
          events_info.append(event_info)

    if verbose: print(f'\n-------{i+1}-------\n')

    # exit if this row describe episode of a next arc
    if arc_n+1 < arc_tmp:
      return events_TL

    # otherwise we are extracting from the right arc, save data
    for ep in eps_tmp:
      event_TL = {}
      event_TL['episode'] = ep

      event_TL['description'] = events_info
      events_TL.append(event_TL)

  return events_TL

In [None]:
# Run the timeline extraction for the selected campaign/arc, without per-row
# printing and with hrefs resolved in web-archive mode.
events_TL = extract_eventTL(verbose = False, web_archive = True)

# Save the extracted events in the drive folder as tl_events_arc<arc_n+1>,
# without triggering a download.
saveJSON(events_TL, PATH_GT_E, 'tl_events_arc' + str(arc_n+1), download = False)

## Utilities extraction

In [None]:
# One ground-truth character dictionary per arc listed in arc_n.
# NOTE(review): arc_n is iterated here, while earlier cells use it as an
# integer (arc_n+1); confirm it is re-bound to a collection of arc indices
# before this cell runs.
character_dictionary_list = []
for arc in arc_n:
  character_dictionary_list.append(loadJSON(PATH_GT2_C, f'gt_characters_arc{arc + 1}'))

# NPC data; the empty dictionary is replaced right away by the loaded value.
npc_dictionary = {}
npc_dictionary = loadJSON(PATH_GT2_R, 'all_NPCs')

""" 
      look for char item/npc
"""
def get_field_from_char_item(field, item):
  """Return `item[field]`, or None when `item` is falsy (None, empty dict, ...).

  A missing `field` in a non-empty item still raises KeyError.
  """
  return item[field] if item else None

def look_for_char_item(token, ep_num, character_dictionary, debug = False, no_aka=False, no_vox = True):
  """Find the character of episode `ep_num` whose name or alias matches `token`.

  `token` is lower-cased and discarded when it is in the module-level
  `stopWords`. Every entry of `character_dictionary[ep_num]` is then compared
  with fuzz.token_set_ratio: first on its 'name', then (unless `no_aka`) on
  each of its 'aka' aliases. A score above 99 counts as a match and the first
  matching entry is returned.

  Args:
    token: candidate character name.
    ep_num: key of the episode inside `character_dictionary`.
    character_dictionary: mapping episode -> list of character dictionaries
      (keys used here: 'name', 'aka').
    debug: print trace messages.
    no_aka: when True only the 'name' field is compared.
    no_vox: when True the party entries (aka equal to ["Vox Machina"] or
      ["The Mighty Nein"]) are skipped altogether.

  Returns:
    The matching character dictionary, or None.
  """
  if debug : print("[0.1.1] look_for_char_item ... for ", token)
  token = token.lower()
  if token in stopWords:
    if debug: print("[0.1.1.1]stopword ", token)
    return None
  for item in character_dictionary[ep_num]:
    aka = item['aka']
    is_party_entry = aka == ["Vox Machina"] or aka == ["The Mighty Nein"]
    if no_vox and is_party_entry:
      if debug and aka == ["Vox Machina"]: print("[0.1.1.2] skippink vox machina")
      continue
    name = item['name'].lower()
    match_val = fuzz.token_set_ratio(token,name )
    # The party entry is excluded from name matching: its 'name' lists all the
    # party members, so any member name would match it before its own entry.
    if match_val>99 and not is_party_entry:
      if debug : print(f"[0.1.1.a] look_for_char_item ... match_val = {match_val} for  ",token, item)
      return item
    if no_aka:
      # Name-only search: go on with the next character. The original
      # returned None here, so only the first entry was ever examined.
      continue
    # 'empty' is the placeholder used when there is no alias; iterating the
    # string would compare the token with its single characters
    aliases = [] if aka == 'empty' else aka
    for alias in aliases:
      aka_match_val = fuzz.token_set_ratio(token,alias )
      if aka_match_val>99:
        if debug : print(f"[0.1.1.b] look_for_char_item ...match_val = {aka_match_val} for",token, alias, item)
        return item
  return None

def look_for_npc_item(token, link, debug= False):
  """Find the NPC whose name or alias matches `token`.

  The lower-cased token is discarded when it is in the module-level
  `stopWords`; otherwise every item of the module-level `npc_dictionary` is
  compared with fuzz.token_set_ratio, first on 'name' and then on each 'aka'
  entry. A score above 99 is a match and the first matching item is returned.

  Args:
    token: candidate NPC name.
    link: URL of the NPC page; accepted but not used by the lookup.
    debug: print trace messages.

  Returns:
    The matching item, or None.
  """
  needle = token.lower()
  if needle in stopWords:
    return None

  for item in npc_dictionary:
    score = fuzz.token_set_ratio(needle, item['name'])
    if score > 99:
      if debug : print(f"[0.1.2.a] look_for_char_item ... match_val = {score} for  ",needle, item)
      return item

    for alias in item['aka']:
      alias_score = fuzz.token_set_ratio(needle, alias)
      if alias_score > 99:
        if debug : print(f"[0.1.2.b] look_for_char_item ...match_val = {alias_score} for",needle, alias, item)
        return item

  return None

def look_for_char_by_link(roles_array, span_link,span=None, debug=False):
  """Find a character or NPC from the wiki link attached to a text span.

  The lookup runs in two passes over `roles_array` and the module-level
  `allNPCs_array`; in both, item[0] is the entity dictionary:
    1. direct comparison of the absolute URL with item[0]['link-u']
       (characters) or item[0]['link'] (NPCs);
    2. if nothing matched, the linked page is fetched with get_soup() and its
       rel="canonical" element is compared with the one found in item[1]
       (presumably the already parsed page of the entity; the string 'empty'
       marks a missing page). This catches links that redirect to the page.
  The party entries (aka ["Vox Machina"] / ["The Mighty Nein"]) are skipped.

  Args:
    roles_array: sequence of (character dictionary, page) pairs.
    span_link: site-relative link; 'https://criticalrole.fandom.com' is prepended.
    span: text of the span, used only in debug messages.
    debug: print trace messages.

  Returns:
    (entity dictionary, "char" | "npc") on success, (None, "") otherwise,
    including when the second pass fails for any reason (network, parsing).
  """
  span_link = "https://criticalrole.fandom.com" + span_link
  if debug : print("[0.1.2] look_for_char_by_link ... for ", span , span_link)

  # ---- pass 1: plain URL comparison
  for item in roles_array:
    if item[0]['aka'] == ["Vox Machina"]:
      continue
    if item[0]['aka'] == ["The Mighty Nein"]:
      continue
    if item[0]['link-u']==span_link:
      if debug :  print("[0.1.2.1] look_for_char_by_link ...BY LINK for \n" ,span, item[0])
      return item[0] ,"char"
  for item in allNPCs_array:
    if item[0]['aka'] == ["Vox Machina"]:
      continue
    if item[0]['link']==span_link:
      if debug : print("[0.1.2.2] look_for_char_by_link ...BY LINK for \n" ,span, item[0])
      return item[0] ,"npc"

  # ---- pass 2: compare canonical links of the pages
  try:
    # looked up once instead of once per candidate
    target_canonical = get_soup(span_link).find(rel="canonical")
    for kind, candidates in (("char", roles_array), ("npc", allNPCs_array)):
      for item in candidates:
        if item[0]['aka'] == ["Vox Machina"] or item[0]['aka'] == ["The Mighty Nein"]:
          continue
        page = item[1]
        if page == 'empty' : continue
        if page.find(rel="canonical") == target_canonical:
          return item[0], kind
    return None, ""
  except Exception as error:
    # best effort: a failed fetch/parse just means "not found". Exception
    # (not a bare except) so that KeyboardInterrupt can still stop the run.
    if debug : print("[0.1.2.3] look_for_char_by_link ... canonical lookup failed:", error)
    return None, ""


"""
        Cleans a token by removing punctuation and similar noise.
"""
def clean_token(token):
    """Strip leading/trailing blanks, backticks and the marks . , : ; ! ? from `token`.

    Characters inside the token are left untouched.
    """
    noise_chars = " `.,:;!?"
    return token.strip(noise_chars)

def check_token_in_link_span(idx, token, links, current_link, current_link_start, current_link_end):
    """Update the "current link" state for the token located at position `idx`.

    `links` maps (start, end) spans to hrefs and is scanned in iteration
    order; a span contains the token when start <= idx < end. When the
    containing span differs from (current_link_start, current_link_end) the
    state is moved to that span and `in_span` becomes True. Every span visited
    before the match (or all of them, when none matches) resets the state to
    None.

    Args:
        idx: position of the token in the text.
        token: the token itself; not used by the check.
        links: dictionary {(start, end): href}.
        current_link: href of the link tracked so far, or None.
        current_link_start: start of the tracked span, or None.
        current_link_end: end of the tracked span, or None.

    Returns:
        (current_link, current_link_start, current_link_end, in_span).
    """
    in_span = False
    for start, end in links:
        if start <= idx < end:
            if current_link_start != start or current_link_end != end:
                current_link_start = start
                current_link_end = end
                current_link = links[(start, end)]
                in_span = True
            break
        else:
            # NOTE(review): this reset also runs for spans visited before the
            # matching one, so a token still inside the tracked link is
            # reported as a new span unless that link is the first key of
            # `links`. A for/else may have been intended; confirm before
            # relying on `in_span`.
            current_link = None
            current_link_start = None
            current_link_end = None
    # third element is the span end (the original returned current_link twice)
    return current_link, current_link_start, current_link_end, in_span

def get_links_in_text(element):
    """Map the position of every direct <a> child of `element` to its href.

    Positions are character offsets inside `element.get_text()`: each link
    text is searched from the end of the previous link onwards, and the
    half-open span (start, end) of the hit becomes the dictionary key.

    Args:
        element: HTML element whose direct children are scanned.

    Returns:
        Dictionary {(start, end): href}, in document order.

    Raises:
        KeyError: if an <a> child has no 'href' attribute.
        ValueError: if a link text cannot be found after the previous link.
    """
    links = {}
    # the text of the element does not change while scanning: compute it once
    full_text = element.get_text()
    current_position = 0
    for child in element.children:
        if child.name != "a":
            continue
        href = child['href']
        link_text = child.get_text()

        # NOTE(review): the search starts right after the previous link, so an
        # identical unlinked word sitting before this link would be picked
        # instead of the link itself.
        start = full_text.index(link_text, current_position)
        end = start + len(link_text)
        links[(start, end)] = href
        current_position = end

    return links


def get_recap_text(p_elements):
    """Concatenate the text of the <i> elements and collect their links.

    Elements whose name is not 'i' are ignored. The links of every <i>
    element are obtained with get_links_in_text() and their spans are shifted
    by the length of the text accumulated so far, so that they index into the
    returned string.

    Args:
        p_elements: iterable of HTML elements.

    Returns:
        (recap_text, links) where links is {(start, end): href}.
    """
    pieces = []
    links = {}
    offset = 0
    for element in p_elements:
        if element.name != "i":
            continue
        piece = element.get_text()
        for (start, end), href in get_links_in_text(element).items():
            links[(offset + start, offset + end)] = href
        offset += len(piece)
        pieces.append(piece)
    return "".join(pieces), links

 ## Recaps

### Utilities for Json Formatting

In [None]:
def check_span_overlap(start, end , span):
  """Tell whether the span (start, end) overlaps `span`.

  With s0, s1 = span, the answer is True when the first span begins before
  s1 (start <= s0, or s0 < start < s1) and ends after s0 (s0 < end <= s1, or
  end > s1). Spans that merely touch (end == s0 or start == s1) do not
  overlap.

  Cases covered:
    s---s0---e---s1    first span starts before and ends inside
    s---s0---s1---e    first span contains the second
    s0---s---e---s1    first span is contained in the second
    s0---s---s1---e    first span starts inside and ends after
  """
  s0, s1 = span[0], span[1]
  begins_before_span_end = start <= s0 or s0 < start < s1
  ends_after_span_start = s0 < end <= s1 or end > s1
  return begins_before_span_end and ends_after_span_start

In [None]:
def chunk_len(chunk):
  """Return the character length of a chunk, blanks included.

  `chunk.leaves()` must yield (token, tag) pairs. The result is the sum of
  the token lengths; when the chunk has more than one leaf, one extra
  character per leaf is added for the blanks, which also accounts for a
  blank after the last word.
  """
  leaves = chunk.leaves()
  total = sum(len(word) for word, _tag in leaves)
  if len(leaves) > 1:
    # one blank per word, trailing one included
    total += len(leaves)
  return total

In [None]:
# Load the pipeline selected by SIZE_IDX, i.e. the same entry of
# PIPELINES_TYPE that the Environment cell downloads, instead of a
# hard-coded index that could drift from it.
nlp = spacy.load(PIPELINES_TYPE[SIZE_IDX])

# Collect the named entities found by spaCy that are not already covered by a link
def spacy_matcher(text, links, character_dictionary, debug = True):
    """Extract the named-entity spans of `text` that do not overlap any link.

    The text is parsed with the module-level `nlp` pipeline and scanned token
    by token. The offsets of a token are found with text.index() on its
    `text_with_ws`, so `end` includes the trailing whitespace. Consecutive
    entity tokens are merged into one span; a stop word (module-level
    `stopWords`) followed by an entity token is absorbed into the pending
    span (e.g. the 'of' in 'Lady Kima of Vord'). A pending span is stored
    when a stop word not followed by an entity token is met.

    Args:
        text: the text to analyse.
        links: dictionary {(start, end): href}; entity tokens overlapping one
            of these spans are ignored and drop the pending span.
        character_dictionary: accepted but not used by this function.
        debug: print a detailed trace (enabled by default).

    Returns:
        (named_entities, doc): named_entities is a list of
        ((span_start, span_end), span_text) and doc is the spaCy Doc.
    """
    doc = nlp(text)

    # position from which the next token is searched in `text`
    idx = 0
    named_entities =[]
    if debug: print("[1.2.1]spacy_matcher... Iterate over tokens in the doc ...")
    # end offset of the last token added to the pending span (100 is just the
    # initial value used before any entity token is seen)
    prev_end=100
    span=""
    for token in doc:
        if debug: print("[1.2.1.DEBUG]spacy_matcher... TOKEN:" , token, "token.ent_type_: ", token.ent_type_)
        start = text.index(token.text_with_ws, idx)
        end = start + len(token.text_with_ws)
        if debug: print(f"[1.2.1.a]debug start, end {(start, end)}")

        if token.ent_type_ != "":
            # ---- the token is part of a named entity
            if debug: print("[1.2.1.DEBUG]spacy_matcher...is entity:  START,END " , (start, end))

            overlap=False
            # entities already covered by a link are left to the link handling
            for link_pos in links.keys():
                if check_span_overlap(start, end, link_pos) == True:
                    if debug: print(f"[1.2.1.DEBUG]spacy_matcher... Entity {token.text} at {(start, end)} is in span {link_pos} with link {links[link_pos]}")
                    overlap= True
                    span =""
                    break
            if debug: print("[1.2.1.DEBUG]spacy_matcher... overlap == ", overlap)
            if not overlap:
                if debug: print(f"[1.2.1.DEBUG]spacy_matcher... prev_end {prev_end},start {start}, span {span}" )
                if prev_end == start:
                    # contiguous with the previous entity token: extend the span
                    old_span = span
                    span+=token.text
                    span += " "
                    if debug: print(f"[1.2.1.DEBUG]spacy_matcher... adding {token.text} to existing span{old_span} >> {span}")
                else:
                    span = token.text
                    span += " "
                    if debug: print(f"[1.2.1.DEBUG]spacy_matcher...starting new span : {span}, with indexes {(start, end)}")
            # NOTE(review): span_start is overwritten by every entity token, so
            # for a multi-token span it ends up being the start of its LAST
            # token, not of the first one. Kept as is because consumers of the
            # positions are not visible here; confirm the intended value.
            span_start = start
            prev_end = end
            if debug: print(f"[1.2.1.b]debug start, end {(start, end)}")
        else:
            # ---- not an entity token: only stop words affect a pending span
            if span!= "" and token.text in stopWords:
                if debug: print(f"[1.2.1.c]debug start, end {(start, end)}")
                # Look ahead: a stop word is absorbed only when an entity token
                # follows. The bound check avoids an IndexError when the stop
                # word is the last token of the doc.
                next_is_entity = token.i + 1 < len(doc) and doc[token.i + 1].ent_type_ != ""
                if next_is_entity:
                    span += token.text + " "
                    span_start = start
                    prev_end = end
                    if debug: print(f"[1.2.1.d]debug start, end {(start, end)}")
                else:
                    # close the pending span (dropping its trailing blank)
                    span = span[:-1]
                    # NOTE(review): named_entities holds tuples, so this
                    # membership test on the bare string is always True and
                    # repeated mentions are all kept.
                    if span not in named_entities:
                        if debug: print(f"[1.2.1]debug span_start, prev_end {(span_start, prev_end)}")
                        if debug: print(f"[1.2.1]spacy_matcher... add span:[{span}] to list of named entities , with indexes {(span_start, prev_end)}" )
                        span_end = prev_end
                        named_entities.append(((span_start,span_end),span))
                        span = ""

        # next search starts after this token
        idx = end

    # NOTE(review): a span still pending here (no stop word met after it) is
    # not added to named_entities.
    return named_entities, doc

    

In [None]:

# Tokenize text
def ner_and_fuzzy(text, links, ep_num, ep_list, char_links,character_dictionary, debug  =True):
  """Match the named entities of `text` against the known characters.

  Entities come from spacy_matcher(); those that are stop words (module-level
  `stopWords`, compared in lower case) are dropped. For every other entity
  the episodes of `ep_list` are tried in order and, inside an episode, the
  character 'name' fields are compared first and the 'aka' aliases second,
  with fuzz.token_set_ratio; a score above 99 is a match. The search for an
  entity stops at its first match. Entries whose aka is ["Vox Machina"] are
  never considered.

  Args:
    text: the text to analyse.
    links: dictionary {(start, end): href} forwarded to spacy_matcher.
    ep_num: accepted but not used by this function.
    ep_list: episode keys of `character_dictionary` to search, in order.
    char_links: only printed when `debug` is True.
    character_dictionary: mapping episode -> list of character dictionaries.
    debug: print trace messages (enabled by default).

  Returns:
    List of (pos, (entity, matched, score)) where `matched` is the whole
    character dictionary for a name match and the character name for an
    alias match.
  """
  named_entities , doc = spacy_matcher(text, links, character_dictionary, debug)
  if debug: print("[1.2]named_enities: ", named_entities)

  matches = []
  for pos, entity in named_entities:
    if entity.lower() in stopWords:
      continue

    for key in ep_list:
      hit = None

      # first pass on this episode: character names
      for item in character_dictionary[key]:
        if item['aka'] == ["Vox Machina"]:
          continue
        name_match = fuzz.token_set_ratio(entity, item['name'])
        if name_match >99 :
          hit = (entity,item ,name_match)
          matches.append((pos,hit))
          if debug: print("[1.3]fuzz... ",entity , "match with name :", hit , "at dictionary ep item :" , key)
          break

      # second pass on this episode: aliases, only if no name matched
      if hit is None:
        for item in character_dictionary[key]:
          if item['aka'] == ["Vox Machina"]:
            continue
          if item['aka']:
            aka_match = max(fuzz.token_set_ratio(entity, aka) for aka in item['aka'])
          else:
            aka_match = 0
          if aka_match > 99:
            if debug: print("[1.3]fuzz...", entity, "match with aka ", (entity,item, aka_match), "at dictionary ep item :" , key)
            # NOTE(review): here the matched element is the character name,
            # while a name match stores the whole dictionary.
            hit = (entity,item['name'], aka_match)
            matches.append((pos,hit))
            break

      # first match wins: later episodes are not looked at
      if hit is not None:
        break

  if debug: print("[1] matches : ", matches)
  if debug: print("[1] char_links: ", char_links)
  return matches

### Utilities for Recaps Extraction

In [None]:
def chek_more_spans(tokens, links, recap, span_link, debug= False):
  """Register in `links` every further occurrence of `tokens` inside `recap`.

  Each token is stripped of square brackets and blanks and searched
  literally (re.escape) in `recap`; an occurrence that does not overlap any
  span already in `links` (check_span_overlap) is added as
  links[(start, end)] = span_link. Tokens that are empty after stripping
  are ignored.

  Args:
    tokens: names/aliases to look for.
    links: dictionary {(start, end): href}, updated in place.
    recap: the text to search.
    span_link: href assigned to the new spans.
    debug: print trace messages.

  Returns:
    The same `links` dictionary, updated.
  """
  for token in tokens:
    if debug: print("[0.1.3]chek_more_spans ... token", token)

    token_ = token.strip("[] ")
    if not token_:
      # an empty pattern matches at every position of the text and would
      # register zero-length spans all over the recap
      continue

    # literal search: the token is escaped before being compiled
    pattern = re.compile(re.escape(token_))

    for match in pattern.finditer(recap):
        start, end = match.span()
        overlap=False
        for k1, k2 in links.keys():
          overlap = check_span_overlap(k1 ,k2 , (start,end))
          if overlap == True:
            if debug: print("[0.1.3]chek_more_spans ...  overlap" , (k1 ,k2 , (start,end)), "\n  token , recap[start:end]", token , recap[start:end] )
            break
        if overlap==False:
            if debug: print("[0.1.3]chek_more_spans ... ADDING IN PLACE OF ", recap[start:end] ,(start, end), span_link)
            links[(start, end)] =  span_link

  return links

In [None]:
def extends_links(roles_array, span_list, ep_num , text , ep_list, links,  character_dictionary, debug = False,):
    """
    Propagate the links of already-linked spans to their other mentions in `text`.

    For every (span, span_link) pair the matching character item is looked up,
    first with look_for_char_item() and, if that finds nothing, with
    look_for_char_by_link(). When an item is found, its 'name' and (unless the
    field is the "empty" placeholder) its 'aka' entries become the tokens to
    search; otherwise the span text itself is searched, so that non-character
    entities are also extended and are not later mistaken for a character
    with a similar alias. The tokens are handed to chek_more_spans(), which
    adds every non-overlapping occurrence to `links`.

    Parameters:
        roles_array: forwarded to look_for_char_by_link().
        span_list (list): (span_text, link) pairs.
        ep_num: episode identifier forwarded to look_for_char_item().
        text (str): the text in which further mentions are searched.
        ep_list: not used by this function.
        links (dict): (start, end) -> link; updated in place.
        character_dictionary: forwarded to look_for_char_item().
        debug (bool): print tracing information.

    Returns:
        tuple: (links, char_links).
        NOTE(review): `char_links` starts as an alias of `links` and is only
        ever reassigned to the result of chek_more_spans(..., links, ...), which
        in this file returns the dict it was given; both returned values are
        therefore the same object. Confirm whether a separate character-only
        dict was intended.
    """
    if debug: print("[0.1]extends_links... span_list",span_list)
    char_links = links

    for span, span_link in span_list:
        # First try the per-episode dictionary, then fall back to the link.
        char_item = look_for_char_item(span, ep_num, character_dictionary, debug = debug)
        if not char_item:
            char_item, _ = look_for_char_by_link(roles_array, span_link, span, debug = debug)

        if char_item:
            name = get_field_from_char_item('name', char_item)
            tokens = [name]
            char_tokens = [name]
            aka_tok = get_field_from_char_item('aka', char_item)
            if aka_tok != "empty":
                tokens.extend(aka_tok)
                char_tokens.extend(aka_tok)
        else:
            # Not a character (e.g. a place): still extend the span itself.
            tokens = [span]
            char_tokens = []

        if debug: print("[0.1.2]extends_links...tokens ", tokens, "\n [0.1.2]extends_links... calling chek_more_spans")
        if tokens:
            links = chek_more_spans(tokens, links, text, span_link)
            char_links = chek_more_spans(char_tokens, links, text, span_link, debug)

    return links , char_links


In [None]:

def extend_spans_in_text(roles_array, text, links, ep_num , ep_list,  character_dictionary, debug = True):
    """
    Collect the linked spans of `text` and extend their links to other mentions.

    The text is tokenized with nltk.word_tokenize() and every token is passed
    through clean_token(). check_token_in_link_span() decides whether the
    token falls inside one of the spans in `links`; consecutive in-span tokens
    are joined (each preceded by a space) into one span string. When a span
    ends, the (span, link) pair is added to a list without duplicates, and the
    list is finally handed to extends_links().

    A span that is still open when the tokens run out is also recorded;
    previously such a trailing span was silently dropped.

    NOTE(review): the running character index advances by len(token) + 1 per
    token, i.e. it assumes exactly one separator character after each token.
    Confirm that this matches how check_token_in_link_span() interprets it.

    Parameters:
        roles_array: forwarded to extends_links().
        text (str): the text to analyse.
        links (dict): (start, end) -> link for the hyperlinks found in `text`.
        ep_num: episode identifier, forwarded to extends_links().
        ep_list: forwarded to extends_links().
        character_dictionary: forwarded to extends_links().
        debug (bool): print tracing information (enabled by default).

    Returns:
        tuple: the (links, char_links) pair returned by extends_links().
    """
    idx = 0
    current_link = None
    current_link_start = None
    current_link_end = None
    prev_in_span = False
    span = ""
    span_link = ""
    span_list = []

    for token in nltk.word_tokenize(text):
        token = clean_token(token)
        if not prev_in_span:
            span = ""
        if token == "":
            # Nothing left after cleaning: only account for the separator.
            idx += 1
            continue

        current_link, current_link_start, current_link_end, in_span = check_token_in_link_span(
            idx, token, links, current_link, current_link_start, current_link_end)
        idx += len(token) + 1  # +1 to account for the space between tokens

        if in_span:
            span += " " + token
            prev_in_span = True
            span_link = current_link
            continue

        if prev_in_span:
            # The span has ended with the previous token.
            if (span, span_link) not in span_list:
                span_list.append((span, span_link))
            prev_in_span = False

    # Flush a span that reaches the end of the text.
    if prev_in_span and (span, span_link) not in span_list:
        span_list.append((span, span_link))

    if debug: print("[0.1] calling extends_links eith span_list", span_list)
    links , char_links = extends_links(roles_array, span_list, ep_num , text , ep_list, links,  character_dictionary, debug = debug)
    if debug: print("[0.2]links", links)
    return links  , char_links



### Json Formatting

In [None]:


def build_json(roles_array,character_dictionary,ep_num, text, matches, char_links, debug=False):
    """
    Build the sentence/element JSON structure for `text`, merging the tokens
    that belong to a recognised character mention into a single element.

    The text is processed with the module-level `nlp` pipeline and split into
    sentences. Each token is located in `text` through its `text_with_ws`
    form and compared, via check_span_overlap(), with:
      1. the spans in `matches`; when the matched item is a dict, the element
         takes the item's 'name', 'role' and first entry of 'links';
      2. the spans in `char_links`; the link is resolved with
         look_for_char_by_link() and the element takes the item's fields.
    While a span is open (SPAN is True) the following tokens do not create
    new elements; their indices are appended to the open element's 'id' list.
    Elements that are still unlabelled when they are appended get a last
    lookup with look_for_char_item(..., no_aka=True).
    A final pass relabels unlabelled elements whose token was recorded in
    `overall_substitutions`, i.e. a non-stop-word token whose
    fuzz.token_set_ratio against the resolved name exceeded 99.

    Parameters:
        roles_array: forwarded to look_for_char_by_link().
        character_dictionary: forwarded to look_for_char_item().
        ep_num: episode identifier, forwarded to look_for_char_item().
        text (str): the text to convert.
        matches: iterable of (span, (entity_text, item, score)) entries; see
            the inline description inside the loop below.
        char_links (dict): (start, end) span -> link.
        debug (bool): print tracing information.

    Returns:
        dict: {'sentences': [{'id': int, 'elements': [element, ...]}, ...]}
        where each element has the keys 'pos', 'id', 'token', 'label',
        'role' and 'link'.

    NOTE(review): relies on the globals `nlp`, `stopWords` and `fuzz`;
    text.index() raises ValueError if a token cannot be found from the
    current offset.
    """
    # Initialize the json object
    json_obj = {'sentences': []}
       
    doc = nlp(text) 
    # Character offset in `text` from which the next token is searched
    spacy_idx = 0
    # token text -> element whose fields are reused for unlabelled occurrences
    overall_substitutions= {}
    for i,sent in enumerate(doc.sents):
      # print(sent)
      # Initialize the sentence json object
      sentence_obj = {'id': i, 'elements': []}
      SPAN=False       # True while the current token continues an open entity span
      ADD_TOKEN=True   # True when element_obj must be appended for this token
      sub_id = 0       # NOTE(review): incremented below but never read
      prev_link = ""
      offset_sent = 0 # difference between the length of the item name and the token span it replaces
      
      for j,token_ in enumerate(sent):
        
        token=str(token_)

        #debug
        # debug = False
        # if token=='Tiberius': debug = True



        # if debug: print("[0] : ADD_TOKEN ", ADD_TOKEN, "SPAN", SPAN)
        # Locate the token in the original text to obtain its character span
        if debug:  print("[1.2.1.DEBUG.a]spacy_matcher... TOKEN:" , token)
        start = text.index(token_.text_with_ws, spacy_idx)
        end = start + len(token_.text_with_ws)
        if debug:  print(f"[1.2.1.DEBUG.b] spacy: token {token} ,  text[{start}:{end}]: {text[start:end]}" )
        spacy_idx=end
        if SPAN == False: 
          # we initialize the object because there is not a span going on
          ADD_TOKEN = True
          element_obj = {'pos': (start,end),'id': [j+offset_sent], 'token': token, 'label': '', 'role': [], 'link': ''}
        else:
          # the token continues the open span: keep working on the same element_obj
          ADD_TOKEN = False
          sub_id +=1

        # if debug:  print("[1] : ADD_TOKEN ", ADD_TOKEN, "SPAN", SPAN)

        # Check if the word's index is in the matches list
        CYCLE_FOUND = False
        for match_ in matches:
            # The bare string below is a no-op statement describing `match_`.
            """
            0 A tuple of two integers representing the start and end index of the match in the text
            1 A tuple containing the following:
              1-0 A string representing the text of the match
              1-1 A dictionary containing various information about the match, such as:
                The name of the match
                The type of the match (e.g. "Mentioned")
                A list of links related to the match
                Extra information (e.g. "empty")
                A list of alternate names for the match
                A list of roles associated with the match
                A link-u URL
              1-2 An integer representing the match's score
            """
            if check_span_overlap(start, end , match_[0]):
                if debug:  print("[1.a] : overlap in MATCHES , match: ", match_)
                SPAN = True
                CYCLE_FOUND = True
                if debug:  print("[1.b] : ADD_TOKEN ", ADD_TOKEN, "SPAN", SPAN)
                # print(" match_[1][1]",  match_[1][1])
                if isinstance(match_[1][1], dict):
                  element_obj['token'] = match_[1][1]['name']
                  element_obj['role'] = match_[1][1]['role']
                  element_obj['link'] = match_[1][1]['links'][0]
                  element_obj['label'] = 'PERSON'
                  if j not in element_obj['id']: 
                    if debug: print(f"[1.b.ID] adding {j} to {element_obj['id']}")
                    element_obj['id'].append(j)
                  # NOTE(review): the next two conditions are identical; one is redundant
                  if token not in overall_substitutions.keys():
                    if token not in overall_substitutions.keys():
                      if token not in stopWords:
                        if element_obj['link']!='e' and element_obj['link']!="":
                          match_fuz = fuzz.token_set_ratio(token, element_obj['token'])
                          if match_fuz > 99 : 
                            overall_substitutions[token] = element_obj
                    # print(f"overall_substitutions[{token}] : {overall_substitutions[token]}")
            # else: 
        # The bare string below is disabled code kept as a no-op string statement.
        """ if CYCLE_FOUND == False: 
          if SPAN == True:
            SPAN=False
            ADD_TOKEN = True
            prev_link=""
            element_obj = {'pos': (start,end),'id': [j], 'token': token, 'label': '', 'role': [], 'link': ''}

            if debug:  print("[1.c] : ADD_TOKEN ", ADD_TOKEN, "SPAN", SPAN)
        """
                  
        # if debug:  print("[2] : ADD_TOKEN ", ADD_TOKEN, "SPAN", SPAN)
        # Check if the word's index is in the char_links dictionary
        if element_obj['label'] == "":
          # nothing was labelled by the matches loop: let char_links decide
          CYCLE_FOUND = False
        
        for index, link in char_links.items():
            # if debug: print(f"[2.0] start {start}, end {end}, index {index}")
            if check_span_overlap(start, end , index):
                if debug: print(f"[2.1.y]pos_link -> text[index] : {text[index[0]:index[1]]}")
                # print("link: ",link ,"at index" , index)
                CYCLE_FOUND = True
                if debug: print(f"[2.1.z]: ADD_TOKEN:", ADD_TOKEN, "SPAN:", SPAN , "prev_link:",prev_link ,"link", link)
                if (SPAN==True) and  (prev_link != "") and (link != prev_link):
                  # if two consecutive words have different links they are not in the same span:
                  # close the open element and start a new one for this token
                  if debug: print(f"[2.1b] : two different links in a span: {link} != {prev_link}")
                  SPAN =False
                  sentence_obj['elements'].append(element_obj)
                  ADD_TOKEN = True
                  element_obj = {'pos': (start,end),'id': [j], 'token': token, 'label': '', 'role': [], 'link': ''}
                else:
                  #reset the prev link because SPAN might be False at this point 
                  if debug: print(f"[2.1c] : making prev_link = '' again (before was: {prev_link})")
                  prev_link = ""
                if SPAN == False:
                  item, source = look_for_char_by_link(roles_array, link)
                  if item:
                    element_obj['token'] = item['name']

                    # the item's field names depend on where it was found
                    if source == "char": 
                      element_obj['role'] = item['role']
                      element_obj['link'] = item['link-u']
                    else :  
                      element_obj['role'] = item['type']
                      element_obj['link'] = item['link']
                    element_obj['label'] = 'PERSON'
                    if token not in overall_substitutions.keys():
                      if token not in stopWords:
                        if element_obj['link']!='e' and element_obj['link']!="":
                          match_fuz = fuzz.token_set_ratio(token, element_obj['token'])
                          if match_fuz > 99 : 
                            overall_substitutions[token] = element_obj
                      # print(f"overall_substitutions[{token}] : {overall_substitutions[token]}")

                    SPAN= True
                    prev_link=link
                    if debug:  print("[2.1a] : ADD_TOKEN: ", ADD_TOKEN, "SPAN:", SPAN , "prev_link:",prev_link)
                else : # SPAN == True
                # try to add the id of the word being skipped (because it was merged
                # into the entity) to the id list of the element that precedes it
                  if j not in element_obj['id']: 
                      
                      if debug: print(f"[1.b.ID] adding {j} to {element_obj['id']}")
                      element_obj['id'].append(j)
        # else:
        # this should happen AFTER the loop has finished
        if CYCLE_FOUND == False: 
          if SPAN == True:
            # the token overlaps nothing: the open span is over and the token gets its own element
            if debug:  print("[2.1d] NO OVERLAP- putting SPAN to False and ADD_TOKEN to True")
            SPAN=False
            ADD_TOKEN = True
            prev_link=""
            element_obj = {'pos': (start,end),'id': [j], 'token': token, 'label': '', 'role': [], 'link': ''}
                
        # if debug:  print("[3] : ADD_TOKEN ", ADD_TOKEN, "SPAN", SPAN)
        # Add the element to the sentence json object
        if ADD_TOKEN == True: 
          # compare the number of words in the element's token with the number of ids collected
          len_item_name = len(str(element_obj['token']).split())
          len_span_tokens = len(element_obj["id"])
          offset_span  = len_item_name-len_span_tokens
          offset_sent += offset_span
          # adjust the id field so that it has one id per word of the token
          if offset_span<0:
            element_obj["id"] = element_obj["id"][:offset_span]
          elif offset_span > 0:
            # NOTE(review): this loop variable reuses the name `i` of the sentence index
            for  i in range(offset_span):
              element_obj["id"].append(element_obj["id"][-1]+1)
          # if debug:
          #ONLY FOR DEBUG OF TIBERIUS 
          if element_obj['label'] == "":
            # if debug : print(f"[4.a] lookin for a item for the token: {token}")

            # last attempt: look the raw token up among the characters
            item = look_for_char_item(token , ep_num, character_dictionary, debug = False, no_aka = True)
            # print(f"[DEBUG TIBERIUS.2]  item {item}")
            if item:
              if debug: print(f"[4.b] recovered a char for token: [{token}] : {item}")
              element_obj['token'] = item['name']
              element_obj['role'] = item['role']
              element_obj['link'] = item['link-u']           
              element_obj['label'] = 'PERSON'
                    


          sentence_obj['elements'].append(element_obj)
          # print(element_obj)
        
      # Add the sentence to the json object
      json_obj['sentences'].append(sentence_obj)
    # Second pass: relabel unlabelled elements whose token was recorded as a substitution
    # print(f" overall_substitutions.keys() : {overall_substitutions.keys()}")
    for sentence in json_obj['sentences']:
      for element in sentence['elements']:
        if element['label']=="":
          if element['token'] in overall_substitutions.keys():
            # NOTE(review): this print is not guarded by `debug`
            print(f" {element['token']}  -> {overall_substitutions[element['token']]}")
            element_obj = overall_substitutions[element['token']]
            element['token'] = element_obj['token']
            element['role'] = element_obj['role']
            element['link'] = element_obj['link']
            element['label'] = element_obj['label']
    # Return the json object
    return json_obj
   


### Recaps Extraction

In [None]:
def get_recaps_fuzzy(roles_array_list, table, character_dictionary_list,ind_=[0], arc_n={}, campaign = 1, start_from = None):
    """
    Scrape the recap of every episode linked from the selected tables and save
    it as an annotated JSON file.

    For each arc index in `arc_n`, and each table index listed for it, every
    'a' element of that table is processed: the episode number and recap URL
    are derived from the link, the page is fetched, the recap heading and the
    id of the first part are located, and the recap text with its hyperlinks
    is extracted from the elements between the two headings. The links are
    then extended to other mentions (extend_spans_in_text), characters are
    matched (ner_and_fuzzy), the JSON object is built (build_json) and saved
    with saveJSON() under the name '<ep_num>-recap-json'.

    Parameters:
        roles_array_list: per-arc roles arrays, indexed by the keys of `arc_n`.
        table: sequence of parsed tables; `table[i].find_all('a')` yields the
            episode links.
        character_dictionary_list: per-arc character dictionaries, indexed by
            the keys of `arc_n`.
        ind_: unused; kept for backward compatibility.
        arc_n (dict): arc index -> list of indices into `table`.
        campaign (int): 2 saves under RECAP_PATH_C2, anything else under
            RECAP_PATH.
        start_from (int or None): when given, episodes whose number (the last
            two characters of `ep_num`) is lower are skipped.
            NOTE(review): using only the last two characters presumably breaks
            for episode numbers above 99 - confirm the `ep_num` format.

    Returns:
        list: the `ep_num` of every episode skipped because the recap heading
        or the id of the first part could not be found. The list used to be
        collected and then discarded.
    """
    jumped_episodes = []
    for ind in sorted(arc_n.keys()):
      for table_ind in arc_n[ind]:
        for link in tqdm(table[table_ind].find_all('a')):
            roles_array = roles_array_list[ind]
            character_dictionary = character_dictionary_list[ind]
            # Get episode number from the link
            ep_num = get_ep_num(link)
            if start_from is not None and ep_num is not None:
              if int(ep_num[-2:]) < start_from:
                continue
            # Get the full URL for the recap page
            url = get_recap_url(link)
            if not url:
                continue
            # Get the BeautifulSoup object for the URL
            soup = get_soup(url)
            # Get the heading for the recap section
            recap_heading = get_recap_heading(soup)
            if not recap_heading:
                print("Error: Recap heading not found.")
                jumped_episodes.append(ep_num)
                continue
            # Get the id for the first part of the episode
            part_1_id = get_part_1_id(soup, recap_heading)
            if not part_1_id:
                jumped_episodes.append(ep_num)
                continue
            # Get the heading for the first part of the episode
            part_i_heading = get_part_i_heading(soup, part_1_id)
            # Get a list of siblings that come after the recap heading but before the first part heading
            filtered_siblings = get_filtered_siblings(recap_heading, part_i_heading)
            # Get a list of p elements that come between the recap heading and the first part heading
            p_elements = get_p_elements(filtered_siblings[0], part_i_heading)
            # Get the recap text and the hyperlinks found in it
            Recap, links = get_recap_text(p_elements)

            save_path = RECAP_PATH_C2 if campaign == 2 else RECAP_PATH
            ARC_PATH = os.path.join(str(save_path), "arc"+str(ind+1)+".tsv")
            # write_recap_to_tsv(ARC_PATH, Recap, ep_num)  # uncomment to also write the recap to the TSV file

            # Current episode first, then the dictionary keys in sorted order
            # up to (and excluding) the first one already in the list.
            ep_list = [ep_num]
            for k in sorted(character_dictionary.keys()):
              if k in ep_list:
                break
              ep_list.append(k)

            # Build and save the JSON
            extended_spans_links, char_links = extend_spans_in_text(roles_array,Recap, links, ep_num , ep_list , character_dictionary,debug=False)
            matches = ner_and_fuzzy(Recap, extended_spans_links,ep_num , ep_list, char_links,character_dictionary,debug=False)
            json_file = f'{ep_num}-recap-json'
            json_obj = build_json(roles_array,character_dictionary,ep_num, Recap, matches, char_links,debug=False)
            saveJSON(json_obj,json_file, save_path, download = False)
    return jumped_episodes

In [None]:
# Launch the recap extraction

# Campaign 1: one table per arc
# arc_n ={0:[0], 1:[1], 2:[2]}
# Campaign 2: table indices [5,6,7,8,9] (arc 1 -> tables 5,6,7 [ep 1-25]; arc 2 -> tables 8,9 [ep 26-47])
# arc_n ={0:[5,6,7], 1:[8,9]}


"""Decomment here to call the execution of get_recaps_fuzzy """
# get_recaps_fuzzy(roles_array_list, table,character_dictionary_list, ind_=[5,6,7,8,9], arc_n ={0:[5,6,7], 1:[8,9]}  , campaign=2, start_from = 0)

## Summaries

### Utilities for Summaries Extraction

In [None]:
def find_ids(soup_obj, ids=None):
  """
  Collect the 'id' attributes of a tag and of all the tags nested inside it.

  If `soup_obj` is a bs4 Tag, its own id (when present and non-empty) and the
  id of every descendant tag are appended to `ids`, in document order and
  once per tag. Anything that is not a Tag contributes nothing.

  The previous implementation recursed over find_all(), which already returns
  every descendant, so nested tags were visited repeatedly and their ids were
  appended several times; it also used a mutable default argument, so ids
  leaked from one call to the next.

  Parameters:
      soup_obj: the element to inspect.
      ids (list or None): list to append to; a new list is created when
          omitted. A list passed by the caller is extended in place.

  Returns:
      list: `ids` with the ids that were found appended.
  """
  if ids is None:
    ids = []
  if isinstance(soup_obj, Tag):
    if soup_obj.get('id'):
      ids.append(soup_obj['id'])
    # find_all() without arguments yields every descendant tag, so a single
    # flat pass visits each nested tag exactly once.
    for descendant in soup_obj.find_all():
      if descendant.get('id'):
        ids.append(descendant['id'])
  return ids



In [None]:
def get_summary_links(soup, recap_heading):
    """
    List the table-of-contents entries that follow the recap entry.

    The page is checked for a 'span' with id 'Recap' or, failing that,
    'Previously_on_Critical_Role'. The 'a' element whose href is '#' + that id
    is looked up, and the siblings of its parent are walked starting from the
    parent itself. Every 'li' met after the recap entry contributes its
    anchor's href without the first character:
      * ('<title>', 1) if the title is not 'Break', 'Q&A', 'Post-Show' or
        'Afterword', followed by ('<sub-title>', 1) for every 'li' nested
        inside it;
      * ('<title>', 0) otherwise (nested entries are not listed).

    Parameters:
        soup: parsed page (BeautifulSoup object).
        recap_heading: unused; kept for backward compatibility.

    Returns:
        list of (str, int) tuples as described above, or None (after printing
        "No navigation_recap") when the recap entry cannot be located.
    """
    navigation_recap = None
    navigation_recap_check = None
    for section_id in ('Recap', 'Previously_on_Critical_Role'):
        if soup.find('span', {'id': section_id}) is not None:
            navigation_recap_check = '#' + section_id
            navigation_recap = soup.find('a', {'href': navigation_recap_check})
            break

    # Explicit check instead of a bare `except:` around an unbound/None name.
    if navigation_recap is None:
        print("No navigation_recap")
        return None

    skipped_titles = ('Break', 'Q&A', 'Post-Show', 'Afterword')
    summary_links = []
    after_recap = False  # becomes True once the recap entry itself has been seen

    entry = navigation_recap.parent
    while entry:
        if entry.name == 'li':
            if after_recap:
                nav_title = entry.a['href'][1:]
                if nav_title in skipped_titles:
                    summary_links.append((nav_title, 0))
                else:
                    summary_links.append((nav_title, 1))
                    for li_el in entry.find_all('li'):
                        summary_links.append((li_el.a['href'][1:], 1))
            if entry.a['href'] == navigation_recap_check:
                after_recap = True
        entry = entry.next_sibling

    return summary_links

In [None]:
def extract_summary_paragraph(start_heading, end_heading, current_position):
    """
    Gather the paragraph text that follows `start_heading`, up to the section
    identified by `end_heading`.

    The siblings after `start_heading` are visited in order. The walk stops at
    the first Tag that contains an element whose id equals `end_heading`.
    Every 'p' Tag met before that contributes its text, and the links returned
    by get_links_in_text() for it are stored with their offsets shifted by
    the number of characters collected so far.

    Parameters:
        start_heading: element whose following siblings are scanned.
        end_heading (str): id marking the beginning of the next section.
        current_position (int): offset of this section's first character in
            the overall summary.

    Returns:
        tuple: (text, links, current_position) where `links` maps
        (start, end) offsets to an href and `current_position` has been
        advanced by the length of the collected text.
    """
    found_links = {}
    chunks = []
    node = start_heading.next_sibling
    while node:
        if isinstance(node, Tag):
            if len(node.find_all(id=end_heading)) > 0:
                # reached the element that holds the next heading
                break
            if node.name == 'p':
                paragraph = node.get_text()
                chunks.append(paragraph)
                # shift paragraph-relative offsets to summary-relative ones
                for (rel_start, rel_end), href in get_links_in_text(node).items():
                    found_links[(current_position + rel_start, current_position + rel_end)] = href
                current_position += len(paragraph)
        node = node.next_sibling
    return "".join(chunks), found_links, current_position


In [None]:
def extract_summaries(summary_links, result_soup):
    """
    Concatenate the text of every summary section of an episode page.

    For each (title, check) entry with a non-zero `check`, the 'span' whose id
    is `title` is looked up and the paragraphs following its parent are
    collected with extract_summary_paragraph(), up to the next entry's title
    (or 'Featured_Characters' for the last entry). Link offsets are relative
    to the concatenated summary.

    Parameters:
        summary_links (list or None): (title, check) tuples as returned by
            get_summary_links(). None or an empty list yields an empty result
            instead of an error.
        result_soup (bs4.BeautifulSoup): the parsed episode page.

    Returns:
        tuple: (summary, links) where `summary` is the concatenated text and
        `links` maps (start, end) offsets in `summary` to an href.
    """
    summary_parts = []
    summary_links_in_text = {}
    current_position = 0

    # get_summary_links() returns None when the page has no recap entry.
    if not summary_links:
        return "", summary_links_in_text

    last_index = len(summary_links) - 1
    for s, entry in enumerate(summary_links):
        title, check = entry[0], entry[1]
        if check == 0:
            continue

        heading_span = result_soup.find('span', {'id': title})
        if heading_span is None:
            # The listed id has no matching heading in the page: skip the
            # section rather than abort the whole episode.
            print(f"Error: heading '{title}' not found, section skipped.")
            continue
        start_heading = heading_span.parent

        if s < last_index:
            end_heading = summary_links[s + 1][0]
        else:
            end_heading = 'Featured_Characters'

        section_text, links, current_position = extract_summary_paragraph(start_heading, end_heading, current_position)
        summary_parts.append(section_text)
        summary_links_in_text.update(links)

    return "".join(summary_parts), summary_links_in_text


### Summaries Extraction

In [None]:
def get_summaries(roles_array_list,character_dictionary_list,  table, ind_=[0],  arc_n={}, campaign = 1,start_from = None):
  """
  Scrape the summary of every episode linked from the selected tables, append
  it to the arc's TSV file and save it as an annotated JSON file.

  For each arc index in `arc_n`, and each table index listed for it, every
  'a' element of that table is processed: the episode number and page URL are
  derived from the link, the page is fetched, the section list is read with
  get_summary_links() and the text with extract_summaries(). The summary is
  written with write_recap_to_tsv(); its links are then extended
  (extend_spans_in_text), characters are matched (ner_and_fuzzy), the JSON
  object is built (build_json) and saved with saveJSON() under the name
  '<ep_num>-summary-json'.

  Parameters:
      roles_array_list: per-arc roles arrays, indexed by the keys of `arc_n`.
      character_dictionary_list: per-arc character dictionaries, indexed by
          the keys of `arc_n`.
      table: sequence of parsed tables; `table[i].find_all('a')` yields the
          episode links.
      ind_: unused; kept for backward compatibility.
      arc_n (dict): arc index -> list of indices into `table`.
      campaign (int): 2 saves under SUM_PATH_C2, anything else under SUM_PATH.
      start_from (int or None): when given, episodes whose number (the last
          two characters of `ep_num`) is lower are skipped.
          NOTE(review): using only the last two characters presumably breaks
          for episode numbers above 99 - confirm the `ep_num` format.

  Returns:
      list: the `ep_num` of every episode skipped because the recap heading
      or the section list could not be found.
  """
  jumped_episodes = []
  for ind in sorted(arc_n.keys()):
    for table_ind in arc_n[ind]:
      for link in tqdm(table[table_ind].find_all('a')):
        roles_array = roles_array_list[ind]
        character_dictionary = character_dictionary_list[ind]
        # Get episode number from the link
        ep_num = get_ep_num(link)

        if start_from is not None and ep_num is not None:
          if int(ep_num[-2:]) < start_from:
            continue
        # Get the full URL for the episode page
        url = get_recap_url(link)
        if not url:
          continue
        # Get the BeautifulSoup object for the URL
        print(url)
        soup = get_soup(url)
        # Get the heading for the recap section
        recap_heading = get_recap_heading(soup)
        if not recap_heading:
          print("Error: Recap heading not found.")
          jumped_episodes.append(ep_num)
          continue

        # Part specific to summaries: list the sections, then read their text
        summary_links = get_summary_links(soup, recap_heading)
        if summary_links is None:
          # get_summary_links() has already reported the problem; without
          # the section list there is nothing to extract for this episode.
          jumped_episodes.append(ep_num)
          continue
        ep_summary, links = extract_summaries(summary_links, soup)
        print(f"{ep_num}, {ep_summary}")

        save_path = SUM_PATH_C2 if campaign == 2 else SUM_PATH
        ARC_PATH = os.path.join(str(save_path), "arc"+str(ind+1)+".tsv")
        write_recap_to_tsv(ARC_PATH, ep_summary, ep_num)

        # Current episode first, then the dictionary keys in sorted order
        # up to (and excluding) the first one already in the list.
        ep_list = [ep_num]
        for k in sorted(character_dictionary.keys()):
          if k in ep_list:
            break
          ep_list.append(k)

        # Build and save the JSON
        extended_spans_links, char_links = extend_spans_in_text( roles_array, ep_summary, links, ep_num ,ep_list,character_dictionary,debug=False)
        matches = ner_and_fuzzy(ep_summary, extended_spans_links,ep_num , ep_list,  char_links,character_dictionary,debug=False)
        json_file = f'{ep_num}-summary-json'
        json_obj = build_json(roles_array,character_dictionary,ep_num,ep_summary, matches, char_links,debug=False)
        saveJSON(json_obj,json_file, save_path, download = False)
  return jumped_episodes


In [None]:
# Launch the summary extraction

# Campaign 1: one table per arc
# arc_n ={0:[0], 1:[1], 2:[2]}
# Campaign 2: table indices [5,6,7,8,9] (arc 1 -> tables 5,6,7 [ep 1-25]; arc 2 -> tables 8,9 [ep 26-47])
arc_n ={0:[5,6,7], 1:[8,9]}


"""Decomment here to call the execution of get_recaps_fuzzy """
# Uncomment the next line to run get_summaries
# get_summaries(roles_array_list,character_dictionary_list, table, ind_=[0,1,2],arc_n =arc_n, campaign=2, start_from = 0)

## Timeline Events Data Management

### Utilities for Timeline Data Management

In [None]:

""" get soup object for URL """
def get_soup_(url, timeout=30):
  """
  Download `url` and return its HTML as a BeautifulSoup object.

  The response text is cleaned with bs_preprocess_() before being parsed with
  the 'html.parser' backend.

  Parameters:
      url (str): address of the page to fetch.
      timeout (float): seconds to wait for the server before giving up.
          Without a timeout requests.get() can block indefinitely.

  Returns:
      bs4.BeautifulSoup: the parsed page.

  Raises:
      requests.exceptions.RequestException: if the request fails, including
          when the timeout expires.
  """
  # Send a GET request to the URL
  response = requests.get(url, timeout=timeout)

  html_text = bs_preprocess_(response.text)
  # Parse the HTML of the response
  return BeautifulSoup(html_text, 'html.parser')

""" function used to clear the html text"""
def bs_preprocess_(html):
    """
    Remove distracting whitespace and newline characters from HTML text.

    Steps, in order:
      1. strip the whitespace at the beginning and at the end of every line;
      2. replace each remaining newline with a single space;
      3. drop the whitespace right before a '<';
      4. drop the whitespace right after a '>'.

    Parameters:
        html (str): the raw HTML text.

    Returns:
        str: the cleaned HTML text.
    """
    # Raw strings: '\s' inside a normal string literal is an invalid escape sequence.
    line_edges = re.compile(r'(^[\s]+)|([\s]+$)', re.MULTILINE)
    html = line_edges.sub('', html)         # remove leading and trailing whitespace of each line
    html = html.replace('\n', ' ')          # newlines become spaces, so adjacent lines stay separated
    html = re.sub(r'[\s]+<', '<', html)     # remove whitespace before tags
    html = re.sub(r'>[\s]+', '>', html)     # remove whitespace after tags
    return html

"check if navigable string elem has the episode number"
def is_formatEp(elem):
  """Tell whether *elem* is a text node holding an episode marker.

  An episode marker is a string that is exactly "(" + digits + "x" + digits + ")",
  where each run of digits may be empty (the pattern uses ``\\d*``).

  Args:
      elem: a node taken from the parsed HTML tree.

  Returns:
      True when *elem* is exactly a bs4 NavigableString whose text matches the
      pattern, False otherwise (including every non-text node).
  """
  # only plain text nodes are inspected; the exact-type check is kept on purpose
  if type(elem) is not bs4.element.NavigableString:
    return False

  # raw string: '\(' and '\d' are invalid escape sequences in a plain literal
  return re.search(r'^\(\d*x\d*\)$', elem.string) is not None

""" get episode, campaing, and arc from  text of the format: (Campaign)x(Episode) """
def extract_ep(text, verbose = False):
  """Parse an episode marker of the form "(<campaign>x<episode>)".

  The arc is looked up in the module-level ``eps_arc`` sequence: entry ``i`` is
  read as an inclusive (first_episode, last_episode) pair and corresponds to
  arc ``i+1``. When several entries contain the episode, the last one wins.

  Args:
      text: the marker text, with or without the surrounding parentheses.
      verbose: when True, print what was found.

  Returns:
      The tuple (episode, arc, campaign), all ints.

  Raises:
      ValueError: if the numbers cannot be parsed, or if the episode is not
          inside any range of ``eps_arc`` (previously this surfaced as an
          UnboundLocalError on ``arc``).
  """
  parts = text.replace('(','').replace(')','').split('x')
  campaign = int(parts[0])
  episode = int(parts[1])

  arc = None
  for i,ep_arc in enumerate(eps_arc):
    if ep_arc[0] <= episode <= ep_arc[1]:
      arc = i+1

  if arc is None:
    raise ValueError(f"episode n°{episode} of campaign n°{campaign} is not inside any range of eps_arc")

  if verbose: print(f'found event from campaign n°{campaign}, episode n°{episode}, and arc n°{arc}')
  return episode, arc, campaign

""" get tag or return False if is NavigableString type"""
def check_tag(elem, tag):
  """Return True when *elem* is not a plain text node and its ``name`` equals *tag*.

  Plain text nodes (exactly bs4 NavigableString) always yield False.
  """
  is_text_node = type(elem) == bs4.element.NavigableString
  return (not is_text_node) and elem.name == tag

""" aux function used to handle the case of a <ul> tag list inside another list"""
def insertInfoList(elem, events_info, prev_text, current_position, debug = False):
  """Turn the items of a nested <ul> into event entries appended to *events_info*.

  For every child of *elem* (one list item) the `.string` of each of its
  children is concatenated into a text, the `href` of every <a> child is
  collected together with a span, and -- when the text is not empty -- a dict
  {'text': prev_text + text, 'links': links} is appended to *events_info*.

  Args:
      elem: the nested list node; its children are iterated as list items.
      events_info: list that receives the new event dictionaries (mutated in place).
      prev_text: text of the enclosing list item, prefixed to every nested item.
      current_position: running character position used to compute link spans.
      debug: when True, print the intermediate positions, spans and results.

  Returns:
      The updated running position.
  """
  for li in list(elem.children):
    text = ''
    links = []
    # NOTE: the loop variable reuses the name `elem` and shadows the parameter from here on
    for elem in list(li.children):
      # join the pieces with a single space unless the text already ends with one
      if text == '' or text[-1] == ' ':
          add = elem.string
          text += elem.string
      else: 
          add = ' '+ elem.string
          text += ' '+ elem.string
      # the string statement below is Italian for "Leo's changes" (span/position tracking)
      """modifiche leo"""
      if type(elem)!= bs4.element.NavigableString: 
        # non-text child: leo_check advances the position and returns the span of the added piece
        current_position, global_span = leo_check(elem, current_position,add)
        if debug: print(f"[1.4.1a] cp = {current_position}")
        if debug: print(f"[0.4] global_span : {global_span} , text[global_span] : {text[global_span[0]:global_span[1]]}")
      else:
        # plain text: just move the position forward by the added length
        current_position += len(add)
        if debug: print(f"[1.4.1b] cp = {current_position}")
      """-------------"""
      # only anchors contribute a (span, href) pair
      if elem.name == 'a':
        links.append((global_span, elem['href']))
    # the string 'empty' is the sentinel for "no links"
    if links == []: links = 'empty'

    if text != '':
      new_text  = ''
      # NOTE(review): the test looks at `text`, not at `prev_text`, to decide whether a
      # separating space is needed between the two -- confirm this is intended
      if text == '' or text[-1] == ' ':
        new_text = prev_text + text
      else: 
        new_text += prev_text +' '+ text
      if debug: print(f" text from the list tag -->   {new_text}")
      if debug: print(f"links from the list tag -->   {links}")
      event_info = {'text':new_text, 'links':links}
      events_info.append(event_info)
  return current_position

def leo_check(elem, current_position ,add):
  """Compute the span of a freshly added piece of text and the advanced position.

  Args:
      elem: node exposing ``get_text()`` and ``.string``.
      current_position: running character position before *add* was appended.
      add: the piece of text that was appended for *elem*.

  Returns:
      The tuple (new_position, global_span) where ``new_position`` is
      ``current_position + end`` and ``global_span`` is
      ``(current_position + start + 1, current_position + end + 1)``, with
      ``end = start + len(add)``.
  """
  # Get the position of the element's string inside the element's own text
  try:
    start = elem.get_text().index(elem.string, 0)
  except Exception:
    # string missing or not found: fall back to the running position.
    # `Exception` (not a bare except) so KeyboardInterrupt/SystemExit still propagate.
    start = current_position
  end = start + len(add)

  global_span = (current_position+start+1, current_position+end+1)
  return current_position+end, global_span

# ------------ extraction function

def extract_eventTL(arc_n, verbose = False, debug = False):
  """Scrape the timeline table and return the events of one arc.

  The page at HTTPS_LINK_TIMELINE is downloaded and the table that follows the
  element with id 'Campaign_One:_Vox_Machina' is scanned row by row. The first
  column gives the episode markers, the second column the event texts (plain
  text, <a> links and an optional <ul> list, possibly nested).

  Args:
      arc_n: zero-based arc index; it is compared as ``arc_n + 1`` with the
          arc returned by extract_ep, so it must be a number here.
      verbose: when True, print the extracted texts/links and a row counter.
      debug: when True, print the position/span bookkeeping.

  Returns:
      A list of dicts {'episode': int, 'description': [{'text', 'links'}, ...]},
      one per episode marker found in the rows of the requested arc. 'links' is
      either a list of (span, href) tuples or the string 'empty'.
  """
  events_TL = []
  # get HTML from HTTP response and build BeautifulSoup object from constuctor
  soup = get_soup_(HTTPS_LINK_TIMELINE)

  # extract event from timeline for the 1st campaing
  span_TL = soup.find_all(id='Campaign_One:_Vox_Machina')[0]

  # get table tag
  # navigation: heading -> second following sibling -> first child -> first child -> rows (header row dropped)
  h3_TL = span_TL.parent
  div_table_TL = next(next(h3_TL.next_siblings).next_siblings)
  tbody_TL = next(next(div_table_TL.children).children)
  rows_table = list(tbody_TL.children)[1:]

  # loop over rows 
  for i,row_table in enumerate(rows_table):



    # retrieve the 2 columns
    row_table_col1 = next(row_table.children)
    row_table_col2 = row_table_col1.next_sibling

    # ---------------------------------- get episode from 1st column ------------------------------
    p_col1 = list(row_table_col1.children)[2]
    elem = p_col1

    # walk the document forward until the next <td>, collecting every episode marker met on the way
    eps_tmp = []
    while (not(check_tag(elem,'td'))):

      elem = elem.next_element
      if is_formatEp(elem):
        ep_tmp, arc_tmp, _ = extract_ep(elem.string, verbose)
        eps_tmp.append(ep_tmp)

    # skip if this row describe episode of a previous arc
    # NOTE(review): arc_tmp is only assigned when a marker is found; for a row without
    # markers it is unbound (first row) or left over from the previous row -- confirm
    if arc_n +1 > arc_tmp:
      continue

    # ----------------------------------get events from the 2nd column -----------------------------
    # (first_elem / elem are overwritten by the loops below before being read)
    first_elem  = list(row_table_col2.children)[0]
    elem = first_elem
    # build the list for the events info
    events_info = []

    # first couple of variables for event not in a list
    links = []
    text_notlist = ''
    current_position = 0
    if debug:print(f"[0] cp = {current_position}")

    # ----------- first part event extraction,  look for string element or <a> marker --------------


    for elem in list(row_table_col2.children):
      # placeholder; `add` is rebound to a string before it is measured
      add=0

      # list element are handled in the below part of this code, here just html text and <a>
      if check_tag(elem,'ul'):
        continue

      try:
        # join the pieces with a single space unless the text already ends with one
        if text_notlist == '' or text_notlist[-1] == ' ':
          add =  elem.string
          text_notlist += elem.string
        else: 
          add = ' '+ elem.string
          text_notlist += ' '+ elem.string

        # the string statement below is Italian for "Leo's changes" (span/position tracking)
        """modifiche leo"""
        if type(elem)!= bs4.element.NavigableString: 
          current_position, global_span = leo_check(elem, current_position,add)
          if debug:print(f"[1.1a] cp = {current_position}")
          if debug:print(f"[0.1] global_span : {global_span} , text_notlist[global_span] : {text_notlist[global_span[0]:global_span[1]]}")
        else:
          current_position += len(add)
          if debug:print(f"[1.1b] cp = {current_position}")
        """-------------"""

        if type(elem)!= bs4.element.NavigableString:
          if elem.name == 'a':
            links.append((global_span,elem['href']))

      # NOTE(review): every exception raised above lands here and `error` is never used;
      # presumably the expected case is a tag whose .string is None -- confirm
      except Exception as error:
        # handle special cases
        # <b> tag
        for b_elem in list(elem.children):
          if text_notlist == '' or text_notlist[-1] == ' ':
            add =  b_elem.string
            text_notlist += b_elem.string
          else: 
            add =' '+  b_elem.string
            text_notlist += ' '+ b_elem.string
          """modifiche leo"""
          # NOTE(review): this tests type(elem) (the parent) while the branch works on
          # b_elem -- looks like it was meant to be type(b_elem), confirm
          if type(elem)!= bs4.element.NavigableString: 
            current_position, global_span = leo_check(b_elem, current_position, add)
            if debug:print(f"[1.2a] cp = {current_position}")
            if debug:print(f"[0.2] global_span : {global_span} , text_notlist[global_span] : {text_notlist[global_span[0]:global_span[1]]}")
          else:
            current_position += len(add)
            if debug:print(f"[1.2b] cp = {current_position}")
          """-------------"""
          if type(b_elem)!= bs4.element.NavigableString:
            if b_elem.name == 'a':
              links.append((global_span, b_elem['href']))

    # the string 'empty' is the sentinel for "no links"
    if links == []: links = 'empty'
    if text_notlist != '':
      if verbose: print(f" text from the non-list tag -->   {text_notlist}")
      if verbose: print(f"links from the non-list tag -->   {links}")
      event_info = {'text':text_notlist, 'links':links}
      events_info.append(event_info)
      current_position = 0
      if debug:print(f"[1.0] cp = {current_position}")


    # ----------- second part event extraction, look for a possible list in the row: <ul> tag ------

    # i restart from the first child of <td> since the list can be both first or next child
    # only the first <ul> child of the cell is kept
    found = False
    for elem in list(row_table_col2.children):
      if (check_tag(elem,'ul')):
            ul_event = elem
            found = True
      if found: break


    if found:
      # scan the list
      for elem_list in list(ul_event.children):
        # initialize the info variables
        text = ''
        links = []
        # the position restarts from 0 for every list item
        current_position_=0
        if debug:print(f"[1.3a] cp = {current_position_}")
        for elem in list(elem_list.children):

          # possibility to find a list inside another list, use previous text int list's point
          if check_tag(elem,'ul'):
            """modifiche leo """
            # NOTE(review): `debug` is not forwarded, so insertInfoList always runs with debug=False
            current_position_ = insertInfoList(elem, events_info, text, current_position_)
            if debug:print(f"[1.3b] cp = {current_position_}")
            """----------"""
            # the text gathered so far was used as prefix of the nested items; start over
            text = ''
            continue

          if text == '' or text[-1] == ' ':
            add = elem.string
            text += elem.string
          else: 
            add = ' '+elem.string
            text += ' '+ elem.string
          """modifiche leo"""
          if type(elem)!= bs4.element.NavigableString: 
            current_position_, global_span = leo_check(elem, current_position_, add)
            if debug:print(f"[1.4a] cp = {current_position_}")
            if debug:print(f"[0.3] global_span : {global_span} , text_notlist[global_span] : {text[global_span[0]:global_span[1]]}")
          else:
            current_position_ += len(add)
            if debug:print(f"[1.4b] cp = {current_position_}")
          """-------------"""
          if elem.name == 'a':
            links.append((global_span, elem['href']))

        if links == []: links = 'empty'

        if text != '':
          if verbose: print(f" text from the list tag -->   {text}")
          if verbose: print(f"links from the list tag -->   {links}")
          event_info = {'text':text, 'links':links}
          events_info.append(event_info)

    if verbose: print(f'\n-------{i+1}-------\n')

    # exit if this row describe episode of a next arc
    if arc_n+1 < arc_tmp:
      return events_TL

    # otherwise we are extrating from the right arc, save data
    # every episode of the row references the same events_info list object
    for ep in eps_tmp:
      event_TL = {}
      event_TL['episode'] = ep

      event_TL['description'] = events_info
      events_TL.append(event_TL)

  return events_TL


### Extracting and formatting in Json of Timeline texts

In [None]:
def jsonize_timeline_texts(events_TL, character_dictionary,roles_array,campaign = 1, debug = True):
  """Build (and optionally save) one annotated JSON object per timeline event description.

  For every description of every event the links are turned into a
  {span: href} dict, then extend_spans_in_text, ner_and_fuzzy and build_json
  are run on the text. The output name is '<episode id>-<progressive>-TL',
  where the progressive restarts from 0 whenever the episode id differs from
  the one of the previous event.

  Args:
      events_TL: list of {'episode': int, 'description': [{'text', 'links'}, ...]}.
      character_dictionary: dictionary whose sorted keys are used to build the
          list of episode ids handed to the annotation helpers.
      roles_array: forwarded untouched to the annotation helpers.
      campaign: 1 or 2 selects the "1x"/"2x" prefix of the episode id; any
          other value leaves the episode number as it is.
      debug: when True (the default) the tokens are printed, nothing is saved
          and the processing stops after more than 10 descriptions; when
          False every JSON object is saved under TL_PATH.
  """
  # episode ids look like "<campaign>x<episode on at least two digits>"
  if campaign == 1:
    ep_prefix = "1x"
  elif campaign == 2:
    ep_prefix = "2x"
  else:
    ep_prefix = None

  prev_ep_num = '0x00'
  progressive = 0     # index of the description inside the current episode (no longer named `iter`, which hid the builtin)
  debug_count = 0     # descriptions processed so far, advanced only in debug mode

  for event_dict in events_TL:
    ep_num = event_dict['episode']
    if ep_prefix is not None:
      ep_num = ep_prefix + (str(ep_num) if ep_num > 9 else "0" + str(ep_num))

    ep_list= [ep_num]
    if debug: print("--------", ep_num, "------------")
    # add the sorted dictionary keys until the current episode id is reached
    for k in sorted(character_dictionary.keys()):
      if k in ep_list:
        break
      ep_list.append(k)

    if ep_num != prev_ep_num: progressive = 0

    for descr_dict in event_dict['description']:
      event_text = descr_dict['text']
      event_links = descr_dict['links']

      # 'empty' is the sentinel used by the extraction step for "no links"
      links = {}
      if event_links != 'empty':
        links = {link_tuple[0]: link_tuple[1] for link_tuple in event_links}

      #write json
      if debug: print("\n[0] extend_spans_in_text ... ")
      if debug: print("[0.0] links ", links)
      extended_spans_links, char_links = extend_spans_in_text( roles_array, event_text, links, ep_num ,ep_list,character_dictionary,debug=False)

      matches = ner_and_fuzzy(event_text, extended_spans_links,ep_num , ep_list,  char_links,character_dictionary,debug=False)

      json_file = f'{ep_num}-{progressive}-TL'
      json_obj = build_json(roles_array,character_dictionary,ep_num,event_text, matches, char_links,debug=False)

      if debug:
        for j in json_obj['sentences']:
          for e in j['elements']:
            print(e['id'] , e['token'], e['link'])
          print("\n")
      else:
        saveJSON(json_obj,json_file, TL_PATH, download = False)

      progressive += 1
      if debug:
        debug_count += 1
        if debug_count > 10:
          break

    prev_ep_num = ep_num
    # leave the outer loop too once the debug budget is exhausted
    if debug and debug_count > 10:
      break


In [None]:
# Launch

# arc_n_TL_list = [0,1]
# for (events_TL, arc) in zip(events_TL_list, arc_n_TL_list):  
#   jsonize_timeline_texts(events_TL, character_dictionary_list[arc],roles_array_list[arc],campaign =2,  debug = False)

# Pre-processing

## Load Content files & Models

In [None]:
# loading sources i.e. using the recap here

"""
      Function that returns from the folder of recap and summaries the associated text for arc and campaign
"""
def get_textRecapSumm(path, arc = arc_n +1, camp = campaign_n +1, verbose =False):
  """Load the recap/summary JSON files of one campaign and arc.

  Only the files of *path* whose name matches '<digits>x<digits>...json' are
  considered; the campaign and the episode are read from the part of the name
  before the first '-'.

  Args:
      path: folder that contains the JSON files.
      arc: arc to keep, compared with the value returned by get_arc.
      camp: campaign number to keep, compared with the number in the file name.
      verbose: when True, print the name of every loaded file.

  Returns:
      One entry per selected file, in sorted file-name order:
      {'full_text': all sentences joined by a space,
       'sentences-file': [{'sentence', 'file', 'info'}, ...]}.
  """
  # raw string: '\d' and '\.' are invalid escape sequences in a plain literal
  file_pattern = re.compile(r'^\d*x\d*.*\.json$')
  files_content = []

  for file_name in sorted(os.listdir(path)):
    matched = file_pattern.search(file_name)
    if matched is None:
      continue
    name = matched.group()

    camp_ep = name.split('-')[0]
    camp_ = int(camp_ep.split('x')[0])
    ep = int(camp_ep.split('x')[1])
    if camp_ != camp:
      continue

    # the arc is derived from the campaign of the file itself (zero-based, as in
    # get_textTimeline) and not from the global campaign_n, so an explicit `camp`
    # argument is honoured
    arc_ = get_arc(ep, campaign = camp_ - 1)
    if arc_ != arc:
      continue

    if verbose: print("loading ... " + name)

    # extract the text
    json_dict = loadJSON(path, file_name, need_ext=False, verbose = False)
    sentences = []
    sentences_file = []
    for json_sent in json_dict['sentences']: # access to the sentences dictionary -> get array
      # a sentence is the space-joined sequence of its tokens
      sentence = ' '.join(array_word['token'] for array_word in json_sent['elements']).strip()
      sentences.append(sentence)
      sentences_file.append({
          'sentence': sentence,
          'file': json_sent,
          'info': {'episode': ep,'id_sentence': json_sent['id'],'arc':arc_,'campaign':camp_},
      })

    files_content.append({'full_text': ' '.join(sentences), 'sentences-file': sentences_file})

  return files_content   # one entry for each file

"""
      Function that returns from the folder of recap and summeries the associated text for arc and campaign
"""

def get_textTimeline(path, arc = arc_n +1, camp = campaign_n +1, verbose = False):
  """Load the timeline JSON files of one campaign and arc.

  Only the files of *path* whose name matches '<digits>x<digits>...json' are
  considered. The name is split on '-': the first part gives campaign and
  episode, the second part the progressive number of the event.

  Args:
      path: folder that contains the JSON files.
      arc: arc to keep, compared with the value returned by get_arc.
      camp: campaign number to keep, compared with the number in the file name.
      verbose: when True, print the selection and the name of every loaded file.

  Returns:
      The tuple (text, entries): ``text`` is every sentence joined by a space
      and stripped, ``entries`` has one {'sentence', 'file', 'info'} dict per
      sentence, in sorted file-name order.
  """
  if verbose: print(f"extracting for campaign: {camp} arc: {arc}")

  # raw string: '\d' and '\.' are invalid escape sequences in a plain literal
  file_pattern = re.compile(r'^\d*x\d*.*\.json$')
  all_sentences = []    # joined once at the end instead of growing a string with +=
  files_content = []

  for file_name in sorted(os.listdir(path)):
    matched = file_pattern.search(file_name)
    if matched is None:
      continue

    name = matched.group()
    name_parts = name.split('-')
    camp_ep = name_parts[0]
    progressive = int(name_parts[1])
    camp_ = int(camp_ep.split('x')[0])
    ep = int(camp_ep.split('x')[1])
    arc_ = get_arc(ep, campaign = camp_ -1)

    if camp_ != camp  or arc_ != arc:
      continue

    if verbose: print("loading ... " + name)

    json_dict = loadJSON(path, file_name, need_ext=False, verbose = False)
    for json_sent in json_dict['sentences']: # access to the sentences dictionary -> get array
      sentence = ' '.join(array_word['token'] for array_word in json_sent['elements'])

      info = {'episode': ep,'progressive': progressive,'id_sentence': json_sent['id'],'arc':arc_,'campaign':camp_}

      all_sentences.append(sentence)
      files_content.append({'sentence': sentence, 'file':json_sent,  'info': info})

  return ' '.join(all_sentences).strip(), files_content   # one entry for each sentence


def get_lengthsContent(content):
  """Print, for every loaded file, its number of sentences, words and characters.

  Args:
      content: list of {'full_text': str, 'sentences-file': [{'sentence': str, ...}, ...]}
          entries, as built by get_textRecapSumm.

  The word count is the sum over all the sentences of the file of the number
  of space-separated pieces (it used to report the last sentence only, and
  failed on a file without sentences). The character count is the length of
  'full_text'.
  """
  for idx,con in enumerate(content):
    sentences = con['sentences-file']
    print(f"- number of sentences file n°{idx} :" + str(len(sentences)))

    n_words = sum(len(sent['sentence'].split(' ')) for sent in sentences)

    print(f"- number of words file n°{idx} :" + str(n_words))
    print(f"- number of char file n°{idx} :" + str(len(con['full_text'])) + "\n")

# Load the three sources for the configured campaign/arc and report their sizes.
content_rec = get_textRecapSumm(PATH_RECAP, verbose=False)
content_sum = get_textRecapSumm(PATH_SUMMARIES, verbose=False)
full_text_tl, content_tl = get_textTimeline(PATH_TIMELINE, verbose=False)

# ----- look content recap, then content summaries (same report for both)
for _header, _loaded in (("[recap]\n", content_rec), ("[summaries]", content_sum)):
  print(_header)
  print(f"n° files: {len(_loaded)}\n")
  get_lengthsContent(_loaded)
  print()

# ----- look content timeline
print("[timeline]")
print(f"n° files: {len(content_tl)}\n")

[recap]

n° files: 22

- number of sentences file n°0 :16
- number of words file n°0 :48
- number of char file n°0 :2176

- number of sentences file n°1 :7
- number of words file n°1 :15
- number of char file n°1 :1352

- number of sentences file n°2 :7
- number of words file n°2 :75
- number of char file n°2 :1256

- number of sentences file n°3 :16
- number of words file n°3 :89
- number of char file n°3 :2769

- number of sentences file n°4 :14
- number of words file n°4 :41
- number of char file n°4 :3173

- number of sentences file n°5 :19
- number of words file n°5 :41
- number of char file n°5 :2795

- number of sentences file n°6 :5
- number of words file n°6 :24
- number of char file n°6 :773

- number of sentences file n°7 :9
- number of words file n°7 :62
- number of char file n°7 :1511

- number of sentences file n°8 :15
- number of words file n°8 :60
- number of char file n°8 :3400

- number of sentences file n°9 :9
- number of words file n°9 :34
- number of char file n°9 

In [None]:
# loading models for spacy

# define nlp pipeline + extensions
# Load the pipeline selected by SIZE_IDX, i.e. the same one the environment cell
# downloads, instead of a hard-coded index that can drift from it.
nlp = spacy.load(PIPELINES_TYPE[SIZE_IDX])

# add the fastcoref component to the spaCy pipeline
nlp.add_pipe("fastcoref")

print(f"Max limit of char in input for the nlp pipeline: {nlp.max_length}")

# Loading glove word embedding 
glove = TT.vocab.GloVe(name="6B", dim=300)

Max limit of char in input for the nlp pipeline: 1000000


.vector_cache/glove.6B.zip: 862MB [02:47, 5.15MB/s]                           
100%|█████████▉| 399999/400000 [01:09<00:00, 5789.46it/s]


## Functions and launcher

### Main group analyzer

In [None]:
from operator import truediv
"""
       Semantic matching by single words: matching patterns + glove embedding & cosine similarity
       Returns a list of substitutions to name the group Entity, which is used in the pre-proceesing
"""

# -- words that name, or semantically refer to, the party (one list per campaign)

names_party_c1 = [
    "Super High Intensity Team", "Super High-Intensity Team",
    "main", 'team', 'heroes', 'squad', 'crew', 'gang',
]

names_party_c2 = ["main", 'team', 'heroes', 'squad', 'crew', 'gang']

# name used as the unique named Entity of the party (first campaign / second campaign)
tag_substitute_arc1 = "Vox Machina"
tag_substitute_arc2 = "Mighty Nein"

# party_entityDict = [list of party words, replacement name] for the campaign
# selected by the zero-based campaign_n
if campaign_n not in (0, 1):
  raise ValueError(f"invalid campaign number: {campaign_n+1}  (valid 1 or 2)")

party_entityDict = (
    [names_party_c1, tag_substitute_arc1]
    if campaign_n == 0
    else [names_party_c2, tag_substitute_arc2]
)

"""
      Aux function n°1, get embedding from target vector 
"""

def get_embTarget(averaged = False, verbose= False):
  """Return the GloVe vectors of the fixed list of "group" words.

  Args:
      averaged: when True the vectors are averaged along axis 0 and a single
          vector is returned; otherwise the stacked vectors are returned.
      verbose: when True, print what is being generated and its dimensions.

  Returns:
      A numpy array: one row per target word, or their mean when *averaged*.
  """
  if verbose: print("   Generating the embedding for the target vector...")

  # words whose embeddings are taken as reference for the party
  target_words = ('team','group','heroes','squad','crew','gang','party')
  target_vectors = np.asarray([glove[word].numpy() for word in target_words])

  if averaged:
    target_vectors = target_vectors.mean(axis=0)

  if verbose:
    if target_vectors.ndim < 2:
      print(f"  Target vector dimensions: {target_vectors.shape} ")
    else:
      print(f"  Target vectors dimensions: {target_vectors[0].shape}, vectors number: {len(target_vectors)} ")

  return target_vectors

"""
      Aux function n°2, correct ids of GT for the substitutions found
"""
def add_subsGT(subs_sentence, sentence_file, subs_gt):
  """Locate the words of each substitution in the sentence and record their ids.

  For every (text, replacement) pair of *subs_sentence* the tokens of the
  sentence are scanned in order; each token equal (case-insensitively, after
  stripping) to one of the words of the text is consumed. As soon as every
  word has been consumed a substitution dictionary is appended to *subs_gt*.
  Nothing is appended for a pair whose words are not all found.

  Args:
      subs_sentence: sequence of (text to replace, replacement) pairs.
      sentence_file: dict with the token list under ['file']['elements'] and
          the sentence metadata under ['info'].
      subs_gt: output list, extended in place.
  """
  tokens = sentence_file['file']['elements']
  sentence_id = sentence_file['info']['id_sentence']

  for sub in subs_sentence:
    remaining = sub[0].strip().lower()     # part of the text still to be found
    targets = remaining.split(' ')
    found_ids = []
    found_text = ''

    for entry in tokens:
      lowered = entry['token'].strip().lower()
      if lowered not in targets:
        continue

      remaining = remaining.replace(lowered, '').strip()
      found_ids = [*found_ids, *entry['id']]
      found_text = entry['token'] if found_text == '' else found_text + " " + entry['token']

      # every word of the text has been matched: emit the substitution and stop scanning
      if remaining == '':
        subs_gt.append({
            'id_sentence': sentence_id,
            'id_word': found_ids,
            'token_word': found_text,
            'new_word': [sub[1]],
            'info': sentence_file['info'],
        })
        break


"""
      Aux function n°3, get list of matches 
"""
def match_entityMain(docs, sentences_file, target_emb, subs_gt, list_matching, list_skipped, delta = 0.8, use_span = True, verbose = False):
  """Find, sentence by sentence, the words that refer to the party.

  Three kinds of match are tried on every token: the explicit party name
  ('vox' [+ 'machina'] when campaign_n == 0, 'mighty' + 'nein' or 'nein' alone
  when campaign_n == 1), a literal match against party_entityDict[0], and a
  cosine similarity above *delta* with the target embedding(s). Every match is
  stored as (matched text, party_entityDict[1]) and, for each sentence with at
  least one match, handed to add_subsGT.

  Args:
      docs: annotated documents, paired by position with *sentences_file*.
      sentences_file: per-sentence dictionaries forwarded to add_subsGT.
      target_emb: a single vector or a sequence of vectors to compare with.
      subs_gt: output list filled by add_subsGT.
      list_matching: output list, receives the text of the matching tokens.
      list_skipped: output list, receives the nouns that did not match.
      delta: similarity threshold (strictly greater than).
      use_span: not read anywhere in this function.
      verbose: when True, print the sentences and the kind of match.

  Returns:
      The list of matches of the last document only.
      NOTE(review): with an empty *docs* the name is never bound -- confirm
      callers never pass an empty list.
  """

  # define the cosine similarity
  # NOTE(review): presumably the embedding of an unknown word is a zero vector, which
  # would make this a division by zero -- TODO confirm
  cos_sim = lambda x,y: np.dot(x,y)/(norm(x)*norm(y))

  # get embedding for the sentence
  # (returns None implicitly when x is not a str)
  def get_emb(x, verbose = False):
    if verbose: print(f"   Generating the embedding for: '{x}' ...")
    if type(x) == str: # single word
      return glove[x].numpy()

  # loop over each doc's token
  for idx_d, (doc, sentence_file) in enumerate(zip(docs, sentences_file)): # possiblity also to use annoted span (doc.ents) instead of tokens
    if verbose:
      print(f"sentence n° {idx_d}") 
      print(doc.text)

    # variables used to skip tokens from the analysis
    # skip_token is True right after the first word of the party name has been seen;
    # temp_text_token holds that first word
    skip_token = False
    temp_text_token = ''

    # array for the words with match
    words_match = [] # usually no or just one match for sentence

    for idx, token in enumerate(doc):
      # aux boolean flag for the execution
      matched = False


      # 1) just check if name of main group is present and merge the tokens

      if skip_token:
        skip_token = False
        if campaign_n == 0:
          if 'machina' in token.text.strip().lower():
            # print('matched machina')
            # two-word name: only the second token goes to list_matching, the match holds both
            list_matching.append(token.text)
            words_match.append((temp_text_token + " " + token.text, party_entityDict[1]))

            continue
          else:
            # 'vox' alone still counts as the party; the current token is then analysed as usual
            list_matching.append(temp_text_token)
            words_match.append((temp_text_token, party_entityDict[1]))

        elif campaign_n == 1:
          if 'nein' in token.text.strip().lower():
            list_matching.append(token.text)
            words_match.append((temp_text_token + " " + token.text, party_entityDict[1]))
            continue
          # NOTE(review): unlike campaign 0, a lone 'mighty' is dropped without a match here

        temp_text_token = ''

      # the checks below are substring tests on the lower-cased token text
      if campaign_n == 0:
        # look for 'Vox Machina'
        if 'vox' in token.text.strip().lower():
          temp_text_token = token.text
          skip_token = True
          continue

      elif campaign_n == 1:
        # look for 'Mighty Nein'
        if 'mighty' in token.text.strip().lower():
          temp_text_token = token.text
          skip_token = True
          continue

        if 'nein' in token.text.strip().lower():
            list_matching.append(token.text)
            words_match.append((token.text, party_entityDict[1]))
            continue


      # 2) filter for Nouns and proper nouns using POS tagging 
      if token.pos_ == 'NOUN' or token.pos_ == 'PROPN':

        # 2.1) try easy literal match
        if token.text.strip().lower() in [name.strip().lower() for name in party_entityDict[0]]:
          if verbose: print('   Easy match for party entity')
          list_matching.append(token.text)
          words_match.append((token.text, party_entityDict[1]))
          matched = True

        # 2.2) if no match, try using the embedding similarity
        else:
          if not(matched):
            x_emb = get_emb(token.text)
            # same shape -> target_emb is one (averaged) vector; otherwise it is iterated vector by vector
            if x_emb.shape == target_emb.shape: # averaged target vector case
              similarity = cos_sim(x_emb, target_emb)
              if similarity > delta:
                if verbose: print("   Case of cosine distance between words")
                list_matching.append(token.text)
                words_match.append((token.text, party_entityDict[1]))
                matched = True

            else: 
              for emb in target_emb:
                similarity = cos_sim(x_emb, emb)
                if similarity > delta:
                  if verbose: print("   Case of cosine distance between words")
                  list_matching.append(token.text)
                  words_match.append((token.text, party_entityDict[1]))
                  matched = True
                  # one target above the threshold is enough
                  break

          if not(matched):
            list_skipped.append(token.text)


    # translate the matches of this sentence into id-based substitutions
    if words_match != []:
      add_subsGT(words_match, sentence_file, subs_gt)    

  return words_match

# launch process
def pre_processingEntityGroup(sentences_file, verbose = False):
    """Return the substitutions that name the party entity in the given sentences.

    The sentences are run through the spaCy pipeline, then match_entityMain
    fills the substitution list using the non-averaged target embeddings.

    Args:
        sentences_file: list of dictionaries holding the text under 'sentence'.
        verbose: when True, print matching words, skipped words and substitutions.

    Returns:
        The list of substitution dictionaries.
    """
    matched_words, skipped_words, subs_gt = [], [], []

    # one embedding per target word (no averaging)
    target_emb = get_embTarget(averaged = False, verbose = False)

    # annotate every sentence text with the nlp pipeline
    docs = list(nlp.pipe([entry['sentence'] for entry in sentences_file]))

    # subs_gt, matched_words and skipped_words are filled in place
    match_entityMain(docs, sentences_file, target_emb, subs_gt, matched_words, skipped_words, verbose = verbose)

    if verbose:
      print("[match]")
      print_list(matched_words)
      print("[no match]")
      print_list(skipped_words)
      print_list(subs_gt)

    return subs_gt

### Coreference Resolution

In [None]:
"""
      Application of Coreference resolution using Spacy + Add-on
      Return a list of meaningfull substitutions to do in the JSON files
"""

def pre_processingCoreference(text_file, sentences_file, verbose = False, use_clusters = False):
  """Derive id-based substitutions from the coreference-resolved version of a text.

  The whole text is run through the nlp pipeline with fastcoref asked to
  produce the resolved text. Original and resolved text are split on
  whitespace and compared with difflib.ndiff; the diff is then walked in
  lockstep with the tokens of every sentence: removed words ('-') give the ids
  to replace, added words ('+') give the replacement.

  Args:
      text_file: the full text (the coreference resolver needs all of it).
      sentences_file: per-sentence dictionaries; the tokens are read from
          ['file']['elements'], the sentence id from ['file']['id'] and the
          metadata from ['info'].
      verbose: when True, print both texts, the diff and the walk.
      use_clusters: when True, the coreference clusters are returned too.

  Returns:
      The list of {'id_sentence', 'id_word', 'new_word', 'info'} dictionaries,
      or the tuple (that list, list of clusters as lists of strings) when
      *use_clusters* is True.

  Raises:
      ValueError: when a diff row starts with something other than ' ', '-', '+' or '?'.
  """
  # define the array for the substitutions 
  subs_gt = []

  # spacy pipeline on the whole text
  doc = nlp(
    text_file, 
    component_cfg={"fastcoref": {'resolve_text': True}}
  )


  # turn each cluster of character intervals into the corresponding pieces of text_file
  def get_clusters(doc, verbose = True):
    if verbose: print("Clusters from the coreference resolution")
    clusters = doc._.coref_clusters
    clusters_list = []
    for cluster in clusters:
      if verbose:print("------------------")
      cluster_words = []
      for interval in cluster:
        indices = range(interval[0],interval[1])
        letters = [text_file[i] for i in indices]
        word = ''.join(letters)
        cluster_words.append(word)
        if verbose: print("word/s:  {:<40} || char interval:  {:<15}".format(word, str(interval)))
      clusters_list.append(cluster_words)

    return clusters_list


  # both texts are compared as lists of whitespace-separated words
  coref_text = doc._.resolved_text
  array_coref_text = coref_text.split()
  array_file_text = text_file.split()

  if verbose: 
    print("Original text:")
    print(text_file)
    print()
    print("Resolved text:")
    print(coref_text)

  # generators for the report of differences
  report_diffs = dl.ndiff(array_file_text, array_coref_text)  # get generator with the differences

  if verbose: 
    for report_entry in report_diffs:
      print(report_entry)
    report_diffs = dl.ndiff(array_file_text, array_coref_text) # reset generator if read

  """
    typology of the results for each row:
       ' ' -> common to both,
       '-' -> from sequence 1 not in 2,
       '+' -> from sequence 2 not in 1,
       '?' -> addition line not present in both sequence 1 and 2, for more info on char differences
  """

  # take first element from the report generator
  # a diff row is "<type char><space><word>": index 0 is the type, from index 2 on is the word
  # NOTE(review): the exhausted-generator sentinel is the string 'empty', so its "type"
  # reads as 'e' and would hit the ValueError below if tokens were still pending -- confirm
  word_result = next(report_diffs, 'empty')
  type_word_result = word_result[0]
  text_word_result = word_result[2:]

  # loop over the sentences
  for sent_gt in sentences_file:

    # retrieve sentence id & info dictionary
    id_sent = sent_gt['file']['id']
    info_sent = sent_gt['info']

    # define auxiliary variables used in the inner loop
    to_remove = []      # list of ids to remove
    to_insert = []      # list of words that substitute the corresponding words of the ids to remove
    residual = ''       # variable used to manage the problem of having token with multiple words

    # get the iterators for the list of word dictionaries and get first element in the actual sentence
    iter_sent = iter(sent_gt['file']['elements'])
    word_dict = next(iter_sent, 'empty')

    while(word_dict != 'empty'):

      # 1) check the case of empty token for the two iterators
      if word_result.strip() == '':
        if verbose: print("Empty row from the difference report (Skipped)")
        word_result = next(report_diffs, 'empty')
        type_word_result = word_result[0]
        text_word_result = word_result[2:]
        continue 
      if word_dict['token'].strip() == '':
        if verbose:print("Empty token from the file (Skipped)")
        word_dict = next(iter_sent, 'empty')
        continue

      if verbose: print(f"type: {type_word_result}| dict: {word_dict['token']} | diff: {text_word_result}")

      # 2) compute the residual
      # residual = what is left of the token once the current diff word is taken out of it;
      # it stays non-empty while a multi-word token still needs more diff rows
      if residual == '':
        residual = word_dict['token'].strip().lower().replace(text_word_result.strip().lower(),'', 1)
      else: # not empty residual
        if not(type_word_result == '+' or type_word_result == '?'):
          residual = residual.strip().lower().replace(text_word_result.strip().lower(),'', 1)

      if residual != '' and verbose:
        print(word_dict['token'].strip().lower())
        print(text_word_result.strip().lower())
        print(f"residual: {residual}")
        print()

      # 3) check the correct progress for both ' ' and '-' case. Memorize the token id/s to remove
      #    the creation of the the substitution dictionary (the result) in the ' ' case
      if type_word_result == ' ':
        # the alignment is only asserted for single-character tokens
        if len(word_dict['token']) == 1: assert word_dict['token'].lower().strip() == text_word_result.lower().strip()
        if len(to_remove) > 0 and len(to_insert)> 0:  # we can create the dictionary 
          substitution = {'id_sentence':id_sent, 'id_word': to_remove,'new_word': to_insert, 'info':info_sent}
          subs_gt.append(substitution)
          to_remove = []
          to_insert = []

        # flush the auxiliary list of one is empty (this avoid problems in the matching of insertions and delations)
        if (len(to_remove) > 0 and len(to_insert)== 0) or (len(to_remove) == 0 and len(to_insert)> 0):
          to_remove = []
          to_insert = []

      elif type_word_result == '-':
        if len(word_dict['token']) == 1: assert word_dict['token'].lower().strip() == text_word_result.lower().strip()
        if len(word_dict['id']) == 1:
          to_remove.append(word_dict['id'][0])
        else:
          to_remove = [*to_remove,*word_dict['id']]

      # 4) handle the case of '+' and '?'
      elif type_word_result == '?':
        pass                    # actually nothing to do here
      elif type_word_result == '+':
        to_insert.append(text_word_result)
      else:
        raise ValueError(f"Error in the typology of row for the difference report.\nExpected: ' ','+','?','-'  Found: {type_word_result}")


      # 5) handle the progress for both iterators
      #    check if are needed more words from the difference report since the token is multi-word
      if residual == '':
        # token fully consumed: advance both the token iterator and the diff
        word_dict = next(iter_sent, 'empty')
        word_result = next(report_diffs, 'empty')
        type_word_result = word_result[0]
        text_word_result = word_result[2:]
        if type_word_result == '+':
          to_insert.append(text_word_result)

        if type_word_result == '+'or type_word_result == '?':
          # consume the whole run of '+' / '?' rows, collecting the inserted words
          while(type_word_result == '+' or type_word_result == '?'):
            word_result = next(report_diffs, 'empty')
            type_word_result = word_result[0]
            text_word_result = word_result[2:]
            if type_word_result == '+':
              to_insert.append(text_word_result)
          # NOTE(review): when the sentence has just ended (word_dict == 'empty') this
          # `continue` leaves the loop without reaching the flush below, so a replacement
          # of the last token of a sentence seems to be dropped -- confirm
          continue

        elif word_dict == 'empty': # ended the list of words from the file, look if is needed the insertion
          if len(to_remove) > 0 and len(to_insert)> 0:  # we can create the dictionary 
            substitution = {'id_sentence':id_sent, 'id_word': to_remove,'new_word': to_insert, 'info':info_sent}
            subs_gt.append(substitution)
            to_remove = []
            to_insert = []

        else: 
          continue

      else: # not empty residual case, scan only the differences report to get residual = ''
        word_result = next(report_diffs, 'empty')
        type_word_result = word_result[0]
        text_word_result = word_result[2:]
        continue

  if not(use_clusters):
    return subs_gt
  else:
    return subs_gt, get_clusters(doc, verbose = True)

### Substitutions Generator

In [None]:
"""
      Pre-processing for the different types of content: Summaries, Recaps, timeline extractions
"""

def ppr_recap_summ(content_files, name_file, verbose = True):
  """
      Build, episode by episode, the list of substitutions for recap/summary content
      (party-name explicitation merged with coreference resolution) and save it as JSON.

      Args:
        content_files: list with one entry per episode. Each entry is read through the keys
                       'sentences-file' (list of sentence dicts whose first element carries an
                       'info' dict with 'campaign', 'arc' and 'episode') and 'full_text'.
        name_file:     prefix of the output file.
                       'pprSUMM_'                -> resume from the episodes already saved and
                                                    save again after every processed episode;
                       'pprREC_' / 'pprREC_tmp'  -> save once, after the last episode;
                       any other value           -> nothing is saved.
        verbose:       forwarded to the two pre-processing steps.

      Returns:
        list of {"ep": "<campaign>x<episode>", "subs": [...]}, one entry for each episode
        processed by this call (episodes skipped while resuming are not included).

      Raises:
        ValueError: if the global `campaign_n` is neither 0 nor 1.
  """
  substitutions = []

  # nothing to process: avoid indexing an empty list below
  if not content_files:
    return substitutions

  first_info = content_files[0]["sentences-file"][0]['info']
  arc = first_info['arc']
  campaign = first_info['campaign']
  out_name = name_file + "C" + str(campaign) + "A" + str(arc)

  # tokens that already are (part of) the party name: never used to enrich step (2)
  party_name_tokens = ('vox machina', 'vox', 'mighty nein', 'mighty', 'nein')

  # variables to handle the save at each step for the summaries substitutions
  last_ep = -1
  last_file = []

  if name_file == 'pprSUMM_':
    saved_files = os.listdir(PATH_PPR)     # local name chosen not to shadow google.colab `files`
    print(saved_files)
    if out_name + '.json' in saved_files:
      last_file = loadJSON(PATH_PPR, out_name)
      if last_file != []:
        # 'ep' is written below as "<campaign>x<episode>"
        last_ep = int(last_file[-1]['ep'].split('x')[1])

  # loop over episodes' content
  for content_file in content_files:

    sentences = content_file["sentences-file"]
    info = sentences[0]['info']

    # resume: skip the episodes already present in the saved file
    if name_file == 'pprSUMM_' and last_ep >= int(info['episode']):
      print(f"skipping ep n° {info['episode']}")
      continue

    print(f"pre-processing for file -> [campaign: {info['campaign']}] [arc {info['arc']}] [episode {info['episode']}]")

    # 1) get substitutions of nouns that refers to the unnamed group entity for the party
    subs_1 = pre_processingEntityGroup(sentences, verbose = verbose)

    print_list(subs_1)
    print("*"*10)

    # 2) get coreference resolution substitutions (the resolver needs the whole text)
    subs_2 = pre_processingCoreference(content_file['full_text'], sentences, verbose = verbose, use_clusters = False)

    print_list(subs_2)
    print("*"*10)

    # 3) enrich subs_2 with the information coming from subs_1: inside every replacement
    #    word, a group noun found by step (1) is rewritten with the party name
    for sub2 in subs_2:
      for idx, word_substitution in enumerate(sub2['new_word']):
        for sub1 in subs_1:
          token = sub1['token_word']
          if token.lower().strip() in party_name_tokens:
            continue
          if token in word_substitution:
            sub2['new_word'][idx] = word_substitution.replace(token, sub1['new_word'][0])

    # 4) include the step (1) substitutions that refer with very high probability to the
    #    party, then sort the resulting list
    final_subs = [*subs_2]
    if campaign_n == 0:
      party_words = ['party','vox machina', 'vox']
    elif campaign_n == 1:
      party_words = ['party','mighty nein']
    else:
      raise ValueError(f"Unsupported campaign_n: {campaign_n} (expected 0 or 1)")

    for sub1 in subs_1:
      token = sub1['token_word'].lower().strip()
      if not any(party_word in token for party_word in party_words):
        continue

      # look in final_subs (not only subs_2) so that a duplicated entry of step (1)
      # is not inserted twice
      already_in = next(
          (sub for sub in final_subs
           if all(sub['info'].get(k) == v for k, v in sub1['info'].items())
           and sub1['id_word'][0] in sub['id_word']),
          None)
      if already_in is not None:
        print("***Already inserted***")
        print(sub1)
        print(already_in)
        continue

      # 'token_word' is a step (1) only field, drop it to match the step (2) entries
      sub1.pop('token_word')
      final_subs.append(sub1)

    # sort the list of substitutions: episode, sentence, then word indices
    sorting_criterion = lambda x: [x['info']['episode'],x['id_sentence'],*x['id_word']]
    final_subs = sorted(final_subs, key= sorting_criterion)

    print_list(final_subs)
    print("*"*10)

    # 5) create the final dictionary of substitutions for the current episode
    substitutions.append({"ep" : str(info['campaign']) + "x" + str(info['episode']), "subs":final_subs})

    # 6.1) summaries: save after every episode (previous content + new episodes)
    if name_file == 'pprSUMM_':
      data = [*last_file, *substitutions]
      saveJSON(data = data, path = PATH_PPR, name= out_name, download = False)

  # 6.2) recaps: save the whole array of substitutions once
  if name_file == 'pprREC_' or name_file == 'pprREC_tmp':
    saveJSON(data = substitutions, path = PATH_PPR, name= out_name, download = False)

  return substitutions

# timeline content is handled differently because its entries lack surrounding context
def ppr_tl(full_text, content_files, name_file = 'pprTL_',verbose = False):
  """
      Build the list of substitutions for the timeline content of one arc
      (party-name explicitation merged with coreference resolution) and save it as JSON.

      Args:
        full_text:     text forwarded to the coreference pre-processing step.
        content_files: list of timeline entries; the first one must carry an 'info' dict
                       with 'campaign' and 'arc'. The list is forwarded to both
                       pre-processing steps.
        name_file:     prefix of the output JSON file.
        verbose:       forwarded to the pre-processing steps; when true the final list is
                       also printed.

      Returns:
        list of substitution dicts sorted by episode, progressive, sentence and word ids.

      Raises:
        ValueError: if the global `campaign_n` is neither 0 nor 1.
  """
  # define output variable
  subs_merged = []

  # nothing to process: avoid indexing an empty list below
  if not content_files:
    return subs_merged

  info = content_files[0]['info']
  print(f"pre-processing whole timeline -> [campaign: {info['campaign']}] [arc {info['arc']}]")

  # tokens that already are (part of) the party name: never used to enrich step (2).
  # Same list used by ppr_recap_summ, otherwise 'Mighty'/'Nein' tokens would be expanded
  # again into "Mighty Nein".
  party_name_tokens = ('vox machina', 'vox', 'mighty nein', 'mighty', 'nein')

  # 1) get substitutions of nouns that refers to the unnamed group entity for the party
  subs_1 = pre_processingEntityGroup(content_files, verbose = verbose)

  # 2) get coreference resolution substitutions
  # NOTE(review): relies on the default of `use_clusters` returning only the list - confirm
  subs_2 = pre_processingCoreference(full_text, content_files, verbose = verbose)

  # 3) merge the result from (1) & (2)
  # 3.1) enrich subs of (2) with the information coming from subs of (1);
  #      every sub of (2) ends up in the merged list exactly once, in its original order
  for sub2 in subs_2:
    words = sub2['new_word']

    if any('vm' in word.strip().lower() for word in words):
      # the "VM" abbreviation is replaced by the whole party name
      sub2['new_word'] = ['Vox Machina']
    else:
      # only the first (word, step-1 token) match is rewritten
      enriched = False
      for idx, word in enumerate(words):
        for sub1 in subs_1:
          token = sub1['token_word']
          token_key = token.lower().strip()
          if token_key in party_name_tokens:
            continue
          if token_key in word.lower().strip():
            # NOTE(review): the match is case-insensitive but the replace is not, so a
            # differently-cased word is left unchanged - confirm this is intended
            words[idx] = word.replace(token, sub1['new_word'][0])
            enriched = True
            break
        if enriched:
          break

    subs_merged.append(sub2)

  # 3.2) insert safe substitutions from step (1)
  if campaign_n == 0:
    party_words = ['party','vox machina'] #'vox'
  elif campaign_n == 1:
    party_words = ['party','mighty nein']
  else:
    raise ValueError(f"Unsupported campaign_n: {campaign_n} (expected 0 or 1)")

  for sub1 in subs_1:
    token = sub1['token_word'].lower().strip()
    if not any(party_word in token for party_word in party_words):
      continue

    # skip when the same word of the same entry is already covered
    already_in = next(
        (sub for sub in subs_merged
         if all(sub['info'].get(k) == v for k, v in sub1['info'].items())
         and sub1['id_word'][0] in sub['id_word']),
        None)
    if already_in is not None:
      print("***Already inserted***")
      print(sub1)
      print(already_in)
      continue

    # 'token_word' is a step (1) only field, drop it to match the step (2) entries
    sub1.pop('token_word')
    subs_merged.append(sub1)

  # 3.3) for entries with no substitution at all, insert also the substitution from (1)
  # (given the lack of context, this is limited to as few substitutions as possible)
  all_info_inserted = [sub['info'] for sub in subs_merged]
  for sub1 in subs_1:
    if sub1['info'] not in all_info_inserted:
      all_info_inserted.append(sub1['info'])
      sub1.pop('token_word')
      # NOTE(review): 'new_word' looks to be a list already (it is indexed with [0] above),
      # so this wrapping nests it - kept as in the original, confirm what consumers expect
      sub1['new_word'] = [sub1['new_word']]
      # add entry in head of the list (the final position is decided by the sort below)
      subs_merged.insert(0, sub1)

  # 4) sort the substitutions, priority: episode, progressive, id_sentence, id_word
  sorting_criterion = lambda x: [x['info']['episode'], x['info']['progressive'],x['id_sentence'],x['id_word']]
  subs_merged = sorted(subs_merged, key= sorting_criterion)

  if verbose:
    print("Complete list of substitution for the timeline content")
    for sub in subs_merged:
      print_table(sub)
      print()

  # 5) save array of substitutions as json file
  saveJSON(data = subs_merged, path = PATH_PPR, name= name_file +"C"+ str(info['campaign']) + "A"+ str(info['arc']), download = False)

  return subs_merged

### Launcher

In [None]:
"""
                        launch pre-processing (for each arc)
"""
# define execution variables
task_selector = [1,1,1]   # enable flags, in order: recap, summaries, timeline
arcs = [0,1]              # 0-based arc indices to process
default_arc = arc_n # to restore the original arc select in the pipeline

try:
  for arc in arcs:
    # the global arc_n is overwritten for the duration of each iteration
    arc_n = arc
    print(f"Elaborating for arc number n° {arc_n + 1}")
    # recap
    if task_selector[0]:
      content_rec  = get_textRecapSumm(PATH_RECAP, arc = arc_n + 1, verbose = False)
      subs_rec = ppr_recap_summ(content_rec, name_file = 'pprREC_',verbose = False)
    # summ
    if task_selector[1]:
      content_summ  = get_textRecapSumm(PATH_SUMMARIES, arc = arc_n + 1, verbose = False)
      subs_summ = ppr_recap_summ(content_summ,  name_file = 'pprSUMM_', verbose = False)
    # timeline
    if task_selector[2]:
      full_text_tl, content_tl = get_textTimeline(PATH_TIMELINE, arc = arc_n + 1, verbose = False)
      subs_tl = ppr_tl(full_text_tl, content_tl, verbose= False)
finally:
  # restore the arc selected in the pipeline even when one of the steps raises,
  # so that the following cells do not run against the wrong arc
  arc_n = default_arc

Elaborating for arc number n° 1
['pprTL_C1A1.json', 'pprTL_C1A2.json', 'pprTL_C1A3.json', 'pprREC_C1A1.json', 'pprREC_C1A2.json', 'pprREC_C1A3.json', 'pprSUMM_C1A1.json', 'pprSUMM_C1A2.json', 'pprSUMM_C1A3.json', 'pprREC_C2A1.json', 'pprREC_C2A2.json']
pre-processing for file -> [campaign: 2] [arc 1] [episode 1]


Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Inference:   0%|          | 0/64 [00:00<?, ?it/s]

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Inference:   0%|          | 0/64 [00:00<?, ?it/s]

Map:   0%|          | 0/24 [00:00<?, ? examples/s]

Inference:   0%|          | 0/24 [00:00<?, ?it/s]

  cos_sim = lambda x,y: np.dot(x,y)/(norm(x)*norm(y))


0)   {'id_sentence': 25, 'id_word': [6], 'token_word': 'group', 'new_word': ['Mighty Nein'], 'info': {'episode': 1, 'id_sentence': 25, 'arc': 1, 'campaign': 2}}
1)   {'id_sentence': 32, 'id_word': [14], 'token_word': 'group', 'new_word': ['Mighty Nein'], 'info': {'episode': 1, 'id_sentence': 32, 'arc': 1, 'campaign': 2}}
2)   {'id_sentence': 36, 'id_word': [14], 'token_word': 'group', 'new_word': ['Mighty Nein'], 'info': {'episode': 1, 'id_sentence': 36, 'arc': 1, 'campaign': 2}}
3)   {'id_sentence': 45, 'id_word': [2], 'token_word': 'group', 'new_word': ['Mighty Nein'], 'info': {'episode': 1, 'id_sentence': 45, 'arc': 1, 'campaign': 2}}
4)   {'id_sentence': 61, 'id_word': [1], 'token_word': 'group', 'new_word': ['Mighty Nein'], 'info': {'episode': 1, 'id_sentence': 61, 'arc': 1, 'campaign': 2}}
5)   {'id_sentence': 63, 'id_word': [1], 'token_word': 'group', 'new_word': ['Mighty Nein'], 'info': {'episode': 1, 'id_sentence': 63, 'arc': 1, 'campaign': 2}}
6)   {'id_sentence': 68, 'id_wor

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Inference:   0%|          | 0/1 [00:00<?, ?it/s]

0)   {'id_sentence': 2, 'id_word': [0], 'new_word': ['Caleb', 'Widogast', ',', 'a', 'human', 'transmutation', 'wizard'], 'info': {'episode': 1, 'id_sentence': 2, 'arc': 1, 'campaign': 2}}
1)   {'id_sentence': 3, 'id_word': [0], 'new_word': ['Caleb', 'Widogast', ',', 'a', 'human', 'transmutation', 'wizard'], 'info': {'episode': 1, 'id_sentence': 3, 'arc': 1, 'campaign': 2}}
2)   {'id_sentence': 4, 'id_word': [0], 'new_word': ['Caleb', 'Widogast', ',', 'a', 'human', 'transmutation', 'wizard'], 'info': {'episode': 1, 'id_sentence': 4, 'arc': 1, 'campaign': 2}}
3)   {'id_sentence': 5, 'id_word': [1], 'new_word': ['Caleb', 'Widogast', ',', 'a', 'human', 'transmutation', "wizard's"], 'info': {'episode': 1, 'id_sentence': 5, 'arc': 1, 'campaign': 2}}
4)   {'id_sentence': 6, 'id_word': [0], 'new_word': ['Veth', 'Brenatto', ',', 'a', 'young', 'female', 'goblin', 'rogue'], 'info': {'episode': 1, 'id_sentence': 6, 'arc': 1, 'campaign': 2}}
5)   {'id_sentence': 8, 'id_word': [19], 'new_word': ['Ca

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Inference:   0%|          | 0/64 [00:00<?, ?it/s]

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Inference:   0%|          | 0/64 [00:00<?, ?it/s]

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Inference:   0%|          | 0/64 [00:00<?, ?it/s]

Map:   0%|          | 0/7 [00:00<?, ? examples/s]

Inference:   0%|          | 0/7 [00:00<?, ?it/s]

0)   {'id_sentence': 3, 'id_word': [7], 'token_word': 'group', 'new_word': ['Mighty Nein'], 'info': {'episode': 2, 'id_sentence': 3, 'arc': 1, 'campaign': 2}}
1)   {'id_sentence': 9, 'id_word': [27], 'token_word': 'group', 'new_word': ['Mighty Nein'], 'info': {'episode': 2, 'id_sentence': 9, 'arc': 1, 'campaign': 2}}
2)   {'id_sentence': 17, 'id_word': [1], 'token_word': 'group', 'new_word': ['Mighty Nein'], 'info': {'episode': 2, 'id_sentence': 17, 'arc': 1, 'campaign': 2}}
3)   {'id_sentence': 25, 'id_word': [7], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 2, 'id_sentence': 25, 'arc': 1, 'campaign': 2}}
4)   {'id_sentence': 25, 'id_word': [29], 'token_word': 'group', 'new_word': ['Mighty Nein'], 'info': {'episode': 2, 'id_sentence': 25, 'arc': 1, 'campaign': 2}}
5)   {'id_sentence': 28, 'id_word': [4], 'token_word': 'group', 'new_word': ['Mighty Nein'], 'info': {'episode': 2, 'id_sentence': 28, 'arc': 1, 'campaign': 2}}
6)   {'id_sentence': 31, 'id_word': 

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Inference:   0%|          | 0/1 [00:00<?, ?it/s]

0)   {'id_sentence': 0, 'id_word': [2], 'new_word': ["'s's"], 'info': {'episode': 2, 'id_sentence': 0, 'arc': 1, 'campaign': 2}}
1)   {'id_sentence': 0, 'id_word': [18], 'new_word': ['Beauregard'], 'info': {'episode': 2, 'id_sentence': 0, 'arc': 1, 'campaign': 2}}
2)   {'id_sentence': 1, 'id_word': [7], 'new_word': ['the', 'guard', 'she', 'tricked'], 'info': {'episode': 2, 'id_sentence': 1, 'arc': 1, 'campaign': 2}}
3)   {'id_sentence': 1, 'id_word': [12], 'new_word': ['the', 'investigation'], 'info': {'episode': 2, 'id_sentence': 1, 'arc': 1, 'campaign': 2}}
4)   {'id_sentence': 3, 'id_word': [7, 8], 'new_word': ['remaining', 'members', 'of', 'the', 'troupe', 'the', 'remaining', 'members', 'of', 'the', 'troupe'], 'info': {'episode': 2, 'id_sentence': 3, 'arc': 1, 'campaign': 2}}
5)   {'id_sentence': 5, 'id_word': [0], 'new_word': ['the'], 'info': {'episode': 2, 'id_sentence': 5, 'arc': 1, 'campaign': 2}}
6)   {'id_sentence': 5, 'id_word': [8], 'new_word': ['the', 'remaining', 'members

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Inference:   0%|          | 0/64 [00:00<?, ?it/s]

Map:   0%|          | 0/56 [00:00<?, ? examples/s]

Inference:   0%|          | 0/56 [00:00<?, ?it/s]

0)   {'id_sentence': 23, 'id_word': [27], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 3, 'id_sentence': 23, 'arc': 1, 'campaign': 2}}
1)   {'id_sentence': 24, 'id_word': [22], 'token_word': 'group', 'new_word': ['Mighty Nein'], 'info': {'episode': 3, 'id_sentence': 24, 'arc': 1, 'campaign': 2}}
2)   {'id_sentence': 30, 'id_word': [15], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 3, 'id_sentence': 30, 'arc': 1, 'campaign': 2}}
3)   {'id_sentence': 34, 'id_word': [11], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 3, 'id_sentence': 34, 'arc': 1, 'campaign': 2}}
4)   {'id_sentence': 34, 'id_word': [11], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 3, 'id_sentence': 34, 'arc': 1, 'campaign': 2}}
5)   {'id_sentence': 35, 'id_word': [9], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 3, 'id_sentence': 35, 'arc': 1, 'campaign': 2}}
6)   {'id_sentence': 37, 'id_

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Inference:   0%|          | 0/1 [00:00<?, ?it/s]

0)   {'id_sentence': 0, 'id_word': [6], 'new_word': ["'s's"], 'info': {'episode': 3, 'id_sentence': 0, 'arc': 1, 'campaign': 2}}
1)   {'id_sentence': 0, 'id_word': [14], 'new_word': ['Jester', 'Lavorre', "'s"], 'info': {'episode': 3, 'id_sentence': 0, 'arc': 1, 'campaign': 2}}
2)   {'id_sentence': 4, 'id_word': [0], 'new_word': ['Beauregard', 'Lionett'], 'info': {'episode': 3, 'id_sentence': 4, 'arc': 1, 'campaign': 2}}
3)   {'id_sentence': 4, 'id_word': [6, 7], 'new_word': ["zombie's"], 'info': {'episode': 3, 'id_sentence': 4, 'arc': 1, 'campaign': 2}}
4)   {'id_sentence': 4, 'id_word': [11], 'new_word': ['Beauregard', 'Lionett'], 'info': {'episode': 3, 'id_sentence': 4, 'arc': 1, 'campaign': 2}}
5)   {'id_sentence': 4, 'id_word': [13], 'new_word': ['Beauregard', "Lionett's"], 'info': {'episode': 3, 'id_sentence': 4, 'arc': 1, 'campaign': 2}}
6)   {'id_sentence': 5, 'id_word': [3], 'new_word': ['Beauregard', 'Lionett'], 'info': {'episode': 3, 'id_sentence': 5, 'arc': 1, 'campaign': 2}

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Inference:   0%|          | 0/64 [00:00<?, ?it/s]

Map:   0%|          | 0/28 [00:00<?, ? examples/s]

Inference:   0%|          | 0/28 [00:00<?, ?it/s]

0)   {'id_sentence': 15, 'id_word': [18], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 4, 'id_sentence': 15, 'arc': 1, 'campaign': 2}}
1)   {'id_sentence': 19, 'id_word': [7], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 4, 'id_sentence': 19, 'arc': 1, 'campaign': 2}}
2)   {'id_sentence': 24, 'id_word': [5], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 4, 'id_sentence': 24, 'arc': 1, 'campaign': 2}}
3)   {'id_sentence': 35, 'id_word': [8], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 4, 'id_sentence': 35, 'arc': 1, 'campaign': 2}}
4)   {'id_sentence': 37, 'id_word': [11], 'token_word': 'group', 'new_word': ['Mighty Nein'], 'info': {'episode': 4, 'id_sentence': 37, 'arc': 1, 'campaign': 2}}
5)   {'id_sentence': 38, 'id_word': [1], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 4, 'id_sentence': 38, 'arc': 1, 'campaign': 2}}
6)   {'id_sentence': 39, 'id_wor

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Inference:   0%|          | 0/1 [00:00<?, ?it/s]

0)   {'id_sentence': 0, 'id_word': [18], 'new_word': ['Fjord', 'and', 'Caleb', 'Widogast'], 'info': {'episode': 4, 'id_sentence': 0, 'arc': 1, 'campaign': 2}}
1)   {'id_sentence': 3, 'id_word': [8], 'new_word': ['the', 'still', '-', 'unconscious', 'Toya'], 'info': {'episode': 4, 'id_sentence': 3, 'arc': 1, 'campaign': 2}}
2)   {'id_sentence': 4, 'id_word': [0], 'new_word': ['Mollymauk', 'Tealeaf'], 'info': {'episode': 4, 'id_sentence': 4, 'arc': 1, 'campaign': 2}}
3)   {'id_sentence': 4, 'id_word': [7], 'new_word': ['the', 'still', '-', 'unconscious', 'Toya'], 'info': {'episode': 4, 'id_sentence': 4, 'arc': 1, 'campaign': 2}}
4)   {'id_sentence': 4, 'id_word': [10], 'new_word': ['Mollymauk', 'Tealeaf'], 'info': {'episode': 4, 'id_sentence': 4, 'arc': 1, 'campaign': 2}}
5)   {'id_sentence': 4, 'id_word': [13, 14], 'new_word': ['the', 'dead', 'nergaliid', "'s's"], 'info': {'episode': 4, 'id_sentence': 4, 'arc': 1, 'campaign': 2}}
6)   {'id_sentence': 5, 'id_word': [16, 16], 'new_word': [

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

Inference:   0%|          | 0/40 [00:00<?, ?it/s]

0)   {'id_sentence': 0, 'id_word': [4], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 5, 'id_sentence': 0, 'arc': 1, 'campaign': 2}}
1)   {'id_sentence': 1, 'id_word': [1], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 5, 'id_sentence': 1, 'arc': 1, 'campaign': 2}}
2)   {'id_sentence': 5, 'id_word': [1], 'token_word': 'group', 'new_word': ['Mighty Nein'], 'info': {'episode': 5, 'id_sentence': 5, 'arc': 1, 'campaign': 2}}
3)   {'id_sentence': 9, 'id_word': [1], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 5, 'id_sentence': 9, 'arc': 1, 'campaign': 2}}
4)   {'id_sentence': 10, 'id_word': [4], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 5, 'id_sentence': 10, 'arc': 1, 'campaign': 2}}
5)   {'id_sentence': 18, 'id_word': [4], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 5, 'id_sentence': 18, 'arc': 1, 'campaign': 2}}
6)   {'id_sentence': 22, 'id_word': [13], 

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Inference:   0%|          | 0/1 [00:00<?, ?it/s]

0)   {'id_sentence': 0, 'id_word': [5], 'new_word': ["'s's"], 'info': {'episode': 5, 'id_sentence': 0, 'arc': 1, 'campaign': 2}}
1)   {'id_sentence': 0, 'id_word': [13], 'new_word': ['The', 'horse', 'pulling', 'the', 'party', "'s", "cart's"], 'info': {'episode': 5, 'id_sentence': 0, 'arc': 1, 'campaign': 2}}
2)   {'id_sentence': 1, 'id_word': [2, 3], 'new_word': ["'s", 'cart'], 'info': {'episode': 5, 'id_sentence': 1, 'arc': 1, 'campaign': 2}}
3)   {'id_sentence': 2, 'id_word': [6], 'new_word': ['the', 'party', "'s"], 'info': {'episode': 5, 'id_sentence': 2, 'arc': 1, 'campaign': 2}}
4)   {'id_sentence': 2, 'id_word': [15], 'new_word': ['Amber', 'Road'], 'info': {'episode': 5, 'id_sentence': 2, 'arc': 1, 'campaign': 2}}
5)   {'id_sentence': 4, 'id_word': [0], 'new_word': ['Beauregard', 'Lionett', 'and', 'Jester', 'Lavorre'], 'info': {'episode': 5, 'id_sentence': 4, 'arc': 1, 'campaign': 2}}
6)   {'id_sentence': 4, 'id_word': [10], 'new_word': ['Beauregard', 'Lionett', 'and', 'Jester', 

Map:   0%|          | 0/36 [00:00<?, ? examples/s]

Inference:   0%|          | 0/36 [00:00<?, ?it/s]

0)   {'id_sentence': 0, 'id_word': [1], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 6, 'id_sentence': 0, 'arc': 1, 'campaign': 2}}
1)   {'id_sentence': 3, 'id_word': [16], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 6, 'id_sentence': 3, 'arc': 1, 'campaign': 2}}
2)   {'id_sentence': 6, 'id_word': [6], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 6, 'id_sentence': 6, 'arc': 1, 'campaign': 2}}
3)   {'id_sentence': 9, 'id_word': [1], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 6, 'id_sentence': 9, 'arc': 1, 'campaign': 2}}
4)   {'id_sentence': 12, 'id_word': [1], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 6, 'id_sentence': 12, 'arc': 1, 'campaign': 2}}
5)   {'id_sentence': 16, 'id_word': [10], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 6, 'id_sentence': 16, 'arc': 1, 'campaign': 2}}
6)   {'id_sentence': 19, 'id_word': [15]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Inference:   0%|          | 0/1 [00:00<?, ?it/s]

0)   {'id_sentence': 0, 'id_word': [14], 'new_word': ['The', "party's"], 'info': {'episode': 6, 'id_sentence': 0, 'arc': 1, 'campaign': 2}}
1)   {'id_sentence': 1, 'id_word': [8], 'new_word': ['Mollymauk', "Tealeaf's"], 'info': {'episode': 6, 'id_sentence': 1, 'arc': 1, 'campaign': 2}}
2)   {'id_sentence': 1, 'id_word': [11], 'new_word': ['Mollymauk', "Tealeaf's"], 'info': {'episode': 6, 'id_sentence': 1, 'arc': 1, 'campaign': 2}}
3)   {'id_sentence': 1, 'id_word': [16], 'new_word': ['Mollymauk', 'Tealeaf'], 'info': {'episode': 6, 'id_sentence': 1, 'arc': 1, 'campaign': 2}}
4)   {'id_sentence': 2, 'id_word': [1], 'new_word': ['The', 'party'], 'info': {'episode': 6, 'id_sentence': 2, 'arc': 1, 'campaign': 2}}
5)   {'id_sentence': 2, 'id_word': [18, 19], 'new_word': ['Alfield'], 'info': {'episode': 6, 'id_sentence': 2, 'arc': 1, 'campaign': 2}}
6)   {'id_sentence': 3, 'id_word': [12], 'new_word': ['the', 'halfling', 'Thadeus', "Candleglow's"], 'info': {'episode': 6, 'id_sentence': 3, 'ar

Map:   0%|          | 0/51 [00:00<?, ? examples/s]

Inference:   0%|          | 0/51 [00:00<?, ?it/s]

0)   {'id_sentence': 0, 'id_word': [1], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 7, 'id_sentence': 0, 'arc': 1, 'campaign': 2}}
1)   {'id_sentence': 2, 'id_word': [6], 'token_word': 'group', 'new_word': ['Mighty Nein'], 'info': {'episode': 7, 'id_sentence': 2, 'arc': 1, 'campaign': 2}}
2)   {'id_sentence': 6, 'id_word': [14], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 7, 'id_sentence': 6, 'arc': 1, 'campaign': 2}}
3)   {'id_sentence': 10, 'id_word': [1], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 7, 'id_sentence': 10, 'arc': 1, 'campaign': 2}}
4)   {'id_sentence': 27, 'id_word': [1], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 7, 'id_sentence': 27, 'arc': 1, 'campaign': 2}}
5)   {'id_sentence': 29, 'id_word': [2], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 7, 'id_sentence': 29, 'arc': 1, 'campaign': 2}}
6)   {'id_sentence': 37, 'id_word': [9]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Inference:   0%|          | 0/1 [00:00<?, ?it/s]

0)   {'id_sentence': 0, 'id_word': [12], 'new_word': ['The', 'party'], 'info': {'episode': 7, 'id_sentence': 0, 'arc': 1, 'campaign': 2}}
1)   {'id_sentence': 1, 'id_word': [0], 'new_word': ['The', 'party'], 'info': {'episode': 7, 'id_sentence': 1, 'arc': 1, 'campaign': 2}}
2)   {'id_sentence': 2, 'id_word': [5, 6], 'new_word': ['The', 'party'], 'info': {'episode': 7, 'id_sentence': 2, 'arc': 1, 'campaign': 2}}
3)   {'id_sentence': 3, 'id_word': [2], 'new_word': ['The', 'party'], 'info': {'episode': 7, 'id_sentence': 3, 'arc': 1, 'campaign': 2}}
4)   {'id_sentence': 3, 'id_word': [6], 'new_word': ['The', "party's"], 'info': {'episode': 7, 'id_sentence': 3, 'arc': 1, 'campaign': 2}}
5)   {'id_sentence': 3, 'id_word': [14, 15], 'new_word': ['a', 'second', 'large', 'chamber', 'containing', 'two', 'hyenas', ',', 'a', 'skeletal', 'gnoll', ',', 'and', 'two', 'regular', 'gnolls', 'bickering', 'over', 'a', 'small', 'human', 'child'], 'info': {'episode': 7, 'id_sentence': 3, 'arc': 1, 'campaign

Map:   0%|          | 0/51 [00:00<?, ? examples/s]

Inference:   0%|          | 0/51 [00:00<?, ?it/s]

0)   {'id_sentence': 0, 'id_word': [1], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 8, 'id_sentence': 0, 'arc': 1, 'campaign': 2}}
1)   {'id_sentence': 2, 'id_word': [13], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 8, 'id_sentence': 2, 'arc': 1, 'campaign': 2}}
2)   {'id_sentence': 4, 'id_word': [1], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 8, 'id_sentence': 4, 'arc': 1, 'campaign': 2}}
3)   {'id_sentence': 12, 'id_word': [9], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 8, 'id_sentence': 12, 'arc': 1, 'campaign': 2}}
4)   {'id_sentence': 13, 'id_word': [5], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 8, 'id_sentence': 13, 'arc': 1, 'campaign': 2}}
5)   {'id_sentence': 13, 'id_word': [12, 13], 'token_word': 'Mighty Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 8, 'id_sentence': 13, 'arc': 1, 'campaign': 2}}
6)   {'id_sentence': 16, 'id

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Inference:   0%|          | 0/1 [00:00<?, ?it/s]

0)   {'id_sentence': 0, 'id_word': [12], 'new_word': ["'s's"], 'info': {'episode': 8, 'id_sentence': 0, 'arc': 1, 'campaign': 2}}
1)   {'id_sentence': 1, 'id_word': [0], 'new_word': ['Beauregard,', 'Fjord,', 'Frumpkin,', 'Jester', 'Lavorre,', 'Nott,', 'Mollymauk', 'Tealeaf'], 'info': {'episode': 8, 'id_sentence': 1, 'arc': 1, 'campaign': 2}}
2)   {'id_sentence': 2, 'id_word': [1, 2], 'new_word': ['several', 'hours'], 'info': {'episode': 8, 'id_sentence': 2, 'arc': 1, 'campaign': 2}}
3)   {'id_sentence': 2, 'id_word': [12], 'new_word': ['The'], 'info': {'episode': 8, 'id_sentence': 2, 'arc': 1, 'campaign': 2}}
4)   {'id_sentence': 3, 'id_word': [4], 'new_word': ['Beauregard', "Lionett's"], 'info': {'episode': 8, 'id_sentence': 3, 'arc': 1, 'campaign': 2}}
5)   {'id_sentence': 3, 'id_word': [8], 'new_word': ['Beauregard', 'Lionett'], 'info': {'episode': 8, 'id_sentence': 3, 'arc': 1, 'campaign': 2}}
6)   {'id_sentence': 3, 'id_word': [12], 'new_word': ['Beauregard', 'Lionett'], 'info': {

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Inference:   0%|          | 0/64 [00:00<?, ?it/s]

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Inference:   0%|          | 0/64 [00:00<?, ?it/s]

Map:   0%|          | 0/41 [00:00<?, ? examples/s]

Inference:   0%|          | 0/41 [00:00<?, ?it/s]

0)   {'id_sentence': 0, 'id_word': [1], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 9, 'id_sentence': 0, 'arc': 1, 'campaign': 2}}
1)   {'id_sentence': 2, 'id_word': [11], 'token_word': 'Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 9, 'id_sentence': 2, 'arc': 1, 'campaign': 2}}
2)   {'id_sentence': 8, 'id_word': [4], 'token_word': 'group', 'new_word': ['Mighty Nein'], 'info': {'episode': 9, 'id_sentence': 8, 'arc': 1, 'campaign': 2}}
3)   {'id_sentence': 11, 'id_word': [12], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 9, 'id_sentence': 11, 'arc': 1, 'campaign': 2}}
4)   {'id_sentence': 13, 'id_word': [11], 'token_word': 'group', 'new_word': ['Mighty Nein'], 'info': {'episode': 9, 'id_sentence': 13, 'arc': 1, 'campaign': 2}}
5)   {'id_sentence': 13, 'id_word': [11], 'token_word': 'group', 'new_word': ['Mighty Nein'], 'info': {'episode': 9, 'id_sentence': 13, 'arc': 1, 'campaign': 2}}
6)   {'id_sentence': 15, 'id_word': [

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Inference:   0%|          | 0/1 [00:00<?, ?it/s]

0)   {'id_sentence': 1, 'id_word': [4], 'new_word': ['Beauregard', 'Lionett', 'and', 'Fjord'], 'info': {'episode': 9, 'id_sentence': 1, 'arc': 1, 'campaign': 2}}
1)   {'id_sentence': 2, 'id_word': [0], 'new_word': ['The', 'party'], 'info': {'episode': 9, 'id_sentence': 2, 'arc': 1, 'campaign': 2}}
2)   {'id_sentence': 4, 'id_word': [14], 'new_word': ['Beauregard', 'Lionett', 'and', 'Fjord'], 'info': {'episode': 9, 'id_sentence': 4, 'arc': 1, 'campaign': 2}}
3)   {'id_sentence': 4, 'id_word': [18], 'new_word': ['Beauregard', 'Lionett', 'and', 'Fjord'], 'info': {'episode': 9, 'id_sentence': 4, 'arc': 1, 'campaign': 2}}
4)   {'id_sentence': 4, 'id_word': [20], 'new_word': ['Beauregard'], 'info': {'episode': 9, 'id_sentence': 4, 'arc': 1, 'campaign': 2}}
5)   {'id_sentence': 4, 'id_word': [24], 'new_word': ['Beauregard', 'Lionett', 'and', 'Fjord'], 'info': {'episode': 9, 'id_sentence': 4, 'arc': 1, 'campaign': 2}}
6)   {'id_sentence': 4, 'id_word': [26], 'new_word': ['Beauregard', 'Lionett

Map:   0%|          | 0/47 [00:00<?, ? examples/s]

Inference:   0%|          | 0/47 [00:00<?, ?it/s]

0)   {'id_sentence': 0, 'id_word': [10], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 10, 'id_sentence': 0, 'arc': 1, 'campaign': 2}}
1)   {'id_sentence': 3, 'id_word': [1], 'token_word': 'group', 'new_word': ['Mighty Nein'], 'info': {'episode': 10, 'id_sentence': 3, 'arc': 1, 'campaign': 2}}
2)   {'id_sentence': 5, 'id_word': [1], 'token_word': 'group', 'new_word': ['Mighty Nein'], 'info': {'episode': 10, 'id_sentence': 5, 'arc': 1, 'campaign': 2}}
3)   {'id_sentence': 7, 'id_word': [1], 'token_word': 'group', 'new_word': ['Mighty Nein'], 'info': {'episode': 10, 'id_sentence': 7, 'arc': 1, 'campaign': 2}}
4)   {'id_sentence': 27, 'id_word': [12], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 10, 'id_sentence': 27, 'arc': 1, 'campaign': 2}}
5)   {'id_sentence': 30, 'id_word': [17], 'token_word': 'group', 'new_word': ['Mighty Nein'], 'info': {'episode': 10, 'id_sentence': 30, 'arc': 1, 'campaign': 2}}
6)   {'id_sentence': 34, 'id_word

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Inference:   0%|          | 0/1 [00:00<?, ?it/s]

0)   {'id_sentence': 0, 'id_word': [18], 'new_word': ['the', 'party'], 'info': {'episode': 10, 'id_sentence': 0, 'arc': 1, 'campaign': 2}}
1)   {'id_sentence': 1, 'id_word': [22], 'new_word': ['Caleb', "Widogast's"], 'info': {'episode': 10, 'id_sentence': 1, 'arc': 1, 'campaign': 2}}
2)   {'id_sentence': 2, 'id_word': [0], 'new_word': ['the', 'party'], 'info': {'episode': 10, 'id_sentence': 2, 'arc': 1, 'campaign': 2}}
3)   {'id_sentence': 2, 'id_word': [9], 'new_word': ['the', 'party'], 'info': {'episode': 10, 'id_sentence': 2, 'arc': 1, 'campaign': 2}}
4)   {'id_sentence': 2, 'id_word': [14, 15, 16, 17, 18], 'new_word': ['The', 'Traveler', ',', 'and', 'the', 'guard'], 'info': {'episode': 10, 'id_sentence': 2, 'arc': 1, 'campaign': 2}}
5)   {'id_sentence': 2, 'id_word': [20], 'new_word': ['the', 'party'], 'info': {'episode': 10, 'id_sentence': 2, 'arc': 1, 'campaign': 2}}
6)   {'id_sentence': 3, 'id_word': [0, 1], 'new_word': ['the', 'party'], 'info': {'episode': 10, 'id_sentence': 3,

Map:   0%|          | 0/43 [00:00<?, ? examples/s]

Inference:   0%|          | 0/43 [00:00<?, ?it/s]

0)   {'id_sentence': 0, 'id_word': [1], 'token_word': 'group', 'new_word': ['Mighty Nein'], 'info': {'episode': 11, 'id_sentence': 0, 'arc': 1, 'campaign': 2}}
1)   {'id_sentence': 1, 'id_word': [8, 9], 'token_word': 'Mighty Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 11, 'id_sentence': 1, 'arc': 1, 'campaign': 2}}
2)   {'id_sentence': 21, 'id_word': [3], 'token_word': 'group', 'new_word': ['Mighty Nein'], 'info': {'episode': 11, 'id_sentence': 21, 'arc': 1, 'campaign': 2}}
3)   {'id_sentence': 22, 'id_word': [3, 4], 'token_word': 'Mighty Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 11, 'id_sentence': 22, 'arc': 1, 'campaign': 2}}
4)   {'id_sentence': 23, 'id_word': [3], 'token_word': 'group', 'new_word': ['Mighty Nein'], 'info': {'episode': 11, 'id_sentence': 23, 'arc': 1, 'campaign': 2}}
5)   {'id_sentence': 24, 'id_word': [3, 4], 'token_word': 'Mighty Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 11, 'id_sentence': 24, 'arc': 1, 'campaign': 2}}
6)   

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Inference:   0%|          | 0/1 [00:00<?, ?it/s]

0)   {'id_sentence': 1, 'id_word': [1], 'new_word': ['The', 'group'], 'info': {'episode': 11, 'id_sentence': 1, 'arc': 1, 'campaign': 2}}
1)   {'id_sentence': 1, 'id_word': [8, 9], 'new_word': ['group'], 'info': {'episode': 11, 'id_sentence': 1, 'arc': 1, 'campaign': 2}}
2)   {'id_sentence': 1, 'id_word': [12, 13], 'new_word': ['a', 'thunderstorm'], 'info': {'episode': 11, 'id_sentence': 1, 'arc': 1, 'campaign': 2}}
3)   {'id_sentence': 3, 'id_word': [12], 'new_word': ['Pumat', 'Sol'], 'info': {'episode': 11, 'id_sentence': 3, 'arc': 1, 'campaign': 2}}
4)   {'id_sentence': 3, 'id_word': [22], 'new_word': ['Pumat', 'Sol'], 'info': {'episode': 11, 'id_sentence': 3, 'arc': 1, 'campaign': 2}}
5)   {'id_sentence': 4, 'id_word': [4], 'new_word': ['Pumat', 'Sol'], 'info': {'episode': 11, 'id_sentence': 4, 'arc': 1, 'campaign': 2}}
6)   {'id_sentence': 5, 'id_word': [0], 'new_word': ['Pumat', 'Sol'], 'info': {'episode': 11, 'id_sentence': 5, 'arc': 1, 'campaign': 2}}
7)   {'id_sentence': 5, 'i

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Inference:   0%|          | 0/64 [00:00<?, ?it/s]

Map:   0%|          | 0/19 [00:00<?, ? examples/s]

Inference:   0%|          | 0/19 [00:00<?, ?it/s]

0)   {'id_sentence': 0, 'id_word': [1, 2], 'token_word': 'Mighty Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 12, 'id_sentence': 0, 'arc': 1, 'campaign': 2}}
1)   {'id_sentence': 4, 'id_word': [17, 18], 'token_word': 'Mighty Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 12, 'id_sentence': 4, 'arc': 1, 'campaign': 2}}
2)   {'id_sentence': 10, 'id_word': [6], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 12, 'id_sentence': 10, 'arc': 1, 'campaign': 2}}
3)   {'id_sentence': 11, 'id_word': [5], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 12, 'id_sentence': 11, 'arc': 1, 'campaign': 2}}
4)   {'id_sentence': 11, 'id_word': [5], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 12, 'id_sentence': 11, 'arc': 1, 'campaign': 2}}
5)   {'id_sentence': 30, 'id_word': [2, 3], 'token_word': 'Mighty Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 12, 'id_sentence': 30, 'arc': 1, 'campaign': 2}}
6) 

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Inference:   0%|          | 0/1 [00:00<?, ?it/s]

0)   {'id_sentence': 0, 'id_word': [4], 'new_word': ['The', 'Mighty', "Nein's"], 'info': {'episode': 12, 'id_sentence': 0, 'arc': 1, 'campaign': 2}}
1)   {'id_sentence': 0, 'id_word': [8], 'new_word': ['The', 'Mighty', 'Nein'], 'info': {'episode': 12, 'id_sentence': 0, 'arc': 1, 'campaign': 2}}
2)   {'id_sentence': 0, 'id_word': [31], 'new_word': ['The', 'Mighty', "Nein's"], 'info': {'episode': 12, 'id_sentence': 0, 'arc': 1, 'campaign': 2}}
3)   {'id_sentence': 1, 'id_word': [9, 10, 11], 'new_word': ["Sutan's"], 'info': {'episode': 12, 'id_sentence': 1, 'arc': 1, 'campaign': 2}}
4)   {'id_sentence': 1, 'id_word': [47, 45], 'new_word': ['Diedric'], 'info': {'episode': 12, 'id_sentence': 1, 'arc': 1, 'campaign': 2}}
5)   {'id_sentence': 2, 'id_word': [3, 4], 'new_word': ['High', "Richter's"], 'info': {'episode': 12, 'id_sentence': 2, 'arc': 1, 'campaign': 2}}
6)   {'id_sentence': 2, 'id_word': [7], 'new_word': ['The', 'Mighty', 'Nein'], 'info': {'episode': 12, 'id_sentence': 2, 'arc': 1

Map:   0%|          | 0/59 [00:00<?, ? examples/s]

Inference:   0%|          | 0/59 [00:00<?, ?it/s]

0)   {'id_sentence': 0, 'id_word': [1, 2], 'token_word': 'Mighty Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 13, 'id_sentence': 0, 'arc': 1, 'campaign': 2}}
1)   {'id_sentence': 5, 'id_word': [21], 'token_word': 'group', 'new_word': ['Mighty Nein'], 'info': {'episode': 13, 'id_sentence': 5, 'arc': 1, 'campaign': 2}}
2)   {'id_sentence': 9, 'id_word': [5, 6], 'token_word': 'Mighty Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 13, 'id_sentence': 9, 'arc': 1, 'campaign': 2}}
3)   {'id_sentence': 11, 'id_word': [1, 2], 'token_word': 'Mighty Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 13, 'id_sentence': 11, 'arc': 1, 'campaign': 2}}
4)   {'id_sentence': 13, 'id_word': [1], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 13, 'id_sentence': 13, 'arc': 1, 'campaign': 2}}
5)   {'id_sentence': 16, 'id_word': [6], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 13, 'id_sentence': 16, 'arc': 1, 'campaign': 2}}
6)   {

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Inference:   0%|          | 0/1 [00:00<?, ?it/s]

0)   {'id_sentence': 1, 'id_word': [0], 'new_word': ['The', 'Mighty', 'Nein'], 'info': {'episode': 13, 'id_sentence': 1, 'arc': 1, 'campaign': 2}}
1)   {'id_sentence': 1, 'id_word': [2], 'new_word': ['the', 'drow'], 'info': {'episode': 13, 'id_sentence': 1, 'arc': 1, 'campaign': 2}}
2)   {'id_sentence': 1, 'id_word': [6], 'new_word': ['the', 'drow'], 'info': {'episode': 13, 'id_sentence': 1, 'arc': 1, 'campaign': 2}}
3)   {'id_sentence': 1, 'id_word': [8], 'new_word': ['the', "drow's"], 'info': {'episode': 13, 'id_sentence': 1, 'arc': 1, 'campaign': 2}}
4)   {'id_sentence': 1, 'id_word': [11], 'new_word': ['the', "drow's"], 'info': {'episode': 13, 'id_sentence': 1, 'arc': 1, 'campaign': 2}}
5)   {'id_sentence': 2, 'id_word': [1], 'new_word': ['the', "drow's"], 'info': {'episode': 13, 'id_sentence': 2, 'arc': 1, 'campaign': 2}}
6)   {'id_sentence': 3, 'id_word': [6], 'new_word': ['a'], 'info': {'episode': 13, 'id_sentence': 3, 'arc': 1, 'campaign': 2}}
7)   {'id_sentence': 3, 'id_word':

Map:   0%|          | 0/54 [00:00<?, ? examples/s]

Inference:   0%|          | 0/54 [00:00<?, ?it/s]

0)   {'id_sentence': 0, 'id_word': [19], 'token_word': 'group', 'new_word': ['Mighty Nein'], 'info': {'episode': 14, 'id_sentence': 0, 'arc': 1, 'campaign': 2}}
1)   {'id_sentence': 5, 'id_word': [7], 'token_word': 'group', 'new_word': ['Mighty Nein'], 'info': {'episode': 14, 'id_sentence': 5, 'arc': 1, 'campaign': 2}}
2)   {'id_sentence': 6, 'id_word': [5], 'token_word': 'group', 'new_word': ['Mighty Nein'], 'info': {'episode': 14, 'id_sentence': 6, 'arc': 1, 'campaign': 2}}
3)   {'id_sentence': 9, 'id_word': [3], 'token_word': 'group', 'new_word': ['Mighty Nein'], 'info': {'episode': 14, 'id_sentence': 9, 'arc': 1, 'campaign': 2}}
4)   {'id_sentence': 12, 'id_word': [1], 'token_word': 'group', 'new_word': ['Mighty Nein'], 'info': {'episode': 14, 'id_sentence': 12, 'arc': 1, 'campaign': 2}}
5)   {'id_sentence': 18, 'id_word': [18], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 14, 'id_sentence': 18, 'arc': 1, 'campaign': 2}}
6)   {'id_sentence': 19, 'id_word'

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Inference:   0%|          | 0/1 [00:00<?, ?it/s]

0)   {'id_sentence': 1, 'id_word': [11], 'new_word': ['Mollymauk', "Tealeaf's"], 'info': {'episode': 14, 'id_sentence': 1, 'arc': 1, 'campaign': 2}}
1)   {'id_sentence': 4, 'id_word': [0], 'new_word': ['Mollymauk', 'Tealeaf'], 'info': {'episode': 14, 'id_sentence': 4, 'arc': 1, 'campaign': 2}}
2)   {'id_sentence': 4, 'id_word': [2], 'new_word': ['Fjord'], 'info': {'episode': 14, 'id_sentence': 4, 'arc': 1, 'campaign': 2}}
3)   {'id_sentence': 5, 'id_word': [9], 'new_word': ['Yasha', 'Nydoorin'], 'info': {'episode': 14, 'id_sentence': 5, 'arc': 1, 'campaign': 2}}
4)   {'id_sentence': 5, 'id_word': [16], 'new_word': ['Yasha', 'Nydoorin'], 'info': {'episode': 14, 'id_sentence': 5, 'arc': 1, 'campaign': 2}}
5)   {'id_sentence': 5, 'id_word': [18], 'new_word': ['the', 'group'], 'info': {'episode': 14, 'id_sentence': 5, 'arc': 1, 'campaign': 2}}
6)   {'id_sentence': 6, 'id_word': [17], 'new_word': ['Mollymauk', "Tealeaf's"], 'info': {'episode': 14, 'id_sentence': 6, 'arc': 1, 'campaign': 2}}

Map:   0%|          | 0/54 [00:00<?, ? examples/s]

Inference:   0%|          | 0/54 [00:00<?, ?it/s]

0)   {'id_sentence': 0, 'id_word': [10, 11], 'token_word': 'Mighty Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 15, 'id_sentence': 0, 'arc': 1, 'campaign': 2}}
1)   {'id_sentence': 4, 'id_word': [3], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 15, 'id_sentence': 4, 'arc': 1, 'campaign': 2}}
2)   {'id_sentence': 13, 'id_word': [4], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 15, 'id_sentence': 13, 'arc': 1, 'campaign': 2}}
3)   {'id_sentence': 28, 'id_word': [4], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 15, 'id_sentence': 28, 'arc': 1, 'campaign': 2}}
4)   {'id_sentence': 29, 'id_word': [1], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 15, 'id_sentence': 29, 'arc': 1, 'campaign': 2}}
5)   {'id_sentence': 32, 'id_word': [1], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 15, 'id_sentence': 32, 'arc': 1, 'campaign': 2}}
6)   {'id_sentence': 

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Inference:   0%|          | 0/1 [00:00<?, ?it/s]

0)   {'id_sentence': 2, 'id_word': [5], 'new_word': ['the', 'party'], 'info': {'episode': 15, 'id_sentence': 2, 'arc': 1, 'campaign': 2}}
1)   {'id_sentence': 3, 'id_word': [35], 'new_word': ['the', 'party'], 'info': {'episode': 15, 'id_sentence': 3, 'arc': 1, 'campaign': 2}}
2)   {'id_sentence': 3, 'id_word': [64], 'new_word': ['the', "party's"], 'info': {'episode': 15, 'id_sentence': 3, 'arc': 1, 'campaign': 2}}
3)   {'id_sentence': 3, 'id_word': [67], 'new_word': ['the', "party's"], 'info': {'episode': 15, 'id_sentence': 3, 'arc': 1, 'campaign': 2}}
4)   {'id_sentence': 4, 'id_word': [8], 'new_word': ['the', 'party'], 'info': {'episode': 15, 'id_sentence': 4, 'arc': 1, 'campaign': 2}}
5)   {'id_sentence': 5, 'id_word': [30], 'new_word': ['a', 'pair', 'of', 'slow', '-', 'moving', 'rock', 'monsters'], 'info': {'episode': 15, 'id_sentence': 5, 'arc': 1, 'campaign': 2}}
6)   {'id_sentence': 6, 'id_word': [1], 'new_word': ['the', 'boats'], 'info': {'episode': 15, 'id_sentence': 6, 'arc':

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

Inference:   0%|          | 0/60 [00:00<?, ?it/s]

0)   {'id_sentence': 0, 'id_word': [12], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 16, 'id_sentence': 0, 'arc': 1, 'campaign': 2}}
1)   {'id_sentence': 4, 'id_word': [11], 'token_word': 'Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 16, 'id_sentence': 4, 'arc': 1, 'campaign': 2}}
2)   {'id_sentence': 4, 'id_word': [19], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 16, 'id_sentence': 4, 'arc': 1, 'campaign': 2}}
3)   {'id_sentence': 4, 'id_word': [19], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 16, 'id_sentence': 4, 'arc': 1, 'campaign': 2}}
4)   {'id_sentence': 6, 'id_word': [1], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 16, 'id_sentence': 6, 'arc': 1, 'campaign': 2}}
5)   {'id_sentence': 12, 'id_word': [1], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 16, 'id_sentence': 12, 'arc': 1, 'campaign': 2}}
6)   {'id_sentence': 24, 'id_word':

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Inference:   0%|          | 0/1 [00:00<?, ?it/s]

0)   {'id_sentence': 0, 'id_word': [33], 'new_word': ['the', 'party'], 'info': {'episode': 16, 'id_sentence': 0, 'arc': 1, 'campaign': 2}}
1)   {'id_sentence': 0, 'id_word': [41], 'new_word': ['Siff', 'Duthar'], 'info': {'episode': 16, 'id_sentence': 0, 'arc': 1, 'campaign': 2}}
2)   {'id_sentence': 1, 'id_word': [3], 'new_word': ['Siff', 'Duthar'], 'info': {'episode': 16, 'id_sentence': 1, 'arc': 1, 'campaign': 2}}
3)   {'id_sentence': 1, 'id_word': [19], 'new_word': ['Siff', 'Duthar'], 'info': {'episode': 16, 'id_sentence': 1, 'arc': 1, 'campaign': 2}}
4)   {'id_sentence': 1, 'id_word': [25], 'new_word': ['Siff', 'Duthar'], 'info': {'episode': 16, 'id_sentence': 1, 'arc': 1, 'campaign': 2}}
5)   {'id_sentence': 2, 'id_word': [8], 'new_word': ['Whispers', 'of', 'Madness'], 'info': {'episode': 16, 'id_sentence': 2, 'arc': 1, 'campaign': 2}}
6)   {'id_sentence': 2, 'id_word': [18], 'new_word': ['Beauregard', "Lionett's"], 'info': {'episode': 16, 'id_sentence': 2, 'arc': 1, 'campaign': 2

Map:   0%|          | 0/48 [00:00<?, ? examples/s]

Inference:   0%|          | 0/48 [00:00<?, ?it/s]

0)   {'id_sentence': 3, 'id_word': [18], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 17, 'id_sentence': 3, 'arc': 1, 'campaign': 2}}
1)   {'id_sentence': 6, 'id_word': [1], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 17, 'id_sentence': 6, 'arc': 1, 'campaign': 2}}
2)   {'id_sentence': 29, 'id_word': [1, 2], 'token_word': 'Mighty Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 17, 'id_sentence': 29, 'arc': 1, 'campaign': 2}}
3)   {'id_sentence': 41, 'id_word': [1, 2], 'token_word': 'Mighty Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 17, 'id_sentence': 41, 'arc': 1, 'campaign': 2}}
4)   {'id_sentence': 44, 'id_word': [1, 2], 'token_word': 'Mighty Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 17, 'id_sentence': 44, 'arc': 1, 'campaign': 2}}
5)   {'id_sentence': 47, 'id_word': [10], 'token_word': 'teams', 'new_word': ['Mighty Nein'], 'info': {'episode': 17, 'id_sentence': 47, 'arc': 1, 'campaign': 2}}
6) 

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Inference:   0%|          | 0/1 [00:00<?, ?it/s]

0)   {'id_sentence': 0, 'id_word': [15], 'new_word': ['the', 'beacon'], 'info': {'episode': 17, 'id_sentence': 0, 'arc': 1, 'campaign': 2}}
1)   {'id_sentence': 1, 'id_word': [24], 'new_word': ['the', 'new', 'owner', 'of', 'The', 'Leaky', 'Tap', ',', 'Lauren', 'Schvine', ',', 'Beauregard', 'Lionett'], 'info': {'episode': 17, 'id_sentence': 1, 'arc': 1, 'campaign': 2}}
2)   {'id_sentence': 1, 'id_word': [33], 'new_word': ['the', 'new', 'owner', 'of', 'The', 'Leaky', 'Tap', ',', 'Lauren', 'Schvine', ',', 'Beauregard', 'Lionett'], 'info': {'episode': 17, 'id_sentence': 1, 'arc': 1, 'campaign': 2}}
3)   {'id_sentence': 1, 'id_word': [42, 43], 'new_word': ['The', 'Leaky', 'Tap'], 'info': {'episode': 17, 'id_sentence': 1, 'arc': 1, 'campaign': 2}}
4)   {'id_sentence': 4, 'id_word': [13], 'new_word': ['a'], 'info': {'episode': 17, 'id_sentence': 4, 'arc': 1, 'campaign': 2}}
5)   {'id_sentence': 6, 'id_word': [0], 'new_word': ['the'], 'info': {'episode': 17, 'id_sentence': 6, 'arc': 1, 'campai

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Inference:   0%|          | 0/64 [00:00<?, ?it/s]

Map:   0%|          | 0/12 [00:00<?, ? examples/s]

Inference:   0%|          | 0/12 [00:00<?, ?it/s]

0)   {'id_sentence': 1, 'id_word': [11], 'token_word': 'Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 18, 'id_sentence': 1, 'arc': 1, 'campaign': 2}}
1)   {'id_sentence': 10, 'id_word': [29], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 18, 'id_sentence': 10, 'arc': 1, 'campaign': 2}}
2)   {'id_sentence': 12, 'id_word': [1], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 18, 'id_sentence': 12, 'arc': 1, 'campaign': 2}}
3)   {'id_sentence': 15, 'id_word': [9], 'token_word': 'teams', 'new_word': ['Mighty Nein'], 'info': {'episode': 18, 'id_sentence': 15, 'arc': 1, 'campaign': 2}}
4)   {'id_sentence': 15, 'id_word': [16, 17], 'token_word': 'Mighty Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 18, 'id_sentence': 15, 'arc': 1, 'campaign': 2}}
5)   {'id_sentence': 20, 'id_word': [1], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 18, 'id_sentence': 20, 'arc': 1, 'campaign': 2}}
6)   {'id_sentence

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Inference:   0%|          | 0/1 [00:00<?, ?it/s]

0)   {'id_sentence': 2, 'id_word': [0], 'new_word': ['the', 'huge', ',', 'powerful', ',', 'lumbering', 'hill'], 'info': {'episode': 18, 'id_sentence': 2, 'arc': 1, 'campaign': 2}}
1)   {'id_sentence': 2, 'id_word': [7], 'new_word': ['the', 'Nein'], 'info': {'episode': 18, 'id_sentence': 2, 'arc': 1, 'campaign': 2}}
2)   {'id_sentence': 3, 'id_word': [6], 'new_word': ['Mollymauk', "Tealeaf's"], 'info': {'episode': 18, 'id_sentence': 3, 'arc': 1, 'campaign': 2}}
3)   {'id_sentence': 3, 'id_word': [16], 'new_word': ['huge', ',', 'powerful', ',', 'lumbering', 'hill'], 'info': {'episode': 18, 'id_sentence': 3, 'arc': 1, 'campaign': 2}}
4)   {'id_sentence': 5, 'id_word': [0], 'new_word': ['Beauregard', 'Lionett'], 'info': {'episode': 18, 'id_sentence': 5, 'arc': 1, 'campaign': 2}}
5)   {'id_sentence': 5, 'id_word': [14], 'new_word': ['Beauregard', 'Lionett'], 'info': {'episode': 18, 'id_sentence': 5, 'arc': 1, 'campaign': 2}}
6)   {'id_sentence': 8, 'id_word': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Inference:   0%|          | 0/64 [00:00<?, ?it/s]

0)   {'id_sentence': 0, 'id_word': [1, 2], 'token_word': 'Mighty Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 19, 'id_sentence': 0, 'arc': 1, 'campaign': 2}}
1)   {'id_sentence': 11, 'id_word': [1], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 19, 'id_sentence': 11, 'arc': 1, 'campaign': 2}}
2)   {'id_sentence': 15, 'id_word': [1], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 19, 'id_sentence': 15, 'arc': 1, 'campaign': 2}}
3)   {'id_sentence': 42, 'id_word': [33], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 19, 'id_sentence': 42, 'arc': 1, 'campaign': 2}}
4)   {'id_sentence': 43, 'id_word': [10], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 19, 'id_sentence': 43, 'arc': 1, 'campaign': 2}}
5)   {'id_sentence': 45, 'id_word': [1], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 19, 'id_sentence': 45, 'arc': 1, 'campaign': 2}}
6)   {'id_sentence'

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Inference:   0%|          | 0/1 [00:00<?, ?it/s]

0)   {'id_sentence': 0, 'id_word': [12], 'new_word': ["'s's"], 'info': {'episode': 19, 'id_sentence': 0, 'arc': 1, 'campaign': 2}}
1)   {'id_sentence': 2, 'id_word': [9, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17], 'new_word': ['Beauregard', 'Lionett,'], 'info': {'episode': 19, 'id_sentence': 2, 'arc': 1, 'campaign': 2}}
2)   {'id_sentence': 2, 'id_word': [9, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 8], 'new_word': ["Widogast's"], 'info': {'episode': 19, 'id_sentence': 2, 'arc': 1, 'campaign': 2}}
3)   {'id_sentence': 3, 'id_word': [12, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21], 'new_word': ['Beauregard', 'Lionett,'], 'info': {'episode': 19, 'id_sentence': 3, 'arc': 1, 'campaign': 2}}
4)   {'id_sentence': 5, 'id_word': [0], 'new_word': ['Babenon', 'Dosal', "'s"], 'info': {'episode': 19, 'id_sentence': 5, 'arc': 1, 'campaign': 2}}
5)   {'id_sentence': 6, 'id_word': [0], 'new_word': ['Babenon', 'Dosal', "'s"], 'info': {'episode': 19, 'id_sentence': 6, 'arc': 1, 'campaign': 2}}
6)   {'id_s

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Inference:   0%|          | 0/64 [00:00<?, ?it/s]

Map:   0%|          | 0/45 [00:00<?, ? examples/s]

Inference:   0%|          | 0/45 [00:00<?, ?it/s]

0)   {'id_sentence': 1, 'id_word': [12], 'token_word': 'group', 'new_word': ['Mighty Nein'], 'info': {'episode': 20, 'id_sentence': 1, 'arc': 1, 'campaign': 2}}
1)   {'id_sentence': 3, 'id_word': [4], 'token_word': 'group', 'new_word': ['Mighty Nein'], 'info': {'episode': 20, 'id_sentence': 3, 'arc': 1, 'campaign': 2}}
2)   {'id_sentence': 11, 'id_word': [21, 21], 'token_word': 'Mighty Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 20, 'id_sentence': 11, 'arc': 1, 'campaign': 2}}
3)   {'id_sentence': 19, 'id_word': [46], 'token_word': 'group', 'new_word': ['Mighty Nein'], 'info': {'episode': 20, 'id_sentence': 19, 'arc': 1, 'campaign': 2}}
4)   {'id_sentence': 23, 'id_word': [4], 'token_word': 'group', 'new_word': ['Mighty Nein'], 'info': {'episode': 20, 'id_sentence': 23, 'arc': 1, 'campaign': 2}}
5)   {'id_sentence': 41, 'id_word': [5], 'token_word': 'group', 'new_word': ['Mighty Nein'], 'info': {'episode': 20, 'id_sentence': 41, 'arc': 1, 'campaign': 2}}
6)   {'id_sentence'

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Inference:   0%|          | 0/1 [00:00<?, ?it/s]

0)   {'id_sentence': 1, 'id_word': [8], 'new_word': ['Veth', "Brenatto's"], 'info': {'episode': 20, 'id_sentence': 1, 'arc': 1, 'campaign': 2}}
1)   {'id_sentence': 1, 'id_word': [14, 15, 14], 'new_word': ["Brenatto's"], 'info': {'episode': 20, 'id_sentence': 1, 'arc': 1, 'campaign': 2}}
2)   {'id_sentence': 1, 'id_word': [40], 'new_word': ['her', 'flask'], 'info': {'episode': 20, 'id_sentence': 1, 'arc': 1, 'campaign': 2}}
3)   {'id_sentence': 2, 'id_word': [4], 'new_word': ['Veth', 'Brenatto'], 'info': {'episode': 20, 'id_sentence': 2, 'arc': 1, 'campaign': 2}}
4)   {'id_sentence': 2, 'id_word': [10], 'new_word': ['all', 'in', 'the', 'group'], 'info': {'episode': 20, 'id_sentence': 2, 'arc': 1, 'campaign': 2}}
5)   {'id_sentence': 3, 'id_word': [12], 'new_word': ['her'], 'info': {'episode': 20, 'id_sentence': 3, 'arc': 1, 'campaign': 2}}
6)   {'id_sentence': 3, 'id_word': [21], 'new_word': ['her', 'flask'], 'info': {'episode': 20, 'id_sentence': 3, 'arc': 1, 'campaign': 2}}
7)   {'id

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Inference:   0%|          | 0/64 [00:00<?, ?it/s]

Map:   0%|          | 0/42 [00:00<?, ? examples/s]

Inference:   0%|          | 0/42 [00:00<?, ?it/s]

0)   {'id_sentence': 0, 'id_word': [11], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 21, 'id_sentence': 0, 'arc': 1, 'campaign': 2}}
1)   {'id_sentence': 5, 'id_word': [1, 2], 'token_word': 'Mighty Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 21, 'id_sentence': 5, 'arc': 1, 'campaign': 2}}
2)   {'id_sentence': 9, 'id_word': [1], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 21, 'id_sentence': 9, 'arc': 1, 'campaign': 2}}
3)   {'id_sentence': 14, 'id_word': [1], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 21, 'id_sentence': 14, 'arc': 1, 'campaign': 2}}
4)   {'id_sentence': 20, 'id_word': [18], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 21, 'id_sentence': 20, 'arc': 1, 'campaign': 2}}
5)   {'id_sentence': 26, 'id_word': [1], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 21, 'id_sentence': 26, 'arc': 1, 'campaign': 2}}
6)   {'id_sentence': 36

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Inference:   0%|          | 0/1 [00:00<?, ?it/s]

0)   {'id_sentence': 0, 'id_word': [16], 'new_word': ['the', 'party'], 'info': {'episode': 21, 'id_sentence': 0, 'arc': 1, 'campaign': 2}}
1)   {'id_sentence': 1, 'id_word': [7], 'new_word': ['Yasha', 'Nydoorin'], 'info': {'episode': 21, 'id_sentence': 1, 'arc': 1, 'campaign': 2}}
2)   {'id_sentence': 1, 'id_word': [9], 'new_word': ['Yasha', "Nydoorin's"], 'info': {'episode': 21, 'id_sentence': 1, 'arc': 1, 'campaign': 2}}
3)   {'id_sentence': 1, 'id_word': [13], 'new_word': ['Yasha', 'Nydoorin'], 'info': {'episode': 21, 'id_sentence': 1, 'arc': 1, 'campaign': 2}}
4)   {'id_sentence': 1, 'id_word': [18], 'new_word': ['Febron', 'Keyes'], 'info': {'episode': 21, 'id_sentence': 1, 'arc': 1, 'campaign': 2}}
5)   {'id_sentence': 1, 'id_word': [21], 'new_word': ['Yasha', 'Nydoorin'], 'info': {'episode': 21, 'id_sentence': 1, 'arc': 1, 'campaign': 2}}
6)   {'id_sentence': 1, 'id_word': [23], 'new_word': ['Yasha', 'Nydoorin'], 'info': {'episode': 21, 'id_sentence': 1, 'arc': 1, 'campaign': 2}}

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Inference:   0%|          | 0/64 [00:00<?, ?it/s]

Map:   0%|          | 0/22 [00:00<?, ? examples/s]

Inference:   0%|          | 0/22 [00:00<?, ?it/s]

0)   {'id_sentence': 0, 'id_word': [4, 5], 'token_word': 'Mighty Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 22, 'id_sentence': 0, 'arc': 1, 'campaign': 2}}
1)   {'id_sentence': 9, 'id_word': [35], 'token_word': 'group', 'new_word': ['Mighty Nein'], 'info': {'episode': 22, 'id_sentence': 9, 'arc': 1, 'campaign': 2}}
2)   {'id_sentence': 13, 'id_word': [1], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 22, 'id_sentence': 13, 'arc': 1, 'campaign': 2}}
3)   {'id_sentence': 17, 'id_word': [11], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 22, 'id_sentence': 17, 'arc': 1, 'campaign': 2}}
4)   {'id_sentence': 19, 'id_word': [7], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 22, 'id_sentence': 19, 'arc': 1, 'campaign': 2}}
5)   {'id_sentence': 21, 'id_word': [1], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 22, 'id_sentence': 21, 'arc': 1, 'campaign': 2}}
6)   {'id_sentence': 

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Inference:   0%|          | 0/1 [00:00<?, ?it/s]

0)   {'id_sentence': 4, 'id_word': [3], 'new_word': ['Beauregard', 'Lionett'], 'info': {'episode': 22, 'id_sentence': 4, 'arc': 1, 'campaign': 2}}
1)   {'id_sentence': 4, 'id_word': [6], 'new_word': ['Beauregard', 'Lionett'], 'info': {'episode': 22, 'id_sentence': 4, 'arc': 1, 'campaign': 2}}
2)   {'id_sentence': 4, 'id_word': [8, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17], 'new_word': ['Beauregard', 'Lionett,'], 'info': {'episode': 22, 'id_sentence': 4, 'arc': 1, 'campaign': 2}}
3)   {'id_sentence': 4, 'id_word': [22], 'new_word': ['Beauregard', "Lionett's"], 'info': {'episode': 22, 'id_sentence': 4, 'arc': 1, 'campaign': 2}}
4)   {'id_sentence': 4, 'id_word': [29], 'new_word': ['Caleb', 'Widogast'], 'info': {'episode': 22, 'id_sentence': 4, 'arc': 1, 'campaign': 2}}
5)   {'id_sentence': 4, 'id_word': [31], 'new_word': ['Beauregard', 'Lionett'], 'info': {'episode': 22, 'id_sentence': 4, 'arc': 1, 'campaign': 2}}
6)   {'id_sentence': 4, 'id_word': [34], 'new_word': ['Beauregard', 'Lionet

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Inference:   0%|          | 0/64 [00:00<?, ?it/s]

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

Inference:   0%|          | 0/11 [00:00<?, ?it/s]

0)   {'id_sentence': 0, 'id_word': [22, 23], 'token_word': 'Mighty Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 23, 'id_sentence': 0, 'arc': 1, 'campaign': 2}}
1)   {'id_sentence': 2, 'id_word': [15], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 23, 'id_sentence': 2, 'arc': 1, 'campaign': 2}}
2)   {'id_sentence': 9, 'id_word': [17], 'token_word': 'group', 'new_word': ['Mighty Nein'], 'info': {'episode': 23, 'id_sentence': 9, 'arc': 1, 'campaign': 2}}
3)   {'id_sentence': 13, 'id_word': [28], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 23, 'id_sentence': 13, 'arc': 1, 'campaign': 2}}
4)   {'id_sentence': 15, 'id_word': [21], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 23, 'id_sentence': 15, 'arc': 1, 'campaign': 2}}
5)   {'id_sentence': 36, 'id_word': [1, 2], 'token_word': 'Mighty Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 23, 'id_sentence': 36, 'arc': 1, 'campaign': 2}}
6)   {'id_

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Inference:   0%|          | 0/1 [00:00<?, ?it/s]

0)   {'id_sentence': 0, 'id_word': [4], 'new_word': ['Jester', 'Lavorre'], 'info': {'episode': 23, 'id_sentence': 0, 'arc': 1, 'campaign': 2}}
1)   {'id_sentence': 0, 'id_word': [26], 'new_word': ['the', 'Mighty', "Nein's"], 'info': {'episode': 23, 'id_sentence': 0, 'arc': 1, 'campaign': 2}}
2)   {'id_sentence': 1, 'id_word': [2], 'new_word': ['Jester', 'Lavorre'], 'info': {'episode': 23, 'id_sentence': 1, 'arc': 1, 'campaign': 2}}
3)   {'id_sentence': 1, 'id_word': [13], 'new_word': ['the', 'Mighty', 'Nein'], 'info': {'episode': 23, 'id_sentence': 1, 'arc': 1, 'campaign': 2}}
4)   {'id_sentence': 2, 'id_word': [15], 'new_word': ['Mighty', 'Nein'], 'info': {'episode': 23, 'id_sentence': 2, 'arc': 1, 'campaign': 2}}
5)   {'id_sentence': 2, 'id_word': [21], 'new_word': ['Mighty', 'Nein'], 'info': {'episode': 23, 'id_sentence': 2, 'arc': 1, 'campaign': 2}}
6)   {'id_sentence': 3, 'id_word': [5], 'new_word': ['Beauregard', 'Lionett'], 'info': {'episode': 23, 'id_sentence': 3, 'arc': 1, 'ca

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Inference:   0%|          | 0/64 [00:00<?, ?it/s]

Map:   0%|          | 0/13 [00:00<?, ? examples/s]

Inference:   0%|          | 0/13 [00:00<?, ?it/s]

0)   {'id_sentence': 1, 'id_word': [21], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 24, 'id_sentence': 1, 'arc': 1, 'campaign': 2}}
1)   {'id_sentence': 9, 'id_word': [1], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 24, 'id_sentence': 9, 'arc': 1, 'campaign': 2}}
2)   {'id_sentence': 10, 'id_word': [1], 'token_word': 'team', 'new_word': ['Mighty Nein'], 'info': {'episode': 24, 'id_sentence': 10, 'arc': 1, 'campaign': 2}}
3)   {'id_sentence': 13, 'id_word': [1], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 24, 'id_sentence': 13, 'arc': 1, 'campaign': 2}}
4)   {'id_sentence': 15, 'id_word': [33], 'token_word': 'group', 'new_word': ['Mighty Nein'], 'info': {'episode': 24, 'id_sentence': 15, 'arc': 1, 'campaign': 2}}
5)   {'id_sentence': 29, 'id_word': [18], 'token_word': 'team', 'new_word': ['Mighty Nein'], 'info': {'episode': 24, 'id_sentence': 29, 'arc': 1, 'campaign': 2}}
6)   {'id_sentence': 30, 'id_wo

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Inference:   0%|          | 0/1 [00:00<?, ?it/s]

0)   {'id_sentence': 4, 'id_word': [0], 'new_word': ['The', 'party'], 'info': {'episode': 24, 'id_sentence': 4, 'arc': 1, 'campaign': 2}}
1)   {'id_sentence': 5, 'id_word': [10], 'new_word': ['a'], 'info': {'episode': 24, 'id_sentence': 5, 'arc': 1, 'campaign': 2}}
2)   {'id_sentence': 6, 'id_word': [5], 'new_word': ['Jester', 'Lavorre'], 'info': {'episode': 24, 'id_sentence': 6, 'arc': 1, 'campaign': 2}}
3)   {'id_sentence': 6, 'id_word': [7], 'new_word': ['Jester', 'Lavorre'], 'info': {'episode': 24, 'id_sentence': 6, 'arc': 1, 'campaign': 2}}
4)   {'id_sentence': 6, 'id_word': [26], 'new_word': ['Jester', 'Lavorre'], 'info': {'episode': 24, 'id_sentence': 6, 'arc': 1, 'campaign': 2}}
5)   {'id_sentence': 7, 'id_word': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 'new_word': ['Beauregard', 'Lionett,'], 'info': {'episode': 24, 'id_sentence': 7, 'arc': 1, 'campaign': 2}}
6)   {'id_sentence': 7, 'id_word': [21], 'new_word': ['a', 'gold'], 'info': {'episode': 24, 'id_sentence': 7, 'arc': 1, '

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Inference:   0%|          | 0/64 [00:00<?, ?it/s]

Map:   0%|          | 0/24 [00:00<?, ? examples/s]

Inference:   0%|          | 0/24 [00:00<?, ?it/s]

0)   {'id_sentence': 3, 'id_word': [6, 7], 'token_word': 'Mighty Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 25, 'id_sentence': 3, 'arc': 1, 'campaign': 2}}
1)   {'id_sentence': 9, 'id_word': [59], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 25, 'id_sentence': 9, 'arc': 1, 'campaign': 2}}
2)   {'id_sentence': 12, 'id_word': [6], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 25, 'id_sentence': 12, 'arc': 1, 'campaign': 2}}
3)   {'id_sentence': 15, 'id_word': [1, 2], 'token_word': 'Mighty Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 25, 'id_sentence': 15, 'arc': 1, 'campaign': 2}}
4)   {'id_sentence': 27, 'id_word': [34, 35], 'token_word': 'Mighty Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 25, 'id_sentence': 27, 'arc': 1, 'campaign': 2}}
5)   {'id_sentence': 34, 'id_word': [1], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 25, 'id_sentence': 34, 'arc': 1, 'campaign': 2}}
6)

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Inference:   0%|          | 0/1 [00:00<?, ?it/s]

0)   {'id_sentence': 1, 'id_word': [17], 'new_word': ['Gearhole', 'Prison'], 'info': {'episode': 25, 'id_sentence': 1, 'arc': 1, 'campaign': 2}}
1)   {'id_sentence': 3, 'id_word': [8], 'new_word': ["'s's"], 'info': {'episode': 25, 'id_sentence': 3, 'arc': 1, 'campaign': 2}}
2)   {'id_sentence': 3, 'id_word': [16], 'new_word': ['"', 'teenagers', '"'], 'info': {'episode': 25, 'id_sentence': 3, 'arc': 1, 'campaign': 2}}
3)   {'id_sentence': 3, 'id_word': [19], 'new_word': ['Beauregard', 'Lionett'], 'info': {'episode': 25, 'id_sentence': 3, 'arc': 1, 'campaign': 2}}
4)   {'id_sentence': 3, 'id_word': [24], 'new_word': ['Beauregard', "Lionett's"], 'info': {'episode': 25, 'id_sentence': 3, 'arc': 1, 'campaign': 2}}
5)   {'id_sentence': 4, 'id_word': [9], 'new_word': ['the', '"', 'teenagers', '"'], 'info': {'episode': 25, 'id_sentence': 4, 'arc': 1, 'campaign': 2}}
6)   {'id_sentence': 6, 'id_word': [14, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28], 'new_word': ['Beaure

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Inference:   0%|          | 0/64 [00:00<?, ?it/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Inference:   0%|          | 0/1 [00:00<?, ?it/s]

- saving JSON file: pprTL_C2A1.json, at: /content/drive/MyDrive/Colab Notebooks/NUANS/project_bisazza_casadei/data/ppr/ ...
Elaborating for arc number n° 2
['pprTL_C1A1.json', 'pprTL_C1A2.json', 'pprTL_C1A3.json', 'pprREC_C1A1.json', 'pprREC_C1A2.json', 'pprREC_C1A3.json', 'pprSUMM_C1A1.json', 'pprSUMM_C1A2.json', 'pprSUMM_C1A3.json', 'pprREC_C2A1.json', 'pprREC_C2A2.json', 'pprSUMM_C2A1.json', 'pprTL_C2A1.json']
pre-processing for file -> [campaign: 2] [arc 2] [episode 26]


Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Inference:   0%|          | 0/64 [00:00<?, ?it/s]

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Inference:   0%|          | 0/64 [00:00<?, ?it/s]

Map:   0%|          | 0/52 [00:00<?, ? examples/s]

Inference:   0%|          | 0/52 [00:00<?, ?it/s]

0)   {'id_sentence': 4, 'id_word': [6, 7], 'token_word': 'Mighty Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 26, 'id_sentence': 4, 'arc': 2, 'campaign': 2}}
1)   {'id_sentence': 4, 'id_word': [21], 'token_word': 'group', 'new_word': ['Mighty Nein'], 'info': {'episode': 26, 'id_sentence': 4, 'arc': 2, 'campaign': 2}}
2)   {'id_sentence': 6, 'id_word': [6], 'token_word': 'group', 'new_word': ['Mighty Nein'], 'info': {'episode': 26, 'id_sentence': 6, 'arc': 2, 'campaign': 2}}
3)   {'id_sentence': 8, 'id_word': [17], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 26, 'id_sentence': 8, 'arc': 2, 'campaign': 2}}
4)   {'id_sentence': 9, 'id_word': [7, 8], 'token_word': 'Mighty Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 26, 'id_sentence': 9, 'arc': 2, 'campaign': 2}}
5)   {'id_sentence': 11, 'id_word': [1], 'token_word': 'group', 'new_word': ['Mighty Nein'], 'info': {'episode': 26, 'id_sentence': 11, 'arc': 2, 'campaign': 2}}
6)   {'id_sentence

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Inference:   0%|          | 0/1 [00:00<?, ?it/s]

0)   {'id_sentence': 1, 'id_word': [3], 'new_word': ['The', "group's"], 'info': {'episode': 26, 'id_sentence': 1, 'arc': 2, 'campaign': 2}}
1)   {'id_sentence': 1, 'id_word': [7, 8], 'new_word': ['The', 'group'], 'info': {'episode': 26, 'id_sentence': 1, 'arc': 2, 'campaign': 2}}
2)   {'id_sentence': 3, 'id_word': [6, 7], 'new_word': ['an', 'armored', 'dwarven', 'woman'], 'info': {'episode': 26, 'id_sentence': 3, 'arc': 2, 'campaign': 2}}
3)   {'id_sentence': 3, 'id_word': [10], 'new_word': ['an', 'armored', 'dwarven', 'woman'], 'info': {'episode': 26, 'id_sentence': 3, 'arc': 2, 'campaign': 2}}
4)   {'id_sentence': 4, 'id_word': [9], 'new_word': ['the', 'remaining', 'members', 'of', 'the', 'Mighty', "Nein's"], 'info': {'episode': 26, 'id_sentence': 4, 'arc': 2, 'campaign': 2}}
5)   {'id_sentence': 4, 'id_word': [30], 'new_word': ['the', 'remaining', 'members', 'of', 'the', 'Mighty', "Nein's"], 'info': {'episode': 26, 'id_sentence': 4, 'arc': 2, 'campaign': 2}}
6)   {'id_sentence': 5, 

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Inference:   0%|          | 0/64 [00:00<?, ?it/s]

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

Inference:   0%|          | 0/11 [00:00<?, ?it/s]

0)   {'id_sentence': 0, 'id_word': [15], 'token_word': 'group', 'new_word': ['Mighty Nein'], 'info': {'episode': 27, 'id_sentence': 0, 'arc': 2, 'campaign': 2}}
1)   {'id_sentence': 40, 'id_word': [1], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 27, 'id_sentence': 40, 'arc': 2, 'campaign': 2}}
2)   {'id_sentence': 48, 'id_word': [14], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 27, 'id_sentence': 48, 'arc': 2, 'campaign': 2}}
3)   {'id_sentence': 62, 'id_word': [7], 'token_word': 'team', 'new_word': ['Mighty Nein'], 'info': {'episode': 27, 'id_sentence': 62, 'arc': 2, 'campaign': 2}}
4)   {'id_sentence': 73, 'id_word': [7], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 27, 'id_sentence': 73, 'arc': 2, 'campaign': 2}}
**********


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Inference:   0%|          | 0/1 [00:00<?, ?it/s]

0)   {'id_sentence': 0, 'id_word': [43, 44], 'new_word': ['A', 'faint', 'bit', 'of', 'snowfall'], 'info': {'episode': 27, 'id_sentence': 0, 'arc': 2, 'campaign': 2}}
1)   {'id_sentence': 0, 'id_word': [52], 'new_word': ['Mollymauk', "Tealeaf's"], 'info': {'episode': 27, 'id_sentence': 0, 'arc': 2, 'campaign': 2}}
2)   {'id_sentence': 1, 'id_word': [0, 1, 2, 3, 4], 'new_word': ['Beauregard', 'Lionett,'], 'info': {'episode': 27, 'id_sentence': 1, 'arc': 2, 'campaign': 2}}
3)   {'id_sentence': 1, 'id_word': [11], 'new_word': ['the', 'stunned', 'group', '-', 'Beauregard', 'Lionett', ',', 'Veth', 'Brenatto', ',', 'Caleb', 'Widogast', ',', 'and', 'Keg', '-'], 'info': {'episode': 27, 'id_sentence': 1, 'arc': 2, 'campaign': 2}}
4)   {'id_sentence': 1, 'id_word': [20], 'new_word': ['the', 'stunned', 'group', '-', 'Beauregard', 'Lionett', ',', 'Veth', 'Brenatto', ',', 'Caleb', 'Widogast', ',', 'and', 'Keg', '-'], 'info': {'episode': 27, 'id_sentence': 1, 'arc': 2, 'campaign': 2}}
5)   {'id_sente

Map:   0%|          | 0/55 [00:00<?, ? examples/s]

Inference:   0%|          | 0/55 [00:00<?, ?it/s]

0)   {'id_sentence': 0, 'id_word': [1], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 28, 'id_sentence': 0, 'arc': 2, 'campaign': 2}}
1)   {'id_sentence': 25, 'id_word': [9], 'token_word': 'group', 'new_word': ['Mighty Nein'], 'info': {'episode': 28, 'id_sentence': 25, 'arc': 2, 'campaign': 2}}
2)   {'id_sentence': 25, 'id_word': [9], 'token_word': 'group', 'new_word': ['Mighty Nein'], 'info': {'episode': 28, 'id_sentence': 25, 'arc': 2, 'campaign': 2}}
**********


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Inference:   0%|          | 0/1 [00:00<?, ?it/s]

0)   {'id_sentence': 1, 'id_word': [0], 'new_word': ['The', 'party'], 'info': {'episode': 28, 'id_sentence': 1, 'arc': 2, 'campaign': 2}}
1)   {'id_sentence': 3, 'id_word': [4], 'new_word': ['the', "town's"], 'info': {'episode': 28, 'id_sentence': 3, 'arc': 2, 'campaign': 2}}
2)   {'id_sentence': 4, 'id_word': [0], 'new_word': ['Everyone', 'in', 'town'], 'info': {'episode': 28, 'id_sentence': 4, 'arc': 2, 'campaign': 2}}
3)   {'id_sentence': 4, 'id_word': [6, 7], 'new_word': ['a', 'priest', 'or', 'cleric'], 'info': {'episode': 28, 'id_sentence': 4, 'arc': 2, 'campaign': 2}}
4)   {'id_sentence': 4, 'id_word': [10], 'new_word': ['Everyone', 'in', 'town'], 'info': {'episode': 28, 'id_sentence': 4, 'arc': 2, 'campaign': 2}}
5)   {'id_sentence': 4, 'id_word': [13], 'new_word': ['Everyone', 'in', 'town'], 'info': {'episode': 28, 'id_sentence': 4, 'arc': 2, 'campaign': 2}}
6)   {'id_sentence': 4, 'id_word': [15, 16, 17], 'new_word': ['a', 'priest', 'or', 'cleric'], 'info': {'episode': 28, 'id

Map:   0%|          | 0/51 [00:00<?, ? examples/s]

Inference:   0%|          | 0/51 [00:00<?, ?it/s]

0)   {'id_sentence': 16, 'id_word': [4], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 29, 'id_sentence': 16, 'arc': 2, 'campaign': 2}}
1)   {'id_sentence': 31, 'id_word': [1], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 29, 'id_sentence': 31, 'arc': 2, 'campaign': 2}}
2)   {'id_sentence': 37, 'id_word': [13], 'token_word': 'team', 'new_word': ['Mighty Nein'], 'info': {'episode': 29, 'id_sentence': 37, 'arc': 2, 'campaign': 2}}
3)   {'id_sentence': 39, 'id_word': [1], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 29, 'id_sentence': 39, 'arc': 2, 'campaign': 2}}
**********


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Inference:   0%|          | 0/1 [00:00<?, ?it/s]

0)   {'id_sentence': 1, 'id_word': [1, 2], 'new_word': ["Keg's"], 'info': {'episode': 29, 'id_sentence': 1, 'arc': 2, 'campaign': 2}}
1)   {'id_sentence': 1, 'id_word': [21], 'new_word': ['two', 'more', "guards's"], 'info': {'episode': 29, 'id_sentence': 1, 'arc': 2, 'campaign': 2}}
2)   {'id_sentence': 2, 'id_word': [5], 'new_word': ['two', 'more'], 'info': {'episode': 29, 'id_sentence': 2, 'arc': 2, 'campaign': 2}}
3)   {'id_sentence': 2, 'id_word': [9], 'new_word': ['two', 'more', "guards's"], 'info': {'episode': 29, 'id_sentence': 2, 'arc': 2, 'campaign': 2}}
4)   {'id_sentence': 2, 'id_word': [14], 'new_word': ['two', 'more', "guards's"], 'info': {'episode': 29, 'id_sentence': 2, 'arc': 2, 'campaign': 2}}
5)   {'id_sentence': 3, 'id_word': [23], 'new_word': ['Caleb', 'Widogast'], 'info': {'episode': 29, 'id_sentence': 3, 'arc': 2, 'campaign': 2}}
6)   {'id_sentence': 3, 'id_word': [15], 'new_word': ['Caleb', 'Widogast'], 'info': {'episode': 29, 'id_sentence': 3, 'arc': 2, 'campaig

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Inference:   0%|          | 0/64 [00:00<?, ?it/s]

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Inference:   0%|          | 0/64 [00:00<?, ?it/s]

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Inference:   0%|          | 0/64 [00:00<?, ?it/s]

Map:   0%|          | 0/32 [00:00<?, ? examples/s]

Inference:   0%|          | 0/32 [00:00<?, ?it/s]

0)   {'id_sentence': 28, 'id_word': [6], 'token_word': 'group', 'new_word': ['Mighty Nein'], 'info': {'episode': 30, 'id_sentence': 28, 'arc': 2, 'campaign': 2}}
1)   {'id_sentence': 43, 'id_word': [1, 2], 'token_word': 'Mighty Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 30, 'id_sentence': 43, 'arc': 2, 'campaign': 2}}
2)   {'id_sentence': 55, 'id_word': [1, 2], 'token_word': 'Mighty Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 30, 'id_sentence': 55, 'arc': 2, 'campaign': 2}}
3)   {'id_sentence': 65, 'id_word': [1], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 30, 'id_sentence': 65, 'arc': 2, 'campaign': 2}}
4)   {'id_sentence': 67, 'id_word': [1], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 30, 'id_sentence': 67, 'arc': 2, 'campaign': 2}}
5)   {'id_sentence': 79, 'id_word': [4, 5], 'token_word': 'Mighty Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 30, 'id_sentence': 79, 'arc': 2, 'campaign': 2}}
6

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Inference:   0%|          | 0/1 [00:00<?, ?it/s]

0)   {'id_sentence': 0, 'id_word': [3], 'new_word': ['Jester', 'Lavorre', 'and', 'Fjord', 'Fjord'], 'info': {'episode': 30, 'id_sentence': 0, 'arc': 2, 'campaign': 2}}
1)   {'id_sentence': 0, 'id_word': [6], 'new_word': ['Jester', 'Lavorre', 'and', 'Fjord', "Fjord's"], 'info': {'episode': 30, 'id_sentence': 0, 'arc': 2, 'campaign': 2}}
2)   {'id_sentence': 2, 'id_word': [19], 'new_word': ['Jester', 'Lavorre'], 'info': {'episode': 30, 'id_sentence': 2, 'arc': 2, 'campaign': 2}}
3)   {'id_sentence': 2, 'id_word': [21], 'new_word': ['Jester', "Lavorre's"], 'info': {'episode': 30, 'id_sentence': 2, 'arc': 2, 'campaign': 2}}
4)   {'id_sentence': 5, 'id_word': [2], 'new_word': ['screams', 'and', 'familiar'], 'info': {'episode': 30, 'id_sentence': 5, 'arc': 2, 'campaign': 2}}
5)   {'id_sentence': 5, 'id_word': [8], 'new_word': ['Jester', 'Lavorre'], 'info': {'episode': 30, 'id_sentence': 5, 'arc': 2, 'campaign': 2}}
6)   {'id_sentence': 5, 'id_word': [14], 'new_word': ['Jester', "Lavorre's"],

Map:   0%|          | 0/17 [00:00<?, ? examples/s]

Inference:   0%|          | 0/17 [00:00<?, ?it/s]

0)   {'id_sentence': 0, 'id_word': [1, 2], 'token_word': 'Mighty Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 31, 'id_sentence': 0, 'arc': 2, 'campaign': 2}}
1)   {'id_sentence': 5, 'id_word': [1], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 31, 'id_sentence': 5, 'arc': 2, 'campaign': 2}}
2)   {'id_sentence': 12, 'id_word': [1], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 31, 'id_sentence': 12, 'arc': 2, 'campaign': 2}}
3)   {'id_sentence': 14, 'id_word': [62], 'token_word': 'group', 'new_word': ['Mighty Nein'], 'info': {'episode': 31, 'id_sentence': 14, 'arc': 2, 'campaign': 2}}
**********


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Inference:   0%|          | 0/1 [00:00<?, ?it/s]

0)   {'id_sentence': 0, 'id_word': [14], 'new_word': ['The', 'Mighty', "Nein's"], 'info': {'episode': 31, 'id_sentence': 0, 'arc': 2, 'campaign': 2}}
1)   {'id_sentence': 5, 'id_word': [1], 'new_word': ['Mighty', 'Nein'], 'info': {'episode': 31, 'id_sentence': 5, 'arc': 2, 'campaign': 2}}
2)   {'id_sentence': 5, 'id_word': [10], 'new_word': ['The', 'Mighty', "Nein's"], 'info': {'episode': 31, 'id_sentence': 5, 'arc': 2, 'campaign': 2}}
3)   {'id_sentence': 6, 'id_word': [3], 'new_word': ['Caleb', 'Widogast'], 'info': {'episode': 31, 'id_sentence': 6, 'arc': 2, 'campaign': 2}}
4)   {'id_sentence': 7, 'id_word': [5], 'new_word': ['The', 'Mighty', 'Nein'], 'info': {'episode': 31, 'id_sentence': 7, 'arc': 2, 'campaign': 2}}
5)   {'id_sentence': 8, 'id_word': [0], 'new_word': ['Fjord'], 'info': {'episode': 31, 'id_sentence': 8, 'arc': 2, 'campaign': 2}}
6)   {'id_sentence': 8, 'id_word': [7, 8], 'new_word': ['the', 'Wayfarer', "'s", 'Cove', 'on', 'the', 'Menagerie', 'Coast.[31'], 'info': {'

Map:   0%|          | 0/35 [00:00<?, ? examples/s]

Inference:   0%|          | 0/35 [00:00<?, ?it/s]

0)   {'id_sentence': 0, 'id_word': [2], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 32, 'id_sentence': 0, 'arc': 2, 'campaign': 2}}
1)   {'id_sentence': 3, 'id_word': [3], 'token_word': 'Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 32, 'id_sentence': 3, 'arc': 2, 'campaign': 2}}
2)   {'id_sentence': 15, 'id_word': [1], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 32, 'id_sentence': 15, 'arc': 2, 'campaign': 2}}
3)   {'id_sentence': 25, 'id_word': [2], 'token_word': 'Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 32, 'id_sentence': 25, 'arc': 2, 'campaign': 2}}
4)   {'id_sentence': 28, 'id_word': [2], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 32, 'id_sentence': 28, 'arc': 2, 'campaign': 2}}
**********


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Inference:   0%|          | 0/1 [00:00<?, ?it/s]

0)   {'id_sentence': 0, 'id_word': [15], 'new_word': ['Caleb', "Widogast's"], 'info': {'episode': 32, 'id_sentence': 0, 'arc': 2, 'campaign': 2}}
1)   {'id_sentence': 0, 'id_word': [32], 'new_word': ['the', 'dodecahedron'], 'info': {'episode': 32, 'id_sentence': 0, 'arc': 2, 'campaign': 2}}
2)   {'id_sentence': 0, 'id_word': [34], 'new_word': ['Caleb', 'Widogast'], 'info': {'episode': 32, 'id_sentence': 0, 'arc': 2, 'campaign': 2}}
3)   {'id_sentence': 2, 'id_word': [0], 'new_word': ['the', 'party'], 'info': {'episode': 32, 'id_sentence': 2, 'arc': 2, 'campaign': 2}}
4)   {'id_sentence': 5, 'id_word': [17], 'new_word': ['Marion', 'Lavorre'], 'info': {'episode': 32, 'id_sentence': 5, 'arc': 2, 'campaign': 2}}
5)   {'id_sentence': 5, 'id_word': [19], 'new_word': ['the', 'party'], 'info': {'episode': 32, 'id_sentence': 5, 'arc': 2, 'campaign': 2}}
6)   {'id_sentence': 7, 'id_word': [0], 'new_word': ['the', 'party'], 'info': {'episode': 32, 'id_sentence': 7, 'arc': 2, 'campaign': 2}}
7)   

Map:   0%|          | 0/62 [00:00<?, ? examples/s]

Inference:   0%|          | 0/62 [00:00<?, ?it/s]

0)   {'id_sentence': 0, 'id_word': [14], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 33, 'id_sentence': 0, 'arc': 2, 'campaign': 2}}
1)   {'id_sentence': 4, 'id_word': [3], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 33, 'id_sentence': 4, 'arc': 2, 'campaign': 2}}
2)   {'id_sentence': 21, 'id_word': [3], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 33, 'id_sentence': 21, 'arc': 2, 'campaign': 2}}
3)   {'id_sentence': 25, 'id_word': [43], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 33, 'id_sentence': 25, 'arc': 2, 'campaign': 2}}
4)   {'id_sentence': 32, 'id_word': [1], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 33, 'id_sentence': 32, 'arc': 2, 'campaign': 2}}
5)   {'id_sentence': 48, 'id_word': [2], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 33, 'id_sentence': 48, 'arc': 2, 'campaign': 2}}
6)   {'id_sentence': 52, 'id_w

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Inference:   0%|          | 0/1 [00:00<?, ?it/s]

0)   {'id_sentence': 0, 'id_word': [1], 'new_word': ['the', 'party'], 'info': {'episode': 33, 'id_sentence': 0, 'arc': 2, 'campaign': 2}}
1)   {'id_sentence': 1, 'id_word': [0], 'new_word': ['the', 'party'], 'info': {'episode': 33, 'id_sentence': 1, 'arc': 2, 'campaign': 2}}
2)   {'id_sentence': 2, 'id_word': [3], 'new_word': ['the', 'party'], 'info': {'episode': 33, 'id_sentence': 2, 'arc': 2, 'campaign': 2}}
3)   {'id_sentence': 2, 'id_word': [6, 7], 'new_word': ['Nicodranas'], 'info': {'episode': 33, 'id_sentence': 2, 'arc': 2, 'campaign': 2}}
4)   {'id_sentence': 2, 'id_word': [9], 'new_word': ['Jester', 'Lavorre'], 'info': {'episode': 33, 'id_sentence': 2, 'arc': 2, 'campaign': 2}}
5)   {'id_sentence': 2, 'id_word': [21], 'new_word': ['Jester', 'Lavorre'], 'info': {'episode': 33, 'id_sentence': 2, 'arc': 2, 'campaign': 2}}
6)   {'id_sentence': 2, 'id_word': [23], 'new_word': ['Jester', 'Lavorre'], 'info': {'episode': 33, 'id_sentence': 2, 'arc': 2, 'campaign': 2}}
7)   {'id_senten

Map:   0%|          | 0/53 [00:00<?, ? examples/s]

Inference:   0%|          | 0/53 [00:00<?, ?it/s]

0)   {'id_sentence': 5, 'id_word': [8], 'token_word': 'crew', 'new_word': ['Mighty Nein'], 'info': {'episode': 34, 'id_sentence': 5, 'arc': 2, 'campaign': 2}}
1)   {'id_sentence': 11, 'id_word': [1], 'token_word': 'group', 'new_word': ['Mighty Nein'], 'info': {'episode': 34, 'id_sentence': 11, 'arc': 2, 'campaign': 2}}
2)   {'id_sentence': 14, 'id_word': [26], 'token_word': 'team', 'new_word': ['Mighty Nein'], 'info': {'episode': 34, 'id_sentence': 14, 'arc': 2, 'campaign': 2}}
3)   {'id_sentence': 42, 'id_word': [9], 'token_word': 'Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 34, 'id_sentence': 42, 'arc': 2, 'campaign': 2}}
4)   {'id_sentence': 46, 'id_word': [1], 'token_word': 'Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 34, 'id_sentence': 46, 'arc': 2, 'campaign': 2}}
**********


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Inference:   0%|          | 0/1 [00:00<?, ?it/s]

0)   {'id_sentence': 0, 'id_word': [13], 'new_word': ['Jester', 'Lavorre'], 'info': {'episode': 34, 'id_sentence': 0, 'arc': 2, 'campaign': 2}}
1)   {'id_sentence': 2, 'id_word': [0], 'new_word': ['Orly', 'Skiffback'], 'info': {'episode': 34, 'id_sentence': 2, 'arc': 2, 'campaign': 2}}
2)   {'id_sentence': 2, 'id_word': [7], 'new_word': ['Orly', 'Skiffback'], 'info': {'episode': 34, 'id_sentence': 2, 'arc': 2, 'campaign': 2}}
3)   {'id_sentence': 4, 'id_word': [17], 'new_word': ['another', 'customer', 'in', 'the', 'bar'], 'info': {'episode': 34, 'id_sentence': 4, 'arc': 2, 'campaign': 2}}
4)   {'id_sentence': 5, 'id_word': [0], 'new_word': ['another', 'customer', 'in', 'the', 'bar'], 'info': {'episode': 34, 'id_sentence': 5, 'arc': 2, 'campaign': 2}}
5)   {'id_sentence': 5, 'id_word': [11], 'new_word': ['another', 'customer', 'in', 'the', 'bar'], 'info': {'episode': 34, 'id_sentence': 5, 'arc': 2, 'campaign': 2}}
6)   {'id_sentence': 6, 'id_word': [0], 'new_word': ['another', 'customer

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Inference:   0%|          | 0/64 [00:00<?, ?it/s]

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Inference:   0%|          | 0/64 [00:00<?, ?it/s]

Map:   0%|          | 0/26 [00:00<?, ? examples/s]

Inference:   0%|          | 0/26 [00:00<?, ?it/s]

0)   {'id_sentence': 6, 'id_word': [8, 9], 'token_word': 'Mighty Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 35, 'id_sentence': 6, 'arc': 2, 'campaign': 2}}
1)   {'id_sentence': 10, 'id_word': [3, 4], 'token_word': 'Mighty Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 35, 'id_sentence': 10, 'arc': 2, 'campaign': 2}}
2)   {'id_sentence': 16, 'id_word': [1, 2], 'token_word': 'Mighty Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 35, 'id_sentence': 16, 'arc': 2, 'campaign': 2}}
3)   {'id_sentence': 24, 'id_word': [1, 2], 'token_word': 'Mighty Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 35, 'id_sentence': 24, 'arc': 2, 'campaign': 2}}
4)   {'id_sentence': 34, 'id_word': [14, 15], 'token_word': 'Mighty Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 35, 'id_sentence': 34, 'arc': 2, 'campaign': 2}}
5)   {'id_sentence': 37, 'id_word': [3, 4], 'token_word': 'Mighty Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 35, 'id_sentence': 37, 

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Inference:   0%|          | 0/1 [00:00<?, ?it/s]

0)   {'id_sentence': 0, 'id_word': [6], 'new_word': ["'s's"], 'info': {'episode': 35, 'id_sentence': 0, 'arc': 2, 'campaign': 2}}
1)   {'id_sentence': 1, 'id_word': [0], 'new_word': ['Beauregard', 'Lionett'], 'info': {'episode': 35, 'id_sentence': 1, 'arc': 2, 'campaign': 2}}
2)   {'id_sentence': 1, 'id_word': [3, 4], 'new_word': ["Fjord's"], 'info': {'episode': 35, 'id_sentence': 1, 'arc': 2, 'campaign': 2}}
3)   {'id_sentence': 1, 'id_word': [10, 11], 'new_word': ['Beauregard', 'Lionett', 'Fjord', "'s"], 'info': {'episode': 35, 'id_sentence': 1, 'arc': 2, 'campaign': 2}}
4)   {'id_sentence': 1, 'id_word': [15], 'new_word': ['Fjord'], 'info': {'episode': 35, 'id_sentence': 1, 'arc': 2, 'campaign': 2}}
5)   {'id_sentence': 2, 'id_word': [3], 'new_word': ["'s's"], 'info': {'episode': 35, 'id_sentence': 2, 'arc': 2, 'campaign': 2}}
6)   {'id_sentence': 2, 'id_word': [8], 'new_word': ['Algar', 'Dyomin', "'s", 'bracelet'], 'info': {'episode': 35, 'id_sentence': 2, 'arc': 2, 'campaign': 2}}

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Inference:   0%|          | 0/64 [00:00<?, ?it/s]

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Inference:   0%|          | 0/64 [00:00<?, ?it/s]

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Inference:   0%|          | 0/64 [00:00<?, ?it/s]

Map:   0%|          | 0/53 [00:00<?, ? examples/s]

Inference:   0%|          | 0/53 [00:00<?, ?it/s]

0)   {'id_sentence': 21, 'id_word': [21], 'token_word': 'crew', 'new_word': ['Mighty Nein'], 'info': {'episode': 36, 'id_sentence': 21, 'arc': 2, 'campaign': 2}}
1)   {'id_sentence': 29, 'id_word': [1], 'token_word': 'crew', 'new_word': ['Mighty Nein'], 'info': {'episode': 36, 'id_sentence': 29, 'arc': 2, 'campaign': 2}}
2)   {'id_sentence': 43, 'id_word': [8, 8], 'token_word': 'Mighty Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 36, 'id_sentence': 43, 'arc': 2, 'campaign': 2}}
3)   {'id_sentence': 56, 'id_word': [3], 'token_word': 'group', 'new_word': ['Mighty Nein'], 'info': {'episode': 36, 'id_sentence': 56, 'arc': 2, 'campaign': 2}}
4)   {'id_sentence': 61, 'id_word': [5, 6], 'token_word': 'Mighty Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 36, 'id_sentence': 61, 'arc': 2, 'campaign': 2}}
5)   {'id_sentence': 72, 'id_word': [6], 'token_word': 'crew', 'new_word': ['Mighty Nein'], 'info': {'episode': 36, 'id_sentence': 72, 'arc': 2, 'campaign': 2}}
6)   {'id_se

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Inference:   0%|          | 0/1 [00:00<?, ?it/s]

0)   {'id_sentence': 3, 'id_word': [4], 'new_word': ['Jester', 'Lavorre'], 'info': {'episode': 36, 'id_sentence': 3, 'arc': 2, 'campaign': 2}}
1)   {'id_sentence': 3, 'id_word': [15], 'new_word': ["'s's"], 'info': {'episode': 36, 'id_sentence': 3, 'arc': 2, 'campaign': 2}}
2)   {'id_sentence': 3, 'id_word': [18], 'new_word': ['Jester', 'Lavorre'], 'info': {'episode': 36, 'id_sentence': 3, 'arc': 2, 'campaign': 2}}
3)   {'id_sentence': 3, 'id_word': [23], 'new_word': ['a', 'guy', "'s's"], 'info': {'episode': 36, 'id_sentence': 3, 'arc': 2, 'campaign': 2}}
4)   {'id_sentence': 5, 'id_word': [22, 23, 24], 'new_word': ['a', 'guy', "'s"], 'info': {'episode': 36, 'id_sentence': 5, 'arc': 2, 'campaign': 2}}
5)   {'id_sentence': 6, 'id_word': [17, 18], 'new_word': ['The', 'Mist'], 'info': {'episode': 36, 'id_sentence': 6, 'arc': 2, 'campaign': 2}}
6)   {'id_sentence': 6, 'id_word': [20], 'new_word': ['Jester', 'Lavorre'], 'info': {'episode': 36, 'id_sentence': 6, 'arc': 2, 'campaign': 2}}
7)  

Map:   0%|          | 0/61 [00:00<?, ? examples/s]

Inference:   0%|          | 0/61 [00:00<?, ?it/s]

0)   {'id_sentence': 1, 'id_word': [1, 2], 'token_word': 'Mighty Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 37, 'id_sentence': 1, 'arc': 2, 'campaign': 2}}
1)   {'id_sentence': 3, 'id_word': [1], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 37, 'id_sentence': 3, 'arc': 2, 'campaign': 2}}
2)   {'id_sentence': 4, 'id_word': [37], 'token_word': 'crew', 'new_word': ['Mighty Nein'], 'info': {'episode': 37, 'id_sentence': 4, 'arc': 2, 'campaign': 2}}
3)   {'id_sentence': 7, 'id_word': [9], 'token_word': 'crew', 'new_word': ['Mighty Nein'], 'info': {'episode': 37, 'id_sentence': 7, 'arc': 2, 'campaign': 2}}
4)   {'id_sentence': 7, 'id_word': [17], 'token_word': 'Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 37, 'id_sentence': 7, 'arc': 2, 'campaign': 2}}
5)   {'id_sentence': 18, 'id_word': [4], 'token_word': 'team', 'new_word': ['Mighty Nein'], 'info': {'episode': 37, 'id_sentence': 18, 'arc': 2, 'campaign': 2}}
6)   {'id_sentence': 30, 'id_wo

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Inference:   0%|          | 0/1 [00:00<?, ?it/s]

0)   {'id_sentence': 3, 'id_word': [1], 'new_word': ['Mighty', 'Nein'], 'info': {'episode': 37, 'id_sentence': 3, 'arc': 2, 'campaign': 2}}
1)   {'id_sentence': 4, 'id_word': [4], 'new_word': ['The', 'Mighty', 'Nein'], 'info': {'episode': 37, 'id_sentence': 4, 'arc': 2, 'campaign': 2}}
2)   {'id_sentence': 4, 'id_word': [38], 'new_word': ['the', 'presumed', 'Javeed', 'Jawgrasp', '(', 'actually', ',', 'Fjord', 'in', 'disguise', ')'], 'info': {'episode': 37, 'id_sentence': 4, 'arc': 2, 'campaign': 2}}
3)   {'id_sentence': 4, 'id_word': [41], 'new_word': ['the', 'presumed', 'Javeed', 'Jawgrasp', '(', 'actually', ',', 'Fjord', 'in', 'disguise', ')'], 'info': {'episode': 37, 'id_sentence': 4, 'arc': 2, 'campaign': 2}}
4)   {'id_sentence': 6, 'id_word': [0], 'new_word': ['Avantika', 'who', 'introduces', 'herself', 'as', 'Captain', 'Avantika'], 'info': {'episode': 37, 'id_sentence': 6, 'arc': 2, 'campaign': 2}}
5)   {'id_sentence': 7, 'id_word': [0], 'new_word': ['Avantika', 'who', 'introduce

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Inference:   0%|          | 0/64 [00:00<?, ?it/s]

Map:   0%|          | 0/17 [00:00<?, ? examples/s]

Inference:   0%|          | 0/17 [00:00<?, ?it/s]

0)   {'id_sentence': 0, 'id_word': [1, 2], 'token_word': 'Mighty Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 38, 'id_sentence': 0, 'arc': 2, 'campaign': 2}}
1)   {'id_sentence': 3, 'id_word': [9], 'token_word': 'Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 38, 'id_sentence': 3, 'arc': 2, 'campaign': 2}}
2)   {'id_sentence': 9, 'id_word': [11], 'token_word': 'group', 'new_word': ['Mighty Nein'], 'info': {'episode': 38, 'id_sentence': 9, 'arc': 2, 'campaign': 2}}
3)   {'id_sentence': 10, 'id_word': [11], 'token_word': 'group', 'new_word': ['Mighty Nein'], 'info': {'episode': 38, 'id_sentence': 10, 'arc': 2, 'campaign': 2}}
4)   {'id_sentence': 11, 'id_word': [13], 'token_word': 'group', 'new_word': ['Mighty Nein'], 'info': {'episode': 38, 'id_sentence': 11, 'arc': 2, 'campaign': 2}}
5)   {'id_sentence': 15, 'id_word': [7], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 38, 'id_sentence': 15, 'arc': 2, 'campaign': 2}}
6)   {'id_sentence': 18

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Inference:   0%|          | 0/1 [00:00<?, ?it/s]

0)   {'id_sentence': 2, 'id_word': [1], 'new_word': ['an', 'abandoned', 'ruin'], 'info': {'episode': 38, 'id_sentence': 2, 'arc': 2, 'campaign': 2}}
1)   {'id_sentence': 2, 'id_word': [22], 'new_word': ['a'], 'info': {'episode': 38, 'id_sentence': 2, 'arc': 2, 'campaign': 2}}
2)   {'id_sentence': 3, 'id_word': [7], 'new_word': ['a', 'golden', 'snake', 'icon'], 'info': {'episode': 38, 'id_sentence': 3, 'arc': 2, 'campaign': 2}}
3)   {'id_sentence': 3, 'id_word': [9], 'new_word': ['The', 'Mighty'], 'info': {'episode': 38, 'id_sentence': 3, 'arc': 2, 'campaign': 2}}
4)   {'id_sentence': 3, 'id_word': [32], 'new_word': ['Yasha', 'Nydoorin'], 'info': {'episode': 38, 'id_sentence': 3, 'arc': 2, 'campaign': 2}}
5)   {'id_sentence': 3, 'id_word': [34], 'new_word': ['three', 'large'], 'info': {'episode': 38, 'id_sentence': 3, 'arc': 2, 'campaign': 2}}
6)   {'id_sentence': 4, 'id_word': [4], 'new_word': ['the', 'group'], 'info': {'episode': 38, 'id_sentence': 4, 'arc': 2, 'campaign': 2}}
7)   {'

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Inference:   0%|          | 0/64 [00:00<?, ?it/s]

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

Inference:   0%|          | 0/6 [00:00<?, ?it/s]

0)   {'id_sentence': 0, 'id_word': [1], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 39, 'id_sentence': 0, 'arc': 2, 'campaign': 2}}
1)   {'id_sentence': 4, 'id_word': [1], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 39, 'id_sentence': 4, 'arc': 2, 'campaign': 2}}
2)   {'id_sentence': 30, 'id_word': [1], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 39, 'id_sentence': 30, 'arc': 2, 'campaign': 2}}
3)   {'id_sentence': 49, 'id_word': [1], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 39, 'id_sentence': 49, 'arc': 2, 'campaign': 2}}
4)   {'id_sentence': 69, 'id_word': [1], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 39, 'id_sentence': 69, 'arc': 2, 'campaign': 2}}
**********


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Inference:   0%|          | 0/1 [00:00<?, ?it/s]

0)   {'id_sentence': 1, 'id_word': [11, 12], 'new_word': ['The', 'whisperer'], 'info': {'episode': 39, 'id_sentence': 1, 'arc': 2, 'campaign': 2}}
1)   {'id_sentence': 2, 'id_word': [34, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44], 'new_word': ['Beauregard', 'Lionett,'], 'info': {'episode': 39, 'id_sentence': 2, 'arc': 2, 'campaign': 2}}
2)   {'id_sentence': 2, 'id_word': [34, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44], 'new_word': ['Nydoorin'], 'info': {'episode': 39, 'id_sentence': 2, 'arc': 2, 'campaign': 2}}
3)   {'id_sentence': 5, 'id_word': [0], 'new_word': ['The', 'party', ',', 'including', 'Avantika', 'and', 'Jamedi', 'Cosko'], 'info': {'episode': 39, 'id_sentence': 5, 'arc': 2, 'campaign': 2}}
4)   {'id_sentence': 7, 'id_word': [13], 'new_word': ['The', 'stonework', 'and', 'iconography'], 'info': {'episode': 39, 'id_sentence': 7, 'arc': 2, 'campaign': 2}}
5)   {'id_sentence': 8, 'id_word': [0], 'new_word': ['The', 'party', ',', 'including', 'Avantika', 'and', 

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Inference:   0%|          | 0/64 [00:00<?, ?it/s]

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Inference:   0%|          | 0/64 [00:00<?, ?it/s]

Map:   0%|          | 0/24 [00:00<?, ? examples/s]

Inference:   0%|          | 0/24 [00:00<?, ?it/s]

0)   {'id_sentence': 0, 'id_word': [1, 2], 'token_word': 'Mighty Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 40, 'id_sentence': 0, 'arc': 2, 'campaign': 2}}
1)   {'id_sentence': 24, 'id_word': [1], 'token_word': 'group', 'new_word': ['Mighty Nein'], 'info': {'episode': 40, 'id_sentence': 24, 'arc': 2, 'campaign': 2}}
2)   {'id_sentence': 40, 'id_word': [4], 'token_word': 'group', 'new_word': ['Mighty Nein'], 'info': {'episode': 40, 'id_sentence': 40, 'arc': 2, 'campaign': 2}}
3)   {'id_sentence': 50, 'id_word': [1], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 40, 'id_sentence': 50, 'arc': 2, 'campaign': 2}}
4)   {'id_sentence': 54, 'id_word': [10], 'token_word': 'group', 'new_word': ['Mighty Nein'], 'info': {'episode': 40, 'id_sentence': 54, 'arc': 2, 'campaign': 2}}
5)   {'id_sentence': 57, 'id_word': [3], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 40, 'id_sentence': 57, 'arc': 2, 'campaign': 2}}
6)   {'id_sentence':

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Inference:   0%|          | 0/1 [00:00<?, ?it/s]

0)   {'id_sentence': 1, 'id_word': [12], 'new_word': ['Jamedi', 'Cosko'], 'info': {'episode': 40, 'id_sentence': 1, 'arc': 2, 'campaign': 2}}
1)   {'id_sentence': 2, 'id_word': [4], 'new_word': ['Jamedi', 'Cosko'], 'info': {'episode': 40, 'id_sentence': 2, 'arc': 2, 'campaign': 2}}
2)   {'id_sentence': 2, 'id_word': [13], 'new_word': ['Jamedi', 'Cosko'], 'info': {'episode': 40, 'id_sentence': 2, 'arc': 2, 'campaign': 2}}
3)   {'id_sentence': 5, 'id_word': [0, 1], 'new_word': ['the'], 'info': {'episode': 40, 'id_sentence': 5, 'arc': 2, 'campaign': 2}}
4)   {'id_sentence': 6, 'id_word': [11], 'new_word': ['the', 'three', 'pillars', 'that', 'hold', 'up', 'the', 'room'], 'info': {'episode': 40, 'id_sentence': 6, 'arc': 2, 'campaign': 2}}
5)   {'id_sentence': 8, 'id_word': [2], 'new_word': ['One'], 'info': {'episode': 40, 'id_sentence': 8, 'arc': 2, 'campaign': 2}}
6)   {'id_sentence': 9, 'id_word': [0], 'new_word': ['a', 'giant'], 'info': {'episode': 40, 'id_sentence': 9, 'arc': 2, 'campai

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Inference:   0%|          | 0/64 [00:00<?, ?it/s]

Map:   0%|          | 0/12 [00:00<?, ? examples/s]

Inference:   0%|          | 0/12 [00:00<?, ?it/s]

0)   {'id_sentence': 12, 'id_word': [1], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 41, 'id_sentence': 12, 'arc': 2, 'campaign': 2}}
1)   {'id_sentence': 22, 'id_word': [15], 'token_word': 'crew', 'new_word': ['Mighty Nein'], 'info': {'episode': 41, 'id_sentence': 22, 'arc': 2, 'campaign': 2}}
2)   {'id_sentence': 24, 'id_word': [8], 'token_word': 'crew', 'new_word': ['Mighty Nein'], 'info': {'episode': 41, 'id_sentence': 24, 'arc': 2, 'campaign': 2}}
3)   {'id_sentence': 28, 'id_word': [4], 'token_word': 'crew', 'new_word': ['Mighty Nein'], 'info': {'episode': 41, 'id_sentence': 28, 'arc': 2, 'campaign': 2}}
4)   {'id_sentence': 36, 'id_word': [12], 'token_word': 'crew', 'new_word': ['Mighty Nein'], 'info': {'episode': 41, 'id_sentence': 36, 'arc': 2, 'campaign': 2}}
5)   {'id_sentence': 37, 'id_word': [11], 'token_word': 'Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 41, 'id_sentence': 37, 'arc': 2, 'campaign': 2}}
6)   {'id_sentence': 49, 'id_w

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Inference:   0%|          | 0/1 [00:00<?, ?it/s]

0)   {'id_sentence': 0, 'id_word': [43], 'new_word': ['Caduceus', 'Clay'], 'info': {'episode': 41, 'id_sentence': 0, 'arc': 2, 'campaign': 2}}
1)   {'id_sentence': 1, 'id_word': [5], 'new_word': ['Caleb', 'Widogast'], 'info': {'episode': 41, 'id_sentence': 1, 'arc': 2, 'campaign': 2}}
2)   {'id_sentence': 2, 'id_word': [0], 'new_word': ['Caduceus', 'Clay'], 'info': {'episode': 41, 'id_sentence': 2, 'arc': 2, 'campaign': 2}}
3)   {'id_sentence': 2, 'id_word': [6], 'new_word': ['Caduceus', 'Clay'], 'info': {'episode': 41, 'id_sentence': 2, 'arc': 2, 'campaign': 2}}
4)   {'id_sentence': 2, 'id_word': [17, 18], 'new_word': ['that', 'point'], 'info': {'episode': 41, 'id_sentence': 2, 'arc': 2, 'campaign': 2}}
5)   {'id_sentence': 2, 'id_word': [23], 'new_word': ['Caduceus', 'Clay'], 'info': {'episode': 41, 'id_sentence': 2, 'arc': 2, 'campaign': 2}}
6)   {'id_sentence': 2, 'id_word': [28], 'new_word': ['Caduceus', 'Clay'], 'info': {'episode': 41, 'id_sentence': 2, 'arc': 2, 'campaign': 2}}


Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Inference:   0%|          | 0/64 [00:00<?, ?it/s]

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

Inference:   0%|          | 0/11 [00:00<?, ?it/s]

0)   {'id_sentence': 0, 'id_word': [1, 2], 'token_word': 'Mighty Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 42, 'id_sentence': 0, 'arc': 2, 'campaign': 2}}
1)   {'id_sentence': 4, 'id_word': [4], 'token_word': 'crew', 'new_word': ['Mighty Nein'], 'info': {'episode': 42, 'id_sentence': 4, 'arc': 2, 'campaign': 2}}
2)   {'id_sentence': 19, 'id_word': [1], 'token_word': 'group', 'new_word': ['Mighty Nein'], 'info': {'episode': 42, 'id_sentence': 19, 'arc': 2, 'campaign': 2}}
3)   {'id_sentence': 20, 'id_word': [11], 'token_word': 'crew', 'new_word': ['Mighty Nein'], 'info': {'episode': 42, 'id_sentence': 20, 'arc': 2, 'campaign': 2}}
4)   {'id_sentence': 54, 'id_word': [1], 'token_word': 'Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 42, 'id_sentence': 54, 'arc': 2, 'campaign': 2}}
5)   {'id_sentence': 70, 'id_word': [34], 'token_word': 'crew', 'new_word': ['Mighty Nein'], 'info': {'episode': 42, 'id_sentence': 70, 'arc': 2, 'campaign': 2}}
**********


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Inference:   0%|          | 0/1 [00:00<?, ?it/s]

0)   {'id_sentence': 0, 'id_word': [14], 'new_word': ['Avantika'], 'info': {'episode': 42, 'id_sentence': 0, 'arc': 2, 'campaign': 2}}
1)   {'id_sentence': 0, 'id_word': [23], 'new_word': ['Avantika'], 'info': {'episode': 42, 'id_sentence': 0, 'arc': 2, 'campaign': 2}}
2)   {'id_sentence': 1, 'id_word': [16], 'new_word': ['the', 'amount', 'Avantika', 'gave', 'to', 'the', 'Plank', 'King'], 'info': {'episode': 42, 'id_sentence': 1, 'arc': 2, 'campaign': 2}}
3)   {'id_sentence': 2, 'id_word': [17], 'new_word': ['the', 'Plank', 'King'], 'info': {'episode': 42, 'id_sentence': 2, 'arc': 2, 'campaign': 2}}
4)   {'id_sentence': 3, 'id_word': [2], 'new_word': ['the', 'Plank', 'King'], 'info': {'episode': 42, 'id_sentence': 3, 'arc': 2, 'campaign': 2}}
5)   {'id_sentence': 3, 'id_word': [6], 'new_word': ['Avantika'], 'info': {'episode': 42, 'id_sentence': 3, 'arc': 2, 'campaign': 2}}
6)   {'id_sentence': 4, 'id_word': [2, 3], 'new_word': ["Avantika's"], 'info': {'episode': 42, 'id_sentence': 4, 

Map:   0%|          | 0/34 [00:00<?, ? examples/s]

Inference:   0%|          | 0/34 [00:00<?, ?it/s]

0)   {'id_sentence': 4, 'id_word': [18, 19], 'token_word': 'Mighty Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 43, 'id_sentence': 4, 'arc': 2, 'campaign': 2}}
1)   {'id_sentence': 7, 'id_word': [3], 'token_word': 'crew', 'new_word': ['Mighty Nein'], 'info': {'episode': 43, 'id_sentence': 7, 'arc': 2, 'campaign': 2}}
2)   {'id_sentence': 11, 'id_word': [16], 'token_word': 'crew', 'new_word': ['Mighty Nein'], 'info': {'episode': 43, 'id_sentence': 11, 'arc': 2, 'campaign': 2}}
3)   {'id_sentence': 12, 'id_word': [39], 'token_word': 'crew', 'new_word': ['Mighty Nein'], 'info': {'episode': 43, 'id_sentence': 12, 'arc': 2, 'campaign': 2}}
4)   {'id_sentence': 17, 'id_word': [15], 'token_word': 'Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 43, 'id_sentence': 17, 'arc': 2, 'campaign': 2}}
5)   {'id_sentence': 21, 'id_word': [8, 9], 'token_word': 'Mighty Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 43, 'id_sentence': 21, 'arc': 2, 'campaign': 2}}
6)   {'id_sen

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Inference:   0%|          | 0/1 [00:00<?, ?it/s]

0)   {'id_sentence': 1, 'id_word': [5], 'new_word': ['Beauregard', 'Lionett'], 'info': {'episode': 43, 'id_sentence': 1, 'arc': 2, 'campaign': 2}}
1)   {'id_sentence': 1, 'id_word': [7, 7], 'new_word': ['Beauregard', "Lionett's"], 'info': {'episode': 43, 'id_sentence': 1, 'arc': 2, 'campaign': 2}}
2)   {'id_sentence': 1, 'id_word': [11], 'new_word': ['Caleb', 'Widogast'], 'info': {'episode': 43, 'id_sentence': 1, 'arc': 2, 'campaign': 2}}
3)   {'id_sentence': 3, 'id_word': [0], 'new_word': ['Beauregard', 'Lionett'], 'info': {'episode': 43, 'id_sentence': 3, 'arc': 2, 'campaign': 2}}
4)   {'id_sentence': 4, 'id_word': [2], 'new_word': ['herself', 'and', 'Beauregard', 'Lionett'], 'info': {'episode': 43, 'id_sentence': 4, 'arc': 2, 'campaign': 2}}
5)   {'id_sentence': 4, 'id_word': [10], 'new_word': ['herself', 'and', 'Beauregard', "Lionett's"], 'info': {'episode': 43, 'id_sentence': 4, 'arc': 2, 'campaign': 2}}
6)   {'id_sentence': 5, 'id_word': [2], 'new_word': ['Vera'], 'info': {'episo

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Inference:   0%|          | 0/64 [00:00<?, ?it/s]

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Inference:   0%|          | 0/64 [00:00<?, ?it/s]

Map:   0%|          | 0/19 [00:00<?, ? examples/s]

Inference:   0%|          | 0/19 [00:00<?, ?it/s]

0)   {'id_sentence': 0, 'id_word': [1, 2], 'token_word': 'Mighty Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 44, 'id_sentence': 0, 'arc': 2, 'campaign': 2}}
1)   {'id_sentence': 2, 'id_word': [3], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 44, 'id_sentence': 2, 'arc': 2, 'campaign': 2}}
2)   {'id_sentence': 13, 'id_word': [8], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 44, 'id_sentence': 13, 'arc': 2, 'campaign': 2}}
3)   {'id_sentence': 19, 'id_word': [14], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 44, 'id_sentence': 19, 'arc': 2, 'campaign': 2}}
4)   {'id_sentence': 21, 'id_word': [4], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 44, 'id_sentence': 21, 'arc': 2, 'campaign': 2}}
5)   {'id_sentence': 22, 'id_word': [9, 10], 'token_word': 'Mighty Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 44, 'id_sentence': 22, 'arc': 2, 'campaign': 2}}
6)   {'id_se

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Inference:   0%|          | 0/1 [00:00<?, ?it/s]

0)   {'id_sentence': 1, 'id_word': [0], 'new_word': ['The', 'Mighty', 'Nein'], 'info': {'episode': 44, 'id_sentence': 1, 'arc': 2, 'campaign': 2}}
1)   {'id_sentence': 2, 'id_word': [2, 3], 'new_word': ['The', 'Mighty', 'Nein'], 'info': {'episode': 44, 'id_sentence': 2, 'arc': 2, 'campaign': 2}}
2)   {'id_sentence': 2, 'id_word': [15], 'new_word': ['Fjord'], 'info': {'episode': 44, 'id_sentence': 2, 'arc': 2, 'campaign': 2}}
3)   {'id_sentence': 3, 'id_word': [9], 'new_word': ['Sabian'], 'info': {'episode': 44, 'id_sentence': 3, 'arc': 2, 'campaign': 2}}
4)   {'id_sentence': 4, 'id_word': [5, 6], 'new_word': ["Fjord's"], 'info': {'episode': 44, 'id_sentence': 4, 'arc': 2, 'campaign': 2}}
5)   {'id_sentence': 4, 'id_word': [14], 'new_word': ['Sabian'], 'info': {'episode': 44, 'id_sentence': 4, 'arc': 2, 'campaign': 2}}
6)   {'id_sentence': 7, 'id_word': [12], 'new_word': ['Frumpkin', 'in', 'octopus', 'form'], 'info': {'episode': 44, 'id_sentence': 7, 'arc': 2, 'campaign': 2}}
7)   {'id_

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Inference:   0%|          | 0/64 [00:00<?, ?it/s]

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Inference:   0%|          | 0/64 [00:00<?, ?it/s]

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Inference:   0%|          | 0/64 [00:00<?, ?it/s]

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Inference:   0%|          | 0/64 [00:00<?, ?it/s]

Map:   0%|          | 0/53 [00:00<?, ? examples/s]

Inference:   0%|          | 0/53 [00:00<?, ?it/s]

0)   {'id_sentence': 1, 'id_word': [12], 'token_word': 'crew', 'new_word': ['Mighty Nein'], 'info': {'episode': 45, 'id_sentence': 1, 'arc': 2, 'campaign': 2}}
1)   {'id_sentence': 5, 'id_word': [4, 5], 'token_word': 'Mighty Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 45, 'id_sentence': 5, 'arc': 2, 'campaign': 2}}
2)   {'id_sentence': 7, 'id_word': [10, 11], 'token_word': 'Mighty Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 45, 'id_sentence': 7, 'arc': 2, 'campaign': 2}}
3)   {'id_sentence': 9, 'id_word': [12], 'token_word': 'group', 'new_word': ['Mighty Nein'], 'info': {'episode': 45, 'id_sentence': 9, 'arc': 2, 'campaign': 2}}
4)   {'id_sentence': 40, 'id_word': [22, 23], 'token_word': 'Mighty Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 45, 'id_sentence': 40, 'arc': 2, 'campaign': 2}}
5)   {'id_sentence': 46, 'id_word': [28, 29], 'token_word': 'Mighty Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 45, 'id_sentence': 46, 'arc': 2, 'campaign

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Inference:   0%|          | 0/1 [00:00<?, ?it/s]

0)   {'id_sentence': 1, 'id_word': [0, 1], 'new_word': ['Orly', 'Skiffback'], 'info': {'episode': 45, 'id_sentence': 1, 'arc': 2, 'campaign': 2}}
1)   {'id_sentence': 1, 'id_word': [11], 'new_word': ["Avantika's"], 'info': {'episode': 45, 'id_sentence': 1, 'arc': 2, 'campaign': 2}}
2)   {'id_sentence': 3, 'id_word': [0], 'new_word': ['Veth', 'Brenatto'], 'info': {'episode': 45, 'id_sentence': 3, 'arc': 2, 'campaign': 2}}
3)   {'id_sentence': 6, 'id_word': [6], 'new_word': ['The', 'rest', 'of', 'the', 'Mighty', 'Nein'], 'info': {'episode': 45, 'id_sentence': 6, 'arc': 2, 'campaign': 2}}
4)   {'id_sentence': 6, 'id_word': [16], 'new_word': ['Veth', "Brenatto's"], 'info': {'episode': 45, 'id_sentence': 6, 'arc': 2, 'campaign': 2}}
5)   {'id_sentence': 8, 'id_word': [1], 'new_word': ['rest', 'of', 'the', 'Mighty', 'Nein'], 'info': {'episode': 45, 'id_sentence': 8, 'arc': 2, 'campaign': 2}}
6)   {'id_sentence': 8, 'id_word': [22], 'new_word': ['Fjord'], 'info': {'episode': 45, 'id_sentence'

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Inference:   0%|          | 0/64 [00:00<?, ?it/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Inference:   0%|          | 0/3 [00:00<?, ?it/s]

0)   {'id_sentence': 0, 'id_word': [1, 2], 'token_word': 'Mighty Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 46, 'id_sentence': 0, 'arc': 2, 'campaign': 2}}
1)   {'id_sentence': 7, 'id_word': [19], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 46, 'id_sentence': 7, 'arc': 2, 'campaign': 2}}
2)   {'id_sentence': 15, 'id_word': [1], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 46, 'id_sentence': 15, 'arc': 2, 'campaign': 2}}
3)   {'id_sentence': 58, 'id_word': [6], 'token_word': 'group', 'new_word': ['Mighty Nein'], 'info': {'episode': 46, 'id_sentence': 58, 'arc': 2, 'campaign': 2}}
4)   {'id_sentence': 60, 'id_word': [12], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 46, 'id_sentence': 60, 'arc': 2, 'campaign': 2}}
5)   {'id_sentence': 66, 'id_word': [1], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 46, 'id_sentence': 66, 'arc': 2, 'campaign': 2}}
**********


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Inference:   0%|          | 0/1 [00:00<?, ?it/s]

0)   {'id_sentence': 1, 'id_word': [0], 'new_word': ['the', 'party'], 'info': {'episode': 46, 'id_sentence': 1, 'arc': 2, 'campaign': 2}}
1)   {'id_sentence': 1, 'id_word': [20], 'new_word': ['Caduceus', 'Clay'], 'info': {'episode': 46, 'id_sentence': 1, 'arc': 2, 'campaign': 2}}
2)   {'id_sentence': 1, 'id_word': [25], 'new_word': ['the', "party's"], 'info': {'episode': 46, 'id_sentence': 1, 'arc': 2, 'campaign': 2}}
3)   {'id_sentence': 2, 'id_word': [0], 'new_word': ['Caduceus', 'Clay'], 'info': {'episode': 46, 'id_sentence': 2, 'arc': 2, 'campaign': 2}}
4)   {'id_sentence': 2, 'id_word': [3], 'new_word': ['the', 'party'], 'info': {'episode': 46, 'id_sentence': 2, 'arc': 2, 'campaign': 2}}
5)   {'id_sentence': 3, 'id_word': [0], 'new_word': ['the', 'party'], 'info': {'episode': 46, 'id_sentence': 3, 'arc': 2, 'campaign': 2}}
6)   {'id_sentence': 3, 'id_word': [5, 6], 'new_word': ['Bisaft', 'Isle'], 'info': {'episode': 46, 'id_sentence': 3, 'arc': 2, 'campaign': 2}}
7)   {'id_sentenc

Map:   0%|          | 0/35 [00:00<?, ? examples/s]

Inference:   0%|          | 0/35 [00:00<?, ?it/s]

0)   {'id_sentence': 0, 'id_word': [1, 2], 'token_word': 'Mighty Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 47, 'id_sentence': 0, 'arc': 2, 'campaign': 2}}
1)   {'id_sentence': 14, 'id_word': [14], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 47, 'id_sentence': 14, 'arc': 2, 'campaign': 2}}
2)   {'id_sentence': 20, 'id_word': [3], 'token_word': 'crew', 'new_word': ['Mighty Nein'], 'info': {'episode': 47, 'id_sentence': 20, 'arc': 2, 'campaign': 2}}
3)   {'id_sentence': 20, 'id_word': [10], 'token_word': 'Nein', 'new_word': ['Mighty Nein'], 'info': {'episode': 47, 'id_sentence': 20, 'arc': 2, 'campaign': 2}}
4)   {'id_sentence': 21, 'id_word': [1], 'token_word': 'crew', 'new_word': ['Mighty Nein'], 'info': {'episode': 47, 'id_sentence': 21, 'arc': 2, 'campaign': 2}}
5)   {'id_sentence': 23, 'id_word': [9], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 47, 'id_sentence': 23, 'arc': 2, 'campaign': 2}}
6)   {'id_sentence': 2

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Inference:   0%|          | 0/1 [00:00<?, ?it/s]

0)   {'id_sentence': 1, 'id_word': [5], 'new_word': ['The', 'Mighty', 'Nein'], 'info': {'episode': 47, 'id_sentence': 1, 'arc': 2, 'campaign': 2}}
1)   {'id_sentence': 3, 'id_word': [1], 'new_word': ['a', 'spherical'], 'info': {'episode': 47, 'id_sentence': 3, 'arc': 2, 'campaign': 2}}
2)   {'id_sentence': 4, 'id_word': [9], 'new_word': ['a', 'spherical'], 'info': {'episode': 47, 'id_sentence': 4, 'arc': 2, 'campaign': 2}}
3)   {'id_sentence': 5, 'id_word': [0], 'new_word': ['Beauregard', 'Lionett'], 'info': {'episode': 47, 'id_sentence': 5, 'arc': 2, 'campaign': 2}}
4)   {'id_sentence': 5, 'id_word': [2], 'new_word': ['Beauregard', "Lionett's"], 'info': {'episode': 47, 'id_sentence': 5, 'arc': 2, 'campaign': 2}}
5)   {'id_sentence': 5, 'id_word': [12], 'new_word': ['Beauregard', 'Lionett'], 'info': {'episode': 47, 'id_sentence': 5, 'arc': 2, 'campaign': 2}}
6)   {'id_sentence': 5, 'id_word': [19, 20], 'new_word': ["Uk'otoa"], 'info': {'episode': 47, 'id_sentence': 5, 'arc': 2, 'campai

Map:   0%|          | 0/51 [00:00<?, ? examples/s]

Inference:   0%|          | 0/51 [00:00<?, ?it/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Inference:   0%|          | 0/1 [00:00<?, ?it/s]

***Already inserted***
{'id_sentence': 0, 'id_word': [1], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 27, 'progressive': 2, 'id_sentence': 0, 'arc': 2, 'campaign': 2}}
{'id_sentence': 0, 'id_word': [0, 1], 'new_word': ['Vox', 'Machina'], 'info': {'episode': 27, 'progressive': 2, 'id_sentence': 0, 'arc': 2, 'campaign': 2}}
***Already inserted***
{'id_sentence': 0, 'id_word': [1], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 27, 'progressive': 3, 'id_sentence': 0, 'arc': 2, 'campaign': 2}}
{'id_sentence': 0, 'id_word': [0, 1], 'new_word': ['Vox', 'Machina'], 'info': {'episode': 27, 'progressive': 3, 'id_sentence': 0, 'arc': 2, 'campaign': 2}}
***Already inserted***
{'id_sentence': 0, 'id_word': [8], 'token_word': 'party', 'new_word': ['Mighty Nein'], 'info': {'episode': 27, 'progressive': 4, 'id_sentence': 0, 'arc': 2, 'campaign': 2}}
{'id_sentence': 0, 'id_word': [7, 8], 'new_word': ['Vox', 'Machina'], 'info': {'episode': 27, 'progr

## Substitutions Application

In [None]:
def flatten(lst):
    """Return a new flat list with every nested ``list`` in ``lst`` expanded.

    Only ``list`` instances are expanded (recursively); tuples, strings and
    any other objects are kept as single items, in their original order.
    """
    flat = []
    for entry in lst:
        flat += flatten(entry) if isinstance(entry, list) else [entry]
    return flat

In [None]:
import nltk

def reverse_look_for_char(new_word_str,ep_num,character_dictionary):
  """Find which known characters of an episode are mentioned in a string.

  ``new_word_str`` is lower-cased and tokenized with ``nltk.word_tokenize``.
  Every entry of ``character_dictionary[ep_num]`` is then searched for, first
  by its ``'name'`` and, only if the name is not found, by each of its
  ``'aka'`` aliases. Only the first occurrence of a given phrase is reported.

  Entries whose ``'aka'`` is exactly ``["Vox Machina"]`` are searched as the
  phrase "vox machina"; on a hit their ``'name'`` is split on commas and each
  piece is resolved through ``look_for_char_item``, every resolved item being
  reported with the same token indices.

  Returns a list of ``(indices, item)`` tuples, where ``indices`` is the list
  of consecutive token positions covered by the match.
  """
  tokens = nltk.word_tokenize(new_word_str.lower())

  def _first_span(needle):
    # Token positions of the first occurrence of `needle`, or None if absent.
    width = len(needle)
    for start in range(len(tokens) - width + 1):
      if tokens[start:start + width] == needle:
        return list(range(start, start + width))
    return None

  matches = []
  for item in character_dictionary[ep_num]:
    phrase = item['name'].lower()
    if item['aka'] == ["Vox Machina"]:
      phrase = "vox machina"

    span = _first_span(phrase.split())
    if span is not None:
      if phrase == "vox machina":
        # group entry: it has to be split further into its single members
        for member_name in item['name'].split(','):
          member = look_for_char_item(member_name, ep_num, character_dictionary)
          if member:
            matches.append((span, member))
      else:
        matches.append((span, item))
      # the name matched, so the aliases are not looked at
      continue

    for alias in item['aka']:
      span = _first_span(alias.lower().split())
      if span is not None:
        matches.append((span, item))

  return matches

In [None]:
def _link_characters_in_sub(sub, character_dictionary_list, debug):
  """Replace character mentions inside ``sub['new_word']`` with linked tuples.

  The replacement words are joined, re-tokenized with ``nltk.word_tokenize``
  and scanned with ``reverse_look_for_char``. For every match the matched
  tokens are removed and a ``(item['name'], item)`` tuple is inserted in their
  place. ``sub`` is modified in place, and only when at least one character
  was matched.
  """
  new_word = sub['new_word']
  info = sub['info']
  character_dict = character_dictionary_list[int(info['arc'])-1] # arc 1 -1 = pos 0 in list

  # episode key has the form "<campaign>x<episode>", single digits zero-padded
  episode = str(info['episode'])
  ep_num = str(info['campaign'])+'x'
  if len(episode)==1:
    ep_num+='0'
  ep_num+=episode

  try:
    new_word_str = ' '.join(new_word)
  except TypeError:
    # new_word contains nested lists: flatten them before joining
    new_word_str = ' '.join(flatten(new_word))
  new_word = nltk.word_tokenize(new_word_str)
  if debug: print("\n new word: " , new_word)
  reverse_match = reverse_look_for_char(new_word_str,ep_num,character_dict)
  if debug:print(f"reverse_match , {reverse_match}")

  # adjust the indices when more than one character appears in the same substitution
  prev_popped = 0
  popped_list = []
  add_ind = 0
  for indices_to_replace, item in reverse_match:
    popped = 0
    replacement_word = item['name']
    if indices_to_replace not in popped_list:
      # remove the matched tokens right-to-left so earlier positions stay valid
      for i in sorted(indices_to_replace, reverse=True):
        i_ = i-prev_popped
        if i_ not in popped_list:
          if i_ < len(new_word):new_word.pop(i_)
          else: print("[pop excepition]", info)
          popped_list.append(i_)
          popped+=1
      popped_list.append(indices_to_replace)
      add_ind = 0
    else:
      # same span matched again (e.g. several members of a group):
      # place this item right after the previously inserted one
      add_ind += 1
    if popped >0: popped-=1

    if (indices_to_replace[0]-prev_popped) != -1 :
      ind_ = indices_to_replace[0]-prev_popped
    else: ind_ = indices_to_replace[0]
    new_word.insert(ind_+add_ind, (replacement_word, item))

    prev_popped = popped

    sub['new_word'] = new_word
  if debug:
    for el in new_word:
      print(el)


def find_char_in_substitution(subs_,character_dictionary_list , debug = True, TL = False):
  """Tag the characters mentioned in the replacement words of substitutions.

  Args:
    subs_: when ``TL`` is False, a list of per-episode dicts, each holding its
      substitutions under ``'subs'``; when ``TL`` is True, a flat list of
      substitution dicts. Every substitution has ``'new_word'`` (list of
      words) and ``'info'`` (with ``'arc'``, ``'campaign'``, ``'episode'``).
    character_dictionary_list: per-arc character dictionaries, indexed by
      ``arc - 1``.
    debug: print intermediate values.
    TL: select the flat (timeline) input layout.

  Returns:
    A shallow copy of ``subs_``. NOTE: the substitution dicts themselves are
    shared with the input and are updated in place, so the caller's data is
    modified too. After the call, ``'new_word'`` of a substitution with at
    least one matched character is a list mixing plain tokens and
    ``(name, item)`` tuples.
  """
  subs = subs_.copy()
  if not TL:
    for ep in subs:
      sub_ep = ep['subs']
      if debug: print(sub_ep)
      for sub in sub_ep:
        _link_characters_in_sub(sub, character_dictionary_list, debug)
  else:
    for sub in subs:
      _link_characters_in_sub(sub, character_dictionary_list, debug)
  return subs
    

# ppr = loadJSON(PPR_PATH, "pprSUMM_C1A1")
# ppr_ = ppr.copy()

# subs_changed = find_char_in_substitution(ppr_,character_dictionary_list, debug = True)

In [None]:
def get_item_position_by_id(item_list, target_id):
    """Return the index of the first item whose ``'id'`` contains ``target_id``.

    Each item's ``'id'`` is a container of ids; membership (``in``) is used,
    not equality. Returns ``None`` when no item matches.
    """
    return next(
        (pos for pos, entry in enumerate(item_list) if target_id in entry['id']),
        None,
    )


### file update

In [None]:
def update_words(original_json, sent_id, word_ids, new_words, info, debug=False, Verbose = False):
    """
    Apply one substitution to a sentence of ``original_json``, in place.

    The i-th entry of ``new_words`` replaces the token of the element whose
    ``'id'`` list contains ``word_ids[i]``. When there are more new words than
    word ids, ``word_ids`` is padded with ``-1`` and every padded word is
    inserted as a brand-new element (``'id': [-1]``) after the last replaced
    one. When there are fewer new words than word ids, elements following the
    last replaced one are removed.

    Args:
    - original_json: dict with a ``'sentences'`` list; each sentence has an
      ``'id'`` and a list of ``'elements'`` (dicts with ``'id'``, ``'token'``,
      ``'label'``, ``'role'``, ``'link'``).
    - sent_id: id of the sentence to update.
    - word_ids: list of word ids to be replaced. NOTE: this list is padded
      in place with ``-1`` when ``new_words`` is longer.
    - new_words: list of replacement words; an entry may be a
      ``(word, item)`` tuple, in which case the element is also labelled
      ``"PERSON"`` and receives ``item['role']`` and ``item['link-u']``.
    - info: substitution metadata, only used in debug messages.
    - debug: print a step-by-step trace.
    - Verbose: print the sentence before and after the update.

    Returns:
    - original_json: the same object, updated.
    """
    old_word_id_TMP=0

    if debug: print(f" new_words_tuple : {new_words}")

    erase_flag = []
    if Verbose:
        # NOTE(review): the Verbose dumps index sentences by sent_id directly,
        # i.e. they assume the sentence id equals its position - confirm.
        sent = ""
        for i,e in enumerate(original_json['sentences'][sent_id]['elements']):
            common_elements = set(word_ids).intersection(set( e['id']))
            if common_elements:
                sent+="["+e['token']+"]"+" "
            else :
                sent+= e['token']+ " "

    if len(new_words) > len(word_ids): #padding
        for i in range(len(new_words)-len(word_ids)):
            word_ids+=[-1]
    elif len(new_words) < len(word_ids):
        if debug: print(f"THIS IS A CASE WHERE THE len(new_words) { len(new_words)} <= len(word_ids) { len(word_ids)}")
    if debug: print(f"[0.3.3]word_ids : {word_ids}")

    # loop through each word id and new word pair
    for i, new_word in enumerate(new_words):
        item=None
        if isinstance(new_word, tuple):
            item = new_word[1]
            new_word = new_word[0]

        old_word_id = word_ids[i]

        if debug: print(f"[0.4]i, (old_word_id, new_word)  {i, (old_word_id, new_word)}")
        # update the word text
        sent_pos = get_sentence_position_by_id(original_json['sentences'],sent_id)
        elements = original_json['sentences'][sent_pos]['elements']
        for element in elements:
            if len(erase_flag) >0:
                # the previous element absorbed several old words:
                # the elements that follow it have to be removed
                if debug: print(f"[0.8] erase_flag {erase_flag}")
                r_id = element["id"][0]
                for z in range(len(erase_flag)):
                    if debug: print(f"[0.8.1] r_id : {r_id}")
                    e = erase_flag.pop(0)
                    if debug: print(f"[0.8.2] e {e}")
                    if e:
                        remove_pos_id = get_item_position_by_id(elements,r_id)
                        removed_element = None
                        try:
                            removed_element = elements.pop(remove_pos_id)
                        except (TypeError, IndexError):
                            # no element carries r_id (position is None) or the
                            # position is out of range: nothing to remove
                            if debug:
                                print(f"[exception] {info} : sent_pos: {sent_pos},remove_pos_id {remove_pos_id} ")
                        r_id += 1

                        if debug: print(f"[0.8.3]removed_element: {removed_element} ")
                break
            if old_word_id == -1 :
                # padded id: add a new json obj instead of replacing one
                if item :
                    new_obj = {
                        'pos': [-1,-1],
                        'id': [-1],
                        'token': new_word,
                        'label': "PERSON",
                        'role': item['role'],
                        'link': item['link-u']
                    }
                else:
                    new_obj = {
                        'pos': [-1,-1],
                        'id': [-1],
                        'token': new_word,
                        'label': '',
                        'role': [],
                        'link': ''
                    }
                if debug : print(f"[0.7.] old_word_id_TMP : {old_word_id_TMP}")

                # insert into the sentence being scanned (sent_pos), i places
                # after the element holding the last replaced word id
                insert_pos_id = get_item_position_by_id(elements,old_word_id_TMP)
                if debug: print(f"[0.7.1]insert_pos_id+1, {insert_pos_id+i} ")
                slice_end = old_word_id_TMP+i+3
                if debug: print(f"[0.7.2] BEFORE UPGRADING  = {elements[old_word_id_TMP:slice_end]}")
                elements.insert(insert_pos_id+i,new_obj )
                if debug: print(f"[0.7.3] AFTER UPGRADING = {elements[old_word_id_TMP:slice_end]}")
                break
            if old_word_id in element['id']:
                if debug: print(f"[0.5] BEFORE UPGRADING =  {element['token']}")
                old_len_id = len(element['id'])
                if item:
                    # plain values, same shape as the newly inserted elements
                    # (a trailing comma here would store 1-tuples instead)
                    element['label'] = "PERSON"
                    element['role'] = item['role']
                    element['link'] = item['link-u']
                element['token'] = new_word
                if debug: print(f"[0.6] AFTER UPGRADING  =  {element['token']}")
                if debug: print(f"[0.6.1]old_word_id_TMP : {old_word_id_TMP} to  {old_word_id}")
                old_word_id_TMP  = old_word_id
                if i == len(new_words)-1:
                    if len(new_words) < len(word_ids):
                        if old_len_id > 1 :
                            if debug:print(f"[0.6.2]  old_len_id : {old_len_id}, i: {i+1}")

                        for h in range(len(word_ids) - len(new_words)- (old_len_id - (i+1))):
                            # covers the case where the replaced words span
                            # more tokens than the replacement provides:
                            # schedule the leftover elements for removal
                            erase_flag.append(True)
                        # keep scanning: the next element triggers the removal
                        continue
                break
    if Verbose:
        print(f"before[S_{sent_id}] {sent}")

        sent = ""
        for i,e in enumerate(original_json['sentences'][sent_id]['elements']):
            if e['id']==[-1]:
                sent+="["+e['token']+"]"+" "
            else :
                sent+= e['token']+ " "
        print(f"after [S_{sent_id}] {sent}")
        for i,e in enumerate(original_json['sentences'][sent_id]['elements']):
            print(e)
    return original_json


In [None]:
def get_sentence_position_by_id(item_list, target_id):
    """Return the position in `item_list` of the first item whose 'id' equals `target_id`.

    Parameters:
    item_list (list of dicts): items carrying an 'id' key.
    target_id: the id value to look for.

    Returns:
    int or None: index of the first matching item, None when no item matches.
    """
    matches = (pos for pos, entry in enumerate(item_list) if target_id == entry['id'])
    return next(matches, None)

In [None]:
def get_TL_json_files(json_files, ep_num, prog, debug=False):
  """Return the content of the timeline file matching an episode number and a progressive.

  Parameters:
  json_files (list): entries shaped ((episode_label, progressive), content); the episode number
                     is read after the last "x" of episode_label.
  ep_num (int): episode number to match.
  prog (int): progressive to match.
  debug (bool, optional): True to print every comparison. Defaults to False.

  Returns:
  The content of the first matching entry, None when nothing matches.
  """
  for key, content in json_files:
    episode_label, progressive = key
    if debug: print(f"[get_TL_json_files]... (ep,pr): {(episode_label,progressive)} - (ep_num,prog): {(ep_num,prog)}")
    same_episode = int(episode_label.split("x")[-1]) == ep_num
    if same_episode and int(progressive) == prog:
      return content
  return None
def get_from_ep_json_files(json_files, ep_num, debug = False):
  """Return the content of the per-episode file matching `ep_num`.

  Parameters:
  json_files (list): entries shaped (episode_label, content); the episode number is read
                     after the last "x" of episode_label.
  ep_num (int or str): episode to match, either a bare number or a label containing "x"
                       (only the part after the last "x" is used).
  debug (bool, optional): True to print every comparison. Defaults to False.

  Returns:
  The content of the first matching entry, None when nothing matches.
  """
  for episode_label, content in json_files:
    ep_i = int(episode_label.split("x")[-1])
    # normalised on every iteration; after the first pass this is an int -> int no-op
    ep_num = int(str(ep_num).split("x")[-1])
    if debug: print(f"[get_from_ep_json_files]... (ep_i): {ep_i} - (ep_num): {ep_num}")
    if ep_i != ep_num:
      continue
    return content
  return None

In [None]:
from nltk.parse.projectivedependencyparser import projective_prob_parse_demo
import json

def _apply_coref_substitution(original_json, sub, debug = False, verbose = False):
  """Apply one substitution record to `original_json` through `update_words` and return the result."""
  sent_id = sub['id_sentence']
  word_ids = sub['id_word']
  new_words_tuple = sub['new_word']
  if debug:
    sent_pos = get_sentence_position_by_id(original_json['sentences'], sent_id)
    print(f"[0.1.1]sent_id {sent_id} : sent_pos {sent_pos} -> len of  original_json['sentences'] = {len(original_json['sentences'])}")
    print(f"[0.2]word_ids {word_ids}")
    print(f"[0.3]new_words  {new_words_tuple}")
  updated_json = update_words(original_json, sent_id, word_ids, new_words_tuple,sub['info'], debug= debug , Verbose = verbose)
  if verbose: print("\n")
  return updated_json


def call_update_words(load_path,save_path, TL,json_files, arc_to_n = ["1","2","3"], campaign = 2):
  """Apply the coreference substitutions to the JSON files of `load_path` and save the results.

  Every ".json" file found in `load_path` is loaded with `loadJSON` and appended to `json_files`.
  The substitution files named `ppr_name + arc` (one per entry of `arc_to_n`) are loaded from
  `PPR_PATH`, passed through `find_char_in_substitution` and applied with `update_words`.
  Relies on the globals `ppr_name`, `PPR_PATH` and `character_dictionary_list`.

  Parameters:
  load_path (str): directory of the input JSON files; its second-to-last "/"-separated component
                   is used inside the output file names.
  save_path (str): directory handed to `saveJSON`.
  TL (bool): True for timeline files (looked up by episode and progressive, saved after every
             substitution), False for per-episode files (saved once per episode).
  json_files (list): list extended in place with the loaded files.
  arc_to_n (list of str, optional): suffixes appended to `ppr_name`; the list is only read.
  campaign (int, optional): not used by this function; kept so existing calls keep working.

  Returns:
  None

  Substitutions whose source JSON cannot be found are reported and skipped; previously they
  failed with a TypeError when the missing (None) file was subscripted.
  """
  for file_name in sorted(os.listdir(load_path)):
    if not file_name.endswith(".json"):
      continue
    name_parts = file_name.split("-")
    # timeline files are keyed by (episode, progressive), the others by episode only
    key = (name_parts[0], name_parts[1]) if TL else name_parts[0]
    json_files.append((key, loadJSON(load_path, file_name[:-5])))

  debug = False
  verbose = False

  # load the substitution files, one per arc
  ppr = []
  for a in arc_to_n:
    ppr += loadJSON(PPR_PATH, ppr_name + a)
  subs_changed = find_char_in_substitution(ppr.copy(), character_dictionary_list, debug = False, TL = TL)

  if not TL:
    for substitutions in subs_changed:
      ep_num = substitutions['ep']
      original_json = get_from_ep_json_files(json_files, ep_num, debug = False)
      if original_json is None:
        print(f"[call_update_words] no JSON file found for episode {ep_num}: substitutions skipped")
        continue

      for sub in substitutions['subs']:
        original_json = _apply_coref_substitution(original_json, sub, debug = debug, verbose = verbose)

      file_name = load_path.split('/')[-2]
      if int(ep_num.split("x")[-1])<10:
        json_file = f'{ep_num.split("x")[0]}x0{ep_num.split("x")[-1]}-{file_name}-coref-substitued'
      else:
        json_file = f'{ep_num}-{file_name}-coref-substitued'

      # write the updated JSON file
      saveJSON(original_json,json_file, save_path, download = False)
  else:
    prev_ep = None
    prev_prog = None
    prev_json = None
    for sub in subs_changed:
      ep_num = sub['info']["episode"]
      prog = sub['info']["progressive"]
      campaign_num = sub['info']["campaign"]
      if ep_num == prev_ep and prev_prog == prog:
        # consecutive substitutions on the same file keep working on the already updated content
        print("same ep_prog ...\n")
        original_json = prev_json
      else:
        original_json = get_TL_json_files(json_files, ep_num, prog, debug = False)
      if original_json is None:
        print(f"[call_update_words] no JSON file found for episode {ep_num}, progressive {prog}: substitution skipped")
        continue

      original_json = _apply_coref_substitution(original_json, sub, debug = debug, verbose = verbose)

      file_name = load_path.split('/')[-2]
      if int(ep_num)<10:
        json_file = f'{campaign_num}x0{ep_num}-{prog}-{file_name}-coref-substitued'
      else:
        json_file = f'{campaign_num}x{ep_num}-{prog}-{file_name}-coref-substitued'
      # write the updated JSON file
      saveJSON(original_json,json_file, save_path, download = False)

      prev_ep = ep_num
      prev_prog = prog
      prev_json = original_json

#### Sanity check

In [None]:
# execute twice to be sure
def check_id():
  """Adjust and check the ids of every JSON file in the global `save_path`, then re-check from disk.

  The files are loaded, passed to `adjust_ids` and `check_ids_in_jsons`, and then loaded
  a second time and checked again.
  """
  def _load_all(folder):
    # (content, file name without ".json") for every JSON file, in sorted name order
    return [(loadJSON(folder, name[:-5]), name[:-5])
            for name in sorted(os.listdir(folder))
            if name.endswith(".json")]

  path = save_path
  loaded = _load_all(path)
  adjust_ids(loaded, path)
  check_ids_in_jsons(loaded)

  print("---------------------")
  check_ids_in_jsons(_load_all(path))
  return

call on recap

In [None]:
# Recap configuration: choose the input/output folders and the substitution-file prefix
# for the selected campaign (globals read by call_update_words and check_id).
campaign = 2
json_files = []
if campaign == 1:
  load_path = RECAP_PATH
  save_path = RECAP_COREF_PATH
  ppr_name = "pprREC_C1A"
elif campaign == 2:
  load_path = RECAP_PATH_C2
  save_path = RECAP_COREF_PATH_C2
  ppr_name = "pprREC_C2A"
# recaps are handled as per-episode files, not as timeline files
TL = False

# call_update_words(load_path,save_path, TL,json_files,  arc_to_n = ["1","2"], campaign = 2) 
# check_id()
# check_id()

call on summaries

In [None]:
# Summaries configuration: choose the input/output folders and the substitution-file prefix
# for the selected campaign (globals read by call_update_words and check_id).
json_files = []
campaign = 2
# Set explicitly: this cell used to inherit TL from whichever cell ran last,
# so it ended up True whenever the timeline cell had been executed before it.
TL = False

if campaign == 2:
  load_path = SUM_PATH_C2
  save_path = SUM_COREF_PATH_C2
  ppr_name= "pprSUMM_C2A"
if campaign == 1:
  load_path = SUM_PATH
  save_path = SUM_COREF_PATH
  ppr_name= "pprSUMM_C1A"
# call_update_words(load_path,save_path, TL,json_files ,  arc_to_n = ["1","2"], campaign = campaign) 
# check_id()
# check_id()

call on timeline

In [None]:
# Timeline configuration: choose the input/output folders and the substitution-file prefix
# for the selected campaign (globals read by call_update_words and check_id).

# timeline files are looked up by (episode, progressive)
TL = True

campaign = 2
json_files = []

if campaign == 1:
  load_path = TL_PATH
  save_path = TL_COREF_PATH
  ppr_name = "pprTL_C1A"
elif campaign == 2:
  load_path = TL_PATH_C2
  save_path = TL_COREF_PATH_C2
  ppr_name = "pprTL_C2A"
# call_update_words(load_path,save_path, TL,json_files ,  arc_to_n = ["1","2"], campaign = campaign) 
# check_id()
# check_id()


In [None]:
import os
import shutil


def script_check_coref_files():
  """Give every timeline JSON in TL_PATH a counterpart in TL_COREF_PATH.

  For each "<episode>-<scene>-....json" file listed in TL_PATH:
  - if "<episode>-<scene>-timeline-coref-substitued.json" is not listed in TL_COREF_PATH,
    the timeline file is copied there under that name;
  - if "<episode>-<scene>-TL.json" is not listed in TL_PATH, the coref file is copied
    back to TL_PATH under that name.
  Both directory listings are taken once, before any copy is made.

  The previous extension test (`tl_file[:-5]==".json"`) compared the name WITHOUT its last
  five characters to ".json", so no real file ever passed it and nothing was copied.
  """
  tl_files = sorted(os.listdir(TL_PATH))
  tl_coref_files = sorted(os.listdir(TL_COREF_PATH))

  for tl_file in tl_files:
    if not tl_file.endswith(".json"):
      continue
    # the episode and scene are the first two "-"-separated fields of the file name
    name_parts = tl_file.split("-")
    if len(name_parts) < 2:
      continue
    episode, scene = name_parts[0], name_parts[1]

    # corresponding file name in the TL_COREF_PATH folder
    tl_coref_file = f"{episode}-{scene}-timeline-coref-substitued.json"
    print("before_ ", tl_coref_file)
    if tl_coref_file not in tl_coref_files:
      print("the file doesn't exist, create a copy in the TL_COREF_PATH folder: ",tl_coref_file)
      print( "os.path.join(str(TL_COREF_PATH), tl_coref_file)",  os.path.join(str(TL_COREF_PATH), tl_coref_file))
      shutil.copy(os.path.join(str(TL_PATH), tl_file), os.path.join(str(TL_COREF_PATH), tl_coref_file))
    print("after_ ",tl_coref_file)

    # corresponding file name in the TL_PATH folder
    tl_path_file = f"{episode}-{scene}-TL.json"
    if tl_path_file not in tl_files:
      shutil.copy(os.path.join(str(TL_COREF_PATH), tl_coref_file), os.path.join(str(TL_PATH), tl_path_file))

# Events

## Verbatlas

In [None]:
# curl -X 'POST' \
#     'https://verbatlas.org/api/model' \
#     -H 'accept: application/json' \
#     -H 'Content-Type: application/json' \
#     -d '[
#     {"text":"The quick brown fox jumps over the lazy dog.", "lang":"EN"},
#     {"text":"I walked along the river bank.", "lang":"EN"}
#     ]'
def get_verbatlas(s):
  """POST one sentence to the VerbAtlas model endpoint and return its annotation.

  Parameters:
  s (str): the sentence to annotate. Single and double quotes are replaced by spaces
           before sending, exactly as before.

  Returns:
  The first element of the JSON array answered by the service, or None when the answer is
  empty, is not valid JSON, or has no first element.

  The request is issued with `curl` through `subprocess.run` (argument list, no shell) and
  the body is produced by `json.dumps`, so backslashes or control characters in the
  sentence can no longer corrupt the payload or the command line.
  """
  cleaned = s.replace('"', ' ').replace("'", ' ')
  payload = json.dumps([{"text": cleaned, "lang": "EN"}])
  command = [
      "curl", "-s", "-X", "POST", "https://verbatlas.org/api/model",
      "-H", "accept: application/json",
      "-H", "Content-Type: application/json",
      "-d", payload,
  ]
  completed = subprocess.run(command, capture_output=True, text=True)
  response = completed.stdout.strip()
  if not response:
    # nothing came back (network failure, curl error, ...)
    return None

  try:
    return json.loads(response)[0]
  except (ValueError, LookupError, TypeError):
    # invalid JSON, or a payload that is not a non-empty array (e.g. an error object)
    return None

## Utilities for Event Extraction

retrieve-json

In [None]:
def find_arc(ep_num, c):
  """Return the 0-based index of the arc that contains episode `ep_num` in campaign `c`.

  Parameters:
  ep_num (int): episode number.
  c (int): campaign number, 1 or 2.

  Returns:
  int or None: position of the arc in the campaign's arc table, None when the episode
               falls outside every listed arc.

  Raises:
  ValueError: when `c` is not a known campaign (this used to surface as an
              UnboundLocalError on the arc table).
  """
  # inclusive (first_episode, last_episode) bounds of each arc, per campaign
  eps_arc_by_campaign = {
      1: [(1,23),(24,38),(39,83),(84,99),(100,115)],
      2: [(1,25),(26,47)],
  }
  if c not in eps_arc_by_campaign:
    raise ValueError(f"unknown campaign: {c!r} (expected 1 or 2)")

  for arc, (first_ep, last_ep) in enumerate(eps_arc_by_campaign[c]):
    if ep_num in range(first_ep, last_ep + 1):
      return arc
  return None

In [None]:

def retrieve_json(path, arcs, campaign):
  """Load every JSON file of `path` with loadJSON and group the contents by arc.

  Parameters:
  path (str): directory to scan; only names ending in ".json" are loaded, in sorted order.
  arcs (list): only its length is used: one bucket is created per entry.
  campaign (int): campaign number forwarded to find_arc.

  Returns:
  A list of lists: position i holds the loaded content of the files whose episode number
  (the part after "x" in the first "-"-separated field of the file name) maps to arc index i.
  """
  per_arc = [[] for _ in range(len(arcs))]
  for file in sorted(os.listdir(path)):
      if not file.endswith(".json"):
          continue
      content = loadJSON(path,str(file)[:-5])
      episode_label = str(file).split("-")[0]
      ep_num = int(episode_label.split("x")[1])
      # NOTE(review): find_arc returns None for an episode outside its table, and an index
      # can exceed len(arcs) - 1; both make the next line raise.
      per_arc[find_arc(ep_num, campaign)].append(content)

  return per_arc


 get token from index in TL json

In [None]:
def get_elem_from_id_in_TL_json(id,sentence):
  """Return the first element of `sentence` whose 'id' collection contains `id`.

  Parameters:
  id: the id to look for.
  sentence (list of dicts): elements of a sentence, each with an 'id' entry supporting `in`.

  Returns:
  dict or None: the first matching element, None when no element contains the id.
  """
  candidates = (element for element in sentence if id in element['id'])
  return next(candidates, None)


 check_role_in_event()

In [None]:

def check_role_in_event(event, roles, debug = True):
  """Tell whether the (character, role) tuples of event['characters'] carry at least two distinct roles.

  Parameters:
  event (dict): event with a 'characters' list of (character, role) tuples.
  roles: only shown in the debug print; it does not take part in the check.
  debug (bool, optional): True to print the inputs. Defaults to True.

  Returns:
  bool: True when two or more different roles appear, False otherwise (including no characters).
  """
  if debug: print(f"[1]check_role_in_event . . . event: {event} - roles: {roles}")
  distinct_roles = {pair[1] for pair in event['characters']}
  return len(distinct_roles) >= 2


 get_event

In [None]:
def get_event(el):
  """Return the 'token', 'role' and 'label' of `el` as a new dict when `el` has a non-empty 'link'.

  Returns:
  dict or None: None when el['link'] is the empty string.
  """
  if el["link"] == '':
    return None
  return {
      'token': el['token'],
      'role': el['role'],
      'label': el['label'],
  }

## Event Extraction

In [None]:
def get_event_from_characters(event, char_only = False):
    """
    Keep or discard an event depending on how many characters it involves.

    Parameters:
    event (dict) : the event, with a 'characters' list.
    char_only (bool) : when True the event is kept only if it has at least 2 characters;
                       when False it is always kept.

    Returns:
    dict or None : the same event when kept, None when discarded.
    """
    if not char_only:
        return event
    return event if len(event['characters']) >= 2 else None


def extract_events_from_verbatlas(verbatlas_annotations,verbatlas_tokens, s_elements, char_only = False):
    """
    Build one event per verbatlas annotation, listing the sentence elements covered by its roles.

    Parameters:
    verbatlas_annotations (list): annotation dicts with 'tokenIndex' and 'verbatlas'
                                  ('frameName' and 'roles', each role having 'role' and 'span').
    verbatlas_tokens (list): token dicts with an 'index' entry.
    s_elements (list): elements of the sentence, searched by token index.
    char_only (bool): when True only elements accepted by get_event are recorded (in the
                      reduced form it returns) and the event is filtered by
                      get_event_from_characters; when False every matched element is recorded.

    Returns:
    events (list): dicts {'characters': [(element, role), ...], 'event': frame name}.
    """
    collected = []
    for annotation in verbatlas_annotations:
        predicate_index = annotation['tokenIndex']  # read as in the original, not used further
        frame = annotation['verbatlas']
        frame_name = frame['frameName']
        frame_roles = frame['roles']
        candidate = {'characters': [], 'event': frame_name}
        participants = candidate['characters']

        for role_entry in frame_roles:
            role = role_entry['role']
            span = role_entry['span']
            for token in verbatlas_tokens[span[0]:span[1]]:
                element = get_elem_from_id_in_TL_json(token['index'], s_elements)
                if not element:
                    continue
                # in char_only mode the element is reduced by get_event (None when not linked)
                participant = get_event(element) if char_only else element
                if participant and (participant, role) not in participants:
                    participants.append((participant, role))

        kept = get_event_from_characters(candidate, char_only)
        if kept:
            collected.append(kept)
    return collected

def process_sentence(sentence, character_dict, debug=False, char_only = False):
    """
    Process a single sentence: record it for the characters it mentions and extract its events.

    Parameters:
    sentence (dict): the sentence to process; its 'elements' list is read, each element
                     carrying 'token' and 'link'.
    character_dict (dict): maps a character token to the list of sentence texts mentioning it;
                           updated in place.
    debug (bool, optional): True to print each (character, text) pair recorded. Defaults to False.
    char_only (bool, optional): forwarded to extract_events_from_verbatlas. Defaults to False.

    Returns:
    tuple: (character_dict, events), where events is the list returned by
           extract_events_from_verbatlas, or [] when get_verbatlas returned nothing.
    """
    s_elements = sentence['elements']

    # build the text of the sentence and collect the tokens of the linked elements
    text = ""
    characters_in_sentence = []
    for elem in s_elements:
        if text == "":
            try:
                text = elem['token'] + ' '
            except TypeError:
                # a non-string token at the start of the sentence is reported and left out
                print("Exception :  elem['token'] : ",  elem['token'], type( elem['token']))
        else:
            text += elem["token"] + ' '
        # an element with a non-empty link counts as a character of the sentence
        if elem['link'] != '' and elem["token"] not in characters_in_sentence:
            characters_in_sentence.append(elem["token"])

    # remove the last space from the text
    text = text[:-1]

    # add the sentence to character_dict for every character found in it
    for char in characters_in_sentence:
        character_dict.setdefault(char, []).append(text)
        if debug: print("[0]",char, " -> ",text)

    verbatlas_object = get_verbatlas(text)
    if not verbatlas_object:
        # no annotations available for this sentence
        return character_dict, []

    verbatlas_annotations = verbatlas_object['annotations']
    verbatlas_tokens = verbatlas_object['tokens']
    events = extract_events_from_verbatlas(verbatlas_annotations,verbatlas_tokens, s_elements, char_only)
    return character_dict, events or []


def build_TL_events_(path,save_path,json_file_name, arcs,campaign = 1, debug=False , CHAR_ONLY = False):
    """
    Extract the events of every sentence found in the JSON files of `path` and save them arc by arc.

    Parameters:
    path (str): directory of the input JSON files (read through retrieve_json).
    save_path (str): directory handed to saveJSON.
    json_file_name (str): base name of the output files; each one is prefixed with "arc_<n>-".
    arcs (list): forwarded to retrieve_json, which creates one group of files per entry.
    campaign (int, optional): forwarded to retrieve_json. Defaults to 1.
    debug (bool, optional): True to print the character dictionary before each sentence. Defaults to False.
    CHAR_ONLY (bool, optional): forwarded to process_sentence as char_only. Defaults to False.

    Returns:
    None
    """
    all_events = []
    chars_to_sentences = {}
    files_per_arc = retrieve_json(path, arcs = arcs, campaign = campaign)

    # arc numbers in the file names are 1-based positions, not the values listed in `arcs`
    for arc_number, arc_files in enumerate(tqdm(files_per_arc), start=1):
        for file_content in tqdm(arc_files):
            for sentence in tqdm(file_content['sentences']):
                if debug: print("character_dict", chars_to_sentences)
                chars_to_sentences, found = process_sentence(sentence, chars_to_sentences, debug=False,char_only = CHAR_ONLY)
                all_events += found

        arc_file_name = f"arc_{arc_number}-"+json_file_name
        # NOTE(review): both containers keep growing across arcs, so the files of arc n also
        # hold what was collected for the previous arcs - confirm this is intended.
        saveJSON(all_events, arc_file_name, path = save_path , download = False)
        saveJSON(chars_to_sentences, arc_file_name+'_char_dict', path = save_path , download = False)

### Launch & Save

In [None]:
# Launch: timeline events, campaign 2 (the call is kept disabled)

campaign = 2
# json_file_name = 'TL_events-json'
json_file_name_CHAR_ONLY = 'TL_coref_events_CHAR_ONLY-json'

# build_TL_events_(TL_COREF_PATH_C2,TL_events_PATH_C2, json_file_name_CHAR_ONLY,arcs = [5,6] ,campaign = campaign,  debug = False, CHAR_ONLY = True)

In [None]:
# Launch: recap events, campaign 2 (the call is kept disabled)

campaign = 2
recap_json_file_name_CHAR_ONLY = 'recap_coref_events_CHAR_ONLY-json'
recap_json_file_name = 'recap_events-json'


# build_TL_events_(RECAP_COREF_PATH_C2,TL_events_PATH_C2, recap_json_file_name_CHAR_ONLY,arcs = [5,6], campaign =campaign , debug = False, CHAR_ONLY = True)

In [None]:
# Launch: summary events, campaign 1

campaign = 1
# recap_json_file_name = 'recap_events-json'
recap_json_file_name_CHAR_ONLY = 'summ_coref_events_CHAR_ONLY-json'


build_TL_events_(
    path=SUM_COREF_PATH,
    save_path=TL_events_PATH,
    json_file_name=recap_json_file_name_CHAR_ONLY,
    arcs=[1,2,3],
    campaign=campaign,
    debug=False,
    CHAR_ONLY=True,
)

  0%|          | 0/3 [00:00<?, ?it/s]
  0%|          | 0/22 [00:00<?, ?it/s][A

  0%|          | 0/170 [00:00<?, ?it/s][A[A

  1%|          | 1/170 [01:03<2:57:29, 63.01s/it][A[A

  1%|          | 2/170 [01:31<2:00:13, 42.93s/it][A[A

  2%|▏         | 3/170 [01:48<1:25:59, 30.89s/it][A[A

  2%|▏         | 4/170 [02:12<1:18:20, 28.32s/it][A[A

  3%|▎         | 5/170 [03:19<1:55:58, 42.17s/it][A[A

  4%|▎         | 6/170 [03:20<1:16:46, 28.09s/it][A[A

  4%|▍         | 7/170 [03:21<52:02, 19.15s/it]  [A[A

  5%|▍         | 8/170 [03:22<36:41, 13.59s/it][A[A

  5%|▌         | 9/170 [03:24<26:07,  9.74s/it][A[A

  6%|▌         | 10/170 [04:04<50:54, 19.09s/it][A[A

  6%|▋         | 11/170 [04:09<39:28, 14.90s/it][A[A

  7%|▋         | 12/170 [04:10<28:13, 10.72s/it][A[A

  8%|▊         | 13/170 [04:12<21:01,  8.03s/it][A[A

  8%|▊         | 14/170 [04:13<15:10,  5.84s/it][A[A

  9%|▉         | 15/170 [04:13<11:02,  4.27s/it][A[A

  9%|▉         | 16/170 [04

- saving JSON file: arc_1-summ_coref_events_CHAR_ONLY-json.json, at: /content/drive/MyDrive/Colab Notebooks/NUANS/project_bisazza_casadei/data/TL_events/ ...





- saving JSON file: arc_1-summ_coref_events_CHAR_ONLY-json_char_dict.json, at: /content/drive/MyDrive/Colab Notebooks/NUANS/project_bisazza_casadei/data/TL_events/ ...


[1;30;43mStreaming output truncated to the last 5000 lines.[0m

 54%|█████▍    | 103/191 [05:30<01:18,  1.11it/s][A[A

 54%|█████▍    | 104/191 [05:31<01:25,  1.02it/s][A[A

 55%|█████▍    | 105/191 [05:32<01:29,  1.04s/it][A[A

 55%|█████▌    | 106/191 [05:34<01:41,  1.19s/it][A[A

 56%|█████▌    | 107/191 [05:35<01:29,  1.07s/it][A[A

 57%|█████▋    | 108/191 [05:36<01:28,  1.07s/it][A[A

 57%|█████▋    | 109/191 [05:37<01:33,  1.13s/it][A[A

 58%|█████▊    | 110/191 [05:38<01:25,  1.06s/it][A[A

 58%|█████▊    | 111/191 [05:39<01:29,  1.12s/it][A[A

 59%|█████▊    | 112/191 [05:40<01:17,  1.02it/s][A[A

 59%|█████▉    | 113/191 [05:41<01:18,  1.01s/it][A[A

 60%|█████▉    | 114/191 [05:43<01:32,  1.20s/it][A[A

 60%|██████    | 115/191 [05:44<01:23,  1.10s/it][A[A

 61%|██████    | 116/191 [05:46<01:55,  1.54s/it][A[A

 61%|██████▏   | 117/191 [05:47<01:50,  1.49s/it][A[A

 62%|██████▏   | 118/191 [05:50<02:19,  1.91s/it][A[A

 62%|██████▏   | 119/1

[{"tokens":[{"index":0,"rawText":"Solemnly"},{"index":1,"rawText":","},{"index":2,"rawText":"Tiberius"},{"index":3,"rawText":"Stormwind"},{"index":4,"rawText":"tells"},{"index":5,"rawText":"the"},{"index":6,"rawText":"rest"},{"index":7,"rawText":"of"},{"index":8,"rawText":"Keyleth"},{"index":9,"rawText":"Percival"},{"index":10,"rawText":"de"},{"index":11,"rawText":"Rolo"},{"index":12,"rawText":"that"},{"index":13,"rawText":"Scanlan"},{"index":14,"rawText":"Shorthalt"},{"index":15,"rawText":"Tiberius"},{"index":16,"rawText":"Stormwind"},{"index":17,"rawText":"Grog"},{"index":18,"rawText":"Strongjaw"},{"index":19,"rawText":"Pike"},{"index":20,"rawText":"Trickfoot"},{"index":21,"rawText":"Trinket"},{"index":22,"rawText":"Vax"},{"index":23,"rawText":"ildan"},{"index":24,"rawText":"Vex"},{"index":25,"rawText":"ahlia"},{"index":26,"rawText":"although"},{"index":27,"rawText":"Tiberius"},{"index":28,"rawText":"Stormwind"},{"index":29,"rawText":","},{"index":30,"rawText":"who"},{"index":31,"raw



 11%|█         | 20/179 [05:50<3:30:50, 79.56s/it][A[A

 12%|█▏        | 21/179 [05:54<2:29:41, 56.84s/it][A[A

 12%|█▏        | 22/179 [07:47<3:13:07, 73.81s/it][A[A

 13%|█▎        | 23/179 [07:50<2:16:51, 52.64s/it][A[A

 13%|█▎        | 24/179 [07:56<1:39:31, 38.52s/it][A[A

 14%|█▍        | 25/179 [08:00<1:12:30, 28.25s/it][A[A

 15%|█▍        | 26/179 [08:03<52:28, 20.58s/it]  [A[A

 15%|█▌        | 27/179 [08:06<38:54, 15.36s/it][A[A

 16%|█▌        | 28/179 [08:08<28:05, 11.16s/it][A[A

 16%|█▌        | 29/179 [08:10<21:18,  8.53s/it][A[A

 17%|█▋        | 30/179 [08:11<15:19,  6.17s/it][A[A

 17%|█▋        | 31/179 [08:12<11:58,  4.85s/it][A[A

 18%|█▊        | 32/179 [08:14<09:37,  3.93s/it][A[A

 18%|█▊        | 33/179 [08:18<09:09,  3.76s/it][A[A

 19%|█▉        | 34/179 [08:20<07:52,  3.26s/it][A[A

 20%|█▉        | 35/179 [09:22<50:21, 20.98s/it][A[A

 20%|██        | 36/179 [09:24<36:16, 15.22s/it][A[A

 21%|██        | 37/179 [09:26<2

- saving JSON file: arc_2-summ_coref_events_CHAR_ONLY-json.json, at: /content/drive/MyDrive/Colab Notebooks/NUANS/project_bisazza_casadei/data/TL_events/ ...



 67%|██████▋   | 2/3 [3:15:31<1:37:33, 5853.25s/it]

- saving JSON file: arc_2-summ_coref_events_CHAR_ONLY-json_char_dict.json, at: /content/drive/MyDrive/Colab Notebooks/NUANS/project_bisazza_casadei/data/TL_events/ ...


[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
 85%|████████▍ | 105/124 [06:19<00:31,  1.66s/it][A[A

 85%|████████▌ | 106/124 [06:20<00:26,  1.45s/it][A[A

 86%|████████▋ | 107/124 [06:22<00:27,  1.61s/it][A[A

 87%|████████▋ | 108/124 [06:24<00:26,  1.63s/it][A[A

 88%|████████▊ | 109/124 [06:25<00:25,  1.67s/it][A[A

 89%|████████▊ | 110/124 [06:27<00:24,  1.73s/it][A[A

 90%|████████▉ | 111/124 [06:29<00:22,  1.71s/it][A[A

 90%|█████████ | 112/124 [06:30<00:16,  1.40s/it][A[A

 91%|█████████ | 113/124 [06:30<00:13,  1.19s/it][A[A

 92%|█████████▏| 114/124 [06:31<00:10,  1.01s/it][A[A

 93%|█████████▎| 115/124 [06:32<00:08,  1.10it/s][A[A

 94%|█████████▎| 116/124 [06:37<00:17,  2.22s/it][A[A

 94%|█████████▍| 117/124 [06:39<00:15,  2.24s/it][A[A

 95%|█████████▌| 118/124 [06:41<00:11,  1.98s/it][A[A

 96%|█████████▌| 119/124 [06:42<00:09,  1.85s/it][A[A

 97%|█████████▋| 120/124 [06:43<00:06,  1.53s/it][A[A

 98%|█████████▊| 121/124

In [None]:
# Launch: summary events, campaign 2

campaign = 2
# recap_json_file_name = 'recap_events-json'
recap_json_file_name_CHAR_ONLY = 'summ_coref_events_CHAR_ONLY-json'
build_TL_events_(
    path=SUM_COREF_PATH_C2,
    save_path=TL_events_PATH_C2,
    json_file_name=recap_json_file_name_CHAR_ONLY,
    arcs=[5,6],
    campaign=campaign,
    debug=False,
    CHAR_ONLY=True,
)

# Named Entity Recognition

## Extract Named Entities

In [None]:
"""
ATTENTION, this module needs:
- The NLP model from the coreference resolution section
"""

# --- aux function n°1: filter the NER categories, for now we are only interested in Person Named Entities
def get_labels(doc):
  """Return the text of every entity of `doc.ents` labelled "PERSON", in document order."""
  return [span.text for span in doc.ents if span.label_ == "PERSON"]

# --- aux function n°2: extract the sentences data from the content of a json file
def get_textDictSent(sentences_dict):
  """Split the content of a sentences file into plain texts and element lists.

  Parameters:
  sentences_dict (dict): content with a 'sentences' list; every sentence has an 'elements'
                         list of dicts carrying a 'token'.

  Returns:
  (sentences_texts, sentences_file): for each sentence, its tokens joined by single spaces
  (stripped at both ends) and its original 'elements' list.
  """
  # list for dicts sentences
  sentences_file = [entry['elements'] for entry in sentences_dict['sentences']]

  # list for texts of sentences
  sentences_texts = [
      " ".join(word['token'] for word in elements).strip()
      for elements in sentences_file
  ]

  return sentences_texts, sentences_file

"""
  input a dict with the structure like a summary json file
"""
def NER(sentences_dict, verbose = True):
  """
    Flag, in place, the tokens of a summary-like dict that belong to a PERSON named entity.

    Input: a dict structured like a summary json file, i.e. sentences_dict['sentences'] is a
    list of sentences, each holding an 'elements' list of token dicts. For every token dict
    this function reads 'token' (the token text) and 'id' (a list of ids) and writes the
    boolean key 'NER_person'.

    Relies on two names defined elsewhere in the notebook: the pipeline `nlp` (its `pipe`
    method is called on the sentence texts) and the printing helper `print_list`.

    Args:
      sentences_dict: the summary-like dict described above.
      verbose:        when True, print each sentence, the PERSON entities found, the
                      annotated tokens and the total elapsed time; when False, print nothing.

    Returns:
      The same `sentences_dict` object that was passed in (its token dicts are updated in place).
  """

  # get text sentences and dict sentences
  sentences_texts, sentences_file = get_textDictSent(sentences_dict)

  # save the initial time at beginning of the task
  if verbose: startTime = time.time()

  # run the pipeline on every sentence before starting the annotation
  docs = list(nlp.pipe(sentences_texts))

  for i,(doc,sentence_file) in enumerate(zip(docs,sentences_file)):
    if verbose: print(doc.text)

    per_ents = get_labels(doc)

    # initialize all the attribute for the NER_person key as False
    for word_dict in sentence_file:
      word_dict['NER_person']= False

    if verbose:
      print(f"Found the Person NE in sentence n°{i}")
      print_list(per_ents)

    # case 1, no 'PERSON' labels found in the sentence -> get next sentence
    if not per_ents: continue

    # case 2, we have found named entities of type PERSON, mark the correct tokens

    skip_token = 0   # variable to avoid wrong assignment given the sequentiality nature of the operation
    for per_ent in per_ents:
      entity_text = per_ent.lower().strip()   # normalised once instead of on every token
      tmp_ids = []                            # ids of the tokens matched so far for this entity
      tmp_per_ent = entity_text               # part of the entity text that is still unmatched
      attention = False                       # True while the current token continues a match

      for word_file in sentence_file:
        # tokens whose first id is below the last marked one are not considered again
        if word_file['id'][0] < skip_token: continue

        token_text = word_file['token'].lower().strip()

        # NOTE(review): `in` is a substring test on the lowercased strings, so a token that is
        # only a fragment of the entity text also matches here - confirm this is intended
        if token_text in entity_text:
          # the token is (part of) the entity: collect its ids and consume its text
          tmp_ids = [*tmp_ids,*word_file['id']]
          tmp_per_ent = tmp_per_ent.replace(token_text, '').lower().strip()
          attention = True

        elif entity_text in token_text:
          # the whole entity is contained in a single token: consume it word by word
          tmp_ids = [*tmp_ids,*word_file['id']]
          for target in token_text.split():
            tmp_per_ent = tmp_per_ent.replace(target, '').lower().strip()
          attention = True

        else:
          attention = False

        # the entity text has been completely consumed -> stop scanning the sentence
        if tmp_per_ent == '':
          break

        # the match has been interrupted -> restart the search for this entity
        if not attention:
          tmp_ids = []
          tmp_per_ent = entity_text

      # flag every token that owns one of the collected ids and move the skip marker forward
      for tmp_id in tmp_ids:
        for word_file in sentence_file:
          if tmp_id in word_file['id']:
            word_file['NER_person'] = True
            skip_token = word_file['id'][0]

    # the token dump is debugging output, so it is printed only on request
    if verbose:
      print_list(sentence_file)
      print("\n--------\n")


  if verbose: print("Total time elapsed for the task: {} [s]".format((time.time() - startTime)))

  return sentences_dict


# example run: take the first entry (in sorted order) of PATH_SUMMARIES, load it and pass it to NER
name_file = sorted(os.listdir(PATH_SUMMARIES))[0]
print(name_file)
file_json = loadJSON(
  PATH_SUMMARIES,
  name_file,
  need_ext=False,
  verbose=False,
)
output = NER(file_json)

1x01-summary-json.json


Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Inference:   0%|          | 0/64 [00:00<?, ?it/s]

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

Inference:   0%|          | 0/64 [00:00<?, ?it/s]

Map:   0%|          | 0/42 [00:00<?, ? examples/s]

Inference:   0%|          | 0/42 [00:00<?, ?it/s]

Vox Machina have just entered the dwarven city of Kraghammer for the first time on their quest to find Kima of Vord .
Found the following Person NE in sentence n°0
0)   Vox Machina                   
1)   Kima                          
0)   {'pos': [0, 4], 'id': [0], 'token': 'Vox', 'label': '', 'role': [], 'link': '', 'NER_person': True}
1)   {'pos': [4, 12], 'id': [1], 'token': 'Machina', 'label': '', 'role': [], 'link': '', 'NER_person': True}
2)   {'pos': [12, 17], 'id': [2], 'token': 'have', 'label': '', 'role': [], 'link': '', 'NER_person': False}
3)   {'pos': [17, 22], 'id': [3], 'token': 'just', 'label': '', 'role': [], 'link': '', 'NER_person': False}
4)   {'pos': [22, 30], 'id': [4], 'token': 'entered', 'label': '', 'role': [], 'link': '', 'NER_person': False}
5)   {'pos': [30, 34], 'id': [5], 'token': 'the', 'label': '', 'role': [], 'link': '', 'NER_person': False}
6)   {'pos': [34, 42], 'id': [6], 'token': 'dwarven', 'label': '', 'role': [], 'link': '', 'NER_person': False}