# Imports

In [1]:
# - imports parameters 
PIPELINES_TYPE = ["en_core_web_sm","en_core_web_md",\
                 "en_core_web_lg","en_core_web_trf"]
SIZE_IDX = 3 # use transformer models
EL_RESOURCE = "wikidata" # what kind of resource use for Entity linking

# - general imports 
from google.colab       import files
from google.colab       import drive
import os
import sys
import numpy                            as np
import re
import random
import copy
import time
import pandas                           as pd
from zipfile            import ZipFile
import requests
import json
import nltk
from nltk.corpus        import stopwords
nltk.download('stopwords')

# fast implementation of edit distance
!pip install editdistance==0.3.1
import editdistance

# - spacy imports for NER
from spacy.cli.download import download as spacy_download
spacy_download(PIPELINES_TYPE[SIZE_IDX])
if SIZE_IDX==3:
  !pip install spacy-transformers
  import spacy_transformers
!pip install -U spacy[cuda112]
import spacy
spacy.prefer_gpu()

# - Entity linking imports 
if EL_RESOURCE == "wikidata":
  spacy_download(PIPELINES_TYPE[1])
  !pip install spacy-entity-linker
  !python -m spacy_entity_linker "download_knowledge_base"
else:
  import urllib.request as urllib2
  import urllib
  import json
  import gzip
  import io 


# IMPORTANT: the runtime must be restarted once after running this cell so that all the installed packages are loaded correctly

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_trf')
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
2022-11-20 17:17:44.965363: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


# Load the dataset

In [2]:
# Where the dataset comes from: "drive" mounts Google Drive, any other value
# uploads FairySum.zip through the Colab file dialog.
load_mod = "local"
# Folders with the texts, relative to the dataset root entered by the %cd below.
fairy_tale_path = "./FAIRY_TALE/texts"
short_story_path = "./SHORT_STORY/texts"

if load_mod == "drive":
  drive.mount("/content/drive", force_remount=True)
  # move into the dataset folder stored on Drive and list its content
  %cd '/content/drive/MyDrive/Colab Notebooks/Narrative understanding and storytelling/homework_1a/FairySum'
  %ls
else: # load locally: upload the archive, extract it and move into the extracted folder
  from google.colab import files
  files.upload()

  file_name = "./FairySum.zip"
  with ZipFile(file_name, 'r') as zip_file:
    zip_file.extractall()
  %cd FairySum

# Paths of every text, sorted by file name inside each collection.
fairy_tale_textPaths  = [fairy_tale_path + "/" + text for text in sorted(os.listdir(fairy_tale_path))]
short_Story_textsPaths = [short_story_path + "/" + text for text in sorted(os.listdir(short_story_path))]

# merge the paths of fairy tales and short stories (fairy tales first)
full_list_paths = [*fairy_tale_textPaths,*short_Story_textsPaths]

for x in full_list_paths: print("-Loaded: {}".format(x))

Saving FairySum.zip to FairySum.zip
/content/FairySum
-Loaded: ./FAIRY_TALE/texts/bn_00173084n_The Old Dame and her Hen.txt
-Loaded: ./FAIRY_TALE/texts/bn_00187346n_The Dove.txt
-Loaded: ./FAIRY_TALE/texts/bn_00187442n_Corvetto.txt
-Loaded: ./FAIRY_TALE/texts/bn_00608721n_Sun, Moon, and Talia.txt
-Loaded: ./FAIRY_TALE/texts/bn_00790303n_Childe Rowland.txt
-Loaded: ./FAIRY_TALE/texts/bn_01593735n_Town Musicians of Bremen.txt
-Loaded: ./FAIRY_TALE/texts/bn_01899260n_The Raven.txt
-Loaded: ./FAIRY_TALE/texts/bn_02154461n_Herr Korbes.txt
-Loaded: ./FAIRY_TALE/texts/bn_02173122n_Thumbelina.txt
-Loaded: ./FAIRY_TALE/texts/bn_02238889n_The Nightingale.txt
-Loaded: ./FAIRY_TALE/texts/bn_02277659n_The Ugly Duckling.txt
-Loaded: ./FAIRY_TALE/texts/bn_02442758n_Jack the Giant Killer.txt
-Loaded: ./FAIRY_TALE/texts/bn_03300215n_The Wonderful Birch.txt
-Loaded: ./FAIRY_TALE/texts/bn_03301753n_Cap-o_-Rushes.txt
-Loaded: ./FAIRY_TALE/texts/bn_03326399n_The Story of Pretty Goldilocks.txt
-Loaded: ./FA

## Manage data 

In [3]:
"function to extract the raw text from a .txt file"
def text_from_book(path):
  """Read a text file and return its lines.

  Args:
    path: location of the file to read.

  Returns:
    list of str, one item per line of the file, in order; each line keeps
    its own line terminator.
  """
  with open(path,'r') as book:
    # the context manager closes the handle as soon as the block is left
    return book.readlines()

"function to retrieve the span from the list of sentences of a document, start and end char position"
# note: start and end position parameters are in reference to char position in the whole document
# use it to find the correct sentence, and then remove this off-set to have the local index for
# characters in the single sentence.
def _locate_sentence(doc_sentences, start_pos, end_pos, verbose = False):
  """Find the sentence that fully contains the global span [start_pos, end_pos).

  The sentences are treated as consecutive chunks of one document: sentence i
  covers the global character range that starts where sentence i-1 ended.

  Args:
    doc_sentences: list of strings, in document order.
    start_pos: global index of the first character of the span.
    end_pos: global index one past the last character of the span.
    verbose: when True, print "| index | start | end |" for every sentence visited.

  Returns:
    (sentence index, global index of the sentence's first character) for the
    first sentence that contains the span, or None when no sentence does
    (including an empty list of sentences).
  """
  idx_start = 0                                 # first global char index of the current sentence
  for idx_seek, sentence in enumerate(doc_sentences):
    idx_end = idx_start + len(sentence)         # one past the last global char index of the sentence
    if verbose: print("| {} | {} | {} |".format(idx_seek,idx_start,idx_end))
    if start_pos >= idx_start and end_pos <= idx_end:
      return idx_seek, idx_start
    idx_start = idx_end
  print(f"no sentence contains the span [{start_pos}, {end_pos})")
  return None

def get_span(doc_sentences, start_pos, end_pos, verbose = False):
  """Return the text of a span given its global start and end char positions.

  Args:
    doc_sentences: list of strings, in document order.
    start_pos: global index of the first character of the span.
    end_pos: global index one past the last character of the span.
    verbose: when True, print the search steps and the span found.

  Returns:
    The substring of the containing sentence, or None when no sentence
    contains the whole span.
  """
  located = _locate_sentence(doc_sentences, start_pos, end_pos, verbose)
  if located is None:
    return None
  idx_seek, idx_start = located

  # remove the global offset: local indices are relative to the sentence START.
  # (Subtracting the sentence end gives negative indices, which breaks when the
  # span finishes exactly on the last character of the sentence.)
  idx_1 = start_pos - idx_start
  idx_2 = end_pos - idx_start

  span = doc_sentences[idx_seek][idx_1:idx_2]
  if verbose: print(f"Found {span}")
  return span

def get_sentence(doc_sentences, start_pos, end_pos, verbose = False):
  """Return the sentence that contains the span with the given global char positions.

  Args:
    doc_sentences: list of strings, in document order.
    start_pos: global index of the first character of the span.
    end_pos: global index one past the last character of the span.
    verbose: when True, print the search steps.

  Returns:
    The containing sentence, or None when no sentence contains the whole span.
  """
  located = _locate_sentence(doc_sentences, start_pos, end_pos, verbose)
  if located is None:
    return None
  return doc_sentences[located[0]]
  
"""
function to extract just the labels of interest in NER
two modes: 
- LOC: just take the entities with the LOC label
- LOC+: also include FAC and GPE entities as locations, besides LOC.
"""
def get_labels(doc, mode="LOC+"):
  """Keep only the person / location / organization entities of a document.

  Entity labels are rewritten in place: "PERSON" becomes "PER" and, in "LOC+"
  mode, "FAC" and "GPE" become "LOC".

  Args:
    doc: object exposing the annotated spans as `doc.ents`; every span must
      have a readable and writable `label_` attribute.
    mode: "LOC+" also accepts facilities (FAC) and geopolitical entities (GPE)
      as locations; any other value keeps only PER, LOC and ORG.

  Returns:
    list of the selected spans, in the order of `doc.ents`.
  """
  core_labels = ["PER","LOC","ORG"]
  extra_location_labels = ["FAC","GPE"]
  widen_locations = (mode == "LOC+")

  selected = []
  for entity in doc.ents:
    # just rename the label
    if entity.label_ == "PERSON":
      entity.label_ = "PER"

    if entity.label_ in core_labels:
      selected.append(entity)
    elif widen_locations and entity.label_ in extra_location_labels:
      entity.label_ = "LOC"
      selected.append(entity)
  return selected

""" priting table not on the line """
def print_table(table):
  """Print a dictionary one entry per line, as "<n>) K: <key> V: <value>"."""
  row_template = "{:<4} K: {:<30} V: {:<30}"
  for position, entry in enumerate(table.items()):
    key, value = entry
    print(row_template.format(f"{position})", str(key), str(value)))

""" priting list not on the line """
def print_list(list):
  """Print a sequence one element per line, as "<n>) <element>"."""
  row_template = "{:<4} {:<30}"
  position = 0
  for element in list:
    print(row_template.format(f"{position})", str(element)))
    position += 1

""" function to filter and preprocessing strings for the comparison in the external identification """

def reduce_span(span):
  """Normalize a span for the string comparison used in the external identification.

  The span is lower-cased, stripped of leading/trailing white space, split on
  white space and cleaned of English stop words.

  Args:
    span: the text to normalize.

  Returns:
    The remaining words joined by single spaces (empty string when nothing is left).
  """
  # load the stop words once per call (not once per word) and use a set for fast lookups
  english_stopwords = set(stopwords.words('english'))
  words = span.lower().strip().split()
  return " ".join(word for word in words if word not in english_stopwords)

# Functions for optional tasks

## Unique name identifying
## (optional 1) 

In [4]:
"""
sorting functions
"""
def first_criterion(item):
  """Sort key for the "first" mode: the first index stored for the entry.

  Args:
    item: (key, value) pair where value[0] is the list of indices of the entry.

  Returns:
    The first element of the list of indices (first appearance).
  """
  return item[1][0][0]

def longer_criterion(item):
  """Sort key for the "longer" mode: longer keys come first.

  Args:
    item: (key, value) pair; only the key is used.

  Returns:
    The negated length of the key, so that an ascending sort puts the
    longest key first.
  """
  return -len(item[0])

def mf_criterion(item):
  """Sort key for the "mf" (most frequent) mode: entries with more indices come first.

  Args:
    item: (key, value) pair where value[0] is the list of indices of the entry.

  Returns:
    The negated number of indices, so that an ascending sort puts the most
    frequent entry first.
  """
  return -len(item[1][0])

# map: mode name ("first", "longer", "mf") -> function used as sorting key
# for the (key, value) items of the entity table
sorting_criterions = {"first":first_criterion ,"longer":longer_criterion, "mf":mf_criterion}


"function to check if a Nemed entity is contained in more than another Named Entity in y_doc"
def uniqueContainment(list_keys, key_a, debug = False):
  """Check that `key_a` is contained in at most one of the other keys.

  One occurrence of `key_a` itself is ignored; every remaining key that
  contains `key_a` is counted. `list_keys` is not modified. When `key_a` is
  not in `list_keys`, every key is treated as "other".

  Args:
    list_keys: list of keys to compare against.
    key_a: the key to look for inside the other keys.
    debug: when True, print every containment found.

  Returns:
    True when `key_a` is contained in zero or one of the other keys,
    False otherwise.
  """
  count = 0
  skipped_self = False
  for key_b in list_keys:
    # ignore exactly one occurrence of the key under test
    if not skipped_self and key_b == key_a:
      skipped_self = True
      continue
    if key_a in key_b:
      count += 1
      if debug: print(f"{key_a} <-> {key_b} | count: {count}")
  return count <= 1


"""
merge dictionary entries for the same Named Entity
"""
def merge_items(table, debug = False):
  """Run one merging pass over the entity table, editing it in place.

  The entries are scanned in order. For the current entry (key k) every other
  key is checked: the two entries are merged when one key contains the other
  and the contained key passes uniqueContainment (it is contained in at most
  one other key). Merging extends the index list of the current entry with
  the indices of the other entry, whose key is then deleted. The pass stops at
  the first entry that merged something, because the table has changed and the
  scan must be restarted by the caller.

  Args:
    table: dict key -> [list of indices, original name]; modified in place.
    debug: when True, print the steps of the pass.

  Returns:
    True when at least one merge happened (the caller should call again),
    False when the whole table was scanned without edits.
  """
  if debug: print("------------------start---------------------\n")
  edited = False # stays False iff no edits happen while scanning the whole set of entries
  to_delete = [] # keys of the entries absorbed by the current entry

  # The table is only modified right before leaving the loop, so one snapshot
  # of keys and values is valid for the whole scan.
  keys = list(table.keys())
  values = list(table.values())

  for i,(k,v) in enumerate(table.items()):
    if debug: print(f"1st entry: {i}")
    j_elems = [j for j in range(len(keys)) if j != i]
    if debug: print(f"2nd entries: {j_elems}")

    for j in j_elems:
      other = keys[j]

      # here you can change the policy for the merging of entries
      # in this case is a simple contains condition
      # (uniqueContainment receives a copy of the keys: it may edit its argument)
      if (other in k and uniqueContainment(list(keys), other)) \
          or (k in other and uniqueContainment(list(keys), k)):

        if debug and other in k: print(f"{k} <- {other}")
        if debug and k in other: print(f"{k} -> {other}")
        edited = True
        v[0].extend(values[j][0])
        to_delete.append(other)

    if debug: print(f"entries to delete: {to_delete}")

    for key in to_delete:
      del table[key]
    # exit if we have a merge: the table changed, the check must be restarted
    if edited: break
  if debug: print("------------------end---------------------\n")

  return edited


"""
function to select unique name for same named entities
modes:
first : take as unique name the first that is encountered in the results
longer: take as unique name the longer to express the same Named Entity
mf    : take as unique name the one which is most frequent to represent the same Named Entity
"""
def set_uniqueNames(y_doc, sents_doc, mode="longer", debug= False):
  """Assign one unique name to the rows that refer to the same Named Entity.

  Rows are grouped by their span text (stripped and lower-cased), the groups
  are sorted with the criterion selected by `mode` and then merged with
  merge_items until no more merges happen. Rows whose span cannot be
  retrieved (get_span returns None) are left without a name.

  Args:
    y_doc: list of rows; row[1] and row[2] are the global start and end char
      positions of the entity.
    sents_doc: list of the sentences of the document, in order.
    mode: key of sorting_criterions ("first", "longer" or "mf").
    debug: when True, print the spans and the final rows.

  Returns:
    (map_ent_idx, y_doc_unique): the table key -> [row indices, original span]
    and a deep copy of y_doc where every grouped row of length 4 received the
    unique name as 5th element.

  Raises:
    KeyError: when `mode` is not a key of sorting_criterions.
  """
  # work on a deep copy of the NER results for the document
  y_doc_unique = copy.deepcopy(y_doc)

  # map: Named Entity used for the check -> [indices in y_doc, original Named Entity]
  map_ent_idx = {}
  for idx, y_row in enumerate(y_doc):
    span_original = get_span(sents_doc, y_row[1], y_row[2])
    if span_original is None:
      # the offsets do not fall inside any sentence: nothing to group
      if debug: print(f"no span found for row {idx}: {y_row}")
      continue

    span_check = span_original.strip().lower()  # to exclude too silly cases
    if debug: print(f"{span_original}, {span_check}")
    if span_check in map_ent_idx:
      map_ent_idx[span_check][0].append(idx)
    else:
      map_ent_idx[span_check] = [[idx],span_original]

  # sort the dictionary
  criterion = sorting_criterions[mode]
  map_ent_idx = dict(sorted(map_ent_idx.items(), key = criterion))

  # merge entries if substring: repeat the pass until it reports no edits
  # (a table with fewer than two entries has nothing to merge)
  while len(map_ent_idx) > 1 and merge_items(map_ent_idx):
    pass

  # add the unique name in the list of document's output
  for k,v in map_ent_idx.items():
    for idx in v[0]:
      if len(y_doc_unique[idx]) == 4:
        y_doc_unique[idx].insert(4,v[1]) # unique name is the 5th element in the row

  if debug: print_list(y_doc_unique)

  return  map_ent_idx,y_doc_unique

## External identification
## (optional 2)

### Babelnet


In [5]:
""" 
1st function to retrieve babalnet IDs
"""
def query_serverUSEA(text, debug = False, timeout = 30):
  """Query the USEA web service and return the BabelNet IDs found in `text`.

  Args:
    text: the text to annotate.
    debug: when True, print the elapsed time, the raw response and every
      (word, id) pair extracted.
    timeout: seconds to wait for the server before requests raises a timeout
      error (without it a stalled server blocks the notebook forever).

  Returns:
    list of (word, babelnet id) tuples; empty when the response does not
    contain the expected annotations (e.g. the server answered with an error).
  """
  if debug: startTime = time.time()

  def extract_synsetsIDs(response, text):
    """Collect the (word, babelnet id) pairs from the "wsd" annotations of the response."""
    words_ids = [] # elements -> (word,id)
    words = text.split()

    try:
      list_wsd = response["response"]["texts"][0]["annotations"]["wsd"]
    except (KeyError, IndexError, TypeError):
      # sometimes the server gives an error as response: nothing to extract
      if debug: print("no wsd annotations in the response")
      return words_ids

    for elem in list_wsd:
      # NOTE(review): assumes elem["start"] is an index into the white-space
      # tokens of the text - confirm against the USEA API
      word = words[elem["start"]]
      bn_id = elem["features"]["synset"]
      if debug: print(f"word: {word} | babelnet id:{bn_id}")
      words_ids.append((word,bn_id))

    return words_ids

  response = requests.post(
      "https://nlp.uniroma1.it/usea/api",
      json={"type": "text", "content": text},
      timeout=timeout,
  ).json()

  if debug:
    endTime = time.time()
    print("time elapsed for Entity Linking {}".format(endTime-startTime))
    print(json.dumps(response, indent=2))

  # get babelnet IDs
  return extract_synsetsIDs(response, text)


""" 
2nd function to retrieve babalnet IDs
"""
def query_babelfy(text, debug = False):
  """Disambiguate `text` with the Babelfy HTTP API.

  The API key is read from the BABELFY_KEY environment variable; when the
  variable is not set the key stored in the notebook is used.

  Args:
    text: the text to disambiguate.
    debug: when True, print the raw response and every annotation.

  Returns:
    list of (span, babelnet id) tuples, one per annotation returned.

  Raises:
    RuntimeError: when the service answers with something that is not a list
      of annotations (for example an error message).
    urllib.error.URLError: when the request itself fails.
  """
  # imported here because the module-level imports of these names are only
  # executed when EL_RESOURCE is not "wikidata"
  import gzip
  import io
  import json
  import os
  import urllib.parse
  import urllib.request

  span2id = [] # list of tuples (span,babelnet_id)

  service_url = 'https://babelfy.io/v1/disambiguate'

  lang = 'EN'

  # NOTE(review): prefer a personal key in the BABELFY_KEY environment variable
  # (one with no query limit for a complete execution); the literal is a fallback
  key  = os.environ.get('BABELFY_KEY', 'd8bea0c4-527f-41e4-9fb1-50a17dacbfe6')

  params = {
    'text' : text,
    'lang' : lang,
    'key'  : key
  }

  url = service_url + '?' + urllib.parse.urlencode(params)
  request = urllib.request.Request(url)
  request.add_header('Accept-encoding', 'gzip')

  # the context manager closes the connection even when reading fails
  with urllib.request.urlopen(request) as response:
    payload = response.read()
    is_gzip = response.info().get('Content-Encoding') == 'gzip'

  # the body is parsed whether or not the server compressed it
  if is_gzip:
    payload = gzip.GzipFile(fileobj=io.BytesIO(payload)).read()
  data = json.loads(payload)

  if debug: print(data)
  if not isinstance(data, list):
    raise RuntimeError("unexpected Babelfy response: {}".format(data))

  # retrieving data
  for result in data:
    # retrieving token fragment
    tokenFragment = result.get('tokenFragment')
    tfStart = tokenFragment.get('start')
    tfEnd = tokenFragment.get('end')
    if debug:print(str(tfStart) + "\t" + str(tfEnd))

    # retrieving char fragment
    charFragment = result.get('charFragment')
    cfStart = charFragment.get('start')
    cfEnd = charFragment.get('end')
    if debug: print(str(cfStart) + "\t" + str(cfEnd))

    # retrieving BabelSynset ID
    synsetId = result.get('babelSynsetID')

    # the span is cut with the char offsets (end inclusive): the token offsets
    # refer to Babelfy's own tokenization, which need not match text.split()
    span = text[cfStart:cfEnd+1]

    if debug:
      print(synsetId)
      print("inserting the following tuple in the results: ({},{})".format \
            (span,synsetId))

    span2id.append((span,synsetId))

  return span2id

"""
get babelnet IDs, two different applications can be chosen for this:
- Babelefy
- Usea
(use Babelfy)
crucial difference from the EL with wikidata: 
limited number of request (free account with 1000 coins), so once found a match for a certain span
use the same IDs for all the repetition of that span.
this obviously reduce the accuracy, but makes the execution feasible.
"""
def get_babelIDs(sentences, y_doc, map_ent_idx, ed_match = 2,  mode ="Babelefy", verbose = False):
  """Add a BabelNet ID to the rows of `y_doc`, one lookup per unique Named Entity.

  For every entry of `map_ent_idx` the sentences of its occurrences are sent to
  the linking service until one returned span matches the entity name (edit
  distance between the two reduced spans not greater than `ed_match`). The ID
  found is then inserted at position 5 of all the rows of that entity. The
  answer for each sentence is remembered, so a sentence is queried only once.

  Args:
    sentences: list of the sentences of the document, in order.
    y_doc: list of rows, modified in place; row[1] and row[2] are the global
      start and end char positions of the entity.
    map_ent_idx: dict key -> [row indices, original name].
    ed_match: maximum edit distance accepted for a match.
    mode: "Babelefy" uses query_babelfy, any other value uses query_serverUSEA.
    verbose: when True, print the query counter and every comparison/insertion.

  Returns:
    None. The processing stops early, leaving the remaining entities without
    ID, as soon as a Babelfy query raises an exception.
  """
  counter = 0             # number of Babelfy queries sent so far
  links_by_sentence = {}  # sentence -> [(span, babelnet id)] already retrieved

  # retrieve from the map the rows of y_doc with the same Named Entity
  for k,v in map_ent_idx.items():

    indices_y = v[0]
    s1 = reduce_span(v[1])  # the entity name is the same for all its occurrences
    is_match = False        # boolean to handle less queries
    tmp_id = ""
    tmp_span = ""
    for idx_y in indices_y:
      y_row = y_doc[idx_y]
      sentence = get_sentence(sentences,y_row[1],y_row[2])
      if sentence is None:
        # no sentence contains this occurrence: try the next one
        continue

      # get the IDs of the knowledge base (only once per sentence)
      if sentence not in links_by_sentence:
        if mode == "Babelefy":
          try:
            links_by_sentence[sentence] = query_babelfy(sentence)
          except Exception as e:
            print("Babelfy query failed (limit exceeded?), counter queries: {} | error: {}".format(counter, e))
            return
          if verbose: print(counter)
          counter += 1
        else:
          links_by_sentence[sentence] = query_serverUSEA(sentence)

      # check if there's a linked entity that matches the current Named Entity
      for (new_span,new_babel_id) in links_by_sentence[sentence]:
        if verbose: print(f"{v[1]} | {new_span} | {new_babel_id}")
        # filtering span for matching
        s2 = reduce_span(new_span)

        # take as match the first span with an edit distance not greater than ed_match
        if editdistance.eval(s1,s2) <=  ed_match:
          is_match = True
          tmp_span = new_span
          tmp_id = str(new_babel_id)
          break

      if is_match: break

    if (tmp_id!= "") and (tmp_span != ""):
      for idx_y in indices_y:
        y_row = y_doc[idx_y]
        if verbose: print(f"inserting: {v[1]} | {tmp_span} | {tmp_id}")
        y_row.insert(5,tmp_id) # add the last element in the row

  return

### Wikidata

In [6]:
"""
function that retrive wikidata IDs using a spacy pipeline with entity-linker
ed_match: parameter used to validate a new span equal to the ones found through NER 
"""
def get_wikidataID(el_model, sentences, y_doc, map_ent_idx, ed_match = 2, verbose = False):
  """Add a Wikidata ID to the rows of `y_doc` using a pipeline with an entity linker.

  For every occurrence of every entity, the containing sentence is processed
  with `el_model`; the first linked entity whose reduced span is within
  `ed_match` edits of the reduced entity name gives the ID, which is inserted
  at position 5 of the row. The linked entities of each sentence are
  remembered, so a sentence is processed only once.

  Args:
    el_model: callable that turns a sentence into a document exposing
      `doc._.linkedEntities`; each linked entity must offer get_span() and get_id().
    sentences: list of the sentences of the document, in order.
    y_doc: list of rows, modified in place; row[1] and row[2] are the global
      start and end char positions of the entity.
    map_ent_idx: dict key -> [row indices, original name].
    ed_match: maximum edit distance accepted to validate a linked span as
      equal to the one found through NER.
    verbose: when True, print every comparison and insertion.

  Returns:
    None.
  """
  links_by_sentence = {}  # sentence -> [(span, wikidata id)] already computed

  # retrieve from the map the rows of y_doc with the same Named Entity
  for k,v in map_ent_idx.items():
    indices_y = v[0]
    s1 = reduce_span(v[1])  # the entity name is the same for all its occurrences

    for idx_y in indices_y:

      y_row = y_doc[idx_y]
      sentence = get_sentence(sentences,y_row[1],y_row[2])
      if sentence is None:
        # no sentence contains this occurrence: nothing to link
        continue

      if sentence not in links_by_sentence:
        # sometimes white spaces at the end of the sentence cause an error:
        # in that case retry without them
        try:
          doc = el_model(sentence)
        except Exception as e:
          doc = el_model(sentence.rstrip())
        links_by_sentence[sentence] = [
            (str(linked_entity.get_span()), "Q"+str(linked_entity.get_id()))
            for linked_entity in doc._.linkedEntities
        ]

      # check if there's a linked entity that matches the current Named Entity
      for (new_span,new_wikidata_id) in links_by_sentence[sentence]:
        if verbose: print(f"{v[1]} | {new_span} | {new_wikidata_id}")
        # filtering span for matching
        s2 = reduce_span(new_span)

        # take as match the first span with an edit distance not greater than ed_match
        if editdistance.eval(s1,s2) <=  ed_match:
          if verbose: print(f"inserting: {v[1]} | {new_span} | {new_wikidata_id}")
          y_row.insert(5,new_wikidata_id) # add the last element in the row
          break

  return

# Named Entity Recognition

## Initialization 

In [7]:
# load the trained spaCy pipeline used for NER (selected by SIZE_IDX)
nlp = spacy.load(PIPELINES_TYPE[SIZE_IDX])

# load the trained spaCy pipeline used for Entity Linking
if EL_RESOURCE == "wikidata":
  # the entity linker cannot be added to the transformer pipeline,
  # so a separate "en_core_web_md" pipeline is loaded for it
  nlp_el = spacy.load("en_core_web_md")
  nlp_el.add_pipe("entityLinker", last=True) # this step needs the runtime restart after the installation

# toggle the verbose execution of the main loop
verbose = False

## Named Entities extraction

In [8]:
# table that collects the results: title of the text -> list of rows
# row: [title, start offset, end offset, label] plus the optional name and resource id
output = {}

# save the initial time at beginning of the task
startTime = time.time()

# main loop: one iteration per text
for idx,path in enumerate(full_list_paths):
  # file names look like "bn_<id>_<title>.txt": keep what follows the id for the log
  story_file = os.path.basename(path)
  print(f"Extracting Named Entity in the text n°{idx+1}: {' '.join(story_file.split('_')[2:])}")

  sentences = text_from_book(path)
  title = story_file.replace(".txt","")

  # pipeline over all the sentences of the text
  docs = list(nlp.pipe(sentences))

  # global position (in the whole text) of the first char of the current sentence
  char_counter = 0

  # initialize the output rows of this text
  y_doc = []

  for i,(doc,sentence) in enumerate(zip(docs,sentences)):

    # show for each sentence: tokens, dependency parsing, part of speech tagging (POS) and the lemmatization
    if verbose:
      print(sentence)
      print("{:<30}".format("-"*30))
      print("{:<10}|{:<10}|{:<10}|{:<10}".format("token","DEP","POS","lemma"))
      print("{:<30}".format("-"*30))
      for token in doc:
        print(f"{token.text:<10}|{token.dep_:<10}|{token.pos_:<10}|{token.lemma_:<10}")
      print("\n")

    # store the results from Named Entity Recognition, with global char offsets
    for ent in get_labels(doc):
      tmp_row = [title, ent.start_char + char_counter, ent.end_char + char_counter,ent.label_]
      if verbose: print(f"inserting the following entry: {str(tmp_row)}")
      y_doc.append(tmp_row)

    char_counter += len(sentence)

  print()

  # --- optional tasks: their results are added to each row ---

  # unique name identifying
  map_ent_idx, y_doc = set_uniqueNames(y_doc, sentences, "longer", False)

  # external identification
  if EL_RESOURCE == "babelnet":
    get_babelIDs(sentences, y_doc, map_ent_idx, mode ="Babelefy")  # babel ID based on the unique name
  else: # retrieve wikidata IDs
    get_wikidataID(nlp_el, sentences, y_doc, map_ent_idx, verbose= False)

  # store results in the dictionary
  output[title] = y_doc

print("Total time elapsed for the task: {} [s]".format((time.time() - startTime)))

Extracting Named Entity in the text n°1: The Old Dame and her Hen.txt





Extracting Named Entity in the text n°2: The Dove.txt

Extracting Named Entity in the text n°3: Corvetto.txt

Extracting Named Entity in the text n°4: Sun, Moon, and Talia.txt

Extracting Named Entity in the text n°5: Childe Rowland.txt

Extracting Named Entity in the text n°6: Town Musicians of Bremen.txt

Extracting Named Entity in the text n°7: The Raven.txt

Extracting Named Entity in the text n°8: Herr Korbes.txt

Extracting Named Entity in the text n°9: Thumbelina.txt

Extracting Named Entity in the text n°10: The Nightingale.txt

Extracting Named Entity in the text n°11: The Ugly Duckling.txt

Extracting Named Entity in the text n°12: Jack the Giant Killer.txt

Extracting Named Entity in the text n°13: The Wonderful Birch.txt

Extracting Named Entity in the text n°14: Cap-o -Rushes.txt

Extracting Named Entity in the text n°15: The Story of Pretty Goldilocks.txt

Extracting Named Entity in the text n°16: Soria Moria Castle.txt

Extracting Named Entity in the text n°17: The Cat 

# Tests

In [9]:
# toggle testing (False skips the test cells when all the cells are executed)
testing = False

## Test NER module

In [10]:
if testing:
  # small set of sentences used to inspect the NER results
  test_1 = "Josh moved in the USA three weeks ago for business reasons. Now he is working in Google."
  test_2 = "Yesterday he visited the White House in Washington D.C"
  test_3 = "The K2 peak at 8,611 metres (28,251 ft) above sea level, is the second-highest mountain on Earth, after Mount Everest"
  test_4 = "The Mont Blanc is the highest mountain in the Alps and Western Europe"
  test_5 = "The lake Lough Neagh is the biggest of the United Kingdom"
  test_6 = "the Brooklyn bridge is an amazing construction in a borough of New York City"
  test_7 = "the magic rod picked up in the castle has been broken by Old Richard."
  full_test = [test_1,test_2,test_3,test_4,test_5,test_6,test_7]

  # what to run: a single string or a list of strings
  test = full_test

  def _report_entities(doc):
    """Print all the entities of `doc`, then only those kept by get_labels."""
    for ent in doc.ents:
      print(ent.text, ent.start_char, ent.end_char, ent.label_)
    print("----------------")
    for ent in get_labels(doc):
      print(ent.text, ent.start_char, ent.end_char, ent.label_)
    print("-------------------------------- n")

  if isinstance(test, str): # single test
    doc = nlp(test)
    print("number of sentences %d" % (len(list(doc.sents))))
    _report_entities(doc)
  else: # the array of tests -> full_test
    docs = list(nlp.pipe(test))
    print("Pipeline type: {}\n".format(PIPELINES_TYPE[SIZE_IDX]))
    for doc in docs:
      _report_entities(doc)

## Test unique names 

In [11]:
if testing:
  def _blank_entries(*names):
    """Build a test table: every name maps to a fresh [indices, original name] entry."""
    return {name: [[], "x"] for name in names}

  # simple dictionaries for testing the merge of the entries
  test0_merge = _blank_entries()
  test1_merge = _blank_entries("AL")
  test2_merge = _blank_entries("AL", "B", "A", "L", "X")
  test3_merge = _blank_entries("B", "A", "AL", "X", "L")
  test4_merge = _blank_entries("ALOE", "A", "AL", "XE", "ALO", "L", "X", "XEN")
  # variant to try: add "ALOE" as last entry
  test5_merge = _blank_entries("AL", "OE", "A", "O", "L", "E")
  test6_merge = _blank_entries("AL", "OE", "X YY", "W YY", "L", "A", "YY", "ALOE")
  test7_merge = _blank_entries("Sir. Franco Grossi", "Franco Grossi", "Ernesto Grossi",
                               "Grossi", "Franco")


  def gen_indc_dict(table):
    """Fill the index list of every entry with 1 to 4 values derived from its position."""
    for i,(k,v) in enumerate(table.items()):
      for j in range(random.randint(1,4)):
        v[0].append(i*4 +j+1)
    return table

  test = gen_indc_dict(test7_merge)


  # --- launch test
  criterion = sorting_criterions["first"]
  test = dict(sorted(test.items(), key = criterion))

  print_table(test)

  # tables with fewer than two entries have nothing to merge: `edited` is set
  # anyway, so the loop below never reads an undefined name
  edited = len(test) > 1 and merge_items(test, debug = True)
  print(test)
  while(edited and len(test) > 1):
    edited =  merge_items(test, debug = True)
    print(test)

# Generate .tsv file

In [12]:
"""
Directives:
- not include header in the tsv file
- fields: "text_id","start_offset","end_offset","label","name","resource_id"
- "name" and "resource_id" labels are optional
"""

# flatten the results of all the texts into a single list of rows
full_list_rows = []
for title,y_doc in output.items():
  print(f"elaborating results from {title}...")
  full_list_rows += y_doc

# build the dataframe: one row per Named Entity
df = pd.DataFrame(full_list_rows)

# save the dataframe as .tsv file, without header and index
result_file = "result.tsv"
df.to_csv(result_file, sep="\t", header=False, index=False, encoding='utf-16')

# download and save locally the result file
files.download(result_file)

elaborating results from bn_00173084n_The Old Dame and her Hen...
elaborating results from bn_00187346n_The Dove...
elaborating results from bn_00187442n_Corvetto...
elaborating results from bn_00608721n_Sun, Moon, and Talia...
elaborating results from bn_00790303n_Childe Rowland...
elaborating results from bn_01593735n_Town Musicians of Bremen...
elaborating results from bn_01899260n_The Raven...
elaborating results from bn_02154461n_Herr Korbes...
elaborating results from bn_02173122n_Thumbelina...
elaborating results from bn_02238889n_The Nightingale...
elaborating results from bn_02277659n_The Ugly Duckling...
elaborating results from bn_02442758n_Jack the Giant Killer...
elaborating results from bn_03300215n_The Wonderful Birch...
elaborating results from bn_03301753n_Cap-o_-Rushes...
elaborating results from bn_03326399n_The Story of Pretty Goldilocks...
elaborating results from bn_03329137n_Soria Moria Castle...
elaborating results from bn_03329494n_The Cat on the Dovrefjell...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>