### Imports



In [1]:
import json
from transformers import BertTokenizer

  from .autonotebook import tqdm as notebook_tqdm


### load the dataset


In [4]:
with open('./data/classNERData.json', 'r', encoding='utf-8') as file:
    data = json.load(file)


In [9]:
##create BIO labels - for each sentence
def create_bio_labels(data):
    bio_labels = []
    for sentence in data:
        bio = 'O' * len(sentence['request'])
        for entity in sentence['entities']:
            start = entity['start']
            end = entity['end']
            label_type = entity['category']
            tmp = sentence['request'][start:end]
            for i, word in enumerate(tmp.split()):
                if i == 0:
                    bio = bio[:start] + 'B-' + label_type + bio[start + 2:]
                else:
                    bio = bio[:start] + 'I-' + label_type + bio[start + 2:]
                start += len(word) + 1
        bio_labels.append(bio)
    return bio_labels

data = [
    {
      "request": "I want to fly to New York on the 13.3",
      "entities": [
        {"start": 16, "end": 23, "text": "New York", "category": "DESTINATION"},
        {"start": 32, "end": 35, "text": "13.3", "category": "DATE"}
      ]
    } 
]

print(data['request'])
# bio_labels = create_bio_labels(data)
# print(bio_labels)

['OOOOOOOOOOOOOOOOOOB-I-ORII-ORIGB-DI-DESI-DESTB-DATEI-DATEIONNATIONTINATIONNIGINOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO']


### Tokenize the sentences


In [6]:
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

tokenized_texts = []
labels = []


for example in data:
    tokenized_text = tokenizer.tokenize(example['request'])
    tokenized_texts.append(tokenized_text)
    labels.append(example['entities'])

In [40]:
data = [
    {
      "request": "I want to fly to New York on the 13.3",
      "entities": [
        {"start": 17, "end": 25, "text": "New York", "category": "DESTINATION"},
        {"start": 33, "end": 37, "text": "13.3", "category": "DATE"}
      ]
    } 
]

print(data[0]['request'][33])
bio = ["O"] * len(data[0]['request'].split())
print(bio)

1
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [41]:

data = [
    {
      "request": "I want to fly to New York on the 13.3",
      "entities": [
        {"start": 17, "end": 25, "text": "New York", "category": "DESTINATION"},
        {"start": 33, "end": 37, "text": "13.3", "category": "DATE"}
      ]
    } 
]

# Begeining Inside Outside
# BIO label - O O O O O B-DESTINATION I-DESTINATION O O B-DATE

# a function that replace each word in a sentence with its corresponding indexes in the sentecnce
def replace_words_with_indexes(sentence):
    indexes = ""
    counter=0
    for word in sentence.split(" "):
        indexes += str(counter) + "-" + str(counter + len(word)) + " "
        counter += len(word) + 1
    return indexes.rstrip()

def create_BIO(data):
    # bio = 'O' * len(data['request'].split()) #init with O for num of words
    bio_labels = []
    for item in data:
        bio = ["O"] * len(item['request'].split())
        indexes = replace_words_with_indexes(item['request'])
        for entity in item['entities']:
            start = entity['start']
            end = entity['end']
            label_type = entity['category']
            tmp = item['request'][start:end]
            for i, index in enumerate(indexes.split(" ")):
                if int(index.split("-")[0]) >= start and int(index.split("-")[1]) <= end:
                    if int(index.split("-")[0]) == start:
                        bio[i] = 'B-' + label_type 
                    else:
                        bio[i] = 'I-' + label_type
        bio_labels.append(' '.join(bio))
    return bio_labels

        


string = "I want to fly to New York on the 13.3"
print(replace_words_with_indexes("I want to fly to New York on the 13.3"))
indxes = replace_words_with_indexes("I want to fly to New York on the 13.3").rstrip()
for item in indxes.split(" "):
    print(string[int(item.split("-")[0]):int(item.split("-")[1])])
print (create_BIO(data))

0-1 2-6 7-9 10-13 14-16 17-20 21-25 26-28 29-32 33-37
I
want
to
fly
to
New
York
on
the
13.3
['O O O O O B-DESTINATION I-DESTINATION O O B-DATE']
