In [117]:
import os
# Set before `import tensorflow` below so the variable is already in the
# environment when TensorFlow loads (presumably to silence its native
# C++ log output — level '3' is the most restrictive setting).
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import numpy as np
import pandas as pd
import tensorflow as tf
import ast
from sklearn.model_selection import train_test_split
# Fixed seed so that randomised steps are repeatable between runs.
tf.keras.utils.set_random_seed(33)


In [118]:
# Load the NER dataset (one row per sentence). As the preview shows, the
# Word / POS / Tag columns each hold the *string representation* of a list,
# so they have to be parsed before use.
data = pd.read_csv("data/NER_Dataset.csv", encoding = "ISO-8859-1") 
print('ORIGINAL DATA:\n', data.head())



ORIGINAL DATA:
        Sentence_ID                                               Word  \
0      Sentence: 1  ['Thousands', 'of', 'demonstrators', 'have', '...   
1     Sentence: 10  ['Iranian', 'officials', 'say', 'they', 'expec...   
2    Sentence: 100  ['Helicopter', 'gunships', 'Saturday', 'pounde...   
3   Sentence: 1000  ['They', 'left', 'after', 'a', 'tense', 'hour-...   
4  Sentence: 10000  ['U.N.', 'relief', 'coordinator', 'Jan', 'Egel...   

                                                 POS  \
0  ['NNS', 'IN', 'NNS', 'VBP', 'VBN', 'IN', 'NNP'...   
1  ['JJ', 'NNS', 'VBP', 'PRP', 'VBP', 'TO', 'VB',...   
2  ['NN', 'NNS', 'NNP', 'VBD', 'JJ', 'NNS', 'IN',...   
3  ['PRP', 'VBD', 'IN', 'DT', 'NN', 'JJ', 'NN', '...   
4  ['NNP', 'NN', 'NN', 'NNP', 'NNP', 'VBD', 'NNP'...   

                                                 Tag  
0  ['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', '...  
1  ['B-gpe', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '...  
2  ['O', 'O', 'B-tim', 'O', 'O', 'O', 'O', 

In [119]:
# Build two parallel lists from the raw frame:
#   sentences[i] -> the i-th sentence as one space-joined string
#   tags[i]      -> the list of NER tags for that sentence (one per word)
sentences = []
tags = []
# Iterate over the two needed columns directly instead of DataFrame.iterrows():
# iterrows() materialises a Series for every row, which is much slower and
# only yielded an index that was never used.
for word_repr, tag_repr in zip(data['Word'], data['Tag']):
    # Each cell is the string repr of a Python list; literal_eval parses
    # literals only, so no arbitrary code from the CSV is executed.
    words = ast.literal_eval(word_repr)
    ner_tags = ast.literal_eval(tag_repr)

    # Joining with single spaces keeps one whitespace-separated token per word.
    sentence = ' '.join(words)

    sentences.append(sentence)
    tags.append(ner_tags)

In [120]:
# Sanity check: show the first parsed sentence with its tags, and confirm
# that `sentences` and `tags` contain the same number of entries.
print(sentences[0])
print(tags[0])
print(len(sentences))
print(len(tags))

Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .
['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O']
47959
47959


In [121]:
# Hold out 20% of the sentences (with their tag lists) for testing; the fixed
# random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(sentences, tags, test_size=0.2, random_state=42)

In [122]:
def get_sentence_vectorizer(sentences):
    """Create a TextVectorization layer fitted on ``sentences``.

    The layer is built with ``standardize=None`` so the text is passed
    through without the layer's default standardization step.

    Args:
        sentences: the text samples the layer is adapted on.

    Returns:
        tuple: ``(vectorizer, vocab)`` where ``vectorizer`` is the adapted
        layer and ``vocab`` is the result of its ``get_vocabulary()``.
    """
    vectorizer = tf.keras.layers.TextVectorization(standardize=None)
    vectorizer.adapt(sentences)
    return vectorizer, vectorizer.get_vocabulary()

    



In [123]:
# Smoke-test the vectorizer on the first 1000 training sentences.
test_vectorizer, test_vocab = get_sentence_vectorizer(X_train[:1000])
# Report only the vocabulary size; printing the vocabulary itself would flood
# the cell output with thousands of tokens.
print(f"Test vocab size: {len(test_vocab)}")

sentence = "I like learning new NLP models !"
sentence_vectorized = test_vectorizer(sentence)
print(f"Sentence: {sentence}\nSentence vectorized: {sentence_vectorized}")

# Check if words in the test sentence exist in the vocabulary.
# In the recorded output, every word reported as missing was encoded as id 1.
for word in sentence.split():
    status = "is" if word in test_vocab else "is NOT"
    print(f"'{word}' {status} in the vocabulary.")

Test vocab size: 5078
Sentence: I like learning new NLP models !
Sentence vectorized: [1205 1023    1   71    1    1 5077]
'I' is in the vocabulary.
'like' is in the vocabulary.
'learning' is NOT in the vocabulary.
'new' is in the vocabulary.
'NLP' is NOT in the vocabulary.
'models' is NOT in the vocabulary.
'!' is in the vocabulary.


In [128]:
# Peek at the label sequences of the first three training sentences.
print(Y_train[0:3])

[['B-geo', 'I-geo', 'O', 'O', 'B-tim', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-tim', 'I-tim', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']]


In [129]:
def get_tags(tags):
    """Collect the distinct tag labels used across all sentences.

    Args:
        tags: iterable of per-sentence tag sequences; each inner element is
            a hashable label.

    Returns:
        list: every distinct label exactly once, in ascending sorted order.
    """
    unique_labels = {label for sentence_tags in tags for label in sentence_tags}
    return sorted(unique_labels)


In [130]:
# Collect the sorted set of distinct labels that occur in the training split.
# NOTE(review): this rebinds `tags`, which until here held the per-sentence
# tag lists fed to train_test_split. Re-running the split cell after this one
# would pass the short label list instead of the per-sentence lists. Consider
# a distinct name (e.g. `tag_labels`), updating later uses of `tags` to match.
tags = get_tags(Y_train)
print(tags)

['B-art', 'B-eve', 'B-geo', 'B-gpe', 'B-nat', 'B-org', 'B-per', 'B-tim', 'I-art', 'I-eve', 'I-geo', 'I-gpe', 'I-nat', 'I-org', 'I-per', 'I-tim', 'O']


In [131]:
def make_tag_map(tags):
    """Assign each tag label the integer index of its position in ``tags``.

    Args:
        tags: iterable of hashable labels, in the desired index order.

    Returns:
        dict: ``{label: index}``; if a label repeats, its last position wins.
    """
    return {label: position for position, label in enumerate(tags)}

In [132]:
# Build the label -> integer id lookup from the sorted label list and show it.
tag_map = make_tag_map(tags)
print(tag_map)

{'B-art': 0, 'B-eve': 1, 'B-geo': 2, 'B-gpe': 3, 'B-nat': 4, 'B-org': 5, 'B-per': 6, 'B-tim': 7, 'I-art': 8, 'I-eve': 9, 'I-geo': 10, 'I-gpe': 11, 'I-nat': 12, 'I-org': 13, 'I-per': 14, 'I-tim': 15, 'O': 16}
