In [1]:
import numpy as np
import random as rnd
import os
import pandas as pd
import trax
from trax import layers as tl
from utils_copy_1 import get_params, get_vocab

In [2]:
# Loading data
# Raw CSV of the NER dataset (one row per word, see data.head() below)
data = pd.read_csv("./ner_dataset.csv", encoding = "ISO-8859-1")
# Read only the first line of each file; the context managers close the
# file handles instead of leaving them open for the life of the kernel.
with open("./data/small/train/sentences.txt", "r") as sentences_file:
    train_sentences = sentences_file.readline()
with open("./data/small/train/labels.txt", "r") as labels_file:
    train_labels = labels_file.readline()
# exploring data
print(f"Sentence 0\n{train_sentences}")
print()
print(f"Labels 0\n{train_labels}")
print()
print(data.head())

Sentence 0
Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .


Labels 0
O O O O O O B-geo O O O O O B-geo O O O O O B-gpe O O O O O


    Sentence #           Word  POS Tag
0  Sentence: 1      Thousands  NNS   O
1          NaN             of   IN   O
2          NaN  demonstrators  NNS   O
3          NaN           have  VBP   O
4          NaN        marched  VBN   O


In [3]:
# Load preprocessed data
# get_vocab / get_params come from utils_copy_1; their implementation is not
# shown in this notebook, so the notes below describe only how the results are used.
# vocab and tag_map are indexed by word / tag string in later cells.
vocab, tag_map = get_vocab('data/large/words.txt', 'data/large/tags.txt')
# Each call returns three values, used below as: encoded sentences, encoded labels, and a size.
# NOTE(review): these read from data/large, while the raw-text preview cell reads data/small.
t_sentences, t_labels, t_size = get_params(vocab, tag_map, 'data/large/train/sentences.txt', 'data/large/train/labels.txt')
v_sentences, v_labels, v_size = get_params(vocab, tag_map, 'data/large/val/sentences.txt', 'data/large/val/labels.txt')
test_sentences, test_labels, test_size = get_params(vocab, tag_map, 'data/large/test/sentences.txt', 'data/large/test/labels.txt')
# Number of entries in the vocabulary; used as the embedding table size by the model
vocab_size = len(vocab)

In [4]:
# Spot-check the vocabulary: index of a common word and of the padding token.
# The '<PAD>' index is the value later used to pad batches.
print(f"'the' index: {vocab['the']}")
print(f"'PAD' index: {vocab['<PAD>']}")

'the' index: 9
'PAD' index: 35180


In [5]:
# Summary of the loaded data: vocabulary size, split sizes,
# the first encoded example of every split, and the tag inventory.
print(f"Vocab size: {vocab_size}")
print()
# Sizes as returned by get_params for each split
print(f"Train size: {t_size}")
print(f"Validation size: {v_size}")
print(f"Test size: {test_size}")
print()
# First example of each split: word indices and the matching tag indices
print(f"Train sentence 0: {t_sentences[0]}")
print(f"Train labels 0: {t_labels[0]}")
print()
print(f"Val sentence 0: {v_sentences[0]}")
print(f"Val labels 0: {v_labels[0]}")
print()
print(f"Test sentence 0: {test_sentences[0]}")
print(f"Test labels 0: {test_labels[0]}")
print()
# Mapping from tag string to the integer class the model predicts
print(f"Tags to be predicted")
print(tag_map)
print()
print(f"Tags size: {len(tag_map)}")

Vocab size: 35181

Train size: 33570
Validation size: 7194
Test size: 7194

Train sentence 0: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 9, 15, 1, 16, 17, 18, 19, 20, 21]
Train labels 0: [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0]

Val sentence 0: [1020, 68, 5092, 50, 9, 29845, 1677, 18327, 1033, 9, 4452, 13, 522, 29846, 45, 10314, 223, 6582, 21]
Val labels 0: [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0]

Test sentence 0: [4046, 3007, 18, 3793, 2474, 7895, 93, 45, 1701, 32653, 3179, 93, 134, 19565, 5343, 740, 93, 13, 45, 19116, 4181, 1813, 21]
Test labels 0: [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

Tags to be predicted
{'O': 0, 'B-geo': 1, 'B-gpe': 2, 'B-per': 3, 'I-geo': 4, 'B-org': 5, 'I-org': 6, 'B-tim': 7, 'B-art': 8, 'I-art': 9, 'I-per': 10, 'I-gpe': 11, 'I-tim': 12, 'B-nat': 13, 'B-eve': 14, 'I-eve': 15, 'I-nat': 16}

Tags size: 17


### Data Generator

In [6]:
# data_generator function
def data_generator(batch_size, x, y, pad, shuffle = False):
    """Yield an endless stream of padded (X, Y) batches.

    Examples are taken in index order (or in a random order when ``shuffle``
    is true). When the end of the data is reached the generator wraps around
    to the start, reshuffling first if ``shuffle`` is true, so a batch may
    mix the tail of one pass with the head of the next.

    Args:
        batch_size: number of examples per batch; must be at least 1.
        x: sequence of examples, each a sequence of values (e.g. word indices).
        y: sequence of label sequences aligned with ``x``.
        pad: value used to right-pad both X and Y up to the longest example
            in the batch.
        shuffle: whether to visit the examples in random order
            (uses the module-level ``random`` state).

    Yields:
        Tuple ``(X, Y)`` of numpy arrays, each of shape
        ``(batch_size, max_length)`` where ``max_length`` is the length of the
        longest ``x`` example in that batch.

    Raises:
        ValueError: on the first ``next()`` if ``batch_size`` is smaller than 1
            or ``x`` is empty, and whenever a label sequence is shorter than
            its example.
    """
    # count number of lines
    num_lines = len(x)

    # fail loudly instead of looping forever on empty batches or
    # crashing with an opaque IndexError on an empty dataset
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")
    if num_lines == 0:
        raise ValueError("data_generator needs at least one example in x")

    # create list of indexes
    data_indexes = list(range(num_lines))

    # shuffle indexes if shuffle is True
    if shuffle:
        rnd.shuffle(data_indexes)

    # track location of the current example
    cur_index = 0

    while True:
        # raw (unpadded) examples and labels selected for this batch
        buffer_x = []
        buffer_y = []

        for _ in range(batch_size):
            # wrap around once every example has been served
            if cur_index >= num_lines:
                cur_index = 0
                # reshuffle indexes if shuffle set to True
                if shuffle:
                    rnd.shuffle(data_indexes)

            # store the raw example together with its corresponding labels
            example_index = data_indexes[cur_index]
            buffer_x.append(x[example_index])
            buffer_y.append(y[example_index])

            # move on to the next example
            cur_index += 1

        # every row is padded up to the longest example of this batch
        max_length = max(len(x_i) for x_i in buffer_x)

        # X, Y arrays of shape (batch_size, max_length), pre-filled with pad
        X = np.full((batch_size, max_length), pad)
        Y = np.full((batch_size, max_length), pad)

        # copy each example into the left part of its row
        for i, (x_i, y_i) in enumerate(zip(buffer_x, buffer_y)):
            length = len(x_i)
            if len(y_i) < length:
                raise ValueError(
                    f"labels have {len(y_i)} entries but the example has {length}")
            X[i, :length] = x_i
            # only the first `length` labels are used, one per element of x_i
            Y[i, :length] = y_i[:length]

        yield (X, Y)

In [7]:
# Smoke-test the generator on the first 8 training sentences
batch_size = 5
mini_sentences = t_sentences[0: 8]
mini_labels = t_labels[0: 8]
dg = data_generator(batch_size, mini_sentences, mini_labels, vocab["<PAD>"], shuffle=False)
# Two batches of 5 drawn from 8 sentences: the second batch runs past the
# end of the data and wraps back around to sentence 0.
X1, Y1 = next(dg)
X2, Y2 = next(dg)
print(Y1.shape, X1.shape, Y2.shape, X2.shape)
# First padded sentence of batch 1 and its padded labels
print(X1[0][:], "\n", Y1[0][:])

(5, 30) (5, 30) (5, 30) (5, 30)
[    0     1     2     3     4     5     6     7     8     9    10    11
    12    13    14     9    15     1    16    17    18    19    20    21
 35180 35180 35180 35180 35180 35180] 
 [    0     0     0     0     0     0     1     0     0     0     0     0
     1     0     0     0     0     0     2     0     0     0     0     0
 35180 35180 35180 35180 35180 35180]


### Model architecture

In [10]:
def LSTM(vocab_size = vocab_size, d_model = 128, tags = tag_map):
    """Build the NER tagger as a serial stack of trax layers.

    Args:
        vocab_size: size passed to the embedding layer.
        d_model: embedding dimension, also used as the number of LSTM units.
        tags: collection of tags; its length is the width of the dense output layer.

    Returns:
        A tl.Serial model: Embedding -> LSTM -> Dense -> LogSoftmax.
    """
    n_tags = len(tags)
    embed = tl.Embedding(vocab_size, d_model)
    recurrent = tl.LSTM(n_units = d_model)
    # one output score per tag, turned into log-probabilities by LogSoftmax
    classify = tl.Dense(n_tags)
    normalize = tl.LogSoftmax()
    return tl.Serial(embed, recurrent, classify, normalize)

In [11]:
# Build a small throwaway model (d_model = 50) just to inspect the layer stack
tmp_model = LSTM(vocab_size = vocab_size, d_model = 50, tags = tag_map)
display(tmp_model)

Serial[
  Embedding_35181_50
  LSTM_50
  Dense_17
  LogSoftmax
]

### Train model

In [12]:
from trax.supervised import training
from trax.data import add_loss_weights


def train_model(model, train_data, train_labels, eval_data, eval_labels, data_generator = data_generator, batch_size = 64, pad = vocab["<PAD>"], n_steps = 1, shuffle = True, output_dir = "model/"):
    """Train ``model`` with a trax training loop and return the loop.

    Args:
        model: trax model to train.
        train_data: training examples, passed to ``data_generator`` as ``x``.
        train_labels: training labels, passed to ``data_generator`` as ``y``.
        eval_data: evaluation examples.
        eval_labels: evaluation labels.
        data_generator: callable with the signature
            ``(batch_size, x, y, pad, shuffle)`` that yields (X, Y) batches.
        batch_size: number of examples per batch for both generators.
        pad: padding id; used to pad the batches and passed to
            ``add_loss_weights`` as ``id_to_mask``.
        n_steps: number of training steps to run (steps, not epochs).
        shuffle: forwarded to both the training and the evaluation generator.
        output_dir: directory handed to ``training.Loop`` for its outputs.

    Returns:
        The ``training.Loop`` instance after ``run(n_steps)`` has completed.
    """
    # creating train generator; honor the caller's shuffle flag
    train_generator = add_loss_weights(
        data_generator(batch_size, train_data, train_labels, pad, shuffle),
        id_to_mask = pad)

    # creating eval generator
    eval_generator = add_loss_weights(
        data_generator(batch_size, eval_data, eval_labels, pad, shuffle),
        id_to_mask = pad)

    # creating train task
    train_task = training.TrainTask(
        labeled_data = train_generator,
        loss_layer = tl.CrossEntropyLoss(),
        optimizer = trax.optimizers.Adam(0.001),
        n_steps_per_checkpoint = 10
    )
    # creating eval task
    eval_task = training.EvalTask(
        labeled_data = eval_generator,
        metrics = [tl.CrossEntropyLoss(), tl.Accuracy()],
        n_eval_batches = 10
    )
    # define training loop
    training_loop = training.Loop(
        model,
        train_task,
        eval_tasks = [eval_task],
        output_dir = output_dir
    )
    # run the loop for n_steps training steps
    training_loop.run(n_steps)
    # return training_loop
    return training_loop

In [25]:
# Training hyper-parameters
BATCH_SIZE = 64
# NOTE: despite the name, this value is passed as n_steps to train_model in the
# next cell, so it is the number of training steps, not passes over the data.
EPOCHS = 350
# Fresh, untrained model that the training cell below will fit
ner_model = LSTM(vocab_size = vocab_size, d_model = 128, tags = tag_map)

In [26]:
# train model
# Runs EPOCHS training steps on the train split, evaluating on the validation split;
# "model_3/" is the output directory the later cell loads model.pkl.gz from.
training_loop = train_model(ner_model, t_sentences, t_labels, v_sentences, v_labels, batch_size = BATCH_SIZE, n_steps = EPOCHS, output_dir = "model_3/")

  with gzip.GzipFile(fileobj=f, compresslevel=compresslevel) as gzipf:



Step      1: Total number of trainable weights: 4636945
Step      1: Ran 1 train steps in 16.07 secs
Step      1: train CrossEntropyLoss |  2.36170220


  with gzip_lib.GzipFile(fileobj=f, compresslevel=2) as gzipf:


Step      1: eval  CrossEntropyLoss |  1.97816901
Step      1: eval          Accuracy |  0.35201188

Step     10: Ran 9 train steps in 67.00 secs
Step     10: train CrossEntropyLoss |  1.18887854
Step     10: eval  CrossEntropyLoss |  0.86785098
Step     10: eval          Accuracy |  0.84948053

Step     20: Ran 10 train steps in 42.77 secs
Step     20: train CrossEntropyLoss |  0.86420357
Step     20: eval  CrossEntropyLoss |  0.81392171
Step     20: eval          Accuracy |  0.84810922

Step     30: Ran 10 train steps in 18.63 secs
Step     30: train CrossEntropyLoss |  0.80493563
Step     30: eval  CrossEntropyLoss |  0.79448045
Step     30: eval          Accuracy |  0.84451742

Step     40: Ran 10 train steps in 30.73 secs
Step     40: train CrossEntropyLoss |  0.77113956
Step     40: eval  CrossEntropyLoss |  0.77103152
Step     40: eval          Accuracy |  0.84634164

Step     50: Ran 10 train steps in 15.75 secs
Step     50: train CrossEntropyLoss |  0.76410991
Step     50: eva

### Compute accuracy

In [27]:
# Batch size equals the number of test sentences and shuffle is off, so the
# first batch is the whole test set in its original order, padded to the
# longest test sentence.
test_gen = data_generator(len(test_sentences), test_sentences, test_labels, vocab["<PAD>"], False)
test_x, test_y = next(test_gen)
print(test_x.shape)
print(test_y.shape)

(7194, 70)
(7194, 70)


In [28]:
def load_trained_lstm(model_path, vocab_size = vocab_size, tag_map = tag_map, d_model=128, seq_len=10):
    """Rebuild the LSTM tagger and load saved weights into it.

    Args:
        model_path: path of the saved model file to read weights from.
        vocab_size: vocabulary size used to build the model.
        tag_map: tag collection used to size the output layer.
        d_model: embedding / LSTM width used to build the model.
        seq_len: sequence length of the dummy (1, seq_len) int32 input
            signature used to initialize the model before loading.

    Returns:
        The model after init_from_file(model_path, weights_only=True).
    """
    # dummy input signature: a single sequence of seq_len int32 token ids
    input_signature = trax.shapes.ShapeDtype((1, seq_len), dtype=np.int32)

    restored = LSTM(vocab_size=vocab_size, d_model=d_model, tags=tag_map)
    restored.init(input_signature)
    restored.init_from_file(model_path, weights_only=True)
    return restored

In [29]:
# Restore the trained tagger from the checkpoint written to model_3/.
# load_trained_lstm handles both the dummy-shape initialization and the
# weight loading, so no separate init / init_from_file calls are needed here.
model = load_trained_lstm('./model_3/model.pkl.gz')

In [30]:
# Show the layer stack of the restored model
display(model)

Serial[
  Embedding_35181_128
  LSTM_128
  Dense_17
  LogSoftmax
]

In [31]:
# make dummy prediction
# Run the model on the full padded test batch; the printed shape below is
# (sentences, padded length, number of tags).
tmp_preds = model(test_x)
print(tmp_preds.shape)

(7194, 70, 17)


In [32]:
# Taking the argmax over the last (tag) axis leaves one predicted tag index per token position
np.argmax(tmp_preds, axis = 2).shape

(7194, 70)

In [33]:
# function for evaluating model performance
def evaluate_model(preds, labels, pad):
    """Compute token-level accuracy, ignoring padded positions.

    Args:
        preds: array of per-tag scores with the tag dimension on axis 2,
            i.e. (batch size, padded example length, entities).
        labels: array of true tag indices, padded with ``pad``; same shape as
            ``preds`` without the last axis.
        pad: value that marks padded positions in ``labels``.

    Returns:
        Number of correctly predicted non-padded positions divided by the
        number of non-padded positions, or 0.0 when there are none.
    """
    # choose max entity label index from axis 2
    outputs = np.argmax(preds, axis = 2) # preds shape = (batch size, padded example length, entities)
    print(f"Outputs shape: {outputs.shape}") # (batch size, padded example length)
    # mask of the real (non padded) positions
    mask = (labels != pad)
    n_tokens = np.sum(mask)
    # nothing to score: avoid dividing by zero
    if n_tokens == 0:
        return 0.0
    # count matches on real positions only, so a pad id that happens to be a
    # valid tag index cannot inflate the numerator
    n_correct = np.sum((outputs == labels) & mask)
    accuracy = n_correct / float(n_tokens)
    # return accuracy
    return accuracy

In [34]:
# test evaluation function
# Predict on the whole padded test batch and score against the padded labels,
# passing the '<PAD>' index so padded positions can be masked out.
tmp_preds = model(test_x)
accuracy = evaluate_model(tmp_preds, test_y, vocab["<PAD>"])
print(f"Accuracy = {accuracy * 100}%")

Outputs shape: (7194, 70)
Accuracy = 94.49607849121094%


### Inference case

In [35]:
def predict(sentence, model, vocab, tags):
    """Predict one named-entity tag per whitespace-separated token.

    Args:
        sentence: text to tag; tokens are obtained with ``sentence.split(' ')``.
        model: callable that maps an integer array of shape (1, n_tokens) to
            scores of shape (1, n_tokens, n_tags).
        vocab: mapping from token to index; must contain the key 'UNK', which
            is used for tokens that are not in the vocabulary.
        tags: mapping from tag string to tag index.

    Returns:
        List of tag strings, one per token of ``sentence``.
    """
    # encode the tokens of the *argument* (not of any notebook global)
    unk_id = vocab['UNK']
    token_ids = [vocab[token] if token in vocab else unk_id for token in sentence.split(' ')]
    # batch of a single sentence, shape (1, n_tokens)
    batch = np.array([token_ids]).astype(int)
    # get output scores and pick the best tag index per token
    output = model(batch)
    outputs = np.argmax(output, axis = 2)
    # invert the mapping so the lookup uses the stored index of each tag
    # rather than relying on the key order of the dict
    index_to_tag = {index: tag for tag, index in tags.items()}
    # int() turns array scalars into plain ints usable as dict keys
    return [index_to_tag[int(idx)] for idx in np.asarray(outputs[0])]

In [36]:
# Example sentence to tag
sent = "Peter Navarro, the White House director of trade and manufacturing policy of U.S, said in an interview on Sunday morning that the White House was working to prepare for the possibility of a second wave of the coronavirus in the fall, though he said it wouldn’t necessarily come"
# Token ids computed here only so they can be printed below (same lookup rule
# as inside predict: split on single spaces, fall back to 'UNK').
# Punctuation stays attached to its word (e.g. "Navarro,") because of the plain split.
s = [vocab[token] if token in vocab else vocab['UNK'] for token in sent.split(' ')]
pred_labels = predict(sent, model, vocab, tag_map)
print(f"Sentence: {sent}")
print(f"Tokenized sentence: {s}")
print(f"Named Entities: {pred_labels}")

# List only the tokens whose predicted tag is not 'O'
for x,y in zip(sent.split(' '), pred_labels):
    if y != 'O':
        print(x,y)

Sentence: Peter Navarro, the White House director of trade and manufacturing policy of U.S, said in an interview on Sunday morning that the White House was working to prepare for the possibility of a second wave of the coronavirus in the fall, though he said it wouldn’t necessarily come
Tokenized sentence: [4921, 35179, 9, 2046, 2047, 4512, 1, 731, 13, 6716, 4313, 1, 35179, 172, 11, 134, 1470, 63, 350, 3525, 19, 9, 2046, 2047, 59, 2594, 7, 359, 223, 9, 8138, 1, 45, 103, 1003, 1, 9, 35179, 11, 9, 35179, 3415, 502, 172, 58, 35179, 21257, 1140]
Named Entities: ['B-per', 'O', 'O', 'B-org', 'I-org', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-tim', 'I-tim', 'O', 'O', 'B-org', 'I-org', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Peter B-per
White B-org
House I-org
Sunday B-tim
morning I-tim
White B-org
House I-org
