In [40]:
import numpy as np

In [2]:
# Enable IPython's autoreload extension.
%load_ext autoreload
# Mode 2: reload all imported modules before each cell execution, so edits to
# helper modules (e.g. the utils.* / models.* imports below) are picked up
# without restarting the kernel.
%autoreload 2

## Dataset loading

In [75]:
from utils.dataset_loader import load_datasets

# Read the train / validation / test splits from the 'dataset' folder.
# Each split is unpacked as a (texts, labels) pair. Judging by the samples
# displayed further down, an entry is a whitespace-separated sentence and its
# aligned tag string -- presumably what divide_by_sentence=True selects
# (TODO confirm against utils.dataset_loader).
(
    (texts_train, labels_train),
    (texts_val, labels_val),
    (texts_test, labels_test),
) = load_datasets(folder_path='dataset', divide_by_sentence=True)


Vectorizing the texts

In [76]:
from tensorflow.keras.layers import TextVectorization

# The three splits are concatenated so that a single vocabulary covers every
# word; the split boundaries are recovered by position after vectorization.
texts = texts_train + texts_val + texts_test

# Every sentence is padded or truncated to exactly this many tokens.
sequence_length = 50

# standardize=None disables the layer's default lowercasing and punctuation
# stripping, so tokens are kept exactly as they appear in the data.
vectorizer = TextVectorization(
    standardize=None,
    output_sequence_length=sequence_length,
)
vectorizer.adapt(data=texts)

# Integer-encoded sentences, one row per sentence.
x = vectorizer(texts)

# Mapping from integers to word types
vocabulary = np.array(vectorizer.get_vocabulary())

# Slice the encoded rows back into the original train / val / test splits.
train_end = len(texts_train)
val_end = train_end + len(texts_val)
x_train = x[:train_end]
x_val = x[train_end:val_end]
x_test = x[val_end:]

Vectorizing the labels

In [81]:
from tensorflow.keras.layers import TextVectorization

# Concatenated in the same train / val / test order as the texts, so the rows
# of y are expected to line up with the rows of x (assumes every split has as
# many label strings as sentences -- TODO confirm).
labels = labels_train + labels_val + labels_test

# The tag sequences must be padded/truncated to exactly the same length as the
# word sequences, so `sequence_length` from the text-vectorization cell is
# reused here instead of being redefined (two copies of the constant could
# silently drift apart and break training with a shape mismatch).
# standardize=None keeps the tags verbatim (upper case, symbols such as ',' or '$').
vectorizer_labels = TextVectorization(output_sequence_length=sequence_length, standardize=None)

vectorizer_labels.adapt(data=labels)

# Integer-encoded tag sequences, one row per sentence.
y = vectorizer_labels(labels)

# Mapping from integers to POS tags
vocabulary_labels = np.array(vectorizer_labels.get_vocabulary())

# Slice the encoded rows back into the original train / val / test splits.
y_train = y[:len(labels_train)]
y_val = y[len(labels_train):len(labels_train)+len(labels_val)]
y_test = y[len(labels_train)+len(labels_val):]

# Number of output classes = size of the tag vocabulary. By default
# get_vocabulary() also returns the padding ('') and out-of-vocabulary
# ('[UNK]') entries, so they are counted as classes too.
n_classes = len(vocabulary_labels)

GloVe embeddings

In [82]:
from utils.glove_loader import load_glove

# Size of the pre-trained vectors to load (passed as embedding_dim below and
# reused when the embedding matrix is built).
embedding_dimension = 50

# Load the pre-trained GloVe embeddings stored under 'glove_pretrained'.
GLOVE_embeddings = load_glove(
    folder_path='glove_pretrained',
    embedding_dim=embedding_dimension,
)

Embedding matrix

In [83]:
from utils.embedding_matrix_builder import build_embedding_matrix

# Build the matrix handed to the model's embedding layer from the word
# vocabulary and the loaded GloVe vectors.
# NOTE(review): presumably row i holds the vector of vocabulary[i] -- confirm
# in utils.embedding_matrix_builder, including how words missing from GloVe
# are handled.
embedding_matrix = build_embedding_matrix(
    vocabulary=vocabulary,
    GLOVE_embeddings=GLOVE_embeddings,
    embedding_dimension=embedding_dimension,
)

## Baseline Model

In [84]:
from models.baseline_model import build_baseline_model

# Instantiate the baseline tagger. It receives the padded sequence length, the
# number of tag classes and the embedding matrix built above.
baseline_model = build_baseline_model(
    sequence_length=sequence_length,
    n_classes=n_classes,
    embedding_matrix=embedding_matrix,
)

In [78]:
# Print the layer-by-layer architecture and parameter counts of the model.
baseline_model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 50)]              0         
                                                                 
 embedding_2 (Embedding)     (None, 50, 50)            547450    
                                                                 
 lstm_2 (LSTM)               [(None, 50, 128),         91648     
                              (None, 128),                       
                              (None, 128)]                       
                                                                 
 dense_1 (Dense)             (None, 50, 47)            6063      
                                                                 
Total params: 645,161
Trainable params: 645,161
Non-trainable params: 0
_________________________________________________________________


In [85]:
import tensorflow as tf

# The targets are integer class ids (not one-hot vectors), hence the sparse
# variant of categorical cross-entropy.
# from_logits=True tells the loss that the model outputs raw, un-normalised
# scores. NOTE(review): confirm that the last layer created by
# build_baseline_model applies no softmax.
baseline_model.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

In [26]:
# Without using punctuation
# NOTE(review): this call is identical to the next training cell. Its saved
# output presumably comes from an earlier run on data prepared without
# punctuation -- TODO confirm, and drop one of the two cells.
# Keras fit() continues from the model's current weights, so executing both
# cells in sequence trains the same model for 100 epochs in total.
baseline_model.fit(x=x_train, y=y_train, batch_size=32, epochs=50, validation_data=(x_val, y_val))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x25dedce55d0>

In [86]:
# Train for 50 epochs in mini-batches of 32 sentences, reporting loss and
# accuracy on the validation split after every epoch.
baseline_model.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_val, y_val),
    epochs=50,
    batch_size=32,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x25e55bc2c80>

Check the predicted labels on a single test sentence

In [63]:
# First test sentence (raw, whitespace-separated tokens).
texts_test[0]

'intelogic trace inc. , san antonio , texas , said it bought 2.7 million shares , or about 18 % , of its common stock from an unaffiliated shareholder for $ 3.625 a share , or $ 9.9 million .'

In [64]:
# Gold POS tags of the first test sentence, one tag per token.
labels_test[0]

'NNP NNP NNP , NNP NNP , NNP , VBD PRP VBD CD CD NNS , CC IN CD NN , IN PRP$ JJ NN IN DT JJ NN IN $ CD DT NN , CC $ CD CD .'

In [67]:
# Without using punctuation
# NOTE(review): same code as the next prediction cell; the saved output
# presumably comes from an earlier run on data prepared without punctuation
# -- TODO confirm.

import numpy as np

# Run the model on the first test sentence (kept as a batch of one), take the
# highest-scoring class at each of the 50 positions and map the ids back to tags.
# Positions past the end of the sentence are padding, so their tags carry no meaning.
vocabulary_labels[np.argmax(baseline_model(x_test[0:1]), axis=2)]

array([['nnp', 'nns', 'nnp', 'nnp', 'nnp', 'nnp', 'vbd', 'prp', 'vbd',
        'cd', 'cd', 'nns', 'cc', 'rb', 'cd', 'nn', 'in', 'prp', 'nn',
        'nn', 'in', 'dt', 'jj', 'nn', 'in', 'jj', 'dt', 'nn', 'cc', 'cd',
        'cd', 'cd', 'cd', 'cd', 'cd', 'cd', 'cd', 'cd', 'cd', 'cd', 'cd',
        'cd', 'cd', 'cd', 'cd', 'cd', 'cd', 'cd', 'cd', 'cd']],
      dtype='<U5')

In [87]:
import numpy as np

# Run the model on the first test sentence (kept as a batch of one), pick the
# highest-scoring class at every position along the last (class) axis and
# translate the resulting ids back into POS tags.
# Positions past the end of the sentence are padding, so their tags carry no meaning.
vocabulary_labels[np.asarray(baseline_model(x_test[:1])).argmax(axis=-1)]

array([['NNP', 'NNS', 'NNP', ',', 'NNP', 'NNP', ',', 'NNP', ',', 'VBD',
        'PRP', 'VBD', 'CD', 'CD', 'NNS', ',', 'CC', 'RB', 'CD', 'NN',
        ',', 'IN', 'PRP$', 'JJ', 'NN', 'IN', 'DT', 'JJ', 'NN', 'IN', '$',
        'CD', 'DT', 'NN', ',', 'CC', '$', 'CD', 'CD', '.', '.', '.', '.',
        '.', '.', '.', '.', '.', '.', '.']], dtype='<U5')

Test score

In [38]:
# Without using punctuation
# NOTE(review): same code as the next evaluation cell; the saved output
# presumably comes from an earlier run on data prepared without punctuation
# -- TODO confirm.
# Returns [loss, accuracy] on the test split (loss and metric set in compile()).
baseline_model.evaluate(x_test, y_test)



[0.9275322556495667, 0.8213129043579102]

In [88]:
# Returns [loss, accuracy] on the test split (loss and metric set in compile()).
baseline_model.evaluate(x_test, y_test)



[0.45929422974586487, 0.899909496307373]