# Data Source

In [0]:
%%shell

# Fetch the repository that holds the datasets. The clone is skipped when the
# directory already exists so that re-running this cell does not fail.
[ -d NER-Deep-Learning ] || git clone https://github.com/SmartyPants042/NER-Deep-Learning.git
pwd
cd NER-Deep-Learning/Data/
# -o overwrites previously extracted files instead of prompting for each one,
# which keeps a re-run of this cell non-interactive.
unzip -o dataset.zip
unzip -o test_dataset.zip
# downloads the keras_contrib module required for the CRF Layer
pip install git+https://www.github.com/keras-team/keras-contrib.git

Cloning into 'NER-Deep-Learning'...
remote: Enumerating objects: 28, done.[K
remote: Counting objects:   3% (1/28)[Kremote: Counting objects:   7% (2/28)[Kremote: Counting objects:  10% (3/28)[Kremote: Counting objects:  14% (4/28)[Kremote: Counting objects:  17% (5/28)[Kremote: Counting objects:  21% (6/28)[Kremote: Counting objects:  25% (7/28)[Kremote: Counting objects:  28% (8/28)[Kremote: Counting objects:  32% (9/28)[Kremote: Counting objects:  35% (10/28)[Kremote: Counting objects:  39% (11/28)[Kremote: Counting objects:  42% (12/28)[Kremote: Counting objects:  46% (13/28)[Kremote: Counting objects:  50% (14/28)[Kremote: Counting objects:  53% (15/28)[Kremote: Counting objects:  57% (16/28)[Kremote: Counting objects:  60% (17/28)[Kremote: Counting objects:  64% (18/28)[Kremote: Counting objects:  67% (19/28)[Kremote: Counting objects:  71% (20/28)[Kremote: Counting objects:  75% (21/28)[Kremote: Counting objects:  78% (22/28)[Kremote:



In [0]:
# Absolute locations of the training and test CSV files used by this notebook.
data_input, test_input = (
    '/content/NER-Deep-Learning/Data/dataset.csv',
    '/content/NER-Deep-Learning/Data/test_dataset.csv',
)

In [0]:
# Dataframe manipulations
import pandas as pd
# Array manipulations
import numpy as np

# Library used for optimizers
import keras
# Not all sentences are of same length, padding is required
from keras.preprocessing.sequence import pad_sequences
# Converts the target labels to categories that the neural net can predict
from keras.utils import to_categorical
# Type of model used for DL
from keras.models import Sequential, Model
# Layers present in the network. 
# Refer README.md for more details.
# Analysis of different combinations of layers in README.md
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
# A Conditional Random Field (CRF) layer
from keras_contrib.layers import CRF
# For the loss by CRF Layer
from keras_contrib.losses import crf_loss
# For the accuracy given by the CRF Layer. 
# The accuracy otherwise is extremely low
# Causing it to be interpreted in a wrong way
from keras_contrib.metrics import crf_viterbi_accuracy

Using TensorFlow backend.


# `Data Manipulation`

In [0]:
# Read the training table and the test table from their CSV paths.
df, df_test = map(pd.read_csv, (data_input, test_input))

In [0]:
def aggregate_function(s):
    """Turn a table of token rows into a list of (word, POS tag, NER tag) tuples.

    `s` must expose the columns 'Word', 'POS Tag' and 'NER Tag'; the i-th tuple
    of the result is built from the i-th value of each of those columns.
    """
    word_col, pos_col, ner_col = (
        list(s[column].values) for column in ('Word', 'POS Tag', 'NER Tag')
    )
    return list(zip(word_col, pos_col, ner_col))

In [0]:
# Group the token rows by "Sentence ID" and collapse every group into a single
# list of (word, POS tag, NER tag) tuples, for the training and test tables.
sentences, sentences_test = (
    frame.groupby("Sentence ID").apply(aggregate_function)
    for frame in (df, df_test)
)

In [0]:
# Token count of the longest training sentence; used later as the padded length.
max_length = max(map(len, sentences))

In [0]:
# Vocabulary: every distinct training word, in order of first appearance.
# Series.unique() is used instead of set() because the iteration order of a
# set of strings changes between interpreter runs, which would assign
# different ids to the same words on every kernel restart.
words = list(df["Word"].unique())
# Extra vocabulary entry appended after all real words.
words.append("__PAD__")
n_words = len(words)

In [0]:
# Distinct NER tags in order of first appearance. Series.unique() keeps the
# tag -> id assignment stable across kernel restarts, unlike iterating a set
# of strings, whose order varies from run to run.
tags = list(df["NER Tag"].unique())
n_tags = len(tags)

In [0]:
# Lookup tables from each word / tag to its position in the respective list.
word2id = dict(zip(words, range(len(words))))
tag2id = dict(zip(tags, range(len(tags))))

# `Training & Testing`

In [0]:
# Split every sentence into its list of words (tuple slot 0) and its list of
# NER tags (tuple slot 2), keeping at most max_length tokens per sentence.
# The words are still raw strings here: they have not been mapped to ids yet,
# and the sentences have not been padded to a common length yet.
X_train_sent = [[token[0] for token in sent[:max_length]] for sent in sentences]
y_train_sent = [[token[2] for token in sent[:max_length]] for sent in sentences]
X_test_sent = [[token[0] for token in sent[:max_length]] for sent in sentences_test]
y_test_sent = [[token[2] for token in sent[:max_length]] for sent in sentences_test]

In [0]:
def generate_encodings(X_sent, y_sent):
    """
    Description:
    Converts sentences of words and their tag sequences into sequences of ids.
    A word found in `word2id` gets its id; any other word gets the id of
    '__PAD__'. A tag found in `tag2id` gets its id; any other tag gets the id
    of 'O'. Both lookup tables are read from the notebook's global scope.

    Returns: Two tuple of encoded sentences, encoded target labels.
    Each is a numpy array holding one 1-D id array per sentence; when the
    sentences differ in length it is a 1-D object array of those arrays.

    Input Params: Two tuple of sentences and target labels.
    Sentences and label sequences are paired positionally; within a pair,
    encoding stops at the shorter of the two sequences.
    """
    encoded_sentences = []
    encoded_labels = []

    for sentence, labels in zip(X_sent, y_sent):
        word_ids = []
        tag_ids = []

        for word, tag in zip(sentence, labels):
            word_ids.append(_lookup_id(word2id, word, '__PAD__'))
            tag_ids.append(_lookup_id(tag2id, tag, 'O'))

        encoded_sentences.append(np.array(word_ids))
        encoded_labels.append(np.array(tag_ids))

    return (_stack_rows(encoded_sentences), _stack_rows(encoded_labels))


def _lookup_id(index, key, fallback_key):
    """Return index[key], or index[fallback_key] when key is missing from index."""
    try:
        return index[key]
    except KeyError:
        # Only a missing key is treated as "unknown"; any other error is a
        # genuine problem and is allowed to propagate.
        return index[fallback_key]


def _stack_rows(rows):
    """Pack a list of 1-D arrays into one array, tolerating unequal lengths."""
    try:
        return np.array(rows)
    except ValueError:
        # NumPy versions that reject rows of differing lengths raise ValueError
        # here; build the 1-D object array of rows explicitly instead.
        ragged = np.empty(len(rows), dtype=object)
        for position, row in enumerate(rows):
            ragged[position] = row
        return ragged

In [0]:
# Encode the training split first, then the test split, into id sequences.
(X_train, y_train), (X_test, y_test) = (
    generate_encodings(X_train_sent, y_train_sent),
    generate_encodings(X_test_sent, y_test_sent),
)

In [0]:
# Bring every encoded sentence and label sequence to exactly max_length
# entries by appending filler at the end ('post'). Word sequences are filled
# with the '__PAD__' id, label sequences with the 'O' tag id.
X_train, X_test = (
    pad_sequences(encoded, maxlen=max_length, padding='post', value=word2id['__PAD__'])
    for encoded in (X_train, X_test)
)
y_train, y_test = (
    pad_sequences(encoded, maxlen=max_length, padding='post', value=tag2id['O'])
    for encoded in (y_train, y_test)
)

In [0]:
# One-hot encode every padded label row over the n_tags classes.
y_train = list(map(lambda label_row: to_categorical(label_row, n_tags), y_train))
y_test = list(map(lambda label_row: to_categorical(label_row, n_tags), y_test))

In [0]:
# The network is assembled one layer at a time, in the order the data flows.
model = Sequential()

# Maps each word id to a trainable 64-dimensional vector.
model.add(Embedding(input_dim=n_words, output_dim=64, input_length=max_length))

# Drops a random 50% of the embedding activations during training,
# as a guard against overfitting.
model.add(Dropout(0.5))

# LSTM with 128 units wrapped in Bidirectional so that context from both
# directions of the sentence is captured.
# return_sequences=True emits an output for every position instead of only
# the last one. The two activation choices can be experimented with.
model.add(Bidirectional(LSTM(
    128,
    activation='tanh',
    recurrent_activation='sigmoid',
    return_sequences=True,
    use_bias=True,
)))

# Second 50% dropout, again to limit overfitting.
model.add(Dropout(0.5))

# Applies the same Dense layer to every position independently, producing
# n_tags ReLU-activated scores per token.
model.add(TimeDistributed(Dense(n_tags, activation='relu')))

# CRF output layer over the n_tags labels.
model.add(CRF(n_tags))

In [0]:
# Adam optimizer with a learning rate of 0.0005; beta_1 and beta_2 are passed
# explicitly as 0.9 and 0.999.
adam = keras.optimizers.Adam(lr=0.0005, beta_1=0.9, beta_2=0.999)

In [0]:
model.compile(
    # Loss function that belongs to the CRF output layer.
    loss=crf_loss,
    # The Adam instance configured above; 'rmsprop' is an alternative that
    # showed almost the same result and training time.
    optimizer=adam,
    # Report both the plain accuracy and the CRF Viterbi accuracy.
    metrics=['accuracy', crf_viterbi_accuracy],
)

In [0]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 62, 64)            3319552   
_________________________________________________________________
dropout_1 (Dropout)          (None, 62, 64)            0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 62, 256)           197632    
_________________________________________________________________
dropout_2 (Dropout)          (None, 62, 256)           0         
_________________________________________________________________
time_distributed_1 (TimeDist (None, 62, 5)             1285      
_________________________________________________________________
crf_1 (CRF)                  (None, 62, 5)             65        
Total params: 3,518,534
Trainable params: 3,518,534
Non-trainable params: 0
____________________________________________

In [0]:
history = model.fit(
    # Training features and their one-hot labels.
    x=X_train,
    y=np.array(y_train),
    # Encoded test split, passed as the validation data.
    validation_data=(X_test, np.array(y_test)),
    # Four passes over the training data, 256 samples per gradient step.
    epochs=4,
    batch_size=256,
    # Reshuffle the training samples so batches differ between epochs.
    shuffle=True,
    # Show the training progress details.
    verbose=1,
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 530000 samples, validate on 47959 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
