In [160]:
# Ignore all levels of warnings and info which can be exist from Tensorflow
import sys
import warnings

if not sys.warnoptions:
    warnings.simplefilter("ignore")
    
import logging
logging.getLogger("tensorflow").setLevel(logging.WARNING)

In [161]:
import numpy as np
import os
import pandas as pd
import csv
import string
from tqdm import tqdm

In [72]:
from keras.models import Model
from keras.layers import Dense, Dropout, Embedding, LSTM, Input, Bidirectional, TimeDistributed

from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import load_model

In [162]:
import tensorflow_hub as hub
import keras

In [75]:
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report

In [120]:
import gensim.models.keyedvectors as word2vec
from gensim.models import KeyedVectors

word2vec_path = "D:\\WordEmbeddings\\GoogleNews-vectors-negative300.bin"
word2vec = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [3]:
# Read dataset
def read_dataset(dataset):
    # read line by line
    with open(dataset) as f:
        reader = csv.reader(f, delimiter="\t")
        d = list(reader)
    
    tags = []
    sentences = []
    
    tag = []
    sentence = []
    
    for elem in d:
        # means that empty line
        if len(elem) == 0:
            tags.append(tag)
            sentences.append(sentence)
            
            tag = []
            sentence = []
        
        else:
            w = elem[1]
            t = elem[0]
            
            if w not in string.punctuation:
                tag.append(t)
                sentence.append(w)               
            
    
    # create pandas dataframe
    df = pd.DataFrame(list(zip(sentences, tags)), 
               columns =['words', 'tags'])
        
    return df

In [4]:
df = read_dataset("train.txt")

In [5]:
df.head()

Unnamed: 0,words,tags
0,"[what, movies, star, bruce, willis]","[O, O, O, B-ACTOR, I-ACTOR]"
1,"[show, me, films, with, drew, barrymore, from,...","[O, O, O, O, B-ACTOR, I-ACTOR, O, O, B-YEAR]"
2,"[what, movies, starred, both, al, pacino, and,...","[O, O, O, O, B-ACTOR, I-ACTOR, O, B-ACTOR, I-A..."
3,"[find, me, all, of, the, movies, that, starred...","[O, O, O, O, O, O, O, O, B-ACTOR, I-ACTOR, O, ..."
4,"[find, me, a, movie, with, a, quote, about, ba...","[O, O, O, O, O, O, O, O, O, O, O]"


In [11]:
def get_unique_tags(df):
    words = set()
    tags = set()

    # for padding operation over sentence
    words.add("ENDPAD")
    tags.add(np.nan)
    
    for index, row in df.iterrows():
        # for each word in each row
        for word in row["words"]:
            words.add(word)
        for tag in row["tags"]:
            tags.add(tag)

    words = list(words)
    tags = list(tags)
    
    return words,tags

In [12]:
words, tags = get_unique_tags(df)

In [14]:
len(words)

6709

In [15]:
"ENDPAD" in words

True

In [23]:
for i, e in enumerate(words):
    if e == "what":
        print(i)
        print("var")

6357
var


In [31]:
word2idx = {words[idx] : idx for idx, elem in enumerate(words)}
tag2idx = {tags[idx] : idx for idx, elem in enumerate(tags)}

In [32]:
len(words)

6709

In [33]:
word2idx["what"]

6357

In [34]:
tag2idx

{nan: 0,
 'I-ACTOR': 1,
 'B-YEAR': 2,
 'B-TRAILER': 3,
 'B-REVIEW': 4,
 'I-RATINGS_AVERAGE': 5,
 'B-PLOT': 6,
 'I-GENRE': 7,
 'I-CHARACTER': 8,
 'B-ACTOR': 9,
 'I-DIRECTOR': 10,
 'I-SONG': 11,
 'B-RATINGS_AVERAGE': 12,
 'I-RATING': 13,
 'B-SONG': 14,
 'I-TITLE': 15,
 'O': 16,
 'B-TITLE': 17,
 'B-DIRECTOR': 18,
 'B-GENRE': 19,
 'I-PLOT': 20,
 'I-REVIEW': 21,
 'B-CHARACTER': 22,
 'I-TRAILER': 23,
 'B-RATING': 24,
 'I-YEAR': 25}

In [59]:
# convert data into numerical form
X = [[word2idx[word] for word in row_elem['words']] for index, row_elem in df.iterrows()]
y = [[tag2idx[tag] for tag in row_elem['tags']] for index, row_elem in df.iterrows()]

In [60]:
word2idx["what"]

6357

In [61]:
max_seq_lenth = 0
for sentence in X:
    if len(sentence) > max_seq_lenth:
        max_seq_lenth = len(sentence)

In [62]:
max_seq_lenth

47

In [63]:
X[0]

[6357, 860, 4392, 4169, 5545]

In [64]:
# padding sequences for LSTM
X = pad_sequences(X, maxlen=max_seq_lenth, dtype='int32', padding='post', value=word2idx["ENDPAD"])
y = pad_sequences(y, maxlen=max_seq_lenth, dtype='int32', padding='post', value=tag2idx["O"])

In [65]:
X.shape

(9774, 47)

In [66]:
y.shape

(9774, 47)

In [67]:
y[0]

array([16, 16, 16,  9,  1, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
       16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
       16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16])

In [68]:
# one hot conversion for labels
y = to_categorical(y, num_classes=len(tags))

In [69]:
y.shape

(9774, 47, 26)

In [163]:
url = "https://tfhub.dev/google/elmo/2"
elmo = hub.Module(url)

In [164]:
# just a random sentence
x = ["Roasxxx ants are a popular snack in Columbia"]
x2 = ["Roasted ants are a popular snack in Columbia"]

# Extract ELMo features 
embeddings = elmo(x, signature="default", as_dict=True)["elmo"]
embeddings2 = elmo(x2, signature="default", as_dict=True)["elmo"]

embeddings.shape

TensorShape([Dimension(1), Dimension(8), Dimension(1024)])

In [165]:
def elmo_vectors(x):
    embeddings = elmo(x.tolist(), signature="default", as_dict=True)["elmo"]
    
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())
        # return average of ELMo features
        return sess.run(tf.reduce_mean(embeddings,1))

In [167]:
# Initialize embedding layer
EMBEDDING_DIM = 1024

embedding_matrix = np.zeros((len(words), EMBEDDING_DIM))
for word, i in word2idx.items():
    try:
        embedding_vector = elmo(word.lower(), signature="default", as_dict=True)["elmo"]
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector
    except:
        # assign normal noise vector instead of full zeros
        noise = np.random.normal(0, 1, (1024,))
        embedding_matrix[i] = noise


In [168]:
# Early Stopping Callback
callbacks = [EarlyStopping(monitor='val_loss', patience=2),
             ModelCheckpoint(filepath='best_model_NER.h5', monitor='val_loss', save_best_only=True)]

In [170]:
# def create model
input_node = Input(shape=(max_seq_lenth,))
embedding = Embedding(input_dim=len(words), output_dim=1024, input_length=max_seq_lenth, weights=[embedding_matrix], trainable = True)(input_node)
rec_layer = Bidirectional(LSTM(50, return_sequences= True, recurrent_dropout=0.2))(embedding)
output = TimeDistributed(Dense(len(tag2idx), activation="softmax"))(rec_layer)

NER = Model(input_node, output)

In [171]:
NER.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

In [172]:
NER.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_6 (InputLayer)         (None, 47)                0         
_________________________________________________________________
embedding_6 (Embedding)      (None, 47, 1024)          6870016   
_________________________________________________________________
bidirectional_5 (Bidirection (None, 47, 100)           430000    
_________________________________________________________________
time_distributed_5 (TimeDist (None, 47, 26)            2626      
Total params: 7,302,642
Trainable params: 7,302,642
Non-trainable params: 0
_________________________________________________________________


In [173]:
history = NER.fit(X, np.array(y), batch_size=32, epochs=20, validation_split=0.3,callbacks=callbacks, verbose=1)

Train on 1709 samples, validate on 733 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20


Epoch 5/20
Epoch 6/20
Epoch 7/20


Epoch 8/20
Epoch 9/20
Epoch 10/20


Epoch 11/20
Epoch 12/20
Epoch 13/20


Epoch 14/20
Epoch 15/20
Epoch 16/20


Epoch 17/20


In [174]:
# Testing
test_set = read_dataset("test.txt")

In [175]:
# convert data into numerical form
X  =[]

for idx,d_elem in test_set.iterrows():
    seq = []
    for w in d_elem[0]:
        try:
            num = word2idx[w]
        except:
            num = word2idx["ENDPAD"]
        finally:
            seq.append(num)
    X.append(seq)

y = [[tag2idx[t] for t in d_elem[1]] for idx,d_elem in test_set.iterrows()]


In [176]:
X = pad_sequences(maxlen=max_seq_lenth, sequences=X, padding='post', value=word2idx["ENDPAD"])
y = pad_sequences(maxlen=max_seq_lenth, sequences=y, padding='post', value=tag2idx["O"])

In [177]:
# One-hot Encoding
y = to_categorical(y, num_classes=len(tag2idx))

In [178]:
# Load Best Model
best_model = load_model("best_model_NER.h5")

In [179]:
score, acc = best_model.evaluate(X,y)



In [180]:
print("Test Set Acc:",acc)

Test Set Acc: 0.9882464681269203


In [181]:
def pred2label(pred):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            p_i = np.argmax(p)
            out_i.append(idx2tag[p_i].replace("ENDPAD", "O"))
        out.append(out_i)
    return out

In [182]:
test_pred = best_model.predict(X)

In [183]:
idx2tag = dict(map(reversed, tag2idx.items()))

In [184]:
pred_labels = pred2label(test_pred)
test_labels = pred2label(y)

In [185]:
print("F1-score: {:.1%}".format(f1_score(test_labels, pred_labels)))

F1-score: 85.5%


In [186]:
print(classification_report(test_labels, pred_labels))

                 precision    recall  f1-score   support

          TITLE       0.73      0.69      0.71       562
RATINGS_AVERAGE       0.84      0.85      0.84       451
       DIRECTOR       0.79      0.77      0.78       456
           YEAR       0.94      0.97      0.96       720
           PLOT       0.76      0.75      0.75       491
         RATING       0.94      0.95      0.94       500
          ACTOR       0.81      0.84      0.82       812
          GENRE       0.95      0.96      0.95      1117
         REVIEW       0.66      0.70      0.68        56
        TRAILER       0.81      0.87      0.84        30
      CHARACTER       0.71      0.67      0.69        89
           SONG       0.69      0.74      0.71        54

      micro avg       0.85      0.86      0.86      5338
      macro avg       0.85      0.86      0.85      5338

