**BI-LSTM FOR NER**

Importing basic libraries

In [None]:
import numpy as np
import pandas as pd

Importing data

In [None]:
# Load the training corpus (Latin-1 encoded). Rows that cannot be parsed are skipped
# instead of aborting the load. `on_bad_lines="skip"` is the supported spelling of the
# old `error_bad_lines=False`, which was deprecated in pandas 1.3 and removed in 2.0.
dframe = pd.read_csv("ner_data.csv", encoding="ISO-8859-1", on_bad_lines="skip")

In [None]:
# Load the second CSV (the hold-out data) with pandas' default parsing options.
test = pd.read_csv("test_data.csv")

In [None]:
test.head(20)

In [None]:
dframe.head()

DATA EXPLORATION AND DATA PREPROCESSING

In [None]:
dframe.columns

In [None]:
# Drop the saved index column and the per-token feature columns listed below;
# `dframe` itself is left untouched and the reduced frame is bound to `dataset`.
dataset = dframe.drop(
    columns=[
        # current token
        "Unnamed: 0", "lemma", "pos", "shape",
        # previous token
        "prev-iob", "prev-lemma", "prev-pos", "prev-shape", "prev-word",
        # token before the previous one
        "prev-prev-iob", "prev-prev-lemma", "prev-prev-pos",
        "prev-prev-shape", "prev-prev-word",
        # next token
        "next-lemma", "next-pos", "next-shape", "next-word",
        # token after the next one
        "next-next-lemma", "next-next-pos", "next-next-shape", "next-next-word",
    ]
)

In [None]:
dataset.info()

In [None]:
dataset.head(30)

**Create a list of lists of (word, tag) tuples, one inner list per sentence**

In [None]:
class SentenceGetter(object):
    """Group a one-token-per-row frame into sentences of ``(word, tag)`` tuples.

    The frame passed in must provide the columns ``sentence_idx``, ``word`` and
    ``tag``. All sentences are available at once through ``sentences``; they can
    also be pulled one at a time with :meth:`get_next`.
    """

    def __init__(self, dataset):
        """Group ``dataset`` by ``sentence_idx`` and cache the result.

        Args:
            dataset: pandas DataFrame with ``sentence_idx``, ``word`` and ``tag``
                columns.
        """
        # 1-based number of the sentence the next get_next() call will return.
        self.n_sent = 1
        self.dataset = dataset
        self.empty = False

        def agg_func(s):
            # Pair every word of one sentence group with its tag, in row order.
            return list(zip(s["word"].values.tolist(), s["tag"].values.tolist()))

        # Series indexed by sentence_idx; each value is one sentence's tuple list.
        self.grouped = self.dataset.groupby("sentence_idx").apply(agg_func)
        self.sentences = list(self.grouped)

    def get_next(self):
        """Return the next sentence, or ``None`` when there is no such sentence.

        The sentence is looked up first under the label ``"Sentence: N"`` and then
        under the plain number ``N``, so both string-labelled and numeric
        ``sentence_idx`` columns work. The internal counter only advances when a
        sentence was found.

        Returns:
            list of ``(word, tag)`` tuples, or ``None``.
        """
        candidates = ("Sentence: {}".format(self.n_sent), self.n_sent)
        for key in candidates:
            # Explicit membership test instead of a bare ``except`` so that
            # unrelated errors are no longer swallowed.
            if key in self.grouped.index:
                self.n_sent += 1
                return self.grouped.loc[key]
        return None

In [None]:
# Group both frames into per-sentence (word, tag) lists:
# `getter` wraps the training frame, `getter1` the frame read from test_data.csv.
getter = SentenceGetter(dataset)
getter1 = SentenceGetter(test) 

In [None]:
# Sentence lists built in SentenceGetter.__init__: one entry per sentence_idx group.
sentences = getter.sentences
test_sentence = getter1.sentences

In [None]:
# Spot-check one grouped sentence from each split.
print(sentences[5])
print(test_sentence[1])

In [None]:
# Token count of the longest training sentence.
maxlen = max(map(len, sentences))
print('Maximum sequence length:', maxlen)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use("ggplot")

In [None]:
# Histogram of training sentence lengths (tokens per sentence), 50 bins.
plt.hist(list(map(len, sentences)), bins=50)
plt.show()

In [None]:
dataset

In [None]:
test

In [None]:
# Vocabularies: the distinct values of each split's `word` column, followed by a
# trailing "ENDPAD" entry (which therefore sits at the last position of the list).
# NOTE(review): iteration order of a set of strings is not stable across interpreter
# runs, so the position of each word can differ between kernel restarts.
words = [*set(dataset["word"].values), "ENDPAD"]
test_words = [*set(test["word"].values), "ENDPAD"]

In [None]:
# Size of the training vocabulary list; echoed as the cell output.
n_words = len(words)
n_words


In [None]:
# Size of the hold-out vocabulary list; echoed as the cell output.
n_test_words = len(test_words)
n_test_words

In [None]:
# Distinct labels found in the `tag` column of each split.
# NOTE(review): the two lists are built independently, so the same tag can sit at a
# different position in `tags` and in `test_tags`.
tags = list(set(dataset["tag"].values))
test_tags = list(set(test["tag"].values))


In [None]:
# Number of distinct training tags; echoed as the cell output.
n_tags = len(tags)
n_tags

In [None]:
# Number of distinct hold-out tags; echoed as the cell output.
n_test_tags = len(test_tags)
n_test_tags

Mapping words and tags to integer indices

In [None]:
# Lookup tables: each word / tag maps to its position in the corresponding list.
word2idx = dict(zip(words, range(len(words))))
tag2idx = dict(zip(tags, range(len(tags))))
# Separate tables for the hold-out split, numbered from that split's own lists.
word2idx_test = dict(zip(test_words, range(len(test_words))))
tag2idx_test = dict(zip(test_tags, range(len(test_tags))))

In [None]:
word2idx['Obama']

In [None]:
tag2idx["O"]

In [None]:
from keras.preprocessing.sequence import pad_sequences
# Replace every (word, tag) pair by the word's integer id, sentence by sentence.
X = [[word2idx[pair[0]] for pair in sentence] for sentence in sentences]

In [None]:
from keras.preprocessing.sequence import pad_sequences
# Encode the hold-out sentences with the *training* vocabulary (`word2idx`): the
# Embedding layer is sized and trained on those ids, so ids taken from a separate
# test vocabulary would point at unrelated rows. The vocabulary has no dedicated
# unknown-word entry, so words never seen in training fall back to the "ENDPAD" id.
oov_idx = word2idx["ENDPAD"]
X_test = [[word2idx.get(w[0], oov_idx) for w in s] for s in test_sentence]

In [None]:
# Pad (at the end) every id sequence to 140 entries using n_words - 1 as filler,
# i.e. the position at which "ENDPAD" was appended to `words`.
X = pad_sequences(maxlen=140, sequences=X, padding="post",value=n_words - 1)

In [None]:
# Pad (at the end) the hold-out id sequences to 140 entries. The filler n_words - 1 is
# the "ENDPAD" position of the *training* vocabulary.
# NOTE(review): this is only a valid pad id if X_test was encoded with the training
# vocabulary (word2idx) -- confirm against the cell that builds X_test.
X_test_t = pad_sequences(maxlen=140, sequences=X_test, padding="post",value=n_words - 1)

In [None]:
# Replace every (word, tag) pair by the tag's integer id, sentence by sentence.
y = [[tag2idx[pair[1]] for pair in sentence] for sentence in sentences]

In [None]:
y_test = [[tag2idx_test[w[1]] for w in s] for s in test_sentence]

In [None]:
y = pad_sequences(maxlen=140, sequences=y, padding="post", value=tag2idx["O"])

In [None]:
y_test_t = pad_sequences(maxlen=140, sequences=y_test, padding="post", value=tag2idx_test["O"])

In [None]:
from keras.utils import to_categorical
y = [to_categorical(i, num_classes=n_tags) for i in y]

In [None]:
from keras.utils import to_categorical
y_test_t = [to_categorical(i, num_classes=n_test_tags) for i in y_test_t]

In [None]:
X_test_t


In [None]:
# X_test is a plain list of lists here, which the notebook would print in full;
# show only the first few encoded sentences.
X_test[:3]

In [None]:
# y_test_t is a list with one one-hot matrix per sentence; show a single example
# instead of printing the whole list.
y_test_t[:1]

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the padded training sentences. A fixed random_state makes the split
# (and therefore every downstream metric) reproducible across kernel restarts.
# NOTE: this rebinds X_test / y_test, which until now held the encoded sentences from
# test_data.csv; the padded hold-out arrays remain available as X_test_t / y_test_t.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# `Input` is imported from keras.layers, where it is a public export; keras.models
# only re-exposed it incidentally in older releases.
from keras.models import Model
from keras.layers import Input, LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional

In [None]:
# Network graph: token ids -> embeddings -> dropout -> bidirectional LSTM -> per-token softmax.
input = Input(shape=(140,))  # one sequence of 140 token ids per sample
embedded = Embedding(input_dim=n_words, output_dim=140, input_length=140)(input)
regularised = Dropout(0.1)(embedded)
recurrent = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))
# `model` is still an intermediate tensor at this point, not a Model object.
model = recurrent(regularised)
out = TimeDistributed(Dense(n_tags, activation="softmax"))(model)  # softmax output layer

In [None]:
# Wrap the graph from the input tensor to the softmax output into a trainable Model.
model = Model(inputs=input, outputs=out)

In [None]:
# Adam optimizer with categorical cross-entropy on the one-hot tag targets.
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

In [None]:
X_test

In [None]:
# Train for 4 epochs in batches of 32; 20% of the training split is set aside by Keras
# for validation. y_train is a list of per-sentence matrices, hence the np.array(...).
history = model.fit(
    X_train,
    np.array(y_train),
    batch_size=32,
    epochs=4,
    validation_split=0.2,
    verbose=1,
)

In [None]:
# Predict tag distributions for the held-out *inputs*. The previous call passed y_test
# (the one-hot labels) to the network, which expects sequences of 140 token ids.
model.predict(X_test)

In [None]:
model.summary()