# NLP PROJECT FOR UE16CS333 BY TEAM 13
## Team Members :
## Abhishek Narayanan (01FB16ECS016)
## Abhishek Prasad (01FB16ECS017)
## Abijna Rao (01FB16ECS019)

## Character level BI-Directional LSTM model

###  Read Datasets

In [1]:
import pandas as pd
import numpy as np

data = pd.read_csv("C:\\Users\Abhishek\\Desktop\\Named-Entity-Recognition\\Twitterdata\\annotatedData.csv")
data = data.fillna(method="ffill")


In [2]:
data.tail(10)

Unnamed: 0,Sent,Word,Tag
72133,sent: 3637,bn,Other
72134,sent: 3637,koi,Other
72135,sent: 3637,dokhe,Other
72136,sent: 3637,se,Other
72137,sent: 3637,idhr,Other
72138,sent: 3637,udhr,Other
72139,sent: 3637,kar,Other
72140,sent: 3637,gya,Other
72141,sent: 3637,#pnb,B-Org
72142,sent: 3637,#fraud,Other


In [3]:
words = list(set(data["Word"].values))
n_words = len(words); n_words

14866

In [4]:
tags = list(set(data["Tag"].values))
n_tags = len(tags); n_tags

7

### Form Sentences using sentence numbers in the word tag list provided

In [6]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, t) for w, t in zip(s["Word"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sent").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["sent: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [7]:
getter = SentenceGetter(data)


In [8]:
sent = getter.get_next()


In [9]:
print(sent)


[('agar', 'Other'), ('#notebandi', 'Other'), ('ke', 'Other'), ('time', 'Other'), ('political', 'B-Org'), ('party', 'I-Org'), ('bhi', 'Other'), ('#rti', 'Other'), ('ke', 'Other'), ('daayre', 'Other'), ('me', 'Other'), ('aa', 'Other'), ('jati', 'Other'), ('to', 'Other'), ('#sukmaattack', 'Other'), ('#kashmir', 'B-Loc'), ('me', 'Other'), ('patthar', 'Other'), ('attack', 'Other'), ('na', 'Other'), ('hote', 'Other'), ('@PMOIndia', 'Other'), ('@PMOIndia', 'Other')]


In [10]:
sentences = getter.sentences

In [11]:
max_len = 75
max_len_char = 10

### Map word to index

In [12]:
word2idx = {w: i + 2 for i, w in enumerate(words)}
word2idx["UNK"] = 1
word2idx["PAD"] = 0
idx2word = {i: w for w, i in word2idx.items()}
tag2idx = {t: i + 1 for i, t in enumerate(tags)}
tag2idx["PAD"] = 0
idx2tag = {i: w for w, i in tag2idx.items()}

In [13]:
print(word2idx["agar"])
print(tag2idx["B-Org"])

1925
2


In [14]:
from keras.preprocessing.sequence import pad_sequences
X_word = [[word2idx[w[0]] for w in s] for s in sentences]

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [15]:
X_word = pad_sequences(maxlen=max_len, sequences=X_word, value=word2idx["PAD"], padding='post', truncating='post')


### Pad word sequences to form uniform length vectors

In [16]:
max_len_char


10

In [17]:
chars = set([w_i for w in words for w_i in w])
n_chars = len(chars)
print(n_chars)

108


In [19]:
char2idx = {c: i + 2 for i, c in enumerate(chars)}
char2idx["UNK"] = 1
char2idx["PAD"] = 0

In [20]:
X_char = []
for sentence in sentences:
    sent_seq = []
    for i in range(max_len):
        word_seq = []
        for j in range(max_len_char):
            try:
                word_seq.append(char2idx.get(sentence[i][0][j]))
            except:
                word_seq.append(char2idx.get("PAD"))
        sent_seq.append(word_seq)
    X_char.append(np.array(sent_seq))

### Map tag labels to index

In [21]:
y = [[tag2idx[w[1]] for w in s] for s in sentences]


In [22]:
y = pad_sequences(maxlen=max_len, sequences=y, value=tag2idx["PAD"], padding='post', truncating='post')


### Split into training and testing set

In [24]:
from sklearn.model_selection import train_test_split


In [25]:
X_word_tr, X_word_te, y_tr, y_te = train_test_split(X_word, y, test_size=0.1, random_state=2018)
X_char_tr, X_char_te, _, _ = train_test_split(X_char, y, test_size=0.1, random_state=2018)

In [26]:
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Conv1D
from keras.layers import Bidirectional, concatenate, SpatialDropout1D, GlobalMaxPooling1D

### Character level Bidirectional LSTM Model

In [27]:
# input and embedding for words
word_in = Input(shape=(max_len,))
emb_word = Embedding(input_dim=n_words + 2, output_dim=20,
                     input_length=max_len, mask_zero=True)(word_in)

# input and embeddings for characters
char_in = Input(shape=(max_len, max_len_char,))
emb_char = TimeDistributed(Embedding(input_dim=n_chars + 2, output_dim=10,
                           input_length=max_len_char, mask_zero=True))(char_in)
# character LSTM to get word encodings by characters
char_enc = TimeDistributed(LSTM(units=20, return_sequences=False,
                                recurrent_dropout=0.5))(emb_char)

# main LSTM
x = concatenate([emb_word, char_enc])
x = SpatialDropout1D(0.3)(x)
main_lstm = Bidirectional(LSTM(units=50, return_sequences=True,
                               recurrent_dropout=0.6))(x)
out = TimeDistributed(Dense(n_tags + 1, activation="softmax"))(main_lstm)

model = Model([word_in, char_in], out)

In [28]:
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["acc"])
model.summary()


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 75, 10)       0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            (None, 75)           0                                            
__________________________________________________________________________________________________
time_distributed_1 (TimeDistrib (None, 75, 10, 10)   1100        input_2[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 75, 20)       297360      input_1[0][0]                    
__________________________________________________________________________________________________
time_distr

In [29]:
history = model.fit([X_word_tr,
                     np.array(X_char_tr).reshape((len(X_char_tr), max_len, max_len_char))],
                    np.array(y_tr).reshape(len(y_tr), max_len, 1),
                    batch_size=32, epochs=20, validation_split=0.1, verbose=1)

Train on 2498 samples, validate on 278 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [32]:
hist = pd.DataFrame(history.history)


### Perform Prediction

In [33]:
y_pred = model.predict([X_word_te,
                        np.array(X_char_te).reshape((len(X_char_te),
                                                     max_len, max_len_char))])

In [34]:
i = 250
p = np.argmax(y_pred[i], axis=-1)
print("{:15}||{:5}||{}".format("Word", "True", "Pred"))
print(30 * "=")
for w, t, pred in zip(X_word_te[i], y_te[i], p):
    if w != 0:
        print("{:15}: {:5} {}".format(idx2word[w], idx2tag[t], idx2tag[pred]))

Word           ||True ||Pred
pagal          : Other Other
he             : Other Other
vo             : Other Other
Israel         : B-Loc B-Loc
ne             : Other Other
kabi           : Other Other
apne           : Other Other
country        : Other Other
me             : Other Other
rahene         : Other Other
vale           : Other Other
Muslim         : B-Org B-Org
ko             : Other Other
ya             : Other Other
unki           : Other Other
masjido        : Other Other
ko             : Other Other
nai            : Other Other
giraya         : Other Other
...            : Other Other
na             : Other Other
hi             : Other Other
kisi           : Other Other
Muslim         : B-Org B-Org
ki             : Other Other
ladies         : Other Other
pe             : Other Other
Rape           : Other Other
kiya           : Other Other
..             : Other Other
vaha           : Other Other
Yahudi         : Other Other
ladkiya        : Other Other
showkh        

In [64]:
predicted=[ list(np.argmax(y_pred[i],axis=-1)) for i in range(len(y_pred))]
correct_flat = [item for sublist in y_te for item in sublist]
predicted_flat = [item for sublist in predicted for item in sublist]

remove_padded_correct=[correct_flat[i] for i in range(len(correct_flat)) if correct_flat[i] != 0]
remove_padded_predicted=[predicted_flat[i] for i in range(len(correct_flat)) if correct_flat[i] != 0 ]
from sklearn.metrics import classification_report
print(classification_report(remove_padded_correct,remove_padded_predicted, target_names=target_names))

             precision    recall  f1-score   support

      I-Loc       0.79      0.67      0.73       259
      B-Org       0.74      0.72      0.73       158
      I-Per       0.80      0.65      0.72        69
      Other       0.00      0.00      0.00         1
      B-Per       0.00      0.00      0.00        12
      I-Org       0.53      0.51      0.52        65
      B-Loc       0.98      0.99      0.98      6629

avg / total       0.96      0.96      0.96      7193



  'precision', 'predicted', average, warn_for)
