In [1]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
import json
from nltk.tokenize import  word_tokenize
import tensorflow as tf
import ast

  from ._conv import register_converters as _register_converters


In [2]:
# Import word embeddings and reformat
import gensim
from gensim.models.keyedvectors import KeyedVectors
word_vectors = gensim.models.Word2Vec.load('nearby_embeddings2')
wordVectors = []
for word in word_vectors.wv.index2word:
    wordVectors.append(word_vectors.wv[word])
wordVectors = np.array(wordVectors)
wordsList = word_vectors.wv.index2word

max_sequence_length = 20

In [3]:
raw_data = pd.read_csv('/home/pratik/NER/data/large_positive.csv')
len(raw_data)

21682

In [4]:
raw_data.drop_duplicates(subset=['body'], inplace=True)
len(raw_data)

16623

In [5]:
def convert_to_list(original_text_list):
    l = ast.literal_eval(original_text_list)
    l = [i.strip() for i in l]
    return l
raw_data['original_text'] = raw_data['original_text'].apply(lambda x: convert_to_list(x))

In [6]:
def generate_y(text, original_text_list, k):
    y = np.zeros([max_sequence_length, 2])
    try:
        for i in range(len(y)):
            y[i][0] = 1
        for original_text_words in original_text_list:
            for original_text in word_tokenize(original_text_words):
                words = word_tokenize(text)
                for i in range(len(words)):
                    if words[i].lower() == original_text.lower():
                      #  print(words[i], original_text)
                        y[i][1] = 1
                        y[i][0] = 0
    except:
        pass
    return y


In [7]:
y_list = np.zeros([len(raw_data), max_sequence_length, 2])
for body, original_text_list, i in zip(raw_data['body'], raw_data['original_text'], range(len(raw_data))):
    
    y_list[i] = generate_y(body, original_text_list, i)

In [8]:
len(raw_data), len(y_list)

(16623, 16623)

In [9]:
word_tokenize(raw_data['body'].iloc[11]), raw_data['original_text'].iloc[11], y_list[11]

(['No', 'I', 'ask', 'Paytm', 'debit', 'card', 'queries'],
 ['paytm debit'],
 array([[1., 0.],
        [1., 0.],
        [1., 0.],
        [0., 1.],
        [0., 1.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.]]))

In [10]:
ids = np.load('nearby_ids2.npy')

In [11]:
max_sequence_length = 20
batch_size = 100
raw_data_len = len(raw_data)

In [12]:
import tensorflow as tf
tf.reset_default_graph()


In [13]:
lstmUnits = 64
numClasses = 2
epochs = 10
numDimensions = 100
batchSize = 100
train_len = 14000

X_train = ids[:train_len]
y_train = y_list[:train_len]
X_test = ids[train_len:]
y_test = y_list[train_len:]




#from sklearn.model_selection import train_test_split
#X_train, X_test, y_train, y_test, ids_char_train, ids_char_test = train_test_split(ids,y_list,ids_char, test_size=.2)

len(X_train), len(y_test)

(14000, 2623)

In [14]:
def getTrainBatch(j):
    i = j*batchSize
    arr = X_train[i:i+batchSize]
    labels = y_train[i:i+batchSize]
    #print(labels[0])
    return arr, labels

In [15]:
def getTestBatch(j):
    i = j*batchSize
    arr = X_test[i:i+batchSize]
    labels =   y_test[i:i+batchSize]
  
    return arr, labels

In [16]:
labels = tf.placeholder(tf.float32, [batchSize, max_sequence_length, numClasses])
input_data = tf.placeholder(tf.int32, [batchSize, max_sequence_length])

In [17]:
wordVectors = tf.convert_to_tensor(wordVectors, dtype=tf.float32)

In [18]:
data = tf.placeholder(shape = (batchSize,max_sequence_length,numDimensions), dtype=tf.float32)
data = tf.nn.embedding_lookup(wordVectors,input_data)

In [19]:
fw_cell3 = tf.nn.rnn_cell.MultiRNNCell([tf.nn.rnn_cell.BasicLSTMCell(lstmUnits, state_is_tuple=True) for _ in range(2)])
bw_cell3 = tf.nn.rnn_cell.MultiRNNCell([ tf.nn.rnn_cell.BasicLSTMCell(lstmUnits, state_is_tuple=True) for _ in range(2)])

In [20]:
outputs,value2 = tf.nn.bidirectional_dynamic_rnn(fw_cell3, bw_cell3,data,dtype=tf.float32)

In [21]:
#print(outputs)

outputs = tf.concat(outputs, 2)
#print(outputs.shape)

In [22]:
weight = tf.Variable(tf.truncated_normal([lstmUnits*2,numClasses]))
bias = tf.Variable(tf.constant(0.1, shape=[batchSize,max_sequence_length,numClasses]))

In [23]:
prediction = (tf.tensordot(outputs, weight, axes=((2,),(0,))) + bias)
print(prediction.shape)
p2 = tf.argmax(labels, axis=2)
print('p2', p2.shape)
print(labels)

prediction = tf.cast(prediction, dtype=tf.float32)

p2 = tf.cast(p2, dtype=tf.int32)

sequence_lengths = tf.constant(shape=[batch_size], value=np.zeros(batch_size)+max_sequence_length, dtype=tf.int32)


log_likelihood, transition_params = tf.contrib.crf.crf_log_likelihood(
inputs=prediction, 
sequence_lengths=sequence_lengths,
tag_indices=p2)
crf_loss = tf.reduce_mean(-log_likelihood)

prediction_arg = tf.argmax(prediction,axis=2)
label_arg = tf.argmax(labels,axis=2)

#false_positives
false_positives = tf.metrics.false_positives(predictions=prediction_arg, labels=label_arg)
false_negatives = tf.metrics.false_negatives(predictions=prediction_arg, labels=label_arg)
true_positives = tf.metrics.true_positives(predictions=prediction_arg, labels=label_arg)
true_negatives = tf.metrics.true_negatives(predictions=prediction_arg, labels=label_arg)
prec = tf.metrics.precision(predictions=prediction_arg, labels=label_arg)
rec = tf.metrics.recall(predictions=prediction_arg, labels=label_arg)
#false_positives = 1
init_g = tf.global_variables_initializer()
init_l = tf.local_variables_initializer()
#with tf.Session() as sess:
    #tf.initialize_all_variables().run()
#    sess.run(init_g)
#    sess.run(init_l)
#    nextBatch, nextBatchLabels, nextChars = getTrainBatch(2)
        #print(j/2)
#    pred, pa, la, fp, fn, tp, tn = sess.run([prediction, prediction_arg, label_arg, false_positives, false_negatives, true_positives, true_negatives], {input_data: nextBatch, labels: nextBatchLabels, char_input_data: nextChars})
#    print(pa.shape, la.shape, fp, fn, tp, tn)
    #a = pred[0]
    #b = nextBatchLabels
   # print(a.shape,b.shape)
    #print(a.shape)
    #print(np.argmax(a, axis=2)[0])
    #print(np.argmax(b, axis=2)[0])
    #print(np.logical_and(np.argmax(a, axis=2)[0], np.argmax(b, axis=2)[0] ))
    #print((np.equal(np.argmax(a, axis=2), np.argmax(b, axis=2)))[0])
    #print(np.mean(np.equal(np.argmax(a, axis=2), np.argmax(b, axis=2)), axis=1))
    #print(np.mean(np.equal(np.argmax(a, axis=2), np.argmax(b, axis=2)), axis=0))
    #print(np.mean(np.equal(np.argmax(a, axis=2), np.argmax(b, axis=2))))

correctPred = tf.equal(tf.argmax(prediction,axis=2), tf.argmax(labels,axis=2))
accuracy = tf.reduce_mean(tf.cast(correctPred, tf.float32))


(100, 20, 2)
p2 (100, 20)
Tensor("Placeholder:0", shape=(100, 20, 2), dtype=float32)


In [24]:
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=labels))
optimizer = tf.train.AdamOptimizer().minimize(crf_loss)


Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See tf.nn.softmax_cross_entropy_with_logits_v2.



In [25]:
#with tf.Session() as sess:
#    sess.run(tf.global_variables_initializer())#

#    o,w,b,p = sess.run([outputs,weight,bias,prediction],feed_dict={input_data: X_train[:100], labels: y_train[:100]})
#    print(np.array(o).shape,np.array(w).shape,np.array(b).shape,np.array(p).shape)

In [28]:
import datetime

tf.summary.scalar('Loss', loss)
tf.summary.scalar('Accuracy', accuracy)
merged = tf.summary.merge_all()
logdir = "tensorboard/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + "/"
writer = tf.summary.FileWriter(logdir, sess.graph)

In [29]:
sess = tf.InteractiveSession()
saver = tf.train.Saver()
init_g = tf.global_variables_initializer()
init_l = tf.local_variables_initializer()
sess.run(init_g)
sess.run(init_l)

for i in range(epochs):
    print("epoch:  "+str(i))
    #Next Batch of reviews
    losses = 0
    for j in range(len(X_train)//batchSize):
        nextBatch, nextBatchLabels = getTrainBatch(j);
        #print(j/2)
        ll, _,pred,lab = sess.run([loss, optimizer,prediction,labels], {input_data: nextBatch, labels: nextBatchLabels})
        print(ll)
        losses += ll

    #Write summary to Tensorboard
        if (j % 5 == 0):
            summary = sess.run(merged, {input_data: nextBatch, labels: nextBatchLabels})
            writer.add_summary(summary, j)

        #Save the network every 10,000 training iterations
        if (j % 50 == 0 and j != 0):
            save_path = saver.save(sess, "models/pretrained_lstm.ckpt", global_step=i)
            print("saved to %s" % save_path)
    print(losses)
writer.close()

epoch:  0
0.4027063
0.3309901
0.38827997
0.41378832
0.3899834
0.37259337
0.30452192
0.25671157
0.20302314
0.16286886
0.16871305
0.16165435
0.16781393
0.1733327
0.18626887
0.21639888
0.23308145
0.21231525
0.23622772
0.20750391
0.22564282
0.20534225
0.24339208
0.19860339
0.18180557
0.21516585
0.18494153
0.15920922
0.15644604
0.14831796
0.14220293
0.14348264
0.17704642
0.16923255
0.18772186
0.18141694
0.17896952
0.19567922
0.16640878
0.1884636
0.20976473
0.15641302
0.17839931
0.17226258
0.16770257
0.165515
0.15009901
0.1459011
0.13996325
0.13449731
0.18185288
saved to models/pretrained_lstm.ckpt-0
0.18411303
0.21966772
0.18482776
0.2224801
0.19679184
0.22691733
0.17889321
0.19005328
0.16761215
0.19644405
0.1740489
0.1729865
0.16340843
0.14151023
0.15550362
0.14954147
0.16777688
0.17466296
0.17338832
0.16029991
0.16679135
0.16154091
0.14572772
0.13140683
0.13153909
0.13649957
0.13767393
0.13289532
0.11732234
0.1265944
0.12750772
0.12192594
0.1193729
0.112074
0.11443042
0.12623559
0.1780197

0.067369245
0.062959835
0.07811046
0.1077865
0.08818449
0.11099157
0.11922159
0.079096556
0.07261071
0.06774779
0.057838425
0.06464248
0.06602424
10.652825873345137
epoch:  5
0.0854098
0.08388153
0.060076162
0.058314182
0.06780941
0.06426245
0.046940207
0.068201475
0.07964058
0.058488727
0.061587997
0.076229244
0.06307595
0.079083145
0.07076149
0.091447964
0.106373005
0.07574722
0.069147415
0.069663994
0.061640475
0.060988106
0.06908252
0.06370289
0.056979608
0.06167524
0.073553875
0.048863143
0.05336194
0.05743119
0.054246698
0.05627713
0.09204719
0.09872565
0.08865683
0.060177643
0.062195294
0.07775209
0.051705025
0.06158237
0.06681813
0.05579348
0.06549164
0.06760609
0.06628568
0.07187007
0.06142081
0.06805907
0.058642045
0.0585137
0.08218758
saved to models/pretrained_lstm.ckpt-5
0.07563986
0.09907689
0.07187737
0.08487186
0.07118947
0.089323215
0.07403148
0.08563897
0.065151565
0.08173203
0.07492241
0.09161148
0.08819669
0.068335295
0.07815093
0.0948141
0.07690681
0.090427496
0.08

0.068432555
0.06619001
0.053597894
0.06260333
0.07101263
0.07472243
0.062079925
0.05023168
0.06442846
0.044090133
0.051365927
0.054672588
0.068066165
0.078293234
0.083468094
0.08194931
0.072679624
0.08049036
0.049375802
0.06656548
0.05534237
0.059101716
0.05709188
0.07066268
0.09276583
0.077329725
0.08698394
0.0972008
0.07071643
0.06794863
0.0655919
0.051921774
0.05762071
0.061993685
9.318816099315882


In [32]:
test_batches = 25
fin_pred = np.zeros([test_batches*100, 20, 1])
jj = 0
for j in range(test_batches):
    jj = j * 100
    nextBatch, nextBatchLabels = getTestBatch(j);
    p, t = sess.run([prediction,transition_params], {input_data: nextBatch, labels: nextBatchLabels})
    
    for k in range((batchSize)):
        viterbi_sequence, viterbi_score = tf.contrib.crf.viterbi_decode(p[k], t)
        print(np.array(viterbi_sequence).shape)
        viterbi_sequence = np.array(viterbi_sequence).reshape(20,1)
        fin_pred[jj:jj+100][k] = viterbi_sequence

(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,

(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,)
(20,

In [33]:
prediction_analysis = {'original_text': [], 'pred_original_text': [], 'body': []}
#for i in range(len(y_test)):
for i in range((test_batches*100)):
    body = word_tokenize(str(raw_data[train_len:].iloc[i]['body']))
    lab_index_test = np.where(np.argmax(y_test, axis=2)[i]==1)[0]
    lab_index_pred = np.where(fin_pred[i]==1)[0]
    original_text = []
    pred_original_text = []
    for j in lab_index_test:
        original_text.append(body[j])
    for j in lab_index_pred:
        pred_original_text.append(body[j])

    prediction_analysis['original_text'].append(original_text)
    prediction_analysis['pred_original_text'].append(pred_original_text)
    prediction_analysis['body'].append(body)

In [34]:
prediction_analysis_pandas = pd.DataFrame.from_dict(prediction_analysis)

In [35]:
tru_pos = 0
fals_pos = 0 
fals_neg = 0
true_pos = []
false_pos = []
false_neg = []
def check_tp(original_text, pred_original_text):
    global tru_pos, fals_pos, fals_neg
    #print((set(original_text)))
    #print((set(pred_original_text)))
    og = set(original_text)
    pr = set(pred_original_text)
    res = og.intersection(pr)
    true_pos.append(res)
    
    false_pos.append(pr - res)
    false_neg.append(og - res)
    tru_pos += len((set(original_text).intersection(pred_original_text)))
    fals_pos += len(pr -res)
    fals_neg += len(og - res)
    
for i in range(2500):
    check_tp(prediction_analysis_pandas.iloc[i]['original_text'], prediction_analysis_pandas.iloc[i]['pred_original_text'])

In [36]:
pp = tru_pos/ (tru_pos + fals_pos)
rr = tru_pos/ (tru_pos + fals_neg)
tru_pos, fals_pos, fals_neg, pp, rr, (2*pp*rr)/(pp+rr)

(3764, 667, 672, 0.8494696456781765, 0.848512173128945, 0.8489906394496447)

In [37]:
prediction_analysis_pandas['false_positives'] = false_pos
prediction_analysis_pandas['true_positives'] = true_pos
prediction_analysis_pandas['false_negatives'] = false_neg

In [38]:
prediction_analysis_pandas.to_csv('')

Unnamed: 0,body,original_text,pred_original_text,false_positives,true_positives,false_negatives
0,"[i, call, on, just, dial, .., its, better, the...",[samsung],[samsung],{},{samsung},{}
1,"[i, want, ambala, bus, stand, context, no, .]","[bus, stand]",[],{},{},"{bus, stand}"
2,"[Beauty, parlour, for, women]","[Beauty, parlour]",[parlour],{},{parlour},{Beauty}
3,"[Its, not, a, night, club, dear]","[night, club]","[night, club, dear]",{dear},"{club, night}",{}
4,"[More, Nearby, :, workshop]",[workshop],[workshop],{},{workshop},{}
5,"[Not, send, me, agai]",[agai],[agai],{},{agai},{}
6,"[Car, workshop]","[Car, workshop]","[Car, workshop]",{},"{Car, workshop}",{}
7,[Daraganj],[Daraganj],[Daraganj],{},{Daraganj},{}
8,"[Look, for, him]","[for, him]","[Look, him]",{Look},{him},{for}
9,"[Nearby, sbi, branch]","[sbi, branch]","[sbi, branch]",{},"{sbi, branch}",{}
