<a href="https://colab.research.google.com/github/TraktylCo/Using-Recurrent-NN-for-Part-of-Speech-tagging-and-Subject-Predicate-Classification/blob/master/ESCIM19_DB3b2X_Test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Mount Google Drive at /content/gdrive; the models and datasets loaded later in
# this notebook are read from paths under '/content/gdrive/My Drive/datasets/'.

from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
import time

# (unit name, length of the unit in seconds), ordered from largest to smallest
intervals = (
    ('weeks', 604800),  # 60 * 60 * 24 * 7
    ('days', 86400),    # 60 * 60 * 24
    ('hours', 3600),    # 60 * 60
    ('minutes', 60),
    ('seconds', 1),
    )

def display_time(seconds, granularity=2):
    """Format a duration as human-readable text, e.g. ``'1 hour, 1 minute'``.

    Args:
        seconds: duration in seconds. Fractional values (such as a difference
            of two ``time.time()`` readings) are truncated to whole seconds so
            the output never shows values like ``'2.0 minutes'``.
        granularity: maximum number of non-zero units to include, starting
            from the largest one.

    Returns:
        The comma-separated units, or an empty string when the duration is
        shorter than one second.
    """
    remaining = int(seconds)
    result = []

    for name, count in intervals:
        value, remaining = divmod(remaining, count)
        if value:
            if value == 1:
                # singular form: 'hours' -> 'hour'
                name = name.rstrip('s')
            result.append("{} {}".format(value, name))
    return ', '.join(result[:granularity])

In [0]:
# ************************************************************
# *******************  TEST MODEL  ***************************
# ************************************************************
%tensorflow_version 1.x

import numpy as np
import nltk
import tensorflow as tf
from tensorflow.python.keras.models import load_model
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import *
from tensorflow.python.keras.optimizers import RMSprop

# Load models
def _load_compiled(h5_path):
    """Load a saved Keras model from `h5_path` and compile it with this notebook's settings."""
    net = load_model(h5_path)
    net.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=RMSprop(lr=0.0001),
        metrics=['sparse_categorical_accuracy'],
    )
    return net


def _load_index(npy_path):
    """Read the object stored in a .npy file and unwrap it with `.item()`."""
    # NOTE: allow_pickle=True unpickles arbitrary objects - only load trusted files.
    return np.load(npy_path, allow_pickle=True).item()


model1 = _load_compiled('/content/gdrive/My Drive/datasets/PM_ESCIM19_DB3b_N1.h5')
model2 = _load_compiled('/content/gdrive/My Drive/datasets/PM_ESCIM19_DB3b_N2.h5')

# Load dictionaries
word_index = _load_index('/content/gdrive/My Drive/datasets/word_index_db3b.npy')
pos_index = _load_index('/content/gdrive/My Drive/datasets/pos_index_db3b.npy')
sp_index = _load_index('/content/gdrive/My Drive/datasets/sp_index_db3b.npy')

In [0]:
# Load test data sets (sentences, POS tags and SP labels) as plain Python lists
# NOTE: allow_pickle=True unpickles arbitrary objects - only load trusted files.
test_sentences, test_pos, test_sp = (
    np.load(npy_path, allow_pickle=True).tolist()
    for npy_path in (
        '/content/gdrive/My Drive/datasets/test_sentences_db3b.npy',
        '/content/gdrive/My Drive/datasets/test_pos_db3b.npy',
        '/content/gdrive/My Drive/datasets/test_sp_db3b.npy',
    )
)

In [0]:
# Build test data

# Every token / tag is lower-cased and replaced by its id from the matching dictionary.
test_enc_sent = [
    [word_index[token.lower()] for token in sentence_tokens]
    for sentence_tokens in test_sentences
]

test_enc_pos = [
    [pos_index[pos_tag.lower()] for pos_tag in sentence_tags]
    for sentence_tags in test_pos
]

In [0]:
# Convert data to numpy array
def _as_model_inputs(encoded_sequences):
    """Pack variable-length id sequences into a 1-D object array.

    Every element of the result is an array of shape (1, sequence length, 1).
    The object array is allocated explicitly because calling np.array() on lists
    of different lengths is rejected by recent NumPy releases, and silently
    produces a 2-D array when all the lengths happen to match.
    """
    packed = np.empty(len(encoded_sequences), dtype=object)
    for idx, sequence in enumerate(encoded_sequences):
        # reshape(1, -1, 1) also accepts entries that already have this shape,
        # so re-running the cell does not fail.
        packed[idx] = np.asarray(sequence).reshape(1, -1, 1)
    return packed


test_enc_sent = _as_model_inputs(test_enc_sent)
test_enc_pos = _as_model_inputs(test_enc_pos)

# Show data shape (it only shows first dimension, since the others are different for each sequence)
print(test_enc_sent.shape)
print(test_enc_pos.shape)

(2633,)
(2633,)


In [0]:
# RESULTS NETWORK 1

# Per-sentence accuracy of the network; the evaluation loop below appends one value per test sentence
network_accs = []

# Function to decode network prediction
def logits_to_tokens(sequences, index):
  """Decode per-step class scores into labels.

  Args:
    sequences: iterable of sequences; every step of a sequence is a vector of
      class scores.
    index: mapping from class id (the argmax position) to its label.

  Returns:
    A list with exactly one list of labels per input sequence, in input order.
  """
  token_sequences = []
  for categorical_sequence in sequences:
    # Append once per sequence. Appending inside the per-step loop would add the
    # same list once for every step and would skip empty sequences altogether.
    token_sequences.append(
        [index[np.argmax(categorical)] for categorical in categorical_sequence]
    )

  return token_sequences

print("Size of test set: ", len(test_sentences))

# Inverse mapping (class id -> POS tag), built once instead of once per sentence
id_to_pos = {class_id: pos_tag for pos_tag, class_id in pos_index.items()}

# Untimed prediction whose result is discarded. It runs before the clock starts,
# as in the timing-only cell, so it does not count towards the elapsed time.
model1.predict(test_enc_sent[0], 1)

start = time.time()

for i in range(len(test_sentences)):

  # Network prediction requires encoded data
  prediction = model1.predict(test_enc_sent[i], 1)

  # Decode the network prediction
  network_results = logits_to_tokens(prediction, id_to_pos)[0]

  # Count the tags predicted correctly; the reference tags are lower-cased
  # because pos_index is looked up with lower-cased tags when encoding.
  count_network = sum(
      1 for j, gold_tag in enumerate(test_pos[i])
      if gold_tag.lower() == network_results[j]
  )

  # Accuracy of this sentence
  acc_log_network = count_network / len(test_sentences[i])
  network_accs.append(acc_log_network)

end = time.time()

elapsed_time = end - start

# Calculate final accuracy (average of the per-sentence accuracies)
network_acc = sum(network_accs) / len(test_sentences)

print("Network Accuracy: ", network_acc*100, "%")
print("Elapsed time: ", elapsed_time)

Size of test set:  2633
Network Accuracy:  90.38760796242971 %
Elapsed time:  8.940792798995972


In [0]:
# TEST TIME NETWORK

n_test_sentences = len(test_sentences)
print("Size of test set: ", n_test_sentences)

# One prediction is issued before the clock starts; its result is discarded
model1.predict(test_enc_sent[0], 1)

start = time.time()

# Time only the predictions: one encoded sentence per call
for sample_idx in range(n_test_sentences):
  prediction = model1.predict(test_enc_sent[sample_idx], 1)

end = time.time()
elapsed_time = end - start

print("Elapsed time: ", elapsed_time)

Size of test set:  2633
Elapsed time:  8.86074709892273


In [0]:
# RESULTS NLTK POS TAG
import nltk

nltk.download('averaged_perceptron_tagger')

# Per-sentence accuracy of the NLTK tagger
nltk_accs = []

start = time.time()

for sent_idx, tokens in enumerate(test_sentences):
  # NLTK tags the raw tokens (no encoding); keep only the tag of each (token, tag) pair
  nltk_results = [predicted_tag for _, predicted_tag in nltk.pos_tag(tokens)]

  # Count the positions where the NLTK tag equals the reference tag
  count_nltk = sum(
      1 for pos, gold_tag in enumerate(test_pos[sent_idx])
      if gold_tag == nltk_results[pos]
  )

  # Accuracy of this sentence, appended to the accuracy log
  nltk_accs.append(count_nltk / len(tokens))

end = time.time()
elapsed_time = end - start

# Final accuracy: average of the per-sentence accuracies
nltk_acc = sum(nltk_accs) / len(test_sentences)

print("NLTK POS tagger: ", nltk_acc*100, "%")
print("Elapsed time: ", elapsed_time)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
NLTK POS tagger:  90.15603326518324 %
Elapsed time:  1.0786750316619873


In [0]:
# Gap between the two average accuracies, in percentage points (network minus NLTK)
print("Difference: ", (network_acc-nltk_acc)*100, "%")

Difference:  0.23157469724648383 %


In [0]:
!pip install git+git://github.com/emilmont/pyStatParser

Collecting git+git://github.com/emilmont/pyStatParser
  Cloning git://github.com/emilmont/pyStatParser to /tmp/pip-req-build-wci8jlde
  Running command git clone -q git://github.com/emilmont/pyStatParser /tmp/pip-req-build-wci8jlde
Building wheels for collected packages: pyStatParser
  Building wheel for pyStatParser (setup.py) ... [?25l[?25hdone
  Created wheel for pyStatParser: filename=pyStatParser-0.0.1-cp36-none-any.whl size=720546 sha256=eb4e1fb712b396556904f3a3051f7eb0da7e39b8139aaf03638eed8f71142e3d
  Stored in directory: /tmp/pip-ephem-wheel-cache-vh35c9sk/wheels/3f/c7/73/8cdd13678ef0f1d3bcc15d8b5c992662d77c32dab69fea8504
Successfully built pyStatParser
Installing collected packages: pyStatParser
Successfully installed pyStatParser-0.0.1


In [0]:
from stat_parser import Parser
from nltk import Tree

exceptions = []  # indices of the sentences the parser failed on
print(len(test_sentences))
parser = Parser()

sentences = []  # one entry per test sentence: its parse tree, or None on failure

for i, sentence in enumerate(test_sentences):
  print(i)

  try:
    tree = parser.parse(" ".join(sentence))
  except Exception:
    # Catching Exception (instead of a bare except) keeps the loop interruptible
    # from the keyboard / kernel.
    print("Exception!")
    exceptions.append(i)
    # Explicit placeholder: keeps `sentences` aligned with `test_sentences`
    # without re-appending the previous sentence's tree (and without a
    # NameError when the very first sentence fails).
    tree = None

  sentences.append(tree)

2633
Building the Grammar Model
Time: (2.70)s

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
Exception!
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
Exception!
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
Exception!
248
249
250
251
252
253
254
255
Except

In [0]:
import numpy as np

# Store the parser output and the indices of the failed sentences in the working directory
np.save('pystat_results.npy', sentences)
np.save('exceptions.npy', exceptions)

# NOTE(review): the files read below ('*_db3b.npy') are not the ones written above -
# presumably results saved earlier and placed in the working directory; confirm.
# NOTE: allow_pickle=True unpickles arbitrary objects - only load trusted files.
sentences = np.load('pystat_results_db3b.npy', allow_pickle=True).tolist()
exceptions = np.load('exceptions_db3b.npy', allow_pickle=True).tolist()

In [0]:
# Read the stored parser results and failed-sentence indices back as Python lists.
# NOTE: allow_pickle=True unpickles arbitrary objects - only load trusted files.
sentences = np.load('pystat_results_db3b.npy', allow_pickle=True).tolist()
exceptions = np.load('exceptions_db3b.npy', allow_pickle=True).tolist()

In [0]:
# Drop the sentences the parser failed on from the three aligned collections.
failed_indices = set(exceptions)  # set: constant-time membership test
kept_indices = [i for i in range(len(test_sentences)) if i not in failed_indices]

sentences = [sentences[i] for i in kept_indices]
test_sp = [test_sp[i] for i in kept_indices]

# At this point test_enc_pos may already be the NumPy object array produced by the
# "Convert data to numpy array" cell, which has no .pop(). Rebuilding a list works
# for both forms; each entry is flattened back to a plain list of ids, which is the
# form the later conversion cell reshapes.
test_enc_pos = [
    np.asarray(test_enc_pos[i]).reshape(-1).tolist() for i in kept_indices
]

In [0]:
# Number of label sequences left after removing the sentences the parser failed on
print(len(test_sp))

2604


In [0]:
# Read dataset & build data variables
import nltk

# Global variables
results_pystat = [[] for x in range(len(sentences))] # Save sentences tags: one list per sentence, filled by iterate_tree
tag = ""    # underscore-joined label path of the node being visited (updated by iterate_tree)
buff = ""   # NOTE(review): looks unused at module level - iterate_tree assigns its own local `buff`
count = 0   # index into results_pystat of the sentence being traversed

# Recursive iteration through NLTK Trees
def iterate_tree(sent):
    """Walk a tree recursively and record the label path of its string leaves.

    Works on module-level state: `tag` holds the current path as an
    underscore-joined string, and every recorded path is appended to
    `results_pystat[count]` as a list of labels.

    Args:
        sent: node to traverse. Children of type `nltk.tree.Tree` are recursed
            into, children of type `str` are recorded, anything else is ignored.
    """
    
    global tag
  
    # Make sure the path ends with a separator before this node's label is added
    if not tag.endswith("_"):
        tag += "_"
    
    # Keep only the part of the label that precedes the first "-"
    buff = str(sent.label())
    buff = buff.split("-")[0]

    tag += buff + "_"
    buff = ""
    
    for tok in sent:
        if type(tok) is nltk.tree.Tree:
            iterate_tree(tok)
        elif type(tok) is str:
            # Record the path; [1:-1] discards the first and last pieces of the split
            results_pystat[count].append(tag.split('_')[1:-1])
        
        # After each child, remove the last "_"-separated piece from the path
        tag = tag.split('_')
        tag = tag[:-1]
        tag = '_'.join(tag)

# Traverse every parse tree; `tag` is cleared and `count` advanced after each one
# because iterate_tree reads both as module-level state.
for sentence in sentences:
    iterate_tree(sentence)
    tag = ""
    count += 1

print("Finished!")

Finished!


In [0]:
# For every word keep the first "NP" / "VP" label found on its path (in path order),
# or "-PAD-" when the path contains neither. Using next() with a default replaces the
# bare try/except index probe, and the comprehension no longer overwrites the
# module-level `tag` that iterate_tree uses.
sp_pystat = [
    [
        next((label for label in label_path if label in ("NP", "VP")), "-PAD-")
        for label_path in sentence_paths
    ]
    for sentence_paths in results_pystat
]

In [0]:
# The three collections are indexed together later on; show their lengths side by side
print(len(test_enc_pos))
print(len(sp_pystat))
print(len(test_sp))

2604
2604
2604


In [0]:
# Remove every sentence whose parser label count differs from its reference label count.
# Indices are visited from last to first so a removal never shifts an entry still to be checked.
for idx in range(len(test_sp) - 1, -1, -1):
  if len(sp_pystat[idx]) == len(test_sp[idx]):
    continue
  print(idx)
  for aligned in (sp_pystat, test_enc_pos, test_sp):
    aligned.pop(idx)

2547
2146
2133
2120
2096
1920
1376
1122
1070
1067
998
742
551
469
300
284
152
81
37


In [0]:
# Lengths of the three collections after dropping the sentences with mismatched label counts
print(len(sp_pystat))
print(len(test_enc_pos))
print(len(test_sp))

2585
2585
2585


In [0]:
# Convert data to numpy array
# The object array is allocated explicitly: np.array() on lists of different lengths
# is rejected by recent NumPy releases, and yields a 2-D array when all lengths match.
encoded_pos_sequences = test_enc_pos
test_enc_pos = np.empty(len(encoded_pos_sequences), dtype=object)

for i, v in enumerate(encoded_pos_sequences):
    # one entry per sentence, shaped (1, sequence length, 1)
    test_enc_pos[i] = np.asarray(v).reshape(1, -1, 1)

print(test_enc_pos.shape)

(2585,)


In [0]:
# TEST RESULTS COMPARISON
import nltk

# Per-sentence accuracy logs, filled by the comparison loop below:
# one for the PyStatParser labels and one for the network predictions
pystat_accs = []
network_accs = []

# Function to decode network prediction
def logits_to_tokens(sequences, index):
  """Decode per-step class scores into labels.

  Args:
    sequences: iterable of sequences; every step of a sequence is a vector of
      class scores.
    index: mapping from class id (the argmax position) to its label.

  Returns:
    A list with exactly one list of labels per input sequence, in input order.
  """
  token_sequences = []
  for categorical_sequence in sequences:
    # Append once per sequence. Appending inside the per-step loop would add the
    # same list once for every step and would skip empty sequences altogether.
    token_sequences.append(
        [index[np.argmax(categorical)] for categorical in categorical_sequence]
    )

  return token_sequences

# Inverse mapping (class id -> label), built once instead of once per sentence
id_to_sp = {class_id: label for label, class_id in sp_index.items()}

for i in range(len(test_enc_pos)):

  # Network prediction requires encoded data
  prediction = model2.predict(test_enc_pos[i], 1)

  # Decode the network prediction
  network_results = logits_to_tokens(prediction, id_to_sp)[0]

  count_pystat = 0
  count_network = 0

  # Compare both label sources with the reference label at every position
  for j in range(len(test_enc_pos[i][0])):
    gold_label = test_sp[i][j]

    if gold_label == sp_pystat[i][j]:
      count_pystat += 1

    # the network labels are compared against the lower-cased reference label
    if gold_label.lower() == network_results[j]:
      count_network += 1

  # Calculate accuracy for each sentence, and append to accuracy logs
  acc_log_pystat = count_pystat / len(test_sp[i])
  pystat_accs.append(acc_log_pystat)

  acc_log_network = count_network / len(test_sp[i])
  network_accs.append(acc_log_network)

# Calculate final accuracy (average of the per-sentence accuracies)
network_acc = sum(network_accs) / len(test_sp)
pystat_acc = sum(pystat_accs) / len(test_sp)

print("Network Accuracy: ", network_acc*100, "%")
print("PyStatParser Accuracy: ", pystat_acc*100, "%")

Network Accuracy:  91.7432601469495 %
PyStatParser Accuracy:  61.588736986968854 %
