# Model evaluation

## Initialization

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

import sys
root_path = '/content/gdrive/My Drive/MasterSapienza/Semestre2/NLP/HM1/Arci/'  #change dir to your project folder
sys.path.insert(0, root_path)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
import numpy as np
import tensorflow as tf
import tensorflow.keras as K
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dense, TimeDistributed, concatenate, Activation, Masking
from tensorflow.keras.models import load_model
import pickle
import os, datetime, time

# Custom packages
import ChinesePreprocess as CP
import ModelConfiguration as ModelConfig
import TrainingUtils
from code_provided.score import score

In [0]:
def load_defaults(trainds="msr_training_simp_reordered", testds="msr_test_gold_simp_reordered", maxlength = 50, vocab_path = "vocab_msr", verbose=False):
  print("Loading configurations.")
  print("Training dataset: ", trainds)
  print("Testing dataset: ", testds)
  print("Vocabulary: ", vocab_path)
  print("maxlength: ", maxlength)
  
  print("\n******************** Loading vocabulary **********************\n")
  with open(root_path + "../vocabs/%s.pkl" % vocab_path,'rb') as voc:
    vocab = pickle.load(voc)
    print("Vocabulary loaded.")
  
  print("\n******************** Extract train data (MSR) **********************\n")
  ds_train = CP.ChinesePreprocess(root_path + "../dataset/icwb2-data/training/%s.utf8" % trainds, 
                                  num_samples=0, # Should be zero mostly. Becasue it's convenient to read everything most of the times. If not zero, some words to build the vocabulary might get lost in the trimming
                                  vocabulary = vocab, # Static preset vocabulary. Same in training
                                  verbose=False)

  print("\n\n******************** Extract test data (MSR) *******************\n")
  ds_test = CP.ChinesePreprocess(root_path + "../dataset/icwb2-data/gold/%s.utf8" % testds, 
                                 num_samples=0, 
                                 vocabulary = vocab,# Same vocabulary of training
                                 verbose=False)

    # Maximum length for padding
  train = CP.ChinesePreprocess.apply_padding_data_and_labels(ds_train, maxlength, False)
  test = CP.ChinesePreprocess.apply_padding_data_and_labels(ds_test, maxlength, False)
  print("Done.")
  return train, test

## Score function (Provided by professor)

In [0]:
pred_iter = ["BEBESBIIE","BIIIEBEBESS"]
gold_iter = ["BEBIEBIES","BIIESBEBESS"]
score(pred_iter, gold_iter)

0.7

##### Manual test (not done)

In [0]:
tf.enable_eager_execution()
rec = tf.keras.metrics.Recall()
pre = tf.keras.metrics.Precision()

In [0]:
##sess = tf.Session()
m = tf.keras.metrics.Recall()
m.update_state([[0, 1, 1, 1],[0, 1, 1, 1]], [[0, 1, 1, 1],[1, 0, 1, 1]])
print('Final result: ', m.result().numpy())  # Final result: 0.66

Instructions for updating:
Colocations handled automatically by placer.
Final result:  0.8333333


## Util methods

In [0]:
# For model evaluation

def num2BIES(sent):
  #''.join(num2BIES(['0', '2', '0', '2', '3', '3', '0', '2', '0', '2', '3', '0', '2']))
  #Out: BEBESSBEBESBE
  tags = {'0':'B',
          '1':'I',
          '2':'E',
          '3':'S'
         }
  bies = [tags[s] for s in sent]
  return bies

def readableBIES(sent):
  #readableBIES('BEBESSBEBESBE')
  #out: ['BE', 'BE', 'S', 'S', 'BE', 'BE', 'S', 'BE']
  bies = []
  word = []
  cut = False
  for i in sent:
    word.append(i)
    if i == 'B' or i == 'I': 
      if cut: 
        bies.append(''.join(word))
        word = []
      cut = True 
    if i == 'E':
      bies.append(''.join(word))
      word = []
      cut=False
    if i == 'S':
      bies.append(''.join(word))
      word = []
      cut=False
  return bies

def printComparisonSingle(ypred,k, useTest=False, table=False):
  if not useTest:
    length = len(train.sents_nospaces[k])
    print("Length of the sequence: ", str(len(train.sents_nospaces[k])))
    print("Input of network:\n\n",train.unigrams_pad[k],'\n')
    print("GroundT:",train.labels_bies[k])
    y0 = [str(np.argmax(i)) for i in ypred[k]]
    print("Output:\t" , y0[:length])
    biesy = num2BIES(y0[:length])
    if table:
      print(tabulate([["Test"] + train.sents_split[k].replace('  ',' ').split(' '), 
                     ["Output"] + readableBIES(biesy)]))
    else:
      print("Test: \t", train.sents_split[k].replace('  ',' '))
      print("BIES: \t", ''.join(biesy))
  if useTest:
    length = len(test.sents_nospaces[k])
    print("Length of the sequence: ", str(len(test.sents_nospaces[k])))
    print("Input of network:\n\n", test.unigrams_pad[k],'\n')
    print("GroundT:",test.labels_bies[k])
    y0 = [str(np.argmax(i)) for i in ypred[k]]
    print("Output:\t" , y0[:length])
    biesy = num2BIES(y0[:length])
    if table:
      print(tabulate([["Test"] + test.sents_split[k].replace('  ',' ').split(' '), 
                     ["Output"] + readableBIES(biesy)]))
    else:
      print("Test: \t", test.sents_split[k].replace('  ',' '))
      print("BIES: \t", ''.join(biesy))
  ygroundbies = num2BIES(test.labels_bies[k])
  return y0, biesy, ygroundbies  
  

# Metrics analysis. 
Find the right metric to evaluate your model.
In addition, convert your samples to readable understandable sentences

In [0]:
models = {}
model = load_model(root_path + '../models/2019-04-22_GPU_Model9_VocabSmall_model.h5')
#"content/gdrive/My Drive/MasterSapienza/Semestre2/NLP/HM1/models/2019-04-22_TPU_Model7_FullVocab_model.h5"

Instructions for updating:
Use tf.cast instead.


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [0]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_9 (InputLayer)            (None, 50)           0                                            
__________________________________________________________________________________________________
input_10 (InputLayer)           (None, 50)           0                                            
__________________________________________________________________________________________________
embedding_8 (Embedding)         (None, 50, 64)       27633856    input_9[0][0]                    
__________________________________________________________________________________________________
embedding_9 (Embedding)         (None, 50, 64)       27633856    input_10[0][0]                   
__________________________________________________________________________________________________
concatenat

### Test from samples

In [0]:
k=0; j=16 # Range of samples to perform the experiment
xin = [test.unigrams_pad[k:j],test.bigrams_pad[k:j]]
yout = test.labels[k:j]

In [0]:
#k=0; j=16 # Range of samples to perform the experiment
#xin = [test.unigrams_pad,test.bigrams_pad]
#yout = test.labels

In [0]:
model.evaluate(x=xin,y=yout)
ypred = model.predict(x=xin)

sample = 10
y0 = printComparison(ypred,sample, useTest=True)

Length of the sequence:  48
Input of network:

 [336837 315609 138195 311174 363741 424986 129629 184437 393533 162297
 344685 204985 205336 335882 178139 354959  29769 253125  57660   8281
 427730 336837 315609 425331 195252  72974 291134 125414 424986 219356
   9223 292032 224022 177908 204985 336837 315609  24921 229145 424986
 344685 204985 159824  72974 210589  30893 424986   9223      0      0] 

GroundT: ['0', '1', '2', '0', '2', '3', '3', '0', '2', '3', '3', '3', '0', '1', '1', '2', '0', '2', '3', '0', '2', '0', '1', '1', '2', '3', '0', '2', '3', '3', '3', '0', '2', '3', '3', '0', '2', '0', '2', '3', '3', '3', '3', '3', '0', '2', '3', '3']
Output:	 ['0', '1', '2', '0', '2', '3', '3', '0', '2', '3', '3', '3', '0', '1', '1', '2', '1', '2', '3', '0', '2', '0', '1', '1', '2', '3', '0', '2', '3', '3', '3', '0', '2', '3', '3', '0', '2', '0', '2', '3', '3', '3', '3', '3', '0', '2', '3', '3']
Test: 	 社会学 概论 》 （ 合编 ） 、 《 经济体制 改革 对 农村 社会关系 的 影响 》 等 。 译着 有 《 社会 管理 》 、 《 人 的 前景 》 。
BIES: 	

In [0]:
sample = 10
y0, ybies, ygroundbies = printComparison(ypred,sample, useTest=True)
print(ygroundbies)
print(ybies)
print("Score: ", score([ybies],[ygroundbies]))

Length of the sequence:  48
Input of network:

 [336837 315609 138195 311174 363741 424986 129629 184437 393533 162297
 344685 204985 205336 335882 178139 354959  29769 253125  57660   8281
 427730 336837 315609 425331 195252  72974 291134 125414 424986 219356
   9223 292032 224022 177908 204985 336837 315609  24921 229145 424986
 344685 204985 159824  72974 210589  30893 424986   9223      0      0] 

GroundT: ['0', '1', '2', '0', '2', '3', '3', '0', '2', '3', '3', '3', '0', '1', '1', '2', '0', '2', '3', '0', '2', '0', '1', '1', '2', '3', '0', '2', '3', '3', '3', '0', '2', '3', '3', '0', '2', '0', '2', '3', '3', '3', '3', '3', '0', '2', '3', '3']
Output:	 ['0', '1', '2', '0', '2', '3', '3', '0', '2', '3', '3', '3', '0', '1', '1', '2', '1', '2', '3', '0', '2', '0', '1', '1', '2', '3', '0', '2', '3', '3', '3', '0', '2', '3', '3', '0', '2', '0', '2', '3', '3', '3', '3', '3', '0', '2', '3', '3']
Test: 	 社会学 概论 》 （ 合编 ） 、 《 经济体制 改革 对 农村 社会关系 的 影响 》 等 。 译着 有 《 社会 管理 》 、 《 人 的 前景 》 。
BIES: 	

## Experiment with whole dataset

In [0]:
k=1
j=10000
#print("Length of the sequence: ", str(len(train.sents_nospaces[k])))
#print(train.sents_split[k:j]) # No. 4 has unigrams and bigrams
#print(train.unigrams_pad[k:j])
#print(train.labels_bies[k:j])
#print(train.labels[k:j])
xin = [train.unigrams_pad[k:j],train.bigrams_pad[k:j]]
yout = train.labels[k:j]

model.evaluate(x=xin,y=yout)
ypred = model.predict(x=xin)




In [0]:
k=5
y0 = [str(np.argmax(i)) for i in ypred[5]]
print("Length of the sequence: ", str(len(train.sents_nospaces[k])))
print(train.sents_split[k]) # No. 4 has unigrams and bigrams
print(train.unigrams_pad[k])
print(train.labels_bies[k])
print(y0[:len(train.sents_nospaces[k])])

Length of the sequence:  19
“  这  首先  是  个  民族  问题  ，  民族  的  感情  问题  。
[113514 125267 328492 153491 218963 196714 210292 306428 208114  25420
  22849 210292 306428 359023 286835  41305 208114  25420 350214      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0]
['3', '3', '0', '2', '3', '3', '0', '2', '0', '2', '3', '0', '2', '3', '0', '2', '0', '2', '3']
['3', '3', '3', '0', '2', '3', '3', '0', '2', '3', '3', '3', '2', '0', '2', '3', '3', '3', '0']


# Models

## Model 10

In [0]:
train, test = load_defaults(testds="msr_test_gold")

Loading configurations.
Training dataset:  msr_training_simp_reordered
Testing dataset:  msr_test_gold
Vocabulary:  vocab_msr
maxlength:  50

******************** Loading vocabulary **********************

Vocabulary loaded.

******************** Extract train data (MSR) **********************

[INFO] Using preset vocabulary. No. of elements:  431779


******************** Extract test data (MSR) *******************

[INFO] Using preset vocabulary. No. of elements:  431779
Done.


In [0]:
model = load_model(root_path + '../models/2019-04-22_GPU_Model9_VocabSmall_model.h5')


Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.


In [0]:
k=0; j=10 # Range of samples to perform the experiment
xin = [test.unigrams_pad[k:j],test.bigrams_pad[k:j]]
yout = test.labels[k:j]

In [0]:
#k=0; j=16 # Range of samples to perform the experiment
#xin = [test.unigrams_pad,test.bigrams_pad]
#yout = test.labels

In [0]:
model.evaluate(x=xin,y=yout)
ypred = model.predict(x=xin)

sample = 0
y0 = printComparison(ypred,sample, useTest=True)

Length of the sequence:  13
Input of network:

 [426562  18862 246646 342376 382542 187757  68038 316411 184437 122544
  72974 161561 306922      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0] 

GroundT: ['0', '2', '0', '2', '3', '3', '0', '2', '0', '2', '3', '0', '2']
Output:	 ['0', '2', '0', '2', '3', '3', '0', '2', '0', '2', '3', '0', '3']
Test: 	 扬帆 远东 做 与 中国 合作 的 先行 
BIES: 	 BEBESSBEBESBS


In [0]:
def format_prediction(ypred):
  ypredf = [] # Predicted output (formatted)
  for y in ypred:
    ypredf.append([str(np.argmax(i)) for i in y])
  #print(ypredf)
  ybies = [''.join(num2BIES(y)[:len(test.sents_nospaces[i])]) for i,y in enumerate(ypredf)]
  return ybies

# Length of the output is the same

print([len(x) for x in format_prediction(ypred)])
print([len(x) for x in test.sents_nospaces[k:j]])

[13, 11, 24, 31, 30, 25, 35, 15, 9, 18]
[13, 11, 24, 31, 30, 25, 35, 15, 9, 18]


In [0]:
yground = [ ''.join(num2BIES(y)) for y in test.labels_bies[0:10] ]

In [0]:
yground

['BEBESSBEBESBE',
 'BESBEBESBES',
 'BESBEBESSSSBESSBEBESBIES',
 'BEBESBEBEBEBESBEBIIESBIEBEBEBES',
 'BIESSSBEBEBEBESBESBEBEBESBEBES',
 'BIESSBIIESBEBEBEBESBESBES',
 'BIIESBESSBIESSBESBESBESSBIESBEBEBES',
 'SBESSBEBEBESBES',
 'SSSBIEBES',
 'SBEBESBEBIESSSSBES']

In [0]:
print("Score: ", score(format_prediction(ypred),yground))

Score:  0.957345971563981


In [0]:
sample = 10
y0, ybies, ygroundbies = printComparisonSingle(ypred,sample, useTest=True)
print(ygroundbies)
print(ybies)
print("Score: ", score([ybies],[ygroundbies]))

Length of the sequence:  48
Input of network:

 [336837 315609 138195 311174 363741 424986 129629 184437 393533 162297
 344685 204985 205336 335882 178139 354959  29769 253125  57660   8281
 427730 336837 315609 425331 195252  72974 291134 125414 424986 219356
   9223 292032 224022 177908 204985 336837 315609  24921 229145 424986
 344685 204985 159824  72974 210589  30893 424986   9223      0      0] 

GroundT: ['0', '1', '2', '0', '2', '3', '3', '0', '2', '3', '3', '3', '0', '1', '1', '2', '0', '2', '3', '0', '2', '0', '1', '1', '2', '3', '0', '2', '3', '3', '3', '0', '2', '3', '3', '0', '2', '0', '2', '3', '3', '3', '3', '3', '0', '2', '3', '3']
Output:	 ['0', '1', '2', '0', '2', '3', '3', '0', '2', '3', '3', '3', '0', '1', '1', '2', '1', '2', '3', '0', '2', '0', '1', '1', '2', '3', '0', '2', '3', '3', '3', '0', '2', '3', '3', '0', '2', '0', '2', '3', '3', '3', '3', '3', '0', '2', '3', '3']
Test: 	 社会学 概论 》 （ 合编 ） 、 《 经济体制 改革 对 农村 社会关系 的 影响 》 等 。 译着 有 《 社会 管理 》 、 《 人 的 前景 》 。
BIES: 	

# Generate BIES file from Gold file


In [6]:
data = CP.ChinesePreprocess(root_path+'../dataset/icwb2-data/gold/pku_test_gold.utf8')



[MAIN] Data preprocessing  starting...

[INFO] Reading data file...
[INFO] Read file: /content/gdrive/My Drive/MasterSapienza/Semestre2/NLP/HM1/Arci/../dataset/icwb2-data/gold/pku_test_gold.utf8
[INFO] Total number of sentences:  1945
[INFO] Sample of the file: 
 ['（  二○○○年  十二月  三十一日  ）  （  附  图片  1  张  ）  ', '女士  们  ，  先生  们  ，  同志  们  ，  朋友  们  ：  '] 


[MAIN] Processing sentences into unigrams and bigrams with padding...
[INFO] Converting to unigrams...
[INFO] Converting to bigrams...

[INFO] Generating vocabulary from sentence...
[INFO] Vocabulary generated from sentence successfully. 
 Number of elements: 59386
[INFO]Sample of vocabulary: 

UNK : 1
应资 : 2
改〈 : 3
务机 : 4
任在 : 5
[INFO] Conversion to translated sentences with vocabulary complete.
--- Check: ---
Element:  1
Returned element: 
 [11750, 45342, 9278, 9278, 9278, 24479, 43800, 45342, 42946, 19175, 43800, 54949, 26372, 46308, 11750, 58710, 46027, 23140, 2589, 53440, 46308]
Converted using inverse vocabulary:
 ['（', '二', '

In [0]:
data.labels_bies
yground = [ ''.join(num2BIES(y)) for y in data.labels_bies]

In [0]:
yground
## save groundTruth file
with open(root_path+'../dataset/icwb2-data/gold/pku_test_gold_BIES.utf8','w', encoding='utf-8') as file:
  file.writelines('\n'.join(yground))
print(yground)