# Evaluating the results of Training

In [52]:
from __future__ import unicode_literals, print_function
import spacy
from spacy.lang.es import Spanish 
from spacy import displacy
from spacy.tokens import Doc
from collections import defaultdict, Counter
from spacy.attrs import ORTH
from spacy.scorer import Scorer
from spacy.language import GoldParse
from spacy.util import minibatch, compounding

import pandas as pd
import numpy as np
import json
import plac
import random
from sklearn.model_selection import train_test_split
from pathlib import Path

#nlp = Spanish().from_disk("Documents/Research/NPL/SevillianPaintersNPL/EM Spanish Model/Trained_Model")

In [53]:
def convert_dataturks_to_spacy(dataturks_JSON_FilePath):
    """Convert a Dataturks NER export into spaCy-style training tuples.

    The export is expected to hold one JSON object per line, each with a
    'content' string and an 'annotation' list whose items carry a 'label'
    (a single label or a list of labels) and a 'points' list.

    Parameters
    ----------
    dataturks_JSON_FilePath : str or path-like
        Location of the Dataturks JSON-lines export.

    Returns
    -------
    list of (str, dict) or None
        One ``(text, {"entities": [(start, end, label), ...]})`` tuple per
        non-blank line, with ``end`` exclusive. ``None`` is returned (and the
        error is logged) if the file cannot be read or parsed.
    """
    # Imported locally so this block is self-contained: the except branch
    # below needs `logging`, and `io.open` accepts an explicit encoding.
    import io
    import logging

    try:
        training_data = []
        # JSON text is UTF-8; do not rely on the platform default encoding,
        # otherwise the character offsets could be shifted by mis-decoded text.
        with io.open(dataturks_JSON_FilePath, 'r', encoding='utf-8') as f:
            for line in f:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                if not line.strip():
                    continue

                data = json.loads(line)
                text = data['content']
                entities = []
                # An explicit null annotation is treated as "no entities".
                for annotation in data['annotation'] or []:
                    # Only the first point of each annotation is used.
                    point = annotation['points'][0]
                    labels = annotation['label']
                    # Handle both a list of labels and a single label.
                    if not isinstance(labels, list):
                        labels = [labels]

                    for label in labels:
                        # Dataturks indices are inclusive [start, end];
                        # spaCy expects the half-open range [start, end).
                        entities.append((point['start'], point['end'] + 1, label))

                training_data.append((text, {"entities": entities}))

        return training_data
    except Exception as e:
        logging.exception("Unable to process " + str(dataturks_JSON_FilePath) + "\n" + "error = " + str(e))
        return None

In [54]:
#Convert Dataturks to Spacy Format

TAGGED_DATA = convert_dataturks_to_spacy("/Users/Felipe/Documents/Research/NPL/SevillianPaintersNPL/seville painters test 2-3.json")

# convert_dataturks_to_spacy signals failure by returning None; stop here with a
# clear message instead of failing later with an opaque error when the data is used.
if TAGGED_DATA is None:
    raise RuntimeError("Could not load the Dataturks annotations; see the logged error for details.")

In [55]:
# Load the spaCy model that will be evaluated

SPACY_MODEL_NAME = 'es_core_news_md'
nlp = spacy.load(SPACY_MODEL_NAME)

In [100]:
# Testing how much the evaluation depends on texts included in testing data

N_TRIALS = 101     # number of random splits to evaluate
TRAIN_SIZE = 326   # examples held out of the evaluation set; the original note
                   # calls this an 80-20 split -- TODO confirm against len(TAGGED_DATA)
SPLIT_SEED = 0     # fixed seed so the splits (and the scores) can be reproduced


def evaluate(ner_model, examples):
    """Score a model's predictions against annotated examples.

    Parameters
    ----------
    ner_model : spaCy Language object
        Used both to tokenize the text (``make_doc``) and to predict on it.
    examples : iterable of (str, dict)
        ``(text, {"entities": [...]})`` pairs, as produced by
        ``convert_dataturks_to_spacy``.

    Returns
    -------
    dict
        The ``scores`` of the spaCy ``Scorer`` after all examples were scored.
    """
    scorer = Scorer()
    for text, annotations in examples:
        gold = GoldParse(ner_model.make_doc(text), entities=annotations['entities'])
        scorer.score(ner_model(text), gold)
    return scorer.scores


# Dedicated generator: reproducible splits without touching the global random state.
split_rng = random.Random(SPLIT_SEED)

#Dictionary storing the evaluation results of the different trials, keyed by trial number
d = {}

for x in range(N_TRIALS):
    # Shuffle a copy so TAGGED_DATA itself keeps its order and re-running this
    # cell always starts from the same data.
    shuffled = list(TAGGED_DATA)
    split_rng.shuffle(shuffled)
    train_data = shuffled[:TRAIN_SIZE]
    test_data = shuffled[TRAIN_SIZE:]

    #Testing NER results of existing model on test data
    results = evaluate(nlp, test_data)
    d[x] = pd.DataFrame(results)

In [101]:
# Inspect the score table stored for a single trial (key 3) as a spot check.
print(d[3])

      uas  las     ents_p     ents_r     ents_f  \
DATE  0.0  0.0  48.702595  41.924399  45.060018   
LOC   0.0  0.0  48.702595  41.924399  45.060018   
MISC  0.0  0.0  48.702595  41.924399  45.060018   
MON   0.0  0.0  48.702595  41.924399  45.060018   
OBJ   0.0  0.0  48.702595  41.924399  45.060018   
ORG   0.0  0.0  48.702595  41.924399  45.060018   
PER   0.0  0.0  48.702595  41.924399  45.060018   

                                          ents_per_type  tags_acc  token_acc  \
DATE                     {'p': 0.0, 'r': 0.0, 'f': 0.0}       0.0      100.0   
LOC   {'p': 46.012269938650306, 'r': 49.342105263157...       0.0      100.0   
MISC                     {'p': 0.0, 'r': 0.0, 'f': 0.0}       0.0      100.0   
MON                      {'p': 0.0, 'r': 0.0, 'f': 0.0}       0.0      100.0   
OBJ                      {'p': 0.0, 'r': 0.0, 'f': 0.0}       0.0      100.0   
ORG   {'p': 7.142857142857142, 'r': 2.38095238095238...       0.0      100.0   
PER   {'p': 57.33788395904437, 

In [102]:
# Empty results table with one column per score plus the label and trial number
columns = [
    'ents_p',
    'ents_r',
    'ents_f',
    'label',
    'trial',
]
eval_data = pd.DataFrame(columns=columns).fillna(0)

In [103]:
# Show the results table before any rows have been added to it.
print(eval_data)

Empty DataFrame
Columns: [ents_p, ents_r, ents_f, label, trial]
Index: []


In [104]:
#Extract information from dictionary (f, p and r scores for each label within each trial) and save it into a dataframe

# Labels to extract, in the order their rows appear within each trial.
score_labels = ['DATE', 'LOC', 'MISC', 'MON', 'OBJ', 'ORG', 'PER']

# Collect plain records first and build the frame once: DataFrame.append was
# removed in pandas 2.0 and growing a frame row by row is quadratic.
eval_records = []
for x in d:
    for label in score_labels:
        per_type = d[x].loc[label, 'ents_per_type']
        eval_records.append({
            'ents_p': per_type['p'],
            'ents_r': per_type['r'],
            'ents_f': per_type['f'],
            'label': label,
            'trial': x,
        })

# Rebuilt from scratch so re-running this cell does not duplicate rows.
eval_data = pd.DataFrame(eval_records, columns=['ents_p', 'ents_r', 'ents_f', 'label', 'trial'])

In [105]:
# Show the collected per-label scores for every trial.
print(eval_data)

        ents_p     ents_r     ents_f label trial
0     0.000000   0.000000   0.000000  DATE     0
1    39.408867  48.192771  43.360434   LOC     0
2     0.000000   0.000000   0.000000  MISC     0
3     0.000000   0.000000   0.000000   MON     0
4     0.000000   0.000000   0.000000   OBJ     0
5     9.090909   1.369863   2.380952   ORG     0
6    45.682451  73.542601  56.357388   PER     0
7     0.000000   0.000000   0.000000  DATE     1
8    46.153846  50.704225  48.322148   LOC     1
9     0.000000   0.000000   0.000000  MISC     1
10    0.000000   0.000000   0.000000   MON     1
11    0.000000   0.000000   0.000000   OBJ     1
12   15.384615   4.545455   7.017544   ORG     1
13   62.737643  81.280788  70.815451   PER     1
14    0.000000   0.000000   0.000000  DATE     2
15   36.416185  41.447368  38.769231   LOC     2
16    0.000000   0.000000   0.000000  MISC     2
17    0.000000   0.000000   0.000000   MON     2
18    0.000000   0.000000   0.000000   OBJ     2
19   10.000000   1.6

In [106]:
#Mean and standard deviation of the f, p and r scores, grouped by label
summary_stats = ['mean', 'std']
eval_data.groupby('label').agg({score: summary_stats for score in ('ents_f', 'ents_p', 'ents_r')})

Unnamed: 0_level_0,ents_f,ents_f,ents_p,ents_p,ents_r,ents_r
Unnamed: 0_level_1,mean,std,mean,std,mean,std
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
DATE,0.0,0.0,0.0,0.0,0.0,0.0
LOC,44.520979,4.40579,42.088666,5.309654,47.555381,4.680152
MISC,0.0,0.0,0.0,0.0,0.0,0.0
MON,0.0,0.0,0.0,0.0,0.0,0.0
OBJ,0.0,0.0,0.0,0.0,0.0,0.0
ORG,2.744431,2.505632,8.923374,8.278483,1.668712,1.578833
PER,62.982444,4.325562,54.139124,4.884477,75.441766,3.493916
