In [1]:
from presidio_evaluator.data_generator import read_synth_dataset
from presidio_evaluator import ModelEvaluator
from collections import Counter

# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell is executed, so edits to local packages take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

# URL of the Presidio analyze endpoint; passed as `endpoint=` when the
# PresidioAPIEvaluator is constructed later in this notebook.
# Adjust it to point at your own Presidio instance.
MY_PRESIDIO_ENDPOINT = "http://localhost:8080/api/v1/projects/test/analyze"

## Evaluate your Presidio instance via the Presidio API

#### A. Read dataset for evaluation

In [2]:
# Load the synthetic test set that the rest of the notebook evaluates against.
input_samples = read_synth_dataset("../data/Synth/synth_test.json")

# Report how many samples were loaded as a quick sanity check.
num_samples = len(input_samples)
print("Read {} samples".format(num_samples))

Read 60 samples


#### B. Descriptive statistics

In [3]:
def flatten(l):
    """Concatenate the items of every sub-iterable in ``l`` into one flat list.

    Exactly one level of nesting is removed; the order of the items is
    preserved.

    Args:
        l: An iterable whose elements are themselves iterable.

    Returns:
        A new list holding the items of each sub-iterable, in order.
    """
    return [item for sublist in l for item in sublist]

# Tally how many annotated spans of each entity type the dataset contains.
all_spans = flatten(input_sample.spans for input_sample in input_samples)
count_per_entity = Counter(span.entity_type for span in all_spans)
count_per_entity

Counter({'EMAIL': 8,
         'LOCATION': 12,
         'PERSON': 44,
         'BIRTHDAY': 1,
         'ORGANIZATION': 5,
         'PHONE_NUMBER': 2,
         'URL': 1,
         'IBAN': 2})

#### C. Match the dataset's entity names with Presidio's entity names

In [4]:
# Mapping between dataset entities and Presidio entities.
# Key: dataset entity, value: Presidio entity.
# Every key must appear exactly once: a dict literal silently keeps only the
# last value given for a repeated key, so a duplicated entry is dead code.
entities_mapping = {
    'EMAIL': 'EMAIL_ADDRESS',
    'LOCATION': 'LOCATION',
    'PERSON': 'PERSON',
    'BIRTHDAY': 'BIRTHDAY',
    'ORGANIZATION': 'ORG',
    'PHONE_NUMBER': 'PHONE_NUMBER',
    'URL': 'URL',
    'IBAN': 'IBAN_CODE',
    'O': 'O'
}

# Re-label the input samples according to the mapping above; the result is
# what the following cells count and evaluate.
new_list = ModelEvaluator.align_input_samples_to_presidio_analyzer(input_samples,
                                                                   entities_mapping)

#### D. Recalculate statistics on updated dataset

In [5]:
# Recompute the per-entity span counts, this time on the re-labelled samples.
all_spans_new = flatten(input_sample.spans for input_sample in new_list)
count_per_entity_new = Counter(span.entity_type for span in all_spans_new)
count_per_entity_new

Counter({'EMAIL_ADDRESS': 8,
         'LOCATION': 12,
         'PERSON': 44,
         'BIRTHDAY': 1,
         'ORG': 5,
         'PHONE_NUMBER': 2,
         'URL': 1,
         'IBAN_CODE': 2})

#### E. Run the presidio-evaluator framework with Presidio's API as the 'model' under test

In [6]:
from presidio_evaluator import PresidioAPIEvaluator

# Restrict the evaluation to the entity types that occur in the aligned dataset.
presidio = PresidioAPIEvaluator(
    all_fields=False,
    entities_to_keep=list(count_per_entity_new),
    endpoint=MY_PRESIDIO_ENDPOINT,
)

# Run the evaluator over every aligned sample.
evaluted_samples = presidio.evaluate_all(new_list)

Evaluating <class 'presidio_evaluator.presidio_api_evaluator.PresidioAPIEvaluator'>:   0%|          | 0/60 [00:00<?, ?it/s]

loading model en_core_web_trf


Evaluating <class 'presidio_evaluator.presidio_api_evaluator.PresidioAPIEvaluator'>: 100%|██████████| 60/60 [00:15<00:00,  3.85it/s]


#### F. Extract statistics
- Precision, recall and F measure are calculated based on a PII/Not PII binary classification per token.
- Specific entity recall and precision are calculated on the specific PII entity level.

In [7]:
# Aggregate the evaluated samples into a single score object
# (printed and inspected in the following cells).
evaluation_result = presidio.calculate_score(evaluted_samples)

In [8]:
# Print the per-entity precision/recall table followed by the PII F measure.
evaluation_result.print()

                        Entity                     Precision                        Recall
                 EMAIL_ADDRESS                       100.00%                       100.00%
                      LOCATION                        94.34%                       100.00%
                        PERSON                        81.01%                        95.52%
                      BIRTHDAY                       100.00%                       100.00%
                           ORG                        88.89%                       100.00%
                  PHONE_NUMBER                          nan%                         0.00%
                           URL                       100.00%                       100.00%
                     IBAN_CODE                       100.00%                       100.00%
                           PII                        88.89%                        95.36%
PII F measure: 0.9201277955271565


#### G. Analyze wrong predictions

In [9]:
# Keep the model's recorded errors for the error-analysis cells below.
errors = evaluation_result.model_errors

In [10]:
# List the most common false-positive tokens (n=5), each with an example sentence.
ModelEvaluator.most_common_fp_tokens(
    errors,
    n=5,
)

Most common false positive tokens:
[]
Example sentence with each FP token:


In [11]:
# Gather the false-positive errors for the PERSON entity into a dataframe.
fps_df = ModelEvaluator.get_fps_dataframe(
    errors,
    entity='PERSON',
)
fps_df

No errors of type FP and entity PERSON were found


In [57]:
# Gather the false-negative errors for the PERSON entity into a dataframe.
fns_df = ModelEvaluator.get_fns_dataframe(
    errors,
    entity='PERSON',
)
fns_df

Unnamed: 0,error_type,annotation,prediction,token,full_text,Gender,NameSet,Country,Lowercase,Template#
0,FN,PERSON,O,Wilson,"For my take on Mr. Wilson, see Guilty Pleasure...",male,American,Latvia,False,95
1,Wrong entity,PERSON,ORG,Spartacus,Spartacus is a very sympathetic person. He's a...,male,Russian,Malta,False,87
2,FN,PERSON,O,Souza,"Unlike the Souza novel, it's not about necroph...",female,Brazil,Mauritania,False,96
3,Wrong entity,PERSON,LOCATION,Raisová,"Unlike the Raisová novel, it's not about necro...",female,Czech,Belgium,False,96


In [31]:
# Find the index of the first input sample containing a known sentence; the
# next cell uses `s` to index into `input_samples`.
for i, sample in enumerate(input_samples):
    if 'Kristian shouted at Enrico' in sample.full_text:
        s = i
        break
else:
    # Without this branch a miss would leave `s` undefined (or holding a stale
    # value from an earlier run), and the next cell would fail obscurely or
    # silently show the wrong sample.
    raise ValueError(
        "No input sample contains the text 'Kristian shouted at Enrico'"
    )

In [32]:
# Display the sample at index `s` (its full text, spans, tokens and tags).
input_samples[s]

Full text: Kristian shouted at Enrico: "What are you doing here?"
Spans: [Type: PERSON, value: Kristian, start: 0, end: 8, Type: PERSON, value: Enrico, start: 20, end: 26]
Tokens: [Kristian, shouted, at, Enrico, :, ", What, are, you, doing, here, ?, "]
Tags: ['U-PERSON', 'O', 'O', 'U-PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

In [60]:
# Pick a single evaluated sample to inspect.
# NOTE(review): this rebinds `s`, which the sample-search cell above sets to an
# integer index; the index 10 is hard-coded - confirm it is the intended sample.
s = evaluted_samples[10]

In [65]:
# Show the per-token result tally stored on the selected evaluated sample.
# NOTE(review): keys look like tag pairs, presumably (annotated, predicted) - confirm.
s.results

Counter({('PERSON', 'PERSON'): 1, ('O', 'O'): 12})