In [1]:
from presidio_evaluator.data_generator import read_synth_dataset
from presidio_evaluator import ModelEvaluator
from collections import Counter

# Enable IPython's autoreload extension (mode 2) so edited modules are
# re-imported automatically before each cell runs.
%load_ext autoreload
%autoreload 2

# URL of the analyze endpoint of the Presidio instance being evaluated;
# it is passed to PresidioAPIEvaluator further down. Adjust to your deployment.
MY_PRESIDIO_ENDPOINT = "http://localhost:8080/api/v1/projects/test/analyze"

## Evaluate your Presidio instance via the Presidio API

#### A. Read dataset for evaluation

In [2]:
# Load the evaluation samples from the synthetic dataset file and report how many were read.
input_samples = read_synth_dataset("../data/synth_dataset.json")
sample_count = len(input_samples)
print("Read {} samples".format(sample_count))

Read 299 samples


#### B. Descriptive statistics

In [3]:
def flatten(l):
    """Flatten one level of nesting: a list of lists becomes a single flat list."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat


# Count how many annotated spans of each entity type the dataset contains.
spans_per_sample = [input_sample.spans for input_sample in input_samples]
count_per_entity = Counter(span.entity_type for span in flatten(spans_per_sample))
count_per_entity

Counter({'PERSON': 174,
         'CREDIT_CARD': 49,
         'LOCATION': 75,
         'ORGANIZATION': 48,
         'US_SSN': 1,
         'EMAIL': 11,
         'BIRTHDAY': 4,
         'TITLE': 4,
         'URL': 8,
         'PHONE_NUMBER': 9,
         'IP_ADDRESS': 3,
         'IBAN': 3,
         'NATIONALITY': 1})

#### C. Match the dataset's entity names with Presidio's entity names

In [4]:
# Mapping between dataset entities and Presidio entities.
# Key: dataset entity name, Value: Presidio entity name.
# Dataset entity types that have no key here (TITLE and NATIONALITY in this
# dataset) do not appear in the recalculated counts of the next section.
entities_mapping = {
    'PERSON': 'PERSON',
    'EMAIL': 'EMAIL_ADDRESS',
    'CREDIT_CARD': 'CREDIT_CARD',
    'FIRST_NAME': 'PERSON',
    'PHONE_NUMBER': 'PHONE_NUMBER',
    'LOCATION': 'LOCATION',
    'BIRTHDAY': 'DATE_TIME',
    'DATE': 'DATE_TIME',
    # Was 'DOMAIN', which is not one of the names in presidio_fields below;
    # 'URL' already maps to the same Presidio entity, DOMAIN_NAME.
    'DOMAIN': 'DOMAIN_NAME',
    'CITY': 'LOCATION',
    'ADDRESS': 'LOCATION',
    'IBAN': 'IBAN_CODE',
    'URL': 'DOMAIN_NAME',
    'US_SSN': 'US_SSN',
    'IP_ADDRESS': 'IP_ADDRESS',
    'ORGANIZATION': 'ORG',
    'O': 'O'  # non-entity tag is passed through unchanged
}

# Presidio entity names taken into account when aligning the dataset.
# NOTE(review): 'NRP' is listed here but no dataset entity maps to it
# (NATIONALITY looks like the natural candidate) - confirm whether that is intended.
presidio_fields = ['CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'DOMAIN_NAME', 'EMAIL_ADDRESS', 'IBAN_CODE', 'ORG',
                   'IP_ADDRESS', 'NRP', 'LOCATION', 'PERSON', 'PHONE_NUMBER', 'US_SSN']

# Align the dataset samples to Presidio's entity names using the mapping and field list above.
new_list = ModelEvaluator.align_input_samples_to_presidio_analyzer(input_samples,
                                                                   entities_mapping,
                                                                   presidio_fields)

#### D. Recalculate statistics on updated dataset

In [5]:
# Recount spans per entity type, this time over the Presidio-aligned samples.
aligned_spans = flatten([input_sample.spans for input_sample in new_list])
count_per_entity_new = Counter(span.entity_type for span in aligned_spans)
count_per_entity_new

Counter({'PERSON': 174,
         'CREDIT_CARD': 49,
         'LOCATION': 75,
         'ORG': 48,
         'US_SSN': 1,
         'EMAIL_ADDRESS': 11,
         'DATE_TIME': 4,
         'DOMAIN_NAME': 8,
         'PHONE_NUMBER': 9,
         'IP_ADDRESS': 3,
         'IBAN_CODE': 3})

#### E. Run the presidio-evaluator framework with Presidio's API as the model under test

In [6]:
from presidio_evaluator import PresidioAPIEvaluator

# Restrict the evaluation to the entity types present in the aligned dataset.
entities_to_keep = list(count_per_entity_new)
presidio = PresidioAPIEvaluator(
    entities_to_keep=entities_to_keep,
    endpoint=MY_PRESIDIO_ENDPOINT,
)
# NOTE: the (misspelled) name `evaluted_samples` is kept because the scoring cell reads it.
evaluted_samples = presidio.evaluate_all(new_list)

Evaluating <class 'presidio_evaluator.presidio_api_evaluator.PresidioAPIEvaluator'>:   0%|          | 0/299 [00:00<?, ?it/s]

loading model en_core_web_lg


Evaluating <class 'presidio_evaluator.presidio_api_evaluator.PresidioAPIEvaluator'>: 100%|██████████| 299/299 [00:15<00:00, 19.47it/s]


#### F. Extract statistics
- Precision, recall and F-measure are calculated based on a PII / not-PII binary classification per token.
- Per-entity recall and precision are calculated separately for each specific PII entity type.

In [7]:
# Compute the evaluation scores from the samples evaluated in the previous step.
evaluation_result = presidio.calculate_score(evaluted_samples)

In [8]:
# Display the per-entity precision and recall table.
evaluation_result.print()

                        Entity                     Precision                        Recall
                        PERSON                        80.81%                        85.41%
                   CREDIT_CARD                       100.00%                        93.88%
                      LOCATION                        67.19%                        14.19%
                           ORG                          nan%                         0.00%
                        US_SSN                          nan%                          nan%
                 EMAIL_ADDRESS                       100.00%                       100.00%
                     DATE_TIME                         2.60%                        50.00%
                   DOMAIN_NAME                          nan%                          nan%
                  PHONE_NUMBER                       100.00%                        27.78%
                    IP_ADDRESS                          nan%                          nan%

#### G. Analyze wrong predictions

In [21]:
# Pull the recorded model errors out of the evaluation result for the analyses below.
errors = evaluation_result.model_errors

In [22]:
# Show the n most frequent false-positive tokens, with an example sentence for each.
ModelEvaluator.most_common_fp_tokens(errors,n=5)

Most common false positive tokens:
[('Jesus', 126), ('David', 45), ('the', 33), ('Israel', 32), ('Saul', 28)]
Example sentence with each FP token:
When Jesus finished speaking, the people were amazed at his teaching.
I will do this for my servant David, so he will always have someone to rule near me in Jerusalem, the city that I chose to be my own.
They said, "John the Baptizer sent us to you with this question: 'Are you the one who is coming, or should we wait for someone else?'"
I swear by the Lord who saves Israel, that even if my own son Jonathan sinned, he must die.
Saul quickly fell to the ground and lay stretched out there.


In [27]:
# Build a table of false positives for the PERSON entity and show only the
# text, token and prediction columns.
# NOTE(review): the saved output also lists LOCATION predictions - confirm how
# `entity` filters the errors, or whether the saved output is stale.
fps_df = ModelEvaluator.get_fps_dataframe(errors,entity='PERSON')
fps_df[['full_text','token','prediction']]

Unnamed: 0,full_text,token,prediction
0,Pero Stolar will be back next week at a specia...,Eastern,PERSON
1,Our Rabbis teach us that committing a crime in...,Torah,PERSON
2,"The result, ""Poodle Springs"" (Newmark & Lewis,...",Marlowe,PERSON
3,If your point is that it was done to goad Japa...,Japan,LOCATION
4,If your point is that it was done to goad Japa...,Indochina,LOCATION
...,...,...,...
1905,"And Elijah, the woman, and her son had enough ...",Elijah,PERSON
1906,I would love to hitch a ride from the hotel to...,PreParty,PERSON
1907,I would love to hitch a ride from the hotel to...,Wednesay,PERSON
1908,He thinks most of the noise complaints origina...,Bill,PERSON


In [24]:
# Build a table of false negatives for the PERSON entity; in the saved output
# these are tokens annotated PERSON that were predicted as 'O'.
fns_df = ModelEvaluator.get_fns_dataframe(errors,entity='PERSON')
fns_df

Unnamed: 0,error_type,annotation,prediction,token,full_text,Gender,NameSet,Country,Lowercase,Template#
0,FN,PERSON,O,Landmark'schecks,An initial batch of Mr. Caroline Landmark'sche...,female,Norwegian,Middle-East,False,7900
1,FN,PERSON,O,Thaís,Although Thaís Correia'strip would seem to hav...,female,Czech,Bolivia,False,36146
2,FN,PERSON,O,Correia'strip,Although Thaís Correia'strip would seem to hav...,female,Czech,Bolivia,False,36146
3,FN,PERSON,O,Efimiya,"According to Efimiya Dubinina, deputy secretar...",female,Russian,Ethiopia,False,75227
4,FN,PERSON,O,Dubinina,"According to Efimiya Dubinina, deputy secretar...",female,Russian,Ethiopia,False,75227
...,...,...,...,...,...,...,...,...,...,...
252,FN,PERSON,O,Asu,A third assistant principal Asu Vizirov was ki...,male,Chechen (Latin),Iraq,False,8636
253,FN,PERSON,O,Vizirov,A third assistant principal Asu Vizirov was ki...,male,Chechen (Latin),Iraq,False,8636
254,FN,PERSON,O,Lafontaine'sThe,The transaction called for Mr. Valérie Lafonta...,female,French,Bangladesh,False,15276
255,FN,PERSON,O,zorislav,it is worth noting that zorislav ivanković imm...,male,Croatian,Somalia,True,63633


In [35]:
# Spot-check one raw sample from the original dataset (full text, spans, tokens, tags).
input_samples[100]

Full text: "But advertisers wouldn't think of it," she said.
Spans: []
Tokens: [", But, advertisers, would, n't, think, of, it, ,, ", she, said, .]
Tags: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']