In [2]:
from presidio_evaluator.data_generator import read_synth_dataset
from presidio_evaluator import ModelEvaluator
from collections import Counter

%load_ext autoreload
%autoreload 2

# URL of the Presidio analyze endpoint; passed as `endpoint=` to PresidioAPIEvaluator below.
# Points at localhost, so adjust it to match your own Presidio deployment.
MY_PRESIDIO_ENDPOINT = "http://localhost:8080/api/v1/projects/test/analyze"

## Evaluate your Presidio instance via the Presidio API

#### A. Read dataset for evaluation

In [3]:
# Load the synthetic evaluation samples from the JSON dataset file.
input_samples = read_synth_dataset(
    "../data/conll_size_10000_date_November_29_2020.json"
)

# Report how many samples were loaded.
num_samples = len(input_samples)
print("Read {} samples".format(num_samples))

Read 10000 samples


#### B. Descriptive statistics

In [5]:
def flatten(l):
    """Return one flat list holding the items of every sub-iterable in `l`, in order."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat


# Tally how many annotated spans of each entity type the input samples contain.
count_per_entity = Counter(
    span.entity_type
    for span in flatten(input_sample.spans for input_sample in input_samples)
)
count_per_entity

Counter({'NATIONALITY': 1757,
         'ORGANIZATION': 3527,
         'PERSON': 4249,
         'LOCATION': 4809,
         'NATION_PLURAL': 91,
         'NATION_MAN': 64,
         'FEMALE_TITLE': 9,
         'MALE_TITLE': 6})

#### C. Match the dataset's entity names with Presidio's entity names

In [6]:
# Mapping between dataset entities and Presidio entities. Key: Dataset entity, Value: Presidio entity
# Translation table: dataset entity name (key) -> Presidio entity name (value).
# NOTE(review): 'DOMAIN' maps to 'DOMAIN' and 'DATE' maps to 'DATE_TIME', neither of
# which appears in `presidio_fields` below -- confirm these targets are intended.
entities_mapping = {
    'PERSON': 'PERSON',
    'EMAIL': 'EMAIL_ADDRESS',
    'CREDIT_CARD': 'CREDIT_CARD',
    'FIRST_NAME': 'PERSON',
    'PHONE_NUMBER': 'PHONE_NUMBER',
    'LOCATION': 'LOCATION',
    'BIRTHDAY': 'BIRTHDAY',
    'DATE': 'DATE_TIME',
    'DOMAIN': 'DOMAIN',
    'CITY': 'LOCATION',
    'ADDRESS': 'LOCATION',
    'IBAN': 'IBAN_CODE',
    'URL': 'DOMAIN_NAME',
    'US_SSN': 'US_SSN',
    'IP_ADDRESS': 'IP_ADDRESS',
    'ORGANIZATION': 'ORG',
    'O': 'O',
}

# Presidio entity names handed to the alignment step as its third argument
# (presumably the entity types to retain -- verify against ModelEvaluator).
presidio_fields = [
    'CREDIT_CARD',
    'CRYPTO',
    'BIRTHDAY',
    'DOMAIN_NAME',
    'EMAIL_ADDRESS',
    'IBAN_CODE',
    'ORG',
    'IP_ADDRESS',
    'NRP',
    'LOCATION',
    'PERSON',
    'PHONE_NUMBER',
    'US_SSN',
]

# Produce a copy of the samples whose entity names follow Presidio's naming.
new_list = ModelEvaluator.align_input_samples_to_presidio_analyzer(
    input_samples, entities_mapping, presidio_fields
)

#### D. Recalculate statistics on updated dataset

In [7]:
# Re-count spans per entity type, this time over the aligned samples in `new_list`.
count_per_entity_new = Counter(
    span.entity_type
    for aligned_sample in new_list
    for span in aligned_sample.spans
)
count_per_entity_new

Counter({'ORG': 3527, 'PERSON': 4249, 'LOCATION': 4809})

#### E. Run the presidio-evaluator framework with Presidio's API as the 'model' under test

In [8]:
from presidio_evaluator import PresidioAPIEvaluator

# Restrict the evaluator to the entity types present in the aligned dataset
# and point it at the configured Presidio endpoint.
presidio = PresidioAPIEvaluator(
    all_fields=False,
    entities_to_keep=list(count_per_entity_new),
    endpoint=MY_PRESIDIO_ENDPOINT,
)

# Run the evaluator over every aligned sample.
evaluted_samples = presidio.evaluate_all(new_list)

Evaluating <class 'presidio_evaluator.presidio_api_evaluator.PresidioAPIEvaluator'>:   0%|          | 0/10000 [00:00<?, ?it/s]

loading model en_core_web_lg


Evaluating <class 'presidio_evaluator.presidio_api_evaluator.PresidioAPIEvaluator'>:  11%|█         | 1055/10000 [02:43<23:08,  6.44it/s]


KeyboardInterrupt: 

#### F. Extract statistics
- Precision, recall and F-measure are calculated based on a PII/Not PII binary classification per token.
- Specific entity recall and precision are calculated on the specific PII entity level.

In [74]:
# Score the samples returned by `presidio.evaluate_all` (stored in `evaluted_samples` [sic]).
evaluation_result = presidio.calculate_score(evaluted_samples)

In [75]:
# Print the per-entity precision / recall table.
evaluation_result.print()

                        Entity                     Precision                        Recall
                        PERSON                        97.74%                        92.17%
                   CREDIT_CARD                       100.00%                       100.00%
                      LOCATION                        90.67%                        22.44%
                           ORG                          nan%                         0.00%
                        US_SSN                       100.00%                       100.00%
                 EMAIL_ADDRESS                       100.00%                       100.00%
                      BIRTHDAY                       100.00%                       100.00%
                   DOMAIN_NAME                          nan%                          nan%
                  PHONE_NUMBER                       100.00%                        27.78%
                    IP_ADDRESS                       100.00%                       100.00%

#### G. Analyze wrong predictions

In [76]:
# Keep the model errors from the evaluation result for the error-analysis cells below.
errors = evaluation_result.model_errors

In [77]:
# Show the most common false-positive tokens (n=5), with an example sentence for each.
ModelEvaluator.most_common_fp_tokens(errors,n=5)

Most common false positive tokens:
[('Texas', 4), ("'s", 2), ('Southern', 1), ('the', 1)]
Example sentence with each FP token:
Celebrating its 10th year in Villa de Ves, Trak Auto is a 501(c)3 that invites songwriters from around the world to Texas to share the universal language of music in collaborations designed to bridge cultures, build friendships and cultivate peace.
When they weren't singing about Hobbits, satanic felines and interstellar journeys, they were singing about the verses from Marsilius Chenard's Cautionary Tales. Is there a better example of unbridled creativity than early Chenard?
The Home Centers Orchestra was founded in 1929. Since then, the TSO has grown from a volunteer community orchestra to a fully professional orchestra serving Southern Dominica
Blink-182 pay tribute here to the United Arab Emirates. Producer Jaroslava Sedláčková explained to Fuse TV: "We all liked the idea of writing a song about our state, where we live and love. To me it's the most beautif

In [78]:
# Collect the false-positive (FP) errors for the PHONE_NUMBER entity and display them.
fps_df = ModelEvaluator.get_fps_dataframe(errors,entity='PHONE_NUMBER')
fps_df

No errors of type FP and entity PHONE_NUMBER were found


In [79]:
# Collect the false-negative (FN) errors for the PHONE_NUMBER entity and display them.
fns_df = ModelEvaluator.get_fns_dataframe(errors,entity='PHONE_NUMBER')
fns_df

Unnamed: 0,error_type,annotation,prediction,token,full_text,Gender,NameSet,Country,Lowercase,Template#
0,FN,PHONE_NUMBER,O,99,Can someone call me on 99 702542? I have some ...,male,Hungarian,Guinea,False,55
1,FN,PHONE_NUMBER,O,702542,Can someone call me on 99 702542? I have some ...,male,Hungarian,Guinea,False,55
2,FN,PHONE_NUMBER,O,0477,Can someone call me on 0477 99 13 51? I have s...,female,Czech,Mauritania,False,55
3,FN,PHONE_NUMBER,O,99,Can someone call me on 0477 99 13 51? I have s...,female,Czech,Mauritania,False,55
4,FN,PHONE_NUMBER,O,13,Can someone call me on 0477 99 13 51? I have s...,female,Czech,Mauritania,False,55
5,FN,PHONE_NUMBER,O,51,Can someone call me on 0477 99 13 51? I have s...,female,Czech,Mauritania,False,55
6,FN,PHONE_NUMBER,O,026,I would like to stop receiving messages to 026...,male,Hungarian,Mongolia,False,43
7,FN,PHONE_NUMBER,O,848,I would like to stop receiving messages to 026...,male,Hungarian,Mongolia,False,43
8,FN,PHONE_NUMBER,O,14,I would like to stop receiving messages to 026...,male,Hungarian,Mongolia,False,43
9,FN,PHONE_NUMBER,O,90,I would like to stop receiving messages to 026...,male,Hungarian,Mongolia,False,43


In [15]:
# Inspect a single raw input sample (index 100): its text, spans, tokens and tags.
input_samples[100]

Full text: What's your credit card? 4929218835001304
Spans: [Type: CREDIT_CARD, value: 4929218835001304, start: 25, end: 41]
Tokens: [What, 's, your, credit, card, ?, 4929218835001304]
Tags: ['O', 'O', 'O', 'O', 'O', 'O', 'U-CREDIT_CARD']