In [63]:
from presidio_evaluator.data_generator import read_synth_dataset
from presidio_evaluator import ModelEvaluator
from collections import Counter

%load_ext autoreload
%autoreload 2

MY_PRESIDIO_ENDPOINT = "http://localhost:8080/api/v1/projects/test/analyze"

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Evaluate your Presidio instance via the Presidio API

#### A. Read dataset for evaluation

In [92]:
input_samples = read_synth_dataset("../data/Synth/synth_test.json")
print("Read {} samples".format(len(input_samples)))

Read 60 samples


#### B. Descriptive statistics

In [93]:
flatten = lambda l: [item for sublist in l for item in sublist]

count_per_entity = Counter([span.entity_type for span in flatten([input_sample.spans for input_sample in input_samples])])
count_per_entity

Counter({'CREDIT_CARD': 12,
         'PHONE_NUMBER': 4,
         'PERSON': 31,
         'URL': 4,
         'LOCATION': 16,
         'IBAN': 1,
         'BIRTHDAY': 2})

#### C. Match the dataset's entity names with Presidio's entity names

In [94]:
# Mapping between dataset entities and Presidio entities. Key: Dataset entity, Value: Presidio entity
entities_mapping = {
    'PERSON': 'PERSON',
    'CREDIT_CARD': 'CREDIT_CARD',
    'LOCATION':'LOCATION',
    'ORGANIZATION':'ORG',
    'US_SSN': 'US_SSN',
    'EMAIL': 'EMAIL_ADDRESS',
    'BIRTHDAY': 'BIRTHDAY',
    'URL': 'DOMAIN_NAME',
    'PHONE_NUMBER': 'PHONE_NUMBER',
    'IP_ADDRESS': 'IP_ADDRESS',
    'IBAN': 'IBAN_CODE',
    'URL': 'URL',
    'O': 'O'
}
presidio_fields = ['CREDIT_CARD', 'CRYPTO', 'BIRTHDAY', 'DOMAIN_NAME', 'EMAIL_ADDRESS', 'IBAN_CODE', 'ORG', 
                   'IP_ADDRESS', 'NRP', 'LOCATION', 'PERSON', 'PHONE_NUMBER', 'US_SSN']

new_list = ModelEvaluator.align_input_samples_to_presidio_analyzer(input_samples,
                                                                   entities_mapping,
                                                                   presidio_fields)

#### D. Recalculate statistics on updated dataset

In [88]:
## recheck counter
count_per_entity_new = Counter([span.entity_type for span in flatten([input_sample.spans for input_sample in new_list])])
count_per_entity_new

Counter({'PERSON': 174,
         'CREDIT_CARD': 49,
         'LOCATION': 75,
         'ORG': 48,
         'US_SSN': 1,
         'EMAIL_ADDRESS': 11,
         'BIRTHDAY': 4,
         'URL': 8,
         'PHONE_NUMBER': 9,
         'IP_ADDRESS': 3,
         'IBAN_CODE': 3})

#### E. Run the presidio-evaluator framework with Presidio's API as the 'model' at test

In [89]:
from presidio_evaluator import PresidioAPIEvaluator
presidio = PresidioAPIEvaluator(all_fields=False, entities_to_keep=list(count_per_entity_new.keys()),endpoint=MY_PRESIDIO_ENDPOINT)
evaluted_samples = presidio.evaluate_all(new_list)

Evaluating <class 'presidio_evaluator.presidio_api_evaluator.PresidioAPIEvaluator'>: 100%|██████████| 299/299 [00:31<00:00,  9.37it/s]


#### F. Extract statistics
- Presicion, recall and F measure are calculated based on a PII/Not PII binary classification per token.
- Specific entity recall and precision are calculated on the specific PII entity level.

In [90]:
evaluation_result = presidio.calculate_score(evaluted_samples)

In [95]:
evaluation_result.print()

                        Entity                     Precision                        Recall
                        PERSON                        90.91%                        74.73%
                   CREDIT_CARD                       100.00%                       100.00%
                      LOCATION                        62.25%                        31.02%
                           ORG                        60.51%                        99.16%
                        US_SSN                       100.00%                       100.00%
                 EMAIL_ADDRESS                       100.00%                        27.27%
                      BIRTHDAY                       100.00%                       100.00%
                           URL                          nan%                          nan%
                  PHONE_NUMBER                       100.00%                        27.78%
                    IP_ADDRESS                       100.00%                       100.00%

#### G. Analyze wrong predictions

In [96]:
errors = evaluation_result.model_errors

In [97]:
ModelEvaluator.most_common_fp_tokens(errors,n=5)

Most common false positive tokens:
[('American', 7), ('Blink-182', 6), ('pay', 6), ('Fuse', 6), ('TV', 6)]
Example sentence with each FP token:
From the film American graffiti (also features Ilya Bodrov. What's not to love?
Blink-182 pay tribute here to the Mozambique. Producer Devin Berg explained to Fuse TV: "We all liked the idea of writing a song about our state, where we live and love. To me it's the most beautiful place in the world, this song was us giving credit to how lucky we are to have lived here and grown up here, raising families here, the whole thing."
Blink-182 pay tribute here to the Mozambique. Producer Devin Berg explained to Fuse TV: "We all liked the idea of writing a song about our state, where we live and love. To me it's the most beautiful place in the world, this song was us giving credit to how lucky we are to have lived here and grown up here, raising families here, the whole thing."
Blink-182 pay tribute here to the Mozambique. Producer Devin Berg explained 

In [98]:
fps_df = ModelEvaluator.get_fps_dataframe(errors,entity='PERSON')
fps_df

Unnamed: 0,error_type,annotation,prediction,token,full_text,Gender,NameSet,Country,Lowercase,Template#
0,FP,O,PERSON,from,I would like to remove my kid Ryley from the w...,male,Scottish,Hungary,False,45
1,FP,O,PERSON,the,I would like to remove my kid Ryley from the w...,male,Scottish,Hungary,False,45
2,FP,O,PERSON,will,I would like to remove my kid Ryley from the w...,male,Scottish,Hungary,False,45
3,FP,O,PERSON,.,I would like to remove my kid Ryley from the w...,male,Scottish,Hungary,False,45
4,FP,O,PERSON,How,I would like to remove my kid Ryley from the w...,male,Scottish,Hungary,False,45
5,FP,O,PERSON,do,I would like to remove my kid Ryley from the w...,male,Scottish,Hungary,False,45
6,FP,O,PERSON,I,I would like to remove my kid Ryley from the w...,male,Scottish,Hungary,False,45
7,FP,O,PERSON,do,I would like to remove my kid Ryley from the w...,male,Scottish,Hungary,False,45
8,FP,O,PERSON,that,I would like to remove my kid Ryley from the w...,male,Scottish,Hungary,False,45
9,FP,O,PERSON,?,I would like to remove my kid Ryley from the w...,male,Scottish,Hungary,False,45


In [99]:
fns_df = ModelEvaluator.get_fns_dataframe(errors,entity='PERSON')
fns_df

Unnamed: 0,error_type,annotation,prediction,token,full_text,Gender,NameSet,Country,Lowercase,Template#
0,FN,PERSON,O,Erik,My name is Erik Baader but everyone calls me Erik,male,German,Slovakia,False,58
1,FN,PERSON,O,Souza,"Unlike the Souza novel, it's not about necroph...",female,Brazil,Mauritania,False,96
2,Wrong entity,PERSON,LOCATION,Avtorhan,"Avtorhan, can I please speak to your boss?",male,Chechen (Latin),Portugal,False,37
3,Wrong entity,PERSON,LOCATION,george,"george, can i please speak to your boss?",male,Australian,Djibouti,True,37
4,FN,PERSON,O,Aron,Aron is a very sympathetic person. He's also a...,male,Scottish,Armenia,False,87
...,...,...,...,...,...,...,...,...,...,...
66,FN,PERSON,O,Szemere,You can tell Szemere was a huge Szemere Szakác...,male,Hungarian,Grenada,False,105
67,Wrong entity,PERSON,LOCATION,Signe,The true gender of Signe has been under debate...,female,Danish,Middle East,False,94
68,FN,PERSON,O,Monika,You can tell Monika was a huge Monika Chocholo...,female,Czech,The Philippines,False,105
69,Wrong entity,PERSON,LOCATION,Mazzi,This song by ex-Zombie Mazzi is a perfect exam...,male,Italian,Panama,False,106


In [15]:
input_samples[100]

Full text: What's your credit card? 4929218835001304
Spans: [Type: CREDIT_CARD, value: 4929218835001304, start: 25, end: 41]
Tokens: [What, 's, your, credit, card, ?, 4929218835001304]
Tags: ['O', 'O', 'O', 'O', 'O', 'O', 'U-CREDIT_CARD']