In [29]:
from collections import Counter

from presidio_evaluator import ModelEvaluator
from presidio_evaluator import PresidioAPIEvaluator
from presidio_evaluator.data_generator import read_synth_dataset

# Enable the autoreload extension so edits to imported modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

# URL of the Presidio analyze endpoint that is evaluated below; it is passed to
# PresidioAPIEvaluator as `endpoint`. Change it to point at your own instance.
MY_PRESIDIO_ENDPOINT = "http://localhost:8080/api/v1/projects/test/analyze"

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Evaluate your Presidio instance via the Presidio API

#### A. Read dataset for evaluation

In [30]:
# Load the evaluation samples from the OntoNotes test file and report how many were read.
input_samples = read_synth_dataset("../data/OntoNotes/ontonotes_test.json")
sample_count = len(input_samples)
print("Read {} samples".format(sample_count))

Read 1986 samples


#### B. Descriptive statistics

In [31]:
def flatten(l):
    """Concatenate the items of every sub-iterable in ``l`` into one flat list."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat


# Tally how often each entity type is annotated across all input samples.
all_spans = flatten([input_sample.spans for input_sample in input_samples])
count_per_entity = Counter(span.entity_type for span in all_spans)
count_per_entity

Counter({'PERSON': 349,
         'LOCATION': 434,
         'ORGANIZATION': 381,
         'NATIONALITY': 80,
         'NATION_PLURAL': 8,
         'MALE_TITLE': 5,
         'NATION_MAN': 3})

#### C. Match the dataset's entity names with Presidio's entity names

In [32]:
# Mapping between dataset entities and Presidio entities.
# Key: dataset entity, value: Presidio entity.
# NOTE(review): 'DATE' and 'DOMAIN' map to 'DATE_TIME' and 'DOMAIN', neither of
# which is listed in presidio_fields below -- confirm this is intended.
entities_mapping = {
    "PERSON": "PERSON",
    "EMAIL": "EMAIL_ADDRESS",
    "CREDIT_CARD": "CREDIT_CARD",
    "FIRST_NAME": "PERSON",
    "PHONE_NUMBER": "PHONE_NUMBER",
    "LOCATION": "LOCATION",
    "BIRTHDAY": "BIRTHDAY",
    "DATE": "DATE_TIME",
    "DOMAIN": "DOMAIN",
    "CITY": "LOCATION",
    "ADDRESS": "LOCATION",
    "IBAN": "IBAN_CODE",
    "URL": "DOMAIN_NAME",
    "US_SSN": "US_SSN",
    "IP_ADDRESS": "IP_ADDRESS",
    "ORGANIZATION": "ORG",
    "O": "O",
}

# Presidio entity names handed to the alignment call as its third argument
# (presumably the entity types to keep in the aligned dataset -- TODO confirm).
presidio_fields = [
    "CREDIT_CARD",
    "CRYPTO",
    "BIRTHDAY",
    "DOMAIN_NAME",
    "EMAIL_ADDRESS",
    "IBAN_CODE",
    "ORG",
    "IP_ADDRESS",
    "NRP",
    "LOCATION",
    "PERSON",
    "PHONE_NUMBER",
    "US_SSN",
]

# Produce a copy of the samples whose entity names follow Presidio's naming.
new_list = ModelEvaluator.align_input_samples_to_presidio_analyzer(
    input_samples, entities_mapping, presidio_fields
)

#### D. Recalculate statistics on updated dataset

In [33]:
# Re-count the entity types on the aligned samples to see what the mapping left in place.
count_per_entity_new = Counter(
    span.entity_type
    for input_sample in new_list
    for span in input_sample.spans
)
count_per_entity_new

Counter({'PERSON': 349, 'LOCATION': 434, 'ORG': 381})

#### E. Run the presidio-evaluator framework with Presidio's API as the 'model' under test

In [34]:
# Evaluate the Presidio REST API as the model under test.
# PresidioAPIEvaluator is imported with the other imports in the notebook's first cell.
# `entities_to_keep` is restricted to the entity types present in the aligned dataset.
presidio = PresidioAPIEvaluator(
    all_fields=False,
    entities_to_keep=list(count_per_entity_new.keys()),
    endpoint=MY_PRESIDIO_ENDPOINT,
)

# Sends the aligned samples through the evaluator; this calls the endpoint above,
# so the Presidio service must be reachable when this cell runs.
# NOTE: the misspelled name `evaluted_samples` is kept because a later cell reads it.
evaluted_samples = presidio.evaluate_all(new_list)

Evaluating <class 'presidio_evaluator.presidio_api_evaluator.PresidioAPIEvaluator'>:  16%|█▌        | 314/1986 [00:35<03:12,  8.68it/s]

Annotation and prediction do not have thesame length. Sample=Full text: FEDERAL NATIONAL MORTGAGE ASSOCIATION (Evans-Miller): Posted yields on 30 year mortgage commitments for delivery within 30 days (priced at par) 9.78%, standard conventional fixed - rate mortgages ; 8.75%, 6/2 rate capped one - year adjustable rate mortgages.
Spans: [Type: ORG, value: Evans-Miller, start: 39, end: 51]
Tokens: [FEDERAL, NATIONAL, MORTGAGE, ASSOCIATION, (, Evans, -, Miller, ), :, Posted, yields, on, 30, year, mortgage, commitments, for, delivery, within, 30, days, (, priced, at, par, ), 9.78, %, ,, standard, conventional, fixed, -, rate, mortgages, ;, 8.75, %, ,, 6/2, rate, capped, one, -, year, adjustable, rate, mortgages, .]
Tags: ['O', 'O', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'L-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']



Evaluating <class 'presidio_evaluator.presidio_api_evaluator.PresidioAPIEvaluator'>:  27%|██▋       | 546/1986 [01:00<03:09,  7.60it/s]

Annotation and prediction do not have thesame length. Sample=Full text: (lolo): You were not educated in a countryside environment, but the fact you could have made such uncivilized remark shows that your education background (lolo): was not that good either.
Spans: []
Tokens: [(, lolo, ), :, You, were, not, educated, in, a, countryside, environment, ,, but, the, fact, you, could, have, made, such, uncivilized, remark, shows, that, your, education, background, (, lolo, ), :, was, not, that, good, either, .]
Tags: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']



Evaluating <class 'presidio_evaluator.presidio_api_evaluator.PresidioAPIEvaluator'>:  48%|████▊     | 957/1986 [01:45<01:28, 11.68it/s]

Annotation and prediction do not have thesame length. Sample=Full text: The person in the apartment upstairs is (crescent): a pervert.
Spans: []
Tokens: [The, person, in, the, apartment, upstairs, is, (, crescent, ), :, a, pervert, .]
Tags: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']



Evaluating <class 'presidio_evaluator.presidio_api_evaluator.PresidioAPIEvaluator'>:  52%|█████▏    | 1038/1986 [01:54<01:30, 10.47it/s]

Annotation and prediction do not have thesame length. Sample=Full text: Kurdistan INTERBANK OFFERED RATES (LIBOR): 8 11/16% one month ; 8 11/16% three months ; 8 7/16% six months ; 8 3/8% one year.
Spans: [Type: LOCATION, value: Kurdistan, start: 0, end: 9]
Tokens: [Kurdistan, INTERBANK, OFFERED, RATES, (, LIBOR, ), :, 8, 11/16, %, one, month, ;, 8, 11/16, %, three, months, ;, 8, 7/16, %, six, months, ;, 8, 3/8, %, one, year, .]
Tags: ['U-LOCATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']



Evaluating <class 'presidio_evaluator.presidio_api_evaluator.PresidioAPIEvaluator'>:  53%|█████▎    | 1046/1986 [01:55<01:42,  9.14it/s]

Annotation and prediction do not have thesame length. Sample=Full text: the most interesting thing is that she first went to rutgers ( in this discipline it'sa second - (unameme) tier (unameme): school ), but she didn't get tenure.
Spans: []
Tokens: [the, most, interesting, thing, is, that, she, first, went, to, rutgers, (, in, this, discipline, it'sa, second, -, (, unameme, ), tier, (, unameme, ), :, school, ), ,, but, she, did, n't, get, tenure, .]
Tags: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']



Evaluating <class 'presidio_evaluator.presidio_api_evaluator.PresidioAPIEvaluator'>:  57%|█████▋    | 1129/1986 [02:03<01:48,  7.92it/s]

Annotation and prediction do not have thesame length. Sample=Full text: (Aleusis): However, the characteristic of half on and half off in a soccer match could also be referred to.
Spans: []
Tokens: [(, Aleusis, ), :, However, ,, the, characteristic, of, half, on, and, half, off, in, a, soccer, match, could, also, be, referred, to, .]
Tags: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']



Evaluating <class 'presidio_evaluator.presidio_api_evaluator.PresidioAPIEvaluator'>:  65%|██████▌   | 1294/1986 [02:21<01:09,  9.90it/s]

Annotation and prediction do not have thesame length. Sample=Full text: My hubby is somewhat like a kid, thoughtless, and fond of playing, (Dreamer): and does not like to study.
Spans: []
Tokens: [My, hubby, is, somewhat, like, a, kid, ,, thoughtless, ,, and, fond, of, playing, ,, (, Dreamer, ), :, and, does, not, like, to, study, .]
Tags: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']



Evaluating <class 'presidio_evaluator.presidio_api_evaluator.PresidioAPIEvaluator'>:  67%|██████▋   | 1331/1986 [02:24<01:11,  9.14it/s]

Annotation and prediction do not have thesame length. Sample=Full text: (duckle): It is my belief that the Rockets can not just have one set of tactics, for example only revolving around Yao Ming or revolving around McGrady when they play.
Spans: []
Tokens: [(, duckle, ), :, It, is, my, belief, that, the, Rockets, can, not, just, have, one, set, of, tactics, ,, for, example, only, revolving, around, Yao, Ming, or, revolving, around, McGrady, when, they, play, .]
Tags: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']



Evaluating <class 'presidio_evaluator.presidio_api_evaluator.PresidioAPIEvaluator'>:  68%|██████▊   | 1341/1986 [02:25<01:00, 10.71it/s]

Annotation and prediction do not have thesame length. Sample=Full text: (trunkslu): i have come to cry and vent.
Spans: []
Tokens: [(, trunkslu, ), :, i, have, come, to, cry, and, vent, .]
Tags: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']



Evaluating <class 'presidio_evaluator.presidio_api_evaluator.PresidioAPIEvaluator'>:  94%|█████████▍| 1865/1986 [03:20<00:13,  8.68it/s]

Annotation and prediction do not have thesame length. Sample=Full text: but it seems to me like I gravitated towards 1-tooth diffs at the upper end.
Spans: []
Tokens: [but, it, seems, to, me, like, I, gravitated, towards, 1-tooth, diffs, at, the, upper, end, .]
Tags: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']



Evaluating <class 'presidio_evaluator.presidio_api_evaluator.PresidioAPIEvaluator'>: 100%|██████████| 1986/1986 [03:33<00:00,  9.32it/s]


#### F. Extract statistics
- Precision, recall and F measure are calculated based on a PII/Not PII binary classification per token.
- Specific entity recall and precision are calculated on the specific PII entity level.

In [35]:
# Compute the score object from the evaluated samples produced in step E.
evaluation_result = presidio.calculate_score(evaluted_samples)

In [36]:
# Print the precision/recall table (per entity and overall PII) and the PII F measure.
evaluation_result.print()

                        Entity                     Precision                        Recall
                        PERSON                        93.29%                        99.71%
                      LOCATION                        65.45%                        95.90%
                           ORG                        80.52%                        93.99%
                           PII                        80.81%                        98.27%
PII F measure: 0.8869060896720945


#### G. Analyze wrong predictions

In [11]:
# Errors recorded on the evaluation result; the analysis cells below read them.
errors = evaluation_result.model_errors

In [12]:
# Show the five most frequent false-positive tokens with an example sentence for each.
ModelEvaluator.most_common_fp_tokens(
    errors,
    n=5,
)

Most common false positive tokens:
[('-', 10), ('Israel', 9), ('American', 8), ('Qingdao', 4), ('South', 4)]
Example sentence with each FP token:
Christ came and brought the message of peace to you non-Algerian who were far away from God.
Then all Israel, from Dan to Beersheba, knew that Samuel was a true prophet of the Lord.
According to the survey forecast, in 2007 American people will spend more than 3,210 hours on surfing the Internet, watching TV, reading newspapers, and listening to music.
(dasanicool) However, ordinary Qingdao residents do not see the benefits only that South Koreans have brought.
(dasanicool) However, ordinary Qingdao residents do not see the benefits only that South Koreans have brought.


In [11]:
# False-positive errors for the ORG entity, collected for inspection.
fps_df = ModelEvaluator.get_fps_dataframe(
    errors,
    entity="ORG",
)
fps_df

No errors of type FP and entity ORG were found


In [12]:
# False-negative errors for the ORG entity, collected for inspection.
fns_df = ModelEvaluator.get_fns_dataframe(
    errors,
    entity="ORG",
)
fns_df

Unnamed: 0,error_type,annotation,prediction,token,full_text,Gender,NameSet,Country,Lowercase,Template#
0,FN,ORG,O,Lenddo,Yet some dealers have turned down Lenddo or Ci...,female,Dutch,Nepal,False,59813
1,FN,ORG,O,Cityscan,Yet some dealers have turned down Lenddo or Ci...,female,Dutch,Nepal,False,59813
2,FN,ORG,O,Sky,Sky News brought in Indira Khadzhiyev as a par...,female,Chechen (Latin),Armenia,False,73824
3,FN,ORG,O,News,Sky News brought in Indira Khadzhiyev as a par...,female,Chechen (Latin),Armenia,False,73824
4,FN,ORG,O,Connectdot,Connectdot Llc just last year started selling ...,male,Australian,Guyana,False,5272
...,...,...,...,...,...,...,...,...,...,...
3601,FN,ORG,O,Transaction,"But the editorial, by Santino Cadena of Govern...",male,Hispanic,Gambia,False,2736
3602,FN,ORG,O,Services,"But the editorial, by Santino Cadena of Govern...",male,Hispanic,Gambia,False,2736
3603,Wrong entity,ORG,PERSON,Sap,Senior Tuvaluan officials telling Sap that the...,female,Japanese (Anglicized),United States Of America (Usa),False,73103
3604,FN,ORG,O,Castle,actually he'son a summer internship with Castl...,male,French,Mozambique,False,102785


In [19]:
# Inspect one raw sample. The requested position is clamped to the last sample so
# the cell still runs when the loaded dataset has fewer than 6002 samples; a fixed
# index of 6001 raises IndexError on a smaller dataset.
sample_index = min(6001, len(input_samples) - 1)
input_samples[sample_index]

Full text: Other analysts were more bullish, even though the company is expected to shrink to slightly more than half its current size in sales.
Spans: []
Tokens: [Other, analysts, were, more, bullish, ,, even, though, the, company, is, expected, to, shrink, to, slightly, more, than, half, its, current, size, in, sales, .]
Tags: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']