In [1]:
from presidio_evaluator.data_generator import read_synth_dataset
from presidio_evaluator import ModelEvaluator
from collections import Counter

# Enable autoreload so that edits to imported project modules are picked up live.
%load_ext autoreload
%autoreload 2

# URL of the Presidio analyze endpoint to evaluate; it is handed to PresidioAPIEvaluator
# in section E. It points at a local instance -- change it to target another deployment.
MY_PRESIDIO_ENDPOINT = "http://localhost:8080/api/v1/projects/test/analyze"

## Evaluate your Presidio instance via the Presidio API

#### A. Read dataset for evaluation

In [2]:
# Load the evaluation samples from the OntoNotes test file (path is relative to the
# notebook's working directory).
input_samples = read_synth_dataset("../data/OntoNotes/ontonotes_test.json")
# Report how many samples were read as a quick sanity check.
print("Read {} samples".format(len(input_samples)))

Read 1986 samples


#### B. Descriptive statistics

In [3]:
def flatten(l):
    """Flatten one level of nesting.

    Returns a new list holding the items of every sublist of ``l``, in their
    original order. The parameter keeps its original name ``l`` so existing
    calls continue to work unchanged.
    """
    return [item for sublist in l for item in sublist]

# Tally how many annotated spans of each entity type the loaded samples contain.
count_per_entity = Counter(
    annotated_span.entity_type
    for sample in input_samples
    for annotated_span in sample.spans
)
count_per_entity

Counter({'PERSON': 349,
         'LOCATION': 434,
         'ORGANIZATION': 381,
         'NATIONALITY': 80,
         'NATION_PLURAL': 8,
         'MALE_TITLE': 5,
         'NATION_MAN': 3})

#### C. Match the dataset's entity names with Presidio's entity names

In [4]:
# Translation table from the dataset's entity labels (keys) to Presidio's entity labels (values).
# Only the labels listed here are mapped; in the run recorded in this notebook, the
# statistics recomputed in section D contain just PERSON, LOCATION and ORG.
entities_mapping = {
    "ORGANIZATION": "ORG",
    "PERSON": "PERSON",
    "LOCATION": "LOCATION",
    "O": "O",
}

# Align the samples' entity labels with Presidio's via the mapping; the result is kept
# under its own name and is what the evaluation below consumes.
new_list = ModelEvaluator.align_input_samples_to_presidio_analyzer(
    input_samples, entities_mapping
)

#### D. Recalculate statistics on updated dataset

In [5]:
# Recount spans per entity type, this time on the aligned samples.
count_per_entity_new = Counter(
    aligned_span.entity_type
    for aligned_sample in new_list
    for aligned_span in aligned_sample.spans
)
count_per_entity_new

Counter({'PERSON': 349, 'LOCATION': 434, 'ORG': 381})

#### E. Run the presidio-evaluator framework with Presidio's API as the model under test

In [6]:
from presidio_evaluator import PresidioAPIEvaluator
# Set up the Presidio API as the model under test, keeping only the entity types that
# occur in the aligned dataset.
presidio = PresidioAPIEvaluator(
    all_fields=False,
    entities_to_keep=list(count_per_entity_new),
    endpoint=MY_PRESIDIO_ENDPOINT,
)
# NOTE: the result name is spelled "evaluted_samples"; the scoring cell in section F
# refers to it by this exact spelling.
evaluted_samples = presidio.evaluate_all(new_list)

Evaluating <class 'presidio_evaluator.presidio_api_evaluator.PresidioAPIEvaluator'>:   0%|          | 0/1986 [00:00<?, ?it/s]

loading model en_core_web_trf


Evaluating <class 'presidio_evaluator.presidio_api_evaluator.PresidioAPIEvaluator'>:  16%|█▌        | 313/1986 [01:31<09:11,  3.03it/s]

Annotation and prediction do not have thesame length. Sample=Full text: FEDERAL NATIONAL MORTGAGE ASSOCIATION (Evans-Miller): Posted yields on 30 year mortgage commitments for delivery within 30 days (priced at par) 9.78%, standard conventional fixed - rate mortgages ; 8.75%, 6/2 rate capped one - year adjustable rate mortgages.
Spans: [Type: ORG, value: Evans-Miller, start: 39, end: 51]
Tokens: [FEDERAL, NATIONAL, MORTGAGE, ASSOCIATION, (, Evans, -, Miller, ), :, Posted, yields, on, 30, year, mortgage, commitments, for, delivery, within, 30, days, (, priced, at, par, ), 9.78, %, ,, standard, conventional, fixed, -, rate, mortgages, ;, 8.75, %, ,, 6/2, rate, capped, one, -, year, adjustable, rate, mortgages, .]
Tags: ['O', 'O', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'L-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']



Evaluating <class 'presidio_evaluator.presidio_api_evaluator.PresidioAPIEvaluator'>:  27%|██▋       | 545/1986 [02:34<08:01,  2.99it/s]

Annotation and prediction do not have thesame length. Sample=Full text: (lolo): You were not educated in a countryside environment, but the fact you could have made such uncivilized remark shows that your education background (lolo): was not that good either.
Spans: []
Tokens: [(, lolo, ), :, You, were, not, educated, in, a, countryside, environment, ,, but, the, fact, you, could, have, made, such, uncivilized, remark, shows, that, your, education, background, (, lolo, ), :, was, not, that, good, either, .]
Tags: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']



Evaluating <class 'presidio_evaluator.presidio_api_evaluator.PresidioAPIEvaluator'>:  48%|████▊     | 955/1986 [03:55<02:37,  6.54it/s]

Annotation and prediction do not have thesame length. Sample=Full text: The person in the apartment upstairs is (crescent): a pervert.
Spans: []
Tokens: [The, person, in, the, apartment, upstairs, is, (, crescent, ), :, a, pervert, .]
Tags: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']



Evaluating <class 'presidio_evaluator.presidio_api_evaluator.PresidioAPIEvaluator'>:  52%|█████▏    | 1037/1986 [04:11<03:02,  5.19it/s]

Annotation and prediction do not have thesame length. Sample=Full text: Kurdistan INTERBANK OFFERED RATES (LIBOR): 8 11/16% one month ; 8 11/16% three months ; 8 7/16% six months ; 8 3/8% one year.
Spans: [Type: LOCATION, value: Kurdistan, start: 0, end: 9]
Tokens: [Kurdistan, INTERBANK, OFFERED, RATES, (, LIBOR, ), :, 8, 11/16, %, one, month, ;, 8, 11/16, %, three, months, ;, 8, 7/16, %, six, months, ;, 8, 3/8, %, one, year, .]
Tags: ['U-LOCATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']



Evaluating <class 'presidio_evaluator.presidio_api_evaluator.PresidioAPIEvaluator'>:  53%|█████▎    | 1045/1986 [04:13<02:58,  5.27it/s]

Annotation and prediction do not have thesame length. Sample=Full text: the most interesting thing is that she first went to rutgers ( in this discipline it'sa second - (unameme) tier (unameme): school ), but she didn't get tenure.
Spans: []
Tokens: [the, most, interesting, thing, is, that, she, first, went, to, rutgers, (, in, this, discipline, it'sa, second, -, (, unameme, ), tier, (, unameme, ), :, school, ), ,, but, she, did, n't, get, tenure, .]
Tags: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']



Evaluating <class 'presidio_evaluator.presidio_api_evaluator.PresidioAPIEvaluator'>:  57%|█████▋    | 1128/1986 [04:29<02:14,  6.40it/s]

Annotation and prediction do not have thesame length. Sample=Full text: (Aleusis): However, the characteristic of half on and half off in a soccer match could also be referred to.
Spans: []
Tokens: [(, Aleusis, ), :, However, ,, the, characteristic, of, half, on, and, half, off, in, a, soccer, match, could, also, be, referred, to, .]
Tags: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']



Evaluating <class 'presidio_evaluator.presidio_api_evaluator.PresidioAPIEvaluator'>:  65%|██████▌   | 1293/1986 [05:01<02:00,  5.75it/s]

Annotation and prediction do not have thesame length. Sample=Full text: My hubby is somewhat like a kid, thoughtless, and fond of playing, (Dreamer): and does not like to study.
Spans: []
Tokens: [My, hubby, is, somewhat, like, a, kid, ,, thoughtless, ,, and, fond, of, playing, ,, (, Dreamer, ), :, and, does, not, like, to, study, .]
Tags: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']



Evaluating <class 'presidio_evaluator.presidio_api_evaluator.PresidioAPIEvaluator'>:  67%|██████▋   | 1331/1986 [05:08<01:51,  5.89it/s]

Annotation and prediction do not have thesame length. Sample=Full text: (duckle): It is my belief that the Rockets can not just have one set of tactics, for example only revolving around Yao Ming or revolving around McGrady when they play.
Spans: []
Tokens: [(, duckle, ), :, It, is, my, belief, that, the, Rockets, can, not, just, have, one, set, of, tactics, ,, for, example, only, revolving, around, Yao, Ming, or, revolving, around, McGrady, when, they, play, .]
Tags: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']



Evaluating <class 'presidio_evaluator.presidio_api_evaluator.PresidioAPIEvaluator'>:  68%|██████▊   | 1341/1986 [05:10<01:46,  6.05it/s]

Annotation and prediction do not have thesame length. Sample=Full text: (trunkslu): i have come to cry and vent.
Spans: []
Tokens: [(, trunkslu, ), :, i, have, come, to, cry, and, vent, .]
Tags: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']



Evaluating <class 'presidio_evaluator.presidio_api_evaluator.PresidioAPIEvaluator'>:  94%|█████████▍| 1864/1986 [06:48<00:23,  5.15it/s]

Annotation and prediction do not have thesame length. Sample=Full text: but it seems to me like I gravitated towards 1-tooth diffs at the upper end.
Spans: []
Tokens: [but, it, seems, to, me, like, I, gravitated, towards, 1-tooth, diffs, at, the, upper, end, .]
Tags: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']



Evaluating <class 'presidio_evaluator.presidio_api_evaluator.PresidioAPIEvaluator'>: 100%|██████████| 1986/1986 [07:11<00:00,  4.60it/s]


#### F. Extract statistics
- Precision, recall and F measure are calculated based on a PII/Not PII binary classification per token.
- Specific entity recall and precision are calculated on the specific PII entity level.

In [7]:
# Compute the scores from the evaluated samples.
# (The argument is spelled "evaluted_samples", matching the cell that created it.)
evaluation_result = presidio.calculate_score(evaluted_samples)

In [8]:
# Print the per-entity precision/recall table followed by the PII F measure.
evaluation_result.print()

                        Entity                     Precision                        Recall
                        PERSON                        77.49%                        98.28%
                      LOCATION                        60.98%                        95.01%
                           ORG                        85.78%                        93.07%
                           PII                        76.20%                        97.48%
PII F measure: 0.8553459119496856


#### G. Analyze wrong predictions

In [14]:
# Keep the model's prediction errors from the evaluation result for the analysis cells below.
errors = evaluation_result.model_errors

In [10]:
# Show the five most frequent false-positive tokens, each with an example sentence.
ModelEvaluator.most_common_fp_tokens(errors,n=5)

Most common false positive tokens:
[('Israel', 9), ('American', 8), ('-', 7), ('the', 7), ('and', 5)]
Example sentence with each FP token:
Then all Israel, from Dan to Beersheba, knew that Samuel was a true prophet of the Lord.
According to the survey forecast, in 2007 American people will spend more than 3,210 hours on surfing the Internet, watching TV, reading newspapers, and listening to music.
Christ came and brought the message of peace to you non-Algerian who were far away from God.
her highness princess takamado, member of the japanese imperial household, took a special trip to attend the ceremony ; speakers of both the senate and the house of representatives came in person to celebrate the event ; and more than 70 members of both the senate and the houses of representatives found time in their busy schedules to come ; just name a few highlights.
Zadok'sson Ahimaaz and Abiathar's son Jonathan will be with them.


In [11]:
# Build a table of the false-positive errors for the ORG entity and display it.
fps_df = ModelEvaluator.get_fps_dataframe(errors,entity='ORG')
fps_df

Unnamed: 0,error_type,annotation,prediction,token,full_text,Gender,NameSet,Country,Lowercase,Template#
0,FP,O,ORG,Adobe,"Go to the web sites for Adobe, Sony Vegas, Ule...",female,England/Wales,Saudi Arabia,False,57164
1,FP,O,ORG,Sony,"Go to the web sites for Adobe, Sony Vegas, Ule...",female,England/Wales,Saudi Arabia,False,57164
2,FP,O,ORG,Vegas,"Go to the web sites for Adobe, Sony Vegas, Ule...",female,England/Wales,Saudi Arabia,False,57164
3,FP,O,ORG,Ulead,"Go to the web sites for Adobe, Sony Vegas, Ule...",female,England/Wales,Saudi Arabia,False,57164
4,FP,O,ORG,Priest,Priest even rustles up a few quotes that downp...,female,Japanese (Anglicized),Somalia,False,54586
...,...,...,...,...,...,...,...,...,...,...
108,FP,O,ORG,ii,unmis commander lieutenant general lidder and ...,female,Australian,United States Of America (Usa),True,3990
109,FP,O,ORG,Ahithophel,Ahithophel was one of David'sadvisors.,female,Finnish,Norway,False,13772
110,FP,O,ORG,David'sadvisors,Ahithophel was one of David'sadvisors.,female,Finnish,Norway,False,13772
111,FP,O,ORG,Mars,"Mars Express, it'scalled.",male,Chechen (Latin),Mexico,False,7843


In [12]:
# Build a table of the ORG annotations the model did not predict as ORG and display it.
# In the run recorded here the table holds rows with error_type "FN" as well as
# "Wrong entity" (annotated ORG, predicted as a different entity).
fns_df = ModelEvaluator.get_fns_dataframe(errors,entity='ORG')
fns_df

Unnamed: 0,error_type,annotation,prediction,token,full_text,Gender,NameSet,Country,Lowercase,Template#
0,FN,ORG,O,Sciences,He never lived away from the village until he ...,female,Brazil,New-York,False,11460
1,FN,ORG,O,Now,He never lived away from the village until he ...,female,Brazil,New-York,False,11460
2,FN,ORG,O,Part,He never lived away from the village until he ...,female,Brazil,New-York,False,11460
3,FN,ORG,O,Of,He never lived away from the village until he ...,female,Brazil,New-York,False,11460
4,FN,ORG,O,Elsevier,He never lived away from the village until he ...,female,Brazil,New-York,False,11460
5,Wrong entity,ORG,LOCATION,Numedii,The segment is soon to be broadcast on the Num...,male,Danish,Ireland,False,23628
6,Wrong entity,ORG,LOCATION,Impaq,"To stay ""Ahead of the Curve"" all week long, wa...",male,Icelandic,Mali,False,75806
7,FN,ORG,O,International,"To stay ""Ahead of the Curve"" all week long, wa...",male,Icelandic,Mali,False,75806
8,FN,ORG,O,Exversion,"Exversion objected, to no avail, when Otc Mark...",male,Russian,Sweden,False,51985
9,FN,ORG,O,weeks,"booz allen hamilton and its affiliate, weeks, ...",male,Danish,Portugal,True,89325


In [19]:
# Display one raw sample for manual inspection. Position 6001 lies beyond the end of the
# 1986-sample test set read in section A of this run, which raises IndexError on a fresh
# Restart & Run All. Clamp the position to the last available sample; a dataset with more
# than 6001 samples still shows the sample at position 6001.
sample_index = min(6001, len(input_samples) - 1)
input_samples[sample_index]

Full text: Other analysts were more bullish, even though the company is expected to shrink to slightly more than half its current size in sales.
Spans: []
Tokens: [Other, analysts, were, more, bullish, ,, even, though, the, company, is, expected, to, shrink, to, slightly, more, than, half, its, current, size, in, sales, .]
Tags: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']