In [187]:
import json
import re

In [188]:
# Anonymizer output to be scored; its 'anonymized_content' entry is read later.
pii_clean_file_path = '../../nlp-assets/resources/pipeline_test_case_gen/out/synth_dataset_date_piiclean.json'
# Labelled source dataset; each record's 'spans' list is read later.
source_file_path = '../../presidio-research/data/synth_dataset.json'

# Regex used to locate masked text: one or more runs of two-or-more dashes, each
# optionally followed by a single whitespace character. The trailing negative
# lookbehind prevents a match from ending on whitespace.
anonymized_pattern = r"(?:-{2,}\s?)+(?<!\s)"

In [189]:
# JSON text is UTF-8 by specification; pass the encoding explicitly so the read
# does not depend on the platform's default locale encoding.
with open(pii_clean_file_path, encoding='utf-8') as f:
    clean_file = json.load(f)

In [190]:
# JSON text is UTF-8 by specification; pass the encoding explicitly so the read
# does not depend on the platform's default locale encoding.
with open(source_file_path, encoding='utf-8') as f:
    source_file = json.load(f)

In [191]:
# Pull the sequence of anonymized texts out of the loaded JSON document.
anonymized_content = clean_file['anonymized_content']

In [192]:
# The scoring below pairs items by index, so both collections must be the same
# length. `assert` is a statement, not a function: written without call-style
# parentheses so that adding a message cannot turn it into an always-true tuple.
assert len(anonymized_content) == len(source_file), (
    "anonymized_content has {} items but source_file has {}".format(
        len(anonymized_content), len(source_file)
    )
)

In [193]:
# Display both lengths side by side.
len(anonymized_content), len(source_file)

(299, 299)

In [195]:
# Spot-check one anonymized text (index 17).
anonymized_content[17]

"My card -----------47413 is expiring this month. Please let me know process to it's extend validity."

In [196]:
# Labelled spans recorded for the record at index 17.
source_file[17]['spans']

[{'entity_type': 'CREDIT_CARD',
  'entity_value': '4916843781747413',
  'start_position': 8,
  'end_position': 24}]

## Check the overall Accuracy, Precision, Recall and F1

In [197]:
# Tally, over every text, how many labelled spans exist, how many masked regions
# the regex finds, and how many labelled spans are matched exactly (same start
# AND same end offset) by a masked region.
ground_truth_entities = {}                    # entity_type -> number of labelled spans
correct_entities = {}                         # not populated in this cell; kept in case other cells use it
num_ground_truth = 0                          # total labelled spans
num_predictions = 0                           # total regex matches (masked regions)
num_correct_predictions = 0                   # labelled spans with an exact-offset match
num_correct_predictions_by_entity_types = {}  # entity_type -> exact-offset matches

for x, anon_text in enumerate(anonymized_content):
    detected_spans = list(re.finditer(anonymized_pattern, anon_text))
    # (start, end) of every masked region, as a set for O(1) exact-offset lookup.
    detected_offsets = {match.span() for match in detected_spans}
    spans = source_file[x]['spans']

    num_ground_truth += len(spans)
    num_predictions += len(detected_spans)

    for s in spans:
        entity_type = s['entity_type']
        ground_truth_entities[entity_type] = ground_truth_entities.get(entity_type, 0) + 1

        # Count a labelled span as correct only on an exact offset match; a
        # partially masked value (different end offset) does not count.
        if (s['start_position'], s['end_position']) in detected_offsets:
            num_correct_predictions += 1
            num_correct_predictions_by_entity_types[entity_type] = (
                num_correct_predictions_by_entity_types.get(entity_type, 0) + 1
            )

In [198]:
# Total labelled spans vs. total regex matches.
num_ground_truth, num_predictions

(390, 228)

In [199]:
# Per-entity-type counts: labelled spans first, then exact-offset matches.
ground_truth_entities, num_correct_predictions_by_entity_types

({'PERSON': 174,
  'CREDIT_CARD': 49,
  'LOCATION': 75,
  'ORGANIZATION': 48,
  'US_SSN': 1,
  'EMAIL': 11,
  'BIRTHDAY': 4,
  'TITLE': 4,
  'URL': 8,
  'PHONE_NUMBER': 9,
  'IP_ADDRESS': 3,
  'IBAN': 3,
  'NATIONALITY': 1},
 {'PERSON': 47,
  'EMAIL': 8,
  'LOCATION': 4,
  'BIRTHDAY': 3,
  'PHONE_NUMBER': 3,
  'ORGANIZATION': 1})

In [200]:
# Number of labelled spans matched exactly by a regex hit.
num_correct_predictions

66

In [201]:
# Recall = exact matches / labelled spans. Defined as 0.0 when there are no
# labelled spans, instead of raising ZeroDivisionError.
recall = num_correct_predictions / num_ground_truth if num_ground_truth else 0.0

In [202]:
# Precision = exact matches / regex matches. Defined as 0.0 when the regex found
# nothing at all, instead of raising ZeroDivisionError.
precision = num_correct_predictions / num_predictions if num_predictions else 0.0

In [203]:
# F1 is the harmonic mean of precision and recall. When both are zero (no exact
# matches) the denominator is zero, so report 0.0 instead of raising
# ZeroDivisionError.
F = 2 * (precision * recall) / (precision + recall) if (precision + recall) else 0.0

### We only predict which spans are PII and never predict which are NOT PII, so there are no true negatives to count; recall is therefore used as the accuracy figure

In [204]:
# Final scores, in order: recall, precision, F1.
recall, precision, F

(0.16923076923076924, 0.2894736842105263, 0.21359223300970875)