In [1]:
from tqdm import tqdm_notebook as tqdm
from presidio_evaluator.data_generator.main import generate,read_synth_dataset

import datetime
import json

In [None]:
!python -m spacy download "en_core_web_lg"

In [3]:
# Generate fake PII data using Presidio's data generator

Collecting en_core_web_lg==2.3.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.3.1/en_core_web_lg-2.3.1.tar.gz (782.7 MB)
[K     |████████████████████████████████| 782.7 MB 6.8 MB/s eta 0:00:01
Building wheels for collected packages: en-core-web-lg
  Building wheel for en-core-web-lg (setup.py) ... [?25ldone
[?25h  Created wheel for en-core-web-lg: filename=en_core_web_lg-2.3.1-py3-none-any.whl size=780233834 sha256=cfc4b748861ff55c20fbd76e0348ba67a4977bf7dc2092d38c32675750ccdc16
  Stored in directory: /private/var/folders/7z/tn1c_tk90tsdrr1mrjl0n85c0000gn/T/pip-ephem-wheel-cache-1jzjgxph/wheels/41/75/77/c4a98e18b2c317a2a13931cbbea7e3ca7f3a21efc36adc1d71
Successfully built en-core-web-lg
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-2.3.1
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


# Generate fake PII data using Presidio's data generator

Presidio's data generator allows you to generate a synthetic dataset with two prerequisites:
1. A fake PII csv (We used https://www.fakenamegenerator.com/)
2. A text file with template sentences or paragraphs. In this file, each PII entity placeholder is written in brackets. The name of the PII entity should be one of the columns in the fake PII csv file.

The generator creates fake sentences based on the provided fake PII csv, a list of [extension functions](../presidio_evaluator/data_generator/extensions.py), and a few additional third-party libraries such as `Faker` and `haikunator`.


For example:
1. **A fake PII csv**:

| FIRST_NAME  |  LAST_NAME  |  EMAIL |
|-------------|-------------|-----------|
| David       |  Brown      |  david.brown@jobhop.com |
| Mel         |  Brown      |  melb@hobjob.com |


2. **Templates**:

My name is [FIRST_NAME]

You can email me at [EMAIL]. Thanks, [FIRST_NAME]

What's your last name? It's [LAST_NAME]

Every time I see you falling I get down on my knees and pray


### Generate files
Based on these two prerequisites, a requested number of examples and an output file name:

In [2]:
# --- What to generate -------------------------------------------------------
EXAMPLES = 10000
LOWER_CASE_RATIO = 0.1
KEEP_ONLY_TAGGED = False
SPAN_TO_TAG = True  # also create tokens + token labels (tags)
IGNORE_TYPES = {"IP_ADDRESS", 'US_SSN', 'URL'}

# --- Input files ------------------------------------------------------------
TEMPLATES_FILE = '../presidio_evaluator/data_generator/raw_data/ontonotes_based_templates.txt'
fake_pii_csv = '../presidio_evaluator/data_generator/raw_data/FakeNameGenerator.com_3000.csv'
utterances_file = TEMPLATES_FILE
dictionary_path = None

# --- Output file ------------------------------------------------------------
# The file name records the requested number of examples and today's date.
cur_time = datetime.date.today().strftime("%B_%d_%Y")
OUTPUT = "../data/generated_size_{}_date_{}.json".format(EXAMPLES, cur_time)

# Gather every argument in one place, then run the generator.
generate_kwargs = {
    "fake_pii_csv": fake_pii_csv,
    "utterances_file": utterances_file,
    "dictionary_path": dictionary_path,
    "output_file": OUTPUT,
    "lower_case_ratio": LOWER_CASE_RATIO,
    "num_of_examples": EXAMPLES,
    "ignore_types": IGNORE_TYPES,
    "keep_only_tagged": KEEP_ONLY_TAGGED,
    "span_to_tag": SPAN_TO_TAG,
}
examples = generate(**generate_kwargs)

Preparing sample sentences for ingestion
Preparing fake PII data for ingestion
Generating address parts
Generating roles
Generating titles
Generating nationalities


  0%|          | 0/10000 [00:00<?, ?it/s]

Generating IBANs
Generating company names
Finished preparing fake PII data
loading model en_core_web_lg





OSError: [E050] Can't find model 'en_core_web_lg'. It doesn't seem to be a shortcut link, a Python package or a valid path to a data directory.

To read a dataset file into the InputSample format, use `read_synth_dataset`:

In [5]:
# Read back the file at OUTPUT (the path passed to generate() as output_file)
# so the samples can be inspected in the cells below.
input_samples = read_synth_dataset(OUTPUT)

In [6]:
# Display the first loaded sample using its default representation.
input_samples[0]

Full text: The sports were the only two of 28 that failed to win a majority of votes in a ballot of members at an Practice Fusion meeting in Keychain Logistics Corp..
Spans: [Type: ORGANIZATION, value: Practice Fusion, start: 103, end: 118, Type: ORGANIZATION, value: Keychain Logistics Corp., start: 130, end: 154]
Tokens: [The, sports, were, the, only, two, of, 28, that, failed, to, win, a, majority, of, votes, in, a, ballot, of, members, at, an, Practice, Fusion, meeting, in, Keychain, Logistics, Corp, ..]
Tags: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORGANIZATION', 'L-ORGANIZATION', 'O', 'O', 'B-ORGANIZATION', 'I-ORGANIZATION', 'I-ORGANIZATION', 'L-ORGANIZATION']

The full structure of each input sample is shown below. It includes different feature values per token, as calculated by spaCy:

In [7]:
# Display the same first sample through its to_dict() form.
input_samples[0].to_dict()

{'full_text': 'The sports were the only two of 28 that failed to win a majority of votes in a ballot of members at an Practice Fusion meeting in Keychain Logistics Corp..',
 'masked': None,
 'spans': [{'entity_type': 'ORGANIZATION',
   'entity_value': 'Practice Fusion',
   'start_position': 103,
   'end_position': 118},
  {'entity_type': 'ORGANIZATION',
   'entity_value': 'Keychain Logistics Corp.',
   'start_position': 130,
   'end_position': 154}],
 'tokens': [{'text': 'The',
   'idx': 0,
   'tag_': 'DT',
   'pos_': 'DET',
   'dep_': 'det',
   'lemma_': 'the',
   '_': {'is_in_vocabulary': False}},
  {'text': 'sports',
   'idx': 4,
   'tag_': 'NNS',
   'pos_': 'NOUN',
   'dep_': 'nsubj',
   'lemma_': 'sport',
   '_': {'is_in_vocabulary': False}},
  {'text': 'were',
   'idx': 11,
   'tag_': 'VBD',
   'pos_': 'AUX',
   'dep_': 'ROOT',
   'lemma_': 'be',
   '_': {'is_in_vocabulary': False}},
  {'text': 'the',
   'idx': 16,
   'tag_': 'DT',
   'pos_': 'DET',
   'dep_': 'det',
   'lemma_':

#### Verify randomness of dataset

In [8]:
from collections import Counter

# Tally how many loaded samples carry each 'Template#' metadata value.
count_per_template_id = Counter(
    sample.metadata['Template#'] for sample in input_samples
)

# One "<template id>: <count>" line per template, in ascending id order.
for template_id, sample_count in sorted(count_per_template_id.items()):
    print("{}: {}".format(template_id, sample_count))

# Total number of samples counted.
print(sum(count_per_template_id.values()))

230: 1
418: 1
754: 1
860: 1
997: 1
1159: 1
1534: 1
2789: 1
2934: 1
3150: 1
3156: 1
3538: 1
4172: 1
4549: 1
4670: 1
4778: 1
5038: 1
5727: 1
5977: 1
6242: 1
6640: 1
6862: 1
7054: 1
7861: 1
8394: 1
8657: 1
8928: 1
9450: 1
9729: 1
9847: 1
10211: 1
10797: 1
11792: 1
11936: 1
12561: 1
12621: 1
12766: 1
12891: 1
13071: 1
13807: 1
14918: 1
15406: 1
15790: 1
15809: 1
15945: 1
16186: 1
16424: 1
16463: 1
16602: 1
16959: 1
17152: 1
17230: 1
17812: 1
17909: 1
18647: 1
18718: 1
18843: 1
18968: 1
19917: 1
20269: 1
20598: 1
21070: 1
21350: 1
21799: 1
21931: 1
22482: 1
22662: 1
22952: 1
23138: 1
23699: 1
23852: 1
24603: 1
24643: 1
24784: 1
24932: 1
25706: 1
25761: 1
25968: 1
27081: 1
27385: 1
27436: 1
27830: 1
27857: 1
28123: 1
28992: 1
29007: 1
29624: 1
29643: 1
29783: 1
30102: 1
30222: 1
31189: 1
31283: 1
31304: 1
31708: 1
31727: 1
33207: 1
33434: 1
33537: 1
34007: 1
100


#### Transform to the CONLL structure:

In [9]:
from presidio_evaluator import InputSample

# Convert the loaded samples with InputSample.create_conll_dataset and preview
# the first five rows of the result.
conll = InputSample.create_conll_dataset(input_samples)
conll.head(5)

Unnamed: 0,text,pos,tag,Template#,gender,country,label,sentence
0,The,DET,DT,6640,female,Barbados,O,0
1,sports,NOUN,NNS,6640,female,Barbados,O,0
2,were,AUX,VBD,6640,female,Barbados,O,0
3,the,DET,DT,6640,female,Barbados,O,0
4,only,ADJ,JJ,6640,female,Barbados,O,0


#### Copyright notice:


Data generated for evaluation was created using Fake Name Generator.

Fake Name Generator identities by the [Fake Name Generator](https://www.fakenamegenerator.com/) 
are licensed under a [Creative Commons Attribution-Share Alike 3.0 United States License](http://creativecommons.org/licenses/by-sa/3.0/us/). Fake Name Generator and the Fake Name Generator logo are trademarks of Corban Works, LLC.
