## Named Entity Recognition and Faker PII generation

In [1]:
import pandas as pd

from nerpii.named_entity_recognizer import NamedEntityRecognizer, split_name
from nerpii.faker_generator import FakerGenerator

  from .autonotebook import tqdm as notebook_tqdm


### Personal Information Dataset

In [2]:
personal_info = pd.read_csv('dataset/PersonalInfo.csv')
personal_info.head(10)

Unnamed: 0,first_name,last_name,company_name,address,city,county,state,zip,phone1,phone2,email,web
0,James,Butt,"Benton, John B Jr",6649 N Blue Gum St,New Orleans,Orleans,LA,70116,504-621-8927,504-845-1427,jbutt@gmail.com,http://www.bentonjohnbjr.com
1,Josephine,Darakjy,"Chanay, Jeffrey A Esq",4 B Blue Ridge Blvd,Brighton,Livingston,MI,48116,810-292-9388,810-374-9840,josephine_darakjy@darakjy.org,http://www.chanayjeffreyaesq.com
2,Art,Venere,"Chemel, James L Cpa",8 W Cerritos Ave #54,Bridgeport,Gloucester,NJ,8014,856-636-8749,856-264-4130,art@venere.org,http://www.chemeljameslcpa.com
3,Lenna,Paprocki,Feltz Printing Service,639 Main St,Anchorage,Anchorage,AK,99501,907-385-4412,907-921-2010,lpaprocki@hotmail.com,http://www.feltzprintingservice.com
4,Donette,Foller,Printing Dimensions,34 Center St,Hamilton,Butler,OH,45011,513-570-1893,513-549-4561,donette.foller@cox.net,http://www.printingdimensions.com
5,Simona,Morasca,"Chapman, Ross E Esq",3 Mcauley Dr,Ashland,Ashland,OH,44805,419-503-2484,419-800-6759,simona@morasca.com,http://www.chapmanrosseesq.com
6,Mitsue,Tollner,Morlong Associates,7 Eads St,Chicago,Cook,IL,60632,773-573-6914,773-924-8565,mitsue_tollner@yahoo.com,http://www.morlongassociates.com
7,Leota,Dilliard,Commercial Press,7 W Jackson Blvd,San Jose,Santa Clara,CA,95111,408-752-3500,408-813-1105,leota@hotmail.com,http://www.commercialpress.com
8,Sage,Wieser,Truhlar And Truhlar Attys,5 Boston Ave #88,Sioux Falls,Minnehaha,SD,57105,605-414-2147,605-794-4895,sage_wieser@cox.net,http://www.truhlarandtruhlarattys.com
9,Kris,Marrier,"King, Christopher A Esq",228 Runamuck Pl #2808,Baltimore,Baltimore City,MD,21224,410-655-8723,410-804-4694,kris@gmail.com,http://www.kingchristopheraesq.com


Create a NamedEntityRecognizer

In [3]:
recognizer = NamedEntityRecognizer(personal_info)

The functions below try to assign different named entities to the columns of the dataset.

In [4]:
recognizer.assign_entities_with_presidio()
recognizer.assign_entities_manually()
recognizer.assign_organization_entity_with_model()

In [5]:
recognizer.dict_global_entities

{'first_name': {'entity': 'PERSON', 'confidence_score': 0.9127725856697819},
 'last_name': {'entity': 'PERSON', 'confidence_score': 0.8625},
 'company_name': {'entity': 'PERSON', 'confidence_score': 0.9096385542168675},
 'address': {'entity': 'ADDRESS', 'confidence_score': 0.8926174496644296},
 'city': {'entity': 'LOCATION', 'confidence_score': 0.8731343283582089},
 'county': {'entity': 'LOCATION', 'confidence_score': 0.7171717171717171},
 'state': {'entity': 'LOCATION', 'confidence_score': 0.976},
 'zip': {'entity': 'ZIPCODE', 'confidence_score': 1.0},
 'phone1': {'entity': 'PHONE_NUMBER', 'confidence_score': 0.888},
 'phone2': {'entity': 'PHONE_NUMBER', 'confidence_score': 0.918},
 'email': {'entity': 'EMAIL_ADDRESS', 'confidence_score': 1.0},
 'web': {'entity': 'URL', 'confidence_score': 0.998}}

Create a FakerGenerator to synthetize PII 

In [6]:
faker_generator = FakerGenerator(personal_info, recognizer.dict_global_entities)

In [7]:
faker_generator.get_faker_generation()

Column [1;32maddress[0m synthesized with Faker.
Column [1;32mphone1[0m synthesized with Faker.
Column [1;32mphone2[0m synthesized with Faker.
Column [1;32memail[0m synthesized with Faker.
Column [1;32mfirst_name[0m synthesized with Faker.
Column [1;32mlast_name[0m synthesized with Faker.
Column [1;32mcity[0m synthesized with Faker.
Column [1;32mstate[0m synthesized with Faker.
Column [1;32mweb[0m synthesized with Faker.
Column [1;32mzip[0m synthesized with Faker.
Column [1;31mcompany_name[0m not synthesized with Faker.
Column [1;31mcounty[0m not synthesized with Faker.


In [8]:
personal_info

Unnamed: 0,first_name,last_name,company_name,address,city,county,state,zip,phone1,phone2,email,web
0,Colin,Brown,"Benton, John B Jr",03743 David Creek,Gonzalezmouth,Orleans,NV,11354,(465)266-4849x32441,001-353-112-0384x6939,okline@hotmail.com,http://www.charles.com/
1,Pamela,Wilson,"Chanay, Jeffrey A Esq",09553 Maddox Motorway,Ashleyland,Livingston,AK,02562,095.093.9976x96237,247-745-1811,reesejames@hotmail.com,http://www.reed-sosa.com/
2,Harold,Johnson,"Chemel, James L Cpa",85441 Christopher Mountain Suite 948,Liubury,Gloucester,KS,58685,955.494.8387x642,+1-776-499-9510x007,mcculloughnatalie@yahoo.com,http://mason.com/
3,William,Fox,Feltz Printing Service,5532 Vargas Parks,New Hollystad,Anchorage,AK,02891,+1-171-718-5558x936,252-472-6670,dariuscollins@gmail.com,http://www.brown.org/
4,Anna,Anderson,Printing Dimensions,89708 Steven Forge Suite 165,Thomasborough,Butler,DE,04350,503.459.3446,001-203-996-9346x906,cummingsdavid@yahoo.com,http://buchanan-spencer.info/
...,...,...,...,...,...,...,...,...,...,...,...,...
495,Sabrina,Harrison,Inner Label,69837 Eric Spur,Lake Amandastad,Ada,CA,76114,+1-321-656-0653x029,861-399-6558x078,millerwilliam@gmail.com,https://www.scott-moore.com/
496,Jennifer,Thomas,Hermar Inc,250 Samantha Route Suite 459,Kaylashire,Elkhart,CO,97954,(568)527-5656,+1-780-935-5677x00874,djones@hotmail.com,http://www.hubbard-romero.biz/
497,Chris,Trevino,Simonton Howe & Schneider Pc,03184 Eric Way Apt. 908,Lake Ginaville,Box Butte,MT,86186,001-579-244-3021,+1-859-216-6594,gregorygarcia@yahoo.com,https://www.ryan.com/
498,Thomas,Stevens,Warehouse Office & Paper Prod,3767 Jones Locks,North Jennifer,King,MP,51002,(748)921-9894x87017,929.587.7186,copelandjennifer@gmail.com,http://www.cole.com/


### Full Foia Contacts Dataset

In [2]:
full_foia_contacts = pd.read_csv('dataset/full-foia-contacts.csv')
full_foia_contacts.head(10)

Unnamed: 0,Agency,Department,Name,Title,Room Number,Street Address,City,State,Zip Code,Telephone,Fax,Email Address,Website,Online Request Form,Notes
0,Agricultural Marketing Service,Department of Agriculture,Gregory Bridges,FOIA Officer,"AG Stop 0202, Room 3521-S","1400 Independence Avenue, SW",Washington,DC,20250-0273,(202) 720-2498,(202) 690-3767,AMS.FOIA@USDA.gov,http://www.ams.usda.gov/about-ams/foia,,
1,Agricultural Marketing Service,Department of Agriculture,,FOIA Requester Service Center,,,,,,(202) 720-2498,,,,,
2,Agricultural Marketing Service,Department of Agriculture,William Allen,FOIA Public Liaison,"AG Stop 0202, Room 3521-S","1400 Independence Avenue, SW",Washington,DC,20250-0273,(202) 720-2498,,AMS.FOIA@USDA.gov,,,
3,Animal & Plant Health Inspection,Department of Agriculture,Tonya Woods,FOIA Director,Unit 50,4700 River Road,Riverdale,MD,20737-1232,(301) 851-4102,(301) 734-5941,mailto:tonya.g.woods@aphis.usda.gov,http://www.aphis.usda.gov/wps/portal/aphis/res...,http://www.aphis.usda.gov/wps/portal/aphis/res...,
4,Animal & Plant Health Inspection,Department of Agriculture,,FOIA Requester Service Center,,,,,,(301) 851-4102,(301) 734-5941,mailto:foia.officer@aphis.usda.gov,,,* Please mail requests to FOIA.Officer@aphis.u...
5,Animal & Plant Health Inspection,Department of Agriculture,Vacant,FOIA Public Liaison,,,,,,(301) 851-4102,(301) 734-5941,mailto:foia.officer@aphis.usda.gov,,,
6,Departmental Management (OSEC/OCIO/ FOIA Servi...,Department of Agriculture,Alexis R. Graves,Department FOIA Officer,Room 428-W,"1400 Independence Avenue, SW",Washington,DC,20250-0706,(202) 690-3318,(202) 690-0068,mailto:usdafoia@ocio.usda.gov,http://www.dm.usda.gov/foia.htm,,
7,Departmental Management (OSEC/OCIO/ FOIA Servi...,Department of Agriculture,Marqui Barnes,FOIA Requester Service Center,Room 428-W,1400,Washington,DC,20250-0706,(202)694-1802,,,,,
8,Departmental Management (OSEC/OCIO/ FOIA Servi...,Department of Agriculture,Ravoyne Payton,FOIA Public Liaison,Room 428-W,1400,Washington,DC,20250-0706,(202)690-0048,(202) 205-3755,usdafoia@ocio.usda.gov,,,
9,Farm Service Agency,Department of Agriculture,Kent Politsch,FOIA Officer,Stop 0506,"1400 Independence Avenue, SW",Washington,DC,20250,(202) 720-7163,(202) 720-2979,mailto:kent.politsch@wdc.usda.gov,http://www.fsa.usda.gov/FSA/webapp?area=newsro...,http://www.fsa.usda.gov/FSA/eFOIARequest?area=...,This office has additional FOIA contact inform...


In [3]:
full_foia_contacts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 747 entries, 0 to 746
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Agency               747 non-null    object
 1   Department           747 non-null    object
 2   Name                 510 non-null    object
 3   Title                747 non-null    object
 4   Room Number          248 non-null    object
 5   Street Address       272 non-null    object
 6   City                 282 non-null    object
 7   State                282 non-null    object
 8   Zip Code             279 non-null    object
 9   Telephone            732 non-null    object
 10  Fax                  277 non-null    object
 11  Email Address        304 non-null    object
 12  Website              254 non-null    object
 13  Online Request Form  93 non-null     object
 14  Notes                48 non-null     object
dtypes: object(15)
memory usage: 87.7+ KB


In this dataset, it is necessary to split the name into first_name and last_name columns. To do so, the function split_name() is used.

In [4]:
full_foia_contacts = split_name(full_foia_contacts, 'Name')

In [5]:
recognizer = NamedEntityRecognizer(full_foia_contacts)

In [6]:
recognizer.assign_entities_with_presidio()
recognizer.assign_entities_manually()
recognizer.assign_organization_entity_with_model()

In [11]:
recognizer.dict_global_entities

{'Agency': {'entity': 'ORGANIZATION', 'confidence_score': 0.1949225473321859},
 'Department': {'entity': 'ORGANIZATION',
  'confidence_score': 0.2870264064293915},
 'Title': {'entity': 'ORGANIZATION', 'confidence_score': 0.20053475935828877},
 'Room Number': {'entity': 'ORGANIZATION',
  'confidence_score': 0.25396825396825395},
 'Street Address': None,
 'City': {'entity': 'LOCATION', 'confidence_score': 0.9490445859872612},
 'State': None,
 'Zip Code': {'entity': 'ZIPCODE', 'confidence_score': 1.0},
 'Telephone': {'entity': 'PHONE_NUMBER',
  'confidence_score': 0.9877800407331976},
 'Fax': {'entity': 'PHONE_NUMBER', 'confidence_score': 0.9824561403508771},
 'Email Address': {'entity': 'EMAIL_ADDRESS',
  'confidence_score': 0.9900990099009901},
 'Website': {'entity': 'URL', 'confidence_score': 1.0},
 'Online Request Form': {'entity': 'ORGANIZATION', 'confidence_score': 0.125},
 'Notes': {'entity': 'ORGANIZATION', 'confidence_score': 0.6176470588235294},
 'first_name': {'entity': 'PERSON

In [12]:
faker_generator = FakerGenerator(full_foia_contacts, recognizer.dict_global_entities)

In [13]:
faker_generator.get_faker_generation()

Column [1;32mTelephone[0m synthesized with Faker.
Column [1;32mFax[0m synthesized with Faker.
Column [1;32mEmail Address[0m synthesized with Faker.
Column [1;32mfirst_name[0m synthesized with Faker.
Column [1;32mlast_name[0m synthesized with Faker.
Column [1;32mCity[0m synthesized with Faker.
Column [1;32mWebsite[0m synthesized with Faker.
Column [1;32mZip Code[0m synthesized with Faker.
Column [1;31mAgency[0m not synthesized with Faker.
Column [1;31mDepartment[0m not synthesized with Faker.
Column [1;31mTitle[0m not synthesized with Faker.
Column [1;31mRoom Number[0m not synthesized with Faker.
Column [1;31mOnline Request Form[0m not synthesized with Faker.
Column [1;31mNotes[0m not synthesized with Faker.
