## Named Entity Recognition and Faker PII generation

In [1]:
import pandas as pd

from nerpii.named_entity_recognizer import NamedEntityRecognizer, split_name
from nerpii.faker_generator import FakerGenerator

  from .autonotebook import tqdm as notebook_tqdm


### Personal Information Dataset

In [2]:
# Load the personal-information sample dataset from a CSV file (path is relative
# to the notebook's working directory).
personal_info = pd.read_csv('dataset/PersonalInfo.csv')
# Preview the first 10 rows; at this point no synthesis has been applied yet.
personal_info.head(10)

Unnamed: 0,first_name,last_name,company_name,address,city,county,state,zip,phone1,phone2,email,web
0,James,Butt,"Benton, John B Jr",6649 N Blue Gum St,New Orleans,Orleans,LA,70116,504-621-8927,504-845-1427,jbutt@gmail.com,http://www.bentonjohnbjr.com
1,Josephine,Darakjy,"Chanay, Jeffrey A Esq",4 B Blue Ridge Blvd,Brighton,Livingston,MI,48116,810-292-9388,810-374-9840,josephine_darakjy@darakjy.org,http://www.chanayjeffreyaesq.com
2,Art,Venere,"Chemel, James L Cpa",8 W Cerritos Ave #54,Bridgeport,Gloucester,NJ,8014,856-636-8749,856-264-4130,art@venere.org,http://www.chemeljameslcpa.com
3,Lenna,Paprocki,Feltz Printing Service,639 Main St,Anchorage,Anchorage,AK,99501,907-385-4412,907-921-2010,lpaprocki@hotmail.com,http://www.feltzprintingservice.com
4,Donette,Foller,Printing Dimensions,34 Center St,Hamilton,Butler,OH,45011,513-570-1893,513-549-4561,donette.foller@cox.net,http://www.printingdimensions.com
5,Simona,Morasca,"Chapman, Ross E Esq",3 Mcauley Dr,Ashland,Ashland,OH,44805,419-503-2484,419-800-6759,simona@morasca.com,http://www.chapmanrosseesq.com
6,Mitsue,Tollner,Morlong Associates,7 Eads St,Chicago,Cook,IL,60632,773-573-6914,773-924-8565,mitsue_tollner@yahoo.com,http://www.morlongassociates.com
7,Leota,Dilliard,Commercial Press,7 W Jackson Blvd,San Jose,Santa Clara,CA,95111,408-752-3500,408-813-1105,leota@hotmail.com,http://www.commercialpress.com
8,Sage,Wieser,Truhlar And Truhlar Attys,5 Boston Ave #88,Sioux Falls,Minnehaha,SD,57105,605-414-2147,605-794-4895,sage_wieser@cox.net,http://www.truhlarandtruhlarattys.com
9,Kris,Marrier,"King, Christopher A Esq",228 Runamuck Pl #2808,Baltimore,Baltimore City,MD,21224,410-655-8723,410-804-4694,kris@gmail.com,http://www.kingchristopheraesq.com


Create a `NamedEntityRecognizer` for the dataset.

In [3]:
# Build a recognizer over the personal_info DataFrame.
recognizer = NamedEntityRecognizer(personal_info)

The methods below try to assign a named entity to each column of the dataset.

In [4]:
# Run the three entity-assignment passes: a Presidio-based pass, a manual pass,
# and a model-based pass for organization entities (as the method names indicate).
# NOTE(review): whether these passes depend on each other's results is not
# visible here — confirm before reordering them.
recognizer.assign_entities_with_presidio()
recognizer.assign_entities_manually()
recognizer.assign_organization_entity_with_model()

In [5]:
# Inspect the raw Presidio analyzer results.
# NOTE: the rendered output is very long — the result shown for 'first_name'
# embeds a long list of that column's values.
recognizer.get_presidio_analyzer_results()

[DictAnalyzerResult(key='first_name', value=['Avery', 'Chau', 'Janine', 'Stephane', 'Theola', 'Alesia', 'Lenna', 'Devora', 'Quentin', 'Casie', 'Dyan', 'Pamella', 'Kallie', 'Delmy', 'Arlette', 'Alex', 'Elke', 'Brittni', 'Latrice', 'Novella', 'Fannie', 'Lawrence', 'Shonda', 'Stephaine', 'Amber', 'Fausto', 'Georgene', 'Markus', 'Shawna', 'Ilene', 'Benton', 'Cathrine', 'Glenn', 'Kiley', 'Myra', 'Quentin', 'Kristofer', 'Lucina', 'Olive', 'Blair', 'Shenika', 'Loren', 'Valentin', 'Janey', 'Penney', 'Nelida', 'Kirk', 'Sheron', 'Chantell', 'Elli', 'Carey', 'Junita', 'Jesusita', 'Nobuko', 'Lorean', 'Devorah', 'Joanna', 'Elouise', 'Laticia', 'Bette', 'Sabra', 'Mitsue', 'Mona', 'Caprice', 'Nicolette', 'Gayla', 'Earleen', 'Kaitlyn', 'Fernanda', 'Dorothy', 'Goldie', 'Hillary', 'Kate', 'Cyril', 'Andra', 'Van', 'Tammara', 'Audra', 'Kimbery', 'Rikki', 'Denise', 'Tiffiny', 'Ahmed', 'Lettie', 'Chaya', 'Josphine', 'Brett', 'Alyce', 'Jaclyn', 'Tiera', 'Katina', 'Mitzie', 'Salome', 'Justine', 'Marti', 'Rege

In [6]:
# Show the per-column result: each column name maps to a dict holding the
# assigned 'entity' label and its 'confidence_score'.
recognizer.dict_global_entities

{'first_name': {'entity': 'PERSON', 'confidence_score': 0.9127725856697819},
 'last_name': {'entity': 'PERSON', 'confidence_score': 0.8625},
 'company_name': {'entity': 'PERSON', 'confidence_score': 0.9041916167664671},
 'address': {'entity': 'ADDRESS', 'confidence_score': 0.8926174496644296},
 'city': {'entity': 'LOCATION', 'confidence_score': 0.8731343283582089},
 'county': {'entity': 'LOCATION', 'confidence_score': 0.7171717171717171},
 'state': {'entity': 'LOCATION', 'confidence_score': 0.976},
 'zip': {'entity': 'ZIPCODE', 'confidence_score': 1.0},
 'phone1': {'entity': 'PHONE_NUMBER', 'confidence_score': 0.888},
 'phone2': {'entity': 'PHONE_NUMBER', 'confidence_score': 0.918},
 'email': {'entity': 'EMAIL_ADDRESS', 'confidence_score': 1.0},
 'web': {'entity': 'URL', 'confidence_score': 0.998}}

Create a Faker generator to synthesize new PII.

In [7]:
# Build the generator from the DataFrame and the per-column entity mapping
# computed by the recognizer.
faker_generator = FakerGenerator(personal_info, recognizer.dict_global_entities)

In [8]:
# Synthesize the columns; one status line per column is printed, saying whether
# it was synthesized with Faker or not.
# NOTE(review): the later display of personal_info shows synthesized values,
# which suggests the DataFrame is modified in place — confirm.
faker_generator.get_faker_generation()

Column [1;32maddress[0m synthesized with Faker.
Column [1;32mphone1[0m synthesized with Faker.
Column [1;32mphone2[0m synthesized with Faker.
Column [1;32memail[0m synthesized with Faker.
Column [1;32mfirst_name[0m synthesized with Faker.
Column [1;32mlast_name[0m synthesized with Faker.
Column [1;32mcity[0m synthesized with Faker.
Column [1;32mstate[0m synthesized with Faker.
Column [1;32mweb[0m synthesized with Faker.
Column [1;32mzip[0m synthesized with Faker.
Column [1;31mcompany_name[0m not synthesized with Faker.
Column [1;31mcounty[0m not synthesized with Faker.


In [9]:
# Display the DataFrame after synthesis (the rendered output elides the middle rows).
personal_info

Unnamed: 0,first_name,last_name,company_name,address,city,county,state,zip,phone1,phone2,email,web
0,Christopher,Roberts,"Benton, John B Jr",8797 Shelby Common Apt. 612,Clarkchester,Orleans,DC,41883,001-953-847-5371x564,136-305-4360x59117,steven43@hotmail.com,https://www.allen.com/
1,Alexander,Carlson,"Chanay, Jeffrey A Esq",51590 Anthony Pike Apt. 178,South Sydney,Livingston,KY,41475,001-981-294-1590x8799,+1-056-943-2200x17941,qatkins@yahoo.com,https://wilkins-thompson.com/
2,Ryan,Lee,"Chemel, James L Cpa",137 Susan Branch Suite 111,Lake Nathanielmouth,Gloucester,MO,56922,+1-655-487-7030x2724,+1-415-568-6817x9960,whitneymorgan@hotmail.com,http://www.gilmore.com/
3,Stephen,Cook,Feltz Printing Service,261 Herrera Haven Suite 062,North Steven,Anchorage,VA,42957,027-043-6633,001-227-311-0589x292,rsullivan@yahoo.com,https://fitzgerald.org/
4,Suzanne,Wallace,Printing Dimensions,6032 Bridget Lock,Taylorside,Butler,VT,69192,459.993.1736x350,(867)217-4273x722,caseymooney@hotmail.com,https://www.owen.com/
...,...,...,...,...,...,...,...,...,...,...,...,...
495,Tonya,Levine,Inner Label,256 Dean Spring Suite 270,South Larry,Ada,IN,27837,+1-663-503-9895,895-605-4924,igibson@gmail.com,http://www.benjamin-chavez.org/
496,Brandon,Ray,Hermar Inc,5049 Williams Key,Scottport,Elkhart,MH,58915,413-906-5976,2569536453,ethompson@gmail.com,http://kent.com/
497,Diana,Vargas,Simonton Howe & Schneider Pc,97665 Campbell Course Apt. 154,Gibsontown,Box Butte,VI,84523,001-550-518-4265,001-433-701-2649x4005,anthonyperez@hotmail.com,https://rose.com/
498,Valerie,Ellis,Warehouse Office & Paper Prod,07529 Michelle Inlet,Cunninghamview,King,NY,64545,591-224-3313,9470716846,waltonsean@gmail.com,https://www.berry-carroll.biz/


### Full FOIA Contacts Dataset

In [10]:
# Load the FOIA contacts dataset from a CSV file (path is relative to the
# notebook's working directory).
full_foia_contacts = pd.read_csv('dataset/full-foia-contacts.csv')
# Preview the first 10 rows; at this point no synthesis has been applied yet.
full_foia_contacts.head(10)

Unnamed: 0,Agency,Department,Name,Title,Room Number,Street Address,City,State,Zip Code,Telephone,Fax,Email Address,Website,Online Request Form,Notes
0,Agricultural Marketing Service,Department of Agriculture,Gregory Bridges,FOIA Officer,"AG Stop 0202, Room 3521-S","1400 Independence Avenue, SW",Washington,DC,20250-0273,(202) 720-2498,(202) 690-3767,AMS.FOIA@USDA.gov,http://www.ams.usda.gov/about-ams/foia,,
1,Agricultural Marketing Service,Department of Agriculture,,FOIA Requester Service Center,,,,,,(202) 720-2498,,,,,
2,Agricultural Marketing Service,Department of Agriculture,William Allen,FOIA Public Liaison,"AG Stop 0202, Room 3521-S","1400 Independence Avenue, SW",Washington,DC,20250-0273,(202) 720-2498,,AMS.FOIA@USDA.gov,,,
3,Animal & Plant Health Inspection,Department of Agriculture,Tonya Woods,FOIA Director,Unit 50,4700 River Road,Riverdale,MD,20737-1232,(301) 851-4102,(301) 734-5941,mailto:tonya.g.woods@aphis.usda.gov,http://www.aphis.usda.gov/wps/portal/aphis/res...,http://www.aphis.usda.gov/wps/portal/aphis/res...,
4,Animal & Plant Health Inspection,Department of Agriculture,,FOIA Requester Service Center,,,,,,(301) 851-4102,(301) 734-5941,mailto:foia.officer@aphis.usda.gov,,,* Please mail requests to FOIA.Officer@aphis.u...
5,Animal & Plant Health Inspection,Department of Agriculture,Vacant,FOIA Public Liaison,,,,,,(301) 851-4102,(301) 734-5941,mailto:foia.officer@aphis.usda.gov,,,
6,Departmental Management (OSEC/OCIO/ FOIA Servi...,Department of Agriculture,Alexis R. Graves,Department FOIA Officer,Room 428-W,"1400 Independence Avenue, SW",Washington,DC,20250-0706,(202) 690-3318,(202) 690-0068,mailto:usdafoia@ocio.usda.gov,http://www.dm.usda.gov/foia.htm,,
7,Departmental Management (OSEC/OCIO/ FOIA Servi...,Department of Agriculture,Marqui Barnes,FOIA Requester Service Center,Room 428-W,1400,Washington,DC,20250-0706,(202)694-1802,,,,,
8,Departmental Management (OSEC/OCIO/ FOIA Servi...,Department of Agriculture,Ravoyne Payton,FOIA Public Liaison,Room 428-W,1400,Washington,DC,20250-0706,(202)690-0048,(202) 205-3755,usdafoia@ocio.usda.gov,,,
9,Farm Service Agency,Department of Agriculture,Kent Politsch,FOIA Officer,Stop 0506,"1400 Independence Avenue, SW",Washington,DC,20250,(202) 720-7163,(202) 720-2979,mailto:kent.politsch@wdc.usda.gov,http://www.fsa.usda.gov/FSA/webapp?area=newsro...,http://www.fsa.usda.gov/FSA/eFOIARequest?area=...,This office has additional FOIA contact inform...


In [11]:
# Summarize dtypes and non-null counts per column; the output shows that many
# columns are only sparsely populated.
full_foia_contacts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 747 entries, 0 to 746
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Agency               747 non-null    object
 1   Department           747 non-null    object
 2   Name                 510 non-null    object
 3   Title                747 non-null    object
 4   Room Number          248 non-null    object
 5   Street Address       272 non-null    object
 6   City                 282 non-null    object
 7   State                282 non-null    object
 8   Zip Code             279 non-null    object
 9   Telephone            732 non-null    object
 10  Fax                  277 non-null    object
 11  Email Address        304 non-null    object
 12  Website              254 non-null    object
 13  Online Request Form  93 non-null     object
 14  Notes                48 non-null     object
dtypes: object(15)
memory usage: 87.7+ KB


In this dataset, the `Name` column must first be split into separate `first_name` and `last_name` columns. The `split_name()` function does this.

In [12]:
# Split the single 'Name' column into 'first_name' and 'last_name'.
# The guard keeps this cell safe to re-run: the result is bound back to the same
# variable and no longer has a 'Name' column (see the final display of this
# DataFrame), so an unguarded second run would ask split_name for a column that
# no longer exists.
if 'Name' in full_foia_contacts.columns:
    full_foia_contacts = split_name(full_foia_contacts, 'Name')

In [13]:
# Build a new recognizer over the FOIA contacts DataFrame.
# NOTE: this rebinds `recognizer`, replacing the instance created for personal_info.
recognizer = NamedEntityRecognizer(full_foia_contacts)

In [14]:
# Run the three entity-assignment passes on the FOIA contacts recognizer: a
# Presidio-based pass, a manual pass, and a model-based pass for organization
# entities (as the method names indicate).
# NOTE(review): whether these passes depend on each other's results is not
# visible here — confirm before reordering them.
recognizer.assign_entities_with_presidio()
recognizer.assign_entities_manually()
recognizer.assign_organization_entity_with_model()

In [15]:
# Show the per-column result: each column name maps to a dict holding the
# assigned 'entity' label and its 'confidence_score'. Some columns in the
# output (e.g. 'State', 'Online Request Form') map to None instead.
recognizer.dict_global_entities

{'Agency': {'entity': 'ORGANIZATION', 'confidence_score': 0.19921017990346643},
 'Department': {'entity': 'ORGANIZATION',
  'confidence_score': 0.2852253280091272},
 'Title': {'entity': 'ORGANIZATION', 'confidence_score': 0.19714587737843553},
 'Room Number': {'entity': 'ORGANIZATION',
  'confidence_score': 0.26838235294117646},
 'Street Address': {'entity': 'ADDRESS',
  'confidence_score': 0.9285714285714286},
 'City': {'entity': 'LOCATION', 'confidence_score': 0.9473684210526315},
 'State': None,
 'Zip Code': {'entity': 'ZIPCODE', 'confidence_score': 1.0},
 'Telephone': {'entity': 'PHONE_NUMBER',
  'confidence_score': 0.9897959183673469},
 'Fax': {'entity': 'PHONE_NUMBER', 'confidence_score': 0.9888268156424581},
 'Email Address': {'entity': 'EMAIL_ADDRESS',
  'confidence_score': 0.9950738916256158},
 'Website': {'entity': 'URL', 'confidence_score': 1.0},
 'Online Request Form': None,
 'Notes': {'entity': 'ORGANIZATION', 'confidence_score': 0.5728155339805825},
 'first_name': {'entit

In [16]:
# Build a generator from the FOIA contacts DataFrame and its per-column entity mapping.
# NOTE: this rebinds `faker_generator`, replacing the instance created for personal_info.
faker_generator = FakerGenerator(full_foia_contacts, recognizer.dict_global_entities)

In [17]:
# Synthesize the columns; one status line per column is printed, saying whether
# it was synthesized with Faker or not.
# NOTE(review): the later display of full_foia_contacts shows synthesized
# values, which suggests the DataFrame is modified in place — confirm.
faker_generator.get_faker_generation()

Column [1;32mStreet Address[0m synthesized with Faker.
Column [1;32mTelephone[0m synthesized with Faker.
Column [1;32mFax[0m synthesized with Faker.
Column [1;32mEmail Address[0m synthesized with Faker.
Column [1;32mfirst_name[0m synthesized with Faker.
Column [1;32mlast_name[0m synthesized with Faker.
Column [1;32mCity[0m synthesized with Faker.
Column [1;32mWebsite[0m synthesized with Faker.
Column [1;32mZip Code[0m synthesized with Faker.
Column [1;31mAgency[0m not synthesized with Faker.
Column [1;31mDepartment[0m not synthesized with Faker.
Column [1;31mTitle[0m not synthesized with Faker.
Column [1;31mRoom Number[0m not synthesized with Faker.
Column [1;31mNotes[0m not synthesized with Faker.


In [18]:
# Display the DataFrame after synthesis (the rendered output elides the middle rows).
full_foia_contacts

Unnamed: 0,Agency,Department,Title,Room Number,Street Address,City,State,Zip Code,Telephone,Fax,Email Address,Website,Online Request Form,Notes,first_name,last_name
0,Agricultural Marketing Service,Department of Agriculture,FOIA Officer,"AG Stop 0202, Room 3521-S",1986 Shaw Square Suite 159,Danielberg,DC,47559,886-774-8454,868-081-1076x6921,kgray@gmail.com,https://www.roberts-anderson.com/,,,Matthew,Smith
1,Agricultural Marketing Service,Department of Agriculture,FOIA Requester Service Center,,,,,,+1-066-094-2755x9917,,,,,,Jessica,Compton
2,Agricultural Marketing Service,Department of Agriculture,FOIA Public Liaison,"AG Stop 0202, Room 3521-S",9954 Huang Fords,Port Stephenstad,DC,72467,464-858-5477x776,,dbrown@gmail.com,,,,Justin,Jennings
3,Animal & Plant Health Inspection,Department of Agriculture,FOIA Director,Unit 50,9928 Fuller Plains Apt. 296,West Allisonchester,MD,57692,989-631-1314,(745)962-4106,zgomez@gmail.com,http://brown-jenkins.net/,http://www.aphis.usda.gov/wps/portal/aphis/res...,,John,Romero
4,Animal & Plant Health Inspection,Department of Agriculture,FOIA Requester Service Center,,,,,,0834932983,4177504035,rachelphelps@yahoo.com,,,* Please mail requests to FOIA.Officer@aphis.u...,Erik,Hall
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
742,Office of Small and Disadvantaged Business Uti...,Department of Veterans Affairs,FOIA Requester Service Center,,,,,,,,,,,,Denise,Arias
743,Office of Small and Disadvantaged Business Uti...,Department of Veterans Affairs,FOIA Public Liaison,,,,,,,,,,,,Derrick,Palmer
744,I don't know which office,Department of Veterans Affairs,FOIA Team Lead,(005R1C) VACO,194 David Gateway Suite 328,South Michaelton,DC,34412,403.134.8207,+1-908-084-7295,,https://harris-williams.com/,,,Jennifer,Hansen
745,I don't know which office,Department of Veterans Affairs,FOIA Requester Service Center,,,,,,266.957.4987x921,,,,,,William,Brown
