## Named Entity Recognition and Faker PII generation

In [1]:
import pandas as pd

from nerpii.named_entity_recognizer import NamedEntityRecognizer, split_name
from nerpii.faker_generator import FakerGenerator

  from .autonotebook import tqdm as notebook_tqdm


### Personal Information Dataset

In [2]:
personal_info = pd.read_csv('dataset/PersonalInfo.csv')
personal_info.head(10)

Unnamed: 0,first_name,last_name,company_name,address,city,county,state,zip,phone1,phone2,email,web
0,James,Butt,"Benton, John B Jr",6649 N Blue Gum St,New Orleans,Orleans,LA,70116,504-621-8927,504-845-1427,jbutt@gmail.com,http://www.bentonjohnbjr.com
1,Josephine,Darakjy,"Chanay, Jeffrey A Esq",4 B Blue Ridge Blvd,Brighton,Livingston,MI,48116,810-292-9388,810-374-9840,josephine_darakjy@darakjy.org,http://www.chanayjeffreyaesq.com
2,Art,Venere,"Chemel, James L Cpa",8 W Cerritos Ave #54,Bridgeport,Gloucester,NJ,8014,856-636-8749,856-264-4130,art@venere.org,http://www.chemeljameslcpa.com
3,Lenna,Paprocki,Feltz Printing Service,639 Main St,Anchorage,Anchorage,AK,99501,907-385-4412,907-921-2010,lpaprocki@hotmail.com,http://www.feltzprintingservice.com
4,Donette,Foller,Printing Dimensions,34 Center St,Hamilton,Butler,OH,45011,513-570-1893,513-549-4561,donette.foller@cox.net,http://www.printingdimensions.com
5,Simona,Morasca,"Chapman, Ross E Esq",3 Mcauley Dr,Ashland,Ashland,OH,44805,419-503-2484,419-800-6759,simona@morasca.com,http://www.chapmanrosseesq.com
6,Mitsue,Tollner,Morlong Associates,7 Eads St,Chicago,Cook,IL,60632,773-573-6914,773-924-8565,mitsue_tollner@yahoo.com,http://www.morlongassociates.com
7,Leota,Dilliard,Commercial Press,7 W Jackson Blvd,San Jose,Santa Clara,CA,95111,408-752-3500,408-813-1105,leota@hotmail.com,http://www.commercialpress.com
8,Sage,Wieser,Truhlar And Truhlar Attys,5 Boston Ave #88,Sioux Falls,Minnehaha,SD,57105,605-414-2147,605-794-4895,sage_wieser@cox.net,http://www.truhlarandtruhlarattys.com
9,Kris,Marrier,"King, Christopher A Esq",228 Runamuck Pl #2808,Baltimore,Baltimore City,MD,21224,410-655-8723,410-804-4694,kris@gmail.com,http://www.kingchristopheraesq.com


Create a NamedEntityRecognizer

In [3]:
recognizer = NamedEntityRecognizer(personal_info, lang='it')

The functions below try to assign different named entities to the columns of the dataset.

In [4]:
recognizer.assign_entities_with_presidio()
recognizer.assign_entities_manually()
recognizer.assign_organization_entity_with_model()

In [5]:
recognizer.dict_global_entities

{'first_name': {'entity': 'PERSON', 'confidence_score': 0.6786786786786787},
 'last_name': {'entity': 'PERSON', 'confidence_score': 0.5151515151515151},
 'company_name': {'entity': 'PERSON', 'confidence_score': 0.7153846153846154},
 'address': {'entity': 'LOCATION', 'confidence_score': 0.3220338983050847},
 'city': {'entity': 'LOCATION', 'confidence_score': 0.8498727735368957},
 'county': {'entity': 'LOCATION', 'confidence_score': 0.7424593967517401},
 'state': {'entity': 'LOCATION', 'confidence_score': 1.0},
 'zip': {'entity': 'ZIPCODE', 'confidence_score': 1.0},
 'phone1': {'entity': 'PHONE_NUMBER', 'confidence_score': 0.95},
 'phone2': {'entity': 'PHONE_NUMBER', 'confidence_score': 0.958},
 'email': {'entity': 'EMAIL_ADDRESS', 'confidence_score': 1.0},
 'web': {'entity': 'URL', 'confidence_score': 0.966}}

Create a faker generator to synthesize new PII

In [6]:
faker_generator = FakerGenerator(personal_info, recognizer.dict_global_entities)

In [7]:
faker_generator.get_faker_generation()

Column [1;32maddress[0m synthesized with Faker.
Column [1;32mphone1[0m synthesized with Faker.
Column [1;32mphone2[0m synthesized with Faker.
Column [1;32mfirst_name[0m synthesized with Faker.
Column [1;32mlast_name[0m synthesized with Faker.
Column [1;32memail[0m synthesized with Faker.
Column [1;32mcity[0m synthesized with Faker.
Column [1;32mstate[0m synthesized with Faker.
Column [1;32mweb[0m synthesized with Faker.
Column [1;32mzip[0m synthesized with Faker.
Column [1;31mcompany_name[0m not synthesized with Faker.
Column [1;31mcounty[0m not synthesized with Faker.


In [8]:
personal_info

Unnamed: 0,first_name,last_name,company_name,address,city,county,state,zip,phone1,phone2,email,web
0,Elizabeth,Myers,"Benton, John B Jr",412 Joshua Court,Garciaview,Orleans,VA,11105,646.053.3626x58920,(132)452-5837x1477,elizabeth.myers@yahoo.com,http://mann.org/
1,Debra,Pratt,"Chanay, Jeffrey A Esq",815 Cindy Parks Suite 334,West Adam,Livingston,TN,15065,(841)952-7083x645,090.569.7733,debra.pratt@gmail.com,https://duran-hamilton.biz/
2,Amanda,Clay,"Chemel, James L Cpa",59862 Dalton Creek,West Christopher,Gloucester,OR,31338,6978433679,008.782.0878,amanda.clay@yahoo.com,https://www.jensen.info/
3,Paul,Dorsey,Feltz Printing Service,837 Laura Stravenue,Elliston,Anchorage,PA,02598,(408)399-9434x5622,150-207-7612x145,paul.dorsey@hotmail.com,http://freeman-anderson.net/
4,Lori,Wilson,Printing Dimensions,6538 Ryan Union,Port Aaronborough,Butler,AL,87583,900.849.7268x49173,536.412.4087x4110,lori.wilson@hotmail.com,https://wong.com/
...,...,...,...,...,...,...,...,...,...,...,...,...
495,Joshua,Le,Inner Label,75107 Hale Ramp,Port Katherine,Ada,KS,49400,433-683-6608,(467)578-8467,joshua.le@hotmail.com,http://www.allen-lewis.com/
496,Nathan,Ruiz,Hermar Inc,722 Adam Forest,Brownmouth,Elkhart,WV,27979,820.809.2090,(035)842-9019,nathan.ruiz@hotmail.com,https://powell.com/
497,Gary,Schroeder,Simonton Howe & Schneider Pc,72701 Diana Oval,Heatherview,Box Butte,CO,67397,(744)970-7310,765-368-5289x3308,gary.schroeder@gmail.com,https://www.trevino.info/
498,Doris,Owens,Warehouse Office & Paper Prod,98975 Collins Mills Apt. 538,Durantown,King,LA,05597,+1-546-451-2113x316,+1-559-531-2153,doris.owens@gmail.com,https://www.gray.net/


### Full Foia Contacts Dataset

In [28]:
full_foia_contacts = pd.read_csv('dataset/full-foia-contacts.csv')
full_foia_contacts.head(10)

Unnamed: 0,Agency,Department,Name,Title,Room Number,Street Address,City,State,Zip Code,Telephone,Fax,Email Address,Website,Online Request Form,Notes
0,Agricultural Marketing Service,Department of Agriculture,Gregory Bridges,FOIA Officer,"AG Stop 0202, Room 3521-S","1400 Independence Avenue, SW",Washington,DC,20250-0273,(202) 720-2498,(202) 690-3767,AMS.FOIA@USDA.gov,http://www.ams.usda.gov/about-ams/foia,,
1,Agricultural Marketing Service,Department of Agriculture,,FOIA Requester Service Center,,,,,,(202) 720-2498,,,,,
2,Agricultural Marketing Service,Department of Agriculture,William Allen,FOIA Public Liaison,"AG Stop 0202, Room 3521-S","1400 Independence Avenue, SW",Washington,DC,20250-0273,(202) 720-2498,,AMS.FOIA@USDA.gov,,,
3,Animal & Plant Health Inspection,Department of Agriculture,Tonya Woods,FOIA Director,Unit 50,4700 River Road,Riverdale,MD,20737-1232,(301) 851-4102,(301) 734-5941,mailto:tonya.g.woods@aphis.usda.gov,http://www.aphis.usda.gov/wps/portal/aphis/res...,http://www.aphis.usda.gov/wps/portal/aphis/res...,
4,Animal & Plant Health Inspection,Department of Agriculture,,FOIA Requester Service Center,,,,,,(301) 851-4102,(301) 734-5941,mailto:foia.officer@aphis.usda.gov,,,* Please mail requests to FOIA.Officer@aphis.u...
5,Animal & Plant Health Inspection,Department of Agriculture,Vacant,FOIA Public Liaison,,,,,,(301) 851-4102,(301) 734-5941,mailto:foia.officer@aphis.usda.gov,,,
6,Departmental Management (OSEC/OCIO/ FOIA Servi...,Department of Agriculture,Alexis R. Graves,Department FOIA Officer,Room 428-W,"1400 Independence Avenue, SW",Washington,DC,20250-0706,(202) 690-3318,(202) 690-0068,mailto:usdafoia@ocio.usda.gov,http://www.dm.usda.gov/foia.htm,,
7,Departmental Management (OSEC/OCIO/ FOIA Servi...,Department of Agriculture,Marqui Barnes,FOIA Requester Service Center,Room 428-W,1400,Washington,DC,20250-0706,(202)694-1802,,,,,
8,Departmental Management (OSEC/OCIO/ FOIA Servi...,Department of Agriculture,Ravoyne Payton,FOIA Public Liaison,Room 428-W,1400,Washington,DC,20250-0706,(202)690-0048,(202) 205-3755,usdafoia@ocio.usda.gov,,,
9,Farm Service Agency,Department of Agriculture,Kent Politsch,FOIA Officer,Stop 0506,"1400 Independence Avenue, SW",Washington,DC,20250,(202) 720-7163,(202) 720-2979,mailto:kent.politsch@wdc.usda.gov,http://www.fsa.usda.gov/FSA/webapp?area=newsro...,http://www.fsa.usda.gov/FSA/eFOIARequest?area=...,This office has additional FOIA contact inform...


In [29]:
full_foia_contacts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 747 entries, 0 to 746
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Agency               747 non-null    object
 1   Department           747 non-null    object
 2   Name                 510 non-null    object
 3   Title                747 non-null    object
 4   Room Number          248 non-null    object
 5   Street Address       272 non-null    object
 6   City                 282 non-null    object
 7   State                282 non-null    object
 8   Zip Code             279 non-null    object
 9   Telephone            732 non-null    object
 10  Fax                  277 non-null    object
 11  Email Address        304 non-null    object
 12  Website              254 non-null    object
 13  Online Request Form  93 non-null     object
 14  Notes                48 non-null     object
dtypes: object(15)
memory usage: 87.7+ KB


In this dataset, it is necessary to split the name into first_name and last_name columns. To do so, the function split_name() is used.

In [30]:
full_foia_contacts = split_name(full_foia_contacts, 'Name')

In [31]:
recognizer = NamedEntityRecognizer(full_foia_contacts)

In [32]:
recognizer.assign_entities_with_presidio()
recognizer.assign_entities_manually()
recognizer.assign_organization_entity_with_model()

In [33]:
recognizer.dict_global_entities

{'Agency': {'entity': 'ORGANIZATION', 'confidence_score': 0.1996535296665223},
 'Department': {'entity': 'ORGANIZATION',
  'confidence_score': 0.2895193977996526},
 'Title': {'entity': 'ORGANIZATION', 'confidence_score': 0.20382513661202187},
 'Room Number': {'entity': 'ORGANIZATION',
  'confidence_score': 0.24390243902439024},
 'Street Address': {'entity': 'ADDRESS',
  'confidence_score': 0.910828025477707},
 'City': {'entity': 'LOCATION', 'confidence_score': 0.9772727272727273},
 'State': None,
 'Zip Code': {'entity': 'ZIPCODE', 'confidence_score': 1.0},
 'Telephone': {'entity': 'PHONE_NUMBER',
  'confidence_score': 0.9939148073022313},
 'Fax': {'entity': 'PHONE_NUMBER', 'confidence_score': 0.9792746113989638},
 'Email Address': {'entity': 'EMAIL_ADDRESS',
  'confidence_score': 0.9855072463768116},
 'Website': {'entity': 'URL', 'confidence_score': 1.0},
 'Online Request Form': {'entity': 'ORGANIZATION', 'confidence_score': 0.125},
 'Notes': {'entity': 'ORGANIZATION', 'confidence_scor

In [34]:
faker_generator = FakerGenerator(full_foia_contacts, recognizer.dict_global_entities)

In [35]:
faker_generator.get_faker_generation()

Column [1;32mStreet Address[0m synthesized with Faker.
Column [1;32mTelephone[0m synthesized with Faker.
Column [1;32mFax[0m synthesized with Faker.
Column [1;32mfirst_name[0m synthesized with Faker.
Column [1;32mlast_name[0m synthesized with Faker.
Column [1;32mEmail Address[0m synthesized with Faker.
Column [1;32mCity[0m synthesized with Faker.
Column [1;32mWebsite[0m synthesized with Faker.
Column [1;32mZip Code[0m synthesized with Faker.
Column [1;31mAgency[0m not synthesized with Faker.
Column [1;31mDepartment[0m not synthesized with Faker.
Column [1;31mTitle[0m not synthesized with Faker.
Column [1;31mRoom Number[0m not synthesized with Faker.
Column [1;31mOnline Request Form[0m not synthesized with Faker.
Column [1;31mNotes[0m not synthesized with Faker.


In [36]:
full_foia_contacts

Unnamed: 0,Agency,Department,Title,Room Number,Street Address,City,State,Zip Code,Telephone,Fax,Email Address,Website,Online Request Form,Notes,first_name,last_name
0,Agricultural Marketing Service,Department of Agriculture,FOIA Officer,"AG Stop 0202, Room 3521-S",9232 Mays Parkways Suite 567,East Judyton,DC,23532,001-619-113-7841,748.489.1526x64954,amy.brown@gmail.com,http://www.ochoa.info/,,,Amy,Brown
1,Agricultural Marketing Service,Department of Agriculture,FOIA Requester Service Center,,,,,,186.949.6596x982,,,,,,Michelle,Castillo
2,Agricultural Marketing Service,Department of Agriculture,FOIA Public Liaison,"AG Stop 0202, Room 3521-S",307 Zhang Fords,Millerview,DC,85168,995-534-2332x52311,,heather.price@yahoo.com,,,,Heather,Price
3,Animal & Plant Health Inspection,Department of Agriculture,FOIA Director,Unit 50,5590 Bailey Divide,Turnerview,MD,49191,277.046.3341x1743,943-574-6087x54043,ashley.miller@gmail.com,https://www.brown-sanders.com/,http://www.aphis.usda.gov/wps/portal/aphis/res...,,Ashley,Miller
4,Animal & Plant Health Inspection,Department of Agriculture,FOIA Requester Service Center,,,,,,859.355.7909,+1-316-819-3604x8901,sara.perry@gmail.com,,,* Please mail requests to FOIA.Officer@aphis.u...,Sara,Perry
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
742,Office of Small and Disadvantaged Business Uti...,Department of Veterans Affairs,FOIA Requester Service Center,,,,,,,,,,,,Austin,Hughes
743,Office of Small and Disadvantaged Business Uti...,Department of Veterans Affairs,FOIA Public Liaison,,,,,,,,,,,,Marilyn,Wells
744,I don't know which office,Department of Veterans Affairs,FOIA Team Lead,(005R1C) VACO,87014 Dean Turnpike Suite 471,Jillstad,DC,13743,+1-165-182-6671,+1-052-683-4402x4877,,https://www.smith-diaz.com/,,,Cynthia,Taylor
745,I don't know which office,Department of Veterans Affairs,FOIA Requester Service Center,,,,,,(198)522-7317,,,,,,James,Carson
