## Named Entity Recognition and Faker PII generation

In [1]:
import pandas as pd

from nerpii.named_entity_recognizer import NamedEntityRecognizer, split_name
from nerpii.faker_generator import FakerGenerator

  from .autonotebook import tqdm as notebook_tqdm


### Personal Information Dataset

In [13]:
personal_info = pd.read_csv('dataset/PersonalInfo.csv')
personal_info.head(10)

Unnamed: 0,first_name,last_name,company_name,address,city,county,state,zip,phone1,phone2,email,web
0,James,Butt,"Benton, John B Jr",6649 N Blue Gum St,New Orleans,Orleans,LA,70116,504-621-8927,504-845-1427,jbutt@gmail.com,http://www.bentonjohnbjr.com
1,Josephine,Darakjy,"Chanay, Jeffrey A Esq",4 B Blue Ridge Blvd,Brighton,Livingston,MI,48116,810-292-9388,810-374-9840,josephine_darakjy@darakjy.org,http://www.chanayjeffreyaesq.com
2,Art,Venere,"Chemel, James L Cpa",8 W Cerritos Ave #54,Bridgeport,Gloucester,NJ,8014,856-636-8749,856-264-4130,art@venere.org,http://www.chemeljameslcpa.com
3,Lenna,Paprocki,Feltz Printing Service,639 Main St,Anchorage,Anchorage,AK,99501,907-385-4412,907-921-2010,lpaprocki@hotmail.com,http://www.feltzprintingservice.com
4,Donette,Foller,Printing Dimensions,34 Center St,Hamilton,Butler,OH,45011,513-570-1893,513-549-4561,donette.foller@cox.net,http://www.printingdimensions.com
5,Simona,Morasca,"Chapman, Ross E Esq",3 Mcauley Dr,Ashland,Ashland,OH,44805,419-503-2484,419-800-6759,simona@morasca.com,http://www.chapmanrosseesq.com
6,Mitsue,Tollner,Morlong Associates,7 Eads St,Chicago,Cook,IL,60632,773-573-6914,773-924-8565,mitsue_tollner@yahoo.com,http://www.morlongassociates.com
7,Leota,Dilliard,Commercial Press,7 W Jackson Blvd,San Jose,Santa Clara,CA,95111,408-752-3500,408-813-1105,leota@hotmail.com,http://www.commercialpress.com
8,Sage,Wieser,Truhlar And Truhlar Attys,5 Boston Ave #88,Sioux Falls,Minnehaha,SD,57105,605-414-2147,605-794-4895,sage_wieser@cox.net,http://www.truhlarandtruhlarattys.com
9,Kris,Marrier,"King, Christopher A Esq",228 Runamuck Pl #2808,Baltimore,Baltimore City,MD,21224,410-655-8723,410-804-4694,kris@gmail.com,http://www.kingchristopheraesq.com


Create a NamedEntityRecognizer

In [14]:
recognizer = NamedEntityRecognizer(personal_info)

The functions below try to assign different named entities to the columns of the dataset.

In [15]:
recognizer.assign_entities_with_presidio()
recognizer.assign_entities_manually()
recognizer.assign_organization_entity_with_model()

In [16]:
recognizer.dict_global_entities

{'first_name': {'entity': 'PERSON', 'confidence_score': 0.9127725856697819},
 'last_name': {'entity': 'PERSON', 'confidence_score': 0.8625},
 'company_name': {'entity': 'PERSON', 'confidence_score': 0.9096385542168675},
 'address': {'entity': 'ADDRESS', 'confidence_score': 0.8926174496644296},
 'city': {'entity': 'LOCATION', 'confidence_score': 0.8731343283582089},
 'county': {'entity': 'LOCATION', 'confidence_score': 0.7171717171717171},
 'state': {'entity': 'LOCATION', 'confidence_score': 0.976},
 'zip': {'entity': 'ZIPCODE', 'confidence_score': 1.0},
 'phone1': {'entity': 'PHONE_NUMBER', 'confidence_score': 0.888},
 'phone2': {'entity': 'PHONE_NUMBER', 'confidence_score': 0.918},
 'email': {'entity': 'EMAIL_ADDRESS', 'confidence_score': 1.0},
 'web': {'entity': 'URL', 'confidence_score': 0.998},
 'first_name_gender': None}

In [17]:
recognizer.dataset

Unnamed: 0,first_name,last_name,company_name,address,city,county,state,zip,phone1,phone2,email,web,first_name_gender
178,Stephaine,Vinning,Birite Foodservice Distr,3717 Hamann Industrial Pky,San Francisco,San Francisco,CA,94104,415-767-6596,415-712-9530,stephaine@cox.net,http://www.biritefoodservicedistr.com,unknown
443,Jeanice,Claucherty,Accurel Systems Intrntl Corp,19 Amboy Ave,Miami,Miami-Dade,FL,33142,305-988-4162,305-306-7834,jeanice.claucherty@yahoo.com,http://www.accurelsystemsintrntlcorp.com,unknown
457,Nu,Mcnease,Amazonia Film Project,88 Sw 28th Ter,Harrison,Hudson,NJ,7029,973-751-9003,973-903-4175,nu@gmail.com,http://www.amazoniafilmproject.com,andy
388,Gertude,Witten,"Thompson, John Randolph Jr",7 Tarrytown Rd,Cincinnati,Hamilton,OH,45217,513-977-7043,513-863-9471,gertude.witten@gmail.com,http://www.thompsonjohnrandolphjr.com,unknown
288,Reena,Maisto,Lane Promotions,9648 S Main,Salisbury,Wicomico,MD,21801,410-351-1863,410-951-2667,reena@hotmail.com,http://www.lanepromotions.com,female
...,...,...,...,...,...,...,...,...,...,...,...,...,...
119,Oretha,Menter,Custom Engineering Inc,8 County Center Dr #647,Boston,Suffolk,MA,2210,617-418-5043,617-697-6024,oretha_menter@yahoo.com,http://www.customengineeringinc.com,unknown
410,Mitzie,Hudnall,Cangro Transmission Co,17 Jersey Ave,Englewood,Arapahoe,CO,80110,303-402-1940,303-997-7760,mitzie_hudnall@yahoo.com,http://www.cangrotransmissionco.com,unknown
235,Yoko,Fishburne,Sams Corner Store,9122 Carpenter Ave,New Haven,New Haven,CT,6511,203-506-4706,203-840-8634,yoko@fishburne.com,http://www.samscornerstore.com,female
183,Claribel,Varriano,Meca,6 Harry L Dr #6327,Perrysburg,Wood,OH,43551,419-544-4900,419-573-2033,claribel_varriano@cox.net,http://www.meca.com,female


Create a faker generator to synthesize new PII

In [6]:
faker_generator = FakerGenerator(personal_info, recognizer.dict_global_entities)

In [8]:
faker_generator.get_faker_generation()

Column [1;32maddress[0m synthesized with Faker.
Column [1;32mphone1[0m synthesized with Faker.
Column [1;32mphone2[0m synthesized with Faker.
Column [1;32memail[0m synthesized with Faker.
Column [1;32mfirst_name[0m synthesized with Faker.
Column [1;32mlast_name[0m synthesized with Faker.
Column [1;32mcity[0m synthesized with Faker.
Column [1;32mstate[0m synthesized with Faker.
Column [1;32mweb[0m synthesized with Faker.
Column [1;32mzip[0m synthesized with Faker.
Column [1;31mcompany_name[0m not synthesized with Faker.
Column [1;31mcounty[0m not synthesized with Faker.


In [9]:
personal_info

Unnamed: 0,first_name,last_name,company_name,address,city,county,state,zip,phone1,phone2,email,web
0,Scott,Thomas,"Benton, John B Jr",04906 Peterson Circle Apt. 368,Emilyfort,Orleans,VI,07678,001-834-833-7300x43326,(219)556-4567x88068,tdunn@gmail.com,http://www.sanders-nunez.org/
1,Charles,Clarke,"Chanay, Jeffrey A Esq",78294 Tucker Junction Apt. 583,Codyview,Livingston,VI,12446,500.560.7127x3065,001-843-991-9896,leslienguyen@hotmail.com,http://smith-morrow.com/
2,Anthony,Sims,"Chemel, James L Cpa",10743 David Mall,Wigginsbury,Gloucester,DC,38109,(910)171-3581x3040,001-101-765-5068,lisa26@yahoo.com,https://morales-jones.com/
3,John,Hebert,Feltz Printing Service,28847 Johnson Grove,New Alexandra,Anchorage,CO,13536,(161)928-6667x9572,(092)996-1053x904,andreadavis@gmail.com,http://tucker.com/
4,Brandon,Smith,Printing Dimensions,7641 Elizabeth Rue,West Wesleyville,Butler,OK,86021,6826544031,830-986-1689x91233,scottcarter@yahoo.com,http://arnold.com/
...,...,...,...,...,...,...,...,...,...,...,...,...
495,Mitchell,Jimenez,Inner Label,0420 Susan Parkways,West Michael,Ada,TX,55123,650.479.4199x68630,4144551877,rebekah18@hotmail.com,https://www.mcdowell.com/
496,Luke,Oconnor,Hermar Inc,318 Bradley Junctions,South Josephfurt,Elkhart,RI,18926,001-982-775-3419x96186,658-840-3701x892,avillanueva@hotmail.com,http://www.dixon.biz/
497,David,Garza,Simonton Howe & Schneider Pc,30273 Ruiz Trail Suite 835,Kathleenville,Box Butte,GA,13897,339-852-1776,(011)424-5349,emmahood@hotmail.com,http://sheppard-jackson.com/
498,Kenneth,Johnson,Warehouse Office & Paper Prod,851 Moore Lodge,South Susan,King,MD,77842,(299)859-2397x52923,573-630-0237x90051,muellertimothy@yahoo.com,http://www.hendricks.com/


### Full Foia Contacts Dataset

In [7]:
full_foia_contacts = pd.read_csv('dataset/full-foia-contacts.csv')
full_foia_contacts.head(10)

Unnamed: 0,Agency,Department,Name,Title,Room Number,Street Address,City,State,Zip Code,Telephone,Fax,Email Address,Website,Online Request Form,Notes
0,Agricultural Marketing Service,Department of Agriculture,Gregory Bridges,FOIA Officer,"AG Stop 0202, Room 3521-S","1400 Independence Avenue, SW",Washington,DC,20250-0273,(202) 720-2498,(202) 690-3767,AMS.FOIA@USDA.gov,http://www.ams.usda.gov/about-ams/foia,,
1,Agricultural Marketing Service,Department of Agriculture,,FOIA Requester Service Center,,,,,,(202) 720-2498,,,,,
2,Agricultural Marketing Service,Department of Agriculture,William Allen,FOIA Public Liaison,"AG Stop 0202, Room 3521-S","1400 Independence Avenue, SW",Washington,DC,20250-0273,(202) 720-2498,,AMS.FOIA@USDA.gov,,,
3,Animal & Plant Health Inspection,Department of Agriculture,Tonya Woods,FOIA Director,Unit 50,4700 River Road,Riverdale,MD,20737-1232,(301) 851-4102,(301) 734-5941,mailto:tonya.g.woods@aphis.usda.gov,http://www.aphis.usda.gov/wps/portal/aphis/res...,http://www.aphis.usda.gov/wps/portal/aphis/res...,
4,Animal & Plant Health Inspection,Department of Agriculture,,FOIA Requester Service Center,,,,,,(301) 851-4102,(301) 734-5941,mailto:foia.officer@aphis.usda.gov,,,* Please mail requests to FOIA.Officer@aphis.u...
5,Animal & Plant Health Inspection,Department of Agriculture,Vacant,FOIA Public Liaison,,,,,,(301) 851-4102,(301) 734-5941,mailto:foia.officer@aphis.usda.gov,,,
6,Departmental Management (OSEC/OCIO/ FOIA Servi...,Department of Agriculture,Alexis R. Graves,Department FOIA Officer,Room 428-W,"1400 Independence Avenue, SW",Washington,DC,20250-0706,(202) 690-3318,(202) 690-0068,mailto:usdafoia@ocio.usda.gov,http://www.dm.usda.gov/foia.htm,,
7,Departmental Management (OSEC/OCIO/ FOIA Servi...,Department of Agriculture,Marqui Barnes,FOIA Requester Service Center,Room 428-W,1400,Washington,DC,20250-0706,(202)694-1802,,,,,
8,Departmental Management (OSEC/OCIO/ FOIA Servi...,Department of Agriculture,Ravoyne Payton,FOIA Public Liaison,Room 428-W,1400,Washington,DC,20250-0706,(202)690-0048,(202) 205-3755,usdafoia@ocio.usda.gov,,,
9,Farm Service Agency,Department of Agriculture,Kent Politsch,FOIA Officer,Stop 0506,"1400 Independence Avenue, SW",Washington,DC,20250,(202) 720-7163,(202) 720-2979,mailto:kent.politsch@wdc.usda.gov,http://www.fsa.usda.gov/FSA/webapp?area=newsro...,http://www.fsa.usda.gov/FSA/eFOIARequest?area=...,This office has additional FOIA contact inform...


In [8]:
full_foia_contacts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 747 entries, 0 to 746
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Agency               747 non-null    object
 1   Department           747 non-null    object
 2   Name                 510 non-null    object
 3   Title                747 non-null    object
 4   Room Number          248 non-null    object
 5   Street Address       272 non-null    object
 6   City                 282 non-null    object
 7   State                282 non-null    object
 8   Zip Code             279 non-null    object
 9   Telephone            732 non-null    object
 10  Fax                  277 non-null    object
 11  Email Address        304 non-null    object
 12  Website              254 non-null    object
 13  Online Request Form  93 non-null     object
 14  Notes                48 non-null     object
dtypes: object(15)
memory usage: 87.7+ KB


In this dataset, it is necessary to split the name into first_name and last_name columns. To do so, the function split_name() is used.

In [9]:
full_foia_contacts = split_name(full_foia_contacts, 'Name')

In [10]:
recognizer = NamedEntityRecognizer(full_foia_contacts)

In [25]:
recognizer.assign_entities_with_presidio()
recognizer.assign_entities_manually()
recognizer.assign_organization_entity_with_model()

In [26]:
recognizer.dict_global_entities

{'Agency': {'entity': 'ORGANIZATION', 'confidence_score': 0.19430272108843538},
 'Department': {'entity': 'ORGANIZATION',
  'confidence_score': 0.2891844997108155},
 'Title': {'entity': 'ORGANIZATION', 'confidence_score': 0.2054718034617532},
 'Room Number': {'entity': 'ORGANIZATION',
  'confidence_score': 0.22950819672131148},
 'Street Address': {'entity': 'ADDRESS',
  'confidence_score': 0.9119496855345912},
 'City': {'entity': 'LOCATION', 'confidence_score': 0.9540229885057471},
 'State': None,
 'Zip Code': {'entity': 'ZIPCODE', 'confidence_score': 1.0},
 'Telephone': {'entity': 'PHONE_NUMBER',
  'confidence_score': 0.9876543209876543},
 'Fax': {'entity': 'PHONE_NUMBER', 'confidence_score': 0.9898477157360406},
 'Email Address': {'entity': 'EMAIL_ADDRESS',
  'confidence_score': 0.9906542056074766},
 'Website': {'entity': 'URL', 'confidence_score': 1.0},
 'Online Request Form': {'entity': 'ORGANIZATION', 'confidence_score': 0.125},
 'Notes': {'entity': 'ORGANIZATION', 'confidence_sco

In [27]:
faker_generator = FakerGenerator(full_foia_contacts, recognizer.dict_global_entities)

In [28]:
faker_generator.get_faker_generation()

Column [1;32mStreet Address[0m synthesized with Faker.
Column [1;32mTelephone[0m synthesized with Faker.
Column [1;32mFax[0m synthesized with Faker.
Column [1;32mEmail Address[0m synthesized with Faker.
Column [1;32mfirst_name[0m synthesized with Faker.
Column [1;32mlast_name[0m synthesized with Faker.
Column [1;32mCity[0m synthesized with Faker.
Column [1;32mWebsite[0m synthesized with Faker.
Column [1;32mZip Code[0m synthesized with Faker.
Column [1;31mAgency[0m not synthesized with Faker.
Column [1;31mDepartment[0m not synthesized with Faker.
Column [1;31mTitle[0m not synthesized with Faker.
Column [1;31mRoom Number[0m not synthesized with Faker.
Column [1;31mOnline Request Form[0m not synthesized with Faker.
Column [1;31mNotes[0m not synthesized with Faker.


In [29]:
full_foia_contacts

Unnamed: 0,Agency,Department,Title,Room Number,Street Address,City,State,Zip Code,Telephone,Fax,Email Address,Website,Online Request Form,Notes,first_name,last_name
0,Agricultural Marketing Service,Department of Agriculture,FOIA Officer,"AG Stop 0202, Room 3521-S",96032 Kristin Streets Suite 121,Lauramouth,DC,58693,(220)246-0297,806-659-7329,sarahthompson@gmail.com,https://www.brown-keller.com/,,,Michael,Johnson
1,Agricultural Marketing Service,Department of Agriculture,FOIA Requester Service Center,,,,,,(038)771-3192x22712,,,,,,George,Lopez
2,Agricultural Marketing Service,Department of Agriculture,FOIA Public Liaison,"AG Stop 0202, Room 3521-S",3301 Roberts Shoal,South Melinda,DC,84268,1424694073,,ypeterson@hotmail.com,,,,Deborah,Taylor
3,Animal & Plant Health Inspection,Department of Agriculture,FOIA Director,Unit 50,220 Jennifer Turnpike,West Rita,MD,48360,(508)570-0805,633.692.5508,stevensmith@yahoo.com,https://www.williams.com/,http://www.aphis.usda.gov/wps/portal/aphis/res...,,Donald,Johnson
4,Animal & Plant Health Inspection,Department of Agriculture,FOIA Requester Service Center,,,,,,(899)365-5307x0705,020-673-6243,stacy39@gmail.com,,,* Please mail requests to FOIA.Officer@aphis.u...,Kendra,Johnson
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
742,Office of Small and Disadvantaged Business Uti...,Department of Veterans Affairs,FOIA Requester Service Center,,,,,,,,,,,,Robin,Stewart
743,Office of Small and Disadvantaged Business Uti...,Department of Veterans Affairs,FOIA Public Liaison,,,,,,,,,,,,Megan,Johnson
744,I don't know which office,Department of Veterans Affairs,FOIA Team Lead,(005R1C) VACO,794 Roth Track,North Debra,DC,84216,264-673-6991x53592,317.297.7950,,https://www.griffin-stewart.com/,,,Angelica,Medina
745,I don't know which office,Department of Veterans Affairs,FOIA Requester Service Center,,,,,,507-980-3891,,,,,,Debra,Reed
