## Named Entity Recognition and Faker PII generation

In [1]:
import pandas as pd

from nerpii.named_entity_recognizer import NamedEntityRecognizer, split_name
from nerpii.faker_generator import FakerGenerator

  from .autonotebook import tqdm as notebook_tqdm


### Personal Information Dataset

In [2]:
personal_info = pd.read_csv('dataset/PersonalInfo.csv')
personal_info.head(10)

Unnamed: 0,first_name,last_name,company_name,address,city,county,state,zip,phone1,phone2,email,web
0,James,Butt,"Benton, John B Jr",6649 N Blue Gum St,New Orleans,Orleans,LA,70116,504-621-8927,504-845-1427,jbutt@gmail.com,http://www.bentonjohnbjr.com
1,Josephine,Darakjy,"Chanay, Jeffrey A Esq",4 B Blue Ridge Blvd,Brighton,Livingston,MI,48116,810-292-9388,810-374-9840,josephine_darakjy@darakjy.org,http://www.chanayjeffreyaesq.com
2,Art,Venere,"Chemel, James L Cpa",8 W Cerritos Ave #54,Bridgeport,Gloucester,NJ,8014,856-636-8749,856-264-4130,art@venere.org,http://www.chemeljameslcpa.com
3,Lenna,Paprocki,Feltz Printing Service,639 Main St,Anchorage,Anchorage,AK,99501,907-385-4412,907-921-2010,lpaprocki@hotmail.com,http://www.feltzprintingservice.com
4,Donette,Foller,Printing Dimensions,34 Center St,Hamilton,Butler,OH,45011,513-570-1893,513-549-4561,donette.foller@cox.net,http://www.printingdimensions.com
5,Simona,Morasca,"Chapman, Ross E Esq",3 Mcauley Dr,Ashland,Ashland,OH,44805,419-503-2484,419-800-6759,simona@morasca.com,http://www.chapmanrosseesq.com
6,Mitsue,Tollner,Morlong Associates,7 Eads St,Chicago,Cook,IL,60632,773-573-6914,773-924-8565,mitsue_tollner@yahoo.com,http://www.morlongassociates.com
7,Leota,Dilliard,Commercial Press,7 W Jackson Blvd,San Jose,Santa Clara,CA,95111,408-752-3500,408-813-1105,leota@hotmail.com,http://www.commercialpress.com
8,Sage,Wieser,Truhlar And Truhlar Attys,5 Boston Ave #88,Sioux Falls,Minnehaha,SD,57105,605-414-2147,605-794-4895,sage_wieser@cox.net,http://www.truhlarandtruhlarattys.com
9,Kris,Marrier,"King, Christopher A Esq",228 Runamuck Pl #2808,Baltimore,Baltimore City,MD,21224,410-655-8723,410-804-4694,kris@gmail.com,http://www.kingchristopheraesq.com


Create a NamedEntityRecognizer

In [20]:
recognizer = NamedEntityRecognizer(personal_info)

The functions below try to assign different named entities to the columns of the dataset.

In [21]:
recognizer.assign_entities_with_presidio()
recognizer.assign_entities_manually()
recognizer.assign_organization_entity_with_model()

In [22]:
recognizer.dict_global_entities

{'first_name': {'entity': 'PERSON', 'confidence_score': 0.9127725856697819},
 'last_name': {'entity': 'PERSON', 'confidence_score': 0.8625},
 'company_name': {'entity': 'PERSON', 'confidence_score': 0.9096385542168675},
 'address': {'entity': 'ADDRESS', 'confidence_score': 0.8926174496644296},
 'city': {'entity': 'LOCATION', 'confidence_score': 0.8731343283582089},
 'county': {'entity': 'LOCATION', 'confidence_score': 0.7171717171717171},
 'state': {'entity': 'LOCATION', 'confidence_score': 0.976},
 'zip': {'entity': 'ZIPCODE', 'confidence_score': 1.0},
 'phone1': {'entity': 'PHONE_NUMBER', 'confidence_score': 0.888},
 'phone2': {'entity': 'PHONE_NUMBER', 'confidence_score': 0.918},
 'email': {'entity': 'EMAIL_ADDRESS', 'confidence_score': 1.0},
 'web': {'entity': 'URL', 'confidence_score': 0.998}}

Create a FakerGenerator to synthetize PII 

In [23]:
faker_generator = FakerGenerator(personal_info, recognizer.dict_global_entities)

In [24]:
faker_generator.get_faker_generation()

Column [1;32maddress[0m synthesized with Faker.
Column [1;32mphone1[0m synthesized with Faker.
Column [1;32mphone2[0m synthesized with Faker.
Column [1;32memail[0m synthesized with Faker.
Column [1;32mfirst_name[0m synthesized with Faker.
Column [1;32mlast_name[0m synthesized with Faker.
Column [1;32mcity[0m synthesized with Faker.
Column [1;32mstate[0m synthesized with Faker.
Column [1;32mweb[0m synthesized with Faker.
Column [1;32mzip[0m synthesized with Faker.
Column [1;31mcompany_name[0m not synthesized with Faker.
Column [1;31mcounty[0m not synthesized with Faker.


In [25]:
personal_info

Unnamed: 0,first_name,last_name,company_name,address,city,county,state,zip,phone1,phone2,email,web
0,John,Rivera,"Benton, John B Jr",56497 Emily Shoal Suite 896,Port Tanya,Orleans,LA,23465,104-257-2621x36609,(161)969-4904,holly42@gmail.com,http://www.lowery-wheeler.com/
1,Carol,Padilla,"Chanay, Jeffrey A Esq",5071 Meyers Ranch,North Emily,Livingston,OR,83672,+1-643-108-6119x726,701-535-4431x324,gordonrogers@gmail.com,http://www.ramos.com/
2,Andrew,Salazar,"Chemel, James L Cpa",40719 Nguyen Mill Suite 000,West Chadshire,Gloucester,MI,82238,711.108.4721,6353529058,emoore@hotmail.com,http://gutierrez-carroll.com/
3,Brianna,Gardner,Feltz Printing Service,8283 Arnold Groves,West Katelynburgh,Anchorage,AR,11994,(356)823-9517x098,+1-362-185-8717x0337,sarabuchanan@hotmail.com,http://www.thompson.com/
4,Donna,Porter,Printing Dimensions,74662 Pope Pass,North Lawrencefort,Butler,CA,14089,732-991-9930x4767,+1-399-537-1395x14353,padillakevin@yahoo.com,http://www.johnson-ferrell.com/
...,...,...,...,...,...,...,...,...,...,...,...,...
495,Danielle,Roberts,Inner Label,57810 Maxwell Ridge,Lake Davidtown,Ada,FL,87512,001-796-274-7530x3261,001-618-189-7079x26873,marcdavis@hotmail.com,http://ortiz-moore.biz/
496,David,Gibson,Hermar Inc,539 Whitney Tunnel,West Joshua,Elkhart,HI,14954,+1-189-224-8186x6094,639-344-8005,nichole05@gmail.com,http://www.hernandez-dominguez.net/
497,Stacy,George,Simonton Howe & Schneider Pc,201 Moore Glen,Riceburgh,Box Butte,CT,76852,873.023.9243,976-276-0919,kcooke@hotmail.com,https://patterson.biz/
498,David,Perez,Warehouse Office & Paper Prod,21832 Alexis Extensions,Jasmineborough,King,IN,46084,(741)078-3131x17379,+1-005-204-6847x855,davidclark@yahoo.com,http://www.lewis-anderson.com/


### Full Foia Contacts Dataset

In [2]:
full_foia_contacts = pd.read_csv('dataset/full-foia-contacts.csv')
full_foia_contacts.head(10)

Unnamed: 0,Agency,Department,Name,Title,Room Number,Street Address,City,State,Zip Code,Telephone,Fax,Email Address,Website,Online Request Form,Notes
0,Agricultural Marketing Service,Department of Agriculture,Gregory Bridges,FOIA Officer,"AG Stop 0202, Room 3521-S","1400 Independence Avenue, SW",Washington,DC,20250-0273,(202) 720-2498,(202) 690-3767,AMS.FOIA@USDA.gov,http://www.ams.usda.gov/about-ams/foia,,
1,Agricultural Marketing Service,Department of Agriculture,,FOIA Requester Service Center,,,,,,(202) 720-2498,,,,,
2,Agricultural Marketing Service,Department of Agriculture,William Allen,FOIA Public Liaison,"AG Stop 0202, Room 3521-S","1400 Independence Avenue, SW",Washington,DC,20250-0273,(202) 720-2498,,AMS.FOIA@USDA.gov,,,
3,Animal & Plant Health Inspection,Department of Agriculture,Tonya Woods,FOIA Director,Unit 50,4700 River Road,Riverdale,MD,20737-1232,(301) 851-4102,(301) 734-5941,mailto:tonya.g.woods@aphis.usda.gov,http://www.aphis.usda.gov/wps/portal/aphis/res...,http://www.aphis.usda.gov/wps/portal/aphis/res...,
4,Animal & Plant Health Inspection,Department of Agriculture,,FOIA Requester Service Center,,,,,,(301) 851-4102,(301) 734-5941,mailto:foia.officer@aphis.usda.gov,,,* Please mail requests to FOIA.Officer@aphis.u...
5,Animal & Plant Health Inspection,Department of Agriculture,Vacant,FOIA Public Liaison,,,,,,(301) 851-4102,(301) 734-5941,mailto:foia.officer@aphis.usda.gov,,,
6,Departmental Management (OSEC/OCIO/ FOIA Servi...,Department of Agriculture,Alexis R. Graves,Department FOIA Officer,Room 428-W,"1400 Independence Avenue, SW",Washington,DC,20250-0706,(202) 690-3318,(202) 690-0068,mailto:usdafoia@ocio.usda.gov,http://www.dm.usda.gov/foia.htm,,
7,Departmental Management (OSEC/OCIO/ FOIA Servi...,Department of Agriculture,Marqui Barnes,FOIA Requester Service Center,Room 428-W,1400,Washington,DC,20250-0706,(202)694-1802,,,,,
8,Departmental Management (OSEC/OCIO/ FOIA Servi...,Department of Agriculture,Ravoyne Payton,FOIA Public Liaison,Room 428-W,1400,Washington,DC,20250-0706,(202)690-0048,(202) 205-3755,usdafoia@ocio.usda.gov,,,
9,Farm Service Agency,Department of Agriculture,Kent Politsch,FOIA Officer,Stop 0506,"1400 Independence Avenue, SW",Washington,DC,20250,(202) 720-7163,(202) 720-2979,mailto:kent.politsch@wdc.usda.gov,http://www.fsa.usda.gov/FSA/webapp?area=newsro...,http://www.fsa.usda.gov/FSA/eFOIARequest?area=...,This office has additional FOIA contact inform...


In [3]:
full_foia_contacts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 747 entries, 0 to 746
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Agency               747 non-null    object
 1   Department           747 non-null    object
 2   Name                 510 non-null    object
 3   Title                747 non-null    object
 4   Room Number          248 non-null    object
 5   Street Address       272 non-null    object
 6   City                 282 non-null    object
 7   State                282 non-null    object
 8   Zip Code             279 non-null    object
 9   Telephone            732 non-null    object
 10  Fax                  277 non-null    object
 11  Email Address        304 non-null    object
 12  Website              254 non-null    object
 13  Online Request Form  93 non-null     object
 14  Notes                48 non-null     object
dtypes: object(15)
memory usage: 87.7+ KB


In this dataset, it is necessary to split the name into first_name and last_name columns. To do so, the function split_name() is used.

In [4]:
full_foia_contacts = split_name(full_foia_contacts, 'Name')

In [17]:
recognizer = NamedEntityRecognizer(full_foia_contacts)

In [18]:
recognizer.assign_entities_with_presidio()
recognizer.assign_entities_manually()
recognizer.assign_organization_entity_with_model()

In [19]:
recognizer.dict_global_entities

{'Agency': {'entity': 'ORGANIZATION', 'confidence_score': 0.2039268183846497},
 'Department': {'entity': 'ORGANIZATION',
  'confidence_score': 0.2898550724637681},
 'Title': {'entity': 'ORGANIZATION', 'confidence_score': 0.20320265046935396},
 'Room Number': {'entity': 'ORGANIZATION',
  'confidence_score': 0.23961661341853036},
 'Street Address': None,
 'City': {'entity': 'LOCATION', 'confidence_score': 0.9526627218934911},
 'State': None,
 'Zip Code': {'entity': 'ZIPCODE', 'confidence_score': 1.0},
 'Telephone': {'entity': 'PHONE_NUMBER',
  'confidence_score': 0.9918367346938776},
 'Fax': {'entity': 'PHONE_NUMBER', 'confidence_score': 0.983957219251337},
 'Email Address': {'entity': 'EMAIL_ADDRESS',
  'confidence_score': 0.985981308411215},
 'Website': {'entity': 'URL', 'confidence_score': 1.0},
 'Online Request Form': {'entity': 'ORGANIZATION', 'confidence_score': 0.2},
 'Notes': {'entity': 'ORGANIZATION', 'confidence_score': 0.5357142857142857},
 'first_name': {'entity': 'PERSON', '

In [8]:
faker_generator = FakerGenerator(full_foia_contacts, recognizer.dict_global_entities)

In [9]:
faker_generator.get_faker_generation()

TypeError: FakerGenerator.get_address.<locals>.<lambda>() got an unexpected keyword argument 'axis'

In [10]:
faker_generator.dataset

Unnamed: 0,Agency,Department,Title,Room Number,Street Address,City,State,Zip Code,Telephone,Fax,Email Address,Website,Online Request Form,Notes,first_name,last_name
0,Agricultural Marketing Service,Department of Agriculture,FOIA Officer,"AG Stop 0202, Room 3521-S",38092 Walker Plaza Suite 518,Patriciafort,DC,66606,2402950205,(424)400-4806,amcknight@hotmail.com,http://garza-leon.com/,,,David,Wilson
1,Agricultural Marketing Service,Department of Agriculture,FOIA Requester Service Center,,723 Kristen Track Suite 725,Erinhaven,,23502,051.923.9351x0075,+1-598-231-6561x4879,dprice@gmail.com,http://www.nichols.biz/,,,Caitlin,Branch
2,Agricultural Marketing Service,Department of Agriculture,FOIA Public Liaison,"AG Stop 0202, Room 3521-S",601 Moore Squares Suite 024,Heidibury,DC,95880,576.594.9500x871,001-986-340-6537x722,bchavez@gmail.com,http://www.potter.com/,,,Daniel,Davis
3,Animal & Plant Health Inspection,Department of Agriculture,FOIA Director,Unit 50,014 Anderson Springs,Hernandezburgh,MD,18509,013.212.3182,191-516-7710x7130,david14@yahoo.com,https://wiley.net/,http://www.aphis.usda.gov/wps/portal/aphis/res...,,Melissa,Horn
4,Animal & Plant Health Inspection,Department of Agriculture,FOIA Requester Service Center,,38117 Grace Causeway Apt. 912,Stephenfurt,,04301,456-025-3942,173-492-5388,smorton@hotmail.com,http://www.melton-james.biz/,,* Please mail requests to FOIA.Officer@aphis.u...,Laura,Sanders
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
742,Office of Small and Disadvantaged Business Uti...,Department of Veterans Affairs,FOIA Requester Service Center,,261 Adam Point Suite 589,Lake Shari,,04841,001-929-359-1527x31377,(935)778-7315x1647,donnaneal@gmail.com,https://www.riley.biz/,,,John,Andrews
743,Office of Small and Disadvantaged Business Uti...,Department of Veterans Affairs,FOIA Public Liaison,,825 White Pass,New Sean,,17268,+1-258-631-0298x104,+1-015-846-3059x2461,zjones@gmail.com,http://www.schneider.com/,,,Jeffrey,Sanchez
744,I don't know which office,Department of Veterans Affairs,FOIA Team Lead,(005R1C) VACO,6831 Rogers Brooks,Bondchester,DC,73170,001-323-760-9977x0252,725-158-3087,michael83@gmail.com,https://www.martin.com/,,,Deanna,Jarvis
745,I don't know which office,Department of Veterans Affairs,FOIA Requester Service Center,,794 Collins Forge Apt. 683,Lake David,,06889,(816)226-9631,9490347981,willietrevino@yahoo.com,https://www.castillo-clark.com/,,,Ryan,Ortega


In [16]:
faker_generator.dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 747 entries, 0 to 746
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Agency               747 non-null    object
 1   Department           747 non-null    object
 2   Title                747 non-null    object
 3   Room Number          248 non-null    object
 4   Street Address       747 non-null    object
 5   City                 747 non-null    object
 6   State                282 non-null    object
 7   Zip Code             747 non-null    object
 8   Telephone            747 non-null    object
 9   Fax                  747 non-null    object
 10  Email Address        747 non-null    object
 11  Website              747 non-null    object
 12  Online Request Form  93 non-null     object
 13  Notes                48 non-null     object
 14  first_name           747 non-null    object
 15  last_name            747 non-null    object
dtypes: objec

In [4]:
full_foia_contacts['Name'] = full_foia_contacts['first_name'] + ' ' + full_foia_contacts['last_name']

In [5]:
del full_foia_contacts['first_name']
del full_foia_contacts['last_name']

In [6]:
full_foia_contacts

Unnamed: 0,Agency,Department,Title,Room Number,Street Address,City,State,Zip Code,Telephone,Fax,Email Address,Website,Online Request Form,Notes,Name
0,Agricultural Marketing Service,Department of Agriculture,FOIA Officer,"AG Stop 0202, Room 3521-S","1400 Independence Avenue, SW",Washington,DC,20250-0273,(202) 720-2498,(202) 690-3767,AMS.FOIA@USDA.gov,http://www.ams.usda.gov/about-ams/foia,,,Gregory Bridges
1,Agricultural Marketing Service,Department of Agriculture,FOIA Requester Service Center,,,,,,(202) 720-2498,,,,,,- -
2,Agricultural Marketing Service,Department of Agriculture,FOIA Public Liaison,"AG Stop 0202, Room 3521-S","1400 Independence Avenue, SW",Washington,DC,20250-0273,(202) 720-2498,,AMS.FOIA@USDA.gov,,,,William Allen
3,Animal & Plant Health Inspection,Department of Agriculture,FOIA Director,Unit 50,4700 River Road,Riverdale,MD,20737-1232,(301) 851-4102,(301) 734-5941,mailto:tonya.g.woods@aphis.usda.gov,http://www.aphis.usda.gov/wps/portal/aphis/res...,http://www.aphis.usda.gov/wps/portal/aphis/res...,,Tonya Woods
4,Animal & Plant Health Inspection,Department of Agriculture,FOIA Requester Service Center,,,,,,(301) 851-4102,(301) 734-5941,mailto:foia.officer@aphis.usda.gov,,,* Please mail requests to FOIA.Officer@aphis.u...,- -
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
742,Office of Small and Disadvantaged Business Uti...,Department of Veterans Affairs,FOIA Requester Service Center,,,,,,,,,,,,- -
743,Office of Small and Disadvantaged Business Uti...,Department of Veterans Affairs,FOIA Public Liaison,,,,,,,,,,,,James Horan
744,I don't know which office,Department of Veterans Affairs,FOIA Team Lead,(005R1C) VACO,"810 Vermont Avenue, NW",Washington,DC,20420,(202) 632-7465,(202) 632-7581,,http://www.foia.va.gov/,,,Laurie Karnay
745,I don't know which office,Department of Veterans Affairs,FOIA Requester Service Center,,,,,,(202) 632-7600,,,,,,- -
