## Named Entity Recognition and Faker PII generation

In [1]:
import pandas as pd

from nerpii.named_entity_recognizer import NamedEntityRecognizer, split_name
from nerpii.faker_generator import FakerGenerator

  from .autonotebook import tqdm as notebook_tqdm


### Personal Information Dataset

In [2]:
# Load the PersonalInfo sample CSV; the path is relative to the notebook's
# working directory.
personal_info = pd.read_csv('dataset/PersonalInfo.csv')
# Preview the first 10 rows rather than the whole frame.
personal_info.head(10)

Unnamed: 0,first_name,last_name,company_name,address,city,county,state,zip,phone1,phone2,email,web
0,James,Butt,"Benton, John B Jr",6649 N Blue Gum St,New Orleans,Orleans,LA,70116,504-621-8927,504-845-1427,jbutt@gmail.com,http://www.bentonjohnbjr.com
1,Josephine,Darakjy,"Chanay, Jeffrey A Esq",4 B Blue Ridge Blvd,Brighton,Livingston,MI,48116,810-292-9388,810-374-9840,josephine_darakjy@darakjy.org,http://www.chanayjeffreyaesq.com
2,Art,Venere,"Chemel, James L Cpa",8 W Cerritos Ave #54,Bridgeport,Gloucester,NJ,8014,856-636-8749,856-264-4130,art@venere.org,http://www.chemeljameslcpa.com
3,Lenna,Paprocki,Feltz Printing Service,639 Main St,Anchorage,Anchorage,AK,99501,907-385-4412,907-921-2010,lpaprocki@hotmail.com,http://www.feltzprintingservice.com
4,Donette,Foller,Printing Dimensions,34 Center St,Hamilton,Butler,OH,45011,513-570-1893,513-549-4561,donette.foller@cox.net,http://www.printingdimensions.com
5,Simona,Morasca,"Chapman, Ross E Esq",3 Mcauley Dr,Ashland,Ashland,OH,44805,419-503-2484,419-800-6759,simona@morasca.com,http://www.chapmanrosseesq.com
6,Mitsue,Tollner,Morlong Associates,7 Eads St,Chicago,Cook,IL,60632,773-573-6914,773-924-8565,mitsue_tollner@yahoo.com,http://www.morlongassociates.com
7,Leota,Dilliard,Commercial Press,7 W Jackson Blvd,San Jose,Santa Clara,CA,95111,408-752-3500,408-813-1105,leota@hotmail.com,http://www.commercialpress.com
8,Sage,Wieser,Truhlar And Truhlar Attys,5 Boston Ave #88,Sioux Falls,Minnehaha,SD,57105,605-414-2147,605-794-4895,sage_wieser@cox.net,http://www.truhlarandtruhlarattys.com
9,Kris,Marrier,"King, Christopher A Esq",228 Runamuck Pl #2808,Baltimore,Baltimore City,MD,21224,410-655-8723,410-804-4694,kris@gmail.com,http://www.kingchristopheraesq.com


Create a NamedEntityRecognizer

In [3]:
# Wrap the loaded DataFrame in a recognizer; entity assignment happens in the
# calls made on this object in the following cell.
recognizer = NamedEntityRecognizer(personal_info)

The functions below try to assign different named entities to the columns of the dataset.

In [4]:
# Run the three entity-assignment passes on the recognizer, one after another.
# NOTE(review): the passes presumably build on each other, so the call order
# likely matters — confirm against the nerpii documentation before reordering.
recognizer.assign_entities_with_presidio()
recognizer.assign_entities_manually()
recognizer.assign_organization_entity_with_model()

In [5]:
# Inspect the per-column result: each column name maps to a dict holding the
# assigned 'entity' and its 'confidence_score'.
recognizer.dict_global_entities

{'first_name': {'entity': 'PERSON', 'confidence_score': 0.9127725856697819},
 'last_name': {'entity': 'PERSON', 'confidence_score': 0.8625},
 'company_name': {'entity': 'PERSON', 'confidence_score': 0.9096385542168675},
 'address': {'entity': 'ADDRESS', 'confidence_score': 0.8926174496644296},
 'city': {'entity': 'LOCATION', 'confidence_score': 0.8731343283582089},
 'county': {'entity': 'LOCATION', 'confidence_score': 0.7171717171717171},
 'state': {'entity': 'LOCATION', 'confidence_score': 0.976},
 'zip': {'entity': 'ZIPCODE', 'confidence_score': 1.0},
 'phone1': {'entity': 'PHONE_NUMBER', 'confidence_score': 0.888},
 'phone2': {'entity': 'PHONE_NUMBER', 'confidence_score': 0.918},
 'email': {'entity': 'EMAIL_ADDRESS', 'confidence_score': 1.0},
 'web': {'entity': 'URL', 'confidence_score': 0.998}}

Create a FakerGenerator to synthesize PII

In [6]:
# Build the generator from the recognizer's dataset and its column -> entity
# mapping.
faker_generator = FakerGenerator(recognizer.dataset, recognizer.dict_global_entities)

In [7]:
# Run the generation; it prints one status line per column saying whether the
# column was synthesized or not.
faker_generator.get_faker_generation()

Column address synthesized.
Column phone1 synthesized.
Column phone2 synthesized.
Column email synthesized.
Column first_name synthesized.
Column last_name synthesized.
Column city synthesized.
Column state synthesized.
Column web synthesized.
Column zip synthesized.
Column company_name not synthesized.
Column county not synthesized.


In [8]:
# Display the generator's dataset after generation.
# NOTE(review): this displays the whole frame, while the loading cells preview
# with .head(10) — consider doing the same here for consistency.
faker_generator.dataset

Unnamed: 0,first_name,last_name,company_name,address,city,county,state,zip,phone1,phone2,email,web,first_name_gender
380,Stacy,Parker,"Soward, Anne Esq",074 Jones Meadow Apt. 545,New Ashley,Cook,CO,96371,001-014-355-1017x2834,001-635-202-0683,tiffany68@gmail.com,http://www.green.net/,female
202,Tiffany,Charles,Art Crafters,30270 Rodriguez Unions Apt. 910,New Danielshire,Miami-Dade,AR,11104,121.382.9339x18378,001-279-279-1665,vkeller@yahoo.com,https://www.walls.com/,female
122,Sydney,Tate,Biltmore Investors Bank,79076 Henry Radial Suite 559,Lake Jason,San Bernardino,WA,92946,(344)681-7171,105.099.2307,annette90@gmail.com,http://richmond-merritt.com/,female
249,Courtney,Robinson,Peace Christian Center,7699 Tara Parks,Manningstad,Somerset,ND,66575,+1-678-004-7105,109-545-8100x0649,victoria35@hotmail.com,https://www.roberts.info/,female
7,Alexander,Hansen,Commercial Press,7799 Ellis Corners Suite 984,Luceroborough,Santa Clara,MP,59209,060-896-0291x1925,883.657.4482x08355,levirose@gmail.com,http://williams-nielsen.com/,male
...,...,...,...,...,...,...,...,...,...,...,...,...,...
123,Kyle,Turner,Desco Equipment Corp,904 Christopher Lane,Michellemouth,Alameda,NE,64687,(822)727-0052x43236,2011141243,randall83@yahoo.com,https://smith.biz/,male
188,Sheila,Randall,"Nischwitz, Jeffrey L Esq",540 Rogers Stravenue Apt. 094,Amberstad,Hudson,AR,08209,001-412-055-7165,3546966868,howardtammy@gmail.com,https://gonzalez.org/,female
27,Shane,Mann,"Sider, Donald C Esq",298 Stanley Islands,Andreabury,Talbot,OR,72668,(913)955-4710,001-507-651-8332x30584,frankdawn@yahoo.com,https://camacho-white.com/,unknown
119,Loretta,Oneill,Custom Engineering Inc,67738 Rachel Harbors Apt. 295,Annemouth,Suffolk,VT,48030,282-701-4420,(751)036-6950x62590,amandawashington@hotmail.com,https://www.figueroa-stephenson.com/,female


### Full FOIA Contacts Dataset

In [10]:
# Load the FOIA contacts CSV; the path is relative to the notebook's working
# directory.
full_foia_contacts = pd.read_csv('dataset/full-foia-contacts.csv')
# Preview the first 10 rows rather than the whole frame.
full_foia_contacts.head(10)

Unnamed: 0,Agency,Department,Name,Title,Room Number,Street Address,City,State,Zip Code,Telephone,Fax,Email Address,Website,Online Request Form,Notes
0,Agricultural Marketing Service,Department of Agriculture,Gregory Bridges,FOIA Officer,"AG Stop 0202, Room 3521-S","1400 Independence Avenue, SW",Washington,DC,20250-0273,(202) 720-2498,(202) 690-3767,AMS.FOIA@USDA.gov,http://www.ams.usda.gov/about-ams/foia,,
1,Agricultural Marketing Service,Department of Agriculture,,FOIA Requester Service Center,,,,,,(202) 720-2498,,,,,
2,Agricultural Marketing Service,Department of Agriculture,William Allen,FOIA Public Liaison,"AG Stop 0202, Room 3521-S","1400 Independence Avenue, SW",Washington,DC,20250-0273,(202) 720-2498,,AMS.FOIA@USDA.gov,,,
3,Animal & Plant Health Inspection,Department of Agriculture,Tonya Woods,FOIA Director,Unit 50,4700 River Road,Riverdale,MD,20737-1232,(301) 851-4102,(301) 734-5941,mailto:tonya.g.woods@aphis.usda.gov,http://www.aphis.usda.gov/wps/portal/aphis/res...,http://www.aphis.usda.gov/wps/portal/aphis/res...,
4,Animal & Plant Health Inspection,Department of Agriculture,,FOIA Requester Service Center,,,,,,(301) 851-4102,(301) 734-5941,mailto:foia.officer@aphis.usda.gov,,,* Please mail requests to FOIA.Officer@aphis.u...
5,Animal & Plant Health Inspection,Department of Agriculture,Vacant,FOIA Public Liaison,,,,,,(301) 851-4102,(301) 734-5941,mailto:foia.officer@aphis.usda.gov,,,
6,Departmental Management (OSEC/OCIO/ FOIA Servi...,Department of Agriculture,Alexis R. Graves,Department FOIA Officer,Room 428-W,"1400 Independence Avenue, SW",Washington,DC,20250-0706,(202) 690-3318,(202) 690-0068,mailto:usdafoia@ocio.usda.gov,http://www.dm.usda.gov/foia.htm,,
7,Departmental Management (OSEC/OCIO/ FOIA Servi...,Department of Agriculture,Marqui Barnes,FOIA Requester Service Center,Room 428-W,1400,Washington,DC,20250-0706,(202)694-1802,,,,,
8,Departmental Management (OSEC/OCIO/ FOIA Servi...,Department of Agriculture,Ravoyne Payton,FOIA Public Liaison,Room 428-W,1400,Washington,DC,20250-0706,(202)690-0048,(202) 205-3755,usdafoia@ocio.usda.gov,,,
9,Farm Service Agency,Department of Agriculture,Kent Politsch,FOIA Officer,Stop 0506,"1400 Independence Avenue, SW",Washington,DC,20250,(202) 720-7163,(202) 720-2979,mailto:kent.politsch@wdc.usda.gov,http://www.fsa.usda.gov/FSA/webapp?area=newsro...,http://www.fsa.usda.gov/FSA/eFOIARequest?area=...,This office has additional FOIA contact inform...


In this dataset, the `Name` column must be split into `first_name` and `last_name` columns. The `split_name()` function does this.

In [11]:
# Split the 'Name' column using split_name() and keep the result under the
# same variable name.
# NOTE(review): because the input name is overwritten, re-running this cell
# alone feeds the already-split frame back into split_name(); re-run the
# loading cell first. Confirm how split_name() treats a missing 'Name' column.
full_foia_contacts = split_name(full_foia_contacts, 'Name')

In [12]:
# Rebind `recognizer` to a new recognizer over the FOIA contacts frame; the
# instance created earlier for the PersonalInfo data is no longer reachable
# through this name.
recognizer = NamedEntityRecognizer(full_foia_contacts)

In [13]:
# Run the same three entity-assignment passes, in the same order as for the
# first dataset.
# NOTE(review): the call order presumably matters — confirm against the nerpii
# documentation before reordering.
recognizer.assign_entities_with_presidio()
recognizer.assign_entities_manually()
recognizer.assign_organization_entity_with_model()

In [14]:
# Inspect the per-column result. A column maps either to a dict holding
# 'entity' and 'confidence_score', or to None (seen here for some columns).
recognizer.dict_global_entities

{'Agency': {'entity': 'ORGANIZATION', 'confidence_score': 0.20094868477792152},
 'Department': {'entity': 'ORGANIZATION',
  'confidence_score': 0.2871912693854107},
 'Title': {'entity': 'ORGANIZATION', 'confidence_score': 0.2067648663393344},
 'Room Number': {'entity': 'ORGANIZATION',
  'confidence_score': 0.23247232472324722},
 'Street Address': {'entity': 'ADDRESS',
  'confidence_score': 0.9337748344370861},
 'City': {'entity': 'LOCATION', 'confidence_score': 0.9532163742690059},
 'State': None,
 'Zip Code': {'entity': 'ZIPCODE', 'confidence_score': 1.0},
 'Telephone': {'entity': 'PHONE_NUMBER',
  'confidence_score': 0.9897750511247444},
 'Fax': {'entity': 'PHONE_NUMBER', 'confidence_score': 0.973404255319149},
 'Email Address': {'entity': 'EMAIL_ADDRESS',
  'confidence_score': 0.9854368932038835},
 'Website': {'entity': 'URL', 'confidence_score': 1.0},
 'Online Request Form': None,
 'Notes': {'entity': 'ORGANIZATION', 'confidence_score': 0.5511811023622047},
 'first_name': {'entity'

In [15]:
# Rebind `faker_generator` to a new generator built from the FOIA recognizer's
# dataset and its column -> entity mapping.
faker_generator = FakerGenerator(recognizer.dataset, recognizer.dict_global_entities)

In [16]:
# Run the generation for the FOIA data; it prints one status line per column
# saying whether the column was synthesized or not.
faker_generator.get_faker_generation()

Column Street Address synthesized.
Column Telephone synthesized.
Column Fax synthesized.
Column Email Address synthesized.
Column first_name synthesized.
Column last_name synthesized.
Column City synthesized.
Column Website synthesized.
Column Zip Code synthesized.
Column Agency not synthesized.
Column Department not synthesized.
Column Title not synthesized.
Column Room Number not synthesized.
Column Notes not synthesized.
