### Named Entity Recognition and Faker PII generation

In [1]:
import pandas as pd

from clearbox_ner.named_entity_recognizer import NamedEntityRecognizer, split_name
from clearbox_ner.faker_generator import FakerGenerator

In [13]:
# Load the sample personal-information table and preview its first 10 rows.
# NOTE(review): the preview below shows an 'Unnamed: 0' column and a 4-digit
# zip (8014) for an NJ row, which suggests the CSV carries a saved index and
# that zip codes are parsed as integers — confirm whether index_col=0 and
# dtype={'zip': str} are wanted here.
personal_info = pd.read_csv('dataset/PersonalInfo.csv')
personal_info.head(10)

Unnamed: 0,first_name,last_name,company_name,address,city,county,state,zip,phone1,phone2,email,web
0,James,Butt,"Benton, John B Jr",6649 N Blue Gum St,New Orleans,Orleans,LA,70116,504-621-8927,504-845-1427,jbutt@gmail.com,http://www.bentonjohnbjr.com
1,Josephine,Darakjy,"Chanay, Jeffrey A Esq",4 B Blue Ridge Blvd,Brighton,Livingston,MI,48116,810-292-9388,810-374-9840,josephine_darakjy@darakjy.org,http://www.chanayjeffreyaesq.com
2,Art,Venere,"Chemel, James L Cpa",8 W Cerritos Ave #54,Bridgeport,Gloucester,NJ,8014,856-636-8749,856-264-4130,art@venere.org,http://www.chemeljameslcpa.com
3,Lenna,Paprocki,Feltz Printing Service,639 Main St,Anchorage,Anchorage,AK,99501,907-385-4412,907-921-2010,lpaprocki@hotmail.com,http://www.feltzprintingservice.com
4,Donette,Foller,Printing Dimensions,34 Center St,Hamilton,Butler,OH,45011,513-570-1893,513-549-4561,donette.foller@cox.net,http://www.printingdimensions.com
5,Simona,Morasca,"Chapman, Ross E Esq",3 Mcauley Dr,Ashland,Ashland,OH,44805,419-503-2484,419-800-6759,simona@morasca.com,http://www.chapmanrosseesq.com
6,Mitsue,Tollner,Morlong Associates,7 Eads St,Chicago,Cook,IL,60632,773-573-6914,773-924-8565,mitsue_tollner@yahoo.com,http://www.morlongassociates.com
7,Leota,Dilliard,Commercial Press,7 W Jackson Blvd,San Jose,Santa Clara,CA,95111,408-752-3500,408-813-1105,leota@hotmail.com,http://www.commercialpress.com
8,Sage,Wieser,Truhlar And Truhlar Attys,5 Boston Ave #88,Sioux Falls,Minnehaha,SD,57105,605-414-2147,605-794-4895,sage_wieser@cox.net,http://www.truhlarandtruhlarattys.com
9,Kris,Marrier,"King, Christopher A Esq",228 Runamuck Pl #2808,Baltimore,Baltimore City,MD,21224,410-655-8723,410-804-4694,kris@gmail.com,http://www.kingchristopheraesq.com


Create a NamedEntityRecognizer

In [15]:
# Build a recognizer around the loaded personal-information DataFrame;
# the entity-assignment calls that follow are made on this object.
recognizer = NamedEntityRecognizer(personal_info)

The functions below try to assign different named entities to the columns of the dataset.

In [16]:
# Run the three entity-assignment passes in sequence. Their combined result
# is read back from recognizer.dict_global_entities afterwards.
# NOTE(review): judging by the method names these are a Presidio-based pass,
# a manual/rule-based pass and a model-based ORGANIZATION pass — confirm
# whether the call order matters (e.g. later passes overriding earlier ones).
recognizer.assign_entities_with_presidio()
recognizer.assign_entities_manually()
recognizer.assign_organization_entity_with_model()

In [17]:
# Inspect the per-column result: a dict keyed by column name whose values
# hold the detected 'entity' label and its 'confidence_score'.
recognizer.dict_global_entities

{'first_name': {'entity': 'PERSON', 'confidence_score': 0.9127725856697819},
 'last_name': {'entity': 'PERSON', 'confidence_score': 0.8625},
 'company_name': {'entity': 'PERSON', 'confidence_score': 0.9096385542168675},
 'address': {'entity': 'ADDRESS', 'confidence_score': 0.8926174496644296},
 'city': {'entity': 'LOCATION', 'confidence_score': 0.8731343283582089},
 'county': {'entity': 'LOCATION', 'confidence_score': 0.7171717171717171},
 'state': {'entity': 'LOCATION', 'confidence_score': 0.976},
 'zip': {'entity': 'ZIPCODE', 'confidence_score': 1.0},
 'phone1': {'entity': 'PHONE_NUMBER', 'confidence_score': 0.888},
 'phone2': {'entity': 'PHONE_NUMBER', 'confidence_score': 0.918},
 'email': {'entity': 'EMAIL_ADDRESS', 'confidence_score': 1.0},
 'web': {'entity': 'URL', 'confidence_score': 0.998}}

Create a FakerGenerator to synthesize PII

In [18]:
# Build the generator from the recognizer's dataset and its
# column -> entity mapping.
# NOTE(review): no random seed is set anywhere in the notebook, so the
# synthesized values presumably differ between runs — confirm whether
# FakerGenerator exposes a seed if reproducible output is needed.
faker_generator = FakerGenerator(recognizer.dataset, recognizer.dict_global_entities)

In [19]:
# Run the synthesis. The printed log gives one status line per column,
# stating whether that column was synthesized or not.
faker_generator.get_faker_generation()

Column [1;32maddress[0m synthesized.
Column [1;32mphone1[0m synthesized.
Column [1;32mphone2[0m synthesized.
Column [1;32memail[0m synthesized.
Column [1;32mfirst_name[0m synthesized.
Column [1;32mlast_name[0m synthesized.
Column [1;32mcity[0m synthesized.
Column [1;32mstate[0m synthesized.
Column [1;32mweb[0m synthesized.
Column [1;32mzip[0m synthesized.
Column [1;31mcompany_name[0m not synthesized.
Column [1;31mcounty[0m not synthesized.


In [20]:
# Display the generator's dataset after synthesis; the output also contains
# an additional 'first_name_gender' column.
# NOTE(review): this displays the whole frame (truncated in the output);
# .head(10) would be consistent with the previews used when loading the data.
faker_generator.dataset

Unnamed: 0,first_name,last_name,company_name,address,city,county,state,zip,phone1,phone2,email,web,first_name_gender
382,Stephanie,Daniel,Students In Free Entrprs Natl,69250 Brian Bypass,Jonesview,Butte,MA,51439,919.460.2319x437,317.435.4357,patriciamarshall@yahoo.com,http://www.shannon.com/,female
19,Jenna,Torres,Post Box Services Plus,09689 Richard Curve Suite 963,Steeleland,Winnebago,NV,13765,870.143.8155,001-095-988-4535x824,katherinestevens@hotmail.com,http://mason.com/,female
369,Jose,Carey,Ken Jeter Store Equipment Inc,156 Smith Cove Apt. 437,Kyleside,Maricopa,MO,34012,8689333961,183-998-1005x59934,cooperjason@hotmail.com,https://www.butler.info/,male
225,Matthew,Kelley,Roland Ashcroft,6055 Myers Trail,Thompsonstad,Los Angeles,CA,64841,919.772.9750x403,+1-168-933-2804,susan28@gmail.com,https://hood.biz/,male
343,Alexandria,Flores,"Linhares, Kenneth A Esq",3586 Alexander Green Suite 112,East Jacobmouth,Fairfax,IA,39536,731.311.0927,(024)715-9926,bfields@hotmail.com,https://ferguson-rivera.com/,female
...,...,...,...,...,...,...,...,...,...,...,...,...,...
222,Larry,Nguyen,Goulds Pumps Inc Slurry Pump,71044 Robertson Fork,East Josephville,Worcester,AL,12180,(384)126-3820,014.882.4471x155,cherylproctor@yahoo.com,http://anderson.net/,unknown
158,Brittany,Kelly,Oh My Goodknits Inc,966 Hughes Ranch Apt. 723,Toddchester,Nassau,AL,90860,970-892-4699,880.060.6015,cheryl46@gmail.com,http://www.gregory.com/,female
202,Kaitlyn,Simpson,Art Crafters,026 Stephanie Cliffs,Mallorybury,Miami-Dade,VT,97565,(322)957-7629,(666)196-9711x256,stricklandcharles@gmail.com,https://www.clark.com/,female
392,Stephanie,Smith,"Brooks, Morris J Jr",1116 Tina Village,South Drewville,Wake,OK,42610,001-552-594-8569x10445,001-422-977-3062x52394,eric44@yahoo.com,https://durham-snyder.net/,female


In [23]:
# Load the FOIA contacts table and preview its first 10 rows.
# NOTE(review): as with the first dataset, the preview shows an 'Unnamed: 0'
# column, which suggests a saved index in the CSV — confirm whether
# index_col=0 is wanted here.
full_foia_contacts = pd.read_csv('dataset/full-foia-contacts.csv')
full_foia_contacts.head(10)

Unnamed: 0,Agency,Department,Name,Title,Room Number,Street Address,City,State,Zip Code,Telephone,Fax,Email Address,Website,Online Request Form,Notes
0,Agricultural Marketing Service,Department of Agriculture,Gregory Bridges,FOIA Officer,"AG Stop 0202, Room 3521-S","1400 Independence Avenue, SW",Washington,DC,20250-0273,(202) 720-2498,(202) 690-3767,AMS.FOIA@USDA.gov,http://www.ams.usda.gov/about-ams/foia,,
1,Agricultural Marketing Service,Department of Agriculture,,FOIA Requester Service Center,,,,,,(202) 720-2498,,,,,
2,Agricultural Marketing Service,Department of Agriculture,William Allen,FOIA Public Liaison,"AG Stop 0202, Room 3521-S","1400 Independence Avenue, SW",Washington,DC,20250-0273,(202) 720-2498,,AMS.FOIA@USDA.gov,,,
3,Animal & Plant Health Inspection,Department of Agriculture,Tonya Woods,FOIA Director,Unit 50,4700 River Road,Riverdale,MD,20737-1232,(301) 851-4102,(301) 734-5941,mailto:tonya.g.woods@aphis.usda.gov,http://www.aphis.usda.gov/wps/portal/aphis/res...,http://www.aphis.usda.gov/wps/portal/aphis/res...,
4,Animal & Plant Health Inspection,Department of Agriculture,,FOIA Requester Service Center,,,,,,(301) 851-4102,(301) 734-5941,mailto:foia.officer@aphis.usda.gov,,,* Please mail requests to FOIA.Officer@aphis.u...
5,Animal & Plant Health Inspection,Department of Agriculture,Vacant,FOIA Public Liaison,,,,,,(301) 851-4102,(301) 734-5941,mailto:foia.officer@aphis.usda.gov,,,
6,Departmental Management (OSEC/OCIO/ FOIA Servi...,Department of Agriculture,Alexis R. Graves,Department FOIA Officer,Room 428-W,"1400 Independence Avenue, SW",Washington,DC,20250-0706,(202) 690-3318,(202) 690-0068,mailto:usdafoia@ocio.usda.gov,http://www.dm.usda.gov/foia.htm,,
7,Departmental Management (OSEC/OCIO/ FOIA Servi...,Department of Agriculture,Marqui Barnes,FOIA Requester Service Center,Room 428-W,1400,Washington,DC,20250-0706,(202)694-1802,,,,,
8,Departmental Management (OSEC/OCIO/ FOIA Servi...,Department of Agriculture,Ravoyne Payton,FOIA Public Liaison,Room 428-W,1400,Washington,DC,20250-0706,(202)690-0048,(202) 205-3755,usdafoia@ocio.usda.gov,,,
9,Farm Service Agency,Department of Agriculture,Kent Politsch,FOIA Officer,Stop 0506,"1400 Independence Avenue, SW",Washington,DC,20250,(202) 720-7163,(202) 720-2979,mailto:kent.politsch@wdc.usda.gov,http://www.fsa.usda.gov/FSA/webapp?area=newsro...,http://www.fsa.usda.gov/FSA/eFOIARequest?area=...,This office has additional FOIA contact inform...


In this dataset, the `Name` column holds the full name, so we need to split it into `first_name` and `last_name`. To do so, we use the function `split_name()`.

In [24]:
# Apply split_name to the 'Name' column; the entity dict and synthesis log
# further down list 'first_name' and 'last_name' columns for this dataset.
# NOTE(review): the result is bound back to the same variable name, so this
# cell may not be safe to run twice — presumably 'Name' is consumed by the
# split; verify against split_name's implementation.
full_foia_contacts = split_name(full_foia_contacts, 'Name')

In [26]:
# Build a new recognizer for the FOIA contacts; this rebinds `recognizer`,
# replacing the one created for the personal-information dataset.
recognizer = NamedEntityRecognizer(full_foia_contacts)

In [27]:
# Run the same three entity-assignment passes on the FOIA contacts
# recognizer; the result is read from recognizer.dict_global_entities.
recognizer.assign_entities_with_presidio()
recognizer.assign_entities_manually()
recognizer.assign_organization_entity_with_model()

In [28]:
# Inspect the per-column result for the FOIA contacts. In the output some
# columns ('Street Address', 'State') map to None instead of an
# entity/confidence dict, and several ORGANIZATION scores are low.
recognizer.dict_global_entities

{'Agency': {'entity': 'ORGANIZATION', 'confidence_score': 0.19930374238468232},
 'Department': {'entity': 'ORGANIZATION',
  'confidence_score': 0.286697247706422},
 'Title': {'entity': 'ORGANIZATION', 'confidence_score': 0.20316622691292877},
 'Room Number': {'entity': 'ORGANIZATION',
  'confidence_score': 0.27058823529411763},
 'Street Address': None,
 'City': {'entity': 'LOCATION', 'confidence_score': 0.9451219512195121},
 'State': None,
 'Zip Code': {'entity': 'ZIPCODE', 'confidence_score': 1.0},
 'Telephone': {'entity': 'PHONE_NUMBER',
  'confidence_score': 0.9877800407331976},
 'Fax': {'entity': 'PHONE_NUMBER', 'confidence_score': 0.9833333333333333},
 'Email Address': {'entity': 'EMAIL_ADDRESS',
  'confidence_score': 0.9897959183673469},
 'Website': {'entity': 'URL', 'confidence_score': 1.0},
 'Online Request Form': {'entity': 'ORGANIZATION', 'confidence_score': 0.125},
 'Notes': {'entity': 'ORGANIZATION', 'confidence_score': 0.5154639175257731},
 'first_name': {'entity': 'PERSON

In [30]:
# Build a new generator from the FOIA recognizer's dataset and its
# column -> entity mapping; this rebinds `faker_generator`.
faker_generator = FakerGenerator(recognizer.dataset, recognizer.dict_global_entities)

In [31]:
# Run the synthesis for the FOIA contacts. The printed log gives one status
# line per column, stating whether that column was synthesized or not.
faker_generator.get_faker_generation()

Column [1;32mTelephone[0m synthesized.
Column [1;32mFax[0m synthesized.
Column [1;32mEmail Address[0m synthesized.
Column [1;32mfirst_name[0m synthesized.
Column [1;32mlast_name[0m synthesized.
Column [1;32mCity[0m synthesized.
Column [1;32mWebsite[0m synthesized.
Column [1;32mZip Code[0m synthesized.
Column [1;31mAgency[0m not synthesized.
Column [1;31mDepartment[0m not synthesized.
Column [1;31mTitle[0m not synthesized.
Column [1;31mRoom Number[0m not synthesized.
Column [1;31mOnline Request Form[0m not synthesized.
Column [1;31mNotes[0m not synthesized.
