# Bulk PII Analysis
This notebook covers bulk PII mapping and analysis, using Presidio as the primary analyzer.

> **Note:** Complete the initial setup steps before running this notebook. See the README for details.

In [1]:

from pii_codex.utils import pii_mapping_util

# Argument handed to the mapping loader (presumably the mapping version -- confirm
# against pii_mapping_util).
mapping_version = "v1"

# Load the PII type mapping through the CSV loader. Ending the cell on the bare
# name renders the result so the load can be confirmed by eye.
csv_file_dataframe = pii_mapping_util.open_pii_type_mapping_csv(mapping_version)
csv_file_dataframe

Unnamed: 0,Information_Type,PII_Type,Cluster_Membership_Type,NIST_Category,DHS_Category,HIPAA_Protected_Health_Information_Category,Risk_Level
0,Place of Birth,PLACE_OF_BIRTH,Basic Demographics,Linkable,Not Mentioned,NON_PHI,Semi-Identifiable
1,Race,RACE,Basic Demographics,Linkable,Linkable,PHI,Semi-Identifiable
2,Height,HEIGHT,Basic Demographics,Linkable,Not Mentioned,PHI,Semi-Identifiable
3,Marital Status,MARITAL_STATUS,Basic Demographics,Linkable,Not Mentioned,NON_PHI,Semi-Identifiable
4,Country of Citizenship,COUNTRY_OF_CITIZENSHIP,Basic Demographics,Linkable,Linkable,PHI,Semi-Identifiable
...,...,...,...,...,...,...,...
64,AU Company Number,AU_COMPANY_NUMBER,Secure Identifiers,Directly PII,Stand Alone PII,NON_PHI,Identifiable
65,AU Medical Account Number,AU_MEDICAL_ACCOUNT_NUMBER,Secure Identifiers,Directly PII,Stand Alone PII,PHI,Identifiable
66,AU Tax File Number,AU_TAX_FILE_NUMBER,Secure Identifiers,Directly PII,Stand Alone PII,NON_PHI,Identifiable
67,ES Tax Identification Number,ES_TAX_IDENTIFICATION_NUMBER,Secure Identifiers,Directly PII,Stand Alone PII,NON_PHI,Identifiable


In [2]:
# Load the same "v1" mapping through the JSON loader and display it, so the
# result can be compared by eye with the CSV-loaded frame.
load_json_mapping = pii_mapping_util.open_pii_type_mapping_json
json_file_dataframe = load_json_mapping("v1")
json_file_dataframe

Unnamed: 0,Information_Type,PII_Type,Cluster_Membership_Type,NIST_Category,DHS_Category,HIPAA_Protected_Health_Information_Category,Risk_Level
0,Place of Birth,PLACE_OF_BIRTH,Basic Demographics,Linkable,Not Mentioned,NON_PHI,Semi-Identifiable
1,Race,RACE,Basic Demographics,Linkable,Linkable,PHI,Semi-Identifiable
2,Height,HEIGHT,Basic Demographics,Linkable,Not Mentioned,PHI,Semi-Identifiable
3,Marital Status,MARITAL_STATUS,Basic Demographics,Linkable,Not Mentioned,NON_PHI,Semi-Identifiable
4,Country of Citizenship,COUNTRY_OF_CITIZENSHIP,Basic Demographics,Linkable,Linkable,PHI,Semi-Identifiable
...,...,...,...,...,...,...,...
64,AU Company Number,AU_COMPANY_NUMBER,Secure Identifiers,Directly PII,Stand Alone PII,NON_PHI,Identifiable
65,AU Medical Account Number,AU_MEDICAL_ACCOUNT_NUMBER,Secure Identifiers,Directly PII,Stand Alone PII,PHI,Identifiable
66,AU Tax File Number,AU_TAX_FILE_NUMBER,Secure Identifiers,Directly PII,Stand Alone PII,NON_PHI,Identifiable
67,ES Tax Identification Number,ES_TAX_IDENTIFICATION_NUMBER,Secure Identifiers,Directly PII,Stand Alone PII,NON_PHI,Identifiable


In [3]:

# Example lookup: keep only the rows whose Information_Type is "IP Address".
# The boolean mask is named separately so the filter condition is easy to read.
is_ip_address_row = json_file_dataframe["Information_Type"] == "IP Address"
ip_address = json_file_dataframe.loc[is_ip_address_row]
ip_address

Unnamed: 0,Information_Type,PII_Type,Cluster_Membership_Type,NIST_Category,DHS_Category,HIPAA_Protected_Health_Information_Category,Risk_Level
24,IP Address,IP_ADDRESS,Contact Information,Directly PII,Not Mentioned,PHI,Identifiable


In [5]:
# Read the PII_Type name from the filtered dataframe and look up the matching
# member of the common PIIType enum.
from pii_codex.models.common import PIIType

# .item() returns the single value in the column; pandas raises ValueError if
# the filter above matched zero rows or more than one row.
pii_type_name = ip_address.PII_Type.item()

# Enum lookup by member name.
pii_type = PIIType[pii_type_name]
print("Enum Type Name for IP Address: ", pii_type.name)
# Label corrected: this line prints the member's value, not its name.
print("Enum Type Value for IP Address: ", pii_type.value)

Enum Type Name for IP Address:  IP_ADDRESS
Enum Type Name for IP Address:  IP_ADDRESS


In [7]:
# Translate a common PIIType member into the Presidio-specific enum by looking
# it up under the same member name (see the models folder for supported types).
from pii_codex.models.microsoft_presidio_pii import MSFTPresidioPIIType

# The shared member name is the lookup key between the two enums.
email_member_name = PIIType.EMAIL_ADDRESS.name
presidio_pii_type = MSFTPresidioPIIType[email_member_name]

print("Enum Type Name for Email: ", presidio_pii_type.name)
print("Enum Type Value for Email: ", presidio_pii_type.value)

Enum Type Name for Email:  EMAIL_ADDRESS
Enum Type Value for Email:  EMAIL_ADDRESS
