# PIIAnalysisService - MSFT Presidio Analyzer Type with list of text to analyze
The following is an example of how to use the PIIAnalysisService with the MSFT presidio analyzer on a large set of PII.

>>Note: This notebook requires all dependencies to have been installed. For more information, review readme.


In [59]:
from pii_codex.models.common import AnalysisProviderType, AnalysisEncoder
from pii_codex.services.pii_analysis import PII_ANALYSIS_SERVICE
import pandas as pd

# Run the detection and assessments in one shot with the PIIAnalysisService
analysis_results = PII_ANALYSIS_SERVICE.run_batch_analysis_and_score(
    analysis_provider=AnalysisProviderType.PRESIDIO.name,
    texts=[
        "Here is my contact information: Phone number 555-555-5555 and my email is example123@email.com",
        "Perfect, mine number if you need me is 777-777=7777. Where is the residence and what is the earliest the crew can arrive?"
        "I'll be at my home at 123 Dark Data Lane, OH, 11111 after 7PM",
        "Cool, I'll be there!"
        ],
    language_code="en"
)

results = AnalysisEncoder().encode(analysis_results.batched_analyses)
batched_df = pd.DataFrame(analysis_results.to_dict()['batchedAnalyses'])

batched_df

Unnamed: 0,analyses,index,averageRiskScore
0,"[{'pii_type_detected': 'EMAIL_ADDRESS', 'risk_...",0,2.666667
1,"[{'pii_type_detected': 'LOCATION', 'risk_level...",1,2.0
2,[],2,1.0


In [60]:

from typing import List

batched_analyses = analysis_results.to_dict()['batchedAnalyses']
batches: List[dict] = []

for batch_index in range(0, len(batched_analyses)):

    batch = {}

    analyses_df = pd.DataFrame(batched_analyses[batch_index]['analyses'])
    if batched_analyses[batch_index]['analyses']:
        batch = {
            "average_risk_score": batched_analyses[batch_index]['averageRiskScore'],
            "pii_count": len(batched_analyses[batch_index]['analyses']),
            "pii_types_detected": analyses_df['pii_type_detected'].values.tolist(),
            "risk_level_definitions": analyses_df['risk_level_definition'].values.tolist(),
            "cluster_membership_types": analyses_df['cluster_membership_type'].values.tolist(),
            "hipaa_categories": analyses_df['hipaa_category'].values.tolist(),
            "dhs_categories": analyses_df['dhs_category'].values.tolist(),
            "nist_categories": analyses_df['nist_category'].values.tolist(),
        }
    else:
        batch = {
            "average_risk_score": batched_analyses[batch_index]['averageRiskScore'],
            "pii_count": len(batched_analyses[batch_index]['analyses']),
        }

    batches.append(batch)

split_view_df = pd.DataFrame(
    batches
)
split_view_df


Unnamed: 0,average_risk_score,pii_count,pii_types_detected,risk_level_definitions,cluster_membership_types,hipaa_categories,dhs_categories,nist_categories
0,2.666667,3,"[EMAIL_ADDRESS, PHONE_NUMBER, URL]","[Identifiable, Identifiable, Semi-Identifiable]","[Personal Preferences, Contact Information, Co...","[Protected Health Information, Protected Healt...","[Stand Alone PII, Stand Alone PII, Linkable]","[Directly PII, Directly PII, Linkable]"
1,2.0,1,[LOCATION],[Semi-Identifiable],[Secure Identifiers],[Protected Health Information],[Not Mentioned],[Linkable]
2,1.0,0,,,,,,
