In [None]:
import os
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from IPython.display import Markdown, display
from requests import post, put

def printmd(string):
    """Render ``string`` as Markdown in the notebook output area."""
    rendered = Markdown(string)
    display(rendered)

# Test our text against the running anonymization API

In [None]:
# Secret the Web App was deployed with; anonymize() sends it in the
# "Ocp-Apim-Subscription-Key" request header.
# NOTE: the "Create the data source" cell further down re-binds API_KEY to the
# Azure Search key, so re-run this cell before calling anonymize() again.
API_KEY = "[YourSecretKeyCanBeAnything]"

# Endpoint of the API when it runs on this machine. 0.0.0.0 is a bind-all listen
# address rather than a portable destination (connecting to it is rejected on
# some platforms, e.g. Windows), so the loopback address is used instead.
URL_LOCAL = "http://127.0.0.1:5000/api/extraction"

# Base URL of the deployed web app, format: https://[appname].azurewebsites.net
WEB_APP_URL = "https://[your_web_app].azurewebsites.net"

def anonymize(text, url=None):
    """POST ``text`` to the anonymization web API and return the decoded JSON reply.

    The text is wrapped in a single-record request envelope
    (``{"values": [{"recordId": "0", "data": {"text": text}}]}``) and sent with
    ``API_KEY`` in the ``Ocp-Apim-Subscription-Key`` header.

    Args:
        text: The text to anonymize.
        url: Endpoint to POST to. Defaults to ``WEB_APP_URL``; pass ``URL_LOCAL``
            to target a locally running API instead.

    Returns:
        The parsed JSON response, or ``None`` when the request or the JSON
        decoding fails (the error is printed).
    """
    try:
        # NOTE(review): the default is the bare WEB_APP_URL, whereas the skillset
        # cell targets f"{WEB_APP_URL}/api/extraction" - confirm which endpoint
        # is intended here.
        target_url = WEB_APP_URL if url is None else url

        headers = {
            # Request headers
            "Content-Type": "application/json",
            "Ocp-Apim-Subscription-Key": API_KEY,
        }

        body = {
            "values": [
                {
                    "recordId": "0",
                    "data": {
                        "text": text
                    }
                }
            ]
        }

        resp = post(url=target_url, json=body, headers=headers)
        return resp.json()

    except Exception as e:
        # Previously execution fell through to `return result_response`, a name
        # that is never assigned on this path, which raised UnboundLocalError.
        print('Exception', e)
        return None
        
# Read every test document from the local data directory and anonymize it
directory = r"../data/"
for filename in sorted(os.listdir(directory)):
    file_path = os.path.join(directory, filename)
    # os.listdir() also returns sub-directories, on which open() would fail
    if not os.path.isfile(file_path):
        continue

    # Explicit encoding so the result does not depend on the platform default
    with open(file_path, encoding="utf-8") as txt:
        text_to_anonymize = txt.read()

    # Undo HTML-escaped ampersands once; reused for display and for the request
    unescaped_text = text_to_anonymize.replace('&amp;', '&')

    printmd("***Full Text***")
    print(unescaped_text)

    # Double quotes are stripped from the text before it is sent to the API
    result_response = anonymize(unescaped_text.replace('"', ''))
    printmd("**Anonymized Text**")
    try:
        anonymized = result_response['values'][0]['data']
    except (KeyError, IndexError, TypeError):
        # No reply, or an error payload without the expected "values" envelope
        print("Unexpected response from the API:", result_response)
        continue
    print(''.join(anonymized))


# Deploy the PowerSkill to Azure Search

## Create the data source

In [None]:
# Let's create a data source
# NOTE: API_KEY is re-bound here to the Azure Cognitive Search (ACS) key; the web
# app secret defined in the earlier test cell is carried by KEY from now on.
API_KEY = ""   # Your ACS API Key
ACS_URL = "https://[your_search_instance].search.windows.net"   # Your ACS URL format https://[your ACS instance].search.windows.net
DATA_SOURCE = "presidio-analyze-and-anonymize-pii-datasource"  # The name for your data source
CONTAINER_NAME = "docs"
KEY = "[YourSecretKeyCanBeAnything]"  # Set the KEY value you deployed your Web App with [YourSecretKeyCanBeAnything]
WEB_APP_URL = "https://[your_web_app].azurewebsites.net"  # This is the deployed web app format: URL https://[appname].azurewebsites.net


json_text = {
    "name": DATA_SOURCE,
    "type": "azureblob",
    "credentials": {"connectionString": ""},    # This is your azure blob connection string
    "container": {"name": CONTAINER_NAME}  # The name of the container where the data files are
}

# Shared by every ACS REST call in the cells below
headers = {
    "api-key": API_KEY,
    "Content-Type": "application/json",
}

try:
    url = f"{ACS_URL}/datasources?api-version=2020-06-30"
    resp = post(url=url, json=json_text, headers=headers)

    # Keep the decoded body for inspection; error replies may be empty or not
    # JSON, and that must not hide the HTTP status reported below.
    try:
        result_response = resp.json()
    except ValueError:
        result_response = None

    if resp.status_code == 201:
        print("Success creating data source")
    elif resp.status_code == 403:
        print("Authorisation Failed: Check that your API KEY value is correct")
    else:
        # Any other status used to pass silently
        print("Error creating data source", resp.status_code, resp.text)

except Exception as e:
    print('Exception creating data source', e)

## Now we create the index

In [None]:
INDEX_NAME = "presidio-analyze-and-anonymize-pii-index"  # The name for the index

# Index schema: "id" is the document key and "content" is the only searchable field
json_text = {
    "name": INDEX_NAME,
    "fields": [
        {"name": "id", "type": "Edm.String", "key": True, "searchable": False},
        {"name": "file_name", "type": "Edm.String", "searchable": False},
        {"name": "size", "type": "Edm.Int64", "searchable": False},
        {"name": "last_modified", "type": "Edm.DateTimeOffset", "searchable": False},
        {"name": "content", "type": "Edm.String", "searchable": True, "filterable": False, "sortable": False, "facetable": False}
    ]
}

try:
    url = f"{ACS_URL}/indexes?api-version=2020-06-30"
    resp = post(url=url, json=json_text, headers=headers)

    # Keep the decoded body for inspection; error replies may be empty or not
    # JSON, and that must not hide the HTTP status reported below.
    try:
        result_response = resp.json()
    except ValueError:
        result_response = None

    if resp.status_code == 201:
        print("Success creating index")
    elif resp.status_code == 403:
        print("Authorisation Failed: Check that your API KEY value is correct")
    elif resp.status_code == 400:
        print("Error", resp.text)
    else:
        # Any other status used to pass silently
        print("Error creating index", resp.status_code, resp.text)

except Exception as e:
    print('Exception creating index', e)

## Now we create the skill set

In [None]:
# Note we are passing in the secret header key and the inference API URL to the skillset
SKILLSET_NAME = "presidio-analyze-and-anonymize-pii-skillset"  # The name of your skillset
COGSVC_KEY = ""  # Your Cognitive Services key (resource in the same region as ACS); attached to the skillset below

json_text = {
    "description": "Anonymize documents.",
    "skills": [
        {
            # NOTE(review): this @odata.type does not look like one of the built-in
            # skill types documented for Azure Cognitive Search, so the service may
            # reject the skillset with a 400 - confirm it, or remove this entry and
            # rely on the custom WebApiSkill below, which calls the deployed API.
            "@odata.type": "#Microsoft.Skills.Text.PresidioPIIAnonymizationSkill",
            "defaultLanguageCode": "en",
            "inputs": [
                {
                    "name": "text", "source": "/document/content"
                }
            ],
            "outputs": [
                {
                    "name": "data"
                }
            ]
        },
        {
            "@odata.type": "#Microsoft.Skills.Custom.WebApiSkill",
            "description": "A custom skill that anonymize the text",
            "uri": f"{WEB_APP_URL}/api/extraction",
            "timeout": "PT160S",
            "batchSize": 1,
            "context": "/document/content",
            "httpHeaders": {
                # Secret the web app was deployed with
                "Ocp-Apim-Subscription-Key": KEY
            },
            "httpMethod": "POST",
            "inputs": [
                {
                    "name": "text",
                    "source": "/document/content"
                }
            ],
            "outputs": [
                {
                    "name": "data"
                }
            ]
        }
    ],
    "cognitiveServices": {
        "@odata.type": "#Microsoft.Azure.Search.CognitiveServicesByKey",
        "description": "cogsvc",
        "key": COGSVC_KEY
    }
}


try:
    url = f"{ACS_URL}/skillsets/{SKILLSET_NAME}?api-version=2020-06-30"
    resp = put(url=url, json=json_text, headers=headers)

    # PUT answers 204 No Content when the skillset already exists, so the body can
    # be empty; decoding it unconditionally used to raise and made a successful
    # re-run of this cell look like a failure.
    try:
        result_response = resp.json()
    except ValueError:
        result_response = None

    if resp.status_code == 201:
        print("Success creating skillset")
    elif resp.status_code in (200, 204):
        print("Success updating skillset")
    elif resp.status_code == 403:
        print("Authorisation Failed: Check that your API KEY value is correct")
    elif resp.status_code == 400:
        print("Error", resp.text)
    else:
        # Any other status used to pass silently
        print("Error creating skillset", resp.status_code, resp.text)

except Exception as e:
    print('Exception creating skillset', e)


## Now we create the indexer

In [None]:

# The indexer reads DATA_SOURCE, runs SKILLSET_NAME over each document and
# writes the result into INDEX_NAME.
json_text = {
    "name": "presidio-analyze-and-anonymize-pii-indexer",
    "dataSourceName": DATA_SOURCE,
    "targetIndexName": INDEX_NAME,
    "skillsetName": SKILLSET_NAME,
    "parameters": {
        "configuration": {
            "allowSkillsetToReadFileData": True
        }
    },
    "outputFieldMappings": [
        {
            # The custom WebApiSkill runs with context "/document/content" and
            # names its output "data", so the skill output lives at this path.
            # The mapping previously pointed at the raw "/document/content" and
            # had no target, so the skill output never reached the index.
            # Assumes the skill returns a string, matching the Edm.String
            # "content" field - TODO confirm.
            "sourceFieldName": "/document/content/data",
            "targetFieldName": "content"
        }
    ]
}


try:
    url = f"{ACS_URL}/indexers?api-version=2020-06-30"
    resp = post(url=url, json=json_text, headers=headers)

    # Keep the decoded body for inspection; error replies may be empty or not
    # JSON, and that must not hide the HTTP status reported below.
    try:
        result_response = resp.json()
    except ValueError:
        result_response = None

    if resp.status_code == 201:
        print("Success creating indexer")
    elif resp.status_code == 403:
        print("Authorisation Failed: Check that your API KEY value is correct")
    elif resp.status_code == 400:
        print("Error", resp.text)
    else:
        # Any other status used to pass silently
        print("Error creating indexer", resp.status_code, resp.text)

except Exception as e:
    print('Exception creating indexer', e)

## Let's go and test the ACS index

In [None]:
# Query text sent to the index (left empty here)
searchterm = ""

# Build a search client for the index, authenticated with the ACS key
credential = AzureKeyCredential(API_KEY)
client = SearchClient(
    endpoint=ACS_URL,
    index_name=INDEX_NAME,
    credential=credential,
)

# Ask the service for at most five hits
results = client.search(search_text=searchterm, top=5)

# Print each returned document under a Markdown heading
for i, result in enumerate(results):
    print("Document", i)
    printmd("**Anonymized text**")
    print(result)