Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[text analytics] add domain_filter param #13451

Merged
merged 5 commits into from
Sep 1, 2020
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
OpinionSentiment,
RecognizePiiEntitiesResult,
PiiEntity,
PiiEntityDomainType,
)

__all__ = [
Expand Down Expand Up @@ -59,6 +60,7 @@
'OpinionSentiment',
'RecognizePiiEntitiesResult',
'PiiEntity',
'PiiEntityDomainType',
]

__version__ = VERSION
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
# Licensed under the MIT License.
# ------------------------------------
import re
from enum import Enum
from ._generated.models import (
LanguageInput,
MultiLanguageInput,
Expand Down Expand Up @@ -64,6 +65,10 @@ def get(self, key, default=None):
return self.__dict__[key]
return default

class PiiEntityDomainType(str, Enum):
    """The different domains of PII entities that users can filter by.

    Members also subclass ``str``, so each one compares equal to its raw
    string value (for example ``PiiEntityDomainType.PHI == "PHI"``).
    """

    # PHI stands for Protected Health Information.
    PHI = "PHI"
    # Spelled-out name for readers unfamiliar with the acronym. It shares the
    # value "PHI", so Enum treats it as an alias of the PHI member above and
    # both names resolve to the same object.
    PROTECTED_HEALTH_INFORMATION = "PHI"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we include a comment for PHI about what it stands for?

Copy link
Member

@maririos maririos Aug 31, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

According to the Framework Design Guidance:

"Do not use abbreviations or acronyms in identifiers. [...] It is more important for names to be readable than it is for them to be brief. It is equally important not to use abbreviations and acronyms that are not generally understood."

Because of that, we should rename PHI to ProtectedHealthInformation

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I feel like PHI fits better in this case though, since we're abbreviating PII

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Even between us we've asked multiple times what PHI is. Pii is more common, so I strongly recommend changing it.
For .NET I will use the complete name

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok, since .NET is using the full name, I'll make the change



class DetectedLanguage(DictMixin):
"""DetectedLanguage contains the predicted language found in text,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,10 @@ def recognize_pii_entities( # type: ignore
be used for scoring, e.g. "latest", "2019-10-01". If a model-version
is not specified, the API will default to the latest, non-preview version.
:keyword bool show_stats: If set to true, response will contain document level statistics.
:keyword domain_filter: Filters the response entities to ones only included in your specified domain.
iscai-msft marked this conversation as resolved.
Show resolved Hide resolved
I.e., if set to 'PHI', will only return entities in the Protected Health Information domain.
iscai-msft marked this conversation as resolved.
Show resolved Hide resolved
Currently only supports 'PHI'. See https://aka.ms/tanerpii for more information.
iscai-msft marked this conversation as resolved.
Show resolved Hide resolved
:paramtype domain_filter: str or ~azure.ai.textanalytics.PiiEntityDomainType
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need the same :versionadded: directive here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, they had it on v3.1-preview.1, just never supported it

:return: The combined list of :class:`~azure.ai.textanalytics.RecognizePiiEntitiesResult`
and :class:`~azure.ai.textanalytics.DocumentError` in the order the original documents
were passed in.
Expand All @@ -281,13 +285,15 @@ def recognize_pii_entities( # type: ignore
docs = _validate_input(documents, "language", language)
model_version = kwargs.pop("model_version", None)
show_stats = kwargs.pop("show_stats", False)
domain_filter = kwargs.pop("domain_filter", None)
if self._string_code_unit:
kwargs.update({"string_index_type": self._string_code_unit})
try:
return self._client.entities_recognition_pii(
documents=docs,
model_version=model_version,
show_stats=show_stats,
domain=domain_filter,
cls=kwargs.pop("cls", pii_entities_result),
**kwargs
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,10 @@ async def recognize_pii_entities( # type: ignore
be used for scoring, e.g. "latest", "2019-10-01". If a model-version
is not specified, the API will default to the latest, non-preview version.
:keyword bool show_stats: If set to true, response will contain document level statistics.
:keyword domain_filter: Filters the response entities to only those included in the specified domain.
I.e., if set to 'PHI', will only return entities in the Protected Health Information domain.
Currently only supports 'PHI'. See https://aka.ms/tanerpii for more information.
:paramtype domain_filter: str or ~azure.ai.textanalytics.PiiEntityDomainType
:return: The combined list of :class:`~azure.ai.textanalytics.RecognizePiiEntitiesResult`
and :class:`~azure.ai.textanalytics.DocumentError` in the order the original documents
were passed in.
Expand All @@ -283,13 +287,16 @@ async def recognize_pii_entities( # type: ignore
docs = _validate_input(documents, "language", language)
model_version = kwargs.pop("model_version", None)
show_stats = kwargs.pop("show_stats", False)
domain_filter = kwargs.pop("domain_filter", None)

if self._string_code_unit:
kwargs.update({"string_index_type": self._string_code_unit})
try:
return await self._client.entities_recognition_pii(
documents=docs,
model_version=model_version,
show_stats=show_stats,
domain=domain_filter,
cls=kwargs.pop("cls", pii_entities_result),
**kwargs
)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
interactions:
- request:
body: '{"documents": [{"id": "0", "text": "I work at Microsoft and my phone number
is 333-333-3333", "language": "en"}]}'
headers:
Accept:
- application/json, text/json
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
Content-Length:
- '113'
Content-Type:
- application/json
User-Agent:
- azsdk-python-ai-textanalytics/5.0.1 Python/3.8.5 (macOS-10.13.6-x86_64-i386-64bit)
method: POST
uri: https://westus2.api.cognitive.microsoft.com/text/analytics/v3.1-preview.1/entities/recognition/pii?showStats=false&domain=PHI&stringIndexType=UnicodeCodePoint
response:
body:
string: '{"documents":[{"id":"0","entities":[{"text":"333-333-3333","category":"Phone
Number","offset":43,"length":12,"confidenceScore":0.8}],"warnings":[]}],"errors":[],"modelVersion":"2020-07-01"}'
headers:
apim-request-id:
- c2319b95-6fd2-46c9-80e3-06c8f2701825
content-type:
- application/json; charset=utf-8
csp-billing-usage:
- CognitiveServices.TextAnalytics.BatchScoring=1
date:
- Mon, 31 Aug 2020 20:32:54 GMT
strict-transport-security:
- max-age=31536000; includeSubDomains; preload
transfer-encoding:
- chunked
x-content-type-options:
- nosniff
x-envoy-upstream-service-time:
- '79'
status:
code: 200
message: OK
version: 1
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
interactions:
- request:
body: '{"documents": [{"id": "0", "text": "I work at Microsoft and my phone number
is 333-333-3333", "language": "en"}]}'
headers:
Accept:
- application/json, text/json
Content-Length:
- '113'
Content-Type:
- application/json
User-Agent:
- azsdk-python-ai-textanalytics/5.0.1 Python/3.8.5 (macOS-10.13.6-x86_64-i386-64bit)
method: POST
uri: https://westus2.api.cognitive.microsoft.com/text/analytics/v3.1-preview.1/entities/recognition/pii?showStats=false&domain=PHI&stringIndexType=UnicodeCodePoint
response:
body:
string: '{"documents":[{"id":"0","entities":[{"text":"333-333-3333","category":"Phone
Number","offset":43,"length":12,"confidenceScore":0.8}],"warnings":[]}],"errors":[],"modelVersion":"2020-07-01"}'
headers:
apim-request-id: 9265752d-3262-4dbb-94d6-be26889e3db9
content-type: application/json; charset=utf-8
csp-billing-usage: CognitiveServices.TextAnalytics.BatchScoring=1
date: Mon, 31 Aug 2020 20:32:55 GMT
strict-transport-security: max-age=31536000; includeSubDomains; preload
transfer-encoding: chunked
x-content-type-options: nosniff
x-envoy-upstream-service-time: '82'
status:
code: 200
message: OK
url: https://westus2.api.cognitive.microsoft.com//text/analytics/v3.1-preview.1/entities/recognition/pii?showStats=false&domain=PHI&stringIndexType=UnicodeCodePoint
version: 1
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
TextDocumentInput,
VERSION,
TextAnalyticsApiVersion,
PiiEntityDomainType,
)

# pre-apply the client_cls positional argument so it needn't be explicitly passed below
Expand Down Expand Up @@ -573,4 +574,17 @@ def test_recognize_pii_entities_v3(self, client):
with pytest.raises(NotImplementedError) as excinfo:
client.recognize_pii_entities(["this should fail"])

assert "'recognize_pii_entities' endpoint is only available for API version v3.1-preview.1 and up" in str(excinfo.value)
assert "'recognize_pii_entities' endpoint is only available for API version v3.1-preview.1 and up" in str(excinfo.value)

@GlobalTextAnalyticsAccountPreparer()
@TextAnalyticsClientPreparer()
def test_phi_domain_filter(self, client):
    """Check that passing the PHI domain filter yields only the phone number entity."""
    # Without the domain filter, this input should return two entities
    # (Microsoft as an org, and the phone number); with the filter, only one.
    documents = ["I work at Microsoft and my phone number is 333-333-3333"]
    response = client.recognize_pii_entities(
        documents,
        domain_filter=PiiEntityDomainType.PHI,
    )
    entities = response[0].entities
    self.assertEqual(len(entities), 1)
    phone_entity = entities[0]
    self.assertEqual(phone_entity.text, '333-333-3333')
    self.assertEqual(phone_entity.category, 'Phone Number')
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
TextDocumentInput,
VERSION,
TextAnalyticsApiVersion,
PiiEntityDomainType,
)

# pre-apply the client_cls positional argument so it needn't be explicitly passed below
Expand Down Expand Up @@ -572,3 +573,16 @@ async def test_recognize_pii_entities_v3(self, client):
await client.recognize_pii_entities(["this should fail"])

assert "'recognize_pii_entities' endpoint is only available for API version v3.1-preview.1 and up" in str(excinfo.value)

@GlobalTextAnalyticsAccountPreparer()
@TextAnalyticsClientPreparer()
async def test_phi_domain_filter(self, client):
    """Check that passing the PHI domain filter yields only the phone number entity (async client)."""
    # Without the domain filter, this input should return two entities
    # (Microsoft as an org, and the phone number); with the filter, only one.
    documents = ["I work at Microsoft and my phone number is 333-333-3333"]
    response = await client.recognize_pii_entities(
        documents,
        domain_filter=PiiEntityDomainType.PHI,
    )
    entities = response[0].entities
    self.assertEqual(len(entities), 1)
    phone_entity = entities[0]
    self.assertEqual(phone_entity.text, '333-333-3333')
    self.assertEqual(phone_entity.category, 'Phone Number')