In [1]:
import os

from presidio_analyzer import AnalyzerEngine
from presidio_analyzer.nlp_engine import NerModelConfiguration, TransformersNlpEngine

# Location of the Chinese NER transformers model. Set the PRESIDIO_ZH_NER_MODEL
# environment variable to point at your own copy; when it is unset or empty the
# original author's local path is used so existing setups keep working.
# (A previously tried alternative, kept for reference:
#  /home/elvin/NAS-Disk-1/program/models/NER/deid_roberta_i2b2)
NER_MODEL_PATH = (
    os.environ.get("PRESIDIO_ZH_NER_MODEL")
    or "/home/elvin/NAS-Disk-1/program/models/NER/RoBERTa-ext-large-crf-chinese-finetuned-ner-v2"
)

# One entry per language: spaCy handles tokenization/lemmatization,
# the transformers model handles NER.
model_config = [
    {
        "lang_code": "zh",
        "model_name": {
            "spacy": "zh_core_web_sm",
            "transformers": NER_MODEL_PATH,
        },
    }
]

# Translation table from the NER model's label names (keys) to the entity
# names reported through Presidio (values). Several model labels collapse
# onto the same Presidio entity.
mapping = {
    "position": "LOCATION",
    "name": "PERSON",
    "movie": "TITLE",
    "organization": "ORGANIZATION",
    "company": "ORGANIZATION",
    "book": "TITLE",
    "address": "LOCATION",
    "scene": "LOCATION",
    "mobile": "PHONE_NUMBER",
    "email": "EMAIL",
    "game": "TITLE",
    "government": "ORGANIZATION",
    "QQ": "ID",
    "vx": "ID",
}

# Model labels that should be dropped rather than mapped
# ("O" is presumably the model's non-entity tag; confirm against the model card).
labels_to_ignore = ["O"]

# NER settings handed to the transformers NLP engine.
ner_model_configuration = NerModelConfiguration(
    labels_to_ignore=labels_to_ignore,
    model_to_presidio_entity_mapping=mapping,
    # Other values noted by the original author: "simple", "first", "average".
    aggregation_strategy="max",
    # Other values noted by the original author: "strict", "contract".
    alignment_mode="expand",
)

# NLP engine built from the per-language model list and the NER settings above.
transformers_nlp_engine = TransformersNlpEngine(
    ner_model_configuration=ner_model_configuration,
    models=model_config,
)

# Analyzer backed by the transformers engine; only Chinese ("zh") is registered.
analyzer = AnalyzerEngine(
    supported_languages=["zh"],
    nlp_engine=transformers_nlp_engine,
)

  from .autonotebook import tqdm as notebook_tqdm
Device set to use cpu


中文 anonymize 测试

In [2]:
from presidio_anonymizer import AnonymizerEngine, DeanonymizeEngine
# Sample sentence containing a person name and a mobile number.
analyzer_test = "我的名字是万国安，手机电话是18023333333"

# Detect entities with the Chinese analyzer, then anonymize them.
# No operators are passed, so the anonymizer's default behaviour applies.
engine = AnonymizerEngine()
results_chinese = analyzer.analyze(language="zh", text=analyzer_test)
result = engine.anonymize(analyzer_results=results_chinese, text=analyzer_test)

print(result.text)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


我的名字是<PERSON>，手机电话是<PHONE_NUMBER>


加密操作

In [3]:
from presidio_anonymizer import AnonymizerEngine, DeanonymizeEngine
from presidio_anonymizer.entities import (
    RecognizerResult,
    OperatorResult,
    OperatorConfig,
)

In [4]:
# Key shared by the 'encrypt' and 'decrypt' operators below.
# Supply a real key through the PRESIDIO_CRYPTO_KEY environment variable; the
# literal fallback is a DEMO key only and must not protect real data.
crypto_key = os.environ.get("PRESIDIO_CRYPTO_KEY") or "WmZq4t7w!z%C&F)J"

# Fail early with a clear message: per the Presidio docs the key must be
# 128, 192 or 256 bits long (16, 24 or 32 bytes once UTF-8 encoded).
if len(crypto_key.encode("utf-8")) not in (16, 24, 32):
    raise ValueError(
        "crypto_key must be 16, 24 or 32 bytes long when UTF-8 encoded"
    )

In [13]:
# Fresh anonymizer for the encryption demo.
engine = AnonymizerEngine()

# Anonymize the sample text using the analyzer findings, encrypting every
# PERSON and PHONE_NUMBER span with the shared key. Each entity type gets
# its own OperatorConfig instance.
anonymize_result = engine.anonymize(
    text=analyzer_test,
    analyzer_results=results_chinese,
    operators={
        entity_type: OperatorConfig("encrypt", {"key": crypto_key})
        for entity_type in ("PERSON", "PHONE_NUMBER")
    },
)

# Bare expression so the notebook displays the result object.
anonymize_result

text: 我的名字是ItFStolVoqU7NnlMwDc7YdYtwKPsF6OxHshahz1Fhw0=，手机电话是m4c0PqH2dWoTvz0B9qAq+3zGwMduIYNnOTbzyiVWZb8=
items:
[
    {'start': 55, 'end': 99, 'entity_type': 'PHONE_NUMBER', 'text': 'm4c0PqH2dWoTvz0B9qAq+3zGwMduIYNnOTbzyiVWZb8=', 'operator': 'encrypt'},
    {'start': 5, 'end': 49, 'entity_type': 'PERSON', 'text': 'ItFStolVoqU7NnlMwDc7YdYtwKPsF6OxHshahz1Fhw0=', 'operator': 'encrypt'}
]

In [14]:
# Pull the anonymized text and the list of anonymized entities out of the
# result; both are needed later to reverse the encryption.
anonymized_text, anonymized_entities = anonymize_result.text, anonymize_result.items

print(anonymized_entities)

[{'start': 55, 'end': 99, 'entity_type': 'PHONE_NUMBER', 'text': 'm4c0PqH2dWoTvz0B9qAq+3zGwMduIYNnOTbzyiVWZb8=', 'operator': 'encrypt'}, {'start': 5, 'end': 49, 'entity_type': 'PERSON', 'text': 'ItFStolVoqU7NnlMwDc7YdYtwKPsF6OxHshahz1Fhw0=', 'operator': 'encrypt'}]


In [16]:
# From here on `engine` refers to the de-anonymization engine.
engine = DeanonymizeEngine()

# Reverse the encryption: feed back the anonymized text together with the
# entity spans recorded during anonymization, and decrypt PERSON and
# PHONE_NUMBER with the same key that encrypted them.
deanonymized_result = engine.deanonymize(
    text=anonymized_text,
    entities=anonymized_entities,
    operators={
        entity_type: OperatorConfig("decrypt", {"key": crypto_key})
        for entity_type in ("PERSON", "PHONE_NUMBER")
    },
)

# Bare expression so the notebook displays the restored text and items.
deanonymized_result

text: 我的名字是万国安，手机电话是18023333333
items:
[
    {'start': 14, 'end': 25, 'entity_type': 'PHONE_NUMBER', 'text': '18023333333', 'operator': 'decrypt'},
    {'start': 5, 'end': 8, 'entity_type': 'PERSON', 'text': '万国安', 'operator': 'decrypt'}
]