#### 输出模型相关属性

In [4]:
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline

model_name_or_path = "/home/elvin/NAS-Disk-1/program/models/NER/RoBERTa-ext-large-crf-chinese-finetuned-ner-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
model = AutoModelForTokenClassification.from_pretrained(model_name_or_path, return_dict=True, max_length=None)
ner_pipe = pipeline("token-classification", model=model, tokenizer=tokenizer, device=0, aggregation_strategy="simple")
res = ner_pipe("明略科技智能营销解决方案系列课程中，来自明略科技集团秒针系统全域营销测量事业部（OMI）产品经理周露露进行了主题为《挖掘自有媒体用户资产，优化营销策略》的分享。")
res


Device set to use cuda:0
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[{'entity_group': 'company', 'score': np.float32(0.9997826), 'word': '明 略 科 技', 'start': 0, 'end': 4}, {'entity_group': 'company', 'score': np.float32(0.99398845), 'word': '明 略 科 技 集 团 秒 针 系 统 全 域 营 销 测 量 事 业 部', 'start': 20, 'end': 39}, {'entity_group': 'company', 'score': np.float32(0.76838565), 'word': 'omi', 'start': 40, 'end': 43}, {'entity_group': 'position', 'score': np.float32(0.9998225), 'word': '产 品 经 理', 'start': 44, 'end': 48}, {'entity_group': 'name', 'score': np.float32(0.9999423), 'word': '周 露 露', 'start': 48, 'end': 51}]


In [18]:
from presidio_analyzer import AnalyzerEngine
from presidio_analyzer.nlp_engine import NerModelConfiguration, TransformersNlpEngine

model_config = [
    {"lang_code": "zh",
     "model_name": {
         "spacy": "zh_core_web_lg", # for tokenization, lemmatization
         # "transformers": "/home/elvin/NAS-Disk-1/program/models/NER/deid_roberta_i2b2" # for NER
         "transformers": "/home/elvin/NAS-Disk-1/program/models/NER/RoBERTa-ext-large-crf-chinese-finetuned-ner-v2"
        }
    }
]

mapping = dict(
    position="LOCATION",
    name="PERSON",
    movie="TITLE",
    organization="ORGANIZATION",
    company="ORGANIZATION",
    book="TITLE",
    address="LOCATION",
    scene="LOCATION",
    mobile="PHONE_NUMBER",
    email="EMAIL",
    game="TITLE",
    government="ORGANIZATION",
    QQ="ID",
    vx="ID",
)

labels_to_ignore = ["O"]

ner_model_configuration = NerModelConfiguration(
    model_to_presidio_entity_mapping=mapping,
    alignment_mode="expand", # "strict", "contract", "expand"
    aggregation_strategy="max", # "simple", "first", "average", "max"
    labels_to_ignore = labels_to_ignore)

transformers_nlp_engine = TransformersNlpEngine(
    models=model_config,
    ner_model_configuration=ner_model_configuration)

# Transformer-based analyzer
analyzer = AnalyzerEngine(
    nlp_engine=transformers_nlp_engine,
    supported_languages=["zh"]
)

Device set to use cpu


中文 anonymize 测试

In [19]:
from presidio_anonymizer import AnonymizerEngine, DeanonymizeEngine
engine = AnonymizerEngine()
analyzer_test = "我是万国安，我的手机号是18023239232"
results_chinese = analyzer.analyze(text=analyzer_test, language="zh")
result = engine.anonymize(
    text=analyzer_test, analyzer_results=results_chinese
)
print(result.text)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


我是<PERSON>，我的手机号是<PHONE_NUMBER>


加密操作

#### Faker test

In [13]:
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import OperatorConfig, EngineResult, RecognizerResult
from faker import Faker


fake = Faker('zh-CN')

# Create faker function (note that it has to receive a value)
def fake_name(x):
    return fake.name()

def fake_phone(x):
    return fake.phone_number()

# Create custom operator for the PERSON entity
operators = {"PERSON": OperatorConfig("custom", {"lambda": fake_name})}

# Analyzer output
analyzer_results = [RecognizerResult(entity_type="PERSON", start=11, end=18, score=0.8)]

text_to_anonymize = "My name is Raphael and I like to fish."

anonymizer = AnonymizerEngine()

anonymized_results = anonymizer.anonymize(
    text=text_to_anonymize, analyzer_results=analyzer_results, operators=operators
)

print(anonymized_results.text)

My name is 谢丹丹 and I like to fish.


In [None]:
from presidio_anonymizer import AnonymizerEngine, DeanonymizeEngine
from presidio_anonymizer.entities import (
    RecognizerResult,
    OperatorResult,
    OperatorConfig,
)
from faker import Faker

fake = Faker('zh-CN')

def fake_name(x):
    return fake.name()

def fake_phone(x):
    return fake.phone_number()

In [7]:
crypto_key = "WmZq4t7w!z%C&F)J"

In [24]:
engine = AnonymizerEngine()

# Invoke the anonymize function with the text,
# analyzer results (potentially coming from presidio-analyzer)
# and an 'encrypt' operator to get an encrypted anonymization output:
anonymize_result = engine.anonymize(
    text=analyzer_test,
    analyzer_results=results_chinese,
    operators={
        # "PERSON": OperatorConfig("custom", {"lambda": fake_name}),
        # "PHONE_NUMBER": OperatorConfig("custom", {"lambda": fake_phone}),
        "PERSON": OperatorConfig("encrypt", {"key": crypto_key}),
        "PHONE_NUMBER": OperatorConfig("encrypt", {"key": crypto_key}),
    },
)

print(anonymize_result.text)

我是j9ZLZWSpvMIZHG4rdtHK3i9oGhk+03QU5nzscLO3tDo=，我的手机号是Ts+MF5yV2PamJY8EOzBKn728AvdMMyGfz25lwqwtDbg=


In [21]:
# Fetch the anonymized text from the result.
anonymized_text = anonymize_result.text


# Fetch the anonynized entities from the result.
anonymized_entities = anonymize_result.items
print(anonymized_entities)

[{'start': 12, 'end': 23, 'entity_type': 'PHONE_NUMBER', 'text': '14551179183', 'operator': 'custom'}, {'start': 2, 'end': 5, 'entity_type': 'PERSON', 'text': '翁桂芝', 'operator': 'custom'}]


In [11]:
# Initialize the engine:
engine = DeanonymizeEngine()
# anonymized_text = "您的名字是 FQK5g598ofccQyB7qBhrZ1fr1mLEndh9sBeP8Wm66us=。"
# Invoke the deanonymize function with the text, anonymizer results
# and a 'decrypt' operator to get the original text as output.
deanonymized_result = engine.deanonymize(
    text=anonymized_text,
    entities=anonymized_entities,
    operators={"PERSON": OperatorConfig("custom", {"lambda": fake_name}),
               "PHONE_NUMBER": OperatorConfig("decrypt", {"key": crypto_key}),
               },
)

deanonymized_result

InvalidParamError: Invalid operator class 'custom'.