首先需要加载 transformers 模型。英文模型选择了 obi/deid_roberta_i2b2，该模型采用了 i2b2 医疗实体数据集。由于这是官方示例所使用的模型，下面直接给出 configuration.py 文件中定义的转换字典，从中可以看到模型包含的实体类型：
~~~
# configuration_dict
"MODEL_TO_PRESIDIO_MAPPING": {
        "PER": "PERSON",
        "LOC": "LOCATION",
        "ORG": "ORGANIZATION",
        "AGE": "AGE",
        "ID": "ID",
        "EMAIL": "EMAIL",
        "PATIENT": "PERSON",
        "STAFF": "PERSON",
        "HOSP": "ORGANIZATION",
        "PATORG": "ORGANIZATION",
        "DATE": "DATE_TIME",
        "PHONE": "PHONE_NUMBER",
    }
~~~
中文模型选择了 gyr66/RoBERTa-ext-large-chinese-finetuned-ner，该模型采用了中文数据集 [privacy_detection](https://www.datafountain.cn/competitions/472)，准确率可以达到 0.9629。（注意：下文中文测试的代码实际加载的本地目录名为 RoBERTa-ext-large-crf-chinese-finetuned-ner-v2。）

In [2]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Local directory holding the English NER checkpoint (deid_roberta_i2b2).
transformers_model = "/home/elvin/NAS-Disk-1/program/models/NER/deid_roberta_i2b2"
# Smoke test: load the tokenizer and the token-classification model from the
# local directory. The returned objects are not bound to names; in the
# notebook the repr of the last expression (the model) is shown as cell output.
AutoTokenizer.from_pretrained(transformers_model)
AutoModelForTokenClassification.from_pretrained(transformers_model)

RobertaForTokenClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)


加载配置：这里不读取配置文件，而是直接在代码中定义模型配置（model_config）和实体映射（mapping）

In [10]:
# Model configuration handed to the NLP engine: a list with one entry per
# language. The spaCy pipeline is used for tokenization and lemmatization,
# the transformers checkpoint (a local directory) is used for NER.
model_config = [
    {
        "lang_code": "en",
        "model_name": {
            "spacy": "en_core_web_sm",
            "transformers": "/home/elvin/NAS-Disk-1/program/models/NER/deid_roberta_i2b2",
        },
    },
]

# Translation table: entity label emitted by the NER model -> Presidio entity name.
mapping = {
    "PER": "PERSON",
    "LOC": "LOCATION",
    "ORG": "ORGANIZATION",
    "AGE": "AGE",
    "ID": "ID",
    "EMAIL": "EMAIL",
    "DATE": "DATE_TIME",
    "PHONE": "PHONE_NUMBER",
    "PERSON": "PERSON",
    "LOCATION": "LOCATION",
    "GPE": "LOCATION",
    "ORGANIZATION": "ORGANIZATION",
    "NORP": "NRP",
    "PATIENT": "PERSON",
    "STAFF": "PERSON",
    "HOSP": "LOCATION",
    "PATORG": "ORGANIZATION",
    "TIME": "DATE_TIME",
    "HCW": "PERSON",
    "HOSPITAL": "LOCATION",
    "FACILITY": "LOCATION",
    "VENDOR": "ORGANIZATION",
}

# Model labels that are passed to the NER configuration as labels to ignore.
labels_to_ignore = ["O"]


通过 Presidio 加载 transformers 库的模型：

In [12]:
from presidio_analyzer import AnalyzerEngine
from presidio_analyzer.nlp_engine import NerModelConfiguration, TransformersNlpEngine
# NER settings shared by the engine: label translation table, labels to skip,
# and how sub-token predictions are aggregated / aligned to spans.
ner_model_configuration = NerModelConfiguration(
    labels_to_ignore=labels_to_ignore,
    model_to_presidio_entity_mapping=mapping,
    aggregation_strategy="max",  # other options: "simple", "first", "average"
    alignment_mode="expand",  # other options: "strict", "contract"
)

# NLP engine built from the model configuration and the NER settings above.
transformers_nlp_engine = TransformersNlpEngine(
    ner_model_configuration=ner_model_configuration,
    models=model_config,
)

# Analyzer backed by the transformers NLP engine, restricted to English.
analyzer = AnalyzerEngine(
    supported_languages=["en"],
    nlp_engine=transformers_nlp_engine,
)
# Last expression of the cell: the notebook shows the engine's language list.
transformers_nlp_engine.get_supported_languages()

Device set to use cpu


['en']

接口调用：

In [8]:
# Run the English analyzer on a short sample sentence and print what it found.
results_english = analyzer.analyze(
    text="My name is Morris",
    language="en",
)
print(results_english)

[type: PERSON, start: 11, end: 17, score: 0.9946103096008301]




### 中文测试


In [1]:
# Model configuration for the Chinese test: a list with one entry per
# language. The spaCy pipeline is used for tokenization and lemmatization,
# the transformers checkpoint (a local directory) is used for NER.
model_config = [
    {
        "lang_code": "zh",
        "model_name": {
            "spacy": "zh_core_web_sm",
            "transformers": "/home/elvin/NAS-Disk-1/program/models/NER/RoBERTa-ext-large-crf-chinese-finetuned-ner-v2",
        },
    },
]

# Translation table: entity label emitted by the Chinese NER model -> Presidio
# entity name. It replaces the English label mapping used earlier in the notebook.
mapping = {
    "position": "LOCATION",
    "name": "PERSON",
    "movie": "TITLE",
    "organization": "ORGANIZATION",
    "company": "ORGANIZATION",
    "book": "TITLE",
    "address": "LOCATION",
    "scene": "LOCATION",
    "mobile": "PHONE_NUMBER",
    "email": "EMAIL",
    "game": "TITLE",
    "government": "ORGANIZATION",
    "QQ": "ID",
    "vx": "ID",
}

# Model labels that are passed to the NER configuration as labels to ignore.
labels_to_ignore = ["O"]


In [2]:
from presidio_analyzer import AnalyzerEngine
from presidio_analyzer.nlp_engine import NerModelConfiguration, TransformersNlpEngine
# NER settings shared by the engine: label translation table, labels to skip,
# and how sub-token predictions are aggregated / aligned to spans.
ner_model_configuration = NerModelConfiguration(
    labels_to_ignore=labels_to_ignore,
    model_to_presidio_entity_mapping=mapping,
    aggregation_strategy="max",  # other options: "simple", "first", "average"
    alignment_mode="expand",  # other options: "strict", "contract"
)

# NLP engine built from the model configuration and the NER settings above.
transformers_nlp_engine = TransformersNlpEngine(
    ner_model_configuration=ner_model_configuration,
    models=model_config,
)

# Analyzer backed by the transformers NLP engine, restricted to Chinese.
analyzer = AnalyzerEngine(
    supported_languages=["zh"],
    nlp_engine=transformers_nlp_engine,
)
# Last expression of the cell: the notebook shows the engine's language list.
transformers_nlp_engine.get_supported_languages()

  from .autonotebook import tqdm as notebook_tqdm
Device set to use cpu


['zh']

In [3]:
# Run the Chinese analyzer on a sample sentence containing a name and a
# mobile number, then print what it found.
results_chinese = analyzer.analyze(
    text="我的名字是万国安，手机电话是18023333333",
    language="zh",
)
print(results_chinese)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[type: PERSON, start: 5, end: 8, score: 0.9885568618774414, type: PHONE_NUMBER, start: 14, end: 25, score: 0.4]
