In [None]:
!pip install presidio_analyzer presidio_anonymizer
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine

In [None]:
# Sample sentence containing a person name, a phone number, an e-mail address
# and a five-digit company ID; every later cell analyzes this same text.
text_to_anonymize = (
    "My name is John Doe, you can reach me at 555-123-4567 "
    "or via email at john.doe@example.com. "
    "My company ID is 12345"
)

In [None]:
# Build the analyzer engine with its default configuration.
analyzer = AnalyzerEngine()

# Scan the sample sentence, restricting detection to three entity types.
analyzer_results = analyzer.analyze(
    text=text_to_anonymize,
    entities=["PHONE_NUMBER", "PERSON", "EMAIL_ADDRESS"],
    language="en",
)

# Each result reports the entity type, its character span and a score.
print(analyzer_results)



[type: EMAIL_ADDRESS, start: 70, end: 90, score: 1.0, type: PERSON, start: 11, end: 19, score: 0.85, type: PHONE_NUMBER, start: 41, end: 53, score: 0.4]


In [None]:
# Show the exact substring behind every detection, followed by its type.
for detection in analyzer_results:
    matched_text = text_to_anonymize[detection.start:detection.end]
    print(matched_text, detection.entity_type)

john.doe@example.com EMAIL_ADDRESS
John Doe PERSON
555-123-4567 PHONE_NUMBER


In [None]:
from presidio_analyzer import Pattern, PatternRecognizer

# Custom entity: a company ID made of exactly five digits.
# - Raw string: "\d" in a normal string literal is an invalid escape sequence
#   and triggers a warning on recent Python versions.
# - The look-arounds reject digits on either side, so a five-digit window
#   inside a longer number is not reported as an ID.
id_pattern = Pattern(
    name="id_pattern",
    regex=r"(?<!\d)\d{5}(?!\d)",
    score=0.5,
)

# Recognizer that reports matches of the pattern above as entity type "ID"
# for English text.
id_recognizer = PatternRecognizer(
    supported_entity="ID",
    patterns=[id_pattern],
    supported_language="en",
)

# Register it on the existing engine so later analyze() calls can ask for "ID".
analyzer.registry.add_recognizer(id_recognizer)

In [None]:
# Re-run the analysis, now also requesting the custom "ID" entity.
analyzer_results = analyzer.analyze(
    text=text_to_anonymize,
    entities=["PHONE_NUMBER", "PERSON", "EMAIL_ADDRESS", "ID"],
    language="en",
)

print(analyzer_results)

[type: EMAIL_ADDRESS, start: 70, end: 90, score: 1.0, type: PERSON, start: 11, end: 19, score: 0.85, type: ID, start: 109, end: 114, score: 0.5, type: PHONE_NUMBER, start: 41, end: 53, score: 0.4]


In [None]:
# List each detected substring with its entity type, the custom ID included.
for detection in analyzer_results:
    matched_text = text_to_anonymize[detection.start:detection.end]
    print(matched_text, detection.entity_type)

john.doe@example.com EMAIL_ADDRESS
John Doe PERSON
12345 ID
555-123-4567 PHONE_NUMBER


In [None]:
# Engine that rewrites the text based on the spans found by the analyzer.
anonymizer = AnonymizerEngine()

# No operators are passed here, so the engine's default handling is used;
# in the output below each detected span appears as its <ENTITY_TYPE> tag.
anonymized_results = anonymizer.anonymize(text=text_to_anonymize, analyzer_results=analyzer_results)

print(f"text : {anonymized_results.text}")

text : My name is <PERSON>, you can reach me at <PHONE_NUMBER> or via email at <EMAIL_ADDRESS>. My company ID is <ID>


In [None]:
from presidio_anonymizer.entities import OperatorConfig

# Phone numbers: overwrite 12 characters with "*", counting from the end of
# the match (12 is the length of the detected span "555-123-4567").
phone_mask_config = OperatorConfig(
    "mask",
    {
        "type": "mask",
        "masking_char": "*",
        "chars_to_mask": 12,
        "from_end": True,
    },
)

# Every other entity type: substitute a fixed placeholder.
default_replace_config = OperatorConfig("replace", {"new_value": "<ANONYMIZED>"})

# Per-entity operator table; "DEFAULT" covers entity types not listed here.
operators = {
    "PHONE_NUMBER": phone_mask_config,
    "DEFAULT": default_replace_config,
}

custom_anonymized_results = anonymizer.anonymize(
    text=text_to_anonymize,
    analyzer_results=analyzer_results,
    operators=operators,
)

print(f"text : {custom_anonymized_results.text}")

text : My name is <ANONYMIZED>, you can reach me at ************ or via email at <ANONYMIZED>. My company ID is <ANONYMIZED>
