Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file modified .DS_Store
Binary file not shown.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,5 @@ build/
node_modules
datafog_debug.log
sotu_2023.txt
/examples/*
/examples/*
.DS_Store
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

setup(
name="datafog",
version="2.2.0",
version="2.3.0",
author="Sid Mohan",
author_email="sid@datafog.ai",
description="Scan, redact, and manage PII in your documents before they get uploaded to a Retrieval Augmented Generation (RAG) system.",
Expand Down
667 changes: 0 additions & 667 deletions sotu_2023.txt

This file was deleted.

2 changes: 1 addition & 1 deletion src/datafog/__about__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
# Single source of truth (SSOT) for the package version
__version__ = "2.2.0"
__version__ = "2.3.0"
3 changes: 3 additions & 0 deletions src/datafog/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# datafog-python/src/datafog/__init__.py
import json
import logging

import pandas as pd
import requests
Expand All @@ -8,6 +9,8 @@
from .__about__ import __version__
from .pii_tools import PresidioEngine

# Logger.setLevel() returns None, so chaining it onto getLogger() would bind
# None to `logger`. Bind the Logger first, then configure its level.
logger = logging.getLogger(__name__)
logger.setLevel(logging.ERROR)

__all__ = [
"__version__",
"PresidioEngine",
Expand Down
71 changes: 70 additions & 1 deletion src/datafog/pii_tools/PresidioEngine/__init__.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,69 @@
from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
import logging
from typing import List, Optional

from presidio_analyzer import (
AnalyzerEngine,
Pattern,
PatternRecognizer,
RecognizerRegistry,
)
from presidio_analyzer.nlp_engine import NlpEngineProvider

from .analyzer import CustomSpacyRecognizer

# Logger.setLevel() returns None, so chaining it onto getLogger() would bind
# None to `logger`. Bind the Logger first, then configure its level.
logger = logging.getLogger("presidio-engine-init")
logger.setLevel(logging.ERROR)


# Helper functions for building ad-hoc recognizers
def create_ad_hoc_deny_list_recognizer(
    deny_list: Optional[List[str]] = None,
) -> Optional[PatternRecognizer]:
    """Build a ``PatternRecognizer`` for the ``CUSTOM_PII`` entity from a deny list.

    Args:
        deny_list: Terms handed to ``PatternRecognizer`` as its ``deny_list``.
            Defaults to ``None``.

    Returns:
        A ``PatternRecognizer`` with ``supported_entity="CUSTOM_PII"``, or
        ``None`` when ``deny_list`` is ``None`` or empty.
    """
    # The type belongs in the annotation, not the default value: using
    # ``Optional[List[str]]`` as the default made a no-argument call pass a
    # truthy typing object to PatternRecognizer.
    if not deny_list:
        return None

    return PatternRecognizer(supported_entity="CUSTOM_PII", deny_list=deny_list)


def create_ad_hoc_regex_recognizer(
    regex: str, entity_type: str, score: float, context: Optional[List[str]] = None
) -> Optional[PatternRecognizer]:
    """Build a ``PatternRecognizer`` for ``entity_type`` from a single regex.

    Args:
        regex: Regular expression wrapped in a ``Pattern`` named "Regex Pattern".
        entity_type: Passed to the recognizer as ``supported_entity``.
        score: Score assigned to the ``Pattern``.
        context: Optional value forwarded as the recognizer's ``context``.

    Returns:
        The configured ``PatternRecognizer``, or ``None`` when ``regex`` is
        falsy (``None`` or an empty string).
    """
    if regex:
        return PatternRecognizer(
            supported_entity=entity_type,
            patterns=[Pattern(name="Regex Pattern", regex=regex, score=score)],
            context=context,
        )
    # Without a regex there is nothing to build.
    return None


def analyzer_engine():
"""Return AnalyzerEngine."""

spacy_recognizer = CustomSpacyRecognizer()
configuration = {
"nlp_engine_name": "spacy",
"models": [{"lang_code": "en", "model_name": "en_spacy_pii_fast"}],
"ner_model_configuration": {
"model_to_presidio_entity_mapping": {
"PER": "PERSON",
"PERSON": "PERSON",
"NORP": "NRP",
"FAC": "FACILITY",
"LOC": "LOCATION",
"GPE": "LOCATION",
"LOCATION": "LOCATION",
"ORG": "ORGANIZATION",
"ORGANIZATION": "ORGANIZATION",
"DATE": "DATE_TIME",
"TIME": "DATE_TIME",
},
"low_confidence_score_multiplier": 0.4,
"low_score_entity_names": ["ORG", "ORGANIZATION"],
"labels_to_ignore": ["DATE_TIME"],
},
}

# Create NLP engine based on configuration
Expand Down Expand Up @@ -59,6 +111,23 @@ def scan(text, **kwargs):
kwargs.setdefault("language", "en")
kwargs.setdefault("score_threshold", 0.35)
kwargs.setdefault("nlp_artifacts", None)
kwargs.setdefault("entities", [])
kwargs.setdefault("allow_list", [])
kwargs.setdefault("deny_list", [])

"""Analyze input using Analyzer engine and input arguments (kwargs)."""
if "entities" not in kwargs or "All" in kwargs["entities"]:
kwargs["entities"] = None

if "deny_list" in kwargs and kwargs["deny_list"] is not None:
ad_hoc_recognizer = create_ad_hoc_deny_list_recognizer(kwargs["deny_list"])
kwargs["ad_hoc_recognizers"] = [ad_hoc_recognizer] if ad_hoc_recognizer else []
del kwargs["deny_list"]

if "regex_params" in kwargs and len(kwargs["regex_params"]) > 0:
ad_hoc_recognizer = create_ad_hoc_regex_recognizer(*kwargs["regex_params"])
kwargs["ad_hoc_recognizers"] = [ad_hoc_recognizer] if ad_hoc_recognizer else []
del kwargs["regex_params"]

# init analyzer instance
analyzer = analyzer_engine()
Expand Down
2 changes: 1 addition & 1 deletion src/datafog/pii_tools/PresidioEngine/analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from presidio_analyzer import AnalysisExplanation, LocalRecognizer, RecognizerResult

logger = logging.getLogger("presidio-module")
# Logger.setLevel() returns None, so chaining it onto getLogger() would bind
# None to `logger`. Bind the Logger first, then configure its level.
logger = logging.getLogger("custom-spacy-recognizer")
logger.setLevel(logging.ERROR)


class CustomSpacyRecognizer(LocalRecognizer):
Expand Down
Loading