BerriAI · krrishdholakia · Feb 21, 2024 · Feb 20, 2024 · Feb 21, 2024 · Feb 21, 2024
diff --git a/docs/my-website/docs/proxy/pii_masking.md b/docs/my-website/docs/proxy/pii_masking.md
@@ -56,6 +56,39 @@ litellm_settings:
 
 4. User Response: "Hey Jane Doe, nice to meet you!"
 
+## Ad-hoc recognizers 
+
+Send ad-hoc recognizers to Presidio's `/analyze` endpoint by passing the path to a JSON file in the proxy config.
+
+[**Example** ad-hoc recognizer](../../../../litellm/proxy/hooks/example_presidio_ad_hoc_recognizer.json)
+
+```yaml
+litellm_settings: 
+  callbacks: ["presidio"]
+  presidio_ad_hoc_recognizers: "./hooks/example_presidio_ad_hoc_recognizer.json"
+```
+
+You can see this working when you run the proxy:
+
+```bash
+litellm --config /path/to/config.yaml --debug
+```
+
+Make a chat completions request, for example:
+
+```
+{
+  "model": "azure-gpt-3.5",
+  "messages": [{"role": "user", "content": "John Smith AHV number is 756.3026.0705.92. Zip code: 1334023"}]
+}
+```
+
+Then search the logs for any line starting with `Presidio PII Masking`, for example:
+```
+Presidio PII Masking: Redacted pii message: <PERSON> AHV number is <AHV_NUMBER>. Zip code: <US_DRIVER_LICENSE>
+```
+
+
 ## Turn on/off per key 
 
 Turn off PII masking for a given key. 

diff --git a/litellm/__init__.py b/litellm/__init__.py
@@ -55,9 +55,12 @@
 aleph_alpha_key: Optional[str] = None
 nlp_cloud_key: Optional[str] = None
 use_client: bool = False
+### GUARDRAILS ###
 llamaguard_model_name: Optional[str] = None
+presidio_ad_hoc_recognizers: Optional[str] = None
 google_moderation_confidence_threshold: Optional[float] = None
 llamaguard_unsafe_content_categories: Optional[str] = None
+##################
 logging: bool = True
 caching: bool = (
     False  # Not used anymore, will be removed in next MAJOR release - https://github.com/BerriAI/litellm/discussions/648

diff --git a/litellm/proxy/hooks/example_presidio_ad_hoc_recognizer.json b/litellm/proxy/hooks/example_presidio_ad_hoc_recognizer.json
@@ -0,0 +1,28 @@
+[
+    {
+        "name": "Zip code Recognizer",
+        "supported_language": "en",
+        "patterns": [
+            {
+                "name": "zip code (weak)",
+                "regex": "(\\b\\d{5}(?:\\-\\d{4})?\\b)",
+                "score": 0.01
+            }
+        ],
+        "context": ["zip", "code"],
+        "supported_entity": "ZIP"
+    },
+    {
+        "name": "Swiss AHV Number Recognizer",
+        "supported_language": "en",
+        "patterns": [
+            {
+                "name": "AHV number (strong)",
+                "regex": "(756\\.\\d{4}\\.\\d{4}\\.\\d{2})|(756\\d{10})",
+                "score": 0.95
+            }
+        ],
+        "context": ["AHV", "social security", "Swiss"],
+        "supported_entity": "AHV_NUMBER"
+    }
+]
diff --git a/litellm/proxy/hooks/presidio_pii_masking.py b/litellm/proxy/hooks/presidio_pii_masking.py
@@ -9,7 +9,7 @@
 
 
 from typing import Optional, Literal, Union
-import litellm, traceback, sys, uuid
+import litellm, traceback, sys, uuid, json
 from litellm.caching import DualCache
 from litellm.proxy._types import UserAPIKeyAuth
 from litellm.integrations.custom_logger import CustomLogger
@@ -27,6 +27,7 @@
 
 class _OPTIONAL_PresidioPIIMasking(CustomLogger):
     user_api_key_cache = None
+    ad_hoc_recognizers = None
 
     # Class variables or attributes
     def __init__(
@@ -40,6 +41,22 @@ def __init__(
         if mock_testing == True:  # for testing purposes only
             return
 
+        ad_hoc_recognizers = litellm.presidio_ad_hoc_recognizers
+        if ad_hoc_recognizers is not None:
+            try:
+                with open(ad_hoc_recognizers, "r") as file:
+                    self.ad_hoc_recognizers = json.load(file)
+            except FileNotFoundError:
+                raise Exception(f"File not found. file_path={ad_hoc_recognizers}")
+            except json.JSONDecodeError as e:
+                raise Exception(
+                    f"Error decoding JSON file: {str(e)}, file_path={ad_hoc_recognizers}"
+                )
+            except Exception as e:
+                raise Exception(
+                    f"An error occurred: {str(e)}, file_path={ad_hoc_recognizers}"
+                )
+
         self.presidio_analyzer_api_base = litellm.get_secret(
             "PRESIDIO_ANALYZER_API_BASE", None
         )
@@ -78,6 +95,8 @@ async def check_pii(self, text: str, output_parse_pii: bool) -> str:
                     analyze_url = f"{self.presidio_analyzer_api_base}analyze"
                     verbose_proxy_logger.debug(f"Making request to: {analyze_url}")
                     analyze_payload = {"text": text, "language": "en"}
+                    if self.ad_hoc_recognizers is not None:
+                        analyze_payload["ad_hoc_recognizers"] = self.ad_hoc_recognizers
                     redacted_text = None
                     async with session.post(
                         analyze_url, json=analyze_payload
@@ -216,7 +235,9 @@ async def async_pre_call_hook(
                         messages[index][
                             "content"
                         ] = r  # replace content with redacted string
-                verbose_proxy_logger.debug(f"Redacted pii message: {data['messages']}")
+                verbose_proxy_logger.info(
+                    f"Presidio PII Masking: Redacted pii message: {data['messages']}"
+                )
             return data
         except Exception as e:
             verbose_proxy_logger.info(f"An error occurred - {str(e)}")