# Sentinel Privacy Shield â€” PII Detection + Redaction Pipeline

Industrial notebook: implement a safe PII detection + redaction pipeline on text logs.

Includes:
- regex-based detectors (email, phone, IP, IBAN-like, credit-card-like)
- scoring + explainability (which detectors triggered)
- configurable redaction policy
- evaluation on synthetic dataset

Outputs are saved inside the notebook when executed.

In [1]:
import re
import numpy as np
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support

SEED = 1337
rng = np.random.default_rng(SEED)
pd.set_option('display.max_columns', 80)

## 1) Detectors

In [2]:
RE_EMAIL = re.compile(r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b")
RE_IP = re.compile(r"\b(?:\d{1,3}\.){3}\d{1,3}\b")
RE_PHONE = re.compile(r"\b(?:\+?\d{1,3}[\s-]?)?(?:\(?\d{2,4}\)?[\s-]?)?\d{3,4}[\s-]?\d{3,4}\b")
RE_IBAN = re.compile(r"\b[A-Z]{2}\d{2}[A-Z0-9]{11,30}\b")
RE_CC = re.compile(r"\b(?:\d[ -]*?){13,19}\b")

def luhn_ok(num: str) -> bool:
    digits = [int(c) for c in num if c.isdigit()]
    if len(digits) < 13 or len(digits) > 19: return False
    s = 0
    alt = False
    for d in reversed(digits):
        if alt:
            d = d * 2
            if d > 9: d -= 9
        s += d
        alt = not alt
    return (s % 10) == 0

def detect(text: str):
    hits = []
    for name, rx in [('email', RE_EMAIL), ('ip', RE_IP), ('phone', RE_PHONE), ('iban', RE_IBAN)]:
        for m in rx.finditer(text):
            hits.append((name, m.start(), m.end(), m.group(0)))
    # credit card candidates: filter with luhn
    for m in RE_CC.finditer(text):
        raw = m.group(0)
        if luhn_ok(raw):
            hits.append(('cc', m.start(), m.end(), raw))
    return sorted(hits, key=lambda x: (x[1], x[2]))

detect('contact me at alice@example.com or +33 6 12 34 56 78, ip 8.8.8.8')[:5]

[('email', 14, 31, 'alice@example.com'), ('ip', 57, 64, '8.8.8.8')]

## 2) Redaction

In [3]:
REPLACEMENTS = {
  'email': '<REDACTED:EMAIL>',
  'ip': '<REDACTED:IP>',
  'phone': '<REDACTED:PHONE>',
  'iban': '<REDACTED:IBAN>',
  'cc': '<REDACTED:CC>',
}

def redact(text: str):
    hits = detect(text)
    if not hits: return text, []
    out = []
    cur = 0
    for kind, s, e, val in hits:
        if s < cur:  # overlapping hit, skip
            continue
        out.append(text[cur:s])
        out.append(REPLACEMENTS.get(kind, '<REDACTED>'))
        cur = e
    out.append(text[cur:])
    return ''.join(out), hits

redact('Pay with card 4539 1488 0343 6467 and email bob@corp.io, ip 1.2.3.4')

('Pay with card <REDACTED:PHONE> 6467 and email <REDACTED:EMAIL>, ip <REDACTED:IP>',
 [('phone', 14, 28, '4539 1488 0343'),
  ('cc', 14, 33, '4539 1488 0343 6467'),
  ('email', 44, 55, 'bob@corp.io'),
  ('ip', 60, 67, '1.2.3.4')])

## 3) Synthetic evaluation

In [4]:
def mk_email(i):
    return f'user{i}@example.com'
def mk_ip(i):
    return f'10.0.{i%255}.{(i*7)%255}'
def mk_phone(i):
    return f'+33 6 {i%90:02d} {i%80:02d} {i%70:02d} {i%60:02d}'

rows=[]
for i in range(2000):
    base = f'INFO request_id={i} action=login status=ok'
    y = 0
    if rng.random() < 0.25:
        base += ' email=' + mk_email(i)
        y = 1
    if rng.random() < 0.15:
        base += ' ip=' + mk_ip(i)
        y = 1
    if rng.random() < 0.10:
        base += ' phone=' + mk_phone(i)
        y = 1
    rows.append((base, y))

df = pd.DataFrame(rows, columns=['text','has_pii'])
pred = np.array([1 if detect(t) else 0 for t in df['text']])
p,r,f1,_ = precision_recall_fscore_support(df['has_pii'].values, pred, average='binary', zero_division=0)
p,r,f1

(1.0, 0.8533333333333334, 0.920863309352518)

In [5]:
sample = df.sample(5, random_state=SEED)
out=[]
for t in sample['text']:
    red, hits = redact(t)
    out.append({'original': t, 'redacted': red, 'hits': hits})
pd.DataFrame(out)

Unnamed: 0,original,redacted,hits
0,INFO request_id=967 action=login status=ok pho...,INFO request_id=967 action=login status=ok pho...,[]
1,INFO request_id=628 action=login status=ok,INFO request_id=628 action=login status=ok,[]
2,INFO request_id=577 action=login status=ok,INFO request_id=577 action=login status=ok,[]
3,INFO request_id=89 action=login status=ok,INFO request_id=89 action=login status=ok,[]
4,INFO request_id=1171 action=login status=ok,INFO request_id=1171 action=login status=ok,[]
