# Data Classification & Retention

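Labels the columns of a customer dataset by sensitivity and purges records older than the retention period, appending an evidence entry to a log for each purge.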

**Objectives**
- Label columns in `data/customers.csv` (personal/internal/public)
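- Purge records whose `last_order_date` is older than N months (default 18)
- Save the column labels (`data/customers_labels.csv`), the dataset after the purge (`data/customers_after_purge.csv`), and the purge log (`logs/purge_log.txt`)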

In [7]:
from pathlib import Path
import pandas as pd, re, datetime
# Base directory for all inputs and outputs: the parent of the current working
# directory, resolved to an absolute path.
base_dir = Path('..').resolve()

In [8]:
# Working locations under base_dir: the data folder, the log folder and the
# source CSV that the later cells read.
data = base_dir.joinpath('data')
logs = base_dir.joinpath('logs')
src = data.joinpath('customers.csv')

# Create the log folder if it is missing; an existing folder is left as is.
logs.mkdir(exist_ok=True)

In [None]:
# Label -> case-insensitive pattern searched for anywhere in a column name.
# Insertion order matters: the first label whose pattern matches wins.
patterns = {
    'personal': re.compile(r'(name|email|phone|address|dni|curp)', re.I),
    'internal': re.compile(r'(cost|margin|salary|secret|key|token)', re.I),
}


def classify(col):
    """Return the sensitivity label for a column name.

    The name is converted to ``str`` and tested against each entry of
    ``patterns`` in order; the label of the first pattern found anywhere in
    the name is returned. Names matching no pattern are labelled 'public'.
    """
    name = str(col)
    matching = (tag for tag, regex in patterns.items() if regex.search(name))
    return next(matching, 'public')

# Load the source CSV and label every column from its header name.
df = pd.read_csv(src)
labels = dict(zip(df.columns, map(classify, df.columns)))

# Store the column -> classification mapping as a two-column CSV (no index).
labels_df = pd.DataFrame({
    'column': [*labels],
    'classification': [*labels.values()],
})
labels_df.to_csv(data / 'customers_labels.csv', index=False)
labels_df.head()

Unnamed: 0,column,classification
0,customer_id,public
1,full_name,personal
2,email,personal
3,phone,personal
4,address,personal


In [11]:
from datetime import datetime, timezone
RETENTION_MONTHS = 18

# Take a single timezone-aware "now" so the cutoff and the log timestamp agree.
now_utc = datetime.now(timezone.utc)
cutoff = pd.Timestamp(now_utc) - pd.DateOffset(months=RETENTION_MONTHS)

# Missing or unparseable dates are coerced to NaT. NaT compares False against
# the cutoff, so such rows are never selected for purging.
dates = pd.to_datetime(df['last_order_date'], errors='coerce', utc=True)
mask_old = dates < cutoff
purged = df[mask_old]
kept = df[~mask_old]

# Rows kept only because their date is missing or invalid. They are recorded in
# the log so the retention evidence does not silently hide them.
undated = int(dates.isna().sum())

kept.to_csv(data / 'customers_after_purge.csv', index=False)

# Append (never overwrite) one evidence line per run.
with (logs/'purge_log.txt').open('a') as f:
    f.write(f"{now_utc.strftime('%Y-%m-%dT%H:%M:%SZ')} Purged {len(purged)}/{len(df)} older than {RETENTION_MONTHS} months. Cutoff={cutoff.date()} Undated kept={undated}\n")

In [13]:
import hashlib
from datetime import datetime, timezone

# SHA-256 digest of the purge log's current contents.
lp = logs/'purge_log.txt'
sha = hashlib.sha256(lp.read_bytes()).hexdigest()

# The trailing 'Z' in the format denotes UTC, so the clock must be read in UTC;
# a naive datetime.now() would stamp local time with a UTC suffix.
ts = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
print(f'{ts}  purge_log.txt')
#print(f'{ts}  purge_log.txt  SHA256={sha}')

2025-09-11T22:10:34Z  purge_log.txt
