In [12]:
# Install (Colab). Remove --quiet if you want to see install logs
!pip install --quiet pymupdf pdf2image pytesseract spacy Pillow regex

# Imports
import fitz            # PyMuPDF
import pytesseract
from pdf2image import convert_from_path
import spacy
import regex as re
from PIL import Image
import io, os, json, math
from google.colab import files

print("Imports complete.")


Imports complete.


In [13]:
# Create requirements.txt
# One package name per line, each terminated by a newline (the same package
# set that the install cell passes to pip).
requirements = "".join(
    f"{package}\n"
    for package in ("pymupdf", "pdf2image", "pytesseract", "spacy", "Pillow", "regex")
)
with open("requirements.txt", "w") as req_file:
    req_file.write(requirements)
print("requirements.txt written.")


requirements.txt written.


In [14]:
# Download and load spaCy model
SPACY_MODEL = "en_core_web_sm"
try:
    # Fast path: the model is already installed, so re-running this cell
    # does not trigger another download.
    nlp = spacy.load(SPACY_MODEL)
except OSError:
    # spacy.load raises OSError when the model package cannot be found.
    # Download it from within this kernel's interpreter; unlike a shell call
    # redirected to /dev/null, a failed download is not hidden.
    from spacy.cli import download as spacy_download

    spacy_download(SPACY_MODEL)
    nlp = spacy.load(SPACY_MODEL)
print("spaCy model loaded:", nlp.meta["name"])


spaCy model loaded: core_web_sm


In [15]:
# Input file selection (upload or use sample)
# Sample PDFs tried in order when nothing is uploaded; the first one that
# exists on disk is used.
SAMPLE_PATHS = (
    "/mnt/data/Redact-Samples (2).pdf",
    "/mnt/data/Take-home assignment (1).pdf",
)

print("If you'd like to upload a file, enter 'y' when prompted. Otherwise press Enter to use sample.")

choice = input("Upload file? (y/N): ").strip().lower()
if choice == "y":
    uploaded = files.upload()
    if not uploaded:
        # An empty result (e.g. the upload dialog was dismissed) previously
        # crashed with an unexplained IndexError on the first-key lookup.
        raise FileNotFoundError("No file was uploaded. Re-run this cell and choose a file.")
    # Keys of the upload result are the saved file names; take the first one.
    file_path = next(iter(uploaded))
else:
    file_path = next((path for path in SAMPLE_PATHS if os.path.exists(path)), None)
    if file_path is None:
        raise FileNotFoundError("Sample files not found. Upload using the prompt.")
print("Input file:", file_path)


If you'd like to upload a file, enter 'y' when prompted. Otherwise press Enter to use sample.
Upload file? (y/N): y


Saving Redact-Samples (2).pdf to Redact-Samples (2) (1).pdf
Input file: Redact-Samples (2) (1).pdf


In [16]:
# Extract text from each PDF page (with OCR fallback)
def extract_text_pages(pdf_path, ocr_dpi=300, min_text_chars=10):
    """Return the text of every page of a PDF, one string per page.

    The embedded text layer is used when available. A page whose stripped
    text is empty or shorter than ``min_text_chars`` is rendered to an image
    and passed through Tesseract OCR instead.

    Args:
        pdf_path: Path of the PDF file to read.
        ocr_dpi: Resolution used when rendering a page for OCR.
        min_text_chars: Minimum number of non-whitespace-trimmed characters a
            page's text layer must contain to be trusted without OCR.

    Returns:
        list[str]: Page texts in document order.
    """
    pages_text = []
    # The context manager closes the document (and its file handle) even if
    # text extraction or OCR raises part-way through.
    with fitz.open(pdf_path) as doc:
        for pno in range(len(doc)):
            page = doc.load_page(pno)
            text = page.get_text("text")
            # If text is short/empty, fall back to OCR on a rendered image.
            if not text or len(text.strip()) < min_text_chars:
                pix = page.get_pixmap(dpi=ocr_dpi)
                # Round-trip through encoded image bytes so PIL can open it.
                img = Image.open(io.BytesIO(pix.tobytes()))
                text = pytesseract.image_to_string(img)
            pages_text.append(text)
    return pages_text

# Run extraction on the selected input file; `pages` holds one text string
# per PDF page and is the input to the detection/redaction cells.
pages = extract_text_pages(file_path)
print(f"Extracted {len(pages)} pages.")


Extracted 2 pages.


In [17]:
# Detect PII (regex + spaCy) with confidence scores
# Regex patterns (tunable heuristics). Every group is non-capturing so that
# findall() returns the full matched string rather than a sub-group.
EMAIL_PATTERN = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", flags=re.IGNORECASE)
# Phone numbers. Two alternatives:
#   1. Grouped form, e.g. "555-123-4567", "(555) 123-4567", "+1 (555) 123-4567":
#      optional country code, an area code (optionally parenthesised) and two
#      further digit groups, separated by "-", "." or whitespace. The
#      look-arounds stop it from matching a slice of a longer digit sequence
#      (such as a space-separated card number).
#   2. The original contiguous form: optional prefixes followed by a run of
#      6-12 digits. Kept unchanged so existing matches are still found.
# The previous pattern consisted of alternative 2 only and therefore never
# matched numbers whose digits are split by separators.
PHONE_PATTERN = re.compile(
    r"(?<![\w.])(?<!\d[-.\s])"            # not inside a word or right after "<digit><separator>"
    r"(?:\+?\d{1,3}[-.\s]?)?"             # optional country code
    r"(?:\(\d{2,4}\)|\d{2,4})[-.\s]?"     # area code, with or without parentheses
    r"\d{3,5}[-.\s]?\d{3,5}"              # subscriber number in two groups
    r"(?![-.\s]?\d)(?!\w)"                # must not continue into more digits / a word
    r"|"
    r"\b(?:\+?\d{1,3}[-.\s]?)?(?:\(?\d{2,4}\)?[-.\s]?)?\d{6,12}\b"
)
CC_PATTERN = re.compile(r"\b(?:\d[ -]*?){13,19}\b")

def find_ner_pii(text):
    """Run the spaCy pipeline on ``text`` and bucket its named entities.

    Returns:
        tuple[list[str], list[str], list[str]]: ``(persons, locations, orgs)``
        holding the entity texts labelled PERSON, GPE/LOC/FAC and ORG
        respectively, each in the order the entities appear.
    """
    location_labels = ("GPE", "LOC", "FAC")
    persons, locations, orgs = [], [], []
    # Single pass over the entities; every other label is ignored.
    for entity in nlp(text).ents:
        kind = entity.label_
        if kind == "PERSON":
            persons.append(entity.text)
        elif kind in location_labels:
            locations.append(entity.text)
        elif kind == "ORG":
            orgs.append(entity.text)
    return persons, locations, orgs

def detect_pii_with_confidence(text):
    """Detect PII in ``text`` and attach a heuristic confidence to each hit.

    Emails, phone numbers and card numbers come from the module-level regex
    patterns; names, locations and organisations come from ``find_ner_pii``.
    Values are stripped, values of length <= 1 are dropped, and duplicates are
    removed while keeping first-seen order.

    Returns:
        dict: ``label -> [{"value": str, "confidence": float}, ...]`` with the
        labels ``emails``, ``phones``, ``credit_cards``, ``names``,
        ``locations`` and ``orgs`` (in that order).
    """
    # Confidence is fixed per label, except for phones (see below).
    fixed_confidence = {
        "emails": 0.98,
        "credit_cards": 0.85,
        "names": 0.65,      # NER uncertain; acceptable default
        "locations": 0.65,
        "orgs": 0.65,
    }

    def dedupe(values):
        # dict.fromkeys keeps the first occurrence of each stripped value.
        stripped = (value.strip() for value in values)
        return list(dict.fromkeys(value for value in stripped if len(value) > 1))

    def confidence(label, value):
        if label == "phones":
            # More digits -> more likely to be a real phone number.
            digit_count = len(re.sub(r"\D", "", value))
            return 0.9 if digit_count >= 8 else 0.6
        return fixed_confidence.get(label, 0.5)

    # Regex detectors first, then NER.
    raw_hits = {
        "emails": EMAIL_PATTERN.findall(text),
        "phones": PHONE_PATTERN.findall(text),
        "credit_cards": CC_PATTERN.findall(text),
    }
    persons, locations, orgs = find_ner_pii(text)
    raw_hits.update(names=persons, locations=locations, orgs=orgs)

    return {
        label: [
            {"value": value, "confidence": confidence(label, value)}
            for value in dedupe(hits)
        ]
        for label, hits in raw_hits.items()
    }

# Quick test on page 0: pretty-print the detection result as JSON, truncated
# to its first 800 characters.
# NOTE: this writes the raw detected values into the notebook output — clear
# the output before sharing the notebook if the input contains real PII.
print(json.dumps(detect_pii_with_confidence(pages[0]), indent=2)[:800])


{
  "emails": [
    {
      "value": "emily.johnson@gmail.com",
      "confidence": 0.98
    },
    {
      "value": "starlight@example.com",
      "confidence": 0.98
    },
    {
      "value": "info@bigpublishinghouse.com",
      "confidence": 0.98
    },
    {
      "value": "support@techhelpdesk.com",
      "confidence": 0.98
    },
    {
      "value": "contact@alienworlds.com",
      "confidence": 0.98
    },
    {
      "value": "kumar@company.com",
      "confidence": 0.98
    },
    {
      "value": "financel01l@example.com",
      "confidence": 0.98
    }
  ],
  "phones": [
    {
      "value": "982734",
      "confidence": 0.6
    }
  ],
  "credit_cards": [],
  "names": [
    {
      "value": "Library Card",
      "confidence": 0.65
    },
    {
      "value": "Emily Johnson",
 


In [19]:
# Token-based redaction (text-mode)
def build_simple_pii_map_for_replacement(pii_entry):
    """Flatten a detection result into ``label -> list of value strings``.

    ``pii_entry`` maps each label to a list of dicts carrying a ``"value"``
    key; everything except the value (e.g. the confidence) is dropped.
    """
    return {
        label: [item["value"] for item in items]
        for label, items in pii_entry.items()
    }

def redact_text_token_mode(text, pii_entry):
    """Replace every detected PII value in ``text`` with a placeholder token.

    Tokens have the form ``[LABEL_n]`` where ``n`` counts the values of that
    label in replacement order. Values are processed longest first so that a
    short value cannot break up a longer one that contains it. Matching is
    exact and case-sensitive.

    The text is tracked as a list of segments, each flagged as either original
    text or an inserted token, and values are only searched for in original
    text. This prevents a short value (for example the location "IL") from
    rewriting part of a token inserted earlier (for example "[EMAILS_1]").

    Args:
        text: The page text to redact.
        pii_entry: Detection result mapping ``label`` to a list of dicts with
            a ``"value"`` key.

    Returns:
        str: ``text`` with all occurrences of the detected values replaced.
    """
    pii_map = build_simple_pii_map_for_replacement(pii_entry)
    # Empty values are skipped: they cannot be searched for meaningfully.
    items = [(label, val) for label, vals in pii_map.items() for val in vals if val]
    # Longest first; the sort is stable, so ties keep detection order.
    items.sort(key=lambda pair: -len(pair[1]))

    counters = {}
    segments = [(text, False)]  # (chunk, is_token)
    for label, val in items:
        counters[label] = counters.get(label, 0) + 1
        token = f"[{label.upper()}_{counters[label]}]"
        updated = []
        for chunk, is_token in segments:
            if is_token or val not in chunk:
                updated.append((chunk, is_token))
                continue
            # str.split finds the same non-overlapping, left-to-right
            # occurrences as substituting the escaped value would.
            for index, piece in enumerate(chunk.split(val)):
                if index:
                    updated.append((token, True))
                if piece:
                    updated.append((piece, False))
        segments = updated
    return "".join(chunk for chunk, _ in segments)

# Build redacted pages and detailed log
# redacted_pages[i] is the redacted text of page i+1; redaction_log gets one
# entry per page with the detections and a per-label count.
# Note: each log entry stores the detected values themselves ("pii_detected").
redacted_pages = []
redaction_log = []

for page_number, page_text in enumerate(pages, start=1):
    pii_entry = detect_pii_with_confidence(page_text)
    redacted_pages.append(redact_text_token_mode(page_text, pii_entry))
    label_counts = {label: len(found) for label, found in pii_entry.items()}
    redaction_log.append(
        {
            "page": page_number,
            "pii_detected": pii_entry,
            "counts": label_counts,
        }
    )

print("Token redaction completed for all pages.")


Token redaction completed for all pages.


In [20]:
# Save redacted text + JSON audit log
txt_out = "redacted_output.txt"
json_out = "redaction_log.json"

# Redacted text file: a separator line, the page text, then a blank line,
# repeated for every page (pages are numbered from 1).
page_chunks = [
    f"----- PAGE {page_number} -----\n{page_text}\n\n"
    for page_number, page_text in enumerate(redacted_pages, start=1)
]
with open(txt_out, "w", encoding="utf-8") as txt_file:
    txt_file.writelines(page_chunks)

# JSON audit log: the per-page entries collected during redaction.
with open(json_out, "w", encoding="utf-8") as log_file:
    json.dump(redaction_log, log_file, indent=2)

print("Saved:", txt_out, json_out)

# If in Colab, prompt downloads; on any failure just point at the saved files.
try:
    for out_path in (txt_out, json_out):
        files.download(out_path)
except Exception:
    print("If running locally, files are saved in notebook directory.")


Saved: redacted_output.txt redaction_log.json


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [21]:
# Summary of PII counts
# Add up the per-page counts from the log into one total per label.
summary = {}
for entry in redaction_log:
    for label, found in entry["counts"].items():
        summary.setdefault(label, 0)
        summary[label] += found

print("PII Summary across document:", summary)
print("Files generated:", txt_out, json_out)


PII Summary across document: {'emails': 9, 'phones': 1, 'credit_cards': 0, 'names': 9, 'locations': 3, 'orgs': 3}
Files generated: redacted_output.txt redaction_log.json
