In [None]:
# Install libraries (Colab)
!pip install --quiet pymupdf pdf2image pytesseract spacy Pillow regex

# (Optional) If you need OpenAI or other optional libs later:
# !pip install --quiet openai

# Imports
import fitz  # PyMuPDF
import pytesseract
from pdf2image import convert_from_path
import spacy
import regex as re
from PIL import Image
import io, os, json, math
from google.colab import files

print("Libraries loaded.")


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m
[?25hLibraries loaded.


In [None]:
# Download spaCy model (only if not present)
!python -m spacy download en_core_web_sm >/dev/null 2>&1
nlp = spacy.load("en_core_web_sm")
print("spaCy loaded:", nlp.meta['name'])


spaCy loaded: core_web_sm


In [None]:
# Select the PDF to process: either upload one through the Colab dialog or
# fall back to a sample file that already exists in the runtime.
print("Choose input method:\n1) Upload file\n2) Use sample file path from runtime")

choice = input("Enter 1 or 2 (default 2): ") or "2"

if choice.strip() == "1":
    uploaded = files.upload()
    # files.upload() returns a dict keyed by filename; it is empty when the
    # dialog is cancelled, which previously surfaced as a bare IndexError.
    if not uploaded:
        raise FileNotFoundError("No file was uploaded. Re-run this cell and select a PDF.")
    # Only the first uploaded file is processed.
    file_path = next(iter(uploaded))
else:
    # Use provided sample path(s) from the environment (if available)
    # You can replace this with either of your uploaded file paths:
    # '/mnt/data/Redact-Samples (2).pdf' or '/mnt/data/Take-home assignment (1).pdf'
    sample_path = "/mnt/data/Redact-Samples (2).pdf"
    alt_path = "/mnt/data/Take-home assignment (1).pdf"
    file_path = sample_path if os.path.exists(sample_path) else alt_path
    if not os.path.exists(file_path):
        raise FileNotFoundError("Sample files not found in runtime. Upload your PDF instead.")

print("Using file:", file_path)


Choose input method:
1) Upload file
2) Use sample file path from runtime
Enter 1 or 2 (default 2): 1


Saving Redact-Samples (2).pdf to Redact-Samples (2).pdf
Using file: Redact-Samples (2).pdf


In [None]:
def extract_text_pages(pdf_path, ocr_dpi=200):
    """Extract the text of every page of a PDF, using OCR for pages without a text layer.

    Args:
        pdf_path: Path of the PDF to read.
        ocr_dpi: Resolution used to rasterise a page before it is passed to
            Tesseract. Only used for pages that need the OCR fallback.

    Returns:
        A list with one string per page, in page order.
    """
    doc = fitz.open(pdf_path)
    pages_text = []
    try:
        for pno in range(len(doc)):
            page = doc.load_page(pno)
            text = page.get_text("text")
            # Fewer than 10 visible characters is treated as "no usable text
            # layer" (e.g. a scanned page), so the page is rendered and OCR'd.
            if not text or len(text.strip()) < 10:
                pix = page.get_pixmap(dpi=ocr_dpi)
                img = Image.open(io.BytesIO(pix.tobytes()))
                text = pytesseract.image_to_string(img)
            pages_text.append(text)
    finally:
        # Release the file handle even if rendering or OCR fails part-way.
        doc.close()
    return pages_text

# Extract text from the selected PDF and report how many pages were read.
pages = extract_text_pages(file_path)
print(f"Extracted pages: {len(pages)}")


Extracted pages: 2


In [None]:
# Regex patterns (tunable)
# Email: local part, "@", a dotted domain, and a final label of at least two letters.
# (IGNORECASE is redundant with the explicit a-zA-Z classes but harmless.)
EMAIL_PATTERN = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", flags=re.IGNORECASE)
# Phone: rough patterns for international/10-digit; tweak if necessary
# Structure: optional country code (1-3 digits), optional area code (2-4 digits,
# optionally parenthesised), then an UNBROKEN run of 6-12 digits. Numbers whose
# trailing part is split by separators (e.g. 555-123-4567) therefore do not match.
PHONE_PATTERN = re.compile(r"\b(?:\+?\d{1,3}[-.\s]?)?(?:\(?\d{2,4}\)?[-.\s]?)?\d{6,12}\b")
# Credit-card-like (simple)
# 13-19 digits, each optionally followed by spaces or hyphens; no checksum validation.
CC_PATTERN = re.compile(r"\b(?:\d[ -]*?){13,19}\b")

def find_ner_pii(text):
    """Run the spaCy pipeline on ``text`` and group entity strings by kind.

    Returns:
        A ``(persons, locations, orgs)`` tuple of lists. ``persons`` holds
        PERSON entities, ``locations`` holds GPE/LOC/FAC entities and ``orgs``
        holds ORG entities, each in document order.
    """
    location_labels = ("GPE", "LOC", "FAC")
    persons, locations, orgs = [], [], []
    # Single pass over the entities; other entity labels are ignored.
    for ent in nlp(text).ents:
        kind = ent.label_
        if kind == "PERSON":
            persons.append(ent.text)
        elif kind in location_labels:
            locations.append(ent.text)
        elif kind == "ORG":
            orgs.append(ent.text)
    return persons, locations, orgs

def detect_pii(text):
    """Detect PII candidates in ``text`` with regexes and spaCy NER.

    Args:
        text: The text of one page.

    Returns:
        A dict with the keys ``emails``, ``phones``, ``credit_cards``,
        ``names``, ``locations`` and ``orgs``. Each value is a list of
        whitespace-stripped, de-duplicated strings in first-seen order;
        candidates of one character or less are dropped.
    """
    emails = EMAIL_PATTERN.findall(text)
    phones = PHONE_PATTERN.findall(text)
    ccs = CC_PATTERN.findall(text)
    persons, locations, orgs = find_ner_pii(text)

    def clean_list(lst):
        # Strip BEFORE de-duplicating so that values differing only in
        # surrounding whitespace (e.g. a trailing newline on an entity)
        # collapse into a single entry.
        stripped = (s.strip() for s in lst)
        return list(dict.fromkeys(s for s in stripped if len(s) > 1))

    return {
        "emails": clean_list(emails),
        "phones": clean_list(phones),
        "credit_cards": clean_list(ccs),
        "names": clean_list(persons),
        "locations": clean_list(locations),
        "orgs": clean_list(orgs)
    }

# Smoke test: show the detections for the first page, or an empty dict when
# no pages were extracted.
# NOTE(review): this prints the raw detected values into the cell output;
# clear the output before sharing the notebook if the input is sensitive.
if pages:
    print(detect_pii(pages[0]))
else:
    print({})


{'emails': ['emily.johnson@gmail.com', 'starlight@example.com', 'info@bigpublishinghouse.com', 'support@techhelpdesk.com', 'contact@alienworlds.com', 'rajesh.kumar@company.com', 'financel0l@example.com'], 'phones': ['982734'], 'credit_cards': [], 'names': ['Library Card', 'Emily Johnson', 'Jane Goodall', 'Email Thread', 'Sarah Lee', 'Rajesh Kumar', 'Priya Nair', 'Floor'], 'locations': ['Pine Street', 'Springfield'], 'orgs': ['Universe', 'Borrower']}


In [None]:
def redact_text_token_mode(text, pii_map):
    """Replace every detected PII value in ``text`` with a placeholder token.

    Tokens have the form ``[LABEL_n]`` where ``LABEL`` is the upper-cased key
    of ``pii_map`` and ``n`` counts that label's values, numbered in order of
    decreasing value length.

    Args:
        text: The text to redact.
        pii_map: Dict mapping a label to a list of literal strings to redact.

    Returns:
        The redacted text. ``text`` is returned unchanged when there is
        nothing to redact.
    """
    items = [(label, v) for label, vals in pii_map.items() for v in vals if v]
    # Longest values first so a long value wins over a shorter one that is a
    # prefix of it at the same position (stable sort keeps detection order
    # among values of equal length).
    items.sort(key=lambda x: -len(x[1]))

    counters = {}
    tokens = {}
    for label, value in items:
        counters[label] = counters.get(label, 0) + 1
        # If the same string appears under several labels, the first one wins.
        tokens.setdefault(value, f"[{label.upper()}_{counters[label]}]")

    if not tokens:
        return text

    # One pass with a single alternation: inserted tokens are never rescanned,
    # so a later value (e.g. "ID") cannot match inside an earlier placeholder
    # (e.g. "[ID_1]") the way it could with one re.sub call per value.
    pattern = "|".join(re.escape(value) for value in tokens)
    return re.sub(pattern, lambda m: tokens[m.group(0)], text)


In [None]:
# Core function: given input pdf_path, a mapping of detections per page, black out the spans
def redact_pdf_with_black_boxes(input_pdf_path, output_pdf_path, page_pii_spans, expand_px=1):
    """Write a copy of a PDF in which the given spans are blacked out and removed.

    Painting a rectangle on top of text only hides it: the text stays in the
    page content and can still be copied or extracted. This function therefore
    uses redaction annotations with a black fill and applies them, which
    removes the covered content and leaves a black box in its place.

    Args:
        input_pdf_path: Path of the source PDF.
        output_pdf_path: Path the redacted PDF is written to.
        page_pii_spans: dict mapping page_index (0-based) -> list of dicts:
            { "bbox": (x0, y0, x1, y1), "label": "EMAIL", "text": "..." }
            Only "bbox" is read here. Page indexes outside the document are ignored.
        expand_px: Margin added on every side of each bbox before redacting.
    """
    doc = fitz.open(input_pdf_path)
    try:
        for pno in range(len(doc)):
            spans = page_pii_spans.get(pno)
            if not spans:
                continue
            page = doc.load_page(pno)
            marked = False
            for s in spans:
                x0, y0, x1, y1 = s["bbox"]
                rect = fitz.Rect(x0 - expand_px, y0 - expand_px, x1 + expand_px, y1 + expand_px)
                # Degenerate boxes cannot cover anything; skip them.
                if rect.is_empty or rect.is_infinite:
                    continue
                page.add_redact_annot(rect, fill=(0, 0, 0))
                marked = True
            if marked:
                # Removes the content under the annotations and paints the fill.
                page.apply_redactions()
        # garbage=4 drops objects that are no longer referenced after redaction.
        doc.save(output_pdf_path, garbage=4, deflate=True)
    finally:
        doc.close()
    print("Saved redacted PDF:", output_pdf_path)


In [None]:
def _ocr_spans_for_page(page, det, ocr_dpi):
    """OCR a rendered page and return spans, in PDF points, for words matching detected values."""
    pix = page.get_pixmap(dpi=ocr_dpi)
    if not pix.width or not pix.height:
        return []
    img = Image.open(io.BytesIO(pix.tobytes()))
    ocr_data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)

    # Tesseract reports boxes in pixels of the rendered image, while the
    # redaction step works in page coordinates (points). Derive the scale from
    # the actual pixmap size so rounding of the rendered dimensions is absorbed.
    # NOTE(review): rotated pages may need an extra transform - confirm.
    sx = page.rect.width / pix.width
    sy = page.rect.height / pix.height
    ox, oy = page.rect.x0, page.rect.y0

    words = []
    for i, raw in enumerate(ocr_data['text']):
        txt = raw.strip()
        if not txt:
            continue
        left, top = ocr_data['left'][i], ocr_data['top'][i]
        right = left + ocr_data['width'][i]
        bottom = top + ocr_data['height'][i]
        words.append((txt.lower(), (ox + left * sx, oy + top * sy, ox + right * sx, oy + bottom * sy)))

    spans = []
    for label, vals in det.items():
        for val in vals:
            needle = val.lower()
            for word, bbox in words:
                # Naive matching: an OCR word counts when it contains the value
                # or is itself a fragment of it. This over-matches short words.
                if needle in word or word in needle:
                    spans.append({"bbox": bbox, "label": label, "text": val})
    return spans


def _search_spans_for_page(page, det, pno):
    """Locate detected values on a page with a text layer via ``page.search_for``."""
    spans = []
    for label, vals in det.items():
        for val in vals:
            try:
                rects = page.search_for(val, hit_max=32)
                if not rects:
                    rects = page.search_for(val.lower(), hit_max=32)
            except Exception as e:
                # Best effort: keep going, but make the failure visible because
                # a failed search leaves this value unredacted. The value
                # itself is not printed, to avoid leaking it into the output.
                print(f"Warning: search failed on page {pno + 1} for a '{label}' value: {e}")
                continue
            for r in rects:
                spans.append({"bbox": (r.x0, r.y0, r.x1, r.y1), "label": label, "text": val})
    return spans


def compute_page_spans_from_pii(pdf_path, detections_per_page, ocr_dpi=200):
    """
    Returns page_pii_spans mapping for redact_pdf_with_black_boxes.

    Args:
        pdf_path: Path of the PDF the detections were produced from.
        detections_per_page: list of detection dictionaries (same order as pages list),
            each mapping a label to a list of detected strings.
        ocr_dpi: Resolution used to render pages that have no usable text layer.

    Returns:
        Dict mapping a 0-based page index to a list of
        ``{"bbox": (x0, y0, x1, y1), "label": ..., "text": ...}`` dicts, with
        bboxes in page coordinates. Pages without any span are omitted.
    """
    doc = fitz.open(pdf_path)
    page_spans = {}
    try:
        for pno, det in enumerate(detections_per_page):
            page = doc.load_page(pno)
            page_text = page.get_text("text")
            # Fewer than 10 visible characters is treated as a scanned page.
            if not page_text or len(page_text.strip()) < 10:
                spans = _ocr_spans_for_page(page, det, ocr_dpi)
            else:
                spans = _search_spans_for_page(page, det, pno)
            if spans:
                page_spans[pno] = spans
    finally:
        doc.close()
    return page_spans


In [None]:
# 1) detect PII per page
detections = [detect_pii(p) for p in pages]

# 2) create redacted text version (token replacement)
redacted_pages_text = [redact_text_token_mode(p, d) for p, d in zip(pages, detections)]

# 3) write redacted text file
with open("redacted_output.txt", "w", encoding="utf-8") as f:
    f.write("\n\n".join(redacted_pages_text))
print("Saved redacted text: redacted_output.txt")

# 4) compute bboxes for PDF redaction
page_spans = compute_page_spans_from_pii(file_path, detections)
print("Page spans found for black boxes (pages):", list(page_spans.keys()))

# 5) produce redacted pdf using black boxes
redacted_pdf_path = "redacted_output.pdf"
redact_pdf_with_black_boxes(file_path, redacted_pdf_path, page_spans, expand_px=1)

# 6) write redaction log
# NOTE: the log stores the raw detected values under "pii_detected", so it is
# as sensitive as the source document.
log = [
    {"page": i + 1, "pii_detected": det, "spans_count": len(page_spans.get(i, []))}
    for i, det in enumerate(detections)
]
# Use a context manager so the file is flushed and closed before it is downloaded.
with open("redaction_log.json", "w", encoding="utf-8") as log_file:
    json.dump(log, log_file, indent=2)
print("Saved redaction_log.json")

# 7) download outputs in Colab
files.download("redacted_output.txt")
files.download("redacted_output.pdf")
files.download("redaction_log.json")


Saved redacted text: redacted_output.txt
Page spans found for black boxes (pages): [0, 1]
Saved redacted PDF: redacted_output.pdf
Saved redaction_log.json


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>