In [1]:
# %pip (rather than !pip) installs into the environment of the running kernel.
# TODO: pin versions (pkg==x.y.z) once the known-good set has been recorded.
%pip install -q pymupdf gradio pandas openpyxl xlsxwriter

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m81.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m175.3/175.3 kB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:

import re, json, os, tempfile
from pathlib import Path
import fitz
import gradio as gr
import pandas as pd

# ---------- Core encryptor ----------
class StudentIDEncryptor:
    """Swap numeric student IDs that follow ``For:`` in a PDF for letter codes.

    Every decimal digit of an ID is replaced by one letter (0→A, 1→B, … 9→J);
    all other characters are left alone. Each ID that gets rewritten is
    recorded in ``id_mappings`` (original → encrypted) so it can be restored
    later.

    Args:
        font_name: Font name handed to the redaction annotation for the
            replacement text.
        size_scale: Multiplier applied to the font size of the replaced span.
        min_font: Lower bound for the replacement font size.
    """

    def __init__(self, font_name="Helvetica-Bold", size_scale=1.35, min_font=12):
        self.number_to_letter = {'0':'A','1':'B','2':'C','3':'D','4':'E','5':'F','6':'G','7':'H','8':'I','9':'J'}
        self.letter_to_number = {v:k for k,v in self.number_to_letter.items()}
        # original ID -> encrypted ID, accumulated over every call on this instance
        self.id_mappings = {}
        self.font_name = font_name
        self.size_scale = float(size_scale)
        self.min_font = int(min_font)

    def encrypt_student_id(self, student_id: str) -> str:
        """Return ``student_id`` with each decimal digit replaced by its letter.

        Non-digit characters are passed through unchanged. The argument is
        converted with ``str()`` first, so integers are accepted too.
        """
        # str.isdigit() is also True for characters such as '²' or '٣' that are
        # not keys of number_to_letter, which used to raise KeyError.
        # isdecimal() covers the same characters as the regex class \d, and
        # int() folds any such digit onto 0-9 before the table lookup.
        return "".join(
            self.number_to_letter[str(int(c))] if c.isdecimal() else c
            for c in str(student_id)
        )

    def _encrypt_match_text(self, text: str):
        """Encrypt the first ``For: <digits>`` occurrence inside ``text``.

        Returns:
            ``(new_text, original_id)``; ``(text, None)`` when nothing matched.
            A successful match is also recorded in ``id_mappings``.
        """
        m = re.search(r'For:\s*(\d+)', text)
        if not m:
            return text, None
        original_id = m.group(1)
        encrypted_id = self.encrypt_student_id(original_id)
        # Splice by position so only the matched digits change.
        new_text = text[:m.start(1)] + encrypted_id + text[m.end(1):]
        self.id_mappings[original_id] = encrypted_id
        return new_text, original_id

    def _replacement_font_size(self, span: dict) -> int:
        """Scaled font size for ``span`` (default base 10), never below ``min_font``."""
        return max(self.min_font, int(span.get("size", 10) * self.size_scale))

    def _collect_redactions(self, page) -> list:
        """Build ``(rect, new_text, font_size, font_name)`` tuples for one page."""
        pending = []
        for block in page.get_text("dict").get("blocks", []):
            if "lines" not in block:
                continue
            for line in block["lines"]:
                spans = line.get("spans", [])
                line_has_for = any("For:" in s.get("text", "") for s in spans)
                for span in spans:
                    text = span.get("text", "")
                    bbox = span.get("bbox", None)
                    if not bbox or not text:
                        continue

                    # Case A: whole pattern in one span
                    new_text, original_id = self._encrypt_match_text(text)
                    if original_id is None:
                        # Case B: digits are in a separate span on the same line
                        if not line_has_for:
                            continue
                        lone = re.fullmatch(r'(\s*)(\d+)(\s*)', text)
                        if lone is None:
                            continue
                        digits = lone.group(2)
                        encrypted = self.encrypt_student_id(digits)
                        self.id_mappings[digits] = encrypted
                        # Keep the span's surrounding whitespace as it was.
                        new_text = lone.group(1) + encrypted + lone.group(3)

                    pending.append(
                        (fitz.Rect(bbox), new_text, self._replacement_font_size(span), self.font_name)
                    )
        return pending

    def process_pdf(self, input_path: str, output_path: str) -> dict:
        """Rewrite every matched ID in ``input_path`` and save to ``output_path``.

        Each affected span is covered by a redaction annotation (white fill,
        black replacement text) and the redactions are applied page by page.

        Returns:
            The accumulated ``id_mappings`` dict (original → encrypted).
        """
        doc = fitz.open(input_path)
        try:
            for page in doc:
                pending = self._collect_redactions(page)
                for rect, replacement, font_size, font_name in pending:
                    page.add_redact_annot(
                        rect,
                        text=replacement,
                        fill=(1,1,1),
                        text_color=(0,0,0),
                        fontsize=font_size,
                        fontname=font_name,
                    )
                if pending:
                    page.apply_redactions()
            doc.save(output_path)
        finally:
            # Release the document even when a page or the save step fails.
            doc.close()
        return self.id_mappings

# ---------- Excel restore helpers ----------
# Inverse of the digit→letter substitution: 'A'..'J' stand for '0'..'9'.
LETTER_TO_NUM = {letter: str(digit) for digit, letter in enumerate("ABCDEFGHIJ")}

def decrypt_letters(s: str) -> str:
    """Map every upper-case A–J in ``s`` back to its digit; keep the rest as is."""
    decoded = []
    for ch in s:
        decoded.append(LETTER_TO_NUM.get(ch, ch))
    return "".join(decoded)

def restore_text_with_rule(text: str) -> str:
    """Heuristically turn letter-coded IDs in ``text`` back into digits.

    Two rules are applied in order:

    1. ``For: <letters>`` (case-insensitive) where the letters are all A–J and
       are not immediately followed by another letter.
    2. Any stand-alone upper-case token of three or more A–J letters.

    Non-string values are returned unchanged. Both rules are pattern based, so
    an ordinary word made up only of the letters A–J is converted as well.
    """
    if not isinstance(text, str):
        return text
    # Replace sequences like 'For: ABCDEF' → 'For: 123456'.
    # The (?![A-Z]) lookahead (case-insensitive through (?i)) stops the rule
    # from rewriting just the A–J prefix of a longer word: without it
    # 'For: Jack' became 'For: 902k'.
    text = re.sub(
        r'(?i)(For:\s*)([A-J]+)(?![A-Z])',
        lambda m: m.group(1) + decrypt_letters(m.group(2).upper()),
        text,
    )
    # Also convert standalone A–J-only tokens of length ≥ 3 (likely IDs)
    return re.sub(r'\b[A-J]{3,}\b', lambda m: decrypt_letters(m.group(0)), text)

def restore_with_mapping(text: str, enc_to_orig: dict) -> str:
    """Restore encrypted IDs in ``text`` using an encrypted → original lookup.

    Whole-word occurrences of every known encrypted token are substituted
    first, longer tokens before shorter ones. A second pass then looks at
    ``For: <A–J letters>`` and swaps in the original when those letters are a
    known token. Non-string values are returned unchanged.
    """
    if not isinstance(text, str):
        return text

    restored = text
    # Stable sort, so equally long tokens keep the mapping's own order.
    for token in sorted(enc_to_orig, key=len, reverse=True):
        restored = re.sub(rf'\b{re.escape(token)}\b', enc_to_orig[token], restored)

    def _lookup(match):
        prefix, candidate = match.group(1), match.group(2)
        # Unknown candidates are put back untouched.
        return prefix + enc_to_orig.get(candidate, candidate)

    return re.sub(r'(For:\s*)([A-J]+)', _lookup, restored)

# ---------- Gradio functions ----------
def ui_encrypt_pdf(file, font, scale):
    """Gradio callback: encrypt the IDs in an uploaded PDF.

    Args:
        file: The uploaded PDF, either a path string or an object whose
            ``.name`` is the path (which one arrives depends on the Gradio
            version — TODO confirm for the pinned version).
        font: Font name for the replacement text.
        scale: Font size multiplier for the replacement text.

    Returns:
        Paths of three files written to a fresh temp directory: the encrypted
        PDF, the ID mapping as JSON, and the same mapping as an Excel workbook.

    Raises:
        gr.Error: If no file was uploaded.
    """
    if file is None:
        # Previously this fell through to `file.name` and surfaced as an
        # unexplained AttributeError in the UI.
        raise gr.Error("Please upload a PDF before clicking Encrypt.")
    pdf_path = getattr(file, "name", file)

    enc = StudentIDEncryptor(font_name=font, size_scale=scale)
    tmpdir = tempfile.mkdtemp()
    out_pdf = os.path.join(tmpdir, "encrypted.pdf")
    enc.process_pdf(pdf_path, out_pdf)

    map_path = os.path.join(tmpdir, "id_mappings.json")
    with open(map_path, "w") as f:
        json.dump(enc.id_mappings, f, indent=2)

    # Optional: also give CSV/Excel mapping
    mapping_xlsx = os.path.join(tmpdir, "id_mappings.xlsx")
    # Explicit columns keep the header row even when no ID was found.
    pd.DataFrame(
        [{"original": k, "encrypted": v} for k, v in enc.id_mappings.items()],
        columns=["original", "encrypted"],
    ).to_excel(mapping_xlsx, index=False)
    return out_pdf, map_path, mapping_xlsx

def ui_restore_excel(xlsx_file, mappings_json):
    """Gradio callback: restore encrypted IDs in every cell of an Excel sheet.

    When a mapping JSON (original → encrypted, as written by the encrypt step)
    is supplied, it is inverted and applied first. The pattern-based
    ``restore_text_with_rule`` pass then runs on every column regardless.

    Args:
        xlsx_file: The uploaded workbook, as a path string or an object whose
            ``.name`` is the path.
        mappings_json: Optional uploaded mapping file in the same form, or None.

    Returns:
        Path of ``restored.xlsx`` inside a fresh temp directory.

    Raises:
        gr.Error: If no workbook was uploaded.
    """
    if xlsx_file is None:
        raise gr.Error("Please upload an Excel file before clicking Restore IDs.")

    # Resolve both uploads the same way; the original passed one raw and
    # dereferenced .name on the other.
    xlsx_path = getattr(xlsx_file, "name", xlsx_file)
    df = pd.read_excel(xlsx_path)

    # Build mapping if provided
    enc_to_orig = {}
    if mappings_json is not None:
        with open(getattr(mappings_json, "name", mappings_json), "r") as f:
            mapping = json.load(f)
        enc_to_orig = {v: k for k, v in mapping.items()}

    # Apply to all string cells (the helpers return non-strings unchanged)
    df_restored = df.copy()
    for col in df_restored.columns:
        if enc_to_orig:
            df_restored[col] = df_restored[col].apply(lambda x: restore_with_mapping(x, enc_to_orig))
        df_restored[col] = df_restored[col].apply(restore_text_with_rule)

    tmpdir = tempfile.mkdtemp()
    out_xlsx = os.path.join(tmpdir, "restored.xlsx")
    with pd.ExcelWriter(out_xlsx, engine="xlsxwriter") as writer:
        df_restored.to_excel(writer, index=False)
    return out_xlsx

# ---------- Gradio UI ----------
# Two-tab app: one tab encrypts IDs in a PDF, the other restores them in Excel.
# Components appear in the order they are created inside each `with` block.
with gr.Blocks(title="Cadet ID Encryptor/Restorer") as demo:
    gr.Markdown("**Encrypt cadet IDs in PDFs and restore them in Excel results.**")

    # Tab 1: PDF in → encrypted PDF plus the ID mapping as JSON and as Excel.
    with gr.Tab("Encrypt PDF"):
        inp = gr.File(label="Upload PDF")
        font = gr.Dropdown(choices=["Helvetica","Helvetica-Bold","Times-Roman","Courier"], value="Helvetica-Bold", label="Font")
        scale = gr.Slider(1.0, 2.0, value=1.35, step=0.05, label="Font size scale")
        btn = gr.Button("Encrypt")
        out_pdf = gr.File(label="Encrypted PDF")
        out_json = gr.File(label="ID Mappings JSON")
        out_map_xlsx = gr.File(label="ID Mappings Excel")
        # The three return values of ui_encrypt_pdf fill the three outputs in order.
        btn.click(ui_encrypt_pdf, inputs=[inp, font, scale], outputs=[out_pdf, out_json, out_map_xlsx])

    # Tab 2: Excel in (mapping JSON optional) → Excel with IDs restored.
    with gr.Tab("Restore IDs in Excel"):
        xlsx = gr.File(label="Upload Excel with GPT outputs (.xlsx)")
        map_json = gr.File(label="Upload ID Mappings JSON (optional, improves accuracy)")
        btn2 = gr.Button("Restore IDs")
        out_xlsx = gr.File(label="Restored Excel")
        btn2.click(ui_restore_excel, inputs=[xlsx, map_json], outputs=out_xlsx)

# Start the app. NOTE(review): the captured output shows Gradio switching to
# share=True on a hosted notebook, which exposes a public URL — pass
# share=False explicitly if that is not wanted.
demo.launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://56f4ece34e62c06860.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


