In [None]:
!pip install -q -U easyocr opencv-python-headless Pillow google-generativeai

In [None]:
!pip install python-doctr[torch]

In [None]:
import json
import os
import random
import re
import time

import cv2
import easyocr
import google.generativeai as genai
import numpy as np
import torch
from IPython.display import Image, display
from PIL import Image as PILImage, ImageDraw, ImageFont

In [None]:
# Languages the EasyOCR reader is asked to recognise (English and German).
ocr_languages = ['en', 'de']
reader = easyocr.Reader(ocr_languages)

In [None]:
# Directory every generated artifact (JSON, annotated image) is written to.
OUTPUT_DIR = "/kaggle/working/"

# Path of the document image to process. It is empty here, so the OCR cells
# report "Image not found" until a real path is filled in.
IMAGE_PATH = ""

# OpenCV font constant used when drawing the numeric ID labels.
FONT_PATH_CV = cv2.FONT_HERSHEY_SIMPLEX

In [None]:
# JSON file holding the id / bbox / text of every OCR fragment.
OCR_RESULTS_PATH = os.path.join(OUTPUT_DIR, "ocr_results.json")

# Copy of the input image with numbered bounding boxes drawn on it.
SELECTABLE_IMAGE_PATH = os.path.join(OUTPUT_DIR, "selectable_ocr_output.png")

In [None]:
#EasyOCR
# Runs EasyOCR on IMAGE_PATH, prints every detected fragment with a 1-based ID,
# saves the fragments to OCR_RESULTS_PATH and an annotated image to SELECTABLE_IMAGE_PATH.

print("1. Performing OCR to find all text fragments...")

# cv2.imread returns None (it does not raise) when a file cannot be decoded,
# so "missing file" and "unreadable file" are reported separately.
image_found = os.path.exists(IMAGE_PATH)
image = cv2.imread(IMAGE_PATH) if image_found else None

if not image_found:
    print(f"🛑 ERROR: Image not found at '{IMAGE_PATH}'. Please check the path.")
elif image is None:
    print(f"🛑 ERROR: '{IMAGE_PATH}' exists but could not be read as an image.")
else:
    # Use paragraph=False for more granular text boxes, which is better for selection
    ocr_results = reader.readtext(image, paragraph=False)
    print(f"2. Found {len(ocr_results)} text fragments.")

    # Prepare data for saving (and for the next cell)
    output_data_for_json = []
    output_image = image.copy()  # Draw on a copy so `image` stays untouched

    # --- Print options and draw on the image ---
    print("\n--- Detected Text (for selection in Cell 2) ---")
    for i, (bbox, text, _) in enumerate(ocr_results):
        text_id = i + 1  # Use 1-based IDs for user-friendliness
        print(f"  ID: {text_id:<4} | Text: {text}")

        # Add to the list that we will save as JSON (coordinates coerced to plain ints)
        output_data_for_json.append({
            "id": text_id,
            "bbox": [tuple(map(int, p)) for p in bbox],
            "text": text
        })

        # --- Draw labeled bounding boxes on the image ---
        top_left = tuple(map(int, bbox[0]))
        bottom_right = tuple(map(int, bbox[2]))

        cv2.rectangle(output_image, top_left, bottom_right, (0, 0, 255), 2)  # Red box
        # Keep the label baseline inside the image: for boxes touching the top edge,
        # `top_left[1] - 10` would be negative and the ID would be drawn off-canvas.
        label_pos = (top_left[0], max(top_left[1] - 10, 20))
        cv2.putText(output_image, str(text_id), label_pos, FONT_PATH_CV, 0.7, (255, 0, 0), 2)  # Blue ID

    # --- Save artifacts for Cell 2 and for viewing ---
    # Save the OCR results to a file that Cell 2 can read
    with open(OCR_RESULTS_PATH, 'w', encoding='utf-8') as f:
        json.dump(output_data_for_json, f, indent=2)
    print(f"\n✅ OCR results saved to '{OCR_RESULTS_PATH}'")

    # Save the image with bounding boxes
    cv2.imwrite(SELECTABLE_IMAGE_PATH, output_image)
    print(f"✅ Selectable image with labeled boxes saved to '{SELECTABLE_IMAGE_PATH}'")

    print("\n--- Visual Guide ---")
    # Display the image with bounding boxes directly in the notebook output
    display(Image(filename=SELECTABLE_IMAGE_PATH, width=700))

    print("\n➡️ ACTION: Go to Cell 2, edit the 'IDS_TO_REPLACE' list with the IDs you want, and run it.")

In [None]:
#DoCtR OCR
# Runs the docTR line-level OCR on IMAGE_PATH, prints every detected line with a 1-based ID,
# saves the lines to OCR_RESULTS_PATH and an annotated image to SELECTABLE_IMAGE_PATH.

import os
import cv2
import json
from doctr.io import DocumentFile
from doctr.models import ocr_predictor
from PIL import Image as PILImage
from IPython.display import display

print("1. Performing OCR with doctR OCR line model...")

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"▶️ Running on device: {device}")

# Decode the image once up front; it is needed both for the pixel dimensions and for drawing.
# cv2.imread returns None (it does not raise) when a file cannot be decoded.
image_found = os.path.exists(IMAGE_PATH)
source_image = cv2.imread(IMAGE_PATH) if image_found else None

if not image_found:
    print(f"🛑 ERROR: Image not found at '{IMAGE_PATH}'. Please check the path.")
elif source_image is None:
    print(f"🛑 ERROR: '{IMAGE_PATH}' exists but could not be read as an image.")
else:
    # Load the image using doctr's supported loader
    doc = DocumentFile.from_images(IMAGE_PATH)
    model = ocr_predictor(det_arch='db_resnet50', reco_arch='crnn_vgg16_bn', pretrained=True)
    model.det_predictor.model.to(device)
    model.reco_predictor.model.to(device)
    # Predict using doctr (extracts lines, not paragraphs)
    result = model(doc)

    # Image size is the same for every line, so it is computed once outside the loops.
    h, w = source_image.shape[:2]

    # The output will contain pages, lines, and words. We'll extract lines for selection.
    ocr_results = []
    page = result.pages[0]  # a single image was loaded, so only the first page is read
    for block in page.blocks:
        for line in block.lines:
            bbox = line.geometry
            # The geometry is indexed as ((x_min, y_min), (x_max, y_max)) in relative
            # coordinates; scale by the image size to get pixel coordinates for drawing.
            x_min, y_min, x_max, y_max = bbox[0][0]*w, bbox[0][1]*h, bbox[1][0]*w, bbox[1][1]*h
            # Four corners in the order top-left, top-right, bottom-right, bottom-left.
            bbox_pixels = [
                (int(x_min), int(y_min)),
                (int(x_max), int(y_min)),
                (int(x_max), int(y_max)),
                (int(x_min), int(y_max)),
            ]
            text = " ".join(word.value for word in line.words)
            ocr_results.append((bbox_pixels, text))

    print(f"2. Found {len(ocr_results)} text line fragments.")

    # Prepare data for saving
    output_data_for_json = []
    output_image = source_image.copy()  # Draw on a copy of the already-decoded image

    print("\n--- Detected Text (for selection in Cell 2) ---")
    for i, (bbox, text) in enumerate(ocr_results):
        text_id = i + 1  # Use 1-based IDs for user-friendliness
        print(f"  ID: {text_id:<4} | Text: {text}")

        # Add to the JSON list
        output_data_for_json.append({
            "id": text_id,
            "bbox": bbox,
            "text": text
        })

        # Draw bounding boxes on the image
        top_left = bbox[0]
        bottom_right = bbox[2]
        cv2.rectangle(output_image, top_left, bottom_right, (0, 0, 255), 2)  # Red box
        # Keep the label baseline inside the image: for boxes touching the top edge,
        # `top_left[1] - 10` would be negative and the ID would be drawn off-canvas.
        label_pos = (top_left[0], max(top_left[1] - 10, 20))
        cv2.putText(output_image, str(text_id), label_pos, FONT_PATH_CV, 0.7, (255, 0, 0), 2)  # Blue ID

    # Save OCR results to JSON
    with open(OCR_RESULTS_PATH, 'w', encoding='utf-8') as f:
        json.dump(output_data_for_json, f, indent=2)
    print(f"\n✅ OCR results saved to '{OCR_RESULTS_PATH}'")

    # Save the image with bounding boxes
    cv2.imwrite(SELECTABLE_IMAGE_PATH, output_image)
    print(f"✅ Selectable image with labeled boxes saved to '{SELECTABLE_IMAGE_PATH}'")

    print("\n--- Visual Guide ---")
    # Show the annotated image scaled to 700 px wide, preserving the aspect ratio.
    display(PILImage.open(SELECTABLE_IMAGE_PATH).resize((700, int(output_image.shape[0] * 700 / output_image.shape[1]))))

    print("\n➡️ ACTION: Go to Cell 2, edit the 'IDS_TO_REPLACE' list with the IDs you want, and run it.")


In [None]:
# IDs (as printed by the OCR cell) of the text fragments that should be replaced.
# Left empty here; fill it in before running the cells below.
IDS_TO_REPLACE = []

In [None]:
# Notebook-level Gemini model handle.
# NOTE(review): anonymize_selected_text() constructs its own model object, so this
# global does not appear to be used by the generation step visible in this notebook.
GEMINI_MODEL_NAME = "gemini-1.5-flash-latest"
model = genai.GenerativeModel(model_name=GEMINI_MODEL_NAME)

In [None]:
# Notebook-level sampling configuration (temperature 0.9).
# NOTE(review): anonymize_selected_text() builds its own GenerationConfig, so this
# global does not appear to be used by the generation step visible in this notebook.
generation_config = genai.types.GenerationConfig(temperature=0.9)

In [None]:
# Font file used to render the replacement text onto the image.
FONT_PATH = "/kaggle/input/arial-ttf/arial.ttf"

# Working directory for generated artifacts, and the files kept inside it.
OUTPUT_DIR = "/kaggle/working/"
FINAL_IMAGE_PATH = os.path.join(OUTPUT_DIR, "final_anonymized_image.png")
OCR_RESULTS_PATH = os.path.join(OUTPUT_DIR, "ocr_results.json")

In [None]:
# 1. Read back the fragments written by the OCR cell
with open(OCR_RESULTS_PATH, 'r', encoding='utf-8') as ocr_file:
    ocr_data = json.load(ocr_file)

# 2. Map each selected fragment ID to its original text
text_replacements = {
    entry['id']: entry['text']
    for entry in ocr_data
    if entry['id'] in IDS_TO_REPLACE
}

# 3. Show the selection so it can be checked by eye
print("Selected Text Fragments for Replacement:")
print(json.dumps(text_replacements, indent=2))
print(f"\nTotal items to replace: {len(text_replacements)}")

# 4. Persist the selection next to the other artifacts
output_path = "/kaggle/working/text_replacements.json"
with open(output_path, 'w', encoding='utf-8') as selection_file:
    json.dump(text_replacements, selection_file, ensure_ascii=False, indent=2)

print(f"\nSaved JSON file to: {output_path}")

In [None]:
# Configure the Gemini client from the GOOGLE_API_KEY environment variable instead of a
# key typed into the notebook, so the credential is never saved with the file.
# Set it beforehand, e.g. os.environ["GOOGLE_API_KEY"] = <value read from your secret store>.
try:
    api_key = os.environ.get("GOOGLE_API_KEY", "").strip()
    if not api_key:
        # Fail here with a clear message rather than on the first API call later on.
        raise ValueError("environment variable 'GOOGLE_API_KEY' is empty or not set")
    genai.configure(api_key=api_key)
    print("✅ Successfully configured Gemini API.")
except Exception as e:
    print(f"🛑 ERROR: Could not configure Gemini. Please ensure you have set the 'GOOGLE_API_KEY' secret. Details: {e}")

In [None]:
import time
# Current wall-clock time (float seconds since the epoch) captured when this cell runs.
# NOTE(review): no other cell visible in this notebook reads `random_seed` — confirm it is still needed.
random_seed = time.time()

In [None]:
# Style themes the prompt can be flavoured with; extend this list to suit the document type.
import random

style_themes = [
    "plain and simple",
    "traditional",
]

# One theme is drawn at random each time this cell runs; anonymize_selected_text()
# interpolates `chosen_theme` into its prompt.
chosen_theme = random.choice(style_themes)
print(f"Injecting random theme for this run: '{chosen_theme}'")

In [None]:
def apply_fake_data(data, fake_map):
    """
    Return a copy of ``data`` in which leaf values found in ``fake_map`` are swapped.

    Dicts and lists are walked recursively. An int leaf is looked up by its
    string form, a str leaf by itself. A str containing commas is split on
    ',', each stripped piece is looked up on its own, and the pieces are
    rejoined with ', '. Leaves of any other type are returned unchanged.
    Every lookup decision is printed.
    """
    # Containers: rebuild them with every child processed.
    if isinstance(data, dict):
        rebuilt = {}
        for field, value in data.items():
            rebuilt[field] = apply_fake_data(value, fake_map)
        return rebuilt

    if isinstance(data, list):
        return [apply_fake_data(element, fake_map) for element in data]

    # Anything that is neither int nor str is passed through untouched.
    if not isinstance(data, (int, str)):
        return data

    # Comma-separated string: map each piece independently, then rejoin.
    if isinstance(data, str) and ',' in data:
        rejoined = []
        for segment in (piece.strip() for piece in data.split(',')):
            if segment in fake_map:
                print(f"Replacing segment {segment!r} → {fake_map[segment]!r}")
                rejoined.append(str(fake_map[segment]))
            else:
                print(f"Keeping   segment {segment!r}")
                rejoined.append(segment)
        return ', '.join(rejoined)

    # Single scalar: ints are looked up by their string form, strings as-is.
    lookup_key = str(data) if isinstance(data, int) else data
    if lookup_key not in fake_map:
        print(f"Keeping   {data!r}")
        return data

    print(f"Replacing {data!r} → {fake_map[lookup_key]!r}")
    return fake_map[lookup_key]

In [None]:
def _load_base_font(font_path):
    """Load the TrueType font at ``font_path``, falling back to Pillow's default font."""
    if os.path.exists(font_path):
        try:
            return ImageFont.truetype(font_path, 10)
        except Exception as e:
            print(f"Warning: Could not load custom font {font_path}. Using default. Error: {e}")
    else:
        print(f"Warning: Custom font at '{font_path}' not found. Using default.")
    return ImageFont.load_default()


def _fit_font(draw, base_font, text, box_width, box_height, min_size=6):
    """Return a variant of ``base_font`` sized to the box, shrunk until ``text`` fits its width."""
    # The default font may be a fixed-size bitmap font without font_variant();
    # it cannot be resized, so it is used as it is.
    if not hasattr(base_font, "font_variant"):
        return base_font

    # Start at 90% of the box height; never request a size below 1.
    font_size = max(int(box_height * 0.9), 1)
    font = base_font.font_variant(size=font_size)
    # Shrink one point at a time, but never below min_size.
    while font_size > min_size and draw.textbbox((0, 0), text, font=font)[2] > box_width:
        font_size -= 1
        font = base_font.font_variant(size=font_size)
    return font


def anonymize_selected_text(
    original_image_path,
    ocr_data_path,
    ids_to_replace,
    output_image_path="anonymized_output.png",
    output_json_path="anonymized_output.json",
    template_json_path="/kaggle/input/test-json/inv.json",
):
    """
    Replace selected OCR fragments of an image with Gemini-generated fake text.

    Steps: load the OCR fragments, ask Gemini for a fake replacement per selected
    ID, apply the same mapping to a template JSON via ``apply_fake_data``, inpaint
    the original text regions and draw the replacement text into each box.

    Reads the notebook globals ``chosen_theme`` (prompt style) and ``FONT_PATH``
    (font used for rendering).

    Args:
        original_image_path: Path of the image to anonymize.
        ocr_data_path: JSON file with a list of ``{"id", "bbox", "text"}`` entries.
        ids_to_replace: IDs of the fragments to replace.
        output_image_path: Where the rendered image is written.
        output_json_path: Where the template JSON with fake values is written.
        template_json_path: JSON file whose leaf values are run through the fake-data map.

    Returns:
        ``output_image_path`` on success; ``None`` when no selected ID matches,
        the image cannot be read, or the Gemini call fails.
    """
    print(f"\n1. Loading OCR data and filtering for {len(ids_to_replace)} selected IDs...")
    with open(ocr_data_path, 'r', encoding='utf-8') as f:
        all_ocr_data = json.load(f)

    selected_ids = set(ids_to_replace)
    items_to_replace = [item for item in all_ocr_data if item['id'] in selected_ids]
    if not items_to_replace:
        print("Warning: The IDs listed in IDS_TO_REPLACE were not found in the OCR data. Nothing to do.")
        return None

    # Read the image before calling Gemini so an unreadable file does not cost an API call.
    # cv2.imread returns None (it does not raise) when the file cannot be decoded.
    original_image = cv2.imread(original_image_path)
    if original_image is None:
        print(f"🛑 ERROR: Could not read image at '{original_image_path}'.")
        return None

    # 2. Call Gemini to generate fake data
    print("2. Calling Gemini to generate fake data...")
    try:
        model = genai.GenerativeModel(model_name="gemini-2.5-flash")
        generation_config = genai.types.GenerationConfig(temperature=2.0, top_p=0.95)

        prompt_data = [{"id": item["id"], "text": item["text"]} for item in items_to_replace]

        prompt = f"""
You are an expert data anonymizer for English documents. Your creative style for this task is: "{chosen_theme}".
Based on the following list of items, generate a realistic but fake English replacement for each and every item(even if it is repeating) that fits your assigned style.Keep letters and numbers between (1-0) and [A-Z][a-z]
# Respond with ONLY a single JSON object that maps the original 'id' (as a string) to the new fake text string.YOU SHOULD NOT KEEP ANYTHING REPLACE EVERYTHING EXCEPT THE PLACEHOLDERS.Don't forget to replace the tax percentage. Ensure the output is unique and creative.
Keep the Placeholders of the values of the document same when faking the data. You should still be able to recognize if it is an invoice .
YOU SHOULD NOT KEEP ANYTHING REPLACE EVERYTHING EXCEPT THE PLACEHOLDERS. 
Input: {json.dumps(prompt_data, indent=2, ensure_ascii=False)}
"""
        response = model.generate_content(prompt, generation_config=generation_config, request_options={"timeout": 90})
        # The reply may wrap the JSON in extra text; keep the span from the first '{' to the last '}'.
        json_match = re.search(r'\{.*\}', response.text, re.DOTALL)
        if not json_match:
            raise ValueError("No JSON object found in Gemini response.")
        fake_data_map = json.loads(json_match.group(0))
        if not isinstance(fake_data_map, dict):
            raise ValueError("Gemini response is not a JSON object.")
        print("✅ Gemini returned fake data successfully.")
    except Exception as e:
        print(f"🛑 ERROR: Failed to get fake data from Gemini. Error: {e}")
        return None

    # Template JSON whose values are filled with the replaced values
    with open(template_json_path, 'r', encoding='utf-8') as f:
        original_data = json.load(f)

    updated_json = apply_fake_data(original_data, fake_data_map)

    # Save to the custom JSON output path
    with open(output_json_path, 'w', encoding='utf-8') as f:
        json.dump(updated_json, f, indent=2, ensure_ascii=False)
    print(f"✅ Fake data mapped and saved to '{output_json_path}'")

    # 3. Inpaint and Render
    print("3. Inpainting original text and rendering new text...")
    # Single-channel mask: 255 inside every selected box, 0 elsewhere.
    mask = np.zeros(original_image.shape[:2], dtype="uint8")
    for item in items_to_replace:
        cv2.fillPoly(mask, [np.array(item['bbox'], dtype=np.int32)], 255)

    inpainted_image_cv = cv2.inpaint(original_image, mask, inpaintRadius=5, flags=cv2.INPAINT_TELEA)
    # `Image` in this notebook is IPython's display class, so the PIL module alias is used here.
    inpainted_image_pil = PILImage.fromarray(cv2.cvtColor(inpainted_image_cv, cv2.COLOR_BGR2RGB))
    draw = ImageDraw.Draw(inpainted_image_pil)

    base_font = _load_base_font(FONT_PATH)

    for item in items_to_replace:
        new_text = fake_data_map.get(str(item['id']))
        if new_text is None:
            continue
        # The model may return numbers; text measurement and drawing need a string.
        new_text = str(new_text)
        if not new_text:
            continue

        # Corner 0 is used as top-left and corner 2 as bottom-right of the box.
        tl, _, br, _ = item['bbox']
        x1, y1, x2, y2 = int(tl[0]), int(tl[1]), int(br[0]), int(br[1])

        box_width, box_height = x2 - x1, y2 - y1
        if box_width <= 0 or box_height <= 0:
            continue

        # 1. DYNAMIC FONT SIZING
        font = _fit_font(draw, base_font, new_text, box_width, box_height)

        # 2. CENTER TEXT IN THE BOX
        text_bbox = draw.textbbox((0, 0), new_text, font=font)
        text_width = text_bbox[2] - text_bbox[0]
        text_height = text_bbox[3] - text_bbox[1]

        position_x = x1 + (box_width - text_width) / 2
        # Subtract the bbox top offset so the glyphs, not the text origin, are centered.
        position_y = y1 + (box_height - text_height) / 2 - text_bbox[1]

        draw.text((position_x, position_y), new_text, font=font, fill=(0, 0, 0))

    inpainted_image_pil.save(output_image_path)
    print(f"✨ Final image saved to '{output_image_path}'")
    return output_image_path

In [None]:
import os

# How many anonymized variants to produce, and where images / JSON files go.
NUM_VARIANTS = 100
SAVE_DIR = "output_images_1"
JSON_OUTPUT_DIR = "output_jsons"

for directory in (SAVE_DIR, JSON_OUTPUT_DIR):
    os.makedirs(directory, exist_ok=True)

# Paths of the images that were actually written.
generated_paths = []

for variant_no in range(1, NUM_VARIANTS + 1):
    print(f"\n=== Generating variant {variant_no}/{NUM_VARIANTS} ===")

    # Image and JSON of one variant share the same file stem.
    variant_stem = f"anonymized_variant_{variant_no}"
    output_path = os.path.join(SAVE_DIR, variant_stem + ".png")
    output_json_path = os.path.join(JSON_OUTPUT_DIR, variant_stem + ".json")

    anonymized_image_path = anonymize_selected_text(
        IMAGE_PATH,
        OCR_RESULTS_PATH,
        IDS_TO_REPLACE,
        output_image_path=output_path,
        output_json_path=output_json_path,
    )

    # The function returns None on failure; also confirm the file is really on disk.
    if not (anonymized_image_path and os.path.exists(anonymized_image_path)):
        print(f"❌ Variant {variant_no} failed to generate.")
        continue

    generated_paths.append(anonymized_image_path)
    print(f"✅ Saved: {output_path} + {output_json_path}")

In [None]:
import shutil

# Zip the folder the variant images were actually written to. The generation cell saves
# them under SAVE_DIR ("output_images_1"); the previously hard-coded
# '/kaggle/working/output_images' is never created by this notebook.
folder_to_zip = SAVE_DIR

# Name of the zip file
output_zip = '/kaggle/working/output_images.zip'

# make_archive appends the '.zip' extension itself, so pass the path without it.
shutil.make_archive(base_name=os.path.splitext(output_zip)[0],
                    format='zip',
                    root_dir=folder_to_zip)
print(f"✅ Zipped: {output_zip}")

In [None]:
import shutil

# Folder holding the per-variant JSON files, and the archive to create from it.
folder_to_zip = '/kaggle/working/output_jsons'
output_zip = '/kaggle/working/output_jsons.zip'

# make_archive appends the '.zip' extension itself, so strip it from the target name first.
archive_base = output_zip.replace('.zip', '')
shutil.make_archive(archive_base, 'zip', folder_to_zip)
print(f"✅ Zipped: {output_zip}")