In [1]:
# create a venv/env if you prefer, or use base env
pip install pillow piexif pytesseract opencv-python


SyntaxError: invalid syntax (2750843489.py, line 2)

In [14]:
"""
image_inspector.py

Scans a folder for image files, extracts metadata + optional OCR text,
and outputs a JSON result per image and a combined JSON file.

Usage:
    python image_inspector.py
"""

import os
import json
import time
from pathlib import Path
from datetime import datetime

from PIL import Image, ExifTags
import piexif
import json
from PIL.TiffImagePlugin import IFDRational
# pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# Optional: OCR
try:
    import pytesseract
    OCR_AVAILABLE = True
except Exception:
    OCR_AVAILABLE = False

# ------------- Configuration -------------
SEARCH_PATH = Path(r"C:\Users\Public\Documents\Plustek-SecureScan\Image")
# Where to save results
OUTPUT_JSON = Path.cwd() / "image_scan_results.json"

# If you installed Tesseract in a non-standard location set it here:
# Example on Windows:
# pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
# Uncomment and edit the line below if needed:
# pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# File extensions considered images:
IMAGE_EXTS = {".jpg", ".jpeg", ".png", ".tif", ".tiff", ".bmp", ".gif", ".webp"}


def file_times_to_iso(path: Path):
    """Return creation/modification times as ISO strings (best effort)."""
    try:
        mtime = datetime.fromtimestamp(path.stat().st_mtime).isoformat()
    except Exception:
        mtime = None
    # Prefer creation time on Windows; may not be meaningful on other OS
    try:
        ctime = datetime.fromtimestamp(path.stat().st_ctime).isoformat()
    except Exception:
        ctime = None
    return {"created": ctime, "modified": mtime}

def safe_json(obj):
    """Convert objects that aren't JSON serializable."""
    if isinstance(obj, IFDRational):
        return float(obj)
    if isinstance(obj, bytes):
        return obj.decode(errors="ignore")
    if isinstance(obj, (set, tuple)):
        return list(obj)
    # Fallback
    return str(obj)
def extract_exif(image: Image.Image):
    """Return a dictionary of EXIF tags (if present), decoded to human names."""
    exif_data = {}
    try:
        raw_exif = image._getexif()
        if raw_exif:
            for tag_id, value in raw_exif.items():
                tag = ExifTags.TAGS.get(tag_id, tag_id)
                # Many EXIF values are bytes — convert to str where reasonable
                try:
                    if isinstance(value, bytes):
                        value = value.decode(errors="ignore")
                except Exception:
                    pass
                exif_data[tag] = value
    except Exception:
        pass

    # piexif (another option) to get GPS etc.
    try:
        exif_bytes = image.info.get("exif")
        if exif_bytes:
            parsed = piexif.load(exif_bytes)
            # Flatten and stringify some fields
            for ifd_name in parsed:
                if ifd_name == "thumbnail":
                    continue
                for key, val in parsed[ifd_name].items():
                    try:
                        key_name = piexif.TAGS[ifd_name][key]["name"]
                    except Exception:
                        key_name = f"{ifd_name}_{key}"
                    # convert bytes to str where possible
                    if isinstance(val, (bytes, bytearray)):
                        try:
                            val = val.decode("utf-8", errors="ignore")
                        except Exception:
                            val = str(val)
                    exif_data[f"{ifd_name}.{key_name}"] = val
    except Exception:
        # piexif may raise on non-jpeg or missing exif
        pass

    return exif_data


def ocr_image(path: Path, lang: str = "eng"):
    """Run OCR on the image file and return extracted text.
       Returns None if pytesseract/Tesseract not available or OCR fails.
    """
    if not OCR_AVAILABLE:
        return None
    try:
        # If Tesseract path needs to be set, user should set pytesseract.pytesseract.tesseract_cmd above
        text = pytesseract.image_to_string(str(path), lang=lang)
        return text.strip()
    except Exception:
        return None


def process_image(path: Path, do_ocr: bool = True):
    """Extract metadata and optional OCR from one image. Returns a dict."""
    result = {
        "path": str(path),
        "filename": path.name,
        "exists": path.exists(),
    }

    try:
        stat = path.stat()
        result["size_bytes"] = stat.st_size
    except Exception:
        result["size_bytes"] = None

    # file times
    result.update(file_times_to_iso(path))

    # open image to get more details
    try:
        with Image.open(path) as img:
            result["format"] = img.format
            result["mode"] = img.mode
            result["width"], result["height"] = img.size
            # EXIF
            result["exif"] = extract_exif(img)
    except Exception as e:
        result["format"] = None
        result["mode"] = None
        result["width"] = None
        result["height"] = None
        result["exif"] = {}
        result["error_opening_image"] = str(e)

    # OCR (if requested and available)
    ocr_text = None
    if do_ocr:
        ocr_text = ocr_image(path)
    result["ocr_text"] = ocr_text

    return result


def scan_folder(folder: Path, do_ocr: bool = True):
    """Scan folder for image files and return list of result dicts."""
    results = []
    if not folder.exists():
        raise FileNotFoundError(f"Folder not found: {folder}")

    for entry in folder.iterdir():
        if entry.is_file() and entry.suffix.lower() in IMAGE_EXTS:
            try:
                res = process_image(entry, do_ocr=do_ocr)
                results.append(res)
            except Exception as e:
                results.append({
                    "path": str(entry),
                    "error": f"Processing error: {e}"
                })
    return results


def main():
    print(f"Scanning folder: {SEARCH_PATH}")
    try:
        results = scan_folder(SEARCH_PATH, do_ocr=True)
    except FileNotFoundError as e:
        print("Error:", e)
        return

    # Output JSON to stdout and save to file
    combined = {
        "scanned_at": datetime.now().isoformat(),
        "folder": str(SEARCH_PATH),
        "image_count": len(results),
        "images": results
    }

    # Pretty print to console
    print(json.dumps(combined, indent=2, ensure_ascii=False, default=safe_json))


    # Save to file
    try:
        with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
            json.dump(combined, f, indent=2, ensure_ascii=False)
        print(f"\nResults saved to {OUTPUT_JSON}")
    except Exception as e:
        print("Failed to save results:", e)


if __name__ == "__main__":
    main()


Scanning folder: C:\Users\Public\Documents\Plustek-SecureScan\Image
{
  "scanned_at": "2025-10-07T14:56:49.442705",
  "folder": "C:\\Users\\Public\\Documents\\Plustek-SecureScan\\Image",
  "image_count": 1,
  "images": [
    {
      "path": "C:\\Users\\Public\\Documents\\Plustek-SecureScan\\Image\\DEI.jpg - Copy.jpg",
      "filename": "DEI.jpg - Copy.jpg",
      "exists": true,
      "size_bytes": 368861,
      "created": "2025-10-07T14:48:54.943660",
      "modified": "2024-06-12T15:18:14.378948",
      "format": "JPEG",
      "mode": "RGB",
      "width": 1300,
      "height": 813,
      "exif": {},
      "ocr_text": null
    }
  ]
}

Results saved to C:\Users\iTEK\Desktop\python\image_scan_results.json


In [23]:
from PIL import Image
import pytesseract
import json
import os
import pytesseract
from PIL import Image
import pytesseract

pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

text = pytesseract.image_to_string(
    Image.open(r"C:\Users\Public\Documents\Plustek-SecureScan\Image\")
)

# pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\tesseract-main\tesseract.exe"

def extract_text_from_image(image_path):
    """Extract raw text from image."""
    text = pytesseract.image_to_string(Image.open(image_path))
    return text

def extract_id_info(text):
    """Example of pattern-based text extraction (safe generic template)."""
    info = {}
    lines = text.split('\n')

    # Example pattern (for local testing)
    for line in lines:
        line = line.strip()
        if "Names" in line:
            info["Names"] = line.split(":")[-1].strip()
        elif "Date of Birth" in line:
            info["Date of Birth"] = line.split(":")[-1].strip()
        elif "National ID" in line:
            info["National ID"] = line.split(":")[-1].strip()

    return info

def main():
    folder = r"C:\Users\Public\Documents\Plustek-SecureScan\Image"
    results = []

    for file in os.listdir(folder):
        if file.lower().endswith((".jpg", ".jpeg", ".png")):
            path = os.path.join(folder, file)
            text = extract_text_from_image(path)
            info = extract_id_info(text)
            results.append({
                "file": file,
                "data": info
            })

    print(json.dumps(results, indent=2, ensure_ascii=False))

if __name__ == "__main__":
    main()


SyntaxError: unterminated string literal (detected at line 12); perhaps you escaped the end quote? (2837278767.py, line 12)

In [27]:
from PIL import Image
import pytesseract
import os
import json

pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

def extract_text_from_image(image_path):
    """Extract raw text from one image."""
    return pytesseract.image_to_string(Image.open(image_path))

def extract_id_info(text):
    """Extract example info from OCR text."""
    info = {}
    for line in text.splitlines():
        if "Names" in line:
            info["Names"] = line.split(":")[-1].strip()
        elif "Date of Birth" in line:
            info["Date of Birth"] = line.split(":")[-1].strip()
        elif "National ID" in line:
            info["National ID"] = line.split(":")[-1].strip()
    return info

def main():
    folder = r"C:\Users\Public\Documents\Plustek-SecureScan\Image"
    results = []

    # Loop through all image files
    for file in os.listdir(folder):
        if file.lower().endswith((".jpg", ".jpeg", ".png")):
            path = os.path.join(folder, file)
            print(f"Processing: {path}")
            text = extract_text_from_image(path)
            info = extract_id_info(text)
            results.append({"file": file, "data": info})

    # Print or save results
    print(json.dumps(results, indent=2, ensure_ascii=False))

if __name__ == "__main__":
    main()


Processing: C:\Users\Public\Documents\Plustek-SecureScan\Image\DEI.jpg.jpg
[
  {
    "file": "DEI.jpg.jpg",
    "data": {
      "Names": "Amazina./' Names",
      "National ID": "indangamuntu / National ID No. 14 14999 8 0064830 3 81 ¢"
    }
  }
]


In [29]:
from PIL import Image
import pytesseract
import os
import json

# Set path to your Tesseract executable
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

def extract_text_from_image(image_path):
    """Extract full text from one image."""
    text = pytesseract.image_to_string(Image.open(image_path), lang="eng")
    return text.strip()

def main():
    folder = r"C:\Users\Public\Documents\Plustek-SecureScan\Image"
    results = []

    # Check if folder exists
    if not os.path.exists(folder):
        print("⚠️ Folder not found:", folder)
        return

    # Process all images
    for file in os.listdir(folder):
        if file.lower().endswith((".jpg", ".jpeg", ".png", ".tif", ".bmp")):
            path = os.path.join(folder, file)
            print(f"🖼️ Processing: {file} ...")
            try:
                text = extract_text_from_image(path)
                results.append({
                    "file": file,
                    "content": text
                })
            except Exception as e:
                results.append({
                    "file": file,
                    "error": str(e)
                })

    # Print JSON result
    json_result = json.dumps(results, indent=2, ensure_ascii=False)
    print("\n✅ Extracted JSON Output:")
    print(json_result)

    # Optionally save JSON to a file
    output_path = os.path.join(folder, "extracted_text.json")
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(json_result)
    print(f"\n💾 Results saved to: {output_path}")

if __name__ == "__main__":
    main()


🖼️ Processing: Youssouf.jpg ...

✅ Extracted JSON Output:
[
  {
    "file": "Youssouf.jpg",
    "content": "UNION DES COMORES fail pala 4s sean\n\nType /és)) Code du paya sl) jay Passeport p°J jis\nPASSEPORT PO COM ~ ~ .NBE668499_\nhs Noe\n\nNom Jest)\nYOUSSOUF\nPrion / ith\nANRFOUDINE\nNationatité | Ania!\nCOMORIENNE\n\nda nelssance Daa) fos pas :\nj 07-05-2002 UC1085945\n\nJ (sees iced mu da naissance 3 Jae\nMM BANDRABOUA ;\n\nJF blake sombre cet ek Authorité / Gata\n\n{ 09-08-2022 Dipsetlon Géinérafe.de is Police,\n\n\"et de la Sareté Nation:\nDate d’expiration | Yanail plysll\n\n09-08-2027\n\nPOCOMYOUSSOUF<<ANRFOUDINE<<<<<<<<<<<<<<<<<<<\nNBE6684990COM0205078M2708090UC1085945<<<<<68"
  }
]

💾 Results saved to: C:\Users\Public\Documents\Plustek-SecureScan\Image\extracted_text.json


In [30]:
from PIL import Image
import pytesseract
import os
import json

# Set path to your Tesseract executable
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

def extract_text_from_image(image_path):
    """Extract full text from one image."""
    text = pytesseract.image_to_string(Image.open(image_path), lang="eng")
    return text.strip()

def main():
    folder = r"C:\Users\Public\Documents\Plustek-SecureScan\Image"
    results = []

    # Check if folder exists
    if not os.path.exists(folder):
        print("⚠️ Folder not found:", folder)
        return

    # Process all images
    for file in os.listdir(folder):
        if file.lower().endswith((".jpg", ".jpeg", ".png", ".tif", ".bmp")):
            path = os.path.join(folder, file)
            print(f"🖼️ Processing: {file} ...")
            try:
                text = extract_text_from_image(path)
                results.append({
                    "file": file,
                    "content": text
                })
            except Exception as e:
                results.append({
                    "file": file,
                    "error": str(e)
                })

    # Print JSON result
    json_result = json.dumps(results, indent=2, ensure_ascii=False)
    print("\n✅ Extracted JSON Output:")
    print(json_result)

    # Optionally save JSON to a file
    output_path = os.path.join(folder, "extracted_text.json")
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(json_result)
    print(f"\n💾 Results saved to: {output_path}")

if __name__ == "__main__":
    main()


🖼️ Processing: Youssouf.jpg ...

✅ Extracted JSON Output:
[
  {
    "file": "Youssouf.jpg",
    "content": "UNION DES COMORES fail pala 4s sean\n\nType /és)) Code du paya sl) jay Passeport p°J jis\nPASSEPORT PO COM ~ ~ .NBE668499_\nhs Noe\n\nNom Jest)\nYOUSSOUF\nPrion / ith\nANRFOUDINE\nNationatité | Ania!\nCOMORIENNE\n\nda nelssance Daa) fos pas :\nj 07-05-2002 UC1085945\n\nJ (sees iced mu da naissance 3 Jae\nMM BANDRABOUA ;\n\nJF blake sombre cet ek Authorité / Gata\n\n{ 09-08-2022 Dipsetlon Géinérafe.de is Police,\n\n\"et de la Sareté Nation:\nDate d’expiration | Yanail plysll\n\n09-08-2027\n\nPOCOMYOUSSOUF<<ANRFOUDINE<<<<<<<<<<<<<<<<<<<\nNBE6684990COM0205078M2708090UC1085945<<<<<68"
  }
]

💾 Results saved to: C:\Users\Public\Documents\Plustek-SecureScan\Image\extracted_text.json


In [31]:
from PIL import Image
import pytesseract
import os
import json
import requests

# Set the path to Tesseract
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# Your API endpoint
API_URL = "https://your-api-endpoint.com/upload"  # 🟢 change this to your actual API URL

def extract_text_from_image(image_path):
    """Extract full text from an image using OCR."""
    text = pytesseract.image_to_string(Image.open(image_path), lang="eng")
    return text.strip()

def main():
    folder = r"C:\Users\Public\Documents\Plustek-SecureScan\Image"
    results = []

    if not os.path.exists(folder):
        print("⚠️ Folder not found:", folder)
        return

    for file in os.listdir(folder):
        if file.lower().endswith((".jpg", ".jpeg", ".png", ".tif", ".bmp")):
            path = os.path.join(folder, file)
            print(f"🖼️ Processing: {file} ...")
            try:
                text = extract_text_from_image(path)
                results.append({
                    "file": file,
                    "content": text
                })
            except Exception as e:
                results.append({
                    "file": file,
                    "error": str(e)
                })

    # Convert to JSON
    json_payload = json.dumps(results, ensure_ascii=False)

    # POST to API
    try:
        print("📤 Sending data to API...")
        headers = {"Content-Type": "application/json"}
        response = requests.post(API_URL, data=json_payload.encode('utf-8'), headers=headers)
        
        print("✅ Server responded with:")
        print(response.status_code, response.text)
    except Exception as e:
        print("❌ Failed to post data:", e)

if __name__ == "__main__":
    main()


🖼️ Processing: Youssouf.jpg ...
📤 Sending data to API...
❌ Failed to post data: HTTPSConnectionPool(host='your-api-endpoint.com', port=443): Max retries exceeded with url: /upload (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x0000021550C09BE0>, 'Connection to your-api-endpoint.com timed out. (connect timeout=None)'))
