In [1]:
# create a venv/env if you prefer, or use base env
pip install pillow piexif pytesseract opencv-python


SyntaxError: invalid syntax (2750843489.py, line 2)

In [14]:
"""
image_inspector.py

Scans a folder for image files, extracts metadata + optional OCR text,
and outputs a JSON result per image and a combined JSON file.

Usage:
    python image_inspector.py
"""

import os
import json
import time
from pathlib import Path
from datetime import datetime

from PIL import Image, ExifTags
import piexif
import json
from PIL.TiffImagePlugin import IFDRational
# pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# Optional: OCR
try:
    import pytesseract
    OCR_AVAILABLE = True
except Exception:
    OCR_AVAILABLE = False

# ------------- Configuration -------------
SEARCH_PATH = Path(r"C:\Users\Public\Documents\Plustek-SecureScan\Image")
# Where to save results
OUTPUT_JSON = Path.cwd() / "image_scan_results.json"

# If you installed Tesseract in a non-standard location set it here:
# Example on Windows:
# pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
# Uncomment and edit the line below if needed:
# pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# File extensions considered images:
IMAGE_EXTS = {".jpg", ".jpeg", ".png", ".tif", ".tiff", ".bmp", ".gif", ".webp"}


def file_times_to_iso(path: Path):
    """Return creation/modification times as ISO strings (best effort)."""
    try:
        mtime = datetime.fromtimestamp(path.stat().st_mtime).isoformat()
    except Exception:
        mtime = None
    # Prefer creation time on Windows; may not be meaningful on other OS
    try:
        ctime = datetime.fromtimestamp(path.stat().st_ctime).isoformat()
    except Exception:
        ctime = None
    return {"created": ctime, "modified": mtime}

def safe_json(obj):
    """Convert objects that aren't JSON serializable."""
    if isinstance(obj, IFDRational):
        return float(obj)
    if isinstance(obj, bytes):
        return obj.decode(errors="ignore")
    if isinstance(obj, (set, tuple)):
        return list(obj)
    # Fallback
    return str(obj)
def extract_exif(image: Image.Image):
    """Return a dictionary of EXIF tags (if present), decoded to human names."""
    exif_data = {}
    try:
        raw_exif = image._getexif()
        if raw_exif:
            for tag_id, value in raw_exif.items():
                tag = ExifTags.TAGS.get(tag_id, tag_id)
                # Many EXIF values are bytes — convert to str where reasonable
                try:
                    if isinstance(value, bytes):
                        value = value.decode(errors="ignore")
                except Exception:
                    pass
                exif_data[tag] = value
    except Exception:
        pass

    # piexif (another option) to get GPS etc.
    try:
        exif_bytes = image.info.get("exif")
        if exif_bytes:
            parsed = piexif.load(exif_bytes)
            # Flatten and stringify some fields
            for ifd_name in parsed:
                if ifd_name == "thumbnail":
                    continue
                for key, val in parsed[ifd_name].items():
                    try:
                        key_name = piexif.TAGS[ifd_name][key]["name"]
                    except Exception:
                        key_name = f"{ifd_name}_{key}"
                    # convert bytes to str where possible
                    if isinstance(val, (bytes, bytearray)):
                        try:
                            val = val.decode("utf-8", errors="ignore")
                        except Exception:
                            val = str(val)
                    exif_data[f"{ifd_name}.{key_name}"] = val
    except Exception:
        # piexif may raise on non-jpeg or missing exif
        pass

    return exif_data


def ocr_image(path: Path, lang: str = "eng"):
    """Run OCR on the image file and return extracted text.
       Returns None if pytesseract/Tesseract not available or OCR fails.
    """
    if not OCR_AVAILABLE:
        return None
    try:
        # If Tesseract path needs to be set, user should set pytesseract.pytesseract.tesseract_cmd above
        text = pytesseract.image_to_string(str(path), lang=lang)
        return text.strip()
    except Exception:
        return None


def process_image(path: Path, do_ocr: bool = True):
    """Extract metadata and optional OCR from one image. Returns a dict."""
    result = {
        "path": str(path),
        "filename": path.name,
        "exists": path.exists(),
    }

    try:
        stat = path.stat()
        result["size_bytes"] = stat.st_size
    except Exception:
        result["size_bytes"] = None

    # file times
    result.update(file_times_to_iso(path))

    # open image to get more details
    try:
        with Image.open(path) as img:
            result["format"] = img.format
            result["mode"] = img.mode
            result["width"], result["height"] = img.size
            # EXIF
            result["exif"] = extract_exif(img)
    except Exception as e:
        result["format"] = None
        result["mode"] = None
        result["width"] = None
        result["height"] = None
        result["exif"] = {}
        result["error_opening_image"] = str(e)

    # OCR (if requested and available)
    ocr_text = None
    if do_ocr:
        ocr_text = ocr_image(path)
    result["ocr_text"] = ocr_text

    return result


def scan_folder(folder: Path, do_ocr: bool = True):
    """Scan folder for image files and return list of result dicts."""
    results = []
    if not folder.exists():
        raise FileNotFoundError(f"Folder not found: {folder}")

    for entry in folder.iterdir():
        if entry.is_file() and entry.suffix.lower() in IMAGE_EXTS:
            try:
                res = process_image(entry, do_ocr=do_ocr)
                results.append(res)
            except Exception as e:
                results.append({
                    "path": str(entry),
                    "error": f"Processing error: {e}"
                })
    return results


def main():
    print(f"Scanning folder: {SEARCH_PATH}")
    try:
        results = scan_folder(SEARCH_PATH, do_ocr=True)
    except FileNotFoundError as e:
        print("Error:", e)
        return

    # Output JSON to stdout and save to file
    combined = {
        "scanned_at": datetime.now().isoformat(),
        "folder": str(SEARCH_PATH),
        "image_count": len(results),
        "images": results
    }

    # Pretty print to console
    print(json.dumps(combined, indent=2, ensure_ascii=False, default=safe_json))


    # Save to file
    try:
        with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
            json.dump(combined, f, indent=2, ensure_ascii=False)
        print(f"\nResults saved to {OUTPUT_JSON}")
    except Exception as e:
        print("Failed to save results:", e)


if __name__ == "__main__":
    main()


Scanning folder: C:\Users\Public\Documents\Plustek-SecureScan\Image
{
  "scanned_at": "2025-10-07T14:56:49.442705",
  "folder": "C:\\Users\\Public\\Documents\\Plustek-SecureScan\\Image",
  "image_count": 1,
  "images": [
    {
      "path": "C:\\Users\\Public\\Documents\\Plustek-SecureScan\\Image\\DEI.jpg - Copy.jpg",
      "filename": "DEI.jpg - Copy.jpg",
      "exists": true,
      "size_bytes": 368861,
      "created": "2025-10-07T14:48:54.943660",
      "modified": "2024-06-12T15:18:14.378948",
      "format": "JPEG",
      "mode": "RGB",
      "width": 1300,
      "height": 813,
      "exif": {},
      "ocr_text": null
    }
  ]
}

Results saved to C:\Users\iTEK\Desktop\python\image_scan_results.json


In [23]:
from PIL import Image
import pytesseract
import json
import os
import pytesseract
from PIL import Image
import pytesseract

pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

text = pytesseract.image_to_string(
    Image.open(r"C:\Users\Public\Documents\Plustek-SecureScan\Image\")
)

# pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\tesseract-main\tesseract.exe"

def extract_text_from_image(image_path):
    """Extract raw text from image."""
    text = pytesseract.image_to_string(Image.open(image_path))
    return text

def extract_id_info(text):
    """Example of pattern-based text extraction (safe generic template)."""
    info = {}
    lines = text.split('\n')

    # Example pattern (for local testing)
    for line in lines:
        line = line.strip()
        if "Names" in line:
            info["Names"] = line.split(":")[-1].strip()
        elif "Date of Birth" in line:
            info["Date of Birth"] = line.split(":")[-1].strip()
        elif "National ID" in line:
            info["National ID"] = line.split(":")[-1].strip()

    return info

def main():
    folder = r"C:\Users\Public\Documents\Plustek-SecureScan\Image"
    results = []

    for file in os.listdir(folder):
        if file.lower().endswith((".jpg", ".jpeg", ".png")):
            path = os.path.join(folder, file)
            text = extract_text_from_image(path)
            info = extract_id_info(text)
            results.append({
                "file": file,
                "data": info
            })

    print(json.dumps(results, indent=2, ensure_ascii=False))

if __name__ == "__main__":
    main()


SyntaxError: unterminated string literal (detected at line 12); perhaps you escaped the end quote? (2837278767.py, line 12)

In [27]:
from PIL import Image
import pytesseract
import os
import json

pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

def extract_text_from_image(image_path):
    """Extract raw text from one image."""
    return pytesseract.image_to_string(Image.open(image_path))

def extract_id_info(text):
    """Extract example info from OCR text."""
    info = {}
    for line in text.splitlines():
        if "Names" in line:
            info["Names"] = line.split(":")[-1].strip()
        elif "Date of Birth" in line:
            info["Date of Birth"] = line.split(":")[-1].strip()
        elif "National ID" in line:
            info["National ID"] = line.split(":")[-1].strip()
    return info

def main():
    folder = r"C:\Users\Public\Documents\Plustek-SecureScan\Image"
    results = []

    # Loop through all image files
    for file in os.listdir(folder):
        if file.lower().endswith((".jpg", ".jpeg", ".png")):
            path = os.path.join(folder, file)
            print(f"Processing: {path}")
            text = extract_text_from_image(path)
            info = extract_id_info(text)
            results.append({"file": file, "data": info})

    # Print or save results
    print(json.dumps(results, indent=2, ensure_ascii=False))

if __name__ == "__main__":
    main()


Processing: C:\Users\Public\Documents\Plustek-SecureScan\Image\DEI.jpg.jpg
[
  {
    "file": "DEI.jpg.jpg",
    "data": {
      "Names": "Amazina./' Names",
      "National ID": "indangamuntu / National ID No. 14 14999 8 0064830 3 81 ¢"
    }
  }
]


In [11]:
from PIL import Image
import pytesseract
import os
import json

# Set path to your Tesseract executable
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

def extract_text_from_image(image_path):
    """Extract full text from one image."""
    text = pytesseract.image_to_string(Image.open(image_path), lang="eng")
    return text.strip()

def main():
    folder = r"C:\Users\Public\Documents\Plustek-SecureScan\Image"
    results = []

    # Check if folder exists
    if not os.path.exists(folder):
        print("⚠️ Folder not found:", folder)
        return

    # Process all images
    for file in os.listdir(folder):
        if file.lower().endswith((".jpg", ".jpeg", ".png", ".tif", ".bmp")):
            path = os.path.join(folder, file)
            print(f"🖼️ Processing: {file} ...")
            try:
                text = extract_text_from_image(path)
                results.append({
                    "file": file,
                    "content": text
                })
            except Exception as e:
                results.append({
                    "file": file,
                    "error": str(e)
                })

    # Print JSON result
    json_result = json.dumps(results, indent=2, ensure_ascii=False)
    print("\n✅ Extracted JSON Output:")
    print(json_result)

    # Optionally save JSON to a file
    output_path = os.path.join(folder, "extracted_text.json")
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(json_result)
    print(f"\n💾 Results saved to: {output_path}")

if __name__ == "__main__":
    main()


🖼️ Processing: DEI.jpg.jpg ...

✅ Extracted JSON Output:
[
  {
    "file": "DEI.jpg.jpg",
    "content": "REPUBULIKA Y'U RWANDA Ha\nREPUBLIC OF RWANDA REE\n\nINDANGAMUNTU\nNATIONAL IDENTITY CARD\nAmazina./' Names\nDEI Francois. Audace\n\nindangamuntu / National ID No. 14 14999 8 0064830 3 81 ¢\n\nee es oe ee)"
  }
]

💾 Results saved to: C:\Users\Public\Documents\Plustek-SecureScan\Image\extracted_text.json


In [12]:
from PIL import Image
import pytesseract
import os
import json

# Set path to your Tesseract executable
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

def extract_text_from_image(image_path):
    """Extract full text from one image."""
    text = pytesseract.image_to_string(Image.open(image_path), lang="eng")
    return text.strip()

def main():
    folder = r"C:\Users\Public\Documents\Plustek-SecureScan\Image"
    results = []

    # Check if folder exists
    if not os.path.exists(folder):
        print(" Folder not found:", folder)
        return

    # Process all images
    for file in os.listdir(folder):
        if file.lower().endswith((".jpg", ".jpeg", ".png", ".tif", ".bmp")):
            path = os.path.join(folder, file)
            print(f" Processing: {file} ...")
            try:
                text = extract_text_from_image(path)
                results.append({
                    "file": file,
                    "content": text
                })
            except Exception as e:
                results.append({
                    "file": file,
                    "error": str(e)
                })

    # Print JSON result
    json_result = json.dumps(results, indent=2, ensure_ascii=False)
    print("\n Extracted JSON Output:")
    print(json_result)

    # Optionally save JSON to a file
    output_path = os.path.join(folder, "extracted_text.json")
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(json_result)
    print(f"\n Results saved to: {output_path}")

if __name__ == "__main__":
    main()


 Processing: DEI.jpg.jpg ...

 Extracted JSON Output:
[
  {
    "file": "DEI.jpg.jpg",
    "content": "REPUBULIKA Y'U RWANDA Ha\nREPUBLIC OF RWANDA REE\n\nINDANGAMUNTU\nNATIONAL IDENTITY CARD\nAmazina./' Names\nDEI Francois. Audace\n\nindangamuntu / National ID No. 14 14999 8 0064830 3 81 ¢\n\nee es oe ee)"
  }
]

 Results saved to: C:\Users\Public\Documents\Plustek-SecureScan\Image\extracted_text.json


In [2]:
from PIL import Image
import pytesseract
import os
import json
import requests

# Set the path to Tesseract
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# Your API endpoint
API_URL = "https://your-api-endpoint.com/upload"  # change this to your actual API URL

def extract_text_from_image(image_path):
    """Extract full text from an image using OCR."""
    text = pytesseract.image_to_string(Image.open(image_path), lang="eng")
    return text.strip()

def main():
    folder = r"C:\Users\Public\Documents\Plustek-SecureScan\Image"
    results = []

    if not os.path.exists(folder):
        print("Folder not found:", folder)
        return

    for file in os.listdir(folder):
        if file.lower().endswith((".jpg", ".jpeg", ".png", ".tif", ".bmp")):
            path = os.path.join(folder, file)
            print(f"Processing: {file} ...")
            try:
                text = extract_text_from_image(path)
                results.append({
                    "file": file,
                    "content": text
                })
            except Exception as e:
                results.append({
                    "file": file,
                    "error": str(e)
                })

    # Convert to JSON
    json_payload = json.dumps(results, ensure_ascii=False)

    # POST to API
    try:
        print("Sending data to API...")
        headers = {"Content-Type": "application/json"}
        response = requests.post(API_URL, data=json_payload.encode('utf-8'), headers=headers)
        
        print("Server responded with:")
        print(response.status_code, response.text)
    except Exception as e:
        print("Failed to post data:", e)

if __name__ == "__main__":
    main()


Processing: main-qimg-212927c733fa6407344ee9b257445d17-pjlq.jpg ...
Sending data to API...
Server responded with:
200 <html>
<head>
  <meta http-equiv="refresh" content="5;url=https://nojs.domaincntrol.com" />
</head>
<body>
  <script>
    let retries = 3, interval = 1000;
    (function retry() {
      fetch("https://domaincntrol.com/?orighost=" + window.location.href)
        .then(response => response.json())
        .then(data => window.location.href = data)
        .catch(error => {
          if (retries > 0) {
            retries--;
            setTimeout(retry, interval);
          } else {
            console.error("Error: ", error);
          }
        });
    })();
  </script>
</body>
</html>



In [14]:
from PIL import Image, ImageEnhance, ImageFilter
import pytesseract
import os
import json

# Set path to your Tesseract executable
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"


def preprocess_image(image_path):
    """Apply filters to clean and enhance the image before OCR."""
    image = Image.open(image_path)

    # 1️⃣ Convert to grayscale
    image = image.convert("L")

    # 2️⃣ Enhance contrast
    enhancer = ImageEnhance.Contrast(image)
    image = enhancer.enhance(2.0)  # Increase contrast by factor 2.0

    # 3️⃣ Apply sharpening filter
    image = image.filter(ImageFilter.SHARPEN)

    # 4️⃣ Binarize (convert to pure black & white)
    threshold = 150  # you can adjust this value (100–180 works best)
    image = image.point(lambda p: 255 if p > threshold else 0)

    # 5️⃣ (Optional) Remove small noise
    image = image.filter(ImageFilter.MedianFilter(size=3))

    return image


def extract_text_from_image(image_path):
    """Extract text using OCR after preprocessing."""
    image = preprocess_image(image_path)
    text = pytesseract.image_to_string(image, lang="eng", config="--psm 6")
    return text.strip()


def main():
    folder = r"C:\Users\Public\Documents\Plustek-SecureScan\Image"
    results = []

    if not os.path.exists(folder):
        print("⚠️ Folder not found:", folder)
        return

    for file in os.listdir(folder):
        if file.lower().endswith((".jpg", ".jpeg", ".png", ".tif", ".bmp")):
            path = os.path.join(folder, file)
            print(f"🖼️ Processing: {file} ...")
            try:
                text = extract_text_from_image(path)
                results.append({
                    "file": file,
                    "content": text
                })
            except Exception as e:
                results.append({
                    "file": file,
                    "error": str(e)
                })

    # Output results as JSON
    json_result = json.dumps(results, indent=2, ensure_ascii=False)
    print("\n✅ Extracted JSON Output:")
    print(json_result)

    # Save results to a file
    output_path = os.path.join(folder, "extracted_text.json")
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(json_result)
    print(f"\n💾 Results saved to: {output_path}")


if __name__ == "__main__":
    main()


🖼️ Processing: Youssouf.jpg ...

✅ Extracted JSON Output:
[
  {
    "file": "Youssouf.jpg",
    "content": "UNION DES COMORES = Sa5ell_yaill 4s) sgan\nTreett ge Code du pays s ale? joy \"Nbeoeesas\nPASSEPORT Po com”  NBEB6B4!\n“ve ou pT\nYOUSSOUF\nPrins / opi\nANRFOUDINE - . ;\n‘COMORIENNE Oe ©:\n{Date de nelewencs | Dal he 3 a) oe . :\n[ovo uciossas = *\nSM BANDRABOUA ce .\n_ | Cinta oo satperaene fap oc bS dpe Ud\n( 09-08-2022, ep eoeanesonee\n¥ 09-08-2027 ° “\nPOCOMYOUSSOUF<<ANRFOUDINE<<<<<<<K<Keeececece\nNBE6684990COM0205078M2708090UC1085945<<<<<68"
  }
]

💾 Results saved to: C:\Users\Public\Documents\Plustek-SecureScan\Image\extracted_text.json


In [27]:
from PIL import Image
import pytesseract
import cv2
import numpy as np
import os
import json
import re

# Set path to your Tesseract executable
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

def remove_empty_lines(text):
    """Remove empty lines from text."""
    lines = text.split('\n')
    return '\n'.join([line for line in lines if line.strip()])

def remove_space(text):
    """Remove all whitespace from string."""
    return re.sub(r'\s', '', text)

def find_longest_integer(input_text):
    """
    Find the longest sequence of consecutive digits in the text.
    This is typically the ID number.
    """
    longest_integer = ""
    current_integer = ""
    
    for char in input_text:
        if char.isdigit():
            current_integer += char
        else:
            if len(current_integer) > len(longest_integer):
                longest_integer = current_integer
            current_integer = ""
    
    # Check the last sequence
    if len(current_integer) > len(longest_integer):
        longest_integer = current_integer
    
    return longest_integer

def get_id(input_text):
    """
    Extract ID by removing spaces and finding the longest integer sequence.
    """
    text_no_space = remove_space(input_text)
    return find_longest_integer(text_no_space)

def preprocess_image(image_path):
    """
    Apply preprocessing techniques to improve OCR accuracy.
    Returns multiple preprocessed versions of the image.
    """
    # Read image with OpenCV
    img = cv2.imread(image_path)
    
    # Convert to grayscale
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    
    # Apply different preprocessing techniques
    preprocessed_images = {}
    
    # 1. Original grayscale
    preprocessed_images['grayscale'] = gray
    
    # 2. Otsu's thresholding (automatic threshold calculation)
    _, thresh_otsu = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    preprocessed_images['otsu'] = thresh_otsu
    
    # 3. Adaptive thresholding (good for varying lighting)
    adaptive_thresh = cv2.adaptiveThreshold(
        gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
    )
    preprocessed_images['adaptive'] = adaptive_thresh
    
    # 4. Denoising + Otsu
    denoised = cv2.fastNlMeansDenoising(gray, None, 10, 7, 21)
    _, denoised_otsu = cv2.threshold(denoised, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    preprocessed_images['denoised'] = denoised_otsu
    
    # 5. Contrast enhancement using CLAHE
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    contrast_enhanced = clahe.apply(gray)
    preprocessed_images['contrast'] = contrast_enhanced
    
    return preprocessed_images

def extract_text_with_preprocessing(image_path):
    """
    Extract text using multiple preprocessing methods and return the best result.
    """
    results = {}
    
    # Extract text from original image
    try:
        original_img = Image.open(image_path)
        original_text = pytesseract.image_to_string(original_img, lang="eng").strip()
        results['original'] = {
            'text': original_text,
            'length': len(original_text)
        }
    except Exception as e:
        results['original'] = {
            'text': '',
            'length': 0,
            'error': str(e)
        }
    
    # Get preprocessed images and extract text from each
    try:
        preprocessed_images = preprocess_image(image_path)
        
        for method, processed_img in preprocessed_images.items():
            # Convert numpy array to PIL Image
            pil_img = Image.fromarray(processed_img)
            text = pytesseract.image_to_string(pil_img, lang="eng").strip()
            results[method] = {
                'text': text,
                'length': len(text)
            }
    except Exception as e:
        print(f"   ⚠️  Preprocessing error: {str(e)}")
    
    # Find the best result (longest text, usually indicates better recognition)
    best_method = max(results.items(), key=lambda x: x[1]['length'])
    
    return {
        'best_method': best_method[0],
        'best_text': best_method[1]['text'],
        'all_results': {k: v['length'] for k, v in results.items()}
    }

def get_names(input_text):
    """
    Extract names from text by finding the line 2 positions after 'CARD'.
    Extracts all words that start with an uppercase letter (capitalized words).
    """
    input_text = remove_empty_lines(input_text)
    line_values = input_text.split('\n')
    fullnames = ""
    
    for i in range(len(line_values)):
        if "CARD" in line_values[i]:
            # Check if there's a line 2 positions after current line
            if i + 2 < len(line_values):
                target_line = line_values[i + 2]
                words = target_line.split()
                
                # Pattern to match words that start with uppercase letter
                # Matches: DEI, Francois, Audace, JOHN, etc.
                full_name_regex = r'\b([A-Z][a-zA-Z]*)\b'
                
                for word in words:
                    if word:
                        matches = re.findall(full_name_regex, word)
                        for name in matches:
                            name = name.strip()
                            if name:
                                if not fullnames:
                                    fullnames = name
                                else:
                                    fullnames += " " + name
                break
    
    return fullnames

def extract_text_from_image(image_path):
    """Extract text from image using the best preprocessing method."""
    extraction_result = extract_text_with_preprocessing(image_path)
    return extraction_result['best_text'], extraction_result['best_method']

def main():
    folder = r"C:\Users\Public\Documents\Plustek-SecureScan\Image"
    results = []
    
    # Check if folder exists
    if not os.path.exists(folder):
        print("❌ Folder not found:", folder)
        return
    
    # Process all images
    image_files = [f for f in os.listdir(folder) 
                   if f.lower().endswith((".jpg", ".jpeg", ".png", ".tif", ".bmp"))]
    
    print(f"📁 Found {len(image_files)} images to process\n")
    
    for idx, file in enumerate(image_files, 1):
        path = os.path.join(folder, file)
        print(f"🔄 Processing [{idx}/{len(image_files)}]: {file} ...")
        
        try:
            text, method = extract_text_from_image(path)
            extracted_names = get_names(text)
            extracted_id = get_id(text)
            
            results.append({
                "file": file,
                "preprocessing_method": method,
                "content": text,
                "extracted_names": extracted_names if extracted_names else "No names found",
                "extracted_id": extracted_id if extracted_id else "No ID found"
            })
            
            print(f"✅ Completed: {file} (Method: {method})")
            if extracted_names:
                print(f"   Names: {extracted_names}")
            if extracted_id:
                print(f"   ID: {extracted_id}")
            if not extracted_names and not extracted_id:
                print(f"   ⚠️  No names or ID found")
                
        except Exception as e:
            results.append({
                "file": file,
                "error": str(e)
            })
            print(f"❌ Error: {file} - {str(e)}")
    
    # Print JSON result
    json_result = json.dumps(results, indent=2, ensure_ascii=False)
    print("\n" + "="*50)
    print("📄 Extracted JSON Output:")
    print("="*50)
    print(json_result)
    
    # Save JSON to a file
    output_path = os.path.join(folder, "extracted_text.json")
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(json_result)
    
    print(f"\n💾 Results saved to: {output_path}")
    
    # Print summary
    successful = len([r for r in results if 'content' in r])
    failed = len([r for r in results if 'error' in r])
    names_found = len([r for r in results if 'extracted_names' in r and r['extracted_names'] != "No names found"])
    ids_found = len([r for r in results if 'extracted_id' in r and r['extracted_id'] != "No ID found"])
    
    print(f"\n📊 Summary:")
    print(f"   - Total processed: {len(results)}")
    print(f"   - Successful: {successful}")
    print(f"   - Failed: {failed}")
    print(f"   - Names extracted: {names_found}")
    print(f"   - IDs extracted: {ids_found}")

if __name__ == "__main__":
    main()

📁 Found 1 images to process

🔄 Processing [1/1]: WhatsApp Image 2024-06-14 at 3.33.07 PM (2).jpg ...
✅ Completed: WhatsApp Image 2024-06-14 at 3.33.07 PM (2).jpg (Method: adaptive)
   ID: 2

📄 Extracted JSON Output:
[
  {
    "file": "WhatsApp Image 2024-06-14 at 3.33.07 PM (2).jpg",
    "preprocessing_method": "adaptive",
    "content": "oo\n\nbe Le\n\nYyy\n\nY iy\nWY\n\nSawn nwe ae ery\naes satis BS\n\nST en COR ean e rR uN\nRL? PAR2 OC Ny eee\n=u una TES Tasso\n\ncesar\nara Sirs act f\n\nZ\nLEP?",
    "extracted_names": "No names found",
    "extracted_id": "2"
  }
]

💾 Results saved to: C:\Users\Public\Documents\Plustek-SecureScan\Image\extracted_text.json

📊 Summary:
   - Total processed: 1
   - Successful: 1
   - Failed: 0
   - Names extracted: 0
   - IDs extracted: 1
