<a href="https://colab.research.google.com/github/DenisKai7/id_scanner/blob/model-train/id_scanner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# Mount Google Drive so files under /content/drive become readable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

# Install the Tesseract OCR engine, its Indonesian ("ind") language data, and the
# Python packages this notebook imports below (pytesseract, OpenCV, matplotlib).
!sudo apt update
!sudo apt install tesseract-ocr
!sudo apt install tesseract-ocr-ind
!pip install pytesseract
!pip install opencv-python
!pip install matplotlib

import cv2
import pytesseract
from PIL import Image
import os
import re
import numpy as np
import matplotlib.pyplot as plt

# Directory where Tesseract looks up its language data files.
# NOTE(review): the "4.00" path segment depends on the installed Tesseract package — confirm on the target runtime.
os.environ['TESSDATA_PREFIX'] = '/usr/share/tesseract-ocr/4.00/tessdata'
# Explicit path to the tesseract executable used by pytesseract.
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'


# Root of the image dataset on the mounted Drive; the batch loop further down
# joins it with one sub-folder per rotation angle ('0', '90', '180', '270').
base_drive_path = '/content/drive/MyDrive/ocr_ktp/train'

def correct_rotation(img_array):
    """
    Rotate an already-loaded image upright using Tesseract's orientation detection.

    Args:
        img_array: Image as a numpy array (BGR colour or single-channel), or None.

    Returns:
        The upright image. The input array itself is returned when no rotation
        is needed or when orientation detection fails; None is returned when
        the input is None.

    Quarter-turn corrections swap the width and height of the result, so a
    sideways card is never cropped to the original canvas.
    """
    if img_array is None:
        print("Error: Input gambar untuk koreksi rotasi adalah None.")
        return None

    # OSD only needs luminance; an image that is already single-channel would
    # make cvtColor(BGR2GRAY) fail, so use it as-is.
    if img_array.ndim == 2:
        gray = img_array
    else:
        gray = cv2.cvtColor(img_array, cv2.COLOR_BGR2GRAY)

    try:
        osd = pytesseract.image_to_osd(gray)
    except pytesseract.TesseractError as e:
        print(f"Peringatan: Gagal mendeteksi orientasi dengan Tesseract OSD: {e}. Menggunakan gambar asli.")
        return img_array

    # Parse "Rotate: <degrees>" with a regex so an unexpected OSD layout means
    # "no rotation" instead of an uncaught IndexError/ValueError.
    angle_match = re.search(r'Rotate:\s*(\d+)', osd)
    rotation_angle = int(angle_match.group(1)) % 360 if angle_match else 0
    if rotation_angle == 0:
        return img_array

    # Quarter turns are done losslessly. The reported angle is applied
    # clockwise, matching the sign convention of the affine fallback below.
    quarter_turns = {
        90: cv2.ROTATE_90_CLOCKWISE,
        180: cv2.ROTATE_180,
        270: cv2.ROTATE_90_COUNTERCLOCKWISE,
    }
    rotate_code = quarter_turns.get(rotation_angle)
    if rotate_code is not None:
        return cv2.rotate(img_array, rotate_code)

    # Any other angle: affine rotation onto a canvas enlarged to the rotated
    # bounding box, with the content re-centred so nothing is cut off.
    (h, w) = img_array.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, -rotation_angle, 1.0)
    cos_a = abs(M[0, 0])
    sin_a = abs(M[0, 1])
    new_w = int(h * sin_a + w * cos_a)
    new_h = int(h * cos_a + w * sin_a)
    M[0, 2] += new_w / 2 - center[0]
    M[1, 2] += new_h / 2 - center[1]
    return cv2.warpAffine(img_array, M, (new_w, new_h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)

def preprocess_image(img):
    """
    Prepare a BGR image for OCR.

    The image is converted to grayscale, smoothed with a 3x3 median filter,
    and binarised with Otsu's automatically chosen threshold.

    Returns:
        The thresholded single-channel image.
    """
    grayscale = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    smoothed = cv2.medianBlur(grayscale, 3)

    # With THRESH_OTSU the threshold argument (0) is ignored and computed from the image.
    otsu_mode = cv2.THRESH_BINARY + cv2.THRESH_OTSU
    binarised = cv2.threshold(smoothed, 0, 255, otsu_mode)[1]
    return binarised

def _search_field(pattern, text, flags=re.IGNORECASE):
    """Return the stripped first capture group of *pattern* in *text*, or "" when absent."""
    match = re.search(pattern, text, flags)
    return match.group(1).strip() if match else ""


def _parse_ktp_text(text):
    """Parse raw OCR text of a KTP card into the nested field dictionary."""
    data_ktp = {
        "NIK": "", "Nama": "", "Tempat/Tgl Lahir": "", "Jenis Kelamin": "", "Gol. Darah": "",
        "Alamat": {"Jalan": "", "RT/RW": "", "Kel/Desa": "", "Kecamatan": ""},
        "Agama": "", "Status Perkawinan": "", "Pekerjaan": "", "Kewarganegaraan": "", "Berlaku Hingga": ""
    }

    # NIK: any standalone run of exactly 16 digits.
    nik_match = re.search(r'\b\d{16}\b', text)
    if nik_match:
        data_ktp["NIK"] = nik_match.group(0)

    # Name: prefer the labelled "Nama : ..." form. The fallback (colon lost by
    # OCR) is limited to a single line of upper-case name characters so it
    # cannot run on into the next field's label.
    data_ktp["Nama"] = (
        _search_field(r'(?:Nama|Nama Lengkap)\s*:\s*(.+)', text)
        or _search_field(r"Nama\s*([A-Z][A-Z .,'\-]*)", text, flags=0)
    )

    dob_match = re.search(
        r'(?:Tempat/Tgl Lahir|Tempat, Tgl Lahir)\s*:\s*([A-Za-z\s]+),\s*(\d{2}-\d{2}-\d{4})',
        text, re.IGNORECASE)
    if dob_match:
        data_ktp["Tempat/Tgl Lahir"] = f"{dob_match.group(1).strip()}, {dob_match.group(2).strip()}"

    # Blood group: a real alternation (AB before A/B) instead of a character
    # class. The lookahead rejects a letter that merely starts the next word,
    # e.g. the "A" of "Alamat" when the blood-group value is missing.
    blood_group = r'(AB|A|B|O|-)(?![A-Za-z])'

    gender_match = re.search(
        r'Jenis Kelamin\s*:\s*(LAKI-LAKI|PEREMPUAN)\s*(?:Gol\. Darah\s*:\s*' + blood_group + r')?',
        text, re.IGNORECASE)
    if gender_match:
        data_ktp["Jenis Kelamin"] = gender_match.group(1).strip()
        if gender_match.group(2):
            data_ktp["Gol. Darah"] = gender_match.group(2).strip()

    if not data_ktp["Gol. Darah"]:
        data_ktp["Gol. Darah"] = _search_field(r'Gol\. Darah\s*:\s*' + blood_group, text)

    # Address block: everything from the first "Alamat" label onwards. The
    # offset comes from a search on the original text, so it is always valid
    # for slicing that same text.
    address_label = re.search(r'alamat', text, re.IGNORECASE)
    if address_label:
        address_text = text[address_label.start():]
        alamat = data_ktp["Alamat"]

        # The street normally sits on the "Alamat" line itself; an explicit
        # "Jalan :" label is still honoured first when present.
        alamat["Jalan"] = (
            _search_field(r'Jalan\s*:\s*(.+)', address_text)
            or _search_field(r'Alamat[ \t]*:?[ \t]*(\S.*)', address_text)
        )
        alamat["RT/RW"] = _search_field(r'RT/RW\s*:\s*(\d{3}/\d{3})', address_text)
        alamat["Kel/Desa"] = _search_field(r'(?:Kel/Desa|Kelurahan/Desa)\s*:\s*(.+)', address_text)
        alamat["Kecamatan"] = _search_field(r'Kecamatan\s*:\s*(.+)', address_text)

    # Religion: accepts the spelling variants KATHOLIK, BUDDHA and KHONGHUCU as well.
    data_ktp["Agama"] = _search_field(
        r'Agama\s*:\s*(ISLAM|KRISTEN|KATH?OLIK|HINDU|BUDD?HA|KH?ONGHUCU)', text)
    data_ktp["Status Perkawinan"] = _search_field(
        r'Status Perkawinan\s*:\s*(BELUM KAWIN|KAWIN|CERAI HIDUP|CERAI MATI)', text)
    data_ktp["Pekerjaan"] = _search_field(r'Pekerjaan\s*:\s*(.+)', text)
    data_ktp["Kewarganegaraan"] = _search_field(r'Kewarganegaraan\s*:\s*(WNI|WNA)', text)
    data_ktp["Berlaku Hingga"] = _search_field(
        r'Berlaku Hingga\s*:\s*(\d{2}-\d{2}-\d{4}|SEUMUR HIDUP)', text)

    return data_ktp


def extract_ktp_data(image_path, apply_preprocessing=True):
    """
    Run OCR on one KTP image and parse the recognised text into fields.

    Args:
        image_path: Path of the image file to read.
        apply_preprocessing: When True the image is denoised and binarised via
            preprocess_image before OCR; otherwise it is only converted to
            grayscale.

    Returns:
        A tuple (data_ktp, processed_img): the dictionary of extracted fields
        (fields that were not found stay as empty strings) and the image that
        was handed to Tesseract. Returns (None, None) when the image cannot
        be read.
    """
    print(f"\nMemproses KTP: {image_path}")
    original_img = cv2.imread(image_path)

    if original_img is None:
        print(f"Error: Tidak dapat membaca gambar dari {image_path}. File mungkin tidak ada, rusak, atau format tidak didukung.")
        return None, None

    rotated_img = correct_rotation(original_img)

    if apply_preprocessing:
        processed_img = preprocess_image(rotated_img)
    else:
        processed_img = cv2.cvtColor(rotated_img, cv2.COLOR_BGR2GRAY)

    pil_img = Image.fromarray(processed_img)

    # Indonesian language data, page-segmentation mode 6, engine mode 3.
    text = pytesseract.image_to_string(pil_img, lang='ind', config='--psm 6 --oem 3')

    return _parse_ktp_text(text), processed_img

# Results of the batch run. all_ktp_data and image_paths_for_display get one
# entry per successfully processed image; the two image lists only hold the
# first few images (see max_images_kept_for_display).
all_ktp_data = []
processed_images_for_display = []
original_images_for_display = []
image_paths_for_display = []

angle_folders = ['0', '90', '180', '270']

# The visualisation further down shows at most 3 samples, so only that many
# image arrays are kept. Holding every original and processed image of a
# dataset with thousands of files would exhaust the runtime's memory.
max_images_kept_for_display = 3

for angle_folder in angle_folders:
    folder_path = os.path.join(base_drive_path, angle_folder)
    print(f"Mengakses folder: {folder_path}")

    if not os.path.exists(folder_path):
        print(f"Peringatan: Folder '{folder_path}' tidak ditemukan. Pastikan path sudah benar.")
        continue

    # os.listdir returns entries in arbitrary order; sort for reproducible runs.
    files_in_folder = sorted(os.listdir(folder_path))
    if not files_in_folder:
        print(f"Info: Folder '{folder_path}' kosong atau tidak berisi file.")
        continue

    print(f"Ditemukan {len(files_in_folder)} file di '{folder_path}'.")

    for filename in files_in_folder:
        if not filename.lower().endswith(('.png', '.jpg', '.jpeg')):
            continue

        image_path = os.path.join(folder_path, filename)

        # No separate existence check is needed: extract_ktp_data reports an
        # unreadable or missing file itself and returns (None, None).
        extracted_data, processed_img = extract_ktp_data(image_path, apply_preprocessing=True)
        if not extracted_data or processed_img is None:
            continue

        all_ktp_data.append(extracted_data)
        image_paths_for_display.append(filename)

        if len(processed_images_for_display) >= max_images_kept_for_display:
            continue

        # Re-read the untouched original purely for the side-by-side preview.
        temp_original_img = cv2.imread(image_path)
        if temp_original_img is not None:
            original_images_for_display.append(cv2.cvtColor(temp_original_img, cv2.COLOR_BGR2RGB))
        else:
            original_images_for_display.append(None)
        processed_images_for_display.append(processed_img)

# Dump every extracted record; nested dictionaries (the address) are printed
# one indentation level deeper than flat fields.
print("\n--- Data KTP yang Berhasil Diekstrak ---")
named_count = len(image_paths_for_display)
for i, data in enumerate(all_ktp_data):
    # Include the source filename in the heading whenever one was recorded.
    if i >= named_count:
        print(f"\nData KTP ke-{i+1}:")
    else:
        print(f"\nData KTP ke-{i+1} ({image_paths_for_display[i]}):")

    for key, value in data.items():
        if not isinstance(value, dict):
            print(f"  {key}: {value}")
            continue
        print(f"  {key}:")
        for sub_key, sub_value in value.items():
            print(f"    {sub_key}: {sub_value}")

# Side-by-side preview of up to three samples: original image, image fed to
# OCR, and a short summary of the extracted fields.
print("\n--- Visualisasi Hasil Preprocessing dan OCR ---")

# Bound by every list that is indexed below so a shorter list can never raise IndexError.
num_images_to_show = min(
    3,
    len(all_ktp_data),
    len(original_images_for_display),
    len(processed_images_for_display),
    len(image_paths_for_display),
)

if num_images_to_show == 0:
    # Nothing was extracted: skip plotting instead of building a zero-height figure.
    print("Tidak ada gambar untuk divisualisasikan.")
else:
    plt.figure(figsize=(15, 5 * num_images_to_show))

    for i in range(num_images_to_show):

        # Column 1: original image (already converted to RGB for matplotlib).
        plt.subplot(num_images_to_show, 3, i*3 + 1)
        if original_images_for_display[i] is not None:
            plt.imshow(original_images_for_display[i])
            plt.title(f"Asli ({image_paths_for_display[i]})")
        else:
            plt.text(0.5, 0.5, "Gambar Asli Tidak Tersedia", horizontalalignment='center', verticalalignment='center')
            plt.title(f"Asli ({image_paths_for_display[i]}) - Gagal Muat")
        plt.axis('off')

        # Column 2: image handed to Tesseract; 2-D arrays are shown as grayscale.
        plt.subplot(num_images_to_show, 3, i*3 + 2)
        if processed_images_for_display[i] is not None:
            if len(processed_images_for_display[i].shape) == 2:
                plt.imshow(processed_images_for_display[i], cmap='gray')
            else:
                plt.imshow(cv2.cvtColor(processed_images_for_display[i], cv2.COLOR_BGR2RGB))
            plt.title(f"Diproses ({image_paths_for_display[i]})")
        else:
            plt.text(0.5, 0.5, "Gambar Diproses Tidak Tersedia", horizontalalignment='center', verticalalignment='center')
            plt.title(f"Diproses ({image_paths_for_display[i]}) - Gagal Muat")
        plt.axis('off')

        # Column 3: a few of the extracted fields as text.
        plt.subplot(num_images_to_show, 3, i*3 + 3)
        plt.text(0.01, 0.99, f"NIK: {all_ktp_data[i]['NIK']}\n"
                             f"Nama: {all_ktp_data[i]['Nama']}\n"
                             f"Tgl Lahir: {all_ktp_data[i]['Tempat/Tgl Lahir']}\n"
                             f"Alamat: {all_ktp_data[i]['Alamat']['Jalan']}",
                 verticalalignment='top', wrap=True, fontsize=10)
        plt.title("Data Ekstrak (Sebagian)")
        plt.axis('off')

    plt.tight_layout()
    plt.show()

# Closing checklist for the reader: accuracy is judged by eye, followed by the
# usual causes of extraction errors.
evaluation_notes = (
    "\n--- Evaluasi Akurasi (Manual) ---",
    "Periksa output data ekstrak di atas. Seberapa akurat NIK, Nama, dan Alamat terekstrak?",
    "Jika ada kesalahan, ini bisa disebabkan oleh:",
    "1. Kualitas gambar KTP yang rendah (blur, gelap, pantulan).",
    "2. Kurangnya robustnya pola regex untuk menangani variasi format KTP.",
    "3. Keterbatasan Tesseract pada beberapa jenis font atau kondisi gambar.",
)
for note in evaluation_notes:
    print(note)


Streaming output truncated to the last 5000 lines.
Memproses KTP: /content/drive/MyDrive/ocr_ktp/train/0/324-ktp_jpg.rf.73e362647aff8ce3be57230605ae3b1e_ktp_0.jpg

Memproses KTP: /content/drive/MyDrive/ocr_ktp/train/0/322-ktp-0001_jpg.rf.f3b8889c2030e579c37d8c118070df05_ktp_0.jpg

Memproses KTP: /content/drive/MyDrive/ocr_ktp/train/0/317-ktp-niar-baru-001_jpg.rf.8f2f4aa9fb9a6b9a5689b8763ad6be14_ktp_0.jpg

Memproses KTP: /content/drive/MyDrive/ocr_ktp/train/0/307_jpg.rf.86ef8b129584ed5108d8b27189afa279_ktp_0.jpg

Memproses KTP: /content/drive/MyDrive/ocr_ktp/train/0/32-3_jpg.rf.4600848a99e15bf975fff87dfc3654c2_ktp_0.jpg

Memproses KTP: /content/drive/MyDrive/ocr_ktp/train/0/315-img-20220704-wa0018_jpg.rf.b27af3aa6745e1cc0ac15ed82deb452d_ktp_0.jpg

Memproses KTP: /content/drive/MyDrive/ocr_ktp/train/0/306-img-20181122-wa0061_jpg.rf.9ba9689bf2f5363aef597f9907e57c35_ktp_0.jpg

Memproses KTP: /content/drive/MyDrive/ocr_ktp/train/0/32_jpg.rf.c7300c888febb9b25b2d3bc6062e4d19_ktp

In [None]:
def simulate_form_fill(ktp_data):
    """
    Print a step-by-step simulation of filling a form from one KTP record.

    Args:
        ktp_data: Dictionary of extracted KTP fields, including the nested
            'Alamat' dictionary. A falsy value (None or an empty dict) only
            prints a notice that there is nothing to fill in.

    Returns:
        None. All output goes to stdout.
    """
    divider = "---------------------------------------"

    print("\n--- Simulasi Pengisian Formulir KTP ---")
    print(divider)

    if ktp_data:
        print(f"Mengisi NIK: {ktp_data['NIK']}")
        print(f"Mengisi Nama Lengkap: {ktp_data['Nama']}")
        print(f"Mengisi Tempat/Tanggal Lahir: {ktp_data['Tempat/Tgl Lahir']}")
        print(f"Memilih Jenis Kelamin: {ktp_data['Jenis Kelamin']}")
        print(f"Memilih Golongan Darah: {ktp_data['Gol. Darah']}")
        print(f"Mengisi Alamat Jalan: {ktp_data['Alamat']['Jalan']}")
        print(f"Mengisi RT/RW: {ktp_data['Alamat']['RT/RW']}")
        print(f"Mengisi Kelurahan/Desa: {ktp_data['Alamat']['Kel/Desa']}")
        print(f"Mengisi Kecamatan: {ktp_data['Alamat']['Kecamatan']}")
        print(f"Memilih Agama: {ktp_data['Agama']}")
        print(f"Memilih Status Perkawinan: {ktp_data['Status Perkawinan']}")
        print(f"Mengisi Pekerjaan: {ktp_data['Pekerjaan']}")
        print(f"Memilih Kewarganegaraan: {ktp_data['Kewarganegaraan']}")
        print(f"Mengisi Berlaku Hingga: {ktp_data['Berlaku Hingga']}")
        print(divider)
        print("Formulir berhasil diisi (simulasi)!")
    else:
        print("Tidak ada data KTP untuk diisi.")

# Demonstrate the form fill with the first extracted record, if any exists.
if not all_ktp_data:
    print("Tidak ada data KTP yang berhasil diekstrak untuk simulasi pengisian formulir.")
else:
    first_record = all_ktp_data[0]
    simulate_form_fill(first_record)