# Ekstraksi dan Pengolahan Data Kepemilikan Tanah

## Proker 1: Strukturisasi Data dan Arsip Digital Dokumen Pemilikan Tanah

### 1. Menyiapkan coding environment, termasuk menginstall dan/atau memperbarui library

In [None]:
# Menginstall library-library
%pip install pandas
%pip install pytesseract
%pip install pdf2image
%pip install pillow
%pip install numpy
%pip install matplotlib
%pip install tensorflow
%pip install opencv-python


# Mengimport library-library untuk digunakan
import cv2
import numpy as np
import os
import pandas as pd
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
from matplotlib import pyplot as plt
import re

In [2]:
# Loading the preprocessed template CSV file
template = pd.read_csv('DATA C-DESA NOBOREJO preprocessed.csv')

##### Konversi PDF ke PNG 

In [3]:
# Initialize a new empty dataframe with the same columns as the template
df = pd.DataFrame(columns=template.columns)
df.head()

# Make sure the PDF file is in the same directory as this script
output_folder = 'Berkas Files/images of berkas 500-600/Preprocessed halaman 1 stuff'
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# Save CSV to the output directory
output_dir = 'Berkas Files/images of berkas 500-600/Preprocessed halaman 1 stuff'
output_file = 'output.csv'
df.to_csv(os.path.join(output_dir, output_file), index=False)

In [None]:
# Create subdirectory to hold images inside the Berkas Files subdirectory
output_folder = 'Berkas Files/images of berkas 500-600'
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# Convert PDF to images
pdf_path = 'Berkas Files/berkas 500-600.pdf'
images = convert_from_path(pdf_path)

# Save each image to the output folder
for i, image in enumerate(images):
    image_path = os.path.join(output_folder, f'halaman_{i+1}.png')
    image.save(image_path, 'PNG')

# Display the paths of the saved images
image_paths = [os.path.join(output_folder, f'halaman_{i+1}.png') for i in range(len(images))]
print("Images saved at:")
for path in image_paths:
    print(path)

In [5]:
# Load the image
image_path = 'Berkas Files/images of berkas 500-600/halaman_1.png'
image = cv2.imread(image_path, cv2.IMREAD_COLOR)

if image is None:
    raise ValueError(f"Image not loaded. Check if the path is correct: {image_path}")

### 2. Trial and Error halaman 1 dari file 'berkas 500-600.pdf'

##### Image preprocessing attempt 1, **failed**

Tried using: 

- **Tesseract OCR**: Used for extracting text from images.
- **Image Preprocessing**:
  - **Thresholding**: Applied adaptive thresholding to convert the image to a binary format.
  - **Noise Removal**: Used morphological operations to remove noise from the image.
  - **Contour Detection**: Detected contours to identify table structures in the image.
- **Table Line Detection**: Highlighted table lines to improve OCR accuracy.

Despite these efforts, we encountered challenges due to the poor quality of the scanned images and the handwritten text, which resulted in limited meaningful progress.

In [None]:
# Step 1: Apply refined adaptive thresholding with thicker lines
'''
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)  # Convert to grayscale (just in case)
adaptive_binary = cv2.adaptiveThreshold(
    gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 1001, 20  # Increased block size and constant
)

# Save the output for this step
step_1_path = os.path.join(output_dir, 'Step 1 - Thickened Adaptive Thresholded Image.png')
cv2.imwrite(step_1_path, adaptive_binary)

print(f"Step 1 completed. Thickened adaptive thresholded image saved at: {step_1_path}")
'''

In [None]:
# Step 2: Remove unwanted hand region
'''
# Adjust the bounding box to cover the hand
hand_region = (0, adaptive_binary.shape[0] - 800, 600, 1000)  # x, y, width, height
x, y, w, h = hand_region

# Mask the hand by painting the region white
hand_removed = adaptive_binary.copy()
cv2.rectangle(hand_removed, (x, y), (x + w, y + h), (255, 255, 255), -1)

# Save the cleaned image
step_2_path = os.path.join(output_dir, 'Step 2 - Hand Removed.png')
cv2.imwrite(step_2_path, hand_removed)

print(f"Step 2 completed. Hand removed image saved at: {step_2_path}")
'''

In [None]:
# Step 3: Highlight table lines in green (retain original background)
'''
try:
    # Detect edges using Canny
    edges = cv2.Canny(hand_removed, 500, 1500)  # Adjust thresholds as needed (low_threshold, high_threshold)

    # Dilate the edges to strengthen lines
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))  # Adjust kernel size for line thickness
    dilated_edges = cv2.dilate(edges, kernel, iterations=1)

    # Filter detected edges based on geometry (long horizontal/vertical lines)
    contours, _ = cv2.findContours(dilated_edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    mask = np.zeros_like(edges)  # Create a blank mask
    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)
        aspect_ratio = max(w, h) / min(w, h) if min(w, h) > 0 else 0
        # Keep only long horizontal or vertical lines
        if (w > 50 and h < 10) or (h > 50 and w < 10):  # Adjust thresholds as needed
            cv2.drawContours(mask, [contour], -1, 255, thickness=cv2.FILLED)

    # Combine mask with dilated edges
    filtered_lines = cv2.bitwise_and(dilated_edges, mask)

    # Convert grayscale image to BGR for color overlay
    table_lines_colored = cv2.merge([hand_removed, hand_removed, hand_removed])  # Grayscale to BGR

    # Overlay green color on detected table lines
    table_lines_colored[filtered_lines > 0] = [0, 255, 0]  # Green (0, 255, 0)

    # Save the resulting image
    step_3_path = os.path.join(output_dir, 'Step 3 - Filtered Green Table Lines.png')
    cv2.imwrite(step_3_path, table_lines_colored)

    print(f"Step 3 completed. Filtered green table lines image saved at: {step_3_path}")

except Exception as e:
    print(f"An error occurred in Step 3: {e}")
'''

##### Image preprocessing attempt 2

In [9]:
# Define input path and output folder
image_path = 'Berkas Files/images of berkas 500-600/halaman_1.png'
output_folder = 'Berkas Files/images of berkas 500-600/Preprocessed halaman 1 stuff'
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

Step 1: Image Preprocessing

In [None]:
# Step 1.1. Convert to grayscale
gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

# Save the grayscale image
output_path = os.path.join(output_folder, 'halaman_1_gray.png')
cv2.imwrite(output_path, gray_image)

print(f"Grayscale image saved at: {output_path}")

In [None]:
# Step 1.2. Enhance contrast using CLAHE
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
enhanced_image = clahe.apply(gray_image)

# Save the enhanced contrast image
output_path = os.path.join(output_folder, 'halaman_1_contrast.png')
cv2.imwrite(output_path, enhanced_image)

print(f"Enhanced contrast image saved at: {output_path}")

In [None]:
# Step 1.3. Remove noise using median blurring
denoised_image = cv2.medianBlur(enhanced_image, 5)

# Save the denoised image
output_path = os.path.join(output_folder, 'halaman_1_denoised.png')
cv2.imwrite(output_path, denoised_image)

print(f"Denoised image saved at: {output_path}")

In [None]:
# Step 1.4. Normalize dimensions
# Define the target dimensions
target_width, target_height = 1024, 1024

# Resize the image while preserving the aspect ratio
h, w = denoised_image.shape
scaling_factor = min(target_width / w, target_height / h)
new_width = int(w * scaling_factor)
new_height = int(h * scaling_factor)

# Resize the image
resized_image = cv2.resize(denoised_image, (new_width, new_height), interpolation=cv2.INTER_AREA)

# Pad the image to fit the target dimensions
delta_w = target_width - new_width
delta_h = target_height - new_height
top, bottom = delta_h // 2, delta_h - (delta_h // 2)
left, right = delta_w // 2, delta_w - (delta_w // 2)
normalized_image = cv2.copyMakeBorder(resized_image, top, bottom, left, right, cv2.BORDER_CONSTANT, value=255)

# Save the normalized image
output_path = os.path.join(output_folder, 'halaman_1_normalized.png')
cv2.imwrite(output_path, normalized_image)

print(f"Normalized image saved at: {output_path}")

Step 2: Removing the table

In [None]:
# Step 2.1: Detect Table Structures

# Threshold the image to binary
_, binary = cv2.threshold(normalized_image, 200, 255, cv2.THRESH_BINARY_INV)

# Detect horizontal lines
horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (50, 1))
horizontal_lines = cv2.morphologyEx(binary, cv2.MORPH_OPEN, horizontal_kernel)

# Detect vertical lines
vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 50))
vertical_lines = cv2.morphologyEx(binary, cv2.MORPH_OPEN, vertical_kernel)

# Combine horizontal and vertical lines
table_structure = cv2.add(horizontal_lines, vertical_lines)

# Optional: Dilate to merge gaps in table lines
table_structure = cv2.dilate(table_structure, np.ones((3, 3), np.uint8))

# Save the resulting table structure image
output_path = 'Berkas Files/images of berkas 500-600/Preprocessed halaman 1 stuff/halaman_1_table_structure.png'
cv2.imwrite(output_path, table_structure)

# Print confirmation and show the saved output
print(f"Table structure image saved at: {output_path}")


In [None]:
# Step 2.2: Remove Table Structure to Extract Text

# Normalize the table structure to binary (0 and 255)
_, table_structure_binary = cv2.threshold(table_structure, 127, 255, cv2.THRESH_BINARY)

# Create a mask where table lines are present
table_lines_mask = table_structure_binary

# Replace table lines in the normalized image with white
text_only_image = cv2.add(normalized_image, table_lines_mask)

# Save the resulting text-only image
output_path = 'Berkas Files/images of berkas 500-600/Preprocessed halaman 1 stuff/halaman_1_text_only.png'
cv2.imwrite(output_path, text_only_image)

# Print confirmation and save output
print(f"Text-only image saved at: {output_path}")

Step 3: Performing OCR

In [None]:
# Step 3.1: Apply OCR to Extract Text
# Specify OCR configurations
custom_config = r'--psm 13 -c tessedit_char_whitelist="0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.,() "'

# Apply OCR on the text-only image
ocr_result = pytesseract.image_to_string(text_only_image, lang='eng', config='custom_config')

# Save OCR output to a text file
output_path = 'Berkas Files/images of berkas 500-600/Preprocessed halaman 1 stuff/halaman_1_ocr_output_sparse.txt'
with open(output_path, 'w', encoding='utf-8') as f:
    f.write(ocr_result)

# Print confirmation and OCR result
print(f"OCR result (sparse) saved at: {output_path}")
print("OCR output (sparse):")
print(ocr_result)

In [None]:
# Step 3.2: Clean up the OCR result
cleaned_ocr = re.sub(r'[^a-zA-Z0-9\s.,]', '', ocr_result)  # Keep only alphanumeric and basic punctuation
cleaned_ocr = re.sub(r'\s+', ' ', cleaned_ocr).strip()  # Remove extra whitespace and trim

# Save cleaned OCR output to a text file
output_path_cleaned = 'Berkas Files/images of berkas 500-600/Preprocessed halaman 1 stuff/halaman_1_ocr_cleaned.txt'
with open(output_path_cleaned, 'w', encoding='utf-8') as f:
    f.write(cleaned_ocr)

# Print confirmation and cleaned output
print(f"Cleaned OCR result saved at: {output_path_cleaned}")
print("Cleaned OCR output:")
print(cleaned_ocr)

In [None]:
# Step 3.3: Save OCR Output with Proper Header Placement

# Define page identifier and header
header = ["OCR Output"]
page_id = "page_500"

# Split cleaned OCR into individual words
words = [page_id] + cleaned_ocr.split()

# Create a DataFrame with words as a single row
df = pd.DataFrame([words])

# Update the header manually
df.columns = header + ["" for _ in range(len(words) - 1)]

# Save the DataFrame to a CSV
csv_output_path = 'Berkas Files/images of berkas 500-600/Preprocessed halaman 1 stuff/halaman_1_ocr_paginated_final_header.csv'
df.to_csv(csv_output_path, index=False, header=False)

# Print confirmation and preview
print(f"Final paginated OCR result saved to CSV at: {csv_output_path}")
print("CSV preview:")
print(df.head())

## Proker 2: Alat Pencarian Dokumen Kepemilikan Tanah

### 3. Memproses Semua Halaman dengan Pipeline OCR

In [None]:
# Langkah 1: Mengonversi PDF ke Gambar dan Mengatur Folder Penyimpanan

# Daftar file PDF dan folder keluaran yang sesuai
pdf_files = [
    ('Berkas Files/berkas 0-100.pdf', 'file png berkas 0-100'),
    ('Berkas Files/berkas 100-200.pdf', 'file png berkas 100-200'),
    ('Berkas Files/berkas 200-300.pdf', 'file png berkas 200-300'),
    ('Berkas Files/berkas 300-400.pdf', 'file png berkas 300-400'),
    ('Berkas Files/berkas 400-500.pdf', 'file png berkas 400-500'),
    ('Berkas Files/berkas 500-600.pdf', 'file png berkas 500-600'),
    ('Berkas Files/berkas 600-700.pdf', 'file png berkas 600-700'),
    ('Berkas Files/berkas 700-800.pdf', 'file png berkas 700-800'),
    ('Berkas Files/berkas 800-900.pdf', 'file png berkas 800-900'),
    ('Berkas Files/berkas 900-968.pdf', 'file png berkas 900-968'),
    ('Berkas Files/berkas sisa sobekan 400-500.pdf', 'file png berkas sisa sobekan 400-500')
]


# Proses setiap file PDF
for pdf_path, output_folder in pdf_files:
    # Pastikan folder output ada
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    
    print(f"Mengonversi {pdf_path} ke gambar...")
    
    # Konversi PDF ke gambar
    images = convert_from_path(pdf_path)
    for i, image in enumerate(images):
        image_path = os.path.join(output_folder, f'halaman_{i+1}.png')  # Tentukan nama file
        image.save(image_path, 'PNG')  # Simpan gambar dalam format PNG
        print(f"Gambar halaman {i+1} dari {pdf_path} disimpan di: {image_path}")  # Informasi gambar yang disimpan
    
    print(f"Konversi {pdf_path} selesai!")

print("Semua file PDF selesai diproses!")

In [31]:
# Langkah 2: Membuat pipeline untuk langkah-langkah pra-pemrosesan

# Daftar langkah
steps = [
    "langkah 1.1 konversi ke grayscale",
    "langkah 1.2 meningkatkan kontras menggunakan CLAHE",
    "langkah 1.3 menghapus noise menggunakan median blurring",
    "langkah 1.4 normalisasi dimensi",
    "langkah 2.1 mendeteksi struktur tabel",
    "langkah 2.2 menghapus struktur tabel untuk mengekstraksi teks",
    "langkah 3.1 menerapkan OCR untuk mengekstrak teks",
    "langkah 3.2 membersihkan hasil OCR",
    "langkah 3.3 menyimpan hasil OCR dengan header yang benar",
]

# Daftar semua folder file PNG
base_folders = [
    "file png berkas 0-100",
    "file png berkas 100-200",
    "file png berkas 200-300",
    "file png berkas 300-400",
    "file png berkas 400-500",
    "file png berkas 500-600",
    "file png berkas 600-700",
    "file png berkas 700-800",
    "file png berkas 800-900",
    "file png berkas 900-968",
    "file png berkas sisa sobekan 400-500",
]

# Fungsi untuk memproses semua folder
def process_all_folders(base_folders, steps):
    for base_folder in base_folders:
        print(f"Memproses folder: {base_folder}")

        # Buat subfolder untuk setiap langkah di dalam folder dasar
        output_folders = {step: os.path.join(base_folder, step) for step in steps}
        for folder in output_folders.values():
            os.makedirs(folder, exist_ok=True)

        # Jalankan proses untuk setiap gambar di folder
        process_images(base_folder, output_folders)

# Fungsi untuk memproses gambar di satu folder
def process_images(base_folder, output_folders):
    # Ambil daftar file gambar dari folder
    image_files = [f for f in os.listdir(base_folder) if f.endswith(".png")]
    
    for image_file in image_files:
        print(f"Memproses {image_file}...")
        image_path = os.path.join(base_folder, image_file)
        image = cv2.imread(image_path)

        # Langkah 1.1: Konversi ke grayscale
        gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        output_path = os.path.join(output_folders["langkah 1.1 konversi ke grayscale"], image_file)
        cv2.imwrite(output_path, gray_image)

        # Langkah 1.2: Tingkatkan kontras menggunakan CLAHE
        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
        contrast_image = clahe.apply(gray_image)
        output_path = os.path.join(output_folders["langkah 1.2 meningkatkan kontras menggunakan CLAHE"], image_file)
        cv2.imwrite(output_path, contrast_image)

        # Langkah 1.3: Hapus noise menggunakan median blurring
        denoised_image = cv2.medianBlur(contrast_image, 5)
        output_path = os.path.join(output_folders["langkah 1.3 menghapus noise menggunakan median blurring"], image_file)
        cv2.imwrite(output_path, denoised_image)

        # Langkah 1.4: Normalisasi dimensi
        target_width, target_height = 1024, 1024
        h, w = denoised_image.shape
        scaling_factor = min(target_width / w, target_height / h)
        new_width = int(w * scaling_factor)
        new_height = int(h * scaling_factor)
        resized_image = cv2.resize(denoised_image, (new_width, new_height), interpolation=cv2.INTER_AREA)
        delta_w = target_width - new_width
        delta_h = target_height - new_height
        top, bottom = delta_h // 2, delta_h - (delta_h // 2)
        left, right = delta_w // 2, delta_w - (delta_w // 2)
        normalized_image = cv2.copyMakeBorder(resized_image, top, bottom, left, right, cv2.BORDER_CONSTANT, value=255)
        output_path = os.path.join(output_folders["langkah 1.4 normalisasi dimensi"], image_file)
        cv2.imwrite(output_path, normalized_image)

        # Langkah 2.1: Deteksi struktur tabel
        _, binary = cv2.threshold(normalized_image, 200, 255, cv2.THRESH_BINARY_INV)
        horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (50, 1))
        horizontal_lines = cv2.morphologyEx(binary, cv2.MORPH_OPEN, horizontal_kernel)
        vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 50))
        vertical_lines = cv2.morphologyEx(binary, cv2.MORPH_OPEN, vertical_kernel)
        table_structure = cv2.add(horizontal_lines, vertical_lines)
        output_path = os.path.join(output_folders["langkah 2.1 mendeteksi struktur tabel"], image_file)
        cv2.imwrite(output_path, table_structure)

        # Langkah 2.2: Hapus struktur tabel untuk mengekstraksi teks
        _, table_structure_binary = cv2.threshold(table_structure, 127, 255, cv2.THRESH_BINARY)
        table_lines_mask = table_structure_binary
        text_only_image = cv2.add(normalized_image, table_lines_mask)
        output_path = os.path.join(output_folders["langkah 2.2 menghapus struktur tabel untuk mengekstraksi teks"], image_file)
        cv2.imwrite(output_path, text_only_image)

        # Langkah 3.1: Terapkan OCR
        custom_config = r'--psm 13 -c tessedit_char_whitelist="0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.,()"'
        ocr_result = pytesseract.image_to_string(text_only_image, lang='eng', config=custom_config)
        output_path = os.path.join(output_folders["langkah 3.1 menerapkan OCR untuk mengekstrak teks"], image_file.replace(".png", ".txt"))
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(ocr_result)

        # Langkah 3.2: Membersihkan hasil OCR
        cleaned_ocr = re.sub(r'[^a-zA-Z0-9\s.,]', '', ocr_result)
        cleaned_ocr = re.sub(r'\s+', ' ', cleaned_ocr).strip()
        output_path = os.path.join(output_folders["langkah 3.2 membersihkan hasil OCR"], image_file.replace(".png", ".txt"))
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(cleaned_ocr)

    print("Semua langkah selesai untuk folder:", base_folder)

In [32]:
# Langkah 3: Jalankan pipeline untuk semua folder
# Jalankan pipeline untuk semua folder
process_all_folders(base_folders, steps)

Memproses folder: file png berkas 0-100
Memproses halaman_1.png...
Memproses halaman_10.png...
Memproses halaman_11.png...
Memproses halaman_12.png...
Memproses halaman_13.png...
Memproses halaman_14.png...
Memproses halaman_15.png...
Memproses halaman_16.png...
Memproses halaman_17.png...
Memproses halaman_18.png...
Memproses halaman_19.png...
Memproses halaman_2.png...
Memproses halaman_20.png...
Memproses halaman_21.png...
Memproses halaman_22.png...
Memproses halaman_23.png...
Memproses halaman_24.png...
Memproses halaman_25.png...
Memproses halaman_26.png...
Memproses halaman_27.png...
Memproses halaman_28.png...
Memproses halaman_29.png...
Memproses halaman_3.png...
Memproses halaman_30.png...
Memproses halaman_31.png...
Memproses halaman_32.png...
Memproses halaman_33.png...
Memproses halaman_34.png...
Memproses halaman_35.png...
Memproses halaman_36.png...
Memproses halaman_37.png...
Memproses halaman_38.png...
Memproses halaman_39.png...
Memproses halaman_4.png...
Memproses ha

### 4. Membersihkan dan Menggabungkan Hasil OCR