In [None]:
import cv2
import easyocr
import matplotlib.pyplot as plt
import re
import numpy as np
import glob
import pandas as pd
import os


In [4]:
def extract_field(results):
    name = None
    id_number = None
    dob = None

    for i, (bbox, text, conf) in enumerate(results):
        clean = text.lower().strip()

        if any(keyword in clean for keyword in ["name", "ten","vaten","full"]):
            # Try to get the next entry
            if i + 1 < len(results):
                next_text = results[i + 1][1].strip()
                if next_text[:1].isupper():
                    name = next_text
                elif i + 2 < len(results):  # Fallback to i+2 if not uppercase
                    fallback_text = results[i + 2][1].strip()
                    if fallback_text[:1].isupper():
                        name = fallback_text
            

        id_match = re.search(r'\b(\d{12})\b', text)
        if id_match:
            id_number = id_match.group(1)
    
        if any(keyword in clean for keyword in ["sinh", "birth"]):
            # Try next entry (i+1)
            if i + 1 < len(results):
                next_text = results[i + 1][1].strip()
                if len(re.findall(r"\d", next_text)) >= 6:
                    dob = next_text
                    # Fallback to i+2
                elif i + 2 < len(results):
                    fallback_text = results[i + 2][1].strip()
                    if len(re.findall(r"\d", fallback_text)) >= 6:
                        dob = fallback_text
            
    return {"ID": id_number, "Name": name, "DOB": dob}


In [6]:
def preprocess_image(image_path):
    image = cv2.imread(image_path)
    # Convert to grayscale
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Apply CLAHE for local contrast enhancement
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    contrast = clahe.apply(gray)

    # Sharpen using a kernel
    sharpening_kernel = np.array([
        [1,  4,   6,  4, 1],
        [4, 16,  24, 16, 4],
        [6, 24, -476, 24, 6],
        [4, 16,  24, 16, 4],
        [1,  4,   6,  4, 1]
    ]) / -256.0
    sharpened = cv2.filter2D(contrast, -1, sharpening_kernel)

    # Normalize intensity values to full range
    norm_img = np.zeros_like(sharpened)
    normalized = cv2.normalize(sharpened, norm_img, 0, 255, cv2.NORM_MINMAX)


    return normalized


In [87]:
image_paths = glob.glob('testset/*.jpg')
reader = easyocr.Reader(['en'])
all_results = []
for img_path in image_paths[:200]:
    img = preprocess_image(img_path) 
    result = reader.readtext(img)
    # Display the image with detected text
    for t in result:
        #print(t)
        bbox, text, score = t
        cv2.rectangle(img, (int(bbox[0][0]), int(bbox[0][1])), (int(bbox[2][0]), int(bbox[2][1])), (0, 255, 0), 2)
    #plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    #plt.axis('off')
    #plt.show()
    extracted= extract_field(result)
    extracted["img_path"]= os.path.basename(img_path)
    all_results.append(extracted)
    #print(extracted)
    
#print(all_results)



In [88]:
import re
from datetime import datetime

def clean_dob(text):
    if not isinstance(text, str):
        return ""

    # Step 1: Replace common OCR mistakes
    text = text.upper()
    text = (
        text.replace('O', '0')
            .replace('I', '1')
            .replace('L', '1')
            .replace('S', '5')
            .replace('Z', '2')
    )

    # Step 2: Remove non-digit characters
    digits = ''.join(re.findall(r'\d', text))

    # Step 3: Remove extra '1's if longer than 8 digits (OCR misread slashes)
    while len(digits) > 8 :
        if '1' in digits:
            # Remove the first occurrence of '1'
            digits = digits.replace('1', '', 1)

    # Step 4: Parse if we have at least 8 digits
    if len(digits) >= 8:
        dd = digits[:2]
        mm = digits[2:4]
        yyyy = digits[4:8]

        try:
            dob = datetime.strptime(f"{dd}/{mm}/{yyyy}", "%d/%m/%Y")
            if 1900 <= dob.year <= 2025:
                return dob.strftime("%d/%m/%Y")
        except ValueError:
            pass

    return digits


In [89]:
def clean_name(text):
    if not isinstance(text, str):
        return ""

    # Step 1: Remove extra whitespace
    text = text.strip()
    text = re.sub(r'\s+', ' ', text)

    # Step 2: Fix common OCR mistakes
    text = text.replace('0', 'O')  # zero → O
    text = text.replace('1', 'I')  # one → I
    text = text.replace('5', 'S')  # five → S (optional)
    text = text.replace('2', 'Z')  # two → Z (optional)
    text = text.replace('6','O')  # remove non-alphabetic characters

    text = re.sub(r'[^a-zA-Z\s]', '', text)  # keep only letters and spaces

    # Step 3: Convert to UPPERCASE
    return text.upper()


In [92]:
df = pd.DataFrame(all_results)
df.to_csv("ocr_results.csv")
print("Saved results to ocr_results.csv")

Saved results to ocr_results.csv


In [95]:
# Load data as strings 
ocr_df = pd.read_csv("ocr_results.csv", dtype=str)
gt_df = pd.read_csv("ground_truth.csv", dtype=str)

# Fields to compare
fields = ['Name', 'DOB', 'ID']

# Fill NaN with empty strings
ocr_df.fillna('', inplace=True)
gt_df.fillna('', inplace=True)
ocr_df['DOB'] = ocr_df['DOB'].apply(clean_dob)
ocr_df['Name'] = ocr_df['Name'].apply(clean_name)

# Merge on image path
merged = pd.merge(ocr_df, gt_df, on='img_path', suffixes=('_ocr', '_gt'))

# Compute exact match per field
for field in fields:
    merged[f'{field}_exact_match'] = merged[f'{field}_ocr'] == merged[f'{field}_gt']

# Print accuracy summary
for field in fields:
    match_rate = merged[f'{field}_exact_match'].mean() * 100
    print(f"{field} exact match accuracy: {match_rate:.2f}%")

# Optional: save per-row results
result_cols = ['img_path']
for field in fields:
    result_cols += [f'{field}_gt', f'{field}_ocr', f'{field}_exact_match']
result= merged[result_cols]
print(result.head())
merged[result_cols].to_csv("ocr_exact_match_results.csv", index=False)


Name exact match accuracy: 84.18%
DOB exact match accuracy: 82.65%
ID exact match accuracy: 96.43%
      img_path               Name_gt              Name_ocr  Name_exact_match  \
0  img1507.jpg        PHAM GIA KHIEM        PHAM GIA KHIEM              True   
1  img1513.jpg       PHAM DINH QUYEN       PHAM DINH QUYEN              True   
2  img1275.jpg  NGUYEN QUACH TRI TAI  NGUYEN QUACH TRI TAI              True   
3  img1117.jpg   TONG PHUOC ANH SIEU   TONG PHUOC ANH SIEU              True   
4  img1329.jpg       NGUYEN THANH AN       NGUYEN THANH AN              True   

       DOB_gt     DOB_ocr  DOB_exact_match         ID_gt        ID_ocr  \
0  31/07/2003  31/07/2003             True  091203002648  091203002648   
1  24/08/1979  24/08/1979             True  001079052151  001079052151   
2  17/11/2002  17/11/2002             True  095202002441  095202002441   
3  13/04/2003  13/04/2003             True  048203006839  048203006839   
4  15/11/2003  15/11/2003             True  064203