## **Imports**


In [94]:
from sklearn.neural_network import MLPClassifier  # MLP is an NN
from sklearn import svm
import numpy as np
import argparse
import cv2
import os
import re
import random
import pytesseract
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from skimage.util import random_noise
from PIL import Image, ImageDraw, ImageFont
from arabic_reshaper import reshape
from bidi.algorithm import get_display
import pandas as pd
from openpyxl.utils import get_column_letter
from commonfunctions import *
import numpy as np
import unittest

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

import cv2
import numpy as np
import matplotlib.pyplot as plt

# Location of the Tesseract binary on the Windows machine this notebook targets.
# In a raw string a single backslash is already literal; doubling it would put
# two backslashes between every path component.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'


# Side length pair for digit crops; seed shared by Python's and NumPy's RNGs so
# that runs are repeatable.
target_img_size = (32, 32)
random_seed = 42
random.seed(random_seed)
np.random.seed(random_seed)

## GPU setup (EasyOCR / Qwen / PaddleOCR)
Run the next cell to check if CUDA is available in this environment.

## **Main Functions Overview**

- **Image Alignment**
  - Detects SIFT keypoints and descriptors in the input and reference images.
  - Matches them using the ratio test.
  - Uses RANSAC to estimate a homography.
  - Applies the homography to warp the input image so it lines up with the reference.
  - Returns the aligned image (or the original if not enough matches are found).

- **Extract Details**
  - Uses (x, y, w, h) coordinates to crop the aligned card into:
    - The name region
    - The code (ID) region
  - Returns these sub-images for downstream OCR or digit processing.

- **Save Student Name**
  - Ensures the output folder exists.
  - Writes the cropped name image to disk with a filename that includes the student ID.
  - Creates a persistent record usable for manual review or OCR.

- **Split and Save Digits**
  - Converts the code region to grayscale and applies Otsu thresholding.
  - Finds contours and filters out small noise.
  - Selects the largest seven contours (by area) and sorts them left-to-right.
  - Saves each detected digit crop into a per-student folder as individual image files.

- **save_split_digits**
  - Takes a list of digit images for a student.
  - Ensures a folder exists for each student (named by their ID).
  - Saves each digit image as `digit_0.jpg`, `digit_1.jpg`, ..., `digit_6.jpg` inside the student’s folder.
  - Used for batch saving when all digit crops are already extracted.

## **Noise Detection and Treatment**
- **Impulsive Noise (Median Filter)**
- **Random Noise (Non-Local Means Denoising)**

In [95]:
import cv2
import numpy as np

def is_random_noise(img, threshold=0.1):
    """Detect high-variance ("random") noise and denoise the image if found.

    A 3-dimensional input is first converted from BGR to grayscale. The noise
    score is the standard deviation of the pixel values divided by 255.

    Args:
        img: Image array; three dimensions are treated as a BGR image.
        threshold: Score at or above which the image is denoised.

    Returns:
        ``(image, treated)``. When the score is below ``threshold`` the
        grayscale image is returned untouched with ``False``; otherwise the
        output of ``cv2.fastNlMeansDenoising`` is returned with ``True``.
    """
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) if img.ndim == 3 else img

    noise_score = np.std(gray) / 255.0
    if noise_score < threshold:
        return gray, False

    nlm_settings = dict(h=10, templateWindowSize=7, searchWindowSize=21)
    return cv2.fastNlMeansDenoising(gray, None, **nlm_settings), True

In [96]:
import cv2
import numpy as np

def is_impulsive_noise(img, threshold=0.1, black_range=(0, 9), white_range=(246, 255)):
    """Detect salt-and-pepper noise and repair the affected pixels.

    A 3-dimensional input is first converted from BGR to grayscale. Pixels
    whose value lies inside ``black_range`` or ``white_range`` (inclusive) are
    counted as suspected noise.

    Args:
        img: Image array; three dimensions are treated as a BGR image.
        threshold: Fraction of suspect pixels at or above which the image is
            filtered.
        black_range: ``(low, high)`` intensity range counted as pepper.
        white_range: ``(low, high)`` intensity range counted as salt.

    Returns:
        ``(image, treated)``. Below ``threshold`` the grayscale image is
        returned untouched with ``False``. Otherwise only the suspect pixels
        are replaced by their median-filtered value and ``True`` is returned.
    """
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) if img.ndim == 3 else img

    pepper = (gray >= black_range[0]) & (gray <= black_range[1])
    salt = (gray >= white_range[0]) & (gray <= white_range[1])
    suspect = pepper | salt

    noise_ratio = np.sum(suspect) / gray.size
    if noise_ratio < threshold:
        return gray, False

    # Kernel grows with the noise ratio; medianBlur needs an odd size, so the
    # lowest bit is forced on before clamping to the 3..9 range.
    kernel = int(3 + noise_ratio * 10) | 1
    kernel = min(max(kernel, 3), 9)

    blurred = cv2.medianBlur(gray, kernel)

    # Clean pixels keep their original value; only suspects take the median.
    return np.where(suspect, blurred, gray), True

Contrast enhancement

In [97]:
import cv2
import numpy as np

def enhance_contrast_clahe(img, clip_limit=2.0, tile_size=(8, 8)):
    """Boost local contrast with CLAHE.

    Args:
        img: Image array. A 3-dimensional array is treated as BGR and only its
            LAB lightness channel is equalised; anything else is equalised
            directly.
        clip_limit: Passed to ``cv2.createCLAHE`` as ``clipLimit``.
        tile_size: Passed to ``cv2.createCLAHE`` as ``tileGridSize``.

    Returns:
        The contrast-enhanced image (BGR for a 3-dimensional input).
    """
    equalizer = cv2.createCLAHE(clipLimit=clip_limit, tileGridSize=tile_size)

    if len(img.shape) != 3:
        return equalizer.apply(img)

    # Work in LAB so that only lightness is changed and colour is preserved.
    lightness, chan_a, chan_b = cv2.split(cv2.cvtColor(img, cv2.COLOR_BGR2LAB))
    boosted = cv2.merge((equalizer.apply(lightness), chan_a, chan_b))
    return cv2.cvtColor(boosted, cv2.COLOR_LAB2BGR)

In [98]:
def align_images_sift(img_to_align, reference_path, ratio=0.75, min_matches=10):
    """Warp ``img_to_align`` onto the reference image using SIFT and RANSAC.

    SIFT descriptors of both images are matched with a brute-force matcher,
    filtered with the ratio test, and used to estimate a homography that maps
    the input onto the reference frame.

    Args:
        img_to_align: Image array to be aligned (grayscale or BGR).
        reference_path: Path of the reference image on disk.
        ratio: Ratio-test factor; a match is kept when its distance is below
            ``ratio`` times the distance of the second-best match.
        min_matches: Alignment is attempted only when more than this many good
            matches survive. ``cv2.findHomography`` needs at least four point
            pairs, so values below 3 are not useful.

    Returns:
        The warped image at the reference image's size, or ``img_to_align``
        unchanged when there are too few matches or no homography is found.

    Raises:
        FileNotFoundError: If the reference image cannot be read.
    """
    reference = cv2.imread(reference_path)
    if reference is None:
        # cv2.imread signals failure by returning None rather than raising.
        raise FileNotFoundError(f"Reference image could not be read: {reference_path}")

    def _to_gray(image):
        """Return a single-channel view of ``image`` for feature detection."""
        if len(image.shape) == 3:
            return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        return image

    sift = cv2.SIFT_create()
    kp_src, des_src = sift.detectAndCompute(_to_gray(img_to_align), None)
    kp_ref, des_ref = sift.detectAndCompute(_to_gray(reference), None)

    # A featureless image yields no descriptors, and knnMatch cannot take None.
    if des_src is None or des_ref is None:
        print(f"Not enough matches found: 0/{min_matches}")
        return img_to_align

    matches = cv2.BFMatcher().knnMatch(des_src, des_ref, k=2)

    # knnMatch may return fewer than two neighbours for a descriptor; such
    # entries cannot be ratio-tested and are dropped.
    good_matches = [
        pair[0]
        for pair in matches
        if len(pair) == 2 and pair[0].distance < ratio * pair[1].distance
    ]

    if len(good_matches) <= min_matches:
        print(f"Not enough matches found: {len(good_matches)}/{min_matches}")
        return img_to_align

    src_pts = np.float32([kp_src[m.queryIdx].pt for m in good_matches]).reshape(-1, 1, 2)
    dst_pts = np.float32([kp_ref[m.trainIdx].pt for m in good_matches]).reshape(-1, 1, 2)

    homography, _inlier_mask = cv2.findHomography(src_pts, dst_pts, cv2.RANSAC, 5.0)
    if homography is None:
        # RANSAC can fail on degenerate point sets; fall back to the input.
        print("Homography estimation failed; returning the unaligned image.")
        return img_to_align

    ref_h, ref_w = reference.shape[:2]
    return cv2.warpPerspective(img_to_align, homography, (ref_w, ref_h))
    


def extract_details(aligned_image):
    """Crop the name and code regions out of an aligned card image.

    The regions are fixed ``(x, y, width, height)`` boxes in the aligned
    image's pixel coordinates.

    Args:
        aligned_image: Image array indexed as ``[row, column, ...]``.

    Returns:
        ``(name_region, code_region)`` as slices of ``aligned_image``.
    """
    def crop(box):
        left, top, width, height = box
        return aligned_image[top:top + height, left:left + width]

    name_box = (100, 205, 1200, 150)
    code_box = (640, 404, 335, 110)
    return crop(name_box), crop(code_box)


def save_student_name(student_id, name_img, output_folder="extracted_names"):
    """Write a cropped name image to ``<output_folder>/<student_id>_name.jpg``.

    The output folder is created first when it does not exist.

    Args:
        student_id: Identifier embedded in the file name.
        name_img: Image array handed to ``cv2.imwrite``.
        output_folder: Destination directory.
    """
    folder_missing = not os.path.exists(output_folder)
    if folder_missing:
        os.makedirs(output_folder)

    target = f"{output_folder}/{student_id}_name.jpg"
    cv2.imwrite(target, name_img)
    

def split_and_save_digits(student_id, code_roi, output_folder="extracted_digits", max_digits=7):
    """Segment the code region into digit crops and save them to disk.

    The region is binarised with inverted Otsu thresholding, external contours
    are found, and the ``max_digits`` largest bounding boxes are written
    left-to-right as ``digit_0.jpg``, ``digit_1.jpg``, ... inside
    ``<output_folder>/ID<student_id>``.

    Args:
        student_id: Identifier used in the per-student folder name.
        code_roi: Image of the code region, either BGR or already grayscale.
        output_folder: Root directory for the per-student folders.
        max_digits: Maximum number of digit boxes to keep.
    """
    save_path = f"{output_folder}/ID{student_id}"
    os.makedirs(save_path, exist_ok=True)

    # The ROI may already be single-channel (the noise filters in this file
    # return grayscale); converting such an image with BGR2GRAY would raise.
    if len(code_roi.shape) == 3:
        gray = cv2.cvtColor(code_roi, cv2.COLOR_BGR2GRAY)
    else:
        gray = code_roi
    _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # A. Collect all candidates, filtering out tiny noise
    candidates = []
    for cnt in contours:
        x, y, w, h = cv2.boundingRect(cnt)
        if h > 15 and w > 5:
            candidates.append((x, y, w, h, w * h))

    # B. Keep the largest boxes by bounding-box area
    largest = sorted(candidates, key=lambda c: c[4], reverse=True)[:max_digits]

    # C. Sort left to right
    final_digits = sorted(largest, key=lambda c: c[0])

    # D. Save
    for index, (x, y, w, h, _area) in enumerate(final_digits):
        cv2.imwrite(f"{save_path}/digit_{index}.jpg", code_roi[y:y + h, x:x + w])

      
import cv2
import numpy as np

def extract_name_and_digits(aligned_image):
    """Crop the name region and segment the digits of the two numeric fields.

    Three fixed ``(x, y, width, height)`` boxes are cut from the aligned card:
    the name, the code field and the "daf3" field. The two numeric regions
    are then split into individual digit crops.

    Args:
        aligned_image: Aligned card image, grayscale or BGR.

    Returns:
        ``(name_img, code_digits, daf3_digits)`` where the last two are lists
        of digit crops ordered left to right (at most 7 and 14 respectively).
    """
    name_box = (100, 205, 1200, 150)
    code_box = (640, 404, 335, 110)
    daf3_box = (350, 500, 620, 110)

    def crop(box):
        left, top, width, height = box
        return aligned_image[top:top + height, left:left + width]

    def process_roi_digits(roi_img, digit_limit):
        """Return up to ``digit_limit`` digit crops from ``roi_img``, left to right."""
        if len(roi_img.shape) == 3:
            gray = cv2.cvtColor(roi_img, cv2.COLOR_BGR2GRAY)
        else:
            gray = roi_img

        _level, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
        outlines, _hierarchy = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

        boxes = []
        for outline in outlines:
            bx, by, bw, bh = cv2.boundingRect(outline)
            if bh <= 15 or bw <= 5:
                # Too small to be a digit.
                continue
            if bw > 0.8 * bh:
                # A box this wide is treated as two touching digits and is
                # split down the middle.
                half = bw // 2
                boxes.append((bx, by, half, bh, half * bh))
                boxes.append((bx + half, by, half, bh, half * bh))
            else:
                boxes.append((bx, by, bw, bh, bw * bh))

        # Largest boxes first, keep the limit, then restore reading order.
        boxes.sort(key=lambda b: b[4], reverse=True)
        kept = sorted(boxes[:digit_limit], key=lambda b: b[0])

        return [roi_img[by:by + bh, bx:bx + bw] for bx, by, bw, bh, _area in kept]

    name_img = crop(name_box)
    code_roi = crop(code_box)
    daf3_img = crop(daf3_box)

    code_digits = process_roi_digits(code_roi, digit_limit=7)
    daf3_digits = process_roi_digits(daf3_img, digit_limit=14)

    return name_img, code_digits, daf3_digits

def save_split_digits(student_id, digit_imgs, output_folder="extracted_digits"):
    """Save already-extracted digit crops into a per-student folder.

    The folder ``<output_folder>/<student_id>`` is created when missing and
    each crop is written as ``digit_<index>.jpg`` in list order.

    Args:
        student_id: Name of the per-student folder.
        digit_imgs: Iterable of image arrays handed to ``cv2.imwrite``.
        output_folder: Root directory for the per-student folders.
    """
    student_dir = f"{output_folder}/{student_id}"
    already_there = os.path.exists(student_dir)
    if not already_there:
        os.makedirs(student_dir)

    for position, crop in enumerate(digit_imgs):
        cv2.imwrite(f"{student_dir}/digit_{position}.jpg", crop)



## **SVM English Number Classifier**

In [99]:
path_to_train_dataset = r"train_digits" 

def train_SVM_robust():
    """Train the digit classifier on the images in ``path_to_train_dataset``.

    Only ``.jpg``/``.png`` files are used. The first letter of each file name
    encodes the digit (``a`` = 0 ... ``j`` = 9); other files are ignored.
    Images that cannot be decoded are skipped with a message. HOG features
    feed a ``StandardScaler`` + ``LinearSVC`` pipeline; 20% of the samples are
    held out to report a validation accuracy.

    Returns:
        The fitted scikit-learn ``Pipeline``.

    Raises:
        ValueError: If no usable training image is found.
    """
    label_map = {
        'a': '0', 'b': '1', 'c': '2', 'd': '3', 'e': '4',
        'f': '5', 'g': '6', 'h': '7', 'i': '8', 'j': '9'
    }

    features = []
    labels = []

    img_filenames = os.listdir(path_to_train_dataset)
    print(f"Loading {len(img_filenames)} training images...")

    for fn in img_filenames:
        if not fn.lower().endswith(('.jpg', '.png')):
            continue

        label = label_map.get(fn[0].lower())
        if label is None:
            continue

        img = cv2.imread(os.path.join(path_to_train_dataset, fn))
        if img is None:
            # cv2.imread returns None for corrupt or unsupported files.
            print(f"Skipping unreadable training image: {fn}")
            continue

        # Feature and label are appended together so the two lists can never
        # get out of step.
        features.append(extract_hog_features(img))
        labels.append(label)

    if not features:
        raise ValueError(f"No usable training images found in '{path_to_train_dataset}'.")

    clf = Pipeline([
        ('scaler', StandardScaler()),
        ('svc', LinearSVC(random_state=random_seed, max_iter=5000, dual=False))
    ])

    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=random_seed
    )

    clf.fit(X_train, y_train)
    accuracy = clf.score(X_test, y_test)
    print(f"Training Complete. Validation Accuracy: {accuracy*100:.2f}%")

    return clf

def extract_hog_features(img):
    """Compute a flat HOG feature vector for one digit image.

    The image is converted to grayscale when it has three dimensions,
    binarised with inverted Otsu thresholding, resized to 32x32, and described
    with a HOG descriptor (8x8 cells, 16x16 blocks, stride 8, 9 bins).

    Args:
        img: Digit image, grayscale or BGR.

    Returns:
        One-dimensional array of HOG features.
    """
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) if len(img.shape) == 3 else img

    _level, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    patch = cv2.resize(binary, (32, 32))

    # Positional order expected by HOGDescriptor:
    # window, block, block stride, cell, number of bins.
    descriptor = cv2.HOGDescriptor((32, 32), (16, 16), (8, 8), (8, 8), 9)
    return descriptor.compute(patch).flatten()

# **Tesseract Arabic OCR**

## **Current Situation**

The project uses Tesseract OCR to extract Arabic names from scanned images. Initially, the extraction pipeline achieved only a **70% success rate**. This meant that about 30% of the images failed to yield any valid Arabic text, even though the images were visually clear and contained readable names.

## **Why Was the Success Rate Only 70%?**

- **Overprocessing:** The original code applied several preprocessing steps (scaling, thresholding, blurring, etc.) before running OCR. While these steps can help with noisy or low-contrast images, they often **destroy clean, high-contrast text**—especially for Arabic, where fine details matter.
- **Order of Operations:** The pipeline tried processed versions first, so if the original image was already optimal, it was never used for OCR.
- **PSM/OEM Settings:** The code tried a limited set of Tesseract Page Segmentation Modes (PSM) and OCR Engine Modes (OEM), which may not have been optimal for all images.
- **Text Cleaning:** The cleaning function was aggressive, but if Tesseract output was empty or too short, the result was discarded.

## **What Was Changed to Achieve 100% Success**

1. **Prioritize the Original Image:**  
   The new code always tries the original, unprocessed grayscale image first, with several PSM settings. This ensures that clean images are not degraded by unnecessary processing.

2. **Expanded Preprocessing (But Only If Needed):**  
   Only if the original image fails, the code tries padded and scaled versions, but never applies destructive thresholding or blurring unless absolutely necessary.

3. **Multiple PSM and OEM Combinations:**  
   For each image variant, the code tries several PSM (6, 7, 3, 13) and both OEM (3, 1) settings, maximizing the chance that Tesseract will interpret the layout correctly.

4. **Result Selection:**  
   All non-empty results are collected, and the **longest valid extraction** is chosen, which is usually the correct full name.

5. **Diagnostics:**  
   Additional debug and diagnostic code was used to confirm that the original image, with minimal processing, consistently yields the best results for this dataset.

# Reference

The old (70%) code is left in the notebook for comparison. The new approach, as described above, achieves **100% extraction success** on the current dataset by respecting the quality of the input images and leveraging Tesseract's flexibility.

In [None]:
# def extractname(img_path):
    
#     # --- HELPER: TEXT CLEANER ---
#     def clean_text(raw_text):
#         if not raw_text: return ""
#         # Keep Arabic letters (0621-064A) and spaces
#         cleaned = re.sub(r'[^\u0621-\u064A\s]', '', raw_text)
#         cleaned = cleaned.replace('\n', ' ')
#         cleaned = re.sub(r'\s+', ' ', cleaned).strip()
#         return cleaned

#     # --- LOAD IMAGE AS GRAYSCALE DIRECTLY ---
#     img_gray = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
#     if img_gray is None: 
#         return ""

#     # Try multiple approaches and collect all results
#     all_results = []
    
#     # Preprocessing variants
#     preprocessed_images = {
#         'original': img_gray,
#         'padded': cv2.copyMakeBorder(img_gray, 40, 40, 40, 40, cv2.BORDER_CONSTANT, value=255),
#     }
    
#     # Add scaled version
#     h, w = img_gray.shape
#     scaled = cv2.resize(img_gray, (w*2, h*2), interpolation=cv2.INTER_CUBIC)
#     preprocessed_images['scaled_padded'] = cv2.copyMakeBorder(scaled, 40, 40, 40, 40, cv2.BORDER_CONSTANT, value=255)
    
#     # PSM modes to try
#     psm_modes = [6, 7, 3, 13]  # 13 = raw line
    
#     for img_name, img in preprocessed_images.items():
#         for psm in psm_modes:
#             for oem in [3, 1]:  # Try both the default engine mode (3) and LSTM only (1)
#                 try:
#                     config = f"--oem {oem} --psm {psm}"
#                     text = pytesseract.image_to_string(img, lang='ara', config=config)
#                     cleaned = clean_text(text)
                    
#                     if len(cleaned) > 2:
#                         all_results.append((cleaned, len(cleaned), img_name, psm, oem))
#                 except:
#                     continue
    
#     # Return the longest valid result
#     if all_results:
#         all_results.sort(key=lambda x: x[1], reverse=True)
#         return all_results[0][0]
    
#     return ""








###############################    easyOCR       #####################################


import os
import csv
import numpy as np
import easyocr

# When True, extractname() passes its OCR output through
# _correct_with_lexicon_dl() to snap it to the closest known name.
USE_NAME_LEXICON_CORRECTION = True
# CSV read by _load_name_lexicon(); names are taken from its "transcription"
# column, falling back to "Name".
NAME_LEXICON_CSV = "name_labels.csv"

# Keyword arguments forwarded verbatim to reader.readtext() in extractname().
# extractname() unpacks each result as (bbox, text, conf), which relies on
# detail=1. NOTE(review): the thresholds look hand-tuned for this dataset —
# confirm against the EasyOCR documentation before changing them.
EASYOCR_READTEXT_KWARGS = dict(
    detail=1,
    paragraph=False,
    decoder="beamsearch",
    beamWidth=5,
    batch_size=1,
    text_threshold=0.55,
    low_text=0.30,
    link_threshold=0.35,
    contrast_ths=0.08,
    adjust_contrast=0.7,
    mag_ratio=2.0,
 )

# Module-level caches, filled on first use by _ensure_easyocr_reader() and
# _load_name_lexicon() respectively.
_EASYOCR_READER = None
_NAME_LEXICON = None


def _ensure_easyocr_reader():
    """Return the shared EasyOCR reader, creating it on the first call.

    The reader is built for Arabic and English with ``gpu=True`` and cached in
    the module-level ``_EASYOCR_READER``.
    """
    global _EASYOCR_READER
    if _EASYOCR_READER is not None:
        return _EASYOCR_READER
    _EASYOCR_READER = easyocr.Reader(["ar", "en"], gpu=True)
    return _EASYOCR_READER


def _load_name_lexicon():
    """Load and cache the list of known names from ``NAME_LEXICON_CSV``.

    Each row contributes its ``transcription`` value, or its ``Name`` value
    when ``transcription`` is missing or empty; blank entries are skipped.
    The result is cached in ``_NAME_LEXICON``, so the file is read at most
    once per session.

    Returns:
        List of names. Empty when the file does not exist or cannot be read;
        a read failure is reported on stdout.
    """
    global _NAME_LEXICON
    if _NAME_LEXICON is not None:
        return _NAME_LEXICON

    names = []
    if os.path.exists(NAME_LEXICON_CSV):
        try:
            # "utf-8-sig" drops a leading byte-order mark if present. With
            # plain "utf-8" the BOM stays glued to the first header, so a
            # leading "Name"/"transcription" column is never found.
            with open(NAME_LEXICON_CSV, "r", encoding="utf-8-sig", newline="") as f:
                for row in csv.DictReader(f):
                    value = (row.get("transcription") or row.get("Name") or "").strip()
                    if value:
                        names.append(value)
        except (OSError, UnicodeError, csv.Error) as exc:
            # Lexicon correction is optional: report the failure and continue
            # without it rather than hiding it.
            print(f"Could not load name lexicon '{NAME_LEXICON_CSV}': {exc}")
            names = []
    _NAME_LEXICON = names
    return _NAME_LEXICON


def _norm_dl(s: str) -> str:
    """Replace every د and ل in ``s`` with the placeholder ``"X"``.

    Used before computing edit distances so that the two letters count as the
    same character (presumably because the OCR confuses them — verify).
    A falsy input yields the empty string.
    """
    text = s if s else ""
    for letter in ("د", "ل"):
        text = text.replace(letter, "X")
    return text


def _levenshtein(a: str, b: str) -> int:
    """Return the Levenshtein edit distance between ``a`` and ``b``.

    Insertions, deletions and substitutions each cost 1. Falsy arguments are
    treated as empty strings. Only one row of the distance table is kept at a
    time.
    """
    source = a if a else ""
    target = b if b else ""
    if not source:
        return len(target)
    if not target:
        return len(source)

    row = list(range(len(target) + 1))
    for i, ch_source in enumerate(source, start=1):
        next_row = [i]
        for j, ch_target in enumerate(target, start=1):
            delete_cost = row[j] + 1
            insert_cost = next_row[j - 1] + 1
            replace_cost = row[j - 1] + (0 if ch_source == ch_target else 1)
            next_row.append(min(delete_cost, insert_cost, replace_cost))
        row = next_row
    return row[-1]


def _correct_with_lexicon_dl(pred: str) -> str:
    """Snap an OCR result to the closest lexicon name when it is near enough.

    Distances are Levenshtein distances computed after ``_norm_dl`` has been
    applied to both strings. The closest lexicon entry (the first one on
    ties) replaces ``pred`` only when its distance is within a tolerance of
    ``max(2, 18% of the entry's length)``.

    Args:
        pred: Raw OCR text.

    Returns:
        The matching lexicon entry, or ``pred`` unchanged when ``pred`` or the
        lexicon is empty or nothing is close enough.
    """
    lexicon = _load_name_lexicon()
    if not (pred and lexicon):
        return pred

    normalized_pred = _norm_dl(pred)
    scored = [(_levenshtein(normalized_pred, _norm_dl(name)), name) for name in lexicon]
    # min() returns the first entry with the smallest distance.
    closest_dist, closest = min(scored, key=lambda pair: pair[0])

    tolerance = max(2, int(0.18 * max(len(closest), 1)))
    if closest_dist <= tolerance:
        return closest
    return pred


def extractname(image_or_path, debug=False):
    """Read a name from an image with EasyOCR.

    Detected fragments shorter than two characters are dropped, the rest are
    ordered by the x-centre of their bounding box from right to left and
    joined with spaces. When ``USE_NAME_LEXICON_CORRECTION`` is set the result
    is passed through ``_correct_with_lexicon_dl``.

    Args:
        image_or_path: Whatever ``reader.readtext`` accepts as its image
            argument; the pipeline passes a file path.
        debug: When True, print the mean confidence and the raw and corrected
            text.

    Returns:
        The extracted name, or ``""`` if anything fails (the error is printed).
    """
    try:
        detections = _ensure_easyocr_reader().readtext(image_or_path, **EASYOCR_READTEXT_KWARGS)

        fragments = []
        for detection in detections:
            try:
                box, word, score = detection
            except Exception:
                continue
            if not isinstance(word, str):
                continue
            word = word.strip()
            if len(word) < 2:
                continue
            score = 0.0 if score is None else float(score)
            try:
                center_x = float(np.mean([corner[0] for corner in box]))
            except Exception:
                center_x = 0.0
            fragments.append((center_x, word, score))

        # Arabic reads right to left, so the largest x comes first.
        fragments.sort(key=lambda frag: frag[0], reverse=True)
        joined = " ".join(frag[1] for frag in fragments).strip()

        if USE_NAME_LEXICON_CORRECTION:
            out = _correct_with_lexicon_dl(joined)
        else:
            out = joined

        if debug:
            scores = [frag[2] for frag in fragments]
            avg_conf = float(np.mean(scores)) if scores else 0.0
            print(f"[EasyOCR] avg_conf={avg_conf:.3f} raw='{joined}' corrected='{out}'")
        return out
    except Exception as e:
        print(f"EasyOCR Error on {type(image_or_path).__name__}: {e}")
        return ""

#############################           PaddleOCR            ############################

# import os

# # Avoid slow "model hoster connectivity" checks on startup
# os.environ.setdefault("DISABLE_MODEL_SOURCE_CHECK", "True")

# from paddleocr import PaddleOCR

# # PaddleOCR v3.x: `use_gpu` and `use_angle_cls` are not valid args.
# # For orientation/angle handling, use `use_textline_orientation`.
# # Bump text-det resolution and relax thresholds so small words aren't missed.
# ocr = PaddleOCR(
#     lang='ar',
#     use_textline_orientation=True,
#     text_det_limit_side_len=1280,
#     text_det_limit_type='max',
#     text_det_thresh=0.2,
#     text_det_box_thresh=0.3,
#     text_det_unclip_ratio=1.8,
# )


# def _poly_center(poly):
#     # poly is expected to be (4,2) array-like
#     try:
#         xs = [float(p[0]) for p in poly]
#         ys = [float(p[1]) for p in poly]
#         return (sum(xs) / len(xs), sum(ys) / len(ys))
#     except Exception:
#         return (0.0, 0.0)


# def _group_into_lines(items, y_tol=18.0):
#     """Group (cx, cy, text) into lines by y coordinate."""
#     items = sorted(items, key=lambda t: t[1])
#     lines = []
#     current = []
#     current_y = None

#     for cx, cy, text in items:
#         if current_y is None:
#             current_y = cy
#             current = [(cx, cy, text)]
#             continue

#         if abs(cy - current_y) <= y_tol:
#             current.append((cx, cy, text))
#             current_y = (current_y * (len(current) - 1) + cy) / len(current)
#         else:
#             lines.append(current)
#             current = [(cx, cy, text)]
#             current_y = cy

#     if current:
#         lines.append(current)

#     return lines


# def extractname(image_or_path):
#     """Reads Arabic name text from a path or ndarray using PaddleOCR."""
#     try:
#         src = image_or_path
#         if isinstance(src, str):
#             img = cv2.imread(src)
#         else:
#             img = src
#         if img is None:
#             return ""

#         # Pad the crop a bit to avoid clipping the first/last word at the borders.
#         img = cv2.copyMakeBorder(img, 10, 10, 70, 70, cv2.BORDER_CONSTANT, value=(255, 255, 255))

#         result = ocr.ocr(img)

#         tokens = []

#         # PaddleOCR v3.x output shape: list[dict], with keys like `rec_texts`, `rec_polys`.
#         if isinstance(result, list) and result and isinstance(result[0], dict):
#             for page in result:
#                 rec_texts = page.get('rec_texts') or []
#                 rec_polys = page.get('rec_polys') or []

#                 if isinstance(rec_texts, list) and isinstance(rec_polys, list) and len(rec_texts) == len(rec_polys):
#                     items = []
#                     for t, poly in zip(rec_texts, rec_polys):
#                         if not isinstance(t, str):
#                             continue
#                         t = t.strip()
#                         if not t:
#                             continue
#                         cx, cy = _poly_center(poly)
#                         items.append((cx, cy, t))

#                     # Group into lines by Y, then sort each line RTL (X desc)
#                     for line in _group_into_lines(items):
#                         line_sorted = sorted(line, key=lambda t: t[0], reverse=True)
#                         tokens.extend([t for _, __, t in line_sorted])
#                 else:
#                     # Fallback: just take the text list order as-is
#                     if isinstance(rec_texts, list):
#                         tokens.extend([t.strip() for t in rec_texts if isinstance(t, str) and t.strip()])
#                     elif isinstance(rec_texts, str) and rec_texts.strip():
#                         tokens.append(rec_texts.strip())

#         else:
#             # Older output shape: [[ [box, (text, score)], ... ]]
#             try:
#                 for line in result:
#                     for res in line:
#                         t = res[1][0]
#                         if isinstance(t, str) and t.strip():
#                             tokens.append(t.strip())
#             except Exception:
#                 pass

#         # Clean: keep Arabic letters and spaces
#         if tokens:
#             joined = " ".join(tokens)
#             joined = re.sub(r'[^\u0621-\u064A\s]', '', joined)
#             joined = re.sub(r'\s+', ' ', joined).strip()
#             return joined

#         return ""

#     except Exception as e:
#         print(f"PaddleOCR Error on {type(image_or_path).__name__}: {e}")
#         return ""

In [101]:
def calculate_pipeline_accuracy(true_file_path, extracted_file_path):
    """Compare extracted results with the ground truth and print a report.

    Both Excel files are loaded and truncated to the shorter length. Rows are
    compared by position, so both files must list the students in the same
    order. ``Code`` and ``Daf3`` are scored by exact match. ``Name`` is scored
    per row: 1.0 for an exact match after normalising Arabic letter variants
    or for a single-space difference, otherwise the fraction of ground-truth
    words found in the extraction.

    Args:
        true_file_path: Path of the ground-truth workbook.
        extracted_file_path: Path of the workbook produced by the pipeline.

    Returns:
        Mean of the three column accuracies in percent. ``0.0`` when a file
        cannot be loaded or there are no rows to compare; a missing column
        contributes 0.
    """
    def _norm_arabic_variants(s: str) -> str:
        return (s or "").replace("ى", "ي").replace("ة", "ه")

    def _clean_column(series):
        """Return the column as stripped text with a trailing '.0' removed."""
        # Missing cells are blanked before the cast; after astype(str) they
        # may already be the text "nan", which fillna would no longer touch.
        text = series.fillna('').astype(str)
        return text.str.strip().str.replace(r'\.0$', '', regex=True)

    def _score_name(true_name, extracted_name):
        """Score one extracted name against its ground truth (0.0 to 1.0)."""
        t_val = _norm_arabic_variants(true_name)
        e_val = _norm_arabic_variants(extracted_name)

        if t_val == e_val:
            return 1.0
        # Tolerate one missing or extra space between otherwise equal names.
        if t_val.replace(" ", "") == e_val.replace(" ", "") and abs(len(t_val) - len(e_val)) <= 1:
            return 1.0

        t_words = set(t_val.split())
        e_words = set(e_val.split())
        if not t_words:
            return 0.0 if e_words else 1.0
        return len(t_words & e_words) / len(t_words)

    try:
        df_true = pd.read_excel(true_file_path)
        df_extracted = pd.read_excel(extracted_file_path)
    except Exception as e:
        print(f"Error loading files: {e}")
        return 0.0

    df_true.columns = df_true.columns.str.strip()
    df_extracted.columns = df_extracted.columns.str.strip()

    min_len = min(len(df_true), len(df_extracted))
    if min_len == 0:
        # Averaging over zero rows would produce NaN percentages.
        print("No rows to compare; accuracy reported as 0.")
        return 0.0

    df_true = df_true.iloc[:min_len].reset_index(drop=True)
    df_extracted = df_extracted.iloc[:min_len].reset_index(drop=True)

    columns_to_check = ['Code', 'Daf3', 'Name']
    scores = {}

    print(f"--- Accuracy Report (Checking {min_len} rows) ---\n")

    for col in columns_to_check:
        if col not in df_true.columns or col not in df_extracted.columns:
            print(f"Error: Column '{col}' missing.")
            print(f"   Available in True File: {df_true.columns.tolist()}")
            print(f"   Available in Extracted: {df_extracted.columns.tolist()}")
            scores[col] = 0.0
            continue

        true_clean = _clean_column(df_true[col])
        extracted_clean = _clean_column(df_extracted[col])

        if col == 'Name':
            row_scores = [_score_name(t, e) for t, e in zip(true_clean, extracted_clean)]
            accuracy = np.mean(row_scores) * 100
        else:
            matches = (true_clean == extracted_clean)
            accuracy = (matches.sum() / len(matches)) * 100

        scores[col] = accuracy
        print(f"{col} Accuracy: {accuracy:.2f}%")

    average_accuracy = (sum(scores.values()) / len(scores)) if scores else 0.0

    print(f"\n--------------------------------")
    print(f"AVERAGE ACCURACY: {average_accuracy:.2f}%")
    print(f"--------------------------------")

    return average_accuracy

# **Main Pipeline** 

In [102]:
import os
import re
import pandas as pd
import cv2
from IPython.display import display, Image as IPyImage, HTML

def show_image_cv(img, title=None):
    """Display an OpenCV image inline with matplotlib.

    A 2-dimensional array is shown with a gray colormap; anything else is
    converted from BGR to RGB first.

    Args:
        img: Image array.
        title: Optional figure title.
    """
    import matplotlib.pyplot as plt

    plt.figure(figsize=(4, 4))

    is_grayscale = len(img.shape) == 2
    if is_grayscale:
        pixels, options = img, {'cmap': 'gray'}
    else:
        pixels, options = cv2.cvtColor(img, cv2.COLOR_BGR2RGB), {}
    plt.imshow(pixels, **options)

    if title:
        plt.title(title)
    plt.axis('off')
    plt.show()


def main_pipeline():
    """Process every card image in ``Raw_IDs`` and export the extracted fields.

    Each image is denoised, aligned to the reference card
    (``Raw_IDs/ID14.jpg``), contrast-enhanced and sharpened. The name and
    digit regions are then cropped and saved, the digits are classified with
    the SVM, and the name is read with ``extractname``. The results are
    displayed, written to ``Extracted_Results.xlsx`` and, when
    ``True_Results.xlsx`` exists, compared with it.

    Images that cannot be decoded are skipped with a message instead of
    aborting the run. Nothing is written when the dataset directory is
    missing or no image could be processed.
    """
    base_dir = os.getcwd()
    path_to_dataset = os.path.join(base_dir, 'Raw_IDs')
    reference_image_path = os.path.join(base_dir, 'Raw_IDs', 'ID14.jpg')
    true_results_path = 'True_Results.xlsx'
    output_file = "Extracted_Results.xlsx"

    # Safety check for directory, done before training so a bad path fails fast
    if not os.path.exists(path_to_dataset):
        print(f"Directory not found: {path_to_dataset}")
        return

    SVMclassifier = train_SVM_robust()

    def _predict_digits(crops):
        """Classify each digit crop and concatenate the predicted labels."""
        return ''.join(
            str(SVMclassifier.predict([extract_hog_features(crop)])[0])
            for crop in crops
        )

    # Simple sharpening kernel; it never changes, so it is built once
    kernel = np.array([[0, -1, 0],
                       [-1, 5, -1],
                       [0, -1, 0]])

    data_for_excel = []

    for i in os.listdir(path_to_dataset):
        if not i.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tiff')):
            continue

        img_path = os.path.join(path_to_dataset, i)

        # --- Processing ---
        raw_img = cv2.imread(img_path)
        if raw_img is None:
            # cv2.imread returns None for corrupt or unsupported files.
            print(f"Skipping unreadable image: {img_path}")
            continue

        clean_img, _ = is_impulsive_noise(raw_img)
        aligned_img = align_images_sift(clean_img, reference_image_path)
        clean_img2, _ = is_random_noise(aligned_img)
        enhanced_img = enhance_contrast_clahe(clean_img2)
        sharpened = cv2.filter2D(enhanced_img, -1, kernel)
        name_img, digit_imgs, daf3_digits = extract_name_and_digits(sharpened)
        student_id = os.path.splitext(i)[0]

        # Save images
        save_student_name(student_id, name_img)
        save_split_digits(student_id, digit_imgs)
        save_split_digits(f"{student_id}_daf3", daf3_digits, output_folder="extracted_daf3_digits")

        # Predict Code and Daf3
        code_str = _predict_digits(digit_imgs)
        daf3_str = _predict_digits(daf3_digits)

        # Extract Name from the crop that was just saved
        name_text = extractname(f'./extracted_names/{student_id}_name.jpg')

        data_for_excel.append({
            "Student ID": student_id,
            "Name": name_text,
            "Code": code_str,
            "Daf3": daf3_str,
        })

    # --- OUTPUT SECTION ---
    if not data_for_excel:
        print(f"No images were processed from: {path_to_dataset}")
        return

    df = pd.DataFrame(data_for_excel)
    # Sort by the number inside the ID. Casting to float (not int) keeps an
    # ID without digits as NaN, which sorts last instead of raising.
    df = df.sort_values(
        by="Student ID",
        key=lambda x: x.str.extract(r'(\d+)').iloc[:, 0].astype(float),
    )

    print("Processing Complete. Results:")

    df_display = df[['Student ID', 'Name', 'Code', 'Daf3']].copy()
    if os.path.exists(true_results_path):
        try:
            true_df = pd.read_excel(true_results_path)
            true_df.columns = true_df.columns.str.strip()

            def _key(c: str) -> str:
                return re.sub(r"[^a-z0-9]", "", str(c).strip().lower())

            # Map normalised header -> actual header so that spelling
            # variants such as "student_id" or "Student ID" are accepted.
            colmap = {_key(c): c for c in true_df.columns}
            sid_col = colmap.get('studentid') or colmap.get('student_id') or colmap.get('id')
            if sid_col is None:
                raise KeyError(f"No student-id column found. Columns: {list(true_df.columns)}")
            true_df = true_df.rename(columns={sid_col: 'Student ID'})

            for wanted, canonical in (('name', 'Name'), ('code', 'Code'), ('daf3', 'Daf3')):
                actual = colmap.get(wanted)
                if actual and actual != canonical:
                    true_df = true_df.rename(columns={actual: canonical})

            left = df_display.copy()
            right = true_df.copy()
            left['_sid_key'] = left['Student ID'].astype(str).str.extract(r'(\d+)')[0]
            right['_sid_key'] = right['Student ID'].astype(str).str.extract(r'(\d+)')[0]
            merged = left.merge(right, on='_sid_key', how='left', suffixes=('', '_True'))

            missing = [c for c in ('Name_True', 'Code_True', 'Daf3_True') if c not in merged.columns]
            if missing:
                raise KeyError(f"Ground-truth columns missing after merge: {missing}")

            def _norm_series(s: pd.Series) -> pd.Series:
                return s.astype(str).fillna('').str.strip().str.replace(r'\.0$', '', regex=True)

            for field in ('Name', 'Code', 'Daf3'):
                merged[f'{field}_Correct'] = (
                    _norm_series(merged[field]) == _norm_series(merged[f'{field}_True'])
                )

            df_display = merged[[
                'Student ID',
                'Name', 'Name_Correct',
                'Code', 'Code_Correct',
                'Daf3', 'Daf3_Correct',
            ]]
        except Exception as e:
            # The correctness columns are a display extra; the export below
            # must still run without them.
            print(f"Could not add correctness columns: {e}")

    display(df_display)

    # SAVE TO EXCEL
    with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
        df.to_excel(writer, index=False, sheet_name='Sheet1')

        # Widen each column to fit its longest value or its header.
        worksheet = writer.sheets['Sheet1']
        for column in df:
            column_length = max(df[column].astype(str).map(len).max(), len(column))
            col_idx = df.columns.get_loc(column)
            col_letter = chr(65 + col_idx)
            worksheet.column_dimensions[col_letter].width = column_length + 2

    print(f"Excel file saved to: {output_file}")

    if os.path.exists(true_results_path):
        calculate_pipeline_accuracy(true_results_path, output_file)
    else:
        print(f"Skipping accuracy check: '{true_results_path}' not found.")

# Notebook entry point: executing this cell runs the whole pipeline.
main_pipeline()

Loading 149 training images...
Training Complete. Validation Accuracy: 100.00%
Processing Complete. Results:


Unnamed: 0,Student ID,Name,Name_Correct,Code,Code_Correct,Daf3,Daf3_Correct
0,ID1,مصطفى محمود احمد عويس,True,1200277,True,14071200100041,False
1,ID2,شاميه علاء محمد سيد احمد,True,1220175,True,14712022101,False
2,ID3,اسمهان ابراهيم يوسف البيطار,True,1220165,True,14712022101214,True
3,ID4,حلمي علي ريان,False,1220145,True,14712022101495,True
4,ID5,حسين محمد ماهر بهاءالدين,True,1220237,True,14712022101358,True
5,ID6,محمد منير عبدالحميد كمال,True,4230174,True,14712022101524,True
6,ID7,يوسف محمد حمدى محمد عثمان,True,1220301,True,14712522101392,False
7,ID8,معاذ امام احمد امام احمد,True,1220205,True,14712022101329,True
8,ID9,نغم طارق محمد احمد قاسم,True,1220149,True,14712022101620,True
9,ID10,يوسف عمرو سيد كمال محمود محمد,True,1220319,True,14712022101322,True


Excel file saved to: Extracted_Results.xlsx
--- Accuracy Report (Checking 54 rows) ---

Code Accuracy: 100.00%
Daf3 Accuracy: 92.59%
Name Accuracy: 99.23%

--------------------------------
AVERAGE ACCURACY: 97.27%
--------------------------------


## Qwen2.5-VL pretrained OCR (comparison only)
- This section does **not** change your pipeline or existing OCR code.
- It runs a pretrained Qwen2.5-VL OCR model on the already-saved name crops in `extracted_names/` and writes a separate Excel file for comparison.

In [107]:
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
import os
import re
from PIL import Image
import pandas as pd
import numpy as np
import torch
from IPython.display import display

# Set KMP_DUPLICATE_LIB_OK to "TRUE" unless the environment already defines it.
# NOTE(review): presumably a workaround for the "duplicate OpenMP runtime"
# abort that can occur when several native libraries are loaded together;
# confirm it is still required in this environment.
os.environ.setdefault("KMP_DUPLICATE_LIB_OK", "TRUE")

# Compatibility shim: make sure `torch.compiler.is_compiling` exists.
# NOTE(review): presumably needed because a downstream library calls it on a
# torch build that does not provide it; confirm against the installed versions.
try:
    import torch.compiler as _torch_compiler
    if not hasattr(_torch_compiler, "is_compiling"):
        if hasattr(torch, "_dynamo") and hasattr(torch._dynamo, "is_compiling"):
            # Reuse torch._dynamo's implementation when this build has one.
            _torch_compiler.is_compiling = torch._dynamo.is_compiling
        else:
            # Otherwise install a stub that always reports "not compiling".
            _torch_compiler.is_compiling = lambda: False
except Exception:
    # Best effort: any failure while installing the shim is ignored and the
    # cell carries on without it.
    pass

# Model identifier passed to `from_pretrained` for both the model and processor.
MODEL_NAME = "sherif1313/Arabic-handwritten-OCR-4bit-Qwen2.5-VL-3B-v2"
# Folder searched for per-student name crops ("<id>_name.jpg" or "<id>.jpg").
NAMES_DIR = "extracted_names"
# Workbook read as the baseline; it must contain a 'Student ID' column.
BASELINE_XLSX = "Extracted_Results.xlsx"
# Optional reference workbook; the accuracy comparison runs only if it exists.
TRUE_XLSX = "True_Results.xlsx"
# Workbook this cell writes with the Qwen-recognised names.
QWEN_XLSX = "Qwen_Results.xlsx"


def _clean_arabic_name(text: str) -> str:
    """Reduce raw model output to Arabic letters separated by single spaces.

    Every non-whitespace character outside the range U+0621..U+064A is
    replaced by a space, runs of whitespace are then collapsed to one space,
    and the result is trimmed. Falsy input (``None``, ``""``) yields ``""``.
    """
    if text:
        arabic_only = re.sub(r"[^\u0621-\u064A\s]", " ", str(text))
        return re.sub(r"\s+", " ", arabic_only).strip()
    return ""


def _pipeline_name_score(true_name: str, extracted_name: str) -> float:
    """Score one extracted name against its reference on a 0.0-1.0 scale.

    Both values are stringified (``None`` becomes ``""``), stripped, and have
    a trailing ``".0"`` removed before comparison. The score is:

    * ``1.0`` when the normalised strings are equal;
    * ``1.0`` when they are equal once spaces are removed and their lengths
      differ by at most one character;
    * otherwise the fraction of distinct reference words that also occur in
      the extracted name (``0.0`` when the reference has no words but the
      extracted name does).
    """

    def _normalise(value) -> str:
        # Stringify, trim, and drop a trailing ".0" suffix.
        text = "" if value is None else str(value).strip()
        return text[:-2] if text.endswith(".0") else text

    expected = _normalise(true_name)
    candidate = _normalise(extracted_name)

    if expected == candidate:
        return 1.0

    # Tolerate spacing-only differences as long as the lengths stay within one.
    same_letters = expected.replace(" ", "") == candidate.replace(" ", "")
    if same_letters and abs(len(expected) - len(candidate)) <= 1:
        return 1.0

    expected_words = set(expected.split())
    candidate_words = set(candidate.split())
    if not expected_words:
        return 0.0 if candidate_words else 1.0
    return len(expected_words & candidate_words) / len(expected_words)


def _pipeline_name_accuracy(true_names, extracted_names) -> float:
    """Return the mean per-name score as a percentage.

    The two iterables are paired positionally with ``zip`` (so the shorter
    one decides how many names are scored) and each pair is scored with
    ``_pipeline_name_score``. Returns ``0.0`` when there is nothing to score.
    """
    per_name = []
    for expected, candidate in zip(true_names, extracted_names):
        per_name.append(_pipeline_name_score(expected, candidate))
    if not per_name:
        return 0.0
    return float(np.mean(per_name) * 100)


# This cell refuses to run without a CUDA device.
if not torch.cuda.is_available():
    raise RuntimeError("CUDA is not available. Fix torch CUDA then restart the kernel.")

# Load the pretrained model with automatic device placement and float16 weights.
# NOTE(review): trust_remote_code=True permits code shipped with the model
# repository to run locally; confirm the repository is trusted.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True,
    low_cpu_mem_usage=True,
)

# Ask for the non-fast processor first; if that call raises TypeError, retry
# without the `use_fast` argument.
# NOTE(review): presumably covers library versions that reject `use_fast`;
# confirm against the installed transformers version.
try:
    processor = AutoProcessor.from_pretrained(MODEL_NAME, trust_remote_code=True, use_fast=False)
except TypeError:
    processor = AutoProcessor.from_pretrained(MODEL_NAME, trust_remote_code=True)


def extract_name_from_image(image_path: str) -> str:
    """Run the loaded Qwen2.5-VL model on one name crop and return the name.

    Uses the module-level ``model`` and ``processor``. The model is prompted
    (in Arabic) to output only the student's Arabic name; the decoded answer
    is then passed through ``_clean_arabic_name``.

    Args:
        image_path: Path of the cropped name image to read.

    Returns:
        The cleaned Arabic name, or ``""`` when nothing usable is decoded.

    Raises:
        Any error raised while opening the image or running the model is
        propagated unchanged.
    """
    prompt = "استخرج اسم الطالب فقط من الصورة. اكتب الاسم العربي فقط بدون أرقام أو رموز أو كلمات إضافية."

    # Open inside a context manager so the file handle is released promptly;
    # convert() returns an independent in-memory image that outlives the file.
    with Image.open(image_path) as source_image:
        image = source_image.convert("RGB")

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image_path},
                {"type": "text", "text": prompt},
            ],
        }
    ]
    # Render the chat template to text; the pixel data is supplied separately.
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=[image],
        padding=True,
        return_tensors="pt",
    ).to(model.device)

    # Deterministic decoding (no sampling), capped at 128 new tokens.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=128,
        do_sample=False,
        pad_token_id=processor.tokenizer.eos_token_id,
        eos_token_id=processor.tokenizer.eos_token_id,
    )

    # Decode only the tokens produced after the prompt.
    input_len = inputs.input_ids.shape[1]
    output_text = processor.batch_decode(
        generated_ids[:, input_len:],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )[0]
    return _clean_arabic_name(output_text)


# Preconditions: the name-crop folder and the baseline workbook must exist.
if not os.path.isdir(NAMES_DIR):
    raise RuntimeError(f"{NAMES_DIR!r} folder not found. Run the main pipeline first.")
if not os.path.exists(BASELINE_XLSX):
    raise RuntimeError(f"{BASELINE_XLSX!r} not found. Run the main pipeline first.")

# Load the baseline results; header whitespace is stripped before the lookup.
df_base = pd.read_excel(BASELINE_XLSX)
df_base.columns = df_base.columns.str.strip()
if "Student ID" not in df_base.columns:
    raise RuntimeError(f"{BASELINE_XLSX!r} is missing 'Student ID' column.")

# Run the model once per student ID listed in the baseline workbook.
qwen_rows = []
for sid in df_base["Student ID"].astype(str).tolist():
    # Prefer "<id>_name.jpg"; fall back to "<id>.jpg".
    p1 = os.path.join(NAMES_DIR, f"{sid}_name.jpg")
    p2 = os.path.join(NAMES_DIR, f"{sid}.jpg")
    img_path = p1 if os.path.exists(p1) else p2
    # A student with no crop on disk gets an empty name instead of an error.
    qwen_name = extract_name_from_image(img_path) if os.path.exists(img_path) else ""
    qwen_rows.append({"Student ID": sid, "Name": qwen_name})

# Save the Qwen output to its own workbook; the baseline file is not modified.
df_qwen = pd.DataFrame(qwen_rows)
df_qwen.to_excel(QWEN_XLSX, index=False)

if os.path.exists(TRUE_XLSX):
    df_true = pd.read_excel(TRUE_XLSX)
    df_true.columns = df_true.columns.str.strip()
    # Rows are compared by position, truncated to the shortest of the three tables.
    # NOTE(review): assumes all three tables list students in the same order; confirm.
    min_len = min(len(df_true), len(df_base), len(df_qwen))
    # A missing 'Name' column is replaced by empty strings.
    actual = df_true.get("Name", pd.Series([""] * min_len)).iloc[:min_len].tolist()
    mycode = df_base.get("Name", pd.Series([""] * min_len)).iloc[:min_len].tolist()
    qwen = df_qwen.get("Name", pd.Series([""] * min_len)).iloc[:min_len].tolist()

    # Score both systems against the reference names with the same metric.
    baseline_acc = _pipeline_name_accuracy(actual, mycode)
    qwen_acc = _pipeline_name_accuracy(actual, qwen)

    # Side-by-side table of reference, baseline and Qwen names.
    table = pd.DataFrame({
        "Actual Name": actual,
        "My Code Output": mycode,
        "Qwen Output": qwen,
    })
    print(f"Baseline Name Accuracy: {baseline_acc:.2f}%")
    print(f"Qwen Name Accuracy:     {qwen_acc:.2f}%")
    # A positive delta means the baseline scored higher than Qwen.
    print(f"Name Delta (baseline - Qwen): {baseline_acc - qwen_acc:+.2f} points")
    display(table.head(30))
else:
    # No reference workbook: just show the first rows of the Qwen output.
    display(df_qwen.head(30))

Baseline Name Accuracy: 98.86%
Qwen Name Accuracy:     83.77%
Name Delta (baseline - Qwen): +15.09 points


Unnamed: 0,Actual Name,My Code Output,Qwen Output
0,مصطفى محمود احمد عويس,مصطفى محمود احمد عويس,مسطفى محمود احمد عويس
1,شاميه علاء محمد سيد احمد,شاميه علاء محمد سيد احمد,شاميه علاء محمد سيد احمد
2,اسمهان ابراهيم يوسف البيطار,اسمهان ابراهيم يوسف البيطار,اسمهان ابراهيم يوسف البيطار
3,سلمى حلمي علي ريان,حلمي علي ريان,حطمي علي ريان
4,حسين محمد ماهر بهاءالدين,حسين محمد ماهر بهاءالدين,حسين محمد ماهر بهاء الدين
5,محمد منير عبدالحميد كمال,محمد منير عبدالحميد كمال,محمد مزير عبد الحميد كمال
6,يوسف محمد حمدى محمد عثمان,يوسف محمد حمدى محمد عثمان,يوسف محمد حمدى محمد عثمان
7,معاذ امام احمد امام احمد,معاذ امام احمد امام احمد,معاذ امام احمد امام احمد
8,نغم طارق محمد احمد قاسم,نغم طارق محمد احمد قاسم,نغم طارق محمد احمد قاسم
9,يوسف عمرو سيد كمال محمود محمد,يوسف عمرو سيد كمال محمود محمد,يوسف عمرو سيد كمال محمود محمد
