In [13]:
import os
import math
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from skimage import exposure
from skimage.feature import hog
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import shutil
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [14]:
# Preparing and preprocessing Amharic character sample dataset
def preprocess_images(input_folder, output_folder, size=(28, 28)):
    """Write a grayscale, resized copy of every class image into ``output_folder``.

    ``input_folder`` is expected to contain one sub-folder per character class.
    The same sub-folder names are recreated under ``output_folder`` and each
    image keeps its original file name.

    Parameters
    ----------
    input_folder : str
        Folder whose immediate sub-folders hold the raw images.
    output_folder : str
        Destination root; created (with parents) when missing.
    size : tuple of int, optional
        ``(width, height)`` passed to ``Image.resize``. Defaults to ``(28, 28)``.

    Notes
    -----
    Only files ending in ``.jpg``, ``.jpeg`` or ``.png`` (case-insensitive) are
    processed; entries directly under ``input_folder`` that are not folders
    are ignored.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Iterate through each subfolder (character class)
    for char_folder in os.listdir(input_folder):
        char_folder_path = os.path.join(input_folder, char_folder)
        if not os.path.isdir(char_folder_path):
            continue

        output_char_folder = os.path.join(output_folder, char_folder)
        os.makedirs(output_char_folder, exist_ok=True)

        # Iterate through each image in the character class subfolder
        for filename in os.listdir(char_folder_path):
            if not filename.lower().endswith((".jpg", ".jpeg", ".png")):
                continue

            image_path = os.path.join(char_folder_path, filename)

            # The context manager releases the source file handle as soon as
            # the pixel data has been converted; without it every opened image
            # stays open until garbage collection.
            with Image.open(image_path) as source:
                img = source.convert('L').resize(size)  # grayscale, then resize

            # Save the preprocessed image to the output subfolder using Pillow
            img.save(os.path.join(output_char_folder, filename))

if __name__ == "__main__":
    # Replace 'input_folder' and 'output_folder' with your actual paths.
    # Both are absolute Windows paths written as raw strings so the
    # backslashes are taken literally.
    input_folder = r"C:\Users\Bizuhan\Downloads\OCR\Amharic OCR Dataset"
    output_folder = r"C:\Users\Bizuhan\Downloads\OCR\Amharic OCR Dataset Preprocessed"

    # Produce the grayscale, resized copy of the raw dataset.
    preprocess_images(input_folder, output_folder)

In [15]:
# Verify that every preprocessed image is grayscale (mode "L") and 28x28 pixels; resolution (DPI) is not validated
def check_uniformity(folder_path, expected_size=(28, 28)):
    """Report every image under ``folder_path`` that is not grayscale or not the expected size.

    The folder is walked recursively and each file found is opened with
    Pillow. One line is printed per offending file; if none is found a single
    confirmation line is printed instead.

    Parameters
    ----------
    folder_path : str
        Root folder to scan.
    expected_size : tuple of int, optional
        Required ``(width, height)``. Defaults to ``(28, 28)``.

    Notes
    -----
    * The size is only examined when the mode check passes, so a file that
      fails both criteria is reported as "not grayscale" only.
    * Resolution (DPI) is not part of the check.
    * Every file is handed to ``Image.open``; a non-image file presumably
      makes Pillow raise -- verify if the folder can contain other files.
    """
    # Keep track of whether all processed images meet the criteria
    all_meet_criteria = True
    expected_width, expected_height = expected_size

    for root, _dirs, files in os.walk(folder_path):
        for file in files:
            image_path = os.path.join(root, file)

            # Read the header fields inside a context manager so the file
            # handle is closed before moving on to the next image.
            with Image.open(image_path) as img:
                mode = img.mode
                width, height = img.size

            if mode != "L":
                all_meet_criteria = False
                print(f"Image: {file} is not grayscale.")
            elif width != expected_width or height != expected_height:
                all_meet_criteria = False
                print(f"Image: {file} does not have the required size.")

    if all_meet_criteria:
        print("All processed images meet the criteria.")

# Specify the path to the preprocessed images (the same folder the
# preprocessing step above wrote to).
preprocessed_folder_path = r"C:\Users\Bizuhan\Downloads\OCR\Amharic OCR Dataset Preprocessed"

# Perform uniformity checks; the result is reported via printed messages.
check_uniformity(preprocessed_folder_path)

All processed images meet the criteria.


In [16]:
# Splitting the dataset into training and test set
def split_data(input_folder, output_train_folder, output_test_folder, test_size=0.4, random_state=42):
    """Copy each class folder's files into separate train and test trees.

    For every sub-folder of ``input_folder`` a sub-folder with the same name
    is created under both output roots, the class's files are partitioned
    with ``train_test_split`` and then *copied* (the source is left intact).

    Parameters
    ----------
    input_folder : str
        Root containing one sub-folder per character class.
    output_train_folder, output_test_folder : str
        Destination roots; created when missing.
    test_size : float or int, optional
        Forwarded to ``train_test_split``. Defaults to ``0.4``.
    random_state : int, optional
        Forwarded to ``train_test_split``. Defaults to ``42``.

    Notes
    -----
    * File names are sorted before splitting. ``os.listdir`` returns entries
      in arbitrary order, so without sorting the same ``random_state`` could
      yield different splits on different machines.
    * Only regular files are split; nested folders are ignored.
    * A class with fewer than two files cannot be split, so whatever it has
      goes to the training tree and its test folder stays empty.
    * Existing output folders are not cleared; files from an earlier run with
      different settings remain in place.
    """
    # Create output folders if they don't exist
    os.makedirs(output_train_folder, exist_ok=True)
    os.makedirs(output_test_folder, exist_ok=True)

    # Iterate through each subfolder (character class)
    for char_folder in sorted(os.listdir(input_folder)):
        char_folder_path = os.path.join(input_folder, char_folder)
        if not os.path.isdir(char_folder_path):
            continue

        train_folder = os.path.join(output_train_folder, char_folder)
        test_folder = os.path.join(output_test_folder, char_folder)

        os.makedirs(train_folder, exist_ok=True)
        os.makedirs(test_folder, exist_ok=True)

        # Deterministic, files-only listing (see Notes).
        images = sorted(
            name
            for name in os.listdir(char_folder_path)
            if os.path.isfile(os.path.join(char_folder_path, name))
        )

        if len(images) < 2:
            # train_test_split raises ValueError when one side would be empty.
            train_images, test_images = images, []
        else:
            train_images, test_images = train_test_split(
                images, test_size=test_size, random_state=random_state
            )

        # Copy images to the appropriate folders
        for image in train_images:
            shutil.copy(os.path.join(char_folder_path, image), os.path.join(train_folder, image))

        for image in test_images:
            shutil.copy(os.path.join(char_folder_path, image), os.path.join(test_folder, image))

if __name__ == "__main__":
    # Replace 'input_folder', 'output_train_folder', and 'output_test_folder' with your actual paths.
    # The input is the folder of preprocessed (grayscale, resized) images.
    input_folder = r"C:\Users\Bizuhan\Downloads\OCR\Amharic OCR Dataset Preprocessed"
    output_train_folder = r"C:\Users\Bizuhan\Downloads\OCR\Amharic OCR Dataset Train"
    output_test_folder = r"C:\Users\Bizuhan\Downloads\OCR\Amharic OCR Dataset Test"

    # test_size=0.4 requests 40% of each class for the test tree; the fixed
    # random_state seeds the shuffle.
    split_data(input_folder, output_train_folder, output_test_folder, test_size=0.4, random_state=42)


In [35]:
def preprocess_images(input_folder, output_folder):
    """Compute a HOG descriptor for every class image and store it under ``output_folder``.

    NOTE: this definition reuses the name of the grayscale/resize step
    defined earlier in the notebook; running this cell rebinds
    ``preprocess_images`` to the HOG version.

    For each ``.jpg`` / ``.jpeg`` / ``.png`` file found in a class sub-folder
    of ``input_folder`` two files are written to the matching sub-folder of
    ``output_folder``:

    * ``hog_<filename>`` -- the HOG visualisation image;
    * ``features_<filename with '.' replaced by '_'>.npy`` -- the descriptor
      returned by ``skimage.feature.hog`` (library defaults).

    Parameters
    ----------
    input_folder : str
        Root containing one sub-folder per character class.
    output_folder : str
        Destination root; created (with parents) when missing.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Iterate through each subfolder (character class)
    for char_folder in os.listdir(input_folder):
        char_folder_path = os.path.join(input_folder, char_folder)
        if not os.path.isdir(char_folder_path):
            continue

        output_char_folder = os.path.join(output_folder, char_folder)
        os.makedirs(output_char_folder, exist_ok=True)

        # Iterate through each image in the character class subfolder
        for filename in os.listdir(char_folder_path):
            if not filename.lower().endswith((".jpg", ".jpeg", ".png")):
                continue

            image_path = os.path.join(char_folder_path, filename)

            # Close the source file as soon as the pixels are in memory
            # instead of leaving one open handle per image.
            with Image.open(image_path) as source:
                pixels = np.array(source.convert('L').resize((28, 28)))

            # Extract HOG features
            features, hog_image = hog(pixels, visualize=True)
            # in_range=(0, 10) is the stretch chosen for display; the result
            # is presumably in [0, 1] before the *255 below -- confirm
            # against the skimage documentation.
            hog_image_rescaled = exposure.rescale_intensity(hog_image, in_range=(0, 10))

            # Save the HOG image (for visualization purposes, optional)
            hog_image_path = os.path.join(output_char_folder, "hog_" + filename)
            Image.fromarray((hog_image_rescaled * 255).astype(np.uint8)).save(hog_image_path)

            # Save the HOG features as a NumPy array
            features_path = os.path.join(output_char_folder, "features_" + filename.replace(".", "_") + ".npy")
            np.save(features_path, features)

if __name__ == "__main__":
    # Replace 'input_folder' and 'output_folder' with your actual paths.
    # The input is the training tree written by the split step.
    input_folder = r"C:\Users\Bizuhan\Downloads\OCR\Amharic OCR Dataset Train"
    output_folder = r"C:\Users\Bizuhan\Downloads\OCR\Amharic OCR Dataset Features Train"

    # Write HOG images and feature files for the training images.
    preprocess_images(input_folder, output_folder)


In [36]:
def preprocess_images(input_folder, output_folder):
    """Extract HOG features from every class image and save them as ``.npy`` files.

    NOTE: ``preprocess_images`` is also defined in earlier cells of this
    notebook (first as a grayscale/resize step, then as this same HOG
    extractor); this cell rebinds the name once more.

    Outputs, per accepted image, in the class sub-folder of ``output_folder``:

    * ``"hog_" + filename`` -- visualisation of the HOG cells;
    * ``"features_" + filename.replace(".", "_") + ".npy"`` -- the feature
      vector returned by ``hog`` with its default settings.

    Parameters
    ----------
    input_folder : str
        Root containing one sub-folder per character class.
    output_folder : str
        Destination root; created (with parents) when missing.
    """
    os.makedirs(output_folder, exist_ok=True)

    accepted_suffixes = (".jpg", ".jpeg", ".png")

    for char_folder in os.listdir(input_folder):
        char_folder_path = os.path.join(input_folder, char_folder)
        if not os.path.isdir(char_folder_path):
            continue  # only class sub-folders are processed

        output_char_folder = os.path.join(output_folder, char_folder)
        os.makedirs(output_char_folder, exist_ok=True)

        for filename in os.listdir(char_folder_path):
            if not filename.lower().endswith(accepted_suffixes):
                continue

            image_path = os.path.join(char_folder_path, filename)

            # Load inside a context manager so the file handle does not stay
            # open after the pixel array has been built.
            with Image.open(image_path) as source:
                pixels = np.array(source.convert('L').resize((28, 28)))

            # Extract HOG features
            features, hog_image = hog(pixels, visualize=True)
            # Display stretch only; presumably yields values in [0, 1] for
            # the uint8 conversion below -- confirm against skimage docs.
            hog_image_rescaled = exposure.rescale_intensity(hog_image, in_range=(0, 10))

            # Save the HOG image (for visualization purposes, optional)
            hog_image_path = os.path.join(output_char_folder, "hog_" + filename)
            Image.fromarray((hog_image_rescaled * 255).astype(np.uint8)).save(hog_image_path)

            # Save the HOG features as a NumPy array
            features_path = os.path.join(output_char_folder, "features_" + filename.replace(".", "_") + ".npy")
            np.save(features_path, features)

if __name__ == "__main__":
    # Replace 'input_folder' and 'output_folder' with your actual paths.
    # The input is the test tree written by the split step.
    input_folder = r"C:\Users\Bizuhan\Downloads\OCR\Amharic OCR Dataset Test"
    output_folder = r"C:\Users\Bizuhan\Downloads\OCR\Amharic OCR Dataset Features Test"

    # Write HOG images and feature files for the test images.
    preprocess_images(input_folder, output_folder)

In [37]:
# Load the feature vectors and their labels as two separate NumPy arrays
def load_data(folder):
    """Load saved feature vectors and their class labels from ``folder``.

    Every sub-folder of ``folder`` is treated as one class. Inside it, each
    file whose name starts with ``features_`` and ends with ``.npy`` is
    loaded with ``np.load``; the sub-folder name becomes the sample's label.

    Class folders and feature files are visited in sorted order.
    ``os.listdir`` gives no ordering guarantee, and the sample order decides
    which training vector wins a nearest-neighbour tie, so sorting keeps the
    results reproducible across machines.

    Parameters
    ----------
    folder : str
        Root containing one sub-folder per class.

    Returns
    -------
    tuple of numpy.ndarray
        ``(features, labels)`` built with ``np.array`` from the collected
        lists, one entry per loaded file.
    """
    features_list = []
    labels_list = []

    for char_folder in sorted(os.listdir(folder)):
        char_folder_path = os.path.join(folder, char_folder)
        if not os.path.isdir(char_folder_path):
            continue

        for filename in sorted(os.listdir(char_folder_path)):
            if filename.startswith("features_") and filename.endswith(".npy"):
                features_path = os.path.join(char_folder_path, filename)
                features_list.append(np.load(features_path))
                labels_list.append(char_folder)  # Use folder name as label

    return np.array(features_list), np.array(labels_list)

In [38]:
# Path for train data set (folder the HOG step wrote the training features to)
input_train_folder = r"C:\Users\Bizuhan\Downloads\OCR\Amharic OCR Dataset Features Train"
    
# Load data features and labels from the training data set
features_train, labels_train = load_data(input_train_folder)

# Path for test data set (folder the HOG step wrote the test features to)
input_test_folder = r"C:\Users\Bizuhan\Downloads\OCR\Amharic OCR Dataset Features Test"

# Load data features and labels from the test data set
features_test, labels_test = load_data(input_test_folder)

In [39]:
# Calculate the distance using the Euclidean metric
def euclidean_distance(x1, x2):
    """Return the square root of the summed squared differences of ``x1`` and ``x2``."""
    delta = x1 - x2
    squared_total = np.sum(np.square(delta))
    return np.sqrt(squared_total)
    
# A function for OCR system using euclidean_distance measurement
def nearest_neighbor(train_features, train_labels, test_feature):
    """Return the label of the training vector closest to ``test_feature``.

    Closeness is measured with ``euclidean_distance``; ``np.argmin`` selects
    the position of the smallest distance, which indexes ``train_labels``.
    """
    gaps = []
    for candidate in train_features:
        gaps.append(euclidean_distance(candidate, test_feature))
    closest = np.argmin(gaps)
    return train_labels[closest]

# Label every test vector with the class of its nearest training vector (Euclidean distance)
predictions = list(
    map(
        lambda query: nearest_neighbor(features_train, labels_train, query),
        features_test,
    )
)

# Share of correctly labelled test samples, as a percentage
accuracy = 100 * accuracy_score(labels_test, predictions)
print("Accuracy:", accuracy, "%")


Accuracy: 13.352826510721247 %


In [40]:
# Calculate the distance using the Manhattan metric
def manhattan_distance(x1, x2):
    """Return the sum of the absolute element-wise differences of ``x1`` and ``x2``."""
    offsets = np.abs(x1 - x2)
    total = np.sum(offsets)
    return total

# A function for OCR system using manhattan_distance measurement
def nearest_neighbor(train_features, train_labels, test_feature):
    """Return the label of the training vector closest to ``test_feature``.

    Closeness is measured with ``manhattan_distance``; the label at the
    position reported by ``np.argmin`` is returned.
    """
    gaps = list(map(lambda candidate: manhattan_distance(candidate, test_feature), train_features))
    return train_labels[np.argmin(gaps)]
    
# Label every test vector with the class of its nearest training vector (Manhattan distance)
predictions = [
    nearest_neighbor(features_train, labels_train, query)
    for query in features_test
]

# Share of correctly labelled test samples, as a percentage
accuracy = 100 * accuracy_score(labels_test, predictions)
print("Accuracy:", accuracy, "%")

Accuracy: 13.54775828460039 %


In [41]:
# Calculate the distance using the Chebyshev metric
def chebyshev_distance(x1, x2):
    """Return the largest absolute element-wise difference between ``x1`` and ``x2``."""
    offsets = np.abs(x1 - x2)
    widest = np.max(offsets)
    return widest

# A function for OCR system using chebyshev_distance measurement
def nearest_neighbor(train_features, train_labels, test_feature):
    """Return the label of the training vector closest to ``test_feature``.

    Closeness is measured with ``chebyshev_distance``; the label at the
    position reported by ``np.argmin`` is returned.
    """
    gaps = []
    for candidate in train_features:
        gap = chebyshev_distance(candidate, test_feature)
        gaps.append(gap)
    winner = np.argmin(gaps)
    return train_labels[winner]
    
# Label every test vector with the class of its nearest training vector (Chebyshev distance)
predictions = list(
    map(
        lambda query: nearest_neighbor(features_train, labels_train, query),
        features_test,
    )
)

# Share of correctly labelled test samples, as a percentage
accuracy = 100 * accuracy_score(labels_test, predictions)
print("Accuracy:", accuracy, "%")

Accuracy: 6.237816764132553 %


In [42]:
# Calculate the distance using the cosine metric
def cosine_distance(x1, x2):
    """Return ``1 - cos(angle)`` between the vectors ``x1`` and ``x2``.

    Parameters
    ----------
    x1, x2 : array-like
        Vectors accepted by ``np.dot`` and ``np.linalg.norm``.

    Returns
    -------
    float
        ``1 - dot(x1, x2) / (|x1| * |x2|)``. When either vector has zero
        norm the angle is undefined and ``1.0`` is returned.

    Notes
    -----
    Without the zero-norm guard the division yields NaN. ``np.argmin`` treats
    NaN as the minimum, so a single all-zero training vector would be picked
    as the nearest neighbour of every query.
    """
    denominator = np.linalg.norm(x1) * np.linalg.norm(x2)
    if denominator == 0:
        return 1.0
    return 1 - (np.dot(x1, x2) / denominator)

# A function for OCR system using cosine_distance measurement
def nearest_neighbor(train_features, train_labels, test_feature):
    """Return the label of the training vector closest to ``test_feature``.

    Closeness is measured with ``cosine_distance``; the label at the
    position reported by ``np.argmin`` is returned.
    """
    gaps = list(map(lambda candidate: cosine_distance(candidate, test_feature), train_features))
    winner = np.argmin(gaps)
    return train_labels[winner]
    
# Label every test vector with the class of its nearest training vector (cosine distance)
predictions = [
    nearest_neighbor(features_train, labels_train, query)
    for query in features_test
]

# Share of correctly labelled test samples, as a percentage
accuracy = 100 * accuracy_score(labels_test, predictions)
print("Accuracy:", accuracy, "%")

Accuracy: 13.352826510721247 %
