# OCR PROJECT WITH CNN 

## Clean the data

In [19]:
# CLEANING DATA FOR 1 use , training or testing should be done twice.

import os
import cv2

# Set the path to the original dataset directory
original_dir = "data2/training_data"

# Set the path to the cleaned dataset directory (create if it doesn't exist)
cleaned_dir = "cleandata/train"
if not os.path.exists(cleaned_dir):
    os.makedirs(cleaned_dir)

# Loop over all subdirectories under the original dataset directory
for subdir in os.listdir(original_dir):
    subdir_path = os.path.join(original_dir, subdir)
    if os.path.isdir(subdir_path):
        # Loop over all images in the current subdirectory
        for filename in os.listdir(subdir_path):
            if filename.endswith(".png"):
                # Load the image
                img = cv2.imread(os.path.join(subdir_path, filename))

                # Convert the image to grayscale
                img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

                # Threshold the image to remove any noise or distortion
                img_thresh = cv2.threshold(img_gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

                # Find the contours of the text in the image
                contours, hierarchy = cv2.findContours(img_thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

                # If the image has one contour, save it to the cleaned dataset directory
                if len(contours) == 1:
                    cleaned_subdir = os.path.join(cleaned_dir, subdir)
                    if not os.path.exists(cleaned_subdir):
                        os.makedirs(cleaned_subdir)
                    cv2.imwrite(os.path.join(cleaned_subdir, filename), img_thresh)

error: OpenCV(4.7.0) D:\a\opencv-python\opencv-python\opencv\modules\imgproc\src\color.cpp:182: error: (-215:Assertion failed) !_src.empty() in function 'cv::cvtColor'


## Data Clean for Test

In [24]:
import os
import cv2

# Set the path to the original dataset directory
original_dir = "data2/testing_data"

# Set the path to the cleaned dataset directory (create if it doesn't exist)
cleaned_dir = "cleandata/test"
if not os.path.exists(cleaned_dir):
    os.makedirs(cleaned_dir)

# Loop over all subdirectories under the original dataset directory
for subdir in os.listdir(original_dir):
    subdir_path = os.path.join(original_dir, subdir)
    if os.path.isdir(subdir_path):
        # Loop over all images in the current subdirectory
        for filename in os.listdir(subdir_path):
            if filename.endswith(".png"):
                try:
                    # Load the image
                    img = cv2.imread(os.path.join(subdir_path, filename))

                    # Convert the image to grayscale
                    img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

                    # Threshold the image to remove any noise or distortion
                    img_thresh = cv2.threshold(img_gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

                    # Find the contours of the text in the image
                    contours, hierarchy = cv2.findContours(img_thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

                    # If the image has one contour, save it to the cleaned dataset directory
                    if len(contours) == 1:
                        cleaned_subdir = os.path.join(cleaned_dir, subdir)
                        if not os.path.exists(cleaned_subdir):
                            os.makedirs(cleaned_subdir)
                        cv2.imwrite(os.path.join(cleaned_subdir, filename), img_thresh)
                except:
                    print(f"Skipping {filename} due to corrupted data")
                    continue

Skipping 44404.png due to corrupted data


# Split train to val 

In [27]:
import os
from sklearn.model_selection import train_test_split

# Set the path to the cleaned dataset directory
dataset_dir = "cleandata/train"

# Set the path to the output directory
output_dir = "cleandata/validation"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Define the list of classes
classes = sorted(os.listdir(dataset_dir))

# Initialize the lists for storing the image paths and labels
image_paths = []
labels = []

# Loop over all subdirectories in the dataset directory
for class_index, class_name in enumerate(classes):
    class_dir = os.path.join(dataset_dir, class_name)
    for filename in os.listdir(class_dir):
        image_path = os.path.join(class_dir, filename)
        image_paths.append(image_path)
        labels.append(class_index)

# Split the data into a training set and a validation set
train_paths, val_paths, train_labels, val_labels = train_test_split(
    image_paths, labels, test_size=0.2, random_state=42, stratify=labels)

# Copy the training set and the validation set to the output directory
for split_name, split_paths, split_labels in [("train", train_paths, train_labels), 
                                              ("val", val_paths, val_labels)]:
    split_dir = os.path.join(output_dir, split_name)
    if not os.path.exists(split_dir):
        os.makedirs(split_dir)
    for image_path, label in zip(split_paths, split_labels):
        class_name = classes[label]
        dest_dir = os.path.join(split_dir, class_name)
        if not os.path.exists(dest_dir):
            os.makedirs(dest_dir)
        dest_path = os.path.join(dest_dir, os.path.basename(image_path))
        with open(image_path, 'rb') as src_file, open(dest_path, 'wb') as dest_file:
            dest_file.write(src_file.read())

# Resized image

In [30]:
import os
import cv2

# Set the path to the cleaned dataset directory
cleaned_dir = "cleandata/validation/train"
cleaned_dir1 = "cleandata/validation/val"
cleaned_dir2 = "cleandata/test"

# Set the path to the resized dataset directory (create if it doesn't exist)
resized_dir = "cleandata/resized/train"
resized_dir1 = "cleandata/resized/val"
resized_dir2 ="cleandata/resized/test"
if not os.path.exists(resized_dir2):
    os.makedirs(resized_dir2)

# Set the desired size for the images
desired_size = (32, 32)

# Loop over all subdirectories under the cleaned dataset directory
for subdir in os.listdir(cleaned_dir2):
    subdir_path = os.path.join(cleaned_dir2, subdir)
    if os.path.isdir(subdir_path):
        # Loop over all images in the current subdirectory
        for filename in os.listdir(subdir_path):
            if filename.endswith(".png"):
                # Load the image
                img = cv2.imread(os.path.join(subdir_path, filename))

                # Resize the image to the desired size
                resized_img = cv2.resize(img, desired_size)

                # Save the resized image to the resized dataset directory
                resized_subdir = os.path.join(resized_dir2, subdir)
                if not os.path.exists(resized_subdir):
                    os.makedirs(resized_subdir)
                cv2.imwrite(os.path.join(resized_subdir, filename), resized_img)

## Data Normalization and resizing in train,test, and val data

In [28]:
import os
import cv2
import numpy as np

train_dir = "cleandata/validation/train"
img_height, img_width = 32, 32  # define the desired image size

def preprocess_img(img_path):
    img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
    img = cv2.resize(img, (img_height, img_width))
    img = img.astype(np.float32) / 255.0  # normalize pixel values to [0, 1]
    img = np.expand_dims(img, axis=-1)  # add channel dimension
    return img

X_train = []
y_train = []

valid_labels = list(range(10)) + [chr(i) for i in range(ord('A'), ord('Z')+1)]

# Loop through all subfolders under the train directory
for subdir in os.listdir(train_dir):
    if subdir not in valid_labels:
        continue
    subdir_path = os.path.join(train_dir, subdir)
    if os.path.isdir(subdir_path):
        # Loop through all images in the current subfolder
        for filename in os.listdir(subdir_path):
            if filename.endswith(".png"):
                img_path = os.path.join(subdir_path, filename)
                img = preprocess_img(img_path)
                X_train.append(img)
                y_train.append((subdir) if isinstance(subdir, str) else subdir)  # set the label to the subfolder name (integer or character)

# Convert the data to numpy arrays
X_train = np.array(X_train)
y_train = np.array(y_train)

# Print the shape of the data
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)

X_train shape: (11276, 32, 32, 1)
y_train shape: (11276,)


In [29]:
import os
import cv2
import numpy as np

train_dir = "cleandata/test"
img_height, img_width = 32, 32  # define the desired image size

def preprocess_img(img_path):
    img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
    img = cv2.resize(img, (img_height, img_width))
    img = img.astype(np.float32) / 255.0  # normalize pixel values to [0, 1]
    img = np.expand_dims(img, axis=-1)  # add channel dimension
    return img

X_test = []
y_test = []

valid_labels = list(range(10)) + [chr(i) for i in range(ord('A'), ord('Z')+1)]

# Loop through all subfolders under the train directory
for subdir in os.listdir(train_dir):
    if subdir not in valid_labels:
        continue
    subdir_path = os.path.join(train_dir, subdir)
    if os.path.isdir(subdir_path):
        # Loop through all images in the current subfolder
        for filename in os.listdir(subdir_path):
            if filename.endswith(".png"):
                img_path = os.path.join(subdir_path, filename)
                img = preprocess_img(img_path)
                X_test.append(img)
                y_test.append((subdir) if isinstance(subdir, str) else subdir)  # set the label to the subfolder name (integer or character)

# Convert the data to numpy arrays
X_test = np.array(X_test)
y_test = np.array(y_test)

# Print the shape of the data
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

X_test shape: (2105, 32, 32, 1)
y_test shape: (2105,)


In [30]:
import os
import cv2
import numpy as np

val_dir = "cleandata/validation/val"
img_height, img_width = 32, 32  # define the desired image size

def preprocess_img(img_path):
    img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
    img = cv2.resize(img, (img_height, img_width))
    img = img.astype(np.float32) / 255.0  # normalize pixel values to [0, 1]
    img = np.expand_dims(img, axis=-1)  # add channel dimension
    return img

X_val = []
y_val = []

valid_labels = list(range(10)) + [chr(i) for i in range(ord('A'), ord('Z')+1)]

# Loop through all subfolders under the validation directory
for subdir in os.listdir(val_dir):
    if subdir not in valid_labels:
        continue
    subdir_path = os.path.join(val_dir, subdir)
    if os.path.isdir(subdir_path):
        # Loop through all images in the current subfolder
        for filename in os.listdir(subdir_path):
            if filename.endswith(".png"):
                img_path = os.path.join(subdir_path, filename)
                img = preprocess_img(img_path)
                X_val.append(img)
                y_val.append((subdir) if isinstance(subdir, str) else subdir)  # set the label to the subfolder name (integer or character)

# Convert the data to numpy arrays
X_val = np.array(X_val)
y_val = np.array(y_val)

# Print the shape of the data
print("X_val shape:", X_val.shape)
print("y_val shape:", y_val.shape)

X_val shape: (2819, 32, 32, 1)
y_val shape: (2819,)


## resizing, normalization and labeling process using all data (36 data 0-9 and A to Z) masih fail

## resizing, normalization and labeling process using 0-9 data

In [1]:
import os
import cv2
import numpy as np
from tensorflow.keras.utils import to_categorical

val_dir = "cleandata/validation/train"
img_height, img_width = 32, 32  # define the desired image size
num_classes = 10  # 10 digits only

def preprocess_img(img_path):
    img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
    img = cv2.resize(img, (img_height, img_width))
    img = img.astype(np.float32) / 255.0  # normalize pixel values to [0, 1]
    img = np.expand_dims(img, axis=-1)  # add channel dimension
    return img

X_train = []
y_train = []

# Loop through all subfolders under the validation directory
for subdir in os.listdir(val_dir):
    if not subdir.isdigit():
        continue
    subdir_path = os.path.join(val_dir, subdir)
    if os.path.isdir(subdir_path):
        # Loop through all images in the current subfolder
        for filename in os.listdir(subdir_path):
            if filename.endswith(".png"):
                img_path = os.path.join(subdir_path, filename)
                img = preprocess_img(img_path)
                X_train.append(img)
                label = int(subdir)  # convert the subdirectory name to integer label
                y_train.append(label)

# Convert the data to numpy arrays
X_train = np.array(X_train)
y_train = np.array(y_train)

# One-hot encode the labels
y_train = to_categorical(y_train, num_classes=num_classes)

# Print the shape of the data
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape) 

X_train shape: (4395, 32, 32, 1)
y_train shape: (4395, 10)


In [2]:
import os
import cv2
import numpy as np
from tensorflow.keras.utils import to_categorical

val_dir = "cleandata/test"
img_height, img_width = 32, 32  # define the desired image size
num_classes = 10  # 10 digits only

def preprocess_img(img_path):
    img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
    img = cv2.resize(img, (img_height, img_width))
    img = img.astype(np.float32) / 255.0  # normalize pixel values to [0, 1]
    img = np.expand_dims(img, axis=-1)  # add channel dimension
    return img

X_test = []
y_test = []

# Loop through all subfolders under the validation directory
for subdir in os.listdir(val_dir):
    if not subdir.isdigit():
        continue
    subdir_path = os.path.join(val_dir, subdir)
    if os.path.isdir(subdir_path):
        # Loop through all images in the current subfolder
        for filename in os.listdir(subdir_path):
            if filename.endswith(".png"):
                img_path = os.path.join(subdir_path, filename)
                img = preprocess_img(img_path)
                X_test.append(img)
                label = int(subdir)  # convert the subdirectory name to integer label
                y_test.append(label)

# Convert the data to numpy arrays
X_test = np.array(X_test)
y_test = np.array(y_test)

# One-hot encode the labels
y_test = to_categorical(y_test, num_classes=num_classes)

# Print the shape of the data
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape) 

X_test shape: (904, 32, 32, 1)
y_test shape: (904, 10)


In [3]:
import os
import cv2
import numpy as np
from tensorflow.keras.utils import to_categorical

val_dir = "cleandata/validation/val"
img_height, img_width = 32, 32  # define the desired image size
num_classes = 10  # 10 digits only

def preprocess_img(img_path):
    img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
    img = cv2.resize(img, (img_height, img_width))
    img = img.astype(np.float32) / 255.0  # normalize pixel values to [0, 1]
    img = np.expand_dims(img, axis=-1)  # add channel dimension
    return img

X_val = []
y_val = []

# Loop through all subfolders under the validation directory
for subdir in os.listdir(val_dir):
    if not subdir.isdigit():
        continue
    subdir_path = os.path.join(val_dir, subdir)
    if os.path.isdir(subdir_path):
        # Loop through all images in the current subfolder
        for filename in os.listdir(subdir_path):
            if filename.endswith(".png"):
                img_path = os.path.join(subdir_path, filename)
                img = preprocess_img(img_path)
                X_val.append(img)
                label = int(subdir)  # convert the subdirectory name to integer label
                y_val.append(label)

# Convert the data to numpy arrays
X_val = np.array(X_val)
y_val = np.array(y_val)

# One-hot encode the labels
y_val = to_categorical(y_val, num_classes=num_classes)

# Print the shape of the data
print("X_val shape:", X_val.shape)
print("y_val shape:", y_val.shape) 

X_val shape: (1099, 32, 32, 1)
y_val shape: (1099, 10)


## resizing, normalization and labeling process using A-Z data

In [15]:
import os
import cv2
import numpy as np
from tensorflow.keras.utils import to_categorical

val_dir = "cleandata/validation/train"
img_height, img_width = 32, 32  # define the desired image size
num_classes = 26  # 26 letters only

def preprocess_img(img_path):
    img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
    img = cv2.resize(img, (img_height, img_width))
    img = img.astype(np.float32) / 255.0  # normalize pixel values to [0, 1]
    img = np.expand_dims(img, axis=-1)  # add channel dimension
    return img

X_train = []
y_train = []

# Loop through all subfolders under the validation directory
for subdir in os.listdir(val_dir):
    if not subdir.isalpha() or not subdir.isupper():
        continue
    subdir_path = os.path.join(val_dir, subdir)
    if os.path.isdir(subdir_path):
        # Loop through all images in the current subfolder
        for filename in os.listdir(subdir_path):
            if filename.endswith(".png"):
                img_path = os.path.join(subdir_path, filename)
                img = preprocess_img(img_path)
                X_train.append(img)
                label = ord(subdir) - ord('A')  # convert character label to integer label
                y_train.append(label)

# Convert the data to numpy arrays
X_train = np.array(X_train)
y_train = np.array(y_train)

# One-hot encode the labels
y_train = to_categorical(y_train, num_classes=num_classes)

# Print the shape of the data
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)

X_train shape: (11276, 32, 32, 1)
y_train shape: (11276, 26)


In [18]:
import cv2
import numpy as np
from tensorflow.keras.utils import to_categorical

val_dir = "cleandata/test"
img_height, img_width = 32, 32  # define the desired image size
num_classes = 26  # 26 letters only

def preprocess_img(img_path):
    img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
    img = cv2.resize(img, (img_height, img_width))
    img = img.astype(np.float32) / 255.0  # normalize pixel values to [0, 1]
    img = np.expand_dims(img, axis=-1)  # add channel dimension
    return img

X_test = []
y_test = []

# Loop through all subfolders under the validation directory
for subdir in os.listdir(val_dir):
    if not subdir.isalpha() or not subdir.isupper():
        continue
    subdir_path = os.path.join(val_dir, subdir)
    if os.path.isdir(subdir_path):
        # Loop through all images in the current subfolder
        for filename in os.listdir(subdir_path):
            if filename.endswith(".png"):
                img_path = os.path.join(subdir_path, filename)
                img = preprocess_img(img_path)
                X_test.append(img)
                label = ord(subdir) - ord('A')  # convert character label to integer label
                y_test.append(label)

# Convert the data to numpy arrays
X_test = np.array(X_test)
y_test = np.array(y_test)

# One-hot encode the labels
y_test = to_categorical(y_test, num_classes=num_classes)

# Print the shape of the data
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

X_test shape: (2105, 32, 32, 1)
y_test shape: (2105, 26)


In [19]:
import cv2
import numpy as np
from tensorflow.keras.utils import to_categorical

val_dir = "cleandata/validation/val"
img_height, img_width = 32, 32  # define the desired image size
num_classes = 26  # 26 letters only

def preprocess_img(img_path):
    img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
    img = cv2.resize(img, (img_height, img_width))
    img = img.astype(np.float32) / 255.0  # normalize pixel values to [0, 1]
    img = np.expand_dims(img, axis=-1)  # add channel dimension
    return img

X_val = []
y_val = []

# Loop through all subfolders under the validation directory
for subdir in os.listdir(val_dir):
    if not subdir.isalpha() or not subdir.isupper():
        continue
    subdir_path = os.path.join(val_dir, subdir)
    if os.path.isdir(subdir_path):
        # Loop through all images in the current subfolder
        for filename in os.listdir(subdir_path):
            if filename.endswith(".png"):
                img_path = os.path.join(subdir_path, filename)
                img = preprocess_img(img_path)
                X_val.append(img)
                label = ord(subdir) - ord('A')  # convert character label to integer label
                y_val.append(label)

# Convert the data to numpy arrays
X_val = np.array(X_val)
y_val = np.array(y_val)

# One-hot encode the labels
y_val = to_categorical(y_val, num_classes=num_classes)

# Print the shape of the data
print("X_val shape:", X_val.shape)
print("y_val shape:", y_val.shape)

X_val shape: (2819, 32, 32, 1)
y_val shape: (2819, 26)


### CNN MODEL BUILD

In [4]:
from tensorflow.keras import layers, models

# Define the model architecture
model = models.Sequential()
model.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=(32, 32, 1)))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(64, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Flatten())
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(10, activation='softmax'))

# Compile the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Print the model summary
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 30, 30, 32)        320       
                                                                 
 max_pooling2d (MaxPooling2D  (None, 15, 15, 32)       0         
 )                                                               
                                                                 
 conv2d_1 (Conv2D)           (None, 13, 13, 64)        18496     
                                                                 
 max_pooling2d_1 (MaxPooling  (None, 6, 6, 64)         0         
 2D)                                                             
                                                                 
 flatten (Flatten)           (None, 2304)              0         
                                                                 
 dense (Dense)               (None, 64)                1

In [5]:
print("X_train shape : ", X_train.shape)
print("X_val : ", X_val.shape)
print("y_train : ", y_train.shape)
print("y_val :", y_val.shape)

X_train shape :  (4395, 32, 32, 1)
X_val :  (1099, 32, 32, 1)
y_train :  (4395, 10)
y_val : (1099, 10)


In [6]:
from tensorflow.keras.callbacks import ModelCheckpoint


# Compile the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=10, validation_data=(X_val, y_val))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [7]:
from sklearn.metrics import precision_score, recall_score, f1_score

y_pred = model.predict(X_val)
y_pred = np.argmax(y_pred, axis=1)  # convert the predicted probabilities to class labels
y_true = np.argmax(y_val, axis=1)  # convert the one-hot encoded labels to class labels

precision = precision_score(y_true, y_pred, average='macro')
recall = recall_score(y_true, y_pred, average='macro')
f1 = f1_score(y_true, y_pred, average='macro')

print("Precision: {:.2f}%".format(precision*100))
print("Recall: {:.2f}%".format(recall*100))
print("F1 score: {:.2f}%".format(f1*100))

Precision: 99.55%
Recall: 99.54%
F1 score: 99.54%


In [8]:
test_loss, test_acc = model.evaluate(X_test, y_test, verbose=0)
print("Test loss:", test_loss)
print("Test accuracy:", test_acc)

Test loss: 2.4636101443320513e-05
Test accuracy: 1.0


In [9]:
model.save("OCERusingNumber.h5")