In [None]:
from commonfunctions import *

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier  # MLP is an NN
from sklearn import svm
import numpy as np
import argparse
import cv2
import os
import random
from sklearn.model_selection import train_test_split
import pytesseract

import skimage
import skimage.io as io
import matplotlib.pyplot as plt
from skimage import filters, feature
from skimage.color import rgb2gray
from skimage.morphology import binary_erosion, binary_dilation, binary_closing,skeletonize, thin
from skimage.measure import find_contours
from skimage.draw import rectangle
from scipy.ndimage import median_filter
import cv2 as cv

# Convolution:
from scipy.signal import convolve2d
from scipy import fftpack
import math

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.utils import to_categorical

# Path to the dataset
# load_chars74k() joins this root with the "GoodImg/Bmp" and "BadImag/Bmp"
# subfolders, so both must exist underneath it.
DATASET_PATH = "./English/Img"  # Update this path to your Chars74K directory

# Constants
# Target size passed to cv2.resize for every loaded image; also the spatial
# shape load_chars74k() reshapes the image array to.
IMAGE_SIZE = (64, 64)  # Resize images to 64x64
# Width of the one-hot label vectors and default size of the softmax layer.
# NOTE(review): labels are derived from character-folder positions in
# load_chars74k(); confirm the number of character folders on disk matches
# this value.
NUM_CLASSES = 36  # 26 letters + 10 digits

def load_chars74k(dataset_path=DATASET_PATH):
    """Load and preprocess the Chars74K images from both image subsets.

    Every decodable image is read as grayscale, resized to ``IMAGE_SIZE`` and
    scaled into the ``[0, 1]`` range.  Files that OpenCV cannot decode are
    skipped (and counted) instead of crashing the whole load.

    The label of an image is the index of its character folder in the sorted
    list of character-folder names found across *both* subsets.  A given
    folder name therefore maps to the same label in either subset, and stray
    non-directory entries no longer shift the numbering.

    Args:
        dataset_path: Directory that contains the ``GoodImg/Bmp`` and
            ``BadImag/Bmp`` subfolders.

    Returns:
        A tuple ``(images, labels)`` where ``images`` is a float array of
        shape ``(N, IMAGE_SIZE[0], IMAGE_SIZE[1], 1)`` and ``labels`` is an
        integer array of length ``N``.

    Raises:
        FileNotFoundError: If one of the subset directories does not exist.
    """
    images = []
    labels = []

    # Define subfolders for the good- and bad-quality subsets.
    # NOTE(review): 'BadImag' is assumed to be the dataset's own on-disk
    # spelling of the folder -- confirm before "correcting" it.
    folders = ['GoodImg/Bmp', 'BadImag/Bmp']
    subset_paths = [os.path.join(dataset_path, folder) for folder in folders]

    # Build one folder-name -> label mapping shared by both subsets, counting
    # only real directories so files such as thumbnails do not consume labels.
    class_names = sorted({
        entry
        for subset_path in subset_paths
        for entry in os.listdir(subset_path)
        if os.path.isdir(os.path.join(subset_path, entry))
    })
    label_by_name = {name: index for index, name in enumerate(class_names)}

    skipped = 0
    for subset_path in subset_paths:
        for char_folder in sorted(os.listdir(subset_path)):
            char_folder_path = os.path.join(subset_path, char_folder)
            if not os.path.isdir(char_folder_path):
                continue
            label = label_by_name[char_folder]

            # Sorted so the sample order (and hence any seeded split made
            # from it) does not depend on the filesystem's listing order.
            for img_name in sorted(os.listdir(char_folder_path)):
                img_path = os.path.join(char_folder_path, img_name)
                img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
                if img is None:
                    # cv2.imread signals an unreadable/non-image file with
                    # None; resizing it would raise.
                    skipped += 1
                    continue

                # Resize and normalize the image
                img = cv2.resize(img, IMAGE_SIZE)
                images.append(img / 255.0)
                labels.append(label)

    if skipped:
        print(f"Skipped {skipped} unreadable file(s).")

    # Convert to numpy arrays, adding the trailing single-channel axis.
    images = np.array(images).reshape(-1, IMAGE_SIZE[0], IMAGE_SIZE[1], 1)
    labels = np.array(labels)

    return images, labels


def build_cnn_model(input_shape=(64, 64, 1), num_classes=NUM_CLASSES):
    """Assemble and compile the convolutional character classifier.

    The network stacks three 3x3 ReLU convolution stages (32, 64 and 128
    filters), each followed by 2x2 max pooling, then flattens into a
    256-unit ReLU dense layer with 50% dropout and a softmax output.

    Args:
        input_shape: Shape of one input sample, passed to the first layer.
        num_classes: Number of units in the softmax output layer.

    Returns:
        The model, compiled with the Adam optimizer, categorical
        cross-entropy loss and the accuracy metric.
    """
    network = Sequential()

    # Convolutional feature extractor: only the first stage declares the
    # input shape.
    for stage, filter_count in enumerate((32, 64, 128)):
        first_layer_kwargs = {"input_shape": input_shape} if stage == 0 else {}
        network.add(
            Conv2D(filter_count, (3, 3), activation="relu", **first_layer_kwargs)
        )
        network.add(MaxPooling2D((2, 2)))

    # Classification head.
    network.add(Flatten())
    network.add(Dense(256, activation="relu"))
    network.add(Dropout(0.5))
    network.add(Dense(num_classes, activation="softmax"))

    network.compile(
        optimizer="adam",
        loss="categorical_crossentropy",
        metrics=["accuracy"],
    )
    return network


def runOCR():
    """Train, evaluate and save a CNN character classifier.

    Loads the dataset with ``load_chars74k()``, one-hot encodes the labels,
    holds out 20% of the samples (fixed seed 42), trains the model from
    ``build_cnn_model()`` for 10 epochs, prints the held-out accuracy and
    writes the model to ``chars74k_model.h5``.

    The number of classes is ``NUM_CLASSES`` unless the loaded labels need
    more, in which case it is widened to fit the largest label; a fixed width
    would make the one-hot encoding fail on out-of-range labels.

    Raises:
        ValueError: If the loader returned no images.
    """
    # Load and preprocess the dataset
    print("Loading dataset...")
    images, labels = load_chars74k()
    print(f"Loaded {len(images)} images and {len(labels)} labels.")

    if len(labels) == 0:
        raise ValueError("No images were loaded; check the dataset path.")

    # Never encode with fewer columns than the largest label requires.
    num_classes = max(NUM_CLASSES, int(labels.max()) + 1)
    if num_classes != NUM_CLASSES:
        print(f"Found {num_classes} classes (expected {NUM_CLASSES}); using {num_classes}.")

    # One-hot encode labels
    labels = to_categorical(labels, num_classes)

    # Split into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(images, labels, test_size=0.2, random_state=42)

    # Build the CNN model, sized from the data actually loaded
    print("Building model...")
    model = build_cnn_model(input_shape=images.shape[1:], num_classes=num_classes)

    # Train the model.
    # NOTE(review): the held-out split doubles as validation data here, so
    # the final accuracy is only unbiased as long as nothing is tuned on it.
    print("Training model...")
    model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=32)

    # Evaluate the model
    print("Evaluating model...")
    test_loss, test_accuracy = model.evaluate(X_test, y_test)
    print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

    # Save the model
    model.save("chars74k_model.h5")
    print("Model saved as chars74k_model.h5")


# Cell entry point: runs the whole load / train / evaluate / save pipeline as
# soon as this cell (or module) is executed.
runOCR()