<a href="https://colab.research.google.com/github/Aniket21628/OCR/blob/main/OCR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install opencv-python numpy pandas scikit-learn tensorflow keras imutils



In [3]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("sachinpatel21/az-handwritten-alphabets-in-csv-format")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/az-handwritten-alphabets-in-csv-format


In [8]:
import os
import cv2
import numpy as np
import pandas as pd
from keras.datasets import mnist
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

In [5]:
def process_kaggle_dataset():
    """Process Kaggle A-Z handwritten dataset (CSV format) - UPDATED"""
    df = pd.read_csv("/kaggle/input/az-handwritten-alphabets-in-csv-format/A_Z Handwritten Data.csv")
    data = []
    labels = []

    # Use iloc for positional access
    for i in range(len(df)):
        row = df.iloc[i]
        label = int(row.iloc[0])  # First column is label
        pixels = row.iloc[1:].values.astype("uint8")  # Remaining columns are pixels

        # Reshape to 28x28
        pixels = pixels.reshape((28, 28))
        data.append(pixels)
        labels.append(label + 10)  # Offset: A=10, B=11, ..., Z=35

    return np.array(data, dtype="float32"), np.array(labels, dtype="int")

In [6]:
def process_mnist():
    """Process MNIST dataset"""
    (train_data, train_labels), (test_data, test_labels) = mnist.load_data()
    mnist_data = np.vstack([train_data, test_data])
    mnist_labels = np.hstack([train_labels, test_labels])
    return mnist_data.astype("float32"), mnist_labels

In [7]:
!mkdir -p data/surrey

In [8]:
!wget -q http://www.ee.surrey.ac.uk/CVSSP/demos/chars74k/EnglishImg.tgz -O data/english.tgz
!tar -xf data/english.tgz -C data/surrey/

In [9]:
!tar -xzf data/english.tgz -C data/surrey/ --strip-components=1

In [10]:
import os

# Check if files exist
surrey_path = "data/surrey/English/Img/GoodImg/Bmp"
if os.path.exists(surrey_path):
    print(f"Found {len(os.listdir(surrey_path))} character folders")
    sample_folder = os.path.join(surrey_path, os.listdir(surrey_path)[0])
    print(f"Sample folder contains {len(os.listdir(sample_folder))} images")
else:
    print("Dataset not found. Manual download required.")
    print("Please download from: https://github.com/sachinpatel21/Chars74k_Dataset")

Found 62 character folders
Sample folder contains 36 images


In [11]:
def process_surrey_dataset():
    """Process University of Surrey dataset (image files)"""
    SURREY_PATH = "data/surrey/English/Img/GoodImg/Bmp"
    characters = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"

    data = []
    labels = []

    # Sort folders numerically (Sample001, Sample002, etc)
    folders = sorted(
        [f for f in os.listdir(SURREY_PATH) if f.startswith("Sample")],
        key=lambda x: int(x[6:])
    )

    for idx, folder in enumerate(folders):
        char_path = os.path.join(SURREY_PATH, folder)
        if not os.path.isdir(char_path):
            continue

        for img_file in os.listdir(char_path):
            if img_file.startswith("."):
                continue

            img_path = os.path.join(char_path, img_file)
            img = cv2.imread(img_path)

            if img is None:
                continue

            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            resized = cv2.resize(gray, (28, 28))
            data.append(resized)
            labels.append(idx)  # Use folder index as label

    return np.array(data, dtype="float32"), np.array(labels, dtype="int")

In [12]:
print("Processing MNIST dataset...")
mnist_data, mnist_labels = process_mnist()

print("Processing Kaggle A-Z dataset...")
kaggle_data, kaggle_labels = process_kaggle_dataset()

print("Processing Surrey dataset...")
surrey_data, surrey_labels = process_surrey_dataset()

# Verify datasets
print(f"MNIST: {mnist_data.shape[0]} images")
print(f"Kaggle: {kaggle_data.shape[0]} images")
print(f"Surrey: {surrey_data.shape[0]} images")


Processing MNIST dataset...
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
[1m11490434/11490434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Processing Kaggle A-Z dataset...
Processing Surrey dataset...
MNIST: 70000 images
Kaggle: 372450 images
Surrey: 7705 images


In [13]:
combined_data = np.vstack([mnist_data, kaggle_data, surrey_data])
combined_labels = np.hstack([mnist_labels, kaggle_labels, surrey_labels])

In [14]:
np.save("combined_data.npy", combined_data)
np.save("combined_labels.npy", combined_labels)
print(f"Combined dataset size: {combined_data.shape[0]} images")

Combined dataset size: 450155 images


In [11]:
# Load combined dataset
data = np.load("combined_data.npy")
labels = np.load("combined_labels.npy")

In [12]:
# Resize to 32x32 for ResNet
resized_data = np.array([cv2.resize(img, (32, 32)) for img in data])
resized_data = np.expand_dims(resized_data, axis=-1)  # Add channel dimension
resized_data = resized_data / 255.0  # Normalize

In [13]:
# One-hot encode labels
label_bin = LabelBinarizer()
bin_labels = label_bin.fit_transform(labels)

In [14]:
# Split into train/test sets
(trainX, testX, trainY, testY) = train_test_split(
    resized_data, bin_labels, test_size=0.25, random_state=42
)

print(f"Train shape: {trainX.shape}, Test shape: {testX.shape}")

Train shape: (337616, 32, 32, 1), Test shape: (112539, 32, 32, 1)


In [1]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Input, Conv2D, BatchNormalization, Activation,
    Add, AveragePooling2D, Flatten, Dense
)
from tensorflow.keras.regularizers import l2

In [2]:
def residual_module(x, num_filters, stride, reduce=False):
    """ResNet residual module"""
    shortcut = x

    # Block 1: 1x1 conv
    x = BatchNormalization()(x)
    x = Activation("relu")(x)
    x = Conv2D(int(num_filters * 0.25), (1, 1), use_bias=False)(x)

    # Block 2: 3x3 conv
    x = BatchNormalization()(x)
    x = Activation("relu")(x)
    x = Conv2D(int(num_filters * 0.25), (3, 3), strides=stride,
               padding="same", use_bias=False)(x)

    # Block 3: 1x1 conv
    x = BatchNormalization()(x)
    x = Activation("relu")(x)
    x = Conv2D(num_filters, (1, 1), use_bias=False)(x)

    # Shortcut connection
    if reduce:
        shortcut = Conv2D(num_filters, (1, 1), strides=stride,
                         use_bias=False)(shortcut)

    x = Add()([x, shortcut])
    return x


In [3]:
def build_resnet(input_shape=(32, 32, 1), num_classes=62):
    """Build ResNet model for OCR"""
    inputs = Input(shape=input_shape)

    # Initial processing
    x = BatchNormalization()(inputs)

    # Stage 1
    x = residual_module(x, 64, (1, 1))

    # Stage 2 (reduce spatial size)
    x = residual_module(x, 128, (2, 2), reduce=True)

    # Stage 3 (reduce spatial size)
    x = residual_module(x, 256, (2, 2), reduce=True)

    # Final layers
    x = BatchNormalization()(x)
    x = Activation("relu")(x)
    x = AveragePooling2D((8, 8))(x)
    x = Flatten()(x)
    x = Dense(num_classes, activation="softmax")(x)

    return Model(inputs, x)

In [4]:
model = build_resnet()
model.summary()

In [15]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint

# Compile model
model.compile(
    loss="categorical_crossentropy",
    optimizer=Adam(learning_rate=0.001),
    metrics=["accuracy"]
)

# Callbacks
checkpoint = ModelCheckpoint(
    "ocr_model.h5", save_best_only=True, monitor="val_accuracy", verbose=1
)

# Train
history = model.fit(
    trainX, trainY,
    validation_data=(testX, testY),
    epochs=20,
    batch_size=256,
    callbacks=[checkpoint]
)

Epoch 1/20
[1m1319/1319[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - accuracy: 0.6526 - loss: 1.4160
Epoch 1: val_accuracy improved from -inf to 0.91694, saving model to ocr_model.h5




[1m1319/1319[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 41ms/step - accuracy: 0.6527 - loss: 1.4155 - val_accuracy: 0.9169 - val_loss: 0.3277
Epoch 2/20
[1m1319/1319[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - accuracy: 0.9438 - loss: 0.2241
Epoch 2: val_accuracy improved from 0.91694 to 0.91968, saving model to ocr_model.h5




[1m1319/1319[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 34ms/step - accuracy: 0.9438 - loss: 0.2241 - val_accuracy: 0.9197 - val_loss: 0.2850
Epoch 3/20
[1m1319/1319[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - accuracy: 0.9562 - loss: 0.1673
Epoch 3: val_accuracy did not improve from 0.91968
[1m1319/1319[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 33ms/step - accuracy: 0.9562 - loss: 0.1673 - val_accuracy: 0.1601 - val_loss: 9.0998
Epoch 4/20
[1m1319/1319[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - accuracy: 0.9631 - loss: 0.1385
Epoch 4: val_accuracy did not improve from 0.91968
[1m1319/1319[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 35ms/step - accuracy: 0.9631 - loss: 0.1385 - val_accuracy: 0.5519 - val_loss: 3.3207
Epoch 5/20
[1m1319/1319[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - accuracy: 0.9667 -



[1m1319/1319[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 35ms/step - accuracy: 0.9809 - loss: 0.0677 - val_accuracy: 0.9281 - val_loss: 0.2793
Epoch 14/20
[1m1319/1319[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - accuracy: 0.9822 - loss: 0.0623
Epoch 14: val_accuracy did not improve from 0.92811
[1m1319/1319[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 33ms/step - accuracy: 0.9822 - loss: 0.0623 - val_accuracy: 0.2217 - val_loss: 40.5890
Epoch 15/20
[1m1319/1319[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - accuracy: 0.9837 - loss: 0.0576
Epoch 15: val_accuracy did not improve from 0.92811
[1m1319/1319[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 33ms/step - accuracy: 0.9837 - loss: 0.0577 - val_accuracy: 0.3721 - val_loss: 17.8441
Epoch 16/20
[1m1319/1319[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - accuracy: 0

In [16]:
from sklearn.metrics import classification_report

# Load best model
model.load_weights("ocr_model.h5")

# Predictions
preds = model.predict(testX, batch_size=256)
pred_labels = np.argmax(preds, axis=1)
true_labels = np.argmax(testY, axis=1)

# Classification report
characters = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
print(classification_report(
    true_labels, pred_labels,
    target_names=list(characters)
))


[1m440/440[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 10ms/step
              precision    recall  f1-score   support

           0       0.97      0.36      0.52      1692
           1       1.00      0.35      0.52      1936
           2       0.96      0.78      0.86      1815
           3       1.00      0.89      0.94      1808
           4       0.98      0.85      0.91      1713
           5       0.98      0.80      0.88      1597
           6       1.00      0.76      0.86      1725
           7       0.98      0.91      0.94      1845
           8       0.94      0.92      0.93      1720
           9       0.94      0.96      0.95      1782
           A       0.87      0.99      0.93      3681
           B       0.93      0.96      0.94      2215
           C       0.96      0.98      0.97      5816
           D       0.95      0.89      0.92      2531
           E       0.96      0.98      0.97      2957
           F       0.95      0.95      0.95       287
     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
