# Text Identification OCR - Minimal Version (For Memorization)

Pipeline: OpenCV preprocessing, Tesseract OCR text extraction, and CNN classification of text images into categories (numbers, letters, words, mixed).


In [None]:
# Imports
import numpy as np
import cv2
import pytesseract
from PIL import Image, ImageDraw, ImageFont
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Seed the NumPy and TensorFlow random number generators with the same fixed
# value so that repeated runs start from the same random state.
np.random.seed(42)
tf.random.set_seed(42)


In [None]:
# Create synthetic text images
def create_text_image(text, width=200, height=50, font_size=30):
    """Render ``text`` in black on a white RGB canvas and return the image.

    Args:
        text: String to draw.
        width: Canvas width in pixels.
        height: Canvas height in pixels.
        font_size: Size requested for the ``arial.ttf`` TrueType font.

    Returns:
        A ``PIL.Image.Image`` in ``'RGB'`` mode of size ``(width, height)``
        with the text centred on the canvas.
    """
    img = Image.new('RGB', (width, height), 'white')
    draw = ImageDraw.Draw(img)
    try:
        font = ImageFont.truetype("arial.ttf", font_size)
    except OSError:
        # truetype() raises OSError when the font file cannot be opened;
        # only that case should trigger the fallback (a bare ``except``
        # would also hide KeyboardInterrupt and programming errors).
        # NOTE(review): the default font may not honour ``font_size`` --
        # confirm against the installed Pillow version.
        font = ImageFont.load_default()
    left, top, right, bottom = draw.textbbox((0, 0), text, font=font)
    # textbbox() describes the ink box for text anchored at (0, 0); its
    # left/top edges are generally non-zero, so they must be subtracted or
    # the text ends up shifted right/down and may be clipped by the canvas.
    x = (width - (right - left)) // 2 - left
    y = (height - (bottom - top)) // 2 - top
    draw.text((x, y), text, fill='black', font=font)
    return img

# Build the synthetic dataset: one rendered image per sample string, labelled
# with the name of the category the string belongs to.
text_categories = {
    'numbers': ['123', '456', '789', '012', '345'],
    'letters': ['ABC', 'DEF', 'GHI', 'JKL', 'MNO'],
    'words': ['HELLO', 'WORLD', 'TEXT', 'IMAGE', 'OCR'],
    'mixed': ['A1B2', 'C3D4', 'E5F6', 'G7H8', 'I9J0']
}

# Flatten the mapping into (sample string, category name) pairs, keeping the
# dictionary's insertion order.
samples = [
    (word, group)
    for group, words in text_categories.items()
    for word in words
]
images = [np.array(create_text_image(word)) for word, _ in samples]
labels = [group for _, group in samples]

X = np.array(images)
y = np.array(labels)
print(f"Dataset: {X.shape}, Labels: {len(labels)}")


In [None]:
# Preprocess with OpenCV
def preprocess_image(image, target_size=(128, 128)):
    """Binarise, clean up and resize one image, returning a float32 array.

    Steps, in order: convert to grayscale when the array has three
    dimensions (otherwise work on a copy), apply a 5x5 Gaussian blur,
    threshold at 127 to a 0/255 binary image, apply a 3x3 morphological
    closing, resize to ``target_size`` and divide by 255.

    Args:
        image: Image array; three-dimensional input is converted with
            ``cv2.COLOR_RGB2GRAY``.
        target_size: Size tuple handed to ``cv2.resize`` unchanged.

    Returns:
        The processed image as ``float32`` values scaled by 1/255.
    """
    is_color = image.ndim == 3
    gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY) if is_color else image.copy()
    smoothed = cv2.GaussianBlur(gray, (5, 5), 0)
    # threshold() returns (threshold used, image); only the image is needed.
    binary = cv2.threshold(smoothed, 127, 255, cv2.THRESH_BINARY)[1]
    closed = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, np.ones((3, 3), np.uint8))
    return cv2.resize(closed, target_size).astype('float32') / 255.0

# Run every raw image through the preprocessing function and stack the results.
X_processed = np.array(list(map(preprocess_image, X)))
print(f"Processed shape: {X_processed.shape}")


In [None]:
# Extract text with Tesseract OCR
def extract_text_ocr(image):
    """Run Tesseract on one image and return the recognised text.

    Args:
        image: Array-like image. ``uint8`` data is used as-is; any other
            dtype is treated as intensities in ``[0, 1]``, clipped to that
            range and scaled to ``0..255``.

    Returns:
        The OCR output with surrounding whitespace stripped, or ``""`` when
        Tesseract cannot be run or reports an error. A ``RuntimeWarning``
        is issued in that case so the failure is visible instead of silent.
    """
    import warnings

    pixels = np.asarray(image)
    if pixels.dtype == np.uint8:
        # Already 8-bit: multiplying by 255 would wrap around.
        img_uint8 = pixels
    else:
        # Clip first so out-of-range values saturate instead of wrapping
        # during the cast to uint8.
        img_uint8 = (np.clip(pixels, 0.0, 1.0) * 255).astype(np.uint8)
    pil_image = Image.fromarray(img_uint8)
    try:
        text = pytesseract.image_to_string(pil_image, config='--psm 7 --oem 3').strip()
    except (pytesseract.TesseractError, OSError) as exc:
        # Stay best-effort (empty string), but only for OCR/engine failures:
        # TesseractError for a failing run, OSError for a missing binary.
        warnings.warn(f"Tesseract OCR failed: {exc}", RuntimeWarning, stacklevel=2)
        text = ""
    return text

# OCR each preprocessed image, keeping results in dataset order.
extracted_texts = list(map(extract_text_ocr, X_processed))
print(f"OCR extracted: {len(extracted_texts)} texts")


In [None]:
# Map the category names to integer ids.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
num_classes = len(label_encoder.classes_)

# Append a trailing channel axis of size 1 for the CNN and one-hot encode
# the integer labels.
X_cnn = X_processed.reshape(X_processed.shape[:3] + (1,))
y_onehot = keras.utils.to_categorical(y_encoded, num_classes=num_classes)

print(f"CNN input: {X_cnn.shape}, Labels: {y_onehot.shape}")


In [None]:
# Hold out 20% of the samples for testing. The split is stratified on the
# integer labels and uses a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(
    X_cnn,
    y_onehot,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)
print(f"Train: {X_train.shape}, Test: {X_test.shape}")


In [None]:
# Build CNN model: three Conv2D -> MaxPooling2D -> Dropout stages (32, 64 and
# 128 filters), followed by a dense classifier head with a softmax output of
# num_classes units.
model = keras.Sequential([
    # Take the input shape from the training data instead of hard-coding
    # (128, 128, 1), so the model follows whatever size preprocessing
    # produced. An explicit Input object also replaces the ``input_shape=``
    # layer argument, which recent Keras versions warn about.
    keras.Input(shape=X_train.shape[1:]),
    layers.Conv2D(32, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Dropout(0.25),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Dropout(0.25),
    layers.Conv2D(128, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Dropout(0.25),
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(num_classes, activation='softmax')
])

model.summary()


In [None]:
# Configure training: Adam optimizer, categorical cross-entropy loss (the
# labels are one-hot vectors), and accuracy as the reported metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)


In [None]:
# Fit for 30 epochs in batches of 16, with 20% of the training data set aside
# for validation. The returned History object is kept for plotting.
history = model.fit(
    X_train,
    y_train,
    epochs=30,
    batch_size=16,
    validation_split=0.2,
    verbose=1,
)


In [None]:
# Score the model on the held-out test set.
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=1)
print(f"Test Accuracy: {test_accuracy:.4f}")

# Collapse the predicted probabilities and the one-hot targets back to
# integer class ids.
y_pred_proba = model.predict(X_test)
y_pred = y_pred_proba.argmax(axis=1)
y_true = y_test.argmax(axis=1)

# Per-class metrics, labelled with the original category names.
report = classification_report(y_true, y_pred, target_names=label_encoder.classes_)
print(report)


In [None]:
# Plot training history: accuracy on the left panel, loss on the right, each
# showing the training and validation curves per epoch.
plt.figure(figsize=(12, 4))
panels = [
    ('accuracy', 'val_accuracy', 'Accuracy'),
    ('loss', 'val_loss', 'Loss'),
]
for position, (train_key, val_key, axis_label) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.plot(history.history[train_key], label='Train')
    plt.plot(history.history[val_key], label='Val')
    plt.xlabel('Epoch')
    plt.ylabel(axis_label)
    plt.legend()
    plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
# Confusion matrix of true vs. predicted class ids, drawn as a heat map with
# the count written in every cell.
class_names = label_encoder.classes_
class_ids = np.arange(len(class_names))
# Pin the label set explicitly: otherwise the matrix only covers classes that
# appear in y_true or y_pred and could be smaller than the tick labels.
cm = confusion_matrix(y_true, y_pred, labels=class_ids)
plt.figure(figsize=(6, 5))
plt.imshow(cm, cmap='Blues')
plt.colorbar()
plt.xticks(class_ids, class_names)
plt.yticks(class_ids, class_names)
# Rows are true classes (y axis), columns are predicted classes (x axis).
for (i, j), count in np.ndenumerate(cm):
    plt.text(j, i, count, ha='center', va='center', fontsize=14, fontweight='bold')
plt.ylabel('True')
plt.xlabel('Predicted')
plt.title('Confusion Matrix')
plt.tight_layout()
plt.show()


In [None]:
# Save model
# Writes the trained model to 'text_identification_model.h5' (a relative
# path, so it is resolved against the current working directory).
# NOTE(review): the .h5 suffix presumably selects Keras' legacy HDF5 format --
# confirm whether the native .keras format is preferred for this project.
model.save('text_identification_model.h5')
print("Model saved!")
