Import stuff

In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Input
import os
import pandas as pd
import numpy as np
import cv2
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, LSTM, Bidirectional, Reshape, Attention
from tensorflow.keras.models import Model
from sklearn.preprocessing import LabelEncoder
import pickle
import shutil
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import zipfile

Mounting G Drive

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
os.chdir('/content/drive/My Drive/project') # Replace 'your_dataset_folder'

THIS PART IS WHERE DATA PREPARATION PART

First, we unify the dataset


In [2]:

RAW_DIR = "raw_datasets"
PROCESSED_DIR = "data/processed"
IMAGE_DIR = os.path.join(PROCESSED_DIR, "images")
LABELS_CSV = os.path.join(PROCESSED_DIR, "labels.csv")


def ensure_dirs():
    os.makedirs(IMAGE_DIR, exist_ok=True)


def dataset_already_processed():
    """Check if processed images exist to avoid redundant processing."""
    return os.path.exists(LABELS_CSV) and len(os.listdir(IMAGE_DIR)) > 0

def decompress_raw_datasets():
    """Extract ZIP files only if they haven't been extracted already."""
    zip_files = [f for f in os.listdir(RAW_DIR) if f.endswith(".zip")]

    for zip_file in zip_files:
        zip_path = os.path.join(RAW_DIR, zip_file)
        extract_folder = os.path.join(RAW_DIR, zip_file.replace(".zip", ""))  # Expected extraction folder

        if os.path.exists(extract_folder) and len(os.listdir(extract_folder)) > 0:
            print(f"✅ {zip_file} is already extracted, skipping...")
        else:
            print(f"📦 Extracting {zip_file}...")
            with zipfile.ZipFile(zip_path, "r") as zip_ref:
                zip_ref.extractall(RAW_DIR)
            print(f"✅ Extraction complete: {zip_file}")


def process_ocr_dataset():
    """Process character-based dataset from OCR_dataset."""
    dataset_path = os.path.join(RAW_DIR, "OCR_dataset")
    data = []

    for data_folder in ["data/training_data", "data/testing_data", "data2/training_data", "data2/testing_data"]:
        full_path = os.path.join(dataset_path, data_folder)
        if not os.path.exists(full_path):
            continue

        for char_folder in os.listdir(full_path):
            char_folder_path = os.path.join(full_path, char_folder)
            if os.path.isdir(char_folder_path):
                for img_name in os.listdir(char_folder_path):
                    src_path = os.path.join(char_folder_path, img_name)
                    if os.path.exists(src_path):
                        new_name = f"{char_folder}_{img_name}"
                        dst_path = os.path.join(IMAGE_DIR, new_name)
                        shutil.copy(src_path, dst_path)
                        data.append((dst_path, char_folder))

    return data

def process_df_resized():
    """Extract labels from df_resized filenames and move images."""
    dataset_path = os.path.join(RAW_DIR, "df_resized")
    data = []

    for img_name in os.listdir(dataset_path):
        src_path = os.path.join(dataset_path, img_name)
        if os.path.exists(src_path):
            try:
                label = img_name.split("_")[-1].split(".")[0]  # Extract character after '_'
                new_name = f"{label}_{img_name}"
                dst_path = os.path.join(IMAGE_DIR, new_name)
                shutil.copy(src_path, dst_path)
                data.append((dst_path, label))
            except IndexError:
                print(f"Skipping {img_name}, unable to extract label.")

    return data

def process_handwriting_ds():
    """Process the newly added dataset (A, a_lower, 0, etc.) into a unified format."""
    extracted_path = os.path.join(RAW_DIR, "handwritting_ds")
    data = []

    if not os.path.exists(extracted_path):
        print("❌ Extracted dataset folder not found!")
        return data

    # Process each character folder (A, a_lower, 0, etc.)
    for label_folder in os.listdir(extracted_path):
        label_path = os.path.join(extracted_path, label_folder)

        if os.path.isdir(label_path):
            # Extract correct label
            label = label_folder.replace("_lower", "")  # Normalize lowercase labels

            # Scan subfolders (hsf_1 to hsf_7, train)
            for subfolder in os.listdir(label_path):
                subfolder_path = os.path.join(label_path, subfolder)

                if os.path.isdir(subfolder_path):
                    for img_name in os.listdir(subfolder_path):
                        src_path = os.path.join(subfolder_path, img_name)
                        if os.path.exists(src_path):
                            new_name = f"{label}_{img_name}"  # Ensure unique naming
                            dst_path = os.path.join(IMAGE_DIR, new_name)
                            shutil.copy(src_path, dst_path)
                            data.append((dst_path, label))

    print(f"✅ Processed {len(data)} images from the new dataset.")
    return data


def unify_dataset():
    ensure_dirs()

    # Check if we already processed datasets
    if dataset_already_processed():
        print("✅ Processed dataset already exists. Only adding new data...")
        # Append new data to existing CSV
        handwriting_data = process_handwriting_ds()
        df_existing = pd.read_csv(LABELS_CSV) if os.path.exists(LABELS_CSV) else pd.DataFrame(columns=["image_path", "label"])
        df_new = pd.DataFrame(handwriting_data, columns=["image_path", "label"])
        df_combined = pd.concat([df_existing, df_new], ignore_index=True)
        df_combined.to_csv(LABELS_CSV, index=False)
        print(f"✅ Added {len(handwriting_data)} new images. Total: {len(df_combined)}")
    else:
        print("🚀 No processed dataset found. Processing from raw datasets...")
        # ocr_data = process_ocr_dataset()
        df_resized_data = process_df_resized()
        handwriting_data = process_handwriting_ds()
        all_data = df_resized_data + handwriting_data
        df = pd.DataFrame(all_data, columns=["image_path", "label"])
        df.to_csv(LABELS_CSV, index=False)
        print(f"Dataset processing complete. {len(df)} images processed.")


In [None]:
decompress_raw_datasets()

✅ df_resized.zip is already extracted, skipping...
✅ OCR_dataset.zip is already extracted, skipping...
✅ handwritting_ds.zip is already extracted, skipping...


In [3]:
unify_dataset()

🚀 No processed dataset found. Processing from raw datasets...
✅ Processed 731668 images from the new dataset.


NameError: name 'ocr_data' is not defined

THEN WE SPLIT THE DATASETS

In [5]:
PROCESSED_DIR = "data/processed"
LABELS_CSV = os.path.join(PROCESSED_DIR, "labels.csv")

def check_dataset():
    df = pd.read_csv(LABELS_CSV)
    # Count occurrences of each label
    label_counts = df["label"].value_counts()
    # Print classes that have fewer than 5 samples
    rare_classes = label_counts[label_counts < 5]
    print("Rare classes (fewer than 5 samples):")
    print(rare_classes)
    # Remove classes with fewer than 5 samples
    rare_classes = label_counts[label_counts < 5].index
    df = df[~df["label"].isin(rare_classes)]
    print(df["label"].value_counts())


def split_dataset():
    """Splits the dataset into train, validation, and test sets."""
    df = pd.read_csv(LABELS_CSV)

    # Remove NaN values in 'label' column
    df = df.dropna(subset=["label"])

    # Count occurrences of each label
    label_counts = df["label"].value_counts()

    # # Filter out labels that appear only once
    # rare_classes = label_counts[label_counts < 2].index
    # df = df[~df["label"].isin(rare_classes)]

    # Replace rare labels with 'OTHER'
    # df["label"] = df["label"].apply(lambda x: x if label_counts[x] >= 5 else "OTHER")

    # Ensure only labeled data is used for splitting
    df_labeled = df[df["label"] != "UNKNOWN"]

    # Check again if there are any missing labels after cleaning
    if df_labeled["label"].isnull().sum() > 0:
        print("Warning: Some labels are still missing!")


    print(df_labeled)

    # Stratified split to maintain label distribution
    train, temp = train_test_split(df_labeled, test_size=0.2, random_state=42)
    val, test = train_test_split(temp, test_size=0.5, random_state=42)

    # Save the splits
    train.to_csv(os.path.join(PROCESSED_DIR, "train.csv"), index=False)
    val.to_csv(os.path.join(PROCESSED_DIR, "val.csv"), index=False)
    test.to_csv(os.path.join(PROCESSED_DIR, "test.csv"), index=False)

    print(f"Dataset split complete: {len(train)} train, {len(val)} validation, {len(test)} test.")


In [6]:
check_dataset()

Rare classes (fewer than 5 samples):
Series([], Name: count, dtype: int64)
label
1    38049
7    35796
3    35293
0    34803
2    34184
     ...  
f     2513
K     2473
y     2438
p     2432
j     1927
Name: count, Length: 62, dtype: int64


In [7]:
split_dataset()

                                        image_path label
0       data/processed/images/a_train_61_09132.png     a
1               data/processed/images/y_1468_y.png     y
2       data/processed/images/P_train_50_04964.png     P
3       data/processed/images/9_train_39_01395.png     9
4       data/processed/images/4_train_34_03026.png     4
...                                            ...   ...
733603  data/processed/images/S_train_53_05266.png     S
733604  data/processed/images/5_train_35_29485.png     5
733605  data/processed/images/V_train_56_03929.png     V
733606  data/processed/images/4_train_34_04750.png     4
733607  data/processed/images/5_train_35_06466.png     5

[733608 rows x 2 columns]
Dataset split complete: 586886 train, 73361 validation, 73361 test.


Augmenting the data and preprocessing

In [9]:
# Constants
IMG_HEIGHT = 64
IMG_WIDTH = 64
DATASET_DIR = "data/processed"
AUGMENTED_DIR = "data/augmented"

# Ensure output directory exists
os.makedirs(AUGMENTED_DIR, exist_ok=True)


In [10]:

def preprocess_image(image_path):
    """Loads an image, converts to grayscale, resizes, and normalizes."""
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)  # Convert to grayscale
    # Check if the image was loaded properly
    if image is None:
        print(f"⚠️ Warning: Unable to read image {image_path}. Skipping...")
    else:
        # Resize only if the image is valid
        image = cv2.resize(image, (IMG_WIDTH, IMG_HEIGHT))  # Resize
        # image = cv2.resize(image, (IMG_WIDTH, IMG_HEIGHT))  # Resize
        image = image.astype(np.float32) / 255.0  # Normalize (0-1 range)
    return image

def augment_image(image):
    """Applies augmentation using TensorFlow operations."""
    image = tf.convert_to_tensor(image, dtype=tf.float32)  # Convert to tensor
    image = tf.expand_dims(image, axis=-1)  # Add channel dimension (from [H, W] to [H, W, 1])

    # Apply augmentations
    image = tf.image.random_brightness(image, max_delta=0.2)  # Random brightness
    image = tf.image.random_contrast(image, lower=0.8, upper=1.2)  # Contrast variation
    image = tf.image.random_flip_left_right(image)  # Horizontal flip
    # image = tf.image.random_translation(image, translations=[3, 3])  # Small random shift
    image = tf.image.random_jpeg_quality(image, min_jpeg_quality=80, max_jpeg_quality=100)  # Compression artifacts

    return image.numpy()  # Convert back to NumPy

def process_dataset(csv_filename, output_csv):
    """Processes images listed in a CSV and saves them in the augmented directory."""
    df = pd.read_csv(os.path.join(DATASET_DIR, csv_filename))
    new_data = []

    for _, row in tqdm(df.iterrows(), total=len(df)):
        image_path, label = row["image_path"], row["label"]
        image = preprocess_image(image_path)

        if image is None:
            continue

        # Save preprocessed image
        new_path = os.path.join(AUGMENTED_DIR, os.path.basename(image_path))
        cv2.imwrite(new_path, (image * 255).astype(np.uint8))

        # Save augmented versions
        for i in range(3):  # 3 augmented versions per image
            aug_image = augment_image(image)
            aug_path = os.path.join(AUGMENTED_DIR, f"aug_{i}_" + os.path.basename(image_path))
            cv2.imwrite(aug_path, (aug_image * 255).astype(np.uint8))
            new_data.append([aug_path, label])

        new_data.append([new_path, label])

    # Save updated CSV
    df_new = pd.DataFrame(new_data, columns=["image_path", "label"])
    df_new.to_csv(os.path.join(AUGMENTED_DIR, output_csv), index=False)

    print(f"✅ Processed {csv_filename} → {output_csv} ({len(df_new)} images)")


In [11]:
process_dataset("train.csv", "train_augmented.csv")
# process_dataset("val.csv", "val_augmented.csv")
# process_dataset("test.csv", "test_augmented.csv")

100%|██████████| 586886/586886 [1:53:50<00:00, 85.92it/s]   


✅ Processed train.csv → train_augmented.csv (2347544 images)


This is where we actually create the model

BUILDING THE MODELS

In [3]:

# Define input shape
IMG_WIDTH = 64
IMG_HEIGHT = 64

def transformer_encoder(inputs, head_size, num_heads, ff_dim, dropout=0):
    """Creates a Transformer Encoder block."""
    x = layers.LayerNormalization()(inputs)
    x = layers.MultiHeadAttention(key_dim=head_size, num_heads=num_heads)(x, x)
    x = layers.Dropout(dropout)(x)
    x = layers.Add()([x, inputs])

    res = layers.LayerNormalization()(x)
    res = layers.Dense(ff_dim, activation="relu")(res)
    res = layers.Dropout(dropout)(res)
    res = layers.Dense(inputs.shape[-1])(res)
    x = layers.Add()([x, res])

    return x

def build_ocr_model(NUM_CLASSES):
    """Builds a CNN-Transformer model for handwriting recognition."""

    # Input layer
    input_img = keras.Input(shape=(IMG_HEIGHT, IMG_WIDTH, 1), name="image")

    # CNN feature extraction
    x = layers.Conv2D(64, (3, 3), activation="relu", padding="same")(input_img)
    x = layers.MaxPooling2D(pool_size=(2, 2))(x)
    x = layers.Conv2D(128, (3, 3), activation="relu", padding="same")(x)
    x = layers.MaxPooling2D(pool_size=(2, 2))(x)
    x = layers.Conv2D(256, (3, 3), activation="relu", padding="same")(x)
    x = layers.MaxPooling2D(pool_size=(2, 2))(x)
    # x = layers.Conv2D(256, (3, 3), activation="relu", padding="same")(x)
    # x = layers.MaxPooling2D(pool_size=(2, 2))(x)
    x = layers.Conv2D(128, (3, 3), activation="relu", padding="same")(x)
    x = layers.MaxPooling2D(pool_size=(2, 2))(x)

    # Reshape for Transformer
    new_shape = x.shape[1] * x.shape[2]
    x = layers.Reshape(target_shape=(new_shape, x.shape[3]))(x)

    # Transformer Encoder Layers
    x = transformer_encoder(x, head_size=64, num_heads=4, ff_dim=256, dropout=0.1)
    x = transformer_encoder(x, head_size=128, num_heads=4, ff_dim=256, dropout=0.1)
    x = transformer_encoder(x, head_size=256, num_heads=4, ff_dim=256, dropout=0.1)
    x = transformer_encoder(x, head_size=64, num_heads=4, ff_dim=256, dropout=0.1)

    # Fully connected output
    x = layers.GlobalAveragePooling1D()(x)
    x = layers.Dropout(0.5)(x)

    # Output layer for classification (softmax activation)

    print("Num Classes model : ", NUM_CLASSES)
    output = layers.Dense(NUM_CLASSES, activation="softmax", name="output")(x)
    print(output)

    # Model compilation
    model = keras.Model(inputs=input_img, outputs=output, name="bandungbondowoso")
    model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

    return model



This only for preview

In [4]:
# # Build the model
import keras.backend as K
K.clear_session()  # Clear the old model session
ocr_model = build_ocr_model(63)
ocr_model.summary()

Num Classes model :  63
<KerasTensor shape=(None, 63), dtype=float32, sparse=False, name=keras_tensor_47>


In [4]:
import os
import pandas as pd

DATA_DIR = "data/processed/images"
DATA_DIR_PROCESSED = "data/processed" # For easy access the csv
CSV_PATH = os.path.join(DATA_DIR_PROCESSED, "train_augmented.csv")

def create_train_csv(data_dir, csv_path):
    """
    Generates a CSV mapping image paths to labels.
    - Extracts labels from filenames (everything before the first `_`).

    Args:
        data_dir (str): Path to directory containing augmented images.
        csv_path (str): Path to save the generated CSV file.

    Returns:
        None
    """
    print(f"📂 Scanning images in: {data_dir}")

    image_paths = []
    labels = []

    for file in os.listdir(data_dir):
        if file.endswith((".png", ".jpg", ".jpeg")):  # Ensure it's an image
            img_path = os.path.join(data_dir, file)

            # Extract label (text before first `_`)
            label = file.split("_")[0]

            image_paths.append(img_path)
            labels.append(label)

    # **Create DataFrame & Save CSV**
    df = pd.DataFrame({"image_path": image_paths, "label": labels})
    df.to_csv(csv_path, index=False)

    print(f"✅ CSV saved: {csv_path} ({len(df)} samples)")

# **Run the function**
create_train_csv(DATA_DIR, CSV_PATH)


📂 Scanning images in: data/processed/images
✅ CSV saved: data/processed/train_augmented.csv (733608 samples)


This is the preparation script for training

In [5]:
from tensorflow.keras.utils import to_categorical

DATA_DIR = "data/augmented"
IMAGE_DIR = "data/processed"
TRAIN_CSV = os.path.join(IMAGE_DIR, "train_augmented.csv")
VAL_CSV = os.path.join(IMAGE_DIR, "val.csv")  # Non-augmented validation data
TEST_CSV = os.path.join(IMAGE_DIR, "test.csv")  # Non-augmented test data

IMG_WIDTH, IMG_HEIGHT = 64, 64
BATCH_SIZE = 8 # change this to reduce memory usage (the ideal value is 32)
EPOCHS = 12
AUTOTUNE = tf.data.experimental.AUTOTUNE
CHECKPOINT_PATH = "models/bandungbondowoso.keras"
NUM_CLASSES = 64

# **2️⃣ Data Loading Functions**
def load_data(csv_path, sample_fraction=0.2, random_state=42):
    """Loads image paths and labels from a CSV file with stratified sampling."""
    print(f"📂 Loading data from: {csv_path} (Sampling {sample_fraction * 100}%)")
    chunksize = 10000  # Adjust based on memory constraints
    chunks = []

    for chunk in pd.read_csv(csv_path, chunksize=chunksize):
        sampled_chunk = chunk.groupby("label", group_keys=False).apply(lambda x: x.sample(frac=sample_fraction, random_state=random_state))
        chunks.append(sampled_chunk)

    df = pd.concat(chunks, ignore_index=True)
    image_paths = df["image_path"].values
    labels = df["label"].astype(str).values

    # **Encode Labels**
    label_encoder = LabelEncoder()
    labels = label_encoder.fit_transform(labels)
    num_classes = len(label_encoder.classes_)

    print(f"Labels shape: {labels.shape}")
    # # Check if labels are one-hot encoded
    # if len(labels.shape) == 1:  # If labels are not one-hot encoded
    #     print("Labels are not one-hot encoded. Converting...")
    #     labels = to_categorical(labels, num_classes=NUM_CLASSES)
    #     print("Labels are now one-hot encoded.")
    # # Check the shape and content
    # print(f"Labels shape: {labels.shape}")
    # print(f"Sample labels: {labels[:5]}")

    # **Save Label Encoder**
    with open("models/label_encoder.pkl", "wb") as f:
        pickle.dump(label_encoder, f)

    print(f"✅ Loaded {len(labels)} samples from {csv_path}")
    print(f"🎯 Max encoded label: {max(labels)}")
    print(f"🎯 Min encoded label: {min(labels)}")
    print(f"🎯 Number of unique encoded labels: {len(set(labels))}")
    print(f"🎯 Computed NUM_CLASSES: {num_classes}")

    # # Assuming labels are one-hot encoded
    # print(f"✅ Loaded {len(labels)} samples from {csv_path}")
    # if len(labels.shape) > 1:
    #   # Labels are one-hot encoded
    #   max_label = np.argmax(labels, axis=1)  # Get the class with max value in each row
    #   min_label = np.argmin(labels, axis=1)  # Get the class with min value in each row
    # else:
    #   max_label = max(labels)
    #   min_label = min(labels)
    # print(f"🎯 Max encoded label: {max(max_label)}")  # Max value of the labels
    # print(f"🎯 Min encoded label: {min(min_label)}")  # Min value of the labels
    # print(f"🎯 Number of unique encoded labels: {len(set(np.argmax(labels, axis=1)))}")  # Unique class count

    return image_paths, labels, num_classes

def preprocess_image(image_path, label):
    """Read, decode, resize, normalize images, and one-hot encode labels."""
    image = tf.io.read_file(image_path)
    image = tf.image.decode_png(image, channels=1)  # Convert to grayscale
    image = tf.image.resize(image, (IMG_HEIGHT, IMG_WIDTH)) / 255.0  # Normalize to [0, 1]

    # One-hot encode the label
    label = tf.cast(label, tf.int32)  # Ensure label is int32
    label = tf.one_hot(label, NUM_CLASSES)  # One-hot encode the label
    print(f"Shape of label after one-hot encoding: {label.shape}")  # Check shape
    return image, label


def create_tf_dataset(csv_path, batch_size):
    """Create a TensorFlow dataset from a CSV file."""
    image_paths, labels, _ = load_data(csv_path)
    dataset = tf.data.Dataset.from_tensor_slices((image_paths, labels))
    dataset = dataset.map(lambda x, y: preprocess_image(x, y), num_parallel_calls=AUTOTUNE)
    dataset = dataset.batch(batch_size).prefetch(AUTOTUNE)
    return dataset

TRAINING STARTED

In [6]:
# Load datasets

print("Loading Train Dataset")
train_image_paths, train_labels, NUM_CLASSES = load_data(TRAIN_CSV, 1.0)


print("Loading Validation Dataset")
val_image_paths, val_labels, _ = load_data(VAL_CSV, 1.0)  # We only need labels here

print("Creating training dataset")
train_dataset = create_tf_dataset(TRAIN_CSV, BATCH_SIZE)

print("Creating validation dataset")
val_dataset = create_tf_dataset(VAL_CSV, BATCH_SIZE)



Loading Train Dataset
📂 Loading data from: data/processed/train_augmented.csv (Sampling 100.0%)


  sampled_chunk = chunk.groupby("label", group_keys=False).apply(lambda x: x.sample(frac=sample_fraction, random_state=random_state))
  sampled_chunk = chunk.groupby("label", group_keys=False).apply(lambda x: x.sample(frac=sample_fraction, random_state=random_state))
  sampled_chunk = chunk.groupby("label", group_keys=False).apply(lambda x: x.sample(frac=sample_fraction, random_state=random_state))
  sampled_chunk = chunk.groupby("label", group_keys=False).apply(lambda x: x.sample(frac=sample_fraction, random_state=random_state))
  sampled_chunk = chunk.groupby("label", group_keys=False).apply(lambda x: x.sample(frac=sample_fraction, random_state=random_state))
  sampled_chunk = chunk.groupby("label", group_keys=False).apply(lambda x: x.sample(frac=sample_fraction, random_state=random_state))
  sampled_chunk = chunk.groupby("label", group_keys=False).apply(lambda x: x.sample(frac=sample_fraction, random_state=random_state))
  sampled_chunk = chunk.groupby("label", group_keys=False).app

Labels shape: (2347544,)
✅ Loaded 2347544 samples from data/processed/train_augmented.csv
🎯 Max encoded label: 61
🎯 Min encoded label: 0
🎯 Number of unique encoded labels: 62
🎯 Computed NUM_CLASSES: 62
Loading Validation Dataset
📂 Loading data from: data/processed/val.csv (Sampling 100.0%)


  sampled_chunk = chunk.groupby("label", group_keys=False).apply(lambda x: x.sample(frac=sample_fraction, random_state=random_state))
  sampled_chunk = chunk.groupby("label", group_keys=False).apply(lambda x: x.sample(frac=sample_fraction, random_state=random_state))
  sampled_chunk = chunk.groupby("label", group_keys=False).apply(lambda x: x.sample(frac=sample_fraction, random_state=random_state))
  sampled_chunk = chunk.groupby("label", group_keys=False).apply(lambda x: x.sample(frac=sample_fraction, random_state=random_state))
  sampled_chunk = chunk.groupby("label", group_keys=False).apply(lambda x: x.sample(frac=sample_fraction, random_state=random_state))
  sampled_chunk = chunk.groupby("label", group_keys=False).apply(lambda x: x.sample(frac=sample_fraction, random_state=random_state))
  sampled_chunk = chunk.groupby("label", group_keys=False).apply(lambda x: x.sample(frac=sample_fraction, random_state=random_state))
  sampled_chunk = chunk.groupby("label", group_keys=False).app

Labels shape: (73361,)
✅ Loaded 73361 samples from data/processed/val.csv
🎯 Max encoded label: 61
🎯 Min encoded label: 0
🎯 Number of unique encoded labels: 62
🎯 Computed NUM_CLASSES: 62
Creating training dataset
📂 Loading data from: data/processed/train_augmented.csv (Sampling 20.0%)


  sampled_chunk = chunk.groupby("label", group_keys=False).apply(lambda x: x.sample(frac=sample_fraction, random_state=random_state))
  sampled_chunk = chunk.groupby("label", group_keys=False).apply(lambda x: x.sample(frac=sample_fraction, random_state=random_state))
  sampled_chunk = chunk.groupby("label", group_keys=False).apply(lambda x: x.sample(frac=sample_fraction, random_state=random_state))
  sampled_chunk = chunk.groupby("label", group_keys=False).apply(lambda x: x.sample(frac=sample_fraction, random_state=random_state))
  sampled_chunk = chunk.groupby("label", group_keys=False).apply(lambda x: x.sample(frac=sample_fraction, random_state=random_state))
  sampled_chunk = chunk.groupby("label", group_keys=False).apply(lambda x: x.sample(frac=sample_fraction, random_state=random_state))
  sampled_chunk = chunk.groupby("label", group_keys=False).apply(lambda x: x.sample(frac=sample_fraction, random_state=random_state))
  sampled_chunk = chunk.groupby("label", group_keys=False).app

Labels shape: (469550,)
✅ Loaded 469550 samples from data/processed/train_augmented.csv
🎯 Max encoded label: 61
🎯 Min encoded label: 0
🎯 Number of unique encoded labels: 62
🎯 Computed NUM_CLASSES: 62
Shape of label after one-hot encoding: (62,)
Creating validation dataset
📂 Loading data from: data/processed/val.csv (Sampling 20.0%)


  sampled_chunk = chunk.groupby("label", group_keys=False).apply(lambda x: x.sample(frac=sample_fraction, random_state=random_state))
  sampled_chunk = chunk.groupby("label", group_keys=False).apply(lambda x: x.sample(frac=sample_fraction, random_state=random_state))
  sampled_chunk = chunk.groupby("label", group_keys=False).apply(lambda x: x.sample(frac=sample_fraction, random_state=random_state))
  sampled_chunk = chunk.groupby("label", group_keys=False).apply(lambda x: x.sample(frac=sample_fraction, random_state=random_state))
  sampled_chunk = chunk.groupby("label", group_keys=False).apply(lambda x: x.sample(frac=sample_fraction, random_state=random_state))
  sampled_chunk = chunk.groupby("label", group_keys=False).apply(lambda x: x.sample(frac=sample_fraction, random_state=random_state))
  sampled_chunk = chunk.groupby("label", group_keys=False).apply(lambda x: x.sample(frac=sample_fraction, random_state=random_state))
  sampled_chunk = chunk.groupby("label", group_keys=False).app

Labels shape: (14678,)
✅ Loaded 14678 samples from data/processed/val.csv
🎯 Max encoded label: 61
🎯 Min encoded label: 0
🎯 Number of unique encoded labels: 62
🎯 Computed NUM_CLASSES: 62
Shape of label after one-hot encoding: (62,)


In [7]:
import keras.backend as K
K.clear_session()  # Clear the old model session
print("Num Classes BUild : ", NUM_CLASSES)
ocr_model = build_ocr_model(NUM_CLASSES)
ocr_model.summary()


Num Classes BUild :  62
Num Classes model :  62
<KerasTensor shape=(None, 62), dtype=float32, sparse=False, name=keras_tensor_47>


In [None]:
CHECKPOINT_PATH = "models/bandungbondowoso.keras"

# # **Load Existing Model or Create New**
# if os.path.exists(CHECKPOINT_PATH):
#     print(f"Loading existing model from {CHECKPOINT_PATH}...")
#     ocr_model = tf.keras.models.load_model(CHECKPOINT_PATH)


# **Define Checkpoints**
checkpoint = tf.keras.callbacks.ModelCheckpoint(CHECKPOINT_PATH, monitor="val_loss", save_best_only=True, verbose=1)

# **Continue Training**
history = ocr_model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=EPOCHS,
    callbacks=[checkpoint]
)
        # **Save Final Model in `.keras` Format**
ocr_model.save(CHECKPOINT_PATH)
print(f"✅ Training complete! Model saved as {CHECKPOINT_PATH}")

Epoch 1/12


In [None]:

print("\n📌 Loading Test Dataset...")
test_dataset = create_tf_dataset(TEST_CSV, BATCH_SIZE)

# **Evaluate on Test Data**
print("\n🧪 Evaluating on Test Data...")
test_loss, test_acc = ocr_model.evaluate(test_dataset)
print(f"\n📊 Test Accuracy: {test_acc:.4f}, Test Loss: {test_loss:.4f}")


Training Summary

In [None]:
import matplotlib.pyplot as plt

# **Plot Training Summary**
def plot_training_summary(history):
    plt.figure(figsize=(12, 5))

    # **Loss Plot**
    plt.subplot(1, 2, 1)
    plt.plot(history.history["loss"], label="Train Loss")
    plt.plot(history.history["val_loss"], label="Validation Loss")
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.title("Training & Validation Loss")
    plt.legend()

    # **Accuracy Plot**
    plt.subplot(1, 2, 2)
    plt.plot(history.history["accuracy"], label="Train Accuracy")
    plt.plot(history.history["val_accuracy"], label="Validation Accuracy")
    plt.xlabel("Epochs")
    plt.ylabel("Accuracy")
    plt.title("Training & Validation Accuracy")
    plt.legend()

    plt.show()


In [None]:
plot_training_summary(history)