In [None]:
import os
import cv2
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
from keras import backend as K
from keras.models import Model
from keras.layers import Input, Conv2D, MaxPooling2D, Reshape, Bidirectional, LSTM, Dense, Lambda, Activation, BatchNormalization, Dropout
from keras.optimizers import Adam

In [None]:
# Read the label tables of the three dataset splits (train / validation / test).
train, valid, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/handwriting-recognition/written_name_train_v2.csv',
        '/kaggle/input/handwriting-recognition/written_name_validation_v2.csv',
        '/kaggle/input/handwriting-recognition/written_name_test_v2.csv',
    )
)

In [None]:
# Show the first six training images in a 2x3 grid, each titled with its label.
plt.figure(figsize=(15, 10))

for idx in range(6):
    row = train.loc[idx]
    plt.subplot(2, 3, idx + 1)
    sample = cv2.imread(
        '/kaggle/input/handwriting-recognition/train_v2/train/' + row['FILENAME'],
        cv2.IMREAD_GRAYSCALE,
    )
    plt.imshow(sample, cmap='gray')
    plt.title(row['IDENTITY'], fontsize=12)
    plt.axis('off')

# The negative hspace pulls the two rows of the grid closer together.
plt.subplots_adjust(wspace=0.2, hspace=-0.8)

In [None]:
# Row counts of the three label tables as loaded from disk.
train_size, valid_size, test_size = map(len, (train, valid, test))

print(f"Number of samples in the training set: {train_size}")
print(f"Number of samples in the validation set: {valid_size}")
print(f"Number of samples in the testing set: {test_size}")

In [None]:
# Count the rows whose IDENTITY (the transcription) is missing.
nan_in_train = train['IDENTITY'].isna().sum()
nan_in_valid = valid['IDENTITY'].isna().sum()
print("Number of NaNs in train set      : ", nan_in_train)
print("Number of NaNs in validation set : ", nan_in_valid)

In [None]:
# Discard every row that contains a NaN in any column.
train = train.dropna(axis=0)
valid = valid.dropna(axis=0)

In [None]:
# Collect the training rows labelled 'UNREADABLE' (renumbered from 0) and
# display the first six of them in a 2x3 grid.
unreadable = train[train['IDENTITY'] == 'UNREADABLE'].reset_index(drop=True)

plt.figure(figsize=(15, 10))

for idx in range(6):
    row = unreadable.loc[idx]
    plt.subplot(2, 3, idx + 1)
    sample = cv2.imread(
        '/kaggle/input/handwriting-recognition/train_v2/train/' + row['FILENAME'],
        cv2.IMREAD_GRAYSCALE,
    )
    plt.imshow(sample, cmap='gray')
    plt.title(row['IDENTITY'], fontsize=12)
    plt.axis('off')

plt.subplots_adjust(wspace=0.2, hspace=-0.8)

In [None]:
# Remove the samples whose transcription is the literal marker 'UNREADABLE'.
# .copy() turns each filtered result into an independent DataFrame, so the
# column assignments made on `train` / `valid` in later cells modify the frame
# itself instead of raising pandas' SettingWithCopyWarning.
train = train[train['IDENTITY'] != 'UNREADABLE'].copy()
valid = valid[valid['IDENTITY'] != 'UNREADABLE'].copy()

In [None]:
# Normalise every transcription to upper case (the alphabet used for encoding
# further down contains upper-case letters only).
for frame in (train, valid):
    frame['IDENTITY'] = frame['IDENTITY'].str.upper()

In [None]:
# Renumber the rows 0..n-1 so they can be addressed with .loc[i, ...] below.
for frame in (train, valid):
    frame.reset_index(drop=True, inplace=True)

In [None]:
# Row counts again, now that NaN and 'UNREADABLE' rows have been removed from
# the training and validation tables (the test table is untouched).
train_size = len(train)
valid_size = len(valid)
test_size = len(test)

print(f"Number of samples in the training set: {train_size}")
print(f"Number of samples in the validation set: {valid_size}")
print(f"Number of samples in the testing set: {test_size}")

In [None]:
def preprocess(img, height=64, width=256):
    """Fit a grayscale image onto a fixed-size white canvas and rotate it.

    The image is cropped (never rescaled) to at most ``height`` x ``width``
    pixels, pasted into the top-left corner of a white (value 255) canvas of
    exactly that size, and the canvas is then rotated 90 degrees clockwise.

    Parameters
    ----------
    img : numpy.ndarray
        2-D grayscale image, e.g. the result of
        ``cv2.imread(path, cv2.IMREAD_GRAYSCALE)``.
    height, width : int, optional
        Canvas size before rotation. The defaults (64, 256) reproduce the
        original fixed behaviour.

    Returns
    -------
    numpy.ndarray
        C-contiguous float64 array of shape ``(width, height)``.

    Raises
    ------
    ValueError
        If ``img`` is ``None`` (``cv2.imread`` returns ``None`` instead of
        raising when a file cannot be read) or is not 2-dimensional.
    """
    if img is None:
        raise ValueError(
            "preprocess() received None; cv2.imread returns None when the "
            "image file cannot be read"
        )
    if img.ndim != 2:
        raise ValueError(
            f"preprocess() expects a 2-D grayscale image, got shape {img.shape}"
        )

    # Crop first so that h/w below describe exactly what is pasted.
    img = img[:height, :width]
    h, w = img.shape

    final_img = np.full((height, width), 255.0)  # blank white canvas
    final_img[:h, :w] = img

    # k=-1 is one clockwise quarter turn; ascontiguousarray materialises the
    # rotated view into a regular C-ordered array.
    return np.ascontiguousarray(np.rot90(final_img, k=-1))

In [None]:
# From here on these two names no longer hold the table lengths: they are the
# number of rows of each split that the loops below load and encode.
train_size, valid_size = 30000, 3000

In [None]:
# Read the first `train_size` training images, fit each onto the fixed canvas
# with preprocess() and divide the pixel values by 255.
train_x = []

for row_id in range(train_size):
    path = '/kaggle/input/handwriting-recognition/train_v2/train/' + train.loc[row_id, 'FILENAME']
    pixels = preprocess(cv2.imread(path, cv2.IMREAD_GRAYSCALE))
    train_x.append(pixels / 255.)

In [None]:
# Same loading steps as for the training images, applied to the first
# `valid_size` validation images.
valid_x = []

for row_id in range(valid_size):
    path = '/kaggle/input/handwriting-recognition/validation_v2/validation/' + valid.loc[row_id, 'FILENAME']
    pixels = preprocess(cv2.imread(path, cv2.IMREAD_GRAYSCALE))
    valid_x.append(pixels / 255.)

In [None]:
# Stack the per-image arrays and add a trailing channel axis:
# (samples, 256, 64, 1), the input shape the network is built with.
train_x = np.reshape(np.array(train_x), (-1, 256, 64, 1))
valid_x = np.reshape(np.array(valid_x), (-1, 256, 64, 1))

In [None]:
# The first entry of each shape is the number of samples.
train_shape, valid_shape = train_x.shape, valid_x.shape
print(f"Shape of preprocessed training data: {train_shape}")
print(f"Shape of preprocessed validation data: {valid_shape}")
# No `test_x` array is built in this notebook, so there is no test shape to
# report here.


In [None]:
alphabets = u"ABCDEFGHIJKLMNOPQRSTUVWXYZ-' "
max_str_len = 24 # max length of input labels
num_of_characters = len(alphabets) + 1 # +1 for ctc pseudo blank
num_of_timestamps = 64 # max length of predicted labels


def label_to_num(label):
    """Encode a text label as an integer array of indices into ``alphabets``.

    Parameters
    ----------
    label : str
        Text to encode, one code per character.

    Returns
    -------
    numpy.ndarray
        1-D int64 array with ``len(label)`` entries (also for an empty label).

    Notes
    -----
    A character that is not in ``alphabets`` is encoded as -1 (the result of
    ``str.find``). -1 is also the value the label matrices are padded with, so
    such a character is indistinguishable from padding once stored there.
    """
    return np.array([alphabets.find(ch) for ch in label], dtype=np.int64)


def num_to_label(num):
    """Decode a sequence of character codes back into text.

    Parameters
    ----------
    num : iterable of int or float
        Character codes. Integer-valued floats are accepted, so padded rows of
        a float label matrix can be decoded directly.

    Returns
    -------
    str
        The decoded text. Decoding stops at the first negative code, which is
        treated as padding / end of sequence.

    Raises
    ------
    IndexError
        If a code is ``>= len(alphabets)``.
    """
    chars = []
    for code in num:
        idx = int(code)
        # Negative codes mark padding; stopping here also prevents them from
        # being interpreted as from-the-end string indices.
        if idx < 0:
            break
        chars.append(alphabets[idx])
    return "".join(chars)

In [None]:
# Quick demonstration of the label encoding on one example name.
name = 'JEBASTIN'
encoded_name = label_to_num(name)
print(name, '\n', encoded_name)

In [None]:
# Labels with more characters than this are counted as "long".
threshold = 24

# Character count of every training label, stored as a new column.
train['label_length'] = train['IDENTITY'].map(len)

# Rows over the threshold, their count, and their share of the training table.
long_labels = train.loc[train['label_length'] > threshold]
num_long_labels = len(long_labels)
percentage_long_labels = (num_long_labels / len(train)) * 100

print(f"Number of labels longer than {threshold} characters: {num_long_labels}")
print(f"Percentage of long labels in training set: {percentage_long_labels:.2f}%")

# Histogram of the label lengths, using bin edges 0, 5, ..., 45.
train['label_length'].plot.hist(bins=range(0, 50, 5), rwidth=0.8)
plt.title('Distribution of Label Lengths in Training Set')
plt.xlabel('Label Length (characters)')
plt.ylabel('Frequency')
plt.show()


In [None]:
# Label-length column and long-label statistics for the validation table,
# using the same `threshold` as for the training table.
valid['label_length'] = valid['IDENTITY'].map(len)

long_labels_valid = valid.loc[valid['label_length'] > threshold]
num_long_labels_valid = len(long_labels_valid)
percentage_long_labels_valid = (num_long_labels_valid / len(valid)) * 100

print(f"Number of labels longer than {threshold} characters in validation set: {num_long_labels_valid}")
print(f"Percentage of long labels in validation set: {percentage_long_labels_valid:.2f}%")


In [None]:
# Overlay the label-length histograms of both tables on one figure.
plt.figure(figsize=(10, 5))

length_bins = range(0, 50, 5)
for lengths, legend_name in (
    (train['label_length'], 'Training Set'),
    (valid['label_length'], 'Validation Set'),
):
    # alpha < 1 keeps the histogram drawn first visible under the second.
    plt.hist(lengths, bins=length_bins, alpha=0.7, label=legend_name)

plt.title('Distribution of Label Lengths in Training and Validation Sets')
plt.xlabel('Label Length (characters)')
plt.ylabel('Frequency')
plt.legend()
plt.show()


In [None]:
# Keep only the training rows whose label fits into the label matrix
# (at most `max_str_len` = 24 characters).
keep_rows = (train['label_length'] <= max_str_len).to_numpy()

# `train_x` was built earlier from the first len(train_x) rows of `train`.
# If one of those rows is removed here, its image has to be removed as well;
# otherwise, after the index is reset, every later label would be paired with
# the image of a different sample.
keep_loaded = keep_rows[:len(train_x)]
if not keep_loaded.all():
    train_x = train_x[keep_loaded]
    # Keep the subset size in step with the images that are left, since the
    # label-encoding loop below iterates over range(train_size).
    train_size = len(train_x)
    print(f"Dropped {int((~keep_loaded).sum())} already-loaded training images with over-long labels")

# Filter and renumber the rows 0..n-1.
train = train[keep_rows].reset_index(drop=True)

print(f"Modified training set size: {len(train)}")


In [None]:
# Arrays consumed by the CTC training model:
#   train_y          encoded labels, padded with -1 up to max_str_len
#   train_label_len  number of characters of each label
#   train_input_len  number of prediction time steps passed to the CTC loss
#                    (two fewer than the network emits)
#   train_output     all-zero placeholder used as the `y` argument of fit()
train_y = np.full([train_size, max_str_len], -1.0)
train_label_len = np.zeros([train_size, 1])
train_input_len = np.full([train_size, 1], float(num_of_timestamps - 2))
train_output = np.zeros([train_size])

for row_id in range(train_size):
    text = train.loc[row_id, 'IDENTITY']
    train_label_len[row_id] = len(text)
    train_y[row_id, :len(text)] = label_to_num(text)

In [None]:
# Validation counterparts of the label arrays: -1-padded encoded labels, label
# lengths, CTC input lengths and the all-zero placeholder targets.
valid_y = np.full([valid_size, max_str_len], -1.0)
valid_label_len = np.zeros([valid_size, 1])
valid_input_len = np.full([valid_size, 1], float(num_of_timestamps - 2))
valid_output = np.zeros([valid_size])

for row_id in range(valid_size):
    text = valid.loc[row_id, 'IDENTITY']
    valid_label_len[row_id] = len(text)
    valid_y[row_id, :len(text)] = label_to_num(text)

In [None]:
# Inspect one encoded training sample (row 100) next to its text label.
sample_id = 100
print('True label : ', train.loc[sample_id, 'IDENTITY'],
      '\ntrain_y : ', train_y[sample_id],
      '\ntrain_label_len : ', train_label_len[sample_id],
      '\ntrain_input_len : ', train_input_len[sample_id])

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

# Pool the separately loaded training and validation arrays ...
combined_x = np.concatenate((train_x, valid_x), axis=0)
combined_y = np.concatenate((train_y, valid_y), axis=0)
combined_label_len = np.concatenate((train_label_len, valid_label_len), axis=0)
combined_input_len = np.concatenate((train_input_len, valid_input_len), axis=0)

# ... and re-split them at random. Because `valid_x` is part of the pool, most
# of its samples end up in the new training split.
# NOTE(review): this cell overwrites train_input_len / train_label_len (and the
# valid_* counterparts) with the shuffled split results, so running it a second
# time without re-running the label-encoding cells pairs labels with the wrong
# lengths.
(train_X, valid_X,
 train_Y, valid_Y,
 train_input_len, valid_input_len,
 train_label_len, valid_label_len) = train_test_split(
    combined_x,
    combined_y,
    combined_input_len,
    combined_label_len,
    test_size=0.0909,  # Adjust the split ratio as needed
    random_state=42
)

# Placeholder targets for fit(): the compiled loss simply returns the model
# output, so only the LENGTH of these arrays matters. Size them from the actual
# split instead of relying on the split happening to reproduce the earlier
# train_size / valid_size.
train_output = np.zeros(len(train_X))
valid_output = np.zeros(len(valid_X))


In [None]:
from tensorflow.keras.regularizers import l2

# CRNN: five convolutional blocks -> per-step dense projection -> two
# bidirectional LSTMs -> per-step softmax over the character set.
# Every trainable kernel uses the same L2 penalty of 0.001.
input_data = Input(shape=(256, 64, 1), name='input')

# (conv name, pool name, filters, pool size, dropout rate or None)
# The (1, 2) pools shrink only the second spatial axis, so the first axis
# keeps 256 / 2 / 2 = 64 positions, which become the sequence steps.
conv_specs = [
    ('conv1', 'max1', 32, (2, 2), None),
    ('conv2', 'max2', 64, (2, 2), 0.2),
    ('conv3', 'max3', 128, (1, 2), None),
    ('conv4', 'max4', 256, (1, 2), 0.2),
    ('conv5', 'max5', 512, (1, 2), 0.2),
]

inner = input_data
for conv_name, pool_name, n_filters, pool_size, drop_rate in conv_specs:
    inner = Conv2D(n_filters, (3, 3), padding='same', name=conv_name,
                   kernel_initializer='he_normal',
                   kernel_regularizer=l2(0.001))(inner)
    inner = BatchNormalization()(inner)
    inner = Activation('relu')(inner)
    inner = MaxPooling2D(pool_size=pool_size, name=pool_name)(inner)
    if drop_rate is not None:
        inner = Dropout(drop_rate)(inner)

# Flatten each of the 64 steps into one feature vector
# (2 remaining positions x 512 channels = 1024), then project it to 64 features.
inner = Reshape(target_shape=((64, 1024)), name='reshape')(inner)
inner = Dense(64, activation='relu', kernel_initializer='he_normal',
              kernel_regularizer=l2(0.001), name='dense1')(inner)

# Two stacked bidirectional LSTMs that return the full sequence.
for lstm_name in ('lstm1', 'lstm2'):
    inner = Bidirectional(
        LSTM(256, return_sequences=True, dropout=0.3, recurrent_dropout=0.3,
             kernel_regularizer=l2(0.001)),
        name=lstm_name,
    )(inner)

# Per-step class scores (alphabet + CTC blank) followed by softmax.
inner = Dense(num_of_characters, kernel_initializer='he_normal',
              kernel_regularizer=l2(0.001), name='dense2')(inner)
y_pred = Activation('softmax', name='softmax')(inner)

# Prediction model: image in, per-step character probabilities out.
model = Model(inputs=input_data, outputs=y_pred)

model.summary()


In [None]:
# the ctc loss function
def ctc_lambda_func(args):
    """Compute the batch CTC cost inside a Lambda layer.

    ``args`` is the 4-item sequence
    ``(y_pred, labels, input_length, label_length)``. The first two time steps
    of ``y_pred`` are dropped before the cost is computed, so ``input_length``
    must count the remaining steps.
    """
    predictions, gt_labels, pred_lengths, gt_lengths = args
    # Skip the first two RNN outputs, which tend to be garbage.
    trimmed = predictions[:, 2:, :]
    return K.ctc_batch_cost(gt_labels, trimmed, pred_lengths, gt_lengths)

In [None]:
# Extra inputs that are needed only to compute the CTC loss.
labels = Input(shape=[max_str_len], dtype='float32', name='gtruth_labels')
input_length = Input(shape=[1], dtype='int64', name='input_length')
label_length = Input(shape=[1], dtype='int64', name='label_length')

# Training model: it wraps the prediction graph and outputs the CTC loss itself.
ctc_inputs = [y_pred, labels, input_length, label_length]
ctc_loss = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')(ctc_inputs)
model_final = Model(
    inputs=[input_data, labels, input_length, label_length],
    outputs=ctc_loss,
)

In [None]:
from keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import CSVLogger

# Stop once the validation loss has not improved for 5 consecutive epochs and
# roll the weights back to those of the best epoch.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Write the per-epoch metrics to a CSV file; append=False starts the file
# afresh whenever a training run begins.
csv_logger = CSVLogger('/kaggle/working/training_log.csv', append=False)

In [None]:
# The CTC loss is already computed inside the 'ctc' Lambda layer, so the Keras
# loss is a pass-through that just returns the model output.
# `learning_rate` is the supported argument name; the old `lr` alias is
# deprecated and is rejected by newer Keras releases.
model_final.compile(
    loss={'ctc': lambda y_true, y_pred: y_pred},
    optimizer=Adam(learning_rate=0.0001),
)

In [None]:
# First training stage: up to 50 epochs, subject to early stopping.
train_inputs = [train_X, train_Y, train_input_len, train_label_len]
valid_inputs = [valid_X, valid_Y, valid_input_len, valid_label_len]

history = model_final.fit(
    train_inputs,
    train_output,
    batch_size=128,
    epochs=50,
    validation_data=(valid_inputs, valid_output),
    callbacks=[early_stopping, csv_logger],
)

In [None]:
# Checkpoint of the CTC training model after the first training stage.
model_final.save(filepath="model_upto_50.h5")

In [None]:
# Second training stage: continue with the same model up to epoch 150.

# Resume the epoch numbering where the first stage actually stopped; this is
# 50 only if early stopping did not end that stage sooner.
resume_epoch = history.epoch[-1] + 1 if history.epoch else 0

# The logger created earlier uses append=False and would truncate the CSV that
# holds the first stage's epochs; append to the same file instead.
csv_logger_resume = CSVLogger('/kaggle/working/training_log.csv', append=True)

new_history = model_final.fit(
    x=[train_X, train_Y, train_input_len, train_label_len],
    y=train_output,
    validation_data=([valid_X, valid_Y, valid_input_len, valid_label_len], valid_output),
    epochs=150,  # Total number of epochs, including previous ones
    initial_epoch=resume_epoch,
    batch_size=128,
    callbacks=[early_stopping, csv_logger_resume]
)


In [None]:
# Checkpoint of the CTC training model after the second training stage.
model_final.save(filepath="model_upto_150.h5")

In [None]:
# Save the prediction model (image in, character probabilities out), i.e. the
# network without the CTC-loss inputs and Lambda layer.
model.save(filepath="model_not_final_150.h5")

In [None]:
# Join the per-epoch loss curves of the two training stages end to end.
combined_history = {
    metric: history.history[metric] + new_history.history[metric]
    for metric in ('loss', 'val_loss')
}


In [None]:
# Training vs. validation loss across both training stages.
plt.figure(figsize=(12, 6))

for metric, legend_name in (('loss', 'Training Loss'), ('val_loss', 'Validation Loss')):
    plt.plot(combined_history[metric], label=legend_name)

plt.title('Training and Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
# Predict on the originally loaded validation images and greedy-decode the
# per-step probabilities into label strings.
# NOTE(review): `valid_x` was concatenated into the pool that train_test_split
# divided, so this is not a purely held-out set.
# NOTE(review): the training loss was computed on y_pred[:, 2:, :], while the
# decoding here uses every time step - confirm whether the first two steps
# should be dropped here as well.
preds = model.predict(valid_x)

# One sequence length per sample: all time steps of the prediction.
step_counts = np.full(preds.shape[0], float(preds.shape[1]))
decoded_paths, _log_probs = K.ctc_decode(preds, input_length=step_counts, greedy=True)
decoded = K.get_value(decoded_paths[0])

prediction = [num_to_label(decoded[i]) for i in range(valid_size)]

In [None]:
# Ground-truth labels of the first `valid_size` validation rows.
# .iloc is used because .loc slicing is label-based and inclusive, so
# .loc[0:valid_size] would select valid_size + 1 rows.
y_true = valid['IDENTITY'].iloc[:valid_size]

correct_char = 0  # characters that match at the same position
total_char = 0    # characters in the ground truth
correct = 0       # predictions that match the whole label

for pr, tr in zip(prediction, y_true):
    total_char += len(tr)
    # Position-wise comparison over the common prefix length; zip stops at the
    # shorter of the two strings.
    correct_char += sum(1 for p_ch, t_ch in zip(pr, tr) if p_ch == t_ch)
    if pr == tr:
        correct += 1

print('Correct characters predicted : %.2f%%' %(correct_char*100/total_char))
print('Correct words predicted      : %.2f%%' %(correct*100/valid_size))

In [None]:


# Show the first six test images, each titled with the model's prediction.
plt.figure(figsize=(15, 10))
for idx in range(6):
    plt.subplot(2, 3, idx + 1)
    image = cv2.imread(
        '/kaggle/input/handwriting-recognition/test_v2/test/' + test.loc[idx, 'FILENAME'],
        cv2.IMREAD_GRAYSCALE,
    )
    plt.imshow(image, cmap='gray')

    # Same preprocessing as for the training images, as a batch of one.
    batch = (preprocess(image) / 255.).reshape(1, 256, 64, 1)
    pred = model.predict(batch)
    decoded = K.get_value(
        K.ctc_decode(pred, input_length=np.ones(pred.shape[0]) * pred.shape[1],
                     greedy=True)[0][0]
    )
    plt.title(num_to_label(decoded[0]), fontsize=12)
    plt.axis('off')

plt.subplots_adjust(wspace=0.2, hspace=-0.8)

In [None]:
# Import the required library for plotting
import matplotlib.pyplot as plt

# Loss curves of the first training stage only (the `history` object).
plt.figure(figsize=(10, 6))

for metric, legend_name in (('loss', 'Training Loss'), ('val_loss', 'Validation Loss')):
    plt.plot(history.history[metric], label=legend_name)

plt.title('Training and Validation Loss Over Epochs')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()


In [None]:
# Save the prediction model once more under a second file name.
model.save(filepath="extended_crnn_model_combined_train_valid.h5")

In [None]:
import pandas as pd

# Hard-coded per-epoch loss values, written out as literals instead of being
# read from `history` / `new_history`.
# NOTE(review): presumably copied from the training output of an earlier run -
# confirm they belong to the model saved above.

# Training loss, 50 entries (one per epoch, epochs 1-50).
loss_initial = [
    27.3748, 23.7283, 22.8093, 19.6996, 13.7676, 9.3792, 7.5753, 6.6913, 6.1320, 
    5.7143, 5.3807, 5.1257, 4.9028, 4.6972, 4.5307, 4.3612, 4.2143, 4.0946, 3.9639, 
    3.8326, 3.7417, 3.6280, 3.5335, 3.4467, 3.3555, 3.2656, 3.1985, 3.1261, 3.0572, 
    2.9862, 2.9258, 2.8701, 2.8190, 2.7655, 2.7112, 2.6577, 2.6144, 2.5684, 2.5343, 
    2.4822, 2.4441, 2.4199, 2.3779, 2.3304, 2.3003, 2.2581, 2.2367, 2.1934, 2.1669, 
    2.1491
]

# Validation loss for the same 50 epochs.
val_loss_initial = [
    25.8614, 23.9250, 22.1027, 16.3845, 10.5517, 8.1299, 6.6331, 6.2241, 5.6428, 
    5.4575, 4.9368, 4.8372, 4.5101, 4.4439, 4.3372, 4.0736, 3.9975, 3.9212, 3.7392, 
    3.6703, 3.7071, 3.5766, 3.4913, 3.3830, 3.3948, 3.3272, 3.2310, 3.1923, 3.0979, 
    3.0958, 3.0387, 2.9921, 2.9622, 2.9373, 2.9184, 2.8740, 2.8316, 2.8284, 2.8135, 
    2.7806, 2.7739, 2.7130, 2.6768, 2.6947, 2.6989, 2.6717, 2.6276, 2.6442, 2.6122, 
    2.6036
]

# Training loss for epochs 51-72 (22 entries).
loss_additional = [
    2.1101, 2.0877, 2.0680, 2.0340, 2.0061, 1.9789, 1.9559, 1.9465, 1.9035, 1.8779,
    1.8618, 1.8471, 1.8399, 1.8073, 1.7801, 1.7618, 1.7447, 1.7263, 1.7152, 1.6914,
    1.6729, 1.6570
]

# Validation loss for epochs 51-72 (22 entries).
val_loss_additional = [
    2.5843, 2.6211, 2.5751, 2.5722, 2.5447, 2.5733, 2.6169, 2.5212, 2.5644, 2.5152,
    2.5367, 2.4765, 2.5227, 2.5091, 2.4982, 2.4821, 2.4661, 2.5226, 2.4909, 2.4938,
    2.4903, 2.4811
]

# Join the two stages into one curve per metric, numbered from epoch 1.
loss_combined = [*loss_initial, *loss_additional]
val_loss_combined = [*val_loss_initial, *val_loss_additional]
epochs_combined = range(1, len(loss_combined) + 1)

# One row per epoch.
loss_columns = {
    'Epoch': epochs_combined,
    'Training Loss': loss_combined,
    'Validation Loss': val_loss_combined,
}
df = pd.DataFrame(loss_columns)

# Write the table to an Excel workbook, without the index column.
df.to_excel('updated_training_loss_history.xlsx', index=False)

print("Updated training and validation loss saved to 'updated_training_loss_history.xlsx'")


In [None]:
# Replace the in-memory prediction model with one loaded from an HDF5 file
# under the Kaggle input directory.
model = tf.keras.models.load_model(
    filepath="/kaggle/input/crnn_not_final/keras/default/1/model_not_final_150.h5"
)

In [None]:
# Predict on the originally loaded validation images with the reloaded model
# and greedy-decode the per-step probabilities into label strings.
# NOTE(review): if the loaded model was trained on the combined train/valid
# pool, `valid_x` is not a held-out set for it - confirm.
preds = model.predict(valid_x)

# One sequence length per sample: all time steps of the prediction.
step_counts = np.full(preds.shape[0], float(preds.shape[1]))
decoded_paths, _log_probs = K.ctc_decode(preds, input_length=step_counts, greedy=True)
decoded = K.get_value(decoded_paths[0])

prediction = [num_to_label(decoded[i]) for i in range(valid_size)]

In [None]:
# Ground-truth labels of the first `valid_size` validation rows.
# .iloc is used because .loc slicing is label-based and inclusive, so
# .loc[0:valid_size] would select valid_size + 1 rows.
y_true = valid['IDENTITY'].iloc[:valid_size]

correct_char = 0  # characters that match at the same position
total_char = 0    # characters in the ground truth
correct = 0       # predictions that match the whole label

for pr, tr in zip(prediction, y_true):
    total_char += len(tr)
    # Position-wise comparison over the common prefix length; zip stops at the
    # shorter of the two strings.
    correct_char += sum(1 for p_ch, t_ch in zip(pr, tr) if p_ch == t_ch)
    if pr == tr:
        correct += 1

print('Correct characters predicted : %.2f%%' %(correct_char*100/total_char))
print('Correct words predicted      : %.2f%%' %(correct*100/valid_size))

In [None]:
from sklearn.metrics import classification_report

# Build character-level (true, predicted) pairs for a per-character report.
y_true_flat = []
y_pred_flat = []

for i in range(valid_size):
    true_label_num = label_to_num(valid.loc[i, 'IDENTITY'])
    pred_label_num = label_to_num(prediction[i])

    # Align within each sample: compare position by position over the common
    # length of this pair. Truncating only after concatenating all samples
    # would shift every following character as soon as one prediction is
    # shorter or longer than its label.
    n_common = min(len(true_label_num), len(pred_label_num))
    y_true_flat.extend(true_label_num[:n_common])
    y_pred_flat.extend(pred_label_num[:n_common])

# Pass the class ids explicitly so that `target_names` always lines up with
# them, even when a character never occurs in the data; codes outside the
# alphabet (-1 from label_to_num) get no row of their own.
class_ids = list(range(len(alphabets)))
report = classification_report(
    y_true_flat,
    y_pred_flat,
    labels=class_ids,
    target_names=list(alphabets),
    zero_division=0,
)
print(report)
