In [None]:
!pip install kaggle

In [None]:
# Colab-only step: mount Google Drive at /content/drive so that files kept there
# (the kaggle.json token copied in the next cell) are visible to the notebook.
from google.colab import drive
drive.mount('/content/drive')

In [4]:
! mkdir ~/.kaggle
! cp /content/drive/MyDrive/ColabNotebooks/kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the landlord/handwriting-recognition dataset with the Kaggle CLI.
# The next cell expects the resulting handwriting-recognition.zip in the working directory.
! kaggle datasets download landlord/handwriting-recognition

In [None]:
! unzip handwriting-recognition.zip


In [7]:
# importing libraries
import tensorflow as tf
import keras
import os
import cv2
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import LabelBinarizer
from itertools import chain
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Embedding,  Dense, Flatten, Dropout, LSTM, TimeDistributed, BatchNormalization, Reshape
from keras.utils import to_categorical

In [8]:
# Label tables for the two splits, read from the CSV files extracted under /content
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/written_name_train_v2.csv',
        '/content/written_name_test_v2.csv',
    )
)

# Folders holding the image files that the FILENAME column refers to
train_img_dir, test_img_dir = '/content/train_v2/train', '/content/test_v2/test'

In [9]:
# Keep only rows whose label can be used for training/evaluation
def _drop_unusable_rows(labels):
    """Return `labels` without 'UNREADABLE' identities and without rows containing NaN."""
    is_readable = labels['IDENTITY'].ne('UNREADABLE')
    return labels.loc[is_readable].dropna()


train = _drop_unusable_rows(train)
test = _drop_unusable_rows(test)

In [None]:
# Function to convert name to vectors
def name_to_vectors(name, max_len, char_to_index):
    """Encode a name as a fixed-length list of character indices.

    Characters that are not keys of ``char_to_index`` are skipped. The encoded
    name is cut to its first ``max_len`` entries when it is too long and
    right-padded with the index of the space character when it is too short,
    so every returned vector has the same length and the vectors can be
    stacked into a rectangular array.

    Args:
        name: Value to encode; it is converted with ``str()`` first.
        max_len: Number of entries in the returned list (negative values are
            treated as 0).
        char_to_index: Mapping from single characters to integer indices.
            It must contain ``' '`` whenever padding is actually required.

    Returns:
        A list of ``max_len`` integer indices.

    Raises:
        KeyError: If padding is needed and ``' '`` is not in ``char_to_index``.
    """
    limit = max(max_len, 0)
    name = str(name)
    # Truncate: without this a name longer than max_len yields a longer vector
    # than its siblings and np.array() on the batch produces a ragged result.
    vector = [char_to_index[char] for char in name if char in char_to_index][:limit]

    # Look the padding index up only when it is needed, so a full-length name
    # does not fail merely because ' ' is absent from the mapping.
    missing = limit - len(vector)
    if missing > 0:
        vector += [char_to_index[' ']] * missing
    return vector

# Function to convert image to binary
def to_binary(image):
    """Resize a BGR image and binarise it with Otsu's threshold.

    Args:
        image: Colour image in OpenCV's BGR channel order (what ``cv2.imread``
            returns for a colour file).

    Returns:
        A 2-D float array containing only 0.0 and 1.0. ``dsize`` is given as
        (width, height), so the array has 62 rows and 284 columns.

    Raises:
        ValueError: If ``image`` is None, which is what ``cv2.imread`` returns
            for a missing or undecodable file.
    """
    if image is None:
        raise ValueError(
            "to_binary received None; cv2.imread returns None when the file is "
            "missing or cannot be decoded"
        )

    resized = cv2.resize(image, dsize=(284, 62))
    # Convert straight from BGR. The previous BGR->RGB step followed by
    # COLOR_BGR2GRAY applied the red weight to blue and vice versa.
    image_gray = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)
    # With THRESH_OTSU the threshold is computed from the image histogram; the
    # explicit 128 is ignored by OpenCV and kept only as a placeholder.
    _, image_binary = cv2.threshold(image_gray, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    # Scale {0, 255} down to {0.0, 1.0}.
    return image_binary / 255.0

# Loading data and preprocess
def load_data(data, img_dir, max_samples=None):
    """Load and binarise the images listed in a label table.

    Args:
        data: DataFrame with a ``FILENAME`` column (image file names relative
            to ``img_dir``) and an ``IDENTITY`` column (the label).
        img_dir: Directory containing the image files.
        max_samples: Maximum number of leading rows to load. ``None`` loads
            every row; ``0`` (or a negative value) loads nothing.

    Returns:
        A tuple ``(data_img, data_idt)`` of two equally long lists: the
        binarised images produced by ``to_binary`` and the matching labels.

    Raises:
        OSError: If an image file cannot be read by ``cv2.imread``.
    """
    # "is None" instead of truthiness: max_samples=0 must mean "no rows",
    # not "no limit".
    if max_samples is None:
        num_samples = len(data)
    else:
        num_samples = max(0, min(len(data), max_samples))

    # Slice both columns once instead of doing a positional lookup per row.
    filenames = data['FILENAME'].iloc[:num_samples]
    identities = data['IDENTITY'].iloc[:num_samples]

    data_img = []
    data_idt = []
    for image_filename, identity in zip(filenames, identities):
        image_path = os.path.join(img_dir, image_filename)
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals failure by returning None; report the path
            # here rather than failing later inside the preprocessing step.
            raise OSError(
                f"cv2.imread could not read {image_path!r} "
                "(missing file or unsupported/corrupt image)"
            )
        data_img.append(to_binary(image))
        data_idt.append(identity)

    return data_img, data_idt

# Loading training and test data
train_data_img, train_data_idt = load_data(train, train_img_dir)
test_data_img, test_data_idt = load_data(test, test_img_dir)

# All labels from both splits; used for the sequence length and the vocabulary
all_data = train_data_idt + test_data_idt

# Longest name across BOTH splits. Measuring only the training names lets a
# longer test name produce a vector of a different length, which breaks the
# conversion to a rectangular numpy array below.
max_len = max(len(str(name)) for name in all_data)

# Get unique characters and create mapping.
# str() mirrors what name_to_vectors does, so a non-string label cannot break the join.
all_characters = ''.join(str(name) for name in all_data)
# ' ' is the padding symbol used by name_to_vectors, so it has to be part of
# the vocabulary even if no name happens to contain a space.
unique_characters = sorted(set(all_characters) | {' '})
char_to_index = {char: i for i, char in enumerate(unique_characters)}
index_to_char = {i: char for i, char in enumerate(unique_characters)}

# Creating vectors for each name in train_data_idt and test_data_idt
vectors_list_train_idt = [name_to_vectors(name, max_len, char_to_index) for name in train_data_idt]
vectors_list_test_idt = [name_to_vectors(name, max_len, char_to_index) for name in test_data_idt]

# Converting to numpy arrays of shape (number of names, max_len)
vectors_list_train_idt = np.array(vectors_list_train_idt)
vectors_list_test_idt = np.array(vectors_list_test_idt)

# To One-hot encode the output: one class per vocabulary character
vectors_list_train_idt_one_hot = to_categorical(vectors_list_train_idt, num_classes=len(unique_characters))
vectors_list_test_idt_one_hot = to_categorical(vectors_list_test_idt, num_classes=len(unique_characters))

# Next-character training pairs.
# Feeding the same sequence as input and target only teaches the network to
# copy its input (accuracy is then meaningless), while generate_text samples
# from the LAST time step as the prediction for the FOLLOWING character.
# So the input is characters 0..n-2 and the target is characters 1..n-1.
train_inputs = vectors_list_train_idt[:, :-1]
train_targets = vectors_list_train_idt_one_hot[:, 1:]
test_inputs = vectors_list_test_idt[:, :-1]
test_targets = vectors_list_test_idt_one_hot[:, 1:]

# Model: character embedding -> LSTM -> per-time-step softmax over the vocabulary
model = Sequential()
# Inputs are one step shorter than max_len because of the shift above
model.add(Embedding(len(unique_characters), 64, input_length=max_len - 1))
model.add(LSTM(128, return_sequences=True))
model.add(TimeDistributed(Dense(len(unique_characters), activation='softmax')))

# Compilation of the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

# Training the model (the test split doubles as validation data here)
history = model.fit(
    train_inputs,
    train_targets,
    epochs=1,
    batch_size=32,
    validation_data=(test_inputs, test_targets),
)
# Function to generate text
def generate_text(model, start_string, char_to_index, index_to_char, max_len, num_generate=100):
    """Sample characters from the model, one at a time, to extend a seed string.

    At every step the most recent characters are fed to the model, the
    distribution at the last time step is taken as the next-character
    distribution, and one character is sampled from it.

    Args:
        model: Callable model mapping an integer tensor of shape (1, length)
            to per-step class probabilities of shape (1, length, vocab).
            The output is assumed to be probabilities (softmax), as produced
            by the model defined in this notebook.
        start_string: Non-empty seed text; every character must be a key of
            ``char_to_index``.
        char_to_index: Mapping from character to integer index.
        index_to_char: Inverse mapping from integer index to character.
        max_len: Maximum number of most recent characters passed to the model
            at each step.
        num_generate: Number of characters to sample.

    Returns:
        ``start_string`` followed by the ``num_generate`` sampled characters.

    Raises:
        ValueError: If ``start_string`` is empty.
        KeyError: If ``start_string`` contains a character missing from
            ``char_to_index``.
    """
    if not start_string:
        raise ValueError("start_string must contain at least one character")

    token_ids = [char_to_index[s] for s in start_string]

    # Only models that expose reset_states carry state that needs clearing.
    if hasattr(model, 'reset_states'):
        model.reset_states()

    text_generated = []
    for _ in range(num_generate):
        # Rebuilding the tensor from a Python list each step avoids mixing the
        # int32 seed with the int64 ids returned by tf.random.categorical.
        window = tf.constant([token_ids[-max_len:]], dtype=tf.int32)
        predictions = model(window)  # (1, length, vocab)

        # Take the last time step while the batch axis is still present, so
        # the result is the 2-D [batch, classes] input categorical() expects.
        last_step_probs = predictions[:, -1, :]
        # tf.random.categorical takes unnormalised log-probabilities, not
        # probabilities; the epsilon guards against log(0).
        logits = tf.math.log(last_step_probs + 1e-9)
        predicted_id = int(tf.random.categorical(logits, num_samples=1)[0, 0].numpy())

        token_ids.append(predicted_id)
        text_generated.append(index_to_char[predicted_id])

    return start_string + ''.join(text_generated)


