# Day 4 - CNN, Bi-LSTM and CTC ##
### Objective: Exercise on CNN + Bi-LSTM + CTC Loss for Arabic Text Classification ###
### Dataset: Labeled Handwritten Arabic Words ###
### Please fill in all sections that start with "# Task"; sections that start with "# Step" are pre-implemented ###

#### Section 1 - Dependencies & Libraries

In [None]:
# Step 1.1. - Install required libraries
!pip3 install numpy matplotlib tensorflow keras scikit-learn --upgrade

# Step 1.2. - Restart Kernel Manually
# Toolbar -> Kernel -> Restart & Clear Output -> Restart & Clear All Outputs

In [None]:
# Step 1.3. - Import required libraries
from keras.layers import Dense, Conv2D, Flatten, MaxPooling2D, Input, Dropout, LSTM, Bidirectional, Reshape, StringLookup, Layer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt, numpy as np, glob, pandas as pd, tensorflow as tf

#### Section 2 - Define Configurations

In [None]:
# Step 2.1. Target image size in pixels (every image is resized to this width and height)
img_width, img_height = 64, 32

# Step 2.2. Integer token used to pad the vectorized labels
padding_token = 99

# Step 2.3. Maximum length of a label/output (the longest Arabic word is 16 letters)
max_label_length = 16

# Step 2.4. Test size handed to the train/test split
test_size = 0.33

# Step 2.5. Number of epochs/iterations used to train the model
epochs = 10

# Step 2.6. Batch size applied when the datasets are batched
batch_size = 16

#### Section 3 - Load Image Paths & Label Data ; and perform train/test split

In [None]:
# Step 3.1. Load Images Paths and Label Strings into Pandas DataFrame
dataset_path = "Labeled_Handwritten_Arabic_Words/"


def id_from_path(path):
    """Return the sample id of a file path: its file name up to the first dot.

    Forward slashes and backslashes are both treated as directory separators,
    so the id is extracted correctly on Windows as well as on Linux/macOS.
    """
    file_name = path.replace("\\", "/").rsplit("/", 1)[-1]
    return file_name.split(".")[0]


def _read_label(path):
    """Return the stripped UTF-8 text stored in a label file."""
    # The context manager closes the file handle as soon as it has been read
    with open(path, encoding="utf8") as label_file:
        return label_file.read().strip()


# The glob results are sorted so that the row order of the DataFrame (and with it
# the seeded split in Step 3.2) does not depend on the order in which the
# operating system lists the files
image_paths = sorted(glob.glob(f"{dataset_path}/images/*.jpg"))
label_paths = sorted(glob.glob(f"{dataset_path}/labels/*.txt"))

images = [{"id": id_from_path(path), "img_path": path} for path in image_paths]
labels = [{"id": id_from_path(path), "label_str": _read_label(path)} for path in label_paths]

# Join each image with its label on the shared file-name id
df = pd.merge(pd.DataFrame(images), pd.DataFrame(labels), on='id')

# Step 3.2. Train / Test Split
X_train, X_test, y_train, y_test = train_test_split(df['img_path'].tolist(), df['label_str'].tolist(), test_size=test_size, random_state=42)

#### Section 4 - Label Encoding & Prepare Train/Test Split

In [None]:
# Step 4.1. - Label Vectorization Class
# Converts label strings into padded integer vectors (vectorize) and back (unvectorize)
class label_vectorization(object):
    
    # Step 4.1.1. Define Initialization Function
    # labels_list: list of label strings from which the character vocabulary is built
    def __init__(self, labels_list):
        # Step 4.1.1.1. Extract a Sorted list of all unique characters within all labels
        characters = sorted(list(set(char for label in labels_list for char in label)))
        # Task 4.1.1.2. Create a String to Number Convertor using StringLookup
        # NOTE: store it as self.char_to_num - Step 5.2.7 reads label_vectorizer.char_to_num.get_vocabulary()

        # Task 4.1.1.3. Create a Number to String Convertor using StringLookup (Inverse of 4.1.1.2)


    # Step 4.1.2. Define vectorize function; which converts a string into a vector
    def vectorize(self, label_str):
        # Task 4.1.2.1. Use String to Number convertor for each character within the label_str

        # Task 4.1.2.2. Pad the label_vector created in 4.1.2.1 with padding_token; pad amount = max_label_length - len(label_vector)

        # Task 4.1.2.3. Return the Padded Label Vector


    # Step 4.1.3. Define unvectorize function; which converts a vector into a string
    # pred: set to True for decoded model predictions (Step 6.2 calls unvectorize(result, pred=True))
    def unvectorize(self, label_vector, pred=False):
        # Step 4.1.3.1. Define which padding token to remove; if prediction then padding_token is -1
        pad_token = -1 if pred else padding_token
        # Task 4.1.3.2. Remove All Numbers within the label_vector which are equal to the padding token (pad_token)

        # Task 4.1.3.3. Use the Number to String convertor to convert the label_vector to a string; hint(also use reduce_join)

        # Task 4.1.3.4. Return the inverse of the string (as arabic is RTL)

        
# Step 4.2. Instantiate the Label Vectorizer
# It is built from every label in df (training and testing samples alike),
# so the character vocabulary is taken from the labels of both splits
label_vectorizer = label_vectorization(df['label_str'].tolist())

# Step 4.3. Define Function that loads the image from path, preprocesses it and vectorizes the label
#           Returning a dictionary with format {'image': image, 'label': vectorized_label}
#           (the keys match the names of the "image" and "label" Input layers in Step 5.2,
#           and Step 6.2 reads batch["image"])
# im_path: path of the image file ; label: label string of that image
def preprocess_sample(im_path, label):
    # Task 4.3.1. Read image using tf.io.read_file

    # Task 4.3.2. Decode and convert to grayscale

    # Task 4.3.3. Convert to float32 in [0, 1] range

    # Task 4.3.4. Resize to the desired size
    # NOTE: presumably [img_height, img_width], so that the transpose in 4.3.5 yields the
    #       (img_width, img_height, 1) shape declared by the Input layer in Step 5.2.1 - confirm

    # Task 4.3.5. Transpose the image because we want the time dimension to correspond to the width of the image.

    # Task 4.3.6. Vectorize the label and return the dictionary {'image': preprocessed image, 'label': vectorized label}


# Step 4.4. Load & preprocess the training and testing datasets; formating them as tensorflow datasets
def _make_dataset(paths, label_strings):
    # Pair every image path with its label string, preprocess the pairs in parallel,
    # group them into batches and prefetch the upcoming batches
    samples = tf.data.Dataset.from_tensor_slices((paths, label_strings))
    samples = samples.map(preprocess_sample, num_parallel_calls=tf.data.AUTOTUNE)
    batches = samples.batch(batch_size)
    return batches.prefetch(buffer_size=tf.data.AUTOTUNE)


train_dataset = _make_dataset(X_train, y_train)
test_dataset = _make_dataset(X_test, y_test)

#### Section 5 - Build the Model Architecture

In [None]:
# Step 5.1. Define Connectionist Temporal Classification Loss Layer
class CTCLayer(Layer):
    """Pass-through layer that attaches the CTC loss to the model.

    Calling the layer computes the CTC loss between ``y_true`` and ``y_pred``,
    registers it through ``self.add_loss()`` and returns ``y_pred`` unchanged.
    """

    # Step 5.1.1. Define Initialization Function
    def __init__(self, name=None, **kwargs):
        # Trigger the generic "Layer" Initialization. Any additional keyword
        # arguments (such as the entries of a serialized layer config) are
        # forwarded, so the layer can be re-created from its config.
        super().__init__(name=name, **kwargs)

    # Step 5.1.2. Define the call function (which calculates the output and loss)
    def call(self, y_true, y_pred):
        # Calculate the batch size
        batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
        # Calculate the Input and Label Length (y_pred is the input and y_true is the label);
        # both are read from the second dimension of the respective tensor
        input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64")
        label_length = tf.cast(tf.shape(y_true)[1], dtype="int64")
        # Repeat the two lengths once per sample, as column vectors of shape (batch_len, 1)
        per_sample_ones = tf.ones(shape=(batch_len, 1), dtype="int64")
        input_length = input_length * per_sample_ones
        label_length = label_length * per_sample_ones
        # Compute the ctc loss value
        loss = tf.keras.backend.ctc_batch_cost(y_true, y_pred, input_length, label_length)
        # Add Computed loss to the layer using `self.add_loss()`.
        self.add_loss(loss)
        # Return the Same outputs as received from the previous layer
        return y_pred

In [None]:
# Step 5.2. Define Model Architecture

# Step 5.2.1. Define Input Layer with shape=(image width, image height, 1) and name = image
input_img = Input(shape=(img_width, img_height, 1), name="image", dtype="float32")

# Step 5.2.2. Define Input Layer with shape=(None,) and name = label
# NOTE: this rebinds the name `labels`, which held the list of label records in Step 3.1
labels = Input(name="label", shape=(None,), dtype="float32")

# Task 5.2.3. Define 1st Convolutional Block (Conv2D, and MaxPooling2D)
# Recommended Parameters: activation="relu", kernel_initializer="he_normal", padding="same", pool size = 2, stride = 2
# NOTE: start from input_img and keep the running output in `x`; Step 5.2.5 continues from `x`


# Task 5.2.4. Define 2nd Convolutional Block (Conv2D, and MaxPooling2D)
# Recommended Parameters: activation="relu", kernel_initializer="he_normal", padding="same", pool size = 2, stride = 2
# Size of 2nd Convolutional Block is usually larger
# NOTE: Step 5.2.5 reshapes with a factor of 64, so this block has to end with 64 filters


# Step 5.2.5. We have used two max pool with pool size and strides 2. Hence, downsampled feature maps are 4x smaller
# along both the width and the height.
# The number of filters in the last layer is 64. Reshape accordingly before passing the output to the LSTM part of the model
new_shape = ((img_width // 4), (img_height // 4) * 64)
x = Reshape(target_shape=new_shape, name="reshape")(x)
x = Dense(64, activation="relu", name="dense1")(x)
x = Dropout(0.2)(x)

# Task 5.2.6. Define 2x Bi-Directional LSTMS
# Recommended Parameters: return_sequences=True, dropout=0.25


# Step 5.2.7. Define Output layer; where number of outputs depends on the size of character vocabulary
# NOTE(review): the "+ 1" is presumably the extra class needed for the CTC blank symbol - confirm
x = Dense(len(label_vectorizer.char_to_num.get_vocabulary()) + 1, activation="softmax", name="dense2")(x)

# Step 5.2.8. Add CTC layer for calculating CTC loss at each step
# The layer receives the labels as y_true and the softmax output as y_pred, and returns the softmax output unchanged
output = CTCLayer(name="ctc_loss")(labels, x)

# Step 5.2.9. Define overall model
model = tf.keras.models.Model(inputs=[input_img, labels], outputs=output, name="ocr_model_v1")

# Step 5.2.10. Compile the model using Adam optimizer
# No loss is passed to compile(); the loss is the one registered by CTCLayer through add_loss()
model.compile(optimizer=tf.keras.optimizers.Adam())

# Step 5.2.11. Train the model and use the testing data for validation
history = model.fit(train_dataset, validation_data=test_dataset, epochs=epochs)

#### Section 6 - Predict & Visualize Predictions Sample

In [None]:
# Step 6.1. Get the prediction model by extracting layers till the output layer
prediction_model = tf.keras.models.Model(model.get_layer(name="image").input, model.get_layer(name="dense2").output)

# Step 6.2. Visualize the Predictions
# For 1 batch of the testing set
for batch in test_dataset.take(1):
    # Calculate the predictions by only supplying the image data
    preds = prediction_model.predict(batch["image"])
    # Decode the results of the CTC Layer (every sample uses the full number of time steps)
    input_len = np.ones(preds.shape[0]) * preds.shape[1]
    results = tf.keras.backend.ctc_decode(preds, input_length=input_len)[0][0][:, :max_label_length]
    # Unvectorize the Decoded Predictions
    pred_texts = [label_vectorizer.unvectorize(result, pred=True) for result in results]

    # Size the subplot grid from the number of predictions instead of assuming a
    # batch of exactly 16; a batch of 16 still gives a 4 x 4 grid with figsize (15, 10)
    n_cols = 4
    n_rows = max(1, (len(pred_texts) + n_cols - 1) // n_cols)
    # squeeze=False keeps `ax` two-dimensional even when there is a single row
    _, ax = plt.subplots(n_rows, n_cols, figsize=(15, 2.5 * n_rows), squeeze=False)

    # For each image/prediction within the batch; add to subplot
    for i, axis in enumerate(ax.flat):
        if i < len(pred_texts):
            # Images are stored transposed (width first), so transpose back for display
            img = (batch["image"][i, :, :, 0] * 255).numpy().astype(np.uint8).T
            axis.imshow(img, cmap="gray")
            axis.set_title(f"Prediction: {pred_texts[i]}")
        # Cells without an image are hidden as well, so they stay blank
        axis.axis("off")
plt.show()

#### Optional Section 7 - Calculate Character Error Rate for testing data

In [None]:
# Task 7.1. Calculate the Character Error Rate for the test_dataset