# Base Model
---
This model is taken from the Recursive Recurrent Nets with Attention Modeling for OCR in the Wild paper by Lee et al. In their paper, Lee construct a recursive recurrent neural network with attention modeling. For our project we want to first understand this model architecture, and then try to improve upon it. Later we will provide an ethical analysis for OCR technology.

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import string
import pickle

import torch

From the paper, the base model: 
> has 8 convolutional layer with 64, 64, 128, 128, 256, 256, 512 and 512 channels, and each convolutional layer uses kernel with a 3 × 3 spatial extent. Convo- lutions are performed with stride 1, zero padding, and ReLU activation function. 2 × 2 max pooling follows the second, fourth, and sixth convolutional layers. The two fully con- nected layers have 4096 units. The input is a resized 32 × 100 gray scale image.

In [2]:
softmax_classes = len(string.printable)
print(string.printable)

eow = torch.zeros(size=(len(string.printable) + 1,))
eow[len(string.printable)] = 1
eow

0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ 	



tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.])

In [3]:
from base_model import BaseModel

base_cnn_model = BaseModel(eow=eow)

x = torch.zeros(size=(1, 1, 32, 100))
x = base_cnn_model(x)

preds = torch.argmax(x, dim=2).tolist()[0]  # [0] because its the first item in a batch size of 1
for pred in preds:
    print(string.printable[pred], end='')

$$$$$$$$$$$

In [9]:
# --------------------------------- TESTING ENDS HERE --------------------------------- #

In [4]:
with open('captcha/shard_0.pkl', 'rb') as file:
    image_shard, label_shard = pickle.load(file)

label_encodings = []
for label in label_shard:
    encoding = np.zeros(shape=(len(label) + 1, len(string.printable)))  # +1 for <eos>
    for i, letter in enumerate(label):
        pos = string.printable.index(letter)
        encoding[i, pos] = 1
    label_encodings.append(encoding)

image_shard = np.expand_dims(np.asarray(image_shard), axis=3)
label_encodings = np.asarray(label_encodings)
image_shard.shape, label_encodings.shape

((11306, 32, 100, 1), (11306, 6, 100))

In [18]:
base_cnn_model.fit(image_shard, label_encodings, batch_size=256, epochs=30)

Epoch 1/30


ValueError: in user code:

    File "/home/spencer/anaconda3/lib/python3.9/site-packages/keras/engine/training.py", line 1021, in train_function  *
        return step_function(self, iterator)
    File "/home/spencer/anaconda3/lib/python3.9/site-packages/keras/engine/training.py", line 1010, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/spencer/anaconda3/lib/python3.9/site-packages/keras/engine/training.py", line 1000, in run_step  **
        outputs = model.train_step(data)
    File "/home/spencer/anaconda3/lib/python3.9/site-packages/keras/engine/training.py", line 860, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "/home/spencer/anaconda3/lib/python3.9/site-packages/keras/engine/training.py", line 918, in compute_loss
        return self.compiled_loss(
    File "/home/spencer/anaconda3/lib/python3.9/site-packages/keras/engine/compile_utils.py", line 201, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "/home/spencer/anaconda3/lib/python3.9/site-packages/keras/losses.py", line 141, in __call__
        losses = call_fn(y_true, y_pred)
    File "/home/spencer/anaconda3/lib/python3.9/site-packages/keras/losses.py", line 245, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "/home/spencer/anaconda3/lib/python3.9/site-packages/keras/losses.py", line 1789, in categorical_crossentropy
        return backend.categorical_crossentropy(
    File "/home/spencer/anaconda3/lib/python3.9/site-packages/keras/backend.py", line 5083, in categorical_crossentropy
        target.shape.assert_is_compatible_with(output.shape)

    ValueError: Shapes (None, 6, 100) and (None, 100) are incompatible


Lee et al continue saying:
> For the character-level language modeling, we use RNNs with 1024 hidden units equipped with hyperbolic tangent activation function.

I believe it is within the RNN that the activation layer is called, not the CNN like I initially believed.

In [17]:
from keras.layers.dense_attention import Attention
if 'base_rnn_model' in locals():
    del base_rnn_model

input_shape = (100, 32, 1)

base_rnn_model = keras.Sequential(
    [
     keras.Input(shape = input_shape),
     layers.Dense(units = 1024, activation = 'tanh', name='rnn_layer1'),
     layers.Attention(['rnn_layer1']),
     layers.Dense(units = 1024, activation = 'tanh'),
     layers.Dense(softmax_classes, activation = "softmax")
    ],
    name = 'base_rnn_model'
)

base_rnn_model.summary()

ValueError: ignored