# Base Model
---
This model is taken from the Recursive Recurrent Nets with Attention Modeling for OCR in the Wild paper by Lee et al. In their paper, Lee construct a recursive recurrent neural network with attention modeling. For our project we want to first understand this model architecture, and then try to improve upon it. Later we will provide an ethical analysis for OCR technology.

In [1]:
import numpy as np
import string
import os
import torch
from torch import nn

From the paper, the base model: 
> has 8 convolutional layer with 64, 64, 128, 128, 256, 256, 512 and 512 channels, and each convolutional layer uses kernel with a 3 × 3 spatial extent. Convo- lutions are performed with stride 1, zero padding, and ReLU activation function. 2 × 2 max pooling follows the second, fourth, and sixth convolutional layers. The two fully con- nected layers have 4096 units. The input is a resized 32 × 100 gray scale image.

In [2]:
softmax_classes = len(string.printable)
print(string.printable)

eow = torch.zeros(size=(len(string.printable) + 1,))
eow[len(string.printable)] = 1

0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ 	



In [3]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

from base_model import BaseModel
base_cnn_model = BaseModel(eow=eow).to(device)

x = torch.rand(1, 1, 32, 100).to(device)
x = base_cnn_model(x)

preds = torch.argmax(x[0].T, dim=1).tolist()  # [0] because its the first item in a batch size of 1
for pred in preds:
    print(string.printable[pred], end='')

;;;;;;;;;;;;;;;;;;;;;;;

In [4]:
from IIIT5K.dataset import IIIT5KDataset
from torch.utils.data import DataLoader

train_set = IIIT5KDataset(split='train')
val_set = IIIT5KDataset(split='val')
test_set = IIIT5KDataset(split='test')

train_loader = DataLoader(train_set, batch_size=256, shuffle=True)
val_loader = DataLoader(val_set, batch_size=256, shuffle=True)
test_loader = DataLoader(test_set, batch_size=256, shuffle=True)

In [5]:
def train_model(model_name: str, num_epochs: int):
    model = BaseModel(eow=eow).to(device)
    print('Total Parameters:', sum(p.numel() for p in model.parameters()))
    print('Trainable Parameters:', sum(p.numel() for p in model.parameters() if p.requires_grad))
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=.002)
    running_loss = 0.
    for epoch in range(num_epochs):
        for step, (image, label) in enumerate(train_loader):
            image = torch.unsqueeze(image, dim=1).to(device)
            label = torch.stack(label, dim=0).to(device)

            optimizer.zero_grad()

            output = model(image)
            loss = criterion(output, label.T)
            loss.backward()
            
            torch.nn.utils.clip_grad_norm_(model.parameters(), 10.)

            optimizer.step()

            # Gather data and report
            running_loss += loss.item()
        last_loss = running_loss / step # loss per batch
        
        acc, total = 0., 0.
        for o, l in zip(output, label):
            o = torch.argmax(o, dim=1)
            for o_, l_ in zip(o, l):
                if l_ == 100:
                    break
                if o_ == l_:
                    acc += 1.
                total += 1.
                
        acc = acc / total
        print(f'epoch {epoch+1} -> train_loss: {last_loss:.4f}, train_acc: {acc:.4f}')

        running_loss = 0.

        # Validation
        val_loss = 0.
        val_acc = 0.
        val_total = 0.
        with torch.no_grad():
            for step, (image, label) in enumerate(val_loader):
                image = torch.unsqueeze(image, dim=1).to(device)
                label = torch.stack(label, dim=0).to(device)

                output = model(image)
                loss = criterion(output, label.T)
                val_loss += loss.item()

                for o, l in zip(output, label):
                    o = torch.argmax(o, dim=1)
                    for o_, l_ in zip(o, l):
                        if l_ == 100:
                            break
                        if o_ == l_:
                            val_acc += 1.
                        val_total += 1.
            val_acc = val_acc / val_total
            val_loss = val_loss / step
            print(f'epoch {epoch+1} -> val_loss: {val_loss:.4f}, val_acc: {val_acc:.4f}')

    return model

In [6]:
model = train_model(model_name="R2AM", num_epochs=30)

Total Parameters: 140792256
Trainable Parameters: 140792256
epoch 1 -> train_loss: 4.9249, train_acc: 0.0079
epoch 1 -> val_loss: 9.2274, val_acc: 0.0090
epoch 2 -> train_loss: 4.9179, train_acc: 0.0250
epoch 2 -> val_loss: 9.2144, val_acc: 0.0105
epoch 3 -> train_loss: 4.9109, train_acc: 0.0198
epoch 3 -> val_loss: 9.2015, val_acc: 0.0118
epoch 4 -> train_loss: 4.9039, train_acc: 0.0000
epoch 4 -> val_loss: 9.1885, val_acc: 0.0096
epoch 5 -> train_loss: 4.8969, train_acc: 0.0075
epoch 5 -> val_loss: 9.1754, val_acc: 0.0034
epoch 6 -> train_loss: 4.8898, train_acc: 0.0183
epoch 6 -> val_loss: 9.1622, val_acc: 0.0033
epoch 7 -> train_loss: 4.8826, train_acc: 0.0199
epoch 7 -> val_loss: 9.1490, val_acc: 0.0097
epoch 8 -> train_loss: 4.8755, train_acc: 0.0000
epoch 8 -> val_loss: 9.1356, val_acc: 0.0079
epoch 9 -> train_loss: 4.8682, train_acc: 0.0000
epoch 9 -> val_loss: 9.1221, val_acc: 0.0029
epoch 10 -> train_loss: 4.8609, train_acc: 0.0000
epoch 10 -> val_loss: 9.1084, val_acc: 0.014