# Base Model
---
This model is taken from the Recursive Recurrent Nets with Attention Modeling for OCR in the Wild paper by Lee et al. In their paper, Lee construct a recursive recurrent neural network with attention modeling. For our project we want to first understand this model architecture, and then try to improve upon it. Later we will provide an ethical analysis for OCR technology.

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import string
import os

import torch
from torch import nn

From the paper, the base model: 
> has 8 convolutional layer with 64, 64, 128, 128, 256, 256, 512 and 512 channels, and each convolutional layer uses kernel with a 3 × 3 spatial extent. Convo- lutions are performed with stride 1, zero padding, and ReLU activation function. 2 × 2 max pooling follows the second, fourth, and sixth convolutional layers. The two fully con- nected layers have 4096 units. The input is a resized 32 × 100 gray scale image.

In [2]:
softmax_classes = len(string.printable)
print(string.printable)

eow = torch.zeros(size=(len(string.printable) + 1,))
eow[len(string.printable)] = 1

0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ 	



In [3]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

from base_model import BaseModel
base_cnn_model = BaseModel(eow=eow).to(device)

x = torch.zeros(size=(1, 1, 32, 100)).to(device)
x = base_cnn_model(x)

preds = torch.argmax(x, dim=2).tolist()[0]  # [0] because its the first item in a batch size of 1
for pred in preds:
    if pred == 100:
        break
    print(string.printable[pred], end='')

cuda:0
rrrrrrrrrrrrrrrrrrrrrrr

In [4]:
from IIIT5K.dataset import IIIT5KDataset
from torch.utils.data import DataLoader

train_set = IIIT5KDataset(split='train')
val_set = IIIT5KDataset(split='val')
test_set = IIIT5KDataset(split='test')

train_loader = DataLoader(train_set, batch_size=256, shuffle=True)
val_loader = DataLoader(val_set, batch_size=256, shuffle=True)
test_loader = DataLoader(test_set, batch_size=256, shuffle=True)

In [5]:
def train_model(model_name: str, num_epochs: int):
    model = BaseModel(eow=eow).to(device)
    model.train()
    print('Total Parameters:', sum(p.numel() for p in model.parameters()))
    print('Trainable Parameters:', sum(p.numel() for p in model.parameters() if p.requires_grad))
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.002)
    running_loss = 0
    for epoch in range(num_epochs):
        for step, (image, label) in enumerate(train_loader):
            image = torch.unsqueeze(image, dim=1).to(device)
            label = torch.stack(label, dim=1).to(device)

            optimizer.zero_grad()

            output = model(image)
            loss = criterion(output, label)
            loss.backward()

            optimizer.step()

            # Gather data and report
            running_loss += loss.item()
        last_loss = running_loss / step # loss per batch
        print('epoch {} loss: {}'.format(epoch + 1, last_loss))
        running_loss = 0.

        # Validation
        val_loss = 0.
        val_acc = 0.
        val_total = 0.
        with torch.no_grad():
            for step, (image, label) in enumerate(val_loader):
                image = torch.unsqueeze(image, dim=1).to(device)
                label = torch.stack(label, dim=1).to(device)

                output = model(image)
                loss = criterion(output, label)
                val_loss += loss.item()

                for o, l in zip(output, label):
                    o = torch.argmax(o, dim=1)
                    l = torch.argmax(l, dim=1)
                    for o_, l_ in zip(o, l):
                        if o_ == l_:
                            val_acc += 1.
                        val_total += 1.
            print('epoch {} val loss: {}'.format(epoch + 1, val_loss / step))
            print('epoch {} val acc: {}'.format(epoch + 1, val_acc / val_total))

    return model

In [6]:
model = train_model(model_name="R2AM", num_epochs=30)

Total Parameters: 62982693
Trainable Parameters: 62982693
epoch 1 loss: 0.7616255756066774
epoch 1 val loss: 1.4280479213985031
epoch 1 val acc: 0.0011304347826086956
epoch 2 loss: 0.7616255743540947
epoch 2 val loss: 1.4280479193632285
epoch 2 val acc: 0.0011304347826086956
epoch 3 loss: 0.761625574093816
epoch 3 val loss: 1.428047919400566
epoch 3 val acc: 0.0011304347826086956
epoch 4 loss: 0.7616255735439978
epoch 4 val loss: 1.4280479161658035
epoch 4 val acc: 0.0011304347826086956


KeyboardInterrupt: 