In [1]:
# Mount Google Drive at /content/drive; every data, code and checkpoint path
# used below lives under that mount point. Requires the Colab runtime.
import os
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
# Notebook-wide settings read by later cells.
# IMG_SIZE: side (pixels) of the square image produced by HWDBDataset.
IMG_SIZE = 128
# EPOCHS: number of passes of the training loop.
EPOCHS   = 7
# BATCH: batch size of the training DataLoader; also part of checkpoint names.
BATCH    = 512
# ARC: True selects ArcFaceLoss (with its own optimizer), False CrossEntropyLoss.
ARC      = True

In [3]:
import sys

# Directory on Drive that holds the course helper modules (data_reader etc.);
# it has to be on sys.path before those modules can be imported.
curr_dir = r'/content/drive/My Drive/course_ocr/task2'
if curr_dir in sys.path:
    # Already registered (e.g. the cell was re-run) - do not add a duplicate.
    print(curr_dir, 'in sys.path')
else:
    print(curr_dir, 'added to sys.path')
    sys.path.append(curr_dir)

/content/drive/My Drive/course_ocr/task2 added to sys.path


In [4]:
# Helpers from the course repository (importable because the task2 directory
# was appended to sys.path above).
from data_reader import Vocabulary, HWDBDatasetHelper, ArchivedHWDBReader

# Locations of the zipped train / test image archives on Drive; adjust to
# your own layout.
train_path = r'/content/drive/My Drive/HWDBTrain/Images.zip'
test_path = r'/content/drive/My Drive/HWDBTest/Images.zip'
# NOTE(review): gt_path is not referenced by any other cell visible in this
# notebook - presumably consumed by the evaluation script; confirm.
gt_path = './gt.txt'

# Simple CNN baseline

PyTorch is required for this baseline implementation.

## Baseline method

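- The original baseline naively resized to 32x32 (DON'T DO THIS IN YOUR WORK, try to save geometry somehow, it is important); this notebook instead scales the longer side to `IMG_SIZE`, keeping the aspect ratio, and pads to a square with white
- Train a CNN (the original baseline used a LeNet-like network; this notebook fine-tunes a pretrained ResNet-18 with an ArcFace loss)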
- Enjoy :)

In [5]:
import cv2
import numpy as np

### Data tools

In [6]:
# Open the training archive and wrap it in a dataset helper; the helper is
# what provides size(), get_item() and the vocabulary used further down.
train_reader = ArchivedHWDBReader(train_path)
train_reader.open()
train_helper = HWDBDatasetHelper(train_reader)

In [7]:
# Split the full training helper into train / validation helpers.
# NOTE(review): this rebinds train_helper to the train part of its own split,
# so the cell is not idempotent - running it a second time splits the already
# reduced train part again. Re-run the helper-creation cell first if needed.
train_helper, val_helper = train_helper.train_val_split()

In [8]:
# Sanity check: number of samples in the train and validation parts.
train_helper.size(), val_helper.size()

(2578433, 644609)

In [9]:
import torch

from torch.utils.data import Dataset, DataLoader
from torch import nn

class HWDBDataset(Dataset):
    """Torch dataset that letterboxes HWDB images onto a white square.

    Every sample is scaled so that its longer side equals the target size
    (aspect ratio preserved), centred on a white square canvas, expanded to
    three channels and shifted/scaled with ``(pixel - 127.5) / 255``.

    Args:
        helper: Source of ``(image, label)`` pairs. Only ``size()`` and
            ``get_item(idx)`` are used here.
        img_size: Side of the square output in pixels. ``None`` (the default)
            means "read the notebook-level ``IMG_SIZE`` at access time",
            which is what this class always did before the parameter existed.
    """

    def __init__(self, helper: HWDBDatasetHelper, img_size=None):
        self.helper = helper
        self.img_size = img_size

    def __len__(self):
        """Return the number of samples reported by the helper."""
        return self.helper.size()

    def __getitem__(self, idx):
        """Return ``(image, label)`` for sample ``idx``.

        The image is a channels-last ``(size, size, 3)`` float array; the
        training / evaluation loops move the channel axis themselves with
        ``torch.swapaxes(X, 1, 3)``.
        """
        im, label = self.helper.get_item(idx)

        desired_size = IMG_SIZE if self.img_size is None else self.img_size

        old_size = im.shape[:2]  # (height, width)
        ratio = float(desired_size) / max(old_size)
        # Clamp to 1 px: for very elongated images the short side would
        # otherwise truncate to 0 and cv2.resize would reject the size.
        new_size = tuple(max(1, int(x * ratio)) for x in old_size)

        # cv2.resize expects the target size as (width, height).
        im = cv2.resize(im, (new_size[1], new_size[0]))

        # Distribute the remaining pixels evenly; the odd pixel, if any,
        # goes to the bottom / right edge.
        delta_w = desired_size - new_size[1]
        delta_h = desired_size - new_size[0]
        top, bottom = delta_h // 2, delta_h - (delta_h // 2)
        left, right = delta_w // 2, delta_w - (delta_w // 2)

        color = [255, 255, 255]  # white padding
        new_im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT,
            value=color)

        # assumes the helper yields single-channel images (GRAY2RGB is
        # applied unconditionally) - TODO confirm against data_reader
        im_rgb = cv2.cvtColor(new_im, cv2.COLOR_GRAY2RGB)
        return (im_rgb - 127.5) / 255., label

In [10]:
# Wrap both helpers so they can be fed to torch DataLoaders.
train_dataset = HWDBDataset(train_helper)
val_dataset = HWDBDataset(val_helper)

### Model & training

In [11]:
from torchvision import models

# Start from a pretrained ResNet-18 and replace its final fully connected layer.
model_ft = models.resnet18(pretrained=True)
num_ftrs = model_ft.fc.in_features
# The new head outputs a 2048-dimensional embedding rather than class scores;
# this width has to match the embedding_size passed to ArcFaceLoss further down.
model_ft.fc = nn.Linear(num_ftrs, 2048)

# The model is moved to the GPU in a later cell via model.cuda().

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth


  0%|          | 0.00/44.7M [00:00<?, ?B/s]

In [12]:
# `model` is the name used by the rest of the notebook.
model =  model_ft
# eval() is the last expression of the cell, so its return value (the module)
# is what gets printed as the architecture summary; train_epoch switches the
# model back to training mode before it is optimised.
model.eval()

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [13]:
# model(torch.tensor(train_dataset[0][0], dtype=torch.float32).view(1, 3, 64, 64))

In [14]:
# Move the network to the GPU; the training / evaluation code below sends its
# input batches to CUDA as well.
model = model.cuda()

In [15]:
# Training batches are shuffled and the incomplete last batch is dropped;
# validation keeps the dataset order and uses a fixed batch size of 2048.
train_loader = DataLoader(train_dataset, batch_size=BATCH, shuffle=True, drop_last=True)
val_loader = DataLoader(val_dataset, batch_size=2048, shuffle=False)

In [16]:
!pip install pytorch-metric-learning

Collecting pytorch-metric-learning
  Downloading pytorch_metric_learning-1.3.0-py3-none-any.whl (109 kB)
[?25l[K     |███                             | 10 kB 32.5 MB/s eta 0:00:01[K     |██████                          | 20 kB 41.0 MB/s eta 0:00:01[K     |█████████                       | 30 kB 36.8 MB/s eta 0:00:01[K     |████████████                    | 40 kB 26.8 MB/s eta 0:00:01[K     |███████████████                 | 51 kB 23.0 MB/s eta 0:00:01[K     |██████████████████              | 61 kB 26.4 MB/s eta 0:00:01[K     |█████████████████████           | 71 kB 24.7 MB/s eta 0:00:01[K     |████████████████████████        | 81 kB 25.7 MB/s eta 0:00:01[K     |███████████████████████████     | 92 kB 27.8 MB/s eta 0:00:01[K     |██████████████████████████████  | 102 kB 30.0 MB/s eta 0:00:01[K     |████████████████████████████████| 109 kB 30.0 MB/s 
Installing collected packages: pytorch-metric-learning
Successfully installed pytorch-metric-learning-1.3.0


In [17]:
from pytorch_metric_learning import losses

# Select the training objective.
# ArcFace keeps its own learnable parameters (they are handed to a dedicated
# Adam optimizer below), and its embedding_size must equal the width of the
# model's final layer (2048).
if ARC:
    loss_fn = losses.ArcFaceLoss(
        num_classes=train_helper.vocabulary.num_classes(),
        embedding_size=2048,
    ).to(torch.device('cuda'))
    loss_optimizer = torch.optim.Adam(loss_fn.parameters(), lr=1e-4)
else:
    loss_fn = torch.nn.CrossEntropyLoss()
    # No loss-side parameters to optimise. The name must still exist because
    # the training cell passes loss_optimizer to train_epoch unconditionally;
    # leaving it undefined made that cell fail with a NameError.
    loss_optimizer = None

# For classification losses, you can get logits using the get_logits function:
# loss_func = losses.SomeClassificationLoss()
# logits = loss_func.get_logits(embeddings)

In [18]:
# Adam over the network weights only; when ArcFace is used, the parameters of
# the loss are stepped by the separate loss_optimizer created with loss_fn.
optim = torch.optim.Adam(model.parameters(), lr=0.001)

In [19]:
from tqdm import tqdm


def run_validation(val_loader: DataLoader, model: nn.Module, loss_fn, n_steps=None):
    """Compute top-1 accuracy of ``model`` over ``val_loader``.

    Args:
        val_loader: Iterable of ``(X, y)`` batches; ``X`` is channels-last
            (axes 1 and 3 are swapped before the forward pass) and ``y``
            holds integer class indices.
        model: Network to evaluate. It is switched to eval mode and is left
            in that mode on return.
        loss_fn: If it exposes ``get_logits`` (e.g. ArcFaceLoss) the model
            output is treated as an embedding and converted to class logits
            with it; otherwise the model output is used as logits directly.
        n_steps: Maximum number of batches to evaluate. ``None`` evaluates
            the whole loader and shows a tqdm progress bar.

    Returns:
        Fraction of correctly classified samples as a float, or ``0.0`` when
        no sample was evaluated (empty loader or ``n_steps == 0``).
    """
    model.eval()
    n_good = 0
    n_all = 0
    batches = val_loader
    if n_steps is None:
        n_steps = len(val_loader)
        batches = tqdm(val_loader)

    # Feed inputs to whatever device the model lives on instead of assuming
    # CUDA; a parameter-less model falls back to CUDA when it is available.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    to_logits = getattr(loss_fn, 'get_logits', None)

    with torch.no_grad():
        for batch, (X, y) in enumerate(batches):
            if batch == n_steps:
                break
            X = torch.swapaxes(X, 1, 3)
            outputs = model(X.to(torch.float32).to(device))
            logits = to_logits(outputs) if to_logits is not None else outputs
            classes = torch.argmax(logits, dim=1).cpu()
            # Vectorised comparison instead of a Python-level sum over numpy.
            n_good += int((classes == y.cpu()).sum())
            n_all += len(classes)

    if n_all == 0:
        # Nothing was evaluated; avoid a ZeroDivisionError.
        return 0.0
    return n_good / n_all


def train_epoch(train_loader: DataLoader, val_loader: DataLoader, model: nn.Module, optim, loss_fn, loss_optimizer=None):
    """Run one optimisation pass over ``train_loader``.

    Args:
        train_loader: Iterable of ``(X, y)`` batches; ``X`` is channels-last
            (axes 1 and 3 are swapped before the forward pass).
        val_loader: Unused; kept so existing call sites keep working.
        model: Network to train. It is put into training mode.
        optim: Optimizer for the parameters of ``model``.
        loss_fn: Callable ``loss_fn(outputs, labels)`` returning a scalar loss.
        loss_optimizer: Optional optimizer for parameters owned by the loss
            itself (e.g. ArcFace class weights). ``None`` skips it.

    Returns:
        None.
    """
    # Feed inputs to whatever device the model lives on instead of assuming
    # CUDA; a parameter-less model falls back to CUDA when it is available.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Nothing inside the loop changes the mode, so set it once.
    model.train()
    for X, y in tqdm(train_loader):
        X = torch.swapaxes(X, 1, 3)
        outputs = model(X.to(torch.float32).to(device))

        loss = loss_fn(outputs, y.to(torch.long).to(device))

        optim.zero_grad()
        # Decide from the argument rather than the global ARC flag, so the
        # function does what its parameters say.
        if loss_optimizer is not None:
            loss_optimizer.zero_grad()

        loss.backward()

        if loss_optimizer is not None:
            loss_optimizer.step()
        optim.step()

In [None]:
# Train for EPOCHS epochs, validating and checkpointing after each one.
for epoch in range(EPOCHS):
    print(f'Epoch {epoch}:')
    # Only touch loss_optimizer when ArcFace is active.
    train_epoch(train_loader, val_loader, model, optim, loss_fn, loss_optimizer if ARC else None)
    accuracy = run_validation(val_loader, model, loss_fn)
    print(f'accuracy: {accuracy}')
    torch.save(model.state_dict(), f'/content/drive/My Drive/course_ocr/task2/Checkpoints/baseline_ARC_{BATCH}_{IMG_SIZE}_epoch{epoch}.pth')
    if ARC:
        # With ArcFace the class weights live in loss_fn, not in the model;
        # predictions are made through loss_fn.get_logits, so the model
        # checkpoint alone is not enough to resume or run inference.
        torch.save(loss_fn.state_dict(), f'/content/drive/My Drive/course_ocr/task2/Checkpoints/baseline_ARC_{BATCH}_{IMG_SIZE}_epoch{epoch}_loss.pth')

Epoch 0:


100%|██████████| 5036/5036 [1:20:31<00:00,  1.04it/s]


accuracy: 0.9355733475641823
Epoch 1:


100%|██████████| 5036/5036 [1:16:34<00:00,  1.10it/s]


accuracy: 0.95117970738851
Epoch 2:


100%|██████████| 5036/5036 [1:16:36<00:00,  1.10it/s]


accuracy: 0.952003462564128
Epoch 3:


100%|██████████| 5036/5036 [1:16:35<00:00,  1.10it/s]


accuracy: 0.9557902542471483
Epoch 4:


100%|██████████| 5036/5036 [1:16:34<00:00,  1.10it/s]


accuracy: 0.9605031887547335
Epoch 5:


 81%|████████  | 4081/5036 [1:02:04<14:31,  1.10it/s]

In [None]:
# Final checkpoint of the network weights.
torch.save(model.state_dict(), f'/content/drive/My Drive/course_ocr/task2/Checkpoints/baseline_ARC_{BATCH}_{IMG_SIZE}.pth')
if ARC:
    # The ArcFace class weights are parameters of loss_fn and are needed by
    # loss_fn.get_logits at prediction time, so store them next to the model.
    torch.save(loss_fn.state_dict(), f'/content/drive/My Drive/course_ocr/task2/Checkpoints/baseline_ARC_{BATCH}_{IMG_SIZE}_loss.pth')

### Evaluation

In [None]:

# File the predictions are written to.
pred_path = r'/content/drive/My Drive/course_ocr/task2/predARC18.txt'

# Open the test archive; the helper is built with prefix='Test'
# (NOTE(review): exact meaning of the prefix is defined in data_reader - confirm).
test_reader = ArchivedHWDBReader(test_path)
test_reader.open()
test_helper = HWDBDatasetHelper(test_reader, prefix='Test')

In [None]:
# shuffle=False keeps predictions in dataset order, which the writer cell
# relies on when it pairs preds[idx] with test_helper.namelist[idx].
test_dataset = HWDBDataset(test_helper)
test_loader = DataLoader(test_dataset, batch_size=2048, shuffle=False)

In [None]:
# Predict a class index for every test image, in loader order.
preds = []
model.eval()
with torch.no_grad():
    for X, _ in tqdm(test_loader):
        # Same axis swap as in training / validation (channels-last input).
        X = X.swapaxes(1, 3)
        embeddings = model(X.float().cuda())
        # ArcFace turns the embeddings into per-class logits.
        logits = loss_fn.get_logits(embeddings)
        classes = logits.argmax(dim=1).cpu().numpy()
        preds.extend(classes)

In [None]:
# Write one "<image name> <predicted class>" line per test sample.
# The encoding is pinned to UTF-8 so the output does not depend on the
# platform default (presumably the class labels can be non-ASCII - verify
# against the vocabulary).
with open(pred_path, 'w', encoding='utf-8') as f_pred:
    for idx, pred in enumerate(preds):
        name = test_helper.namelist[idx]
        # Class indices are mapped back through the training vocabulary.
        cls = train_helper.vocabulary.class_by_index(pred)
        print(name, cls, file=f_pred)

In [None]:
! cd /content/drive/'My Drive'/course_ocr/task2/course_ocr_t2 ; python -m evaluate