In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
from torch.utils.tensorboard import SummaryWriter
import torchvision
from torchvision import datasets, models, transforms as T
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader

In [12]:
from tqdm import tqdm
import pathlib
import os
from PIL import Image
import string
from typing import Tuple
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Running training on [{device}]")

Running training on [cuda]


In [13]:
class CustomDataset(Dataset):
	def __init__(self, root_path, type="train"):
		self.root_path = root_path
		self.type = type
		self.images_paths = list(pathlib.Path(self.root_path + "./images").glob('*.png'))
		self.transforms = {
			'train' : T.Compose([
				T.Resize((200,40)),
				T.RandomRotation(20),
				T.GaussianBlur(3),
				T.ToTensor()
			]),
			'valid' : T.Compose([
				T.ToTensor()
			])
		}
		self.alphabet = string.ascii_letters + string.digits
		self.alphabet_size = len(self.alphabet)

	def __getitem__(self, idx):
		image_path = self.images_paths[idx]
		sample_name = str(image_path).split(os.sep)[-1].split(".")[0]
		text_path = self.root_path + "/transcripts/" + sample_name + ".txt"

		image = Image.open(image_path).convert("RGB")
		with open(text_path) as f:
			text = f.read()

		image = self.transforms[self.type](image)
		text = self.wordToTensor(text)
		return image, text
		

	def __len__(self):
		return len(self.images_paths)

	def letterToIndex(self, letter):
		return self.alphabet.find(letter)

	def letterToTensor(self, letter):
		tensor = torch.zeros(1, n_letters)
		tensor[0][letterToIndex(letter)] = 1
		return tensor

	def wordToTensor(self, word):
		tensor = torch.zeros(len(word), self.alphabet_size)
		for li, letter in enumerate(word):
			tensor[li][self.letterToIndex(letter)] = 1
		return tensor


In [22]:
class DataHandler:
	def __init__(self, run_config):
		self._training_dataset = None
		self._validation_dataset = None
		self._run_config = run_config

		self._load_datasets()
		
	def _load_datasets(self):
		self._training_dataset = CustomDataset("dataset/training")
		self._validation_dataset = CustomDataset("dataset/validation")

	def get_data_loaders(self) -> Tuple[DataLoader]:
		return (
			DataLoader(self._training_dataset, batch_size=self._run_config["batch_size"], shuffle=True, pin_memory=True), 
			DataLoader(self._validation_dataset, batch_size=self._run_config["batch_size"], shuffle=True, pin_memory=True)
		)

	def get_datasets(self) -> Tuple[Dataset]:
		return self._training_dataset, self._validation_dataset

	def get_datasets_sizes(self) -> Tuple[int]:
		return len(self._training_dataset), len(self._validation_dataset)

In [23]:
BATCH_SIZE = 128
WORKERS = 1
EPOCHS = 10

In [24]:
data_handler = DataHandler(run_config = {
    "batch_size": BATCH_SIZE,
    "workers": WORKERS
})
train_loader, validation_loader = data_handler.get_data_loaders()

In [25]:
class TranscribeModel(nn.Module):
    def __init__(self):
        super(TranscribeModel, self).__init__()
        self.conv_block1 = nn.Sequential(
			nn.Conv2d(in_channels=3, out_channels=16, kernel_size=2, padding=1),
			nn.MaxPool2d(kernel_size=2),
			nn.BatchNorm2d(16),
			nn.LeakyReLU(0.2, inplace=True),

			nn.Conv2d(in_channels=16, out_channels=32, kernel_size=2, padding=1),
			nn.MaxPool2d(kernel_size=2),
			nn.BatchNorm2d(32),
			nn.LeakyReLU(0.2, inplace=True)
		)

        self.rnn_block1 = nn.Sequential(
            nn.LSTM(input_size=32, hidden_size=62, num_layers=2, batch_first=True, bidirectional=True)
        )

    def forward(self, input):
        out = self.conv_block1(input)
        # (B, S, ) 
        out = out.view(out.size(0), out.size(1), -1)
        print(out.shape)

        out, (_, _) = self.rnn_block1(out)
        out = F.Softmax(dim=2)
        return out

In [26]:
model = TranscribeModel()
model.to(device)
optimizer = optim.AdamW(
    model.parameters(), 
    lr=0.0001, 
    betas=(0.9, 0.999), 
    eps=1e-08, 
    weight_decay=1e-4
)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=2)
loss_criterion = nn.CTCLoss()

TranscribeModel(
  (conv_block1): Sequential(
    (0): Conv2d(3, 16, kernel_size=(2, 2), stride=(1, 1), padding=(1, 1))
    (1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): LeakyReLU(negative_slope=0.2, inplace=True)
    (4): Conv2d(16, 32, kernel_size=(2, 2), stride=(1, 1), padding=(1, 1))
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): LeakyReLU(negative_slope=0.2, inplace=True)
  )
  (rnn_block1): Sequential(
    (0): LSTM(32, 62, num_layers=2, batch_first=True, bidirectional=True)
  )
)

In [27]:
input = iter(train_loader).next()
model(input)

RuntimeError: stack expects each tensor to be equal size, but got [1, 62] at entry 0 and [8, 62] at entry 1

In [None]:
for epoch in range(EPOCHS):
	print('Epoch {}/{}'.format(epoch, run.num_epochs))
	print('-' * 10)

	########### Training step ###########
	model.train()
	training_loss = []
	running_loss = 0.0
	running_corrects = 0
			
	for i, data in enumerate(tqdm(train_loader, desc=f"Epoch [{epoch + 1}] progress")):

		x_batch, label_batch = data
		x_batch, label_batch = x_batch.to(device), label_batch.to(device)

		optimizer.zero_grad()
		outputs = model(x_batch)
		_, preds = torch.max(outputs, 1)

		loss = loss_criterion(outputs, label_batch)

		loss.backward()
		optimizer.step()
				
		# statistics
		running_loss += loss.item() * x_batch.size(0)
		running_corrects += torch.sum(preds == label_batch.detach())
		training_loss.append(loss.item())

	epoch_loss = running_loss / training_dataset_size
	epoch_acc = running_corrects.double() / training_dataset_size

	# tensorboard logging
	writer.add_scalar("Loss/train", epoch_loss, epoch)
	writer.add_scalar("Accuracy/train", epoch_acc, epoch)

	print('Training step => Loss: {:.4f} Acc: {:.4f}'.format(
		epoch_loss, epoch_acc
	))

	scheduler.step(epoch_acc)


	########### Validation step ###########
	model.eval()
	validation_loss = []
	running_loss = 0.0
	running_corrects = 0

	for i, data in enumerate(validation_loader):
		with torch.no_grad():
			x_batch, label_batch = data
			x_batch, label_batch = x_batch.to(device), label_batch.to(device)

			outputs = model(x_batch)
			_, preds = torch.max(outputs, 1)
			loss = loss_criterion(...)

			running_loss += loss.item() * x_batch.size(0)
			running_corrects += torch.sum(preds == label_batch.detach())
			validation_loss.append(loss.item())
			
	epoch_loss = running_loss / validation_dataset_size
	epoch_acc = running_corrects.double() / validation_dataset_size

	# tensorboard logging
	writer.add_scalar("Loss/validation", epoch_loss, epoch)
	writer.add_scalar("Accuracy/validation", epoch_acc, epoch)

	print('Evaluation step => Loss: {:.4f} Acc: {:.4f}'.format(
		epoch_loss, epoch_acc
	))

	#Save the best model based on accuracy
	if epoch_acc > best_acc:
		best_acc = epoch_acc
		best_config = f"{run}"
		best_model_wts = copy.deepcopy(model.state_dict())

	#Checkpoint
	torch.save({
		"epoch": epoch,
		"model_state_dict": model.state_dict(),
		"optimizer_state_dict": optimizer.state_dict()
	}, "./checkpoints/ckp.pt")



time_elapsed = time.time() - since
print('Training complete in {:.0f}m {:.0f}s'.format(
	time_elapsed // 60, time_elapsed % 60
))
print('Best (so far) validation Acc: {:4f}'.format(best_acc))

print('-' * 10)
print('### Final results ###\n')
print('Best validation Acc: {:4f}'.format(best_acc))

model.load_state_dict(best_model_wts)