In [83]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
from torch.utils.tensorboard import SummaryWriter
import torchvision
from torchvision import datasets, models, transforms as T
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader

In [84]:
from tqdm.notebook import tqdm
import pathlib
import os
from PIL import Image
import string
from typing import Tuple
import datetime
import copy
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Running training on [{device}]")


Running training on [cuda]


In [85]:
BATCH_SIZE = 128
WORKERS = 1
EPOCHS = 10
MAX_WORD_LENGTH = 10
ALPHABET = string.ascii_letters + string.digits + "_" #blank char for CTC

In [86]:
class CustomDataset(Dataset):
	def __init__(self, root_path, type="train"):
		self.root_path = root_path
		self.type = type
		self.images_paths = list(pathlib.Path(self.root_path + "./images").glob('*.png'))
		self.transforms = {
			'train' : T.Compose([
				T.Resize((40,200)),
				T.RandomRotation(20),
				T.GaussianBlur(3),
				T.ToTensor()
			]),
			'valid' : T.Compose([
				T.ToTensor()
			])
		}
		global ALPHABET
		self.alphabet = ALPHABET
		self.alphabet_size = len(self.alphabet)
		print(f"Alphabet size: {self.alphabet_size}")

	def __getitem__(self, idx):
		image_path = self.images_paths[idx]
		sample_name = str(image_path).split(os.sep)[-1].split(".")[0]
		text_path = self.root_path + "/transcripts/" + str(int(sample_name) + 1) + ".txt"

		image = Image.open(image_path).convert("RGB")
		with open(text_path) as f:
			text = f.read()

		image = self.transforms[self.type](image)
		text_tensor = self.wordToTensor(text)
		return image, (text_tensor, len(text), text)
		

	def __len__(self):
		return len(self.images_paths)

	def letterToIndex(self, letter):
		return self.alphabet.find(letter)

	def letterToTensor(self, letter):
		tensor = torch.zeros(1, n_letters)
		tensor[0][letterToIndex(letter)] = 1
		return tensor

	def wordToTensor(self, word):
		# tensor = torch.zeros(MAX_WORD_LENGTH, self.alphabet_size)
		# for li, letter in enumerate(word):
		# 	tensor[li][self.letterToIndex(letter)] = 1

		tensor = torch.zeros(MAX_WORD_LENGTH)
		for li, letter in enumerate(word):
			tensor[li] = self.letterToIndex(letter)
		return tensor


In [87]:
class DataHandler:
	def __init__(self, run_config):
		self._training_dataset = None
		self._validation_dataset = None
		self._run_config = run_config

		self._load_datasets()
		
	def _load_datasets(self):
		self._training_dataset = CustomDataset("dataset/training")
		self._validation_dataset = CustomDataset("dataset/validation")

	def get_data_loaders(self) -> Tuple[DataLoader]:
		return (
			DataLoader(self._training_dataset, batch_size=self._run_config["batch_size"], shuffle=True, pin_memory=True, drop_last=True), 
			DataLoader(self._validation_dataset, batch_size=self._run_config["batch_size"], shuffle=True, pin_memory=True, drop_last=True)
		)

	def get_datasets(self) -> Tuple[Dataset]:
		return self._training_dataset, self._validation_dataset

	def get_datasets_sizes(self) -> Tuple[int]:
		return len(self._training_dataset), len(self._validation_dataset)

In [88]:
data_handler = DataHandler(run_config = {
    "batch_size": BATCH_SIZE,
    "workers": WORKERS
})
train_loader, validation_loader = data_handler.get_data_loaders()
training_dataset_size, validation_dataset_size = data_handler.get_datasets_sizes()

Alphabet size: 63
Alphabet size: 63


In [89]:
class TranscribeModel(nn.Module):
    def __init__(self):
        super(TranscribeModel, self).__init__()
        self.conv_block1 = nn.Sequential(
			nn.Conv2d(in_channels=3, out_channels=16, kernel_size=2, padding=1),
			nn.MaxPool2d(kernel_size=2),
			nn.BatchNorm2d(16),
			nn.LeakyReLU(0.2, inplace=True),

			nn.Conv2d(in_channels=16, out_channels=32, kernel_size=2, padding=1),
			nn.MaxPool2d(kernel_size=2),
			nn.BatchNorm2d(32),
			nn.LeakyReLU(0.2, inplace=True)
		)

        self.rnn_block1 = nn.Sequential(
            nn.LSTM(input_size=320, hidden_size=62, num_layers=2, batch_first=True, bidirectional=True)
        )

        self.softmax = nn.Softmax(dim=2)

    def forward(self, input):
        out = self.conv_block1(input)
        # print(out.shape) #[128, 32, 10, 50]
        out = out.permute([0, 3, 2, 1])
        # (B, S, ) 
        out = out.reshape(out.size(0), out.size(1), -1)

        out, (_, _) = self.rnn_block1(out)
        out = self.softmax(out)
        return out

In [90]:
model = TranscribeModel()
model.to(device)
optimizer = optim.AdamW(
    model.parameters(), 
    lr=0.0001, 
    betas=(0.9, 0.999), 
    eps=1e-08, 
    weight_decay=1e-4
)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=2)
loss_criterion = nn.CTCLoss(blank=len(ALPHABET)-1)
scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.001, steps_per_epoch=10, epochs=EPOCHS,anneal_strategy='linear')

log_dir = "logs/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
writer = SummaryWriter(log_dir)

TranscribeModel(
  (conv_block1): Sequential(
    (0): Conv2d(3, 16, kernel_size=(2, 2), stride=(1, 1), padding=(1, 1))
    (1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): LeakyReLU(negative_slope=0.2, inplace=True)
    (4): Conv2d(16, 32, kernel_size=(2, 2), stride=(1, 1), padding=(1, 1))
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): LeakyReLU(negative_slope=0.2, inplace=True)
  )
  (rnn_block1): Sequential(
    (0): LSTM(320, 62, num_layers=2, batch_first=True, bidirectional=True)
  )
  (softmax): Softmax(dim=2)
)

In [91]:
def tensorToWord(tensor):
	tensor = tensor[..., :62]
	indices = torch.argmax(tensor, dim=2).tolist()
	
	words = []
	for batch_idx in range(BATCH_SIZE):
		cur_word_indices = indices[batch_idx]
		cur_word = []
		last_letter = None
		for idx in range(50):
			if ALPHABET[idx] == ALPHABET[-1]:
				last_letter = None
				continue
			else:
				if last_letter == None or (last_letter is not None and last_letter != cur_word_indices[idx]):
					cur_word.append(ALPHABET[cur_word_indices[idx]])
					last_letter = cur_word_indices[idx]
					continue
		words.append("".join(cur_word))
	return words

	

In [92]:
image, (label_tensor, text_length, label_text) = iter(train_loader).next()
image, label_tensor = image.to(device), label_tensor.to(device)
output = model(image)
print(text_length)

tensor([10,  9,  5,  2,  5,  7,  4,  1,  3,  9,  9,  1,  4,  4,  5,  6,  2,  9,
         6, 10,  9,  8, 10,  8,  4,  5, 10,  2,  1,  3,  8,  8,  5,  8,  5,  3,
         9, 10,  2,  4,  5,  5,  9,  4,  1,  8,  3,  4, 10,  8,  6,  2,  7,  6,
         7,  6,  1,  2,  3,  1,  4,  7,  2,  1,  1, 10, 10,  5,  2,  8,  2,  7,
         9,  5,  8,  4,  7,  5, 10, 10,  3,  9,  4, 10,  4,  6,  6,  1,  4,  6,
         9,  7,  4,  3,  8, 10,  8,  5,  6, 10,  4,  1,  7,  9,  3,  3,  8,  6,
         1,  4,  4,  4,  7,  8,  5,  4, 10,  9,  5,  5,  5,  9,  4,  4,  6,  1,
         7, 10])


In [93]:
output.shape
out = output[..., :62]
# torch.argmax(out, dim=2)

tensorToWord(out)

torch.Size([128, 50, 124])

['1jHUlUrjw',
 'SrTrHrTrHrjTrj',
 'ArUrjNwM',
 'ATwjHr',
 'AUrFrFrFjHjHrjNw',
 'HUwUwjTr',
 'jrjrtrlHUrcrTwDZDM',
 'SHrk',
 'AwjsHr',
 'jAHwHjr',
 'AwHwjUr',
 'jwFUwTsHr',
 'TjwjwFjUFrHrTr',
 'SfwAHwrterTwM6j',
 'rHrTwM6j6w',
 'TjUwHrF',
 'AjwjKrHr',
 'STHrjrjrj',
 'hHFU6wjHr',
 'hHFjFHwMFjFSruerSFJMr',
 'ArjwM',
 'rjwM6jU6w',
 'rfwAHwrjrjwZDM6juB',
 'jwHwjwHjFjTHTr',
 '6rHrjrj',
 '6THTFArATrjHr',
 'ASrjrjwM',
 'jHFUwMsHr',
 'AjSjTKr',
 'AjwjKHr',
 'AHwjFjFjFjTr',
 '6UwUHTr',
 'AwrwrSrFrjrTwM46j',
 'GUrjw',
 'AHjHjKFUwjUjFjHrTrTrTr',
 'AjTr',
 'TwMFMTFHrHUrHur',
 'Hj6wHwjHr',
 'SHrTwM46wj',
 'rfHrTwM6j6w',
 'jHwjTHr',
 'TjU6UwMHrF',
 'SrHjHrjrjUVjUjN',
 'AUrjNwM',
 'AjwHwjHr',
 'ASrjwM46w6',
 '6SUwHwjsHr',
 'jwFUjwTsHr',
 'Arjw',
 'HUj6FwjSjrTSrMrF',
 'jHBFjFwHwqSGHrJjrSFJr',
 'GUrjNwM',
 'jFUFwFTHTHr',
 'ArArTrHrNTw',
 'jU4wjHrHrF',
 'AjwjVjVjTrHr',
 'GTqHqHAwyKTwTUHrvjr',
 'AjwjKHr',
 'THjFUFwTsHrF',
 'jAwjHsHr',
 'SrTwM46j',
 'ArjrjNwM',
 'jF6jFw4AwyTeHrvjrSrfJF',
 'ArjNwM',
 '6THr'

In [94]:
for epoch in range(EPOCHS):
	print('Epoch {}/{}'.format(epoch, EPOCHS))
	print('-' * 10)

	########### Training step ###########
	model.train()
	training_loss = []
	running_loss = 0.0
	running_corrects = 0
			
	for i, data in enumerate(tqdm(train_loader, desc=f"Epoch [{epoch + 1}] progress")):

		x_batch, (label_batch, label_length, label_text) = data
		x_batch, label_batch, label_length = x_batch.to(device), label_batch.to(device), label_length.to(device)

		optimizer.zero_grad()
		outputs = model(x_batch)
		outputs_permuted = outputs.permute((1, 0, 2))

		loss = loss_criterion(outputs_permuted, label_batch, torch.full((BATCH_SIZE,), MAX_WORD_LENGTH), label_length)

		loss.backward()
		optimizer.step()
				
		# statistics
		running_loss += loss.item() * x_batch.size(0)
		running_corrects += torch.tensor([0])
		# running_corrects += torch.sum(preds == label_text)
		training_loss.append(loss.item())

	epoch_loss = running_loss / training_dataset_size
	epoch_acc = running_corrects.double() / training_dataset_size

	# tensorboard logging
	writer.add_scalar("Loss/train", epoch_loss, epoch)

	print('Training step => Loss: {:.4f}'.format(
		epoch_loss
	))

	scheduler.step(epoch_acc)


	########### Validation step ###########
	model.eval()
	validation_loss = []
	running_loss = 0.0
	running_corrects = 0

	for i, data in enumerate(validation_loader):
		with torch.no_grad():
			x_batch, (label_batch, label_length, label_text) = data
			x_batch, label_batch, label_length = x_batch.to(device), label_batch.to(device), label_length.to(device)

			outputs = model(x_batch)
			outputs = outputs.permute((1, 0, 2))
			loss = loss_criterion(outputs, label_batch, torch.full((BATCH_SIZE,), MAX_WORD_LENGTH), label_length)

			running_loss += loss.item() * x_batch.size(0)
			running_corrects += torch.tensor([0])
			# running_corrects += torch.sum(preds == label_batch.detach())
			validation_loss.append(loss.item())
			
	epoch_loss = running_loss / validation_dataset_size
	epoch_acc = running_corrects.double() / validation_dataset_size

	# tensorboard logging
	writer.add_scalar("Loss/validation", epoch_loss, epoch)

	print('Evaluation step => Loss: {:.4f}'.format(
		epoch_loss
	))
	best_acc = 0
	#Save the best model based on accuracy
	if True:
		best_model_wts = copy.deepcopy(model.state_dict())

	#Checkpoint
	torch.save({
		"epoch": epoch,
		"model_state_dict": model.state_dict(),
		"optimizer_state_dict": optimizer.state_dict()
	}, "./checkpoints/ckp.pt")



time_elapsed = time.time() - since
print('Training complete in {:.0f}m {:.0f}s'.format(
	time_elapsed // 60, time_elapsed % 60
))
print('Best (so far) validation Acc: {:4f}'.format(best_acc))

print('-' * 10)
print('### Final results ###\n')
print('Best validation Acc: {:4f}'.format(best_acc))

model.load_state_dict(best_model_wts)

Epoch 0/10
----------


TranscribeModel(
  (conv_block1): Sequential(
    (0): Conv2d(3, 16, kernel_size=(2, 2), stride=(1, 1), padding=(1, 1))
    (1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): LeakyReLU(negative_slope=0.2, inplace=True)
    (4): Conv2d(16, 32, kernel_size=(2, 2), stride=(1, 1), padding=(1, 1))
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): LeakyReLU(negative_slope=0.2, inplace=True)
  )
  (rnn_block1): Sequential(
    (0): LSTM(320, 62, num_layers=2, batch_first=True, bidirectional=True)
  )
  (softmax): Softmax(dim=2)
)

HBox(children=(FloatProgress(value=0.0, description='Epoch [1] progress', max=48.0, style=ProgressStyle(descri…

inf
128
nan
128
nan
128
nan
128
nan
128
nan
128
nan
128
nan
128
nan
128
nan
128
nan
128
nan
128
nan
128
nan
128
nan
128
nan
128
nan
128
nan
128
nan
128
nan
128
nan
128
nan
128
nan
128
nan
128
nan
128
nan
128
nan
128
nan
128
nan
128
nan
128
nan
128
nan
128
nan
128
nan
128
nan
128
nan
128
nan
128
nan
128
nan
128
nan
128
nan
128
nan
128
nan
128
nan
128
nan
128
nan
128
nan
128
nan
128

Training step => Loss: nan


TranscribeModel(
  (conv_block1): Sequential(
    (0): Conv2d(3, 16, kernel_size=(2, 2), stride=(1, 1), padding=(1, 1))
    (1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): LeakyReLU(negative_slope=0.2, inplace=True)
    (4): Conv2d(16, 32, kernel_size=(2, 2), stride=(1, 1), padding=(1, 1))
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): LeakyReLU(negative_slope=0.2, inplace=True)
  )
  (rnn_block1): Sequential(
    (0): LSTM(320, 62, num_layers=2, batch_first=True, bidirectional=True)
  )
  (softmax): Softmax(dim=2)
)

Evaluation step => Loss: nan


FileNotFoundError: [Errno 2] No such file or directory: './checkpoints/ckp.pt'