In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

import warnings
warnings.simplefilter('ignore')

from os import path
import sys
sys.path.append(path.abspath('..'))

In [None]:
import cv2
import numpy as np
from PIL import Image
import seaborn as sns
from timm import create_model

from src.transforms import PadResizeOCR, get_transforms
from src.dataset import PlatesCodeDataset
import jpeg4py as jpeg
import matplotlib.pyplot as plt

from typing import Tuple
from numpy.typing import NDArray
from torch import Tensor
import albumentations as albu
from torch.utils.data import DataLoader, RandomSampler

import random
import torch.multiprocessing
torch.multiprocessing.set_sharing_strategy('file_system')

In [None]:
def denormalize(
    img: NDArray[float],
    mean: Tuple[float, ...] = (0.485, 0.456, 0.406),
    std: Tuple[float, ...] = (0.229, 0.224, 0.225),
    max_value: int = 255,
) -> NDArray[np.uint8]:
    """Undo per-channel mean/std normalization and return a ``uint8`` image.

    Computes ``(img * std + mean) * max_value``, i.e. the inverse of
    ``(pixel / max_value - mean) / std``.

    Args:
        img: Normalized image whose last axis is the channel axis
            (it is broadcast against ``mean`` and ``std``).
        mean: Per-channel mean that was used for normalization.
        std: Per-channel standard deviation that was used for normalization.
        max_value: Scale applied after de-normalization.

    Returns:
        Array of the same shape as ``img`` with dtype ``uint8``.
    """
    mean_arr = np.asarray(mean, dtype=np.float32)
    std_arr = np.asarray(std, dtype=np.float32)
    restored = (img * std_arr + mean_arr) * max_value
    # Round instead of truncating (127.9999 must become 128, not 127) and
    # clip before the cast so out-of-range values saturate instead of
    # wrapping around in uint8.
    return np.clip(np.rint(restored), 0, 255).astype(np.uint8)

def tensor_to_cv_image(tensor: Tensor) -> NDArray[float]:
    """Convert a channel-first tensor into a channel-last NumPy array.

    Args:
        tensor: 3-D tensor; its axes are reordered with ``permute(1, 2, 0)``.

    Returns:
        NumPy array holding the same values with the first axis moved last.
    """
    # detach() lets tensors that require grad be converted; without it
    # Tensor.numpy() raises a RuntimeError for such inputs.
    return tensor.detach().permute(1, 2, 0).cpu().numpy()

def text_decode(
        text_vector: Tensor,
        vocabular: str
) -> str:
    """Turn a vector of label indices back into a string.

    Positive values ``k`` are mapped to ``vocabular[k - 1]``; values that are
    not greater than zero are skipped.

    Args:
        text_vector: 1-D tensor of label indices.
        vocabular: String whose characters are addressed by the indices.

    Returns:
        The decoded characters joined in order.
    """
    decoded_chars = []
    for code in text_vector.cpu().numpy():
        if code > 0:
            # Labels are shifted by one relative to positions in `vocabular`.
            decoded_chars.append(vocabular[code - 1])
    return ''.join(decoded_chars)

In [None]:
# Dataset root, given relative to the notebook's working directory.
DATA_PATH='../data/'
# Alphabet passed to `get_transforms` and `text_decode`; `text_decode` maps a
# label value k > 0 to VOCAB[k - 1] and skips values that are not positive.
VOCAB = '#&0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZÄÅÖÜĆČĐŠŽБГДЗИЛПУЦЧЭЯ'

## Check data

In [None]:
# Resolve the sample relative to DATA_PATH instead of a hard-coded absolute
# home-directory path, so the cell runs on any checkout of the project.
sample_image_path = path.join(DATA_PATH, 'dataset-plates', 'kg', 'test', '8074200.jpg')
image = jpeg.JPEG(sample_image_path).decode()
image.shape
plt.imshow(image)

### Prepare data

In [None]:
# create dataset
# Both splits are built without a `transforms` argument here.
DATA_FOLDER = '../data'

# NOTE(review): `reset_flag=True` presumably forces the dataset index to be
# rebuilt — confirm against src.dataset.PlatesCodeDataset.
train_dataset = PlatesCodeDataset(
    phase='train', 
    data_folder=DATA_FOLDER,
    reset_flag=True
)
# Blank lines to visually separate whatever the two constructors print.
print('\n\n')
valid_dataset = PlatesCodeDataset(
    phase='test', 
    data_folder=DATA_FOLDER,
    reset_flag=True
)

In [None]:
# Number of samples in each split; both values are echoed because
# `ast_node_interactivity` was set to 'all' at the top of the notebook.
len(valid_dataset)
len(train_dataset)

In [None]:
# get all image sizes
# Element 0 of every sample is the image; stack its `.shape` tuples into one
# array per split (one row per sample).
train_shapes = np.array(
    [train_dataset[idx][0].shape for idx in range(len(train_dataset))]
)
valid_shapes = np.array(
    [valid_dataset[idx][0].shape for idx in range(len(valid_dataset))]
)


### Select height

In [None]:
# check statistics and height distribution
# `sns.distplot` is deprecated; `histplot(..., kde=True, stat='density')` is
# its documented replacement. Both splits share one axes, so label them.
np.median(train_shapes[:, 0])
_ = sns.histplot(train_shapes[:, 0], kde=True, stat='density', label='train')

np.median(valid_shapes[:, 0])
_ = sns.histplot(valid_shapes[:, 0], kde=True, stat='density', color='C1', label='valid')
_ = plt.xlabel('height, px')
_ = plt.legend()

In [None]:
# check statistics and width distribution
# `sns.distplot` is deprecated; `histplot(..., kde=True, stat='density')` is
# its documented replacement. Both splits share one axes, so label them.
np.median(train_shapes[:, 1])
_ = sns.histplot(train_shapes[:, 1], kde=True, stat='density', label='train')

np.median(valid_shapes[:, 1])
_ = sns.histplot(valid_shapes[:, 1], kde=True, stat='density', color='C1', label='valid')
_ = plt.xlabel('width, px')
_ = plt.legend()

In [None]:
# check original crops
# Wrap the first 10 raw training images in PIL images.
# NOTE(review): rendering of the bare expression inside the loop appears to
# rely on `ast_node_interactivity = 'all'` set earlier — confirm.
for i in range(10):
    Image.fromarray(train_dataset[i][0])

We can see that the crop height is suitable, and we can use 64 pixels as the standard height (it must be divisible by 32 for the pretrained backbone).

In [None]:
# Rescale the first 10 training images to a height of 64 px; the same factor
# is applied to both axes, so the aspect ratio is preserved.
for i in range(10):
    image = train_dataset[i][0]
    scale = 64 / image.shape[0]
    Image.fromarray(cv2.resize(image, None, fx=scale, fy=scale))

Great, we can clearly see each character at a height of 64 pixels.

For OCR tasks, it is better to resize with the aspect ratio preserved and fill the empty pixels with 0. Let's select the width for that.

### Select width

In [None]:
# Width every image would have after rescaling its height to 64 px.
train_width = train_shapes[:, 1] * 64/train_shapes[:, 0]
valid_width = valid_shapes[:, 1] * 64/valid_shapes[:, 0]

# `sns.distplot` is deprecated; `histplot(..., kde=True, stat='density')` is
# its documented replacement. Both splits share one axes, so label them.
np.max(train_width)
_ = sns.histplot(train_width, kde=True, stat='density', label='train')

np.max(valid_width)
_ = sns.histplot(valid_width, kde=True, stat='density', color='C1', label='valid')
_ = plt.xlabel('width at height 64, px')
_ = plt.legend()

We select the width according to the maximum value plus some margin for possible future crops. The maximum here is 1109 pixels, but such values are outliers, so we can use 416 pixels (also divisible by 32). Let's check.

In [None]:
# Largest rescaled width in the training split, and the image it belongs to.
print(max(train_width))
plt.imshow(Image.fromarray(train_dataset[np.argmax(train_width)][0]))

In [None]:
# Column 1 of the collected shapes (the original, un-rescaled widths).
train_shapes[:, 1]

In [None]:
# Training images whose rescaled width exceeds 390 px.
wide_mask = train_width > 390
_selected = np.array(train_dataset.image_paths)[wide_mask]
print(min(train_width[wide_mask]))
print(min(train_shapes[:, 1][wide_mask]))
print(len(_selected))
# Slice instead of indexing range(10): fewer than 10 matches must not raise
# an IndexError.
for image_path in _selected[:10]:
    image = cv2.imread(image_path)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    # Same factor on both axes: height becomes 64 px, aspect ratio is kept.
    scale = 64 / image.shape[0]
    scaled_image = cv2.resize(image, None, fx=scale, fy=scale)
    Image.fromarray(scaled_image)

In [None]:
# Smallest rescaled width in the training split, and the image it belongs to.
print(min(train_width))
Image.fromarray(train_dataset[np.argmin(train_width)][0])

In [None]:
# Training images whose rescaled width is below 50 px.
_selected = np.array(train_dataset.image_paths)[train_width < 50]
print(len(_selected))
# Slice instead of indexing range(10): fewer than 10 matches must not raise
# an IndexError.
for image_path in _selected[:10]:
    image = cv2.imread(image_path)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    # Same factor on both axes: height becomes 64 px, aspect ratio is kept.
    scale = 64 / image.shape[0]
    scaled_image = cv2.resize(image, None, fx=scale, fy=scale)
    Image.fromarray(scaled_image)

So, after resizing to a height of 64 pixels, the boundary values for the width are [50, 390] pixels. Our assumption of 416 pixels is valid.

### Check final crops

Finally, we can add padding to the images, and we are ready to train! We can use our own *PadResizeOCR* augmentation class.

In [None]:
# Preview the first 10 training images after PadResizeOCR with a 416x64
# target.
# NOTE(review): mode='left' presumably controls where the padding/content is
# placed — confirm in src.transforms.
transform = PadResizeOCR(target_width=416, target_height=64, mode='left')

for sample_idx in range(10):
    padded = transform(image=train_dataset[sample_idx][0])['image']
    Image.fromarray(padded)

That is what we needed, we can train now.

### Backbone selection

We need to select the width per symbol in terms of the feature map (how many "gaps" we use to predict a single symbol). It is usually recommended to use 2-3 "gaps" per symbol.

Thus, we have at most 10 symbols in a plate number with a maximum width of 390 pixels: 390 / (10*3) = 13 pixels per "gap" (we can round that value if needed). But we use a width of 416 pixels, so we need no fewer than 32 "gaps" (416 / 13 = 32.0).


In [None]:
# check featuremap size after each layer
# timm ResNet-18 in `features_only` mode, returning the outputs selected by
# `out_indices`.
# NOTE(review): pretrained weights are presumably not needed just to inspect
# shapes; `pretrained=False` would likely avoid a download — confirm.
backbone = create_model(
            'resnet18',
            pretrained=True,
            features_only=True,
            out_indices=(1,2,3,4),
        )

# Forward one random tensor of shape (1, 3, 64, 416) without tracking gradients.
with torch.no_grad():
    pred = backbone(torch.rand(1, 3, 64, 416))

# Shapes of the four returned feature maps, in `out_indices` order.
pred[0].shape
pred[1].shape
pred[2].shape
pred[3].shape


We need to use features from deeper layers, according to our "gap" criterion:

* 4th layer, width 13 - too few
* 3rd layer, width 26 - we can use it, but we will probably get a worse result
* 2nd layer, width 52 - suitable (32 < 52)
* 1st layer, width 104 - too many, and features from the 1st layer carry too little information

If we need deeper layers, we can take a predefined backbone network and replace some strides from (2, 2) to (2, 1).

Great! We have selected the backbone for our OCR model. If we need to tune other parameters, we can look at the source code of *CRNN*.

## Check symbols encoding and augmentations

In [None]:
# Two equally long strings: each character of the first one is replaced by
# the character at the same position in the second one.
symbols_non_latin = (
        "АВ5СЕКХНМОРТУ0123456789ӨҮՈ",
        "AB5CEKXHMOPTY0123456789&Y#",
    )

# str.maketrans builds the {ordinal: ordinal} table expected by str.translate.
tr_non_latin = str.maketrans(*symbols_non_latin)

# Quick check: upper-case the sample first, then translate it.
"а551оу750".upper().translate(tr_non_latin)

In [None]:
# Transform pipeline for 416x64 inputs with post-processing and augmentations
# switched on.
# NOTE(review): `text_size=10` presumably is the fixed label length — confirm
# in src.transforms.get_transforms.
_train_transforms = get_transforms(
    width=416,
    height=64,
    vocab=VOCAB,
    text_size=10,
    postprocessing=True, 
    augmentations=True
)

In [None]:
DATA_FOLDER = '../data'

# Training split again, this time with the transform pipeline attached and
# `reset_flag` turned off.
dataset = PlatesCodeDataset(
    phase='train', 
    data_folder=DATA_FOLDER,
    reset_flag=False,
    transforms=_train_transforms
)

In [None]:
# Fetch one transformed sample: image tensor, encoded text, text length, region.
idx = 1000000
image, text, text_lenght, region = dataset[idx]
text_str = text_decode(text, VOCAB)
print(f'true: text = {text_str}, text_lenght = {text_lenght}, region = {region}')

# Reuse the image fetched above rather than indexing the dataset a second
# time: a second fetch repeats the loading work and, with augmentations
# enabled, may presumably return a differently augmented image than the
# sample whose label was just printed.
Image.fromarray(denormalize(tensor_to_cv_image(image)))

## Check Dataset, dataloader

In [None]:
# Training pipeline: relies on the defaults of `get_transforms` for everything
# not passed explicitly.
_train_transforms = get_transforms(
    width=416,
    height=64,
    vocab=VOCAB,
    text_size=10,
)
# Validation pipeline: same geometry and vocabulary, augmentations disabled.
_valid_transforms = get_transforms(
    width=416,
    height=64,
    vocab=VOCAB,
    text_size=10,
    augmentations=False,
)

In [None]:

# Train and test splits with their respective transform pipelines.
train_dataset = PlatesCodeDataset(
    phase='train',
    data_folder=DATA_PATH,
    reset_flag=False,
    transforms=_train_transforms,
)
valid_dataset = PlatesCodeDataset(
    phase='test',
    data_folder=DATA_PATH,
    reset_flag=False,
    transforms=_valid_transforms,
)

# num_iterations == -1 means "no sampler" (iterate over the whole dataset).
num_iterations = 100
batch_size = 16

# Always bind `train_sampler`: the DataLoader cell reads it, and leaving it
# undefined when num_iterations == -1 would raise a NameError there.
train_sampler = None
if num_iterations != -1:
    # Draw exactly num_iterations batches worth of random samples.
    train_sampler = RandomSampler(
        data_source=train_dataset,
        num_samples=num_iterations * batch_size,
    )

In [None]:
# Training loader. `sampler` and `shuffle=True` are mutually exclusive in
# DataLoader, so shuffle only when no sampler is supplied. Test with
# `is None` rather than truthiness: bool(sampler) goes through `__len__`,
# so a zero-length sampler would wrongly switch shuffling on.
train_dl = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    num_workers=10,
    sampler=train_sampler,
    shuffle=train_sampler is None,
    pin_memory=True,
)

# Validation loader: fixed order, no sampler.
valid_dl = DataLoader(
    dataset=valid_dataset,
    batch_size=batch_size,
    num_workers=10,
    shuffle=False,
    pin_memory=True,
)

In [None]:
# Test dataset
# Pick one random sample from each split, print its decoded label and show
# the de-normalized image.
for _test_dataset in [train_dataset, valid_dataset]:
    dataset_size = len(_test_dataset)
    random_i = random.randint(0, dataset_size - 1)
    print(
        f"Selected index {random_i} from {dataset_size}. Image: {_test_dataset.image_paths[random_i]}"
    )

    image, text, text_lenght, region = _test_dataset[random_i]
    text_str = text_decode(text, VOCAB)
    print(f'true: text = {text_str}, text_lenght = {text_lenght}, region = {region}')

    # Reuse the image unpacked above instead of fetching the sample again:
    # a second fetch repeats the loading work and, for the augmented split,
    # may presumably yield a different augmentation than the printed sample.
    Image.fromarray(denormalize(tensor_to_cv_image(image)))
    print()

In [None]:
# Test dataloader
# Pull the first batch from each loader, decode the first label and show the
# first image of that batch.
for _test_dataloader in [train_dl, valid_dl]:
    print('process')
    # A fresh iterator is created per loader; only its first batch is consumed.
    images, texts, text_lenghts, regions = next(iter(_test_dataloader))
    text_str = text_decode(texts[0], VOCAB)
    print(f'true: text = {text_str}, text_lenght = {text_lenghts[0]}, region = {regions[0]}')
    
    Image.fromarray(denormalize(tensor_to_cv_image(images[0])))
    print()