In [6]:
!pip -q install pybboxes
!pip -q install pytorch_lightning


[notice] A new release of pip is available: 24.0 -> 24.1
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip is available: 24.0 -> 24.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [27]:
import cv2
import numpy as np
import os
import pandas as pd
import pickle
from tqdm import tqdm
import pybboxes as pybbx
from matplotlib import pyplot as plt


import torch
from torch.utils.data import Dataset
from torch import nn
from torch.nn import functional as F
from torchvision import transforms

import pytorch_lightning as pl
from torchmetrics import Accuracy


from torchvision.models import resnet

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
UCODE_DICT = 'E:/Datasets/NomDataset/HWDB1.1-bitmap64-ucode-hannom-v2-tst_seen-label-set-ucode.pkl'

# Dataset

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
%cd /content/
# !pip install kaggle
!mkdir -v ~/.kaggle

!cp -f "/content/drive/MyDrive/Thesis Resource/kaggle.json" ~/.kaggle
!kaggle datasets download -d ngcthunhb/nomdataset-crops

/content
mkdir: cannot create directory ‘/root/.kaggle’: File exists
Dataset URL: https://www.kaggle.com/datasets/ngcthunhb/nomdataset-crops
License(s): unknown
Downloading nomdataset-crops.zip to /content
 98% 178M/182M [00:01<00:00, 134MB/s]
100% 182M/182M [00:01<00:00, 106MB/s]


In [None]:
!unzip -q /content/nomdataset-crops.zip -d dataset/

## NomImageDataset - For loading raw-cropped images

In [4]:
# Dataset class for inputting YoloV5
class NomImageDataset(Dataset):
    def __init__(self, image_dir, annotation_file, unicode_dict_path, image_size=(224, 224), transform=None):
        self.root_dir = image_dir
        self.label_list = list()
        self.image_list = list()
        self.unicode_dict = dict()
        self.transform = transform
        self.image_size = image_size
        self.n_crop = 0

        with open(unicode_dict_path, 'rb') as f:
            tmp = pickle.load(f)
            tmp = sorted(list(tmp.keys()))
        for idx, k in enumerate(tmp):
            self.unicode_dict[k] = idx

        with open(annotation_file, 'r') as f:
            for line in tqdm(f):
                line = line.strip().split(',')
                image_name, label = line
                label = label.strip()
                image_path = os.path.join(self.root_dir, image_name)

                self.image_list.append(image_path)
                try:
                    self.label_list.append(self.unicode_dict[label])
                except:
                    self.label_list.append(self.unicode_dict['UNK'])
                    # print(f'Unknown label: {label}')

    def __len__(self):
        return len(self.image_list)

    def __getitem__(self, idx):
        x_image = cv2.imread(self.image_list[idx])
        y_label = self.label_list[idx]
        x_image = cv2.cvtColor(x_image, cv2.COLOR_BGR2RGB)

        if self.transform:
            x_image = self.transform(x_image)
        else:
            x_image = x_image *  1.0 / 255
            x_image = cv2.resize(x_image, self.image_size, interpolation=cv2.INTER_LANCZOS4)
            # x_image = (x_image - [0.485, 0.456, 0.406]) / [0.229, 0.224, 0.225]
            x_image = torch.from_numpy(x_image).permute(2, 0, 1).float()
        y_label = torch.tensor(y_label, dtype=torch.long)
        return x_image, y_label

# opt = dict(
#     image_dir = '../NomDataset/datasets/mono-domain-datasets/tale-of-kieu/1871/1871-raw-images',
#     annotation_file = '../TempResources/ToK1871.txt',
#     unicode_dict_path = '../NomDataset/HWDB1.1-bitmap64-ucode-hannom-v2-tst-label-set-ucode.pkl',
#     transform = None,
# )
# dataset = NomImageDataset(**opt)

# from matplotlib import pyplot as plt
# img = dataset[2][0]
# detBoxes = dataset[2][1]


# textLabel = []
# for box in detBoxes:
#     x_tl, y_tl, x_br, y_br, label = box
#     cv2.rectangle(img, (x_tl, y_tl), (x_br, y_br), (0, 255, 0), 2)
#     cv2.putText(img, label, (x_tl, y_tl), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
#     textLabel.append(chr(int(label, 16)))
# plt.imshow(img)
# plt.show()

# print(textLabel)

## NomYoloImageDataset
Yolo inference creates new crops that doesn't have labels. This class is exclusively for finding labels of such crops

In [None]:
class YoloCropDataset(Dataset):
    def __init__(self, image_file_path : str, annotation_file_path : str, label_file_path : str, unicode_dict_path : str, image_size : int | int, transform = None, scale = 1.0):
        self.image_file_path = image_file_path
        self.annotation_file_path = annotation_file_path
        self.label_file_path = label_file_path
        self.unicode_dict_path = unicode_dict_path
        self.image_size = image_size    # Target crop image size
        self.scale = scale

        self.image_files = []
        self.annotation_files = []
        self.label_files = []

        self.transform = transform
        self.load_files_list()

        self.crop_dict = {'crops': [], 'original_images_name': [], 'labels': [], 'unicode_labels': []}
        self.load_crops()

    def load_files_list(self) -> None:
        for file in os.listdir(self.image_file_path):
            if file.endswith('.jpg'):
                self.image_files.append(file)
        for file in os.listdir(self.annotation_file_path):
            if file.endswith('.txt'):
                self.annotation_files.append(file)
        assert len(self.image_files) == len(self.annotation_files), "Number of image files and annotation files do not match"

        for file in os.listdir(self.label_file_path):
            if file.endswith('.xlsx'):
                self.label_files.append(file)
        assert len(self.image_files) == len(self.label_files), f"Number of image files and label files do not match. {len(self.image_files)} != {len(self.label_files)}"


    def load_crops(self) -> None:
        def find_best_IOU(ref_box, boxes) -> float | tuple | int:
            def calculate_IOU(box1, box2):
                x1, y1, x2, y2 = box1
                x3, y3, x4, y4 = box2
                x5, y5 = max(x1, x3), max(y1, y3)
                x6, y6 = min(x2, x4), min(y2, y4)
                intersection = max(0, x6 - x5) * max(0, y6 - y5)
                area1 = (x2 - x1) * (y2 - y1)
                area2 = (x4 - x3) * (y4 - y3)
                union = area1 + area2 - intersection
                return intersection / union

            best_iou = 0
            best_box = None
            best_index = -1
            for index, box in enumerate(boxes, 0):
                iou = calculate_IOU(ref_box, box)
                if iou > best_iou:
                    best_iou = iou
                    best_box = box
                    best_index = index
            return best_iou, best_box, best_index


        # Label dictionary
        with open(self.unicode_dict_path, 'rb') as f:
            unicode_labels = pickle.load(f)
        for i, (k, v) in enumerate(unicode_labels.items()):
            unicode_labels[k] = i

        # For reading yolo txt files
        total_n = len(self.image_files)
        for image_file, txt_file, excel_file in tqdm(zip(self.image_files, self.annotation_files, self.label_files)):
            image = cv2.cvtColor(cv2.imread(os.path.join(self.image_file_path, image_file)), cv2.COLOR_BGR2RGB)    # Grayscale, so I can stack 3 channels later
            h, w, _ = image.shape
            df = pd.read_excel(os.path.join(self.label_file_path, excel_file))

            label_dict = {'boxes': [], 'labels': []}
            for _, row in df.iterrows():
                x1, y1, x2, y2 = row['LEFT'], row['TOP'], row['RIGHT'], row['BOTTOM']
                label = row['UNICODE']

                x1, y1, x2, y2 = x1 // self.scale, y1 // self.scale, x2 // self.scale, y2 // self.scale

                label_dict['boxes'].append((x1, y1, x2, y2))
                label_dict['labels'].append(label)

            with open(os.path.join(self.annotation_file_path, txt_file), 'r') as f:
                lines = f.readlines()
                for line in lines:
                    _, x, y, b_w, b_h = map(float, line.split(' '))
                    bbox = pybbx.YoloBoundingBox(x, y, b_w, b_h, image_size=(w, h)).to_voc(return_values=True)
                    x1, y1, x2, y2 = bbox

                    # Find the best IOU to label the cropped image
                    iou, box, idx = find_best_IOU(bbox, label_dict['boxes'])

                    crop_img = image[int(y1):int(y2), int(x1):int(x2)]

                    self.crop_dict['crops'].append(crop_img)

                    try:
                        label = unicode_labels[label_dict['labels'][idx]]
                    except:
                        label = unicode_labels['UNK']
                    self.crop_dict['labels'].append(label)

        assert len(self.crop_dict['crops']) == len(self.crop_dict['labels']), "Number of crops and labels do not match"

    def __len__(self) -> int:
        return len(self.crop_dict['crops'])

    def __getitem__(self, index: int) -> torch.Tensor | torch.Tensor:
        assert index <= len(self), "Index out of range"

        image = self.crop_dict['crops'][index]
        label = self.crop_dict['labels'][index]


        if self.transform:
            image = self.transform(image)
        else:
            # Resize the image to 224x224
            image = cv2.resize(image, self.image_size, interpolation=cv2.INTER_LANCZOS4)
            image = image *  1.0 / 255

            # TODO: This is the mean and std of ImageNet dataset, need to change to the mean and std of the dataset
            mean = [0.485, 0.456, 0.406]
            std = [0.229, 0.224, 0.225]

            # mean = [0.799, 0.818, 0.829]
            # std = [0.183, 0.179, 0.179]

            image = (image - mean) / std
            image = torch.from_numpy(image).permute(2, 0, 1).float()
        label = torch.tensor(label, dtype=torch.long)


        return image, label

# opt = dict(
#     image_file_path = '../NomDataset/datasets/mono-domain-datasets/tale-of-kieu/1871/1871-raw-images',
#     annotation_file_path = YOLO_ANNOTATION,
#     label_file_path = '../NomDataset/datasets/mono-domain-datasets/tale-of-kieu/1871/1871-annotation/annotation-mynom',
#     unicode_dict_path = '../NomDataset/HWDB1.1-bitmap64-ucode-hannom-v2-tst-label-set-ucode.pkl',
#     image_size = (224, 224),
#     transform = None,
# )

# dataset = YoloCropDataset(**opt)

In [None]:
# img = dataset[3][0].permute(1, 2, 0).numpy()
# label = dataset[3][1].item()
# from matplotlib import pyplot as plt
# cv2.imwrite('test.jpg', img * 255)

# new_unicode_dict = dict()
# with open('../NomDataset/HWDB1.1-bitmap64-ucode-hannom-v2-tst-label-set-ucode.pkl', 'rb') as f:
#     unicode_dict = pickle.load(f)
# for idx, (k, v) in enumerate(unicode_dict.items()):
#     new_unicode_dict[idx] = k
# print(new_unicode_dict[label])
# print(chr(int(new_unicode_dict[label], 16)))


# Architectures

## Recognizer : Nom_Resnet101

In [8]:
class Nom_Resnet101(nn.Module):
    def __init__(self, n_classes, pretrained=True):
        super(Nom_Resnet101, self).__init__()
        self.model = resnet.resnet101(weights=resnet.ResNet101_Weights.DEFAULT)

        # Modify the last layer
        self.model.fc = nn.Linear(self.model.fc.in_features, n_classes)

    def forward(self, x):
        return self.model(x)

class PytorchResNet101(pl.LightningModule):
    def __init__(self, num_labels):
        super(PytorchResNet101, self).__init__()
        self.save_hyperparameters()
        self.num_labels = num_labels

        # Get ResNet architecture and remove the last FC layer
        backbone = resnet.resnet101(weights=resnet.ResNet101_Weights.DEFAULT)
        num_filters = backbone.fc.in_features
        layers = list(backbone.children())[:-1]

        # Initialize layers
        self.feature_extractor = nn.Sequential(*layers)
        self.flatten = nn.Flatten()
        self.classifier = nn.Linear(num_filters, self.num_labels)

        self.criterion = nn.CrossEntropyLoss()
        self.metrics = Accuracy(task="multiclass", num_classes=self.num_labels)

        self.training_step_outputs = []
        self.validation_step_outputs = []
        self.test_step_outputs = []

    def forward(self, x):
        x = self.feature_extractor(x)
        x = self.flatten(x)
        x = self.classifier(x)
        return x

# Training Loop

In [None]:
# Blank

# Testing

In [None]:
# Seperate cell because Dataset loading is slow
dataset = YoloCropDataset(
    image_file_path = '/content/dataset/Nomdataset/datasets/mono-domain-datasets/luc-van-tien/lvt-raw-images',
    annotation_file_path = YOLO_ANNOTATION,
    label_file_path = '/content/dataset/Nomdataset/datasets/mono-domain-datasets/luc-van-tien/lvt-annotation/annotation-mynom',
    unicode_dict_path = UCODE_DICT,
    image_size = (32, 32),
    transform = None,
)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=True)

105it [00:11,  8.94it/s]


In [13]:
unicode_dict = dict()
with open(UCODE_DICT, 'rb') as f:
    temp = pickle.load(f)
for idx, (k, v) in enumerate(temp.items()):
    unicode_dict[idx] = k

# # Load the SR model
# sr_model = RRDBNet(num_in_ch=3, num_out_ch=3, scale=4, num_feat=64, num_block=23, num_grow_ch=32)
# sr_model.load_state_dict(torch.load('/content/drive/MyDrive/Resource/BasicSR/PretrainedModels/Real-ESRGAN/RealESRGAN_x4plus.pth')['params_ema'])
# sr_model.eval()

# Load the recognizer model
recognizer_model = Nom_Resnet101(n_classes=len(unicode_dict.keys()))
recognizer_model.model.load_state_dict(torch.load('E:/Github/Thesis/Backup/pretrained_model/NomResnet101.pth'))
recognizer_model.eval()

# from torchsummary import summary
# summary(sr_model, (3, 56, 56), device='cpu')
# summary(recognizer_model, (3, 224, 224), device='cpu')

Nom_Resnet101(
  (model): ResNet(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): Bottleneck(
        (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (downsample): Sequential(
      

In [37]:
# sr_model.to('cpu')
recognizer_model.to('cpu')

DATASET_NAME = 'LVT'
dataset = NomImageDataset(
    image_dir = f'E:/Datasets/TempResources/LVT/LVT_SR_raw_crops/003_realSR_BSRGAN_DFO_s64w8_SwinIR-M_x4_GAN',
    annotation_file = f'E:/Datasets/TempResources/LVT/LVT_crops.txt',
    unicode_dict_path = UCODE_DICT,
    # scale=SCALE,
    image_size=(32, 32),
    transform = None,
)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=False, num_workers=0)

label_dict = dict()
with open(UCODE_DICT, 'rb') as f:
    tmp = pickle.load(f)
for idx, (k, v) in enumerate(tmp.items()):
    label_dict[idx] = k

# sample = next(iter(dataloader))
# plt.figure(figsize=(16, 16))
# for i in range(4):
#     plt.subplot(4, 4, i+1)
#     img = sample[0][i].permute(1, 2, 0).numpy() * 255
#     plt.imshow(sample[0][i].permute(1, 2, 0).numpy())

#     grad_mask = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
#     thres = np.array(grad_mask).mean()
#     grad_mask = cv2.threshold(grad_mask, thres, 255, cv2.THRESH_BINARY)[1] / 255.0
#     grad_mask = torch.tensor(grad_mask).unsqueeze(0)
#     img_tensor = torch.cat((sample[0][i], grad_mask), dim=0).unsqueeze(0)

#     sr_img = sr_model(img_tensor).squeeze().detach().cpu().permute(1, 2, 0).numpy()[:, :, :3]
#     print(sr_img.shape)
#     # Take first 3 channel
#     plt.subplot(4, 4, i+5)
#     plt.imshow(sr_img)

#     # plt.title(sample[1][i].item())
#     try:
#         print(chr(int(label_dict[sample[1][i].item()], 16)), end=' ')
#     except:
#         print('UNK', end=' ')
# plt.show()




14450it [00:00, 197099.47it/s]


## Test on Raw images

In [38]:
torch.cuda.empty_cache()
# sr_model.to(DEVICE)
recognizer_model.to(DEVICE)

pbar = tqdm(total=len(dataloader), desc='Testing')

correct_pred = 0
incorrect_pred = []
for idx, (imgs, labels) in enumerate(dataloader, 1):
    imgs = imgs.to(DEVICE)
    labels = labels.to(DEVICE)

    with torch.no_grad():
        # Essentially normal Resnet operation is inference on bicubic upscaled images
        bicubic_imgs = F.interpolate(imgs, size=(224, 224), mode='bicubic')
        bicubic_imgs = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])(bicubic_imgs)


        preds = recognizer_model(bicubic_imgs)
        preds = F.softmax(preds, dim=1)
        preds = torch.argmax(preds, dim=1)

        correct_pred += torch.sum(preds == labels).item()
        # Record failure cases
        for i, (pred, label) in enumerate(zip(preds, labels)):
            if pred != label:
                incorrect_pred.append((f'{idx}_{i}', pred, label))
        pbar.update(1)

pbar.close()
print("\nAccuracy:", correct_pred / len(dataset))



Testing:   0%|                                                                                                                                   | 0/452 [00:00<?, ?it/s][A[A

Testing:   0%|▎                                                                                                                          | 1/452 [00:00<04:17,  1.75it/s][A[A

Testing:   0%|▌                                                                                                                          | 2/452 [00:00<03:05,  2.42it/s][A[A

Testing:   1%|▊                                                                                                                          | 3/452 [00:01<02:42,  2.76it/s][A[A

Testing:   1%|█                                                                                                                          | 4/452 [00:01<02:40,  2.78it/s][A[A

Testing:   1%|█▎                                                                                                 


Accuracy: 0.5031141868512111





## Test on SR images

In [None]:
torch.cuda.empty_cache()
sr_model.to(DEVICE)
recognizer_model.to(DEVICE)

pbar = tqdm(total=len(dataloader), desc='Testing')

correct_pred = 0
incorrect_pred = []
for idx, (imgs, labels) in enumerate(dataloader, 1):
    imgs = imgs.to(DEVICE)
    labels = labels.to(DEVICE)

    with torch.no_grad():

        grayscale = imgs.mean(dim=1, keepdim=True)
        threshold_value = grayscale.mean()
        threshold_tensor = (grayscale > threshold_value).float()
        imgs = torch.cat([imgs, threshold_tensor], dim=1)
        # print(imgs.shape)

        sr_imgs = sr_model(imgs)

        # Remove mask channel
        sr_imgs = sr_imgs[:, :3, :, :]
        # print(sr_imgs.shape)



        # Interpolate the image to 224x224
        sr_imgs = F.interpolate(sr_imgs, size=(224, 224), mode='bicubic')
        # print(sr_imgs.shape)
        sr_imgs = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])(sr_imgs)


        preds = recognizer_model(sr_imgs)
        preds = F.softmax(preds, dim=1)
        preds = torch.argmax(preds, dim=1)

        correct_pred += torch.sum(preds == labels).item()
        # Record failure cases
        for i, (pred, label) in enumerate(zip(preds, labels)):
            if pred != label:
                incorrect_pred.append((f'{idx}_{i}', pred, label))
        pbar.update(1)

pbar.close()
print("\nAccuracy:", correct_pred / len(dataset))
