In [305]:
import os
from pathlib import Path
import cv2
import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from torchvision import transforms as tr
from tqdm import tqdm
from PIL import Image
from lightning.pytorch.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import EarlyStopping
from torch.utils.data import Dataset, DataLoader

## Обработаем данные

In [306]:
# Both splits live next to each other under one dataset folder, two levels above the
# notebook's working directory.
DATASET_ROOT = Path().absolute().parent.parent / "CourseOCRTask3"
TRAIN_DATASET_PATH = DATASET_ROOT / "Train"
TEST_DATASET_PATH = DATASET_ROOT / "Test"

In [307]:
# The markup file is UTF-16 encoded and has no header row, so columns get integer labels.
markup_path = TRAIN_DATASET_PATH / 'markup.csv'
df = pd.read_csv(markup_path, header=None, encoding='utf-16')

# Fixed seed keeps the 80/20 train/validation split reproducible between runs.
train_df, val_df = train_test_split(df, random_state=42, test_size=0.2)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,fc272e4092ccb147a31e66bc1d10fbfb.png,3601514743345,538,248,1590,225,1597,663,532,676,1010101111000110101100110111001011001101000110...
1,8cde8cb1b54bca37c9347baf74157d22.png,7000001062307,372,433,360,1289,124,1293,147,431,1010001101010011100011010100111000110101100110...
2,70a9be0887eae17eabd70c8e607f963f.png,2250001407126,408,230,1224,229,1224,688,414,683,1010010011011000101001110100111000110101100110...
3,a029e49dff95a15d2cd396d8f62220c8.png,10101427156,335,149,1005,155,1005,438,335,447,1010001101001100100011010011001000110100110010...
4,82ff01909b6d215170dfc26c7be43074.png,2400745402226,469,237,1388,260,1392,712,464,696,1010100011000110101001110010001010001101110010...
...,...,...,...,...,...,...,...,...,...,...,...
8194,c532771808e76e333ba9e959853886f3.png,4823077510083,86,54,622,53,622,202,87,202,1010110111001101101111010001101001000100100010...
8195,186338ad8f3ff17b6082c023f24563f0.png,4823077505386,109,44,664,55,668,206,107,192,1010110111001101101111010001101001000100100010...
8196,5bac8f6c9bd5e99f92eb396442974c4e.png,4823077505492,121,27,704,13,704,168,120,187,1010110111001101101111010001101001000100100010...
8197,6247db3e424148a859ebf2180f1dcb6f.png,2010000019696,158,81,165,518,62,521,55,82,1010001101001100101001110100111000110101001110...


In [308]:
# Spatial size (height, width) every image and mask is resized to before reaching the model.
IMAGE_SIZE = (128, 128)

# Convert an HxWxC array to a CxHxW tensor first, then resize the tensor.
transform = tr.Compose([tr.ToTensor(), tr.Resize(IMAGE_SIZE)])

In [309]:
class BarcodeDataset(Dataset):
    """In-memory dataset of (image, barcode mask) tensor pairs.

    Every markup row is loaded eagerly in ``__init__``: the image is read from
    ``path / 'Images' / <file name>``, a binary mask is rasterised from the polygon
    stored in the row, and both are passed through the module-level ``transform``.
    Rows that cannot be processed are reported and skipped, so ``len(dataset)`` may be
    smaller than ``len(df)``.

    Args:
        df: Markup frame. Column 0 holds the image file name; columns 2..9 hold eight
            numbers that are interpreted as four consecutive (x, y) polygon vertices.
        path: Dataset directory that contains the ``Images`` sub-directory.
    """

    def __init__(self, df, path):
        self.items = []
        # itertuples() is a generator, so pass the total explicitly to get a real
        # progress bar with an ETA.
        for item in tqdm(df.itertuples(index=False), total=len(df)):
            try:
                # Close the file handle deterministically and force three channels so
                # grayscale / RGBA / palette images match the 3-channel model input.
                with Image.open(path / 'Images' / item[0]) as image:
                    x = np.array(image.convert('RGB')) / 255.
                # OpenCV drawing functions require int32 points; the default integer
                # dtype NumPy picks (int64 on most platforms) is rejected.
                corners = np.asarray(item[2:10], dtype=np.int32).reshape(-1, 2)
                y = cv2.fillConvexPoly(np.zeros(x.shape[:2]), corners, 1)
                # Add a trailing channel axis so ToTensor sees an HxWx1 array.
                y = np.expand_dims(y, axis=-1)
                self.items.append((transform(x).float(), transform(y).float()))
            except Exception as exc:
                # Deliberately best-effort: one unreadable image or bad markup row must
                # not abort loading of the whole dataset.
                print(exc)

    def __len__(self):
        """Return the number of successfully loaded samples."""
        return len(self.items)

    def __getitem__(self, idx):
        """Return the ``(image, mask)`` tensor pair stored at position ``idx``."""
        x, y = self.items[idx]
        return x, y


In [310]:
# Both splits read their images from the training directory; only the markup rows differ.
train_dataset = BarcodeDataset(df=train_df, path=TRAIN_DATASET_PATH)
valid_dataset = BarcodeDataset(df=val_df, path=TRAIN_DATASET_PATH)

BATCH_SIZE = 4

# Training batches are shuffled and the last incomplete batch is dropped;
# validation keeps a fixed order and every sample.
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    drop_last=True,
)
val_loader = DataLoader(
    valid_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
)

3it [00:00, 25.19it/s]

3510it [09:01,  3.88it/s]

[Errno 2] No such file or directory: 'C:\\Users\\bakyt\\ml\\CourseOCRTask3\\Train\\Images\\cd593cabcf1886a3cc0126491be6c4bc.png'


6559it [16:33,  6.60it/s]
1640it [03:57,  6.90it/s]


## Воспользуемся UNet для сегментации изображений

In [311]:
class UNet(nn.Module):
    """U-Net style encoder/decoder for dense per-pixel prediction.

    The network has four down-sampling stages (64, 128, 256, 512 channels), a
    1024-channel bottleneck and four up-sampling stages. Each up-sampling stage
    concatenates the matching encoder feature map (skip connection) before its
    convolution block. ``forward`` returns the raw output of the final 1x1 convolution;
    no activation is applied inside the network.

    Because the input is halved four times and the skip connections are concatenated
    without cropping, the input height and width need to be divisible by 16.

    Args:
        in_channels: Number of channels in the input image.
        out_channels: Number of channels in the predicted map.
    """

    def conv_block(self, in_channels, out_channels):
        """Build two consecutive (3x3 conv -> batch norm -> ReLU) stages."""
        layers = [
            nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
        ]
        return nn.Sequential(*layers)

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels

        # Contracting path: convolution block followed by 2x2 max pooling.
        self.encoder1 = self.conv_block(in_channels, 64)
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.encoder2 = self.conv_block(64, 128)
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.encoder3 = self.conv_block(128, 256)
        self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.encoder4 = self.conv_block(256, 512)
        self.pool4 = nn.MaxPool2d(kernel_size=2, stride=2)

        self.bottleneck = self.conv_block(512, 1024)

        # Expanding path: the transposed convolution halves the channels and doubles the
        # resolution; the decoder block then consumes [skip, upsampled], i.e. twice the
        # channels the transposed convolution produced.
        self.upconv4 = nn.ConvTranspose2d(1024, 512, kernel_size=2, stride=2)
        self.decoder4 = self.conv_block(1024, 512)
        self.upconv3 = nn.ConvTranspose2d(512, 256, kernel_size=2, stride=2)
        self.decoder3 = self.conv_block(512, 256)
        self.upconv2 = nn.ConvTranspose2d(256, 128, kernel_size=2, stride=2)
        self.decoder2 = self.conv_block(256, 128)
        self.upconv1 = nn.ConvTranspose2d(128, 64, kernel_size=2, stride=2)
        self.decoder1 = self.conv_block(128, 64)

        # 1x1 convolution that maps the 64 decoder features to the requested outputs.
        self.outconv = nn.Conv2d(64, out_channels, kernel_size=1)

    def forward(self, x):
        """Run the network on a batch ``x`` and return the un-activated prediction map."""
        # Encoder: keep every stage's output for the skip connections.
        skip1 = self.encoder1(x)
        skip2 = self.encoder2(self.pool1(skip1))
        skip3 = self.encoder3(self.pool2(skip2))
        skip4 = self.encoder4(self.pool3(skip3))

        features = self.bottleneck(self.pool4(skip4))

        # Decoder: upsample, prepend the matching skip along the channel axis, refine.
        features = self.decoder4(torch.cat((skip4, self.upconv4(features)), dim=1))
        features = self.decoder3(torch.cat((skip3, self.upconv3(features)), dim=1))
        features = self.decoder2(torch.cat((skip2, self.upconv2(features)), dim=1))
        features = self.decoder1(torch.cat((skip1, self.upconv1(features)), dim=1))

        return self.outconv(features)

class DiceLoss(nn.Module):
    """Soft Dice loss computed over all elements of the batch at once.

    ``loss = 1 - (2 * sum(y_true * y_pred) + smooth) / (sum(y_true) + sum(y_pred) + smooth)``

    Args:
        smooth: Constant added to numerator and denominator; it keeps the ratio defined
            when both prediction and target sum to zero.
    """

    def __init__(self, smooth=1.):
        super().__init__()
        self.smooth = smooth

    def forward(self, y_pred, y_true):
        """Return the scalar Dice loss between ``y_pred`` and ``y_true``.

        Args:
            y_pred: Predicted values; the training code in this notebook passes
                sigmoid probabilities.
            y_true: Target tensor with the same number of elements as ``y_pred``.
        """
        # reshape() instead of view(): view() raises on non-contiguous tensors (for
        # example after a transpose or permute), reshape() copies only when it has to.
        y_pred_flat = y_pred.reshape(-1)
        y_true_flat = y_true.reshape(-1)
        intersection = torch.sum(y_true_flat * y_pred_flat)
        dice_coefficient = (2. * intersection + self.smooth) / (
            torch.sum(y_true_flat) + torch.sum(y_pred_flat) + self.smooth
        )
        return 1 - dice_coefficient

# Use the GPU when one is available, but fall back to the CPU instead of crashing so
# the notebook still runs end to end on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Three input channels (RGB image), one output channel (barcode mask logits).
unet_model = UNet(in_channels=3, out_channels=1).to(device)

In [312]:
# Register the TensorBoard notebook extension so the `%tensorboard` magic can be used below.
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [313]:
class LitCornerIdentifier(pl.LightningModule):
    """Lightning wrapper that trains ``model`` with a caller-supplied loss and optimizer.

    The wrapped model is expected to return raw logits: both the training and the
    validation step apply a sigmoid before handing the result to ``loss_fn``.

    Args:
        model: Network that maps an input batch to logits.
        optimizer: Ready-made optimizer, returned unchanged from ``configure_optimizers``.
        loss_fn: Callable invoked as ``loss_fn(probabilities, targets)``.
    """

    def __init__(self, model, optimizer, loss_fn):
        super().__init__()
        self.model = model
        self.optimizer = optimizer
        self.loss_fn = loss_fn

    def _batch_loss(self, batch):
        """Compute the loss for one ``(inputs, targets)`` batch."""
        images, masks = batch
        probabilities = torch.sigmoid(self.model(images))
        return self.loss_fn(probabilities, masks)

    def forward(self, x):
        """Return the wrapped model's raw output for ``x``."""
        return self.model(x)

    def training_step(self, batch, batch_idx):
        """Log the batch loss as ``train_loss`` and return it for back-propagation."""
        train_loss = self._batch_loss(batch)
        self.log('train_loss', train_loss)
        return train_loss

    def validation_step(self, batch, batch_idx):
        """Log the batch loss as ``val_loss``; nothing is returned."""
        self.log('val_loss', self._batch_loss(batch))

    def configure_optimizers(self):
        """Hand Lightning the optimizer supplied at construction time."""
        return self.optimizer

def train(model, optim, loss_fn, epochs, train_loader, val_loader):
    """Fit ``model`` with PyTorch Lightning, early stopping and TensorBoard logging.

    The model is trained in place; the function returns nothing.

    Args:
        model: Network to train.
        optim: Optimizer already bound to the model's parameters.
        loss_fn: Loss callable forwarded to ``LitCornerIdentifier``.
        epochs: Upper bound on the number of training epochs.
        train_loader: DataLoader that yields training batches.
        val_loader: DataLoader that yields validation batches.
    """
    lit_module = LitCornerIdentifier(model, optim, loss_fn)

    # Runs are written under ./tb_logs/Unet/.
    tb_logger = TensorBoardLogger("tb_logs", name="Unet")

    # Stop once `val_loss` has failed to decrease for two consecutive validation runs.
    stop_on_plateau = EarlyStopping(monitor='val_loss', mode='min', patience=2)

    trainer = pl.Trainer(
        max_epochs=epochs,
        logger=tb_logger,
        callbacks=[stop_on_plateau],
    )
    trainer.fit(lit_module, train_loader, val_loader)


In [314]:
# Train for at most 10 epochs with Adam (library default hyper-parameters) on the Dice loss.
train(
    model=unet_model,
    optim=torch.optim.Adam(unet_model.parameters()),
    loss_fn=DiceLoss(),
    epochs=10,
    train_loader=train_loader,
    val_loader=val_loader,
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]



  | Name    | Type     | Params
-------------------------------------
0 | model   | UNet     | 31.0 M
1 | loss_fn | DiceLoss | 0     
-------------------------------------
31.0 M    Trainable params
0         Non-trainable params
31.0 M    Total params
124.174   Total estimated model params size (MB)


                                                                           

c:\Users\bakyt\AppData\Local\Programs\Python\Python312\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
c:\Users\bakyt\AppData\Local\Programs\Python\Python312\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Epoch 9: 100%|██████████| 1639/1639 [02:33<00:00, 10.66it/s, v_num=0]

`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: 100%|██████████| 1639/1639 [02:36<00:00, 10.48it/s, v_num=0]


In [315]:
# Launch TensorBoard on `tb_logs`, the directory the TensorBoardLogger in `train` writes to.
%tensorboard --logdir tb_logs

Reusing TensorBoard on port 6006 (pid 17316), started 3:39:22 ago. (Use '!kill 17316' to kill it.)

## Протестируем

In [316]:
def corners_frame(file, prediction):
    """Turn a predicted logit map into a one-row answer frame with four box corners.

    The logits are passed through a sigmoid and thresholded at 0.5; the axis-aligned
    bounding rectangle of the largest external contour of the resulting mask is reported.

    Args:
        file: Image file name, written to the first column.
        prediction: 2-D tensor of logits at the resolution the corners should refer to.

    Returns:
        A single-row ``DataFrame``: file name, ``'-'``, the corners in the order
        bottom-left, top-left, top-right, bottom-right (x then y each), and ``'-'``.
        When the mask contains no foreground at all, the whole frame is reported so
        that every image still gets a valid, non-degenerate row.
    """
    mask = (torch.sigmoid(prediction).detach().cpu().numpy() > 0.5).astype(np.uint8)
    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if len(contours) == 0:
        # max() over an empty sequence raises ValueError; fall back to the full frame.
        height, width = mask.shape[:2]
        x, y, w, h = 0, 0, width, height
    else:
        largest_contour = max(contours, key=cv2.contourArea)
        x, y, w, h = cv2.boundingRect(largest_contour)
    return pd.DataFrame([[file, '-', x, y + h, x, y, x + w, y, x + w, y + h, '-']])

def load_image(file):
    """Load a test image as a float RGB array scaled to [0, 1].

    Args:
        file: File name inside ``TEST_DATASET_PATH / 'Images'``.

    Returns:
        An ``H x W x 3`` float array, or ``None`` when the file cannot be read as an
        image (the error is printed so the caller can simply skip the file).
    """
    try:
        # The context manager closes the file handle; convert('RGB') guarantees three
        # channels for grayscale, palette and RGBA inputs, matching the model input.
        with Image.open(TEST_DATASET_PATH / 'Images' / file) as image:
            return np.array(image.convert('RGB')) / 255.
    except Exception as exc:
        # Deliberately best-effort: non-image files in the directory are reported and skipped.
        print(exc)
        return None


# Inference has to run in eval mode: otherwise BatchNorm normalises with the statistics
# of the single test image and keeps updating its running averages.
unet_model.eval()
# Feed inputs on whatever device the model currently lives on.
model_device = next(unet_model.parameters()).device

test_rows = []
# No gradients are needed for prediction; skipping autograd saves memory and time.
with torch.no_grad():
    for file in tqdm(os.listdir(TEST_DATASET_PATH / 'Images')):
        img = load_image(file)
        if img is None:
            # Unreadable / non-image entry, already reported by load_image.
            continue
        batch = transform(img).unsqueeze(0).float().to(model_device)
        output = unet_model(batch)
        # Scale the logits back to the original resolution so that the reported corners
        # are in source-image pixel coordinates.
        output = tr.Resize(img.shape[:2])(output).squeeze()
        test_rows.append(corners_frame(file, output))

# pd.concat raises on an empty list, so produce an empty frame when nothing was loaded.
test_result = pd.concat(test_rows, ignore_index=True) if test_rows else pd.DataFrame()


  0%|          | 0/101 [00:00<?, ?it/s]

 85%|████████▌ | 86/101 [00:33<00:05,  2.77it/s]

cannot identify image file 'C:\\Users\\bakyt\\ml\\CourseOCRTask3\\Test\\Images\\desktop.ini'


100%|██████████| 101/101 [00:38<00:00,  2.61it/s]


In [317]:
# Write the predictions as a bare UTF-16 CSV: no header row and no index column.
test_result.to_csv(
    'answer.csv',
    index=False,
    header=False,
    encoding='utf-16',
)

In [318]:
# Run the course-provided evaluation entry point.
# NOTE(review): judging by its printed output it compares answer.csv with markup.csv
# in the task directory — confirm against the evaluate module.
from course_intro_ocr_t3.evaluate import main

main()

Checking answer (c:\Users\bakyt\ml\course_intro_ocr\task3\answer.csv) against markup(c:\Users\bakyt\ml\course_intro_ocr\task3\markup.csv)
recognition_accuracy=0.0
detection_result=0.99
score=0.99
