In [1]:
# General Idea:
# /content/gdrive/My Drive/Colab Notebooks/

# Plan
# 1. Add Background to Image dependent on image size (choose from x different backgrounds)
# 2. Add some more distortions
# 3. Rotate and shear image
# 4. Rescale images to fixed size.
# 5. Write as generator
# 6. Write CNN which predicts 2 angles

# Augmentation

In [2]:
from PIL import Image
import random
import numpy as np
import cv2
import os

from imgaug import augmenters as iaa
import imgaug as ia

In [3]:
# Target length (in pixels) that add_background_img scales the foreground's
# longer side to.
max_foreground_size = 300
# Ratio between the square background's side length and max_foreground_size;
# augment_image also uses it to decide how much of the border to crop away.
scaler = 3

In [4]:
def add_background_img(foreground, background, max_foreground_size, scaler):
    """Paste a resized foreground image onto the centre of a square background.

    The foreground is scaled, keeping its aspect ratio, so that its longer
    side is about ``max_foreground_size`` pixels. The background is resized to
    a square with a side of about ``scaler * max_foreground_size`` pixels and
    the foreground is pasted into its centre.

    Args:
        foreground: PIL image to place in the centre.
        background: PIL image used as backdrop. The passed object is left
            untouched; a resized copy receives the foreground.
        max_foreground_size: Target length in pixels of the foreground's
            longer side.
        scaler: Ratio between the background side and ``max_foreground_size``.

    Returns:
        The resized square background (a new PIL image) with the foreground
        pasted in the middle.
    """
    width, height = foreground.size

    # Scale relative to the longer side (height is used when both are equal).
    longest_side = max(width, height)
    percentage = max_foreground_size / longest_side
    max_size = int(longest_side * scaler * percentage)

    # Image.ANTIALIAS was removed in Pillow 10; Image.LANCZOS is the same
    # filter under its current name and also exists in older Pillow releases.
    foreground = foreground.resize(
        (int(width * percentage), int(height * percentage)), Image.LANCZOS)
    background = background.resize((max_size, max_size), Image.LANCZOS)

    # Equal margins on opposite sides centre the foreground.
    margin_w = int((background.size[0] - foreground.size[0]) / 2)
    margin_h = int((background.size[1] - foreground.size[1]) / 2)
    background.paste(foreground, (margin_w, margin_h))

    return background

In [5]:
def augment_image(foreground, background, max_foreground_size, scaler):
    """Create one randomly distorted sample from a foreground/background pair.

    The foreground is composed onto the background, motion-blurred, rotated,
    sheared and finally cropped around the centre by a random amount.

    Args:
        foreground: PIL image of the document.
        background: PIL image used as backdrop.
        max_foreground_size: Forwarded to add_background_img.
        scaler: Forwarded to add_background_img; also the base value for the
            random crop factor.

    Returns:
        Tuple ``(cropped_img, rotation, shear)``: the augmented PIL image and
        the rotation / shear values that were handed to imgaug's Affine.
    """
    # Compose the scene and hand the pixels over in BGR channel order.
    composed = add_background_img(foreground, background, max_foreground_size, scaler)
    bgr_pixels = np.array(composed.convert('RGB'))[:, :, ::-1].copy()

    # Draw the two regression targets: rotation first, shear second.
    rotation = random.uniform(-85, 85)
    shear = random.uniform(-10, 10)

    # Build the augmenters in application order: blur, rotate, shear.
    pipeline = (
        ia.augmenters.blur.MotionBlur(k=(3, 10), angle=(0, 360), direction=(-1.0, 1.0)),
        ia.augmenters.geometric.Affine(rotate=rotation),
        ia.augmenters.geometric.Affine(shear=shear),
    )

    # Execute the augmentation steps one after another.
    distorted = bgr_pixels
    for augmenter in pipeline:
        distorted = augmenter.augment_image(distorted)

    # Back to an RGB PIL image.
    img = Image.fromarray(cv2.cvtColor(distorted, cv2.COLOR_BGR2RGB))

    # Crop an equal fraction from every side; the fraction follows from a
    # randomly perturbed copy of `scaler`.
    rand_scaler = scaler * random.uniform(0.7, 1.3)
    crop_side_percentage = (rand_scaler - 1) / (2 * rand_scaler)
    keep_until = 1 - crop_side_percentage

    img_w, img_h = img.size[0], img.size[1]
    crop_box = (
        img_w * crop_side_percentage,  # left
        img_h * crop_side_percentage,  # top
        img_w * keep_until,            # right
        img_h * keep_until,            # bottom
    )

    return img.crop(crop_box), rotation, shear

In [7]:
# Generator

In [22]:
input_data_folder = "invoice_img_data"
background_img_folder = "background_img"


def _list_image_names(folder, extensions):
    """Return the sorted file names in `folder` that end with one of
    `extensions` (a tuple of suffixes) and do not contain "_label"."""
    return sorted(
        name for name in os.listdir(folder)
        if "_label" not in name and name.endswith(extensions)
    )


# The previous background filter read `A and B or C or D`, which Python parses
# as `(A and B) or C or D`, so "_label" files ending in .jpg/.jpeg slipped
# through. Matching on the suffix (instead of a substring anywhere in the
# name) and sorting the result keeps the lists well-defined and reproducible.
input_img_names = _list_image_names(input_data_folder, (".png",))
background_img_names = _list_image_names(background_img_folder, (".png", ".jpg", ".jpeg"))

In [23]:
# Sanity check: how many foreground and background file names were collected.
len(input_img_names), len(background_img_names)

(300, 23)

In [26]:
# Spot-check one randomly chosen file name from each list.
random.choice(input_img_names), random.choice(background_img_names)

('out_244.png',
 'mobile-phone-with-blank-screen-on-wooden-table-background-top-view-with-copy-space_1253-984.jpg')

In [28]:
# test augmentation: show five random foreground/background combinations
for sample_idx in range(5):
    # Pick the background first, then the foreground.
    background_path = os.path.join(background_img_folder, random.choice(background_img_names))
    background = Image.open(background_path)
    foreground_path = os.path.join(input_data_folder, random.choice(input_img_names))
    foreground = Image.open(foreground_path)

    # The names assigned here stay bound after the loop; later cells reuse them.
    final_img, rotation, shear = augment_image(
        foreground, background, max_foreground_size, scaler
    )
    final_img.show()

# Training

In [10]:
from tqdm import tqdm

import torch
from torch import nn

# https://stackoverflow.com/questions/42480111/model-summary-in-pytorch
from torchsummary import summary

# https://github.com/lanpa/tensorboardX
from tensorboardX import SummaryWriter
# Event writer for scalar logging; its files go to logs/image_rotation.
writer = SummaryWriter("logs/image_rotation")



In [11]:
# torch device handle; CPU is selected by default. Swap the two lines below
# to select CUDA instead.
device = torch.device('cpu')
# device = torch.device('cuda') # Uncomment this to run on GPU

In [12]:
class depthwise_separable_conv(nn.Module):
    """Depthwise-separable 2D convolution.

    A grouped convolution with one group per input channel is followed by a
    1x1 convolution that maps ``nin`` channels to ``nout`` channels.

    Args:
        nin: Number of input channels.
        nout: Number of output channels.
        ksize: Kernel size of the depthwise convolution.
        padd: Padding of the depthwise convolution.
    """

    def __init__(self, nin, nout, ksize, padd):
        super().__init__()
        # groups=nin: every input channel is filtered on its own.
        self.depthwise = nn.Conv2d(
            in_channels=nin,
            out_channels=nin,
            kernel_size=ksize,
            padding=padd,
            groups=nin,
        )
        # The 1x1 convolution mixes the channels and sets the output width.
        self.pointwise = nn.Conv2d(in_channels=nin, out_channels=nout, kernel_size=1)

    def forward(self, x):
        """Apply the depthwise convolution, then the pointwise convolution."""
        return self.pointwise(self.depthwise(x))

In [13]:
class conv_max_step(nn.Module):
    """One feature-extraction stage: separable conv, batch norm, ReLU, 2x2 max-pool.

    Args:
        nin: Number of input channels.
        nout: Number of output channels.
        ksize: Kernel size passed to depthwise_separable_conv.
        padd: Padding passed to depthwise_separable_conv.
    """

    def __init__(self, nin, nout, ksize, padd):
        super().__init__()
        self.conv = depthwise_separable_conv(nin, nout, ksize, padd)
        self.batchn = nn.BatchNorm2d(num_features=nout)
        self.relu = nn.ReLU()
        self.maxp = nn.MaxPool2d(kernel_size=2)

    def forward(self, x):
        """Run the four layers in order: conv, batch norm, ReLU, max-pool."""
        for layer in (self.conv, self.batchn, self.relu, self.maxp):
            x = layer(x)
        return x

In [14]:
class Model(nn.Module):
    """Small CNN: four conv_max_step stages followed by one linear layer.

    Every stage doubles the channel count and (through its 2x2 max-pool)
    halves the spatial side length; the flattened result feeds a single
    fully connected layer with ``nclasses`` outputs.

    Args:
        max_img_size: Side length in pixels of the square input images.
        nchannel: Number of channels of the input images.
        nclasses: Number of values predicted per image.
    """

    def __init__(self, max_img_size, nchannel, nclasses):
        super().__init__()

        # Integer bookkeeping of the feature-map side; floor division mirrors
        # what the max-pool does to odd sizes.
        side = int(max_img_size)

        # (kernel size, padding) per stage; the padding keeps the side length
        # unchanged through the convolution itself.
        stage_specs = ((7, 3), (3, 1), (3, 1), (3, 1))

        nin = nchannel
        stages = []
        for ksize, padd in stage_specs:
            nout = int(nin * 2)
            stages.append(conv_max_step(nin, nout, ksize, padd))
            nin = nout
            side //= 2
        self.cm1, self.cm2, self.cm3, self.cm4 = stages

        # Number of features per sample after the last stage.
        self.lin_dim = nin * side * side
        self.fc = nn.Linear(in_features=self.lin_dim, out_features=nclasses)

    def forward(self, x):
        """Return the ``nclasses`` predictions for every image in the batch ``x``."""
        x = self.cm1(x)
        x = self.cm2(x)
        x = self.cm3(x)
        x = self.cm4(x)
        # Flatten per sample. The former view(-1, lin_dim) could silently
        # regroup the batch dimension when the input size did not match
        # max_img_size; now such a mismatch fails at the linear layer.
        x = x.flatten(start_dim=1)
        return self.fc(x)

In [15]:
def training_data_generator(input_img_names, background_img_names, itr, batch_size):
    """Build one batch of freshly augmented training samples.

    Reads the module-level settings ``input_data_folder``,
    ``background_img_folder``, ``max_foreground_size``, ``scaler`` and
    ``max_img_size``, which must be defined before this function is called.

    Args:
        input_img_names: File names of the foreground images.
        background_img_names: File names of the candidate backgrounds; one is
            drawn at random for every sample.
        itr: Zero-based batch index inside the epoch.
        batch_size: Maximum number of samples per batch.

    Returns:
        Tuple ``(X, y)``: ``X`` is a float tensor laid out as
        (batch, channel, height, width); ``y`` is a float tensor of shape
        (batch, 2) holding the rotation and shear of every sample.

    Raises:
        ValueError: If the requested slice of ``input_img_names`` is empty.
    """
    batch_names = input_img_names[itr * batch_size:(itr + 1) * batch_size]
    if len(batch_names) == 0:
        raise ValueError(
            "batch %d is empty for %d input images" % (itr, len(input_img_names)))

    images = []
    rotations = []
    shears = []

    for img_name in batch_names:
        background_name = random.choice(background_img_names)

        # The context managers close the underlying files once the augmented
        # copy has been created.
        with Image.open(os.path.join(input_data_folder, img_name)) as foreground, \
                Image.open(os.path.join(background_img_folder, background_name)) as background:
            curr_img, rot, she = augment_image(
                foreground, background, max_foreground_size, scaler)

        # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
        curr_img = curr_img.resize((max_img_size, max_img_size), Image.LANCZOS)

        images.append(np.array(curr_img))
        rotations.append(rot)
        shears.append(she)

    # (N, H, W, C) -> (N, C, H, W). The former swapaxes(3, -3) produced
    # (N, C, W, H), i.e. it also transposed every image.
    batch = np.moveaxis(np.array(images), -1, 1)
    X = torch.from_numpy(np.ascontiguousarray(batch)).float()

    # One row per sample: [rotation, shear].
    y = torch.from_numpy(np.column_stack([rotations, shears])).float()

    return X, y

In [17]:
# Hyper-parameters
max_epochs = 50
batch_size = 254
learning_rate = 1e-4

# The last batch of an epoch may be smaller than batch_size.
n_batches = int(np.ceil(len(input_img_names)/batch_size))

# Network input: square images with max_img_size pixels per side.
# training_data_generator reads max_img_size as a global.
max_img_size = 64
inp_channels = 3
nclasses = 2  # rotation and shear

# model
model = Model(max_img_size, inp_channels, nclasses).to(device)
model.train()
# loss: summed (not averaged) squared error over all elements of a batch
loss_fn = torch.nn.MSELoss(reduction='sum')
# optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for epoch in tqdm(range(max_epochs)):
    # Accumulate over all batches so the logged "total_loss" covers the whole
    # epoch; previously only the final (possibly smaller) batch was logged and
    # `loss` was undefined when there were no batches.
    epoch_loss = 0.0

    for itr in range(n_batches):
        # get training data from generator
        X, y = training_data_generator(input_img_names, background_img_names, itr, batch_size)
        X, y = X.to(device), y.to(device)

        # Forward pass; calling the module (instead of .forward) runs its hooks.
        y_pred = model(X)

        # Compute loss.
        loss = loss_fn(y_pred, y)

        # Zero the gradients before running the backward pass.
        optimizer.zero_grad()

        # Backward pass: compute gradient of the loss.
        loss.backward()

        # Calling the step function on an Optimizer makes an update to its parameters
        optimizer.step()

        epoch_loss += loss.item()

    writer.add_scalar("total_loss", epoch_loss, epoch)


  0%|          | 0/50 [00:00<?, ?it/s][A

epoch: 0



  2%|▏         | 1/50 [00:26<21:49, 26.72s/it][A

epoch: 1



  4%|▍         | 2/50 [00:53<21:29, 26.86s/it][A

epoch: 2



  6%|▌         | 3/50 [01:31<23:27, 29.94s/it][A

epoch: 3



  8%|▊         | 4/50 [02:00<22:56, 29.92s/it][A

epoch: 4



 10%|█         | 5/50 [02:44<25:25, 33.89s/it][A

epoch: 5



 12%|█▏        | 6/50 [03:16<24:38, 33.59s/it][A

epoch: 6



 14%|█▍        | 7/50 [03:50<24:02, 33.55s/it][A

epoch: 7



 16%|█▌        | 8/50 [04:20<22:43, 32.47s/it][A

epoch: 8



 18%|█▊        | 9/50 [04:55<22:47, 33.35s/it][A

epoch: 9



 20%|██        | 10/50 [05:26<21:37, 32.44s/it][A

epoch: 10



 22%|██▏       | 11/50 [05:54<20:23, 31.38s/it][A

epoch: 11



 24%|██▍       | 12/50 [06:23<19:17, 30.46s/it][A

epoch: 12



 26%|██▌       | 13/50 [06:54<18:54, 30.66s/it][A

epoch: 13



 28%|██▊       | 14/50 [07:27<18:47, 31.33s/it][A

epoch: 14



 30%|███       | 15/50 [07:58<18:19, 31.41s/it][A

epoch: 15



 32%|███▏      | 16/50 [08:27<17:20, 30.61s/it][A

epoch: 16



 34%|███▍      | 17/50 [08:59<17:05, 31.06s/it][A

epoch: 17



 36%|███▌      | 18/50 [09:28<16:14, 30.45s/it][A

epoch: 18



 38%|███▊      | 19/50 [09:58<15:36, 30.20s/it][A

epoch: 19



 40%|████      | 20/50 [10:30<15:18, 30.61s/it][A

epoch: 20



 42%|████▏     | 21/50 [10:59<14:37, 30.25s/it][A

epoch: 21



 44%|████▍     | 22/50 [11:28<13:58, 29.96s/it][A

epoch: 22



 46%|████▌     | 23/50 [11:58<13:25, 29.84s/it][A

epoch: 23



 48%|████▊     | 24/50 [12:26<12:47, 29.51s/it][A

epoch: 24



 50%|█████     | 25/50 [12:55<12:12, 29.31s/it][A

epoch: 25



 52%|█████▏    | 26/50 [13:23<11:32, 28.86s/it][A

epoch: 26



 54%|█████▍    | 27/50 [14:00<12:01, 31.37s/it][A

epoch: 27



 56%|█████▌    | 28/50 [15:48<19:54, 54.31s/it][A

epoch: 28



 58%|█████▊    | 29/50 [17:39<24:58, 71.34s/it][A

epoch: 29



 60%|██████    | 30/50 [18:32<21:55, 65.79s/it][A

epoch: 30



 62%|██████▏   | 31/50 [19:01<17:16, 54.58s/it][A

epoch: 31



 64%|██████▍   | 32/50 [19:28<13:53, 46.31s/it][A

epoch: 32



 66%|██████▌   | 33/50 [19:55<11:30, 40.60s/it][A

epoch: 33



 68%|██████▊   | 34/50 [20:24<09:54, 37.18s/it][A

epoch: 34



 70%|███████   | 35/50 [20:52<08:36, 34.46s/it][A

epoch: 35



 72%|███████▏  | 36/50 [21:19<07:30, 32.19s/it][A

epoch: 36



 74%|███████▍  | 37/50 [21:50<06:54, 31.88s/it][A

epoch: 37



 76%|███████▌  | 38/50 [22:17<06:05, 30.42s/it][A

epoch: 38



 78%|███████▊  | 39/50 [22:46<05:28, 29.87s/it][A

epoch: 39



 80%|████████  | 40/50 [23:15<04:58, 29.81s/it][A

epoch: 40



 82%|████████▏ | 41/50 [23:48<04:35, 30.63s/it][A

epoch: 41



 84%|████████▍ | 42/50 [24:22<04:12, 31.53s/it][A

epoch: 42



 86%|████████▌ | 43/50 [24:52<03:37, 31.10s/it][A

epoch: 43



 88%|████████▊ | 44/50 [25:20<03:00, 30.16s/it][A

epoch: 44



 90%|█████████ | 45/50 [25:48<02:27, 29.57s/it][A

epoch: 45



 92%|█████████▏| 46/50 [26:14<01:54, 28.57s/it][A

epoch: 46



 94%|█████████▍| 47/50 [26:44<01:27, 29.01s/it][A

epoch: 47



 96%|█████████▌| 48/50 [27:14<00:58, 29.39s/it][A

epoch: 48



 98%|█████████▊| 49/50 [27:42<00:28, 28.88s/it][A

epoch: 49



100%|██████████| 50/50 [28:09<00:00, 28.36s/it][A
[A

# Pseudo Inference

In [None]:
def correct_image(final_img, shear, rotation):
    """Apply the opposite shear and the opposite rotation to an image.

    augment_image rotates first and shears second, so the shear is countered
    first here and the rotation afterwards.

    Args:
        final_img: PIL image to correct.
        shear: Shear value to counter (its negative is passed to Affine).
        rotation: Rotation value to counter (its negative is passed to Affine).

    Returns:
        The corrected image as an RGB PIL image.
    """
    # Build both augmenters up front: shear first, rotation second.
    undo_shear = ia.augmenters.geometric.Affine(shear=-shear)
    undo_rotation = ia.augmenters.geometric.Affine(rotate=-rotation)

    # RGB pixels, flipped to BGR channel order.
    bgr_pixels = np.array(final_img.convert('RGB'))[:, :, ::-1].copy()

    # Reverse the augmentation step by step.
    for augmenter in (undo_shear, undo_rotation):
        bgr_pixels = augmenter.augment_image(bgr_pixels)

    # Back to an RGB PIL image.
    return Image.fromarray(cv2.cvtColor(bgr_pixels, cv2.COLOR_BGR2RGB))

In [None]:
# correct_image requires the shear and rotation that produced final_img; the
# call without them raised a TypeError. `shear` and `rotation` are the values
# left over from the last iteration of the augmentation test loop.
correct_image(final_img, shear, rotation).show()

In [None]:
# Show every batch image next to its corrected version.
# NOTE(review): `inpu` only exists as a local variable inside
# training_data_generator, and the module-level `shear` / `rotation` assigned
# in the visible cells are single floats, not sequences. This cell presumably
# relied on earlier kernel state — TODO confirm and rebuild these lists
# before running it.
for i in range(batch_size):
    Image.fromarray(inpu[i]).show()
    correct_image(Image.fromarray(inpu[i]), shear[i], rotation[i]).show()

In [None]:
# Scratch cell: preview only the motion-blur step on one composed image.
# Uses `foreground` / `background` left over from the augmentation test loop.

# load image with background
open_cv_image = np.array(add_background_img(foreground, background, max_foreground_size, scaler).convert('RGB'))

# Convert RGB to BGR
open_cv_image = open_cv_image[:, :, ::-1].copy()

# Random rotation / shear values; they are drawn but not applied in this
# blur-only preview.
rotation = random.uniform(-85,85)
shear = random.uniform(-10,10)

# `blur` was never defined at module level (NameError); build the same
# motion-blur augmenter that augment_image uses.
blur = ia.augmenters.blur.MotionBlur(k=(3,10), angle=(0, 360), direction=(-1.0, 1.0))

# execute augmentation
new_img = blur.augment_image(open_cv_image)

# show
img = cv2.cvtColor(new_img, cv2.COLOR_BGR2RGB)
img = Image.fromarray(img)
img.show()