### This notebook contains code for processing raw images and segments from elastic
Let's convert images to consistently scaled images and segments to masks

In [3]:
import os
import ujson as json
from pprint import pprint
import pathlib
from functools import reduce
from shutil import copyfile

import cv2
import numpy as np
from tqdm import tqdm_notebook
from tensorboardX import SummaryWriter
from sklearn.model_selection import train_test_split

import random
from random import shuffle
from sys import getsizeof

from turbojpeg import TurboJPEG
jpeg = TurboJPEG('/opt/libjpeg-turbo/lib64/libturbojpeg.so')

In [4]:
def mkdir(dir_):
    pathlib.Path(dir_).mkdir(exist_ok=True)

In [5]:
def get_path_pairs(images_paths, segments_path, images_path):
    # generator, which returns path for image and corresponding path for json
    for image_path in images_paths:
        photo_id = pathlib.Path(image_path).stem
        if photo_id.startswith('.') or not((image_path.endswith('.jpeg') or image_path.endswith('.jpg'))):
            continue
        json_path = os.path.join(segments_path, f'{photo_id}.json')
        if os.path.isfile(json_path):
            yield (os.path.join(images_path, image_path), json_path)

In [6]:
SEGMENT_CATEGORIES = (
    "Name",
    "ProductCode",
    "RegularRuble",
    "RegularKop",
    "DiscountRuble",
    "DiscountKop",
    "CardRuble",
    "CardKop"
)

SEGCAT2NUMBER9 = {cat: n for n, cat in enumerate(SEGMENT_CATEGORIES)}

SEGCAT2NUMBER9

{'Name': 0,
 'ProductCode': 1,
 'RegularRuble': 2,
 'RegularKop': 3,
 'DiscountRuble': 4,
 'DiscountKop': 5,
 'CardRuble': 6,
 'CardKop': 7}

In [7]:
def rotate(x: float, y: float, angle: float) -> np.ndarray:
    # angle is positive to counter-clock-wise
    angle = angle * np.pi / 180
    vector = np.array([[x], [y]])
    _sin = np.sin(angle)
    _cos = np.cos(angle)
    matrix = np.array([[_cos, -_sin], [_sin, _cos]])
    return np.squeeze(np.matmul(matrix, vector))

def rotate_around_axis(point, axis, angle):
    translated_x, translated_y = point[0] - axis[0], point[1] - axis[1]
    rotated_x, rotated_y = rotate(translated_x, translated_y, angle)
    return rotated_x + axis[0], rotated_y + axis[1]

rounding = lambda x: int(round(x))

def get_corners(x, y, w, h, a):
    # return 4 points COUNTER-CLOCK-WISE starting with UPPER-LEFT
    x1, y1 = map(rounding, rotate_around_axis((x - w / 2, y - h / 2), (x, y), a))
    x2, y2 = map(rounding, rotate_around_axis((x - w / 2, y + h / 2), (x, y), a))
    x3, y3 = map(rounding, rotate_around_axis((x + w / 2, y + h / 2), (x, y), a))
    x4, y4 = map(rounding, rotate_around_axis((x + w / 2, y - h / 2), (x, y), a))
    return (x1, y1), (x2, y2), (x3, y3), (x4, y4)

def xywha_fetcher(obj):
    return obj['x'], obj['y'], obj['width'], obj['height'], obj['angle']

MAX_SIDE = 256

def resize(img, mask, inter=cv2.INTER_LINEAR):
    h, w, c = img.shape
    max_ = max(h, w)
    h = h * MAX_SIDE // max_ 
    w = w * MAX_SIDE // max_
    return cv2.resize(img, (w, h), interpolation=inter), \
           cv2.resize(mask, (w, h))

In [8]:
def obtain_true_ratios(dataset_len, ratios, max_size):
    return {mode: ratios[mode] if ratios[mode] * dataset_len <= max_size[mode] else max_size[mode] / dataset_len
            for mode in ('test', 'val')}

In [9]:
def transer_data(images, masks, img_dir, label_dir):
    for image_path, mask_path in tqdm_notebook(zip(images, masks), total=len(images)):
        if os.path.isfile(image_path):
            stem = pathlib.Path(image_path).stem
            copyfile(image_path, os.path.join(img_dir, f'{stem}.jpg'))
            copyfile(mask_path, os.path.join(label_dir, f'{stem}.npy'))

In [10]:
def process_dataset(dataset_path, tensorboard_path, split_params=None):
    images_path = os.path.join(dataset_path, 'images')
    segments_path = os.path.join(dataset_path, 'segments')
    
    output_dir = os.path.join(dataset_path, 'processed_data')
    output_images_path = os.path.join(output_dir, 'images')
    output_masks_path = os.path.join(output_dir, 'masks9')
    mkdir(output_dir)
    mkdir(output_images_path)
    mkdir(output_masks_path)
    
    image_files_paths = [x 
                     for x in os.listdir(images_path)
                     if os.path.isfile(os.path.join(images_path, x))]

    img_number = len(image_files_paths)
    market = dataset_path.split('-')[-1] if len(dataset_path.split('-')) > 1 else dataset_path.split('/')[-1]
    print(f'{market} processing has begun')
    with SummaryWriter(tensorboard_path) as writer:

        for i, (image_path, json_path) in tqdm_notebook(enumerate(get_path_pairs(image_files_paths,
                                                                                 segments_path,
                                                                                 images_path)),
                                                        total=img_number):
            stem = pathlib.Path(image_path).stem
            processed_image_path = os.path.join(output_images_path, f'{stem}.jpg')
            mask_path = os.path.join(output_masks_path, f'{stem}.npy')

            if os.path.isfile(processed_image_path) and os.path.isfile(mask_path):
                continue

            with open(image_path, 'rb') as f:
                img = jpeg.decode(f.read())
            img_shape = img.shape

            with open(json_path, 'r') as json_file:
                segments = json.load(json_file)
            manual_segments = [(y['coordinates'], y['segmentType'])
                               for y in filter(lambda x: not x['moderation']['moderatedBy'].endswith('machine') \
                                                         and x['segmentType'] in SEGMENT_CATEGORIES,
                                               segments)]

            mask = np.zeros((*img_shape[:2], 9), dtype=np.uint8)
            for segment, segment_type in manual_segments:
                corners = get_corners(*xywha_fetcher(segment))
                corners = np.array([corners], dtype=np.int32)
                temp_canvas = mask[..., SEGCAT2NUMBER9[segment_type]].copy()
                mask[..., SEGCAT2NUMBER9[segment_type]] = cv2.fillConvexPoly(temp_canvas, corners, 255)

            mask[..., 8] = 255 - reduce(cv2.add, iter(mask[..., c] for c in range(8)))
            img, mask = resize(img, mask)

            with open(mask_path, 'wb') as numpy_file:
                np.save(numpy_file, mask)

            with open(processed_image_path, 'wb') as jpeg_file:
                jpeg_file.write(jpeg.encode(img, quality=100))
                
            writer.add_scalar(f'DataProcessing/{market}', (img_number - 1) - i, i)
    
    if split_params:
        all_proccessed_images = os.listdir(output_images_path)
        all_proccessed_tags = [os.path.join(output_masks_path, image_name.split('.')[0]+'.npy')
                               for image_name in all_proccessed_images]
        all_proccessed_images = [os.path.join(output_images_path, image_name)
                                 for image_name in all_proccessed_images]
        
        ratios = split_params['ratios']
        max_size = split_params['max_size']
        ratios = obtain_true_ratios(len(all_proccessed_images), ratios, max_size)
        
        X, X_test, y, y_test = train_test_split(all_proccessed_images, all_proccessed_tags, test_size=ratios['test'], random_state=42)
        val_percent = ratios['val'] / (1 - ratios['test'])
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=val_percent, random_state=42)
        
        cache_dir = split_params['directory']
        mkdir(cache_dir)

        train_dir = os.path.join(cache_dir, 'train')
        train_images = os.path.join(train_dir, 'images')
        train_labels = os.path.join(train_dir, 'labels')

        val_dir = os.path.join(cache_dir, 'val')
        val_images = os.path.join(val_dir, 'images')
        val_labels = os.path.join(val_dir, 'labels')

        test_dir = os.path.join(cache_dir, 'test')
        test_images = os.path.join(test_dir, 'images')
        test_labels = os.path.join(test_dir, 'labels')
        
        for dir_ in (train_dir, val_dir, test_dir, 
                     train_images, train_labels,
                     val_images, val_labels,
                     test_images, test_labels):
            mkdir(dir_)
        
        transer_data(X_train, y_train, train_images, train_labels)
        transer_data(X_val, y_val, val_images, val_labels)
        transer_data(X_test, y_test, test_images, test_labels)

In [11]:
main_dir = '../storage/segmentator-tags/'
datasets_list = [os.path.join(main_dir, x) for x in os.listdir(main_dir) if not x.startswith('.')]
datasets_list

['../storage/segmentator-tags/magnit-info.ru',
 '../storage/segmentator-tags/auchan.ru',
 '../storage/segmentator-tags/ataksupermarket.ru',
 '../storage/segmentator-tags/globus.ru',
 '../storage/segmentator-tags/maxi-retail.ru',
 '../storage/segmentator-tags/okmarket.ru',
 '../storage/segmentator-tags/bristol.ru',
 '../storage/segmentator-tags/europa-ts.ru',
 '../storage/segmentator-tags/5ka.ru',
 '../storage/segmentator-tags/spar.ru',
 '../storage/segmentator-tags/lenta.com',
 '../storage/segmentator-tags/dixy.ru']

In [12]:
split_params = {
    "directory": '../cached_data/all/',
    "ratios":
    {
        "val": 0.01,
        "test": 0.005
    },
    "max_size":
    {
        "val": 200,
        "test": 50
    }
}

In [None]:
for dataset_path in datasets_list:
    process_dataset(dataset_path, '../tensorboard/processing/run1')#, split_params)

info.ru processing has begun


HBox(children=(IntProgress(value=0, max=99807), HTML(value='')))

tags/auchan.ru processing has begun


HBox(children=(IntProgress(value=0, max=64162), HTML(value='')))

tags/ataksupermarket.ru processing has begun


HBox(children=(IntProgress(value=0, max=1163), HTML(value='')))

tags/globus.ru processing has begun


HBox(children=(IntProgress(value=0, max=8842), HTML(value='')))

retail.ru processing has begun


HBox(children=(IntProgress(value=0, max=17355), HTML(value='')))

tags/okmarket.ru processing has begun


HBox(children=(IntProgress(value=0, max=80111), HTML(value='')))

tags/bristol.ru processing has begun


HBox(children=(IntProgress(value=0, max=33743), HTML(value='')))

ts.ru processing has begun


HBox(children=(IntProgress(value=0, max=65541), HTML(value='')))

tags/5ka.ru processing has begun


HBox(children=(IntProgress(value=0, max=38236), HTML(value='')))

tags/spar.ru processing has begun


HBox(children=(IntProgress(value=0, max=93730), HTML(value='')))

tags/lenta.com processing has begun


HBox(children=(IntProgress(value=0, max=95607), HTML(value='')))

tags/dixy.ru processing has begun


HBox(children=(IntProgress(value=0, max=3035), HTML(value='')))

In [14]:
print('Done')

Done


In [72]:
obtain_true_ratios(100000, {'test': 0.005, 'val': 0.01}, {'test': 100, 'val': 1000})

{'test': 0.001, 'val': 0.01}