<b>Назначение:</b> <br>
Балансировка классов в исходной выборке

In [16]:
import pandas as pd
import os 
import numpy as np
from tqdm import tqdm
import random
import cv2
import xxhash
import albumentations as A
import matplotlib.pyplot as plt
from collections import Counter

In [17]:
# Class-size thresholds used by the balancing loop:
#   * classes with at least UPPER_BOUND images are left untouched;
#   * classes with at most LOWER_BOUND images are only reported as "minor";
#   * every class in between is padded with augmented copies up to UPPER_BOUND.
LOWER_BOUND = 200
UPPER_BOUND = 6_000

# Input table describing the original images (semicolon-separated CSV).
DATA_INFO_CSV = './data/confirmed_fronts_info.csv'

# Outputs: the folder receiving the augmented images and the CSV table describing them.
AUG_FOLDER = './data/augmented_fronts'
AUGMENTATION_INFO_CSV = './data/augmented_fronts_info.csv'

In [18]:
# Load the table of original images; the file uses ';' as the field separator.
data_info = pd.read_csv(
    filepath_or_buffer=DATA_INFO_CSV,
    sep=';',
)

In [19]:
# Distinct values of the 'label' column, converted to a plain Python list.
label_column = data_info['label']
UNIQUE_LABELS = label_column.unique().tolist()

In [20]:
# Geometric-only augmentation pipeline applied to every oversampled image.
# A ColorJitter step (brightness/contrast/saturation) was previously listed
# here but is disabled.
# NOTE(review): presumably because the class labels are colour names, so
# colour-altering transforms would invalidate them — confirm before re-enabling.
TRANSFORM_PIPELINE = A.Compose([
    # Rotation by a random angle within +/-180 degrees, applied to every image;
    # the exposed corners are filled by replicating border pixels.
    # `p=1.0` is used instead of the deprecated `always_apply=True`.
    A.Rotate(limit=180, border_mode=cv2.BORDER_REPLICATE, p=1.0),
    # The remaining transforms run with the library's default probability.
    A.Perspective(),
    A.HorizontalFlip(),
])

In [21]:
# Oversample every mid-sized class up to UPPER_BOUND images by writing
# augmented copies of its originals into AUG_FOLDER.
#
# Produces:
#   aug_tmp       - one [relative_path, image_name, label] row per saved image
#   minor_classes - labels with at most LOWER_BOUND originals (not augmented)
#   image_size    - array shape of every source image that was read
#   aug_df        - DataFrame built from aug_tmp
#
# Raises:
#   FileNotFoundError - a source image is missing or cannot be decoded
#   OSError           - an augmented image could not be written
aug_tmp = []
minor_classes = []
image_size = []

# cv2.imwrite does not create missing directories (it only returns False),
# so make sure the output folder exists before the first write.
os.makedirs(AUG_FOLDER, exist_ok=True)
relative_path = AUG_FOLDER

for label_name in UNIQUE_LABELS:
    # Skip classes whose size falls outside the configured bounds.
    label_idxs = data_info.index[data_info['label'] == label_name].tolist()
    class_size = len(label_idxs)
    if class_size >= UPPER_BOUND:
        # Already large enough, nothing to add.
        continue
    if class_size <= LOWER_BOUND:
        # Too few originals: only report the class, do not augment it.
        print(f"Minor class: {label_name}")
        minor_classes.append(label_name)
        continue

    # Choose the images to augment: every original is reused the same whole
    # number of times, and the remainder is drawn at random with replacement.
    aug_amount = UPPER_BOUND - class_size
    aug_idxs = label_idxs * (aug_amount // class_size)
    aug_idxs += random.choices(label_idxs, k=aug_amount - len(aug_idxs))

    # Augment the selected images.
    print(f"{label_name}:")
    for aug_idx in tqdm(aug_idxs):
        # Load the source image.
        orig_image_path = (
            data_info.at[aug_idx, 'relative_path']
            + '/'
            + data_info.at[aug_idx, 'image_name']
        )
        orig_image = cv2.imread(orig_image_path)
        if orig_image is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError(
                f"Source image is missing or unreadable: {orig_image_path}"
            )
        image_size.append(orig_image.shape)

        # Augment; the content hash of the result keeps file names distinct
        # when the same original is augmented several times.
        aug_image = TRANSFORM_PIPELINE(image=orig_image)["image"]
        aug_image_hash = xxhash.xxh64(aug_image).hexdigest()

        # Save the augmented image.
        image_name = f"{aug_idx}$${aug_image_hash}$${label_name}.jpg"
        aug_image_path = f"{relative_path}/{image_name}"
        if not cv2.imwrite(aug_image_path, aug_image):
            # Do not record a file that was never written.
            raise OSError(f"Failed to write augmented image: {aug_image_path}")

        # Register the saved image in the info table.
        aug_tmp.append([relative_path, image_name, label_name])

aug_df = pd.DataFrame(aug_tmp, columns=['relative_path', 'image_name', 'label'])

Minor class: Multicolour
Gold:


  0%|          | 0/5783 [00:00<?, ?it/s]

100%|██████████| 5783/5783 [00:21<00:00, 274.16it/s]


Brown:


100%|██████████| 5089/5089 [00:18<00:00, 272.82it/s]


Beige:


100%|██████████| 5400/5400 [00:19<00:00, 274.37it/s]


Green:


100%|██████████| 5223/5223 [00:19<00:00, 274.10it/s]


Bronze:


100%|██████████| 5671/5671 [00:20<00:00, 273.88it/s]


Orange:


100%|██████████| 5441/5441 [00:19<00:00, 272.67it/s]


Yellow:


100%|██████████| 5333/5333 [00:19<00:00, 269.24it/s]


Purple:


100%|██████████| 5638/5638 [00:20<00:00, 272.23it/s]

Minor class: Burgundy
Minor class: Navy
Minor class: Turquoise
Minor class: Magenta
Minor class: Pink
Minor class: Maroon
Minor class: Indigo





In [26]:
# Persist the table of augmented images as a semicolon-separated CSV,
# without writing the DataFrame index as a column.
aug_df.to_csv(
    path_or_buf=AUGMENTATION_INFO_CSV,
    sep=';',
    index=False,
)

In [27]:
# Tally how many of the processed source images had each array shape.
shape_counts = Counter(image_size)
shape_counts

Counter({(300, 300, 3): 43578})

In [28]:
# Labels that had at most LOWER_BOUND images and were therefore not augmented.
minor_classes

['Multicolour',
 'Burgundy',
 'Navy',
 'Turquoise',
 'Magenta',
 'Pink',
 'Maroon',
 'Indigo']

In [29]:
# Number of augmented images generated per class label.
label_counts = Counter(aug_df['label'])
label_counts

Counter({'Gold': 5783,
         'Bronze': 5671,
         'Purple': 5638,
         'Orange': 5441,
         'Beige': 5400,
         'Yellow': 5333,
         'Green': 5223,
         'Brown': 5089})