In [None]:
! pip install imgaug
! pip install imagecorruptions

In [51]:
import os
import json
from ast import literal_eval
import math
import random

import numpy as np 
import pandas as pd 
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

import imgaug as ia
import imgaug.augmenters as iaa

from PIL import Image

import warnings
warnings.filterwarnings('ignore')

## Чтение датасета

In [133]:
# Load the train / validation annotation files.
# The context managers close each handle even when json.load raises, and the
# explicit encoding avoids depending on the platform default (JSON is UTF-8).
with open('./data/train_anno.json', encoding='utf-8') as f_train:
    data_train = json.load(f_train)

with open('./data/val_anno.json', encoding='utf-8') as f_val:
    data_val = json.load(f_val)

In [134]:
# Turn the three top-level sections of each annotation file into DataFrames.
df_train_img, df_train_annot, df_train_cat = (
    pd.DataFrame(data_train[section])
    for section in ('images', 'annotations', 'categories')
)

df_val_img, df_val_annot, df_val_cat = (
    pd.DataFrame(data_val[section])
    for section in ('images', 'annotations', 'categories')
)

In [135]:
# Attach every annotation to its image, then resolve category_id to the
# category record. The merges leave three id columns behind ('id_x' from the
# images, 'id_y' from the annotations, 'id' from the categories); they are
# dropped because image_id / category_id already carry the keys.
df_train = (
    df_train_img
    .merge(df_train_annot, how="inner", left_on="id", right_on="image_id")
    .merge(df_train_cat, how="inner", left_on="category_id", right_on="id")
    .drop(columns=['id_x', 'id_y', 'id'])
)

df_train.head(5)

Unnamed: 0,width,height,file_name,image_id,category_id,area,bbox,iscrowd,name
0,1280,720,rtsd-frames/autosave01_02_2012_09_13_33.jpg,0,1,324,"[649, 376, 18, 18]",0,2_1
1,1280,720,rtsd-frames/autosave01_02_2012_09_13_34.jpg,1,1,420,"[671, 356, 20, 21]",0,2_1
2,1280,720,rtsd-frames/autosave01_02_2012_09_13_35.jpg,2,1,702,"[711, 332, 27, 26]",0,2_1
3,1280,720,rtsd-frames/autosave01_02_2012_09_13_36.jpg,3,1,1332,"[764, 290, 37, 36]",0,2_1
4,1280,720,rtsd-frames/autosave01_02_2012_09_13_37.jpg,4,1,3192,"[876, 200, 56, 57]",0,2_1


In [136]:
# Same join as for the training split: images -> annotations -> categories,
# followed by removal of the three leftover id columns.
df_val = (
    df_val_img
    .merge(df_val_annot, how="inner", left_on="id", right_on="image_id")
    .merge(df_val_cat, how="inner", left_on="category_id", right_on="id")
    .drop(columns=['id_x', 'id_y', 'id'])
)

df_val.head(5)

Unnamed: 0,width,height,file_name,image_id,category_id,area,bbox,iscrowd,name
0,1280,720,rtsd-frames/autosave10_10_2012_13_50_36_1.jpg,22978,3,810,"[622, 375, 30, 27]",0,1_17
1,1920,1080,rtsd-frames/autosave16_04_2013_15_11_26_1.jpg,34704,3,667,"[1050, 436, 29, 23]",0,1_17
2,1920,1080,rtsd-frames/autosave16_04_2013_13_19_50_2.jpg,32165,3,1720,"[1033, 506, 43, 40]",0,1_17
3,1920,1080,rtsd-frames/autosave13_04_2013_11_07_29_0.jpg,25166,3,1260,"[1185, 380, 36, 35]",0,1_17
4,1920,1080,rtsd-frames/autosave16_04_2013_13_19_50_0.jpg,32163,3,980,"[979, 530, 35, 28]",0,1_17


In [137]:
# Stack the train and validation rows into one table (row labels of both
# inputs are kept as-is) and give the category column an explicit name.
df = pd.concat([df_train, df_val], axis=0).rename(columns={'name': 'category_name'})

df.head(5)

Unnamed: 0,width,height,file_name,image_id,category_id,area,bbox,iscrowd,category_name
0,1280,720,rtsd-frames/autosave01_02_2012_09_13_33.jpg,0,1,324,"[649, 376, 18, 18]",0,2_1
1,1280,720,rtsd-frames/autosave01_02_2012_09_13_34.jpg,1,1,420,"[671, 356, 20, 21]",0,2_1
2,1280,720,rtsd-frames/autosave01_02_2012_09_13_35.jpg,2,1,702,"[711, 332, 27, 26]",0,2_1
3,1280,720,rtsd-frames/autosave01_02_2012_09_13_36.jpg,3,1,1332,"[764, 290, 37, 36]",0,2_1
4,1280,720,rtsd-frames/autosave01_02_2012_09_13_37.jpg,4,1,3192,"[876, 200, 56, 57]",0,2_1


In [138]:
# Unpack every bbox list into a 2-D array with one row per annotation.
temp = np.array([list(box) for box in df['bbox'].to_numpy()])
bbox_columns = pd.DataFrame(temp, columns=['x', 'y', 'w', 'h'])

# Both frames are re-indexed from 0 so they line up row by row; reset_index()
# keeps the previous labels as 'index' columns, which a later cell removes.
df = pd.concat([df.reset_index(), bbox_columns.reset_index()], axis=1).drop('bbox', axis=1)

In [78]:
# Count annotations per category, most frequent first.
grouped_data = (
    df.groupby('category_name')
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)

# Bar chart of the class distribution in the full table.
fig = px.bar(
    grouped_data,
    x='category_name',
    y='count',
    labels={'category_name': 'category_name'},
    color_discrete_sequence=['blue'],
    title='Distribution of sign_class',
)
fig.show()

In [79]:
# Remove the leftover 'index' columns created by reset_index() when the bbox
# columns were attached. errors='ignore' makes the cell safe to re-run: without
# it a second execution raises KeyError because the columns are already gone.
df.drop('index', axis=1, inplace=True, errors='ignore')

In [80]:
# Inspect the flattened annotation table (bbox split into x / y / w / h columns).
df

Unnamed: 0,width,height,file_name,image_id,category_id,area,iscrowd,category_name,x,y,w,h
0,1280,720,rtsd-frames/autosave01_02_2012_09_13_33.jpg,0,1,324,0,2_1,649,376,18,18
1,1280,720,rtsd-frames/autosave01_02_2012_09_13_34.jpg,1,1,420,0,2_1,671,356,20,21
2,1280,720,rtsd-frames/autosave01_02_2012_09_13_35.jpg,2,1,702,0,2_1,711,332,27,26
3,1280,720,rtsd-frames/autosave01_02_2012_09_13_36.jpg,3,1,1332,0,2_1,764,290,37,36
4,1280,720,rtsd-frames/autosave01_02_2012_09_13_37.jpg,4,1,3192,0,2_1,876,200,56,57
...,...,...,...,...,...,...,...,...,...,...,...,...
104353,1920,1080,rtsd-frames/autosave16_04_2013_13_34_06_2.jpg,32502,147,840,0,1_26,1192,838,30,28
104354,1280,720,rtsd-frames/autosave24_10_2013_11_21_12_0.jpg,57590,153,1000,0,7_18,869,350,25,40
104355,1280,720,rtsd-frames/autosave10_10_2012_09_29_51_1.jpg,18492,133,899,0,5_12,781,333,31,29
104356,1280,720,rtsd-frames/autosave16_10_2012_08_36_42_0.jpg,38147,149,966,0,6_8_1,820,316,23,42


## Уменьшение датасета

In [94]:
# Upper bound on the number of annotations kept for any single category.
MAX_PER_CATEGORY = 500

# Shuffle each category and keep at most MAX_PER_CATEGORY rows of it.
# The pieces are collected in a list and concatenated once: growing a DataFrame
# with pd.concat inside the loop copies all accumulated rows on every iteration.
# sort=False keeps the categories in order of first appearance, like unique().
category_samples = [
    df_cat.sample(frac=1).reset_index(drop=True).iloc[:MAX_PER_CATEGORY]
    for _, df_cat in df.groupby('category_name', sort=False)
]

# pd.concat raises on an empty list, so fall back to an empty frame.
df_result = pd.concat(category_samples, axis=0) if category_samples else pd.DataFrame()

In [95]:
# Per-category counts after the cap, most frequent first.
grouped_data = (
    df_result.groupby('category_name')
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)

# Bar chart of the class distribution in the reduced table.
fig = px.bar(
    grouped_data,
    x='category_name',
    y='count',
    labels={'category_name': 'category_name'},
    color_discrete_sequence=['blue'],
    title='Distribution of sign_class',
)
fig.show()

In [96]:
# Inspect the reduced table; row labels restart at 0 within every category
# because each per-category sample was re-indexed before concatenation.
df_result

Unnamed: 0,width,height,file_name,image_id,category_id,area,iscrowd,category_name,x,y,w,h
0,1280,720,rtsd-frames/autosave24_10_2013_11_21_31_0.jpg,57609,1,1764,0,2_1,803,351,42,42
1,1280,720,rtsd-frames/autosave09_10_2012_12_54_26_0.jpg,11150,1,702,0,2_1,788,333,26,27
2,1920,1080,rtsd-frames/autosave16_04_2013_11_53_05_0.jpg,30508,1,323,0,2_1,924,535,19,17
3,1920,1080,rtsd-frames/autosave13_04_2013_09_48_32_0.jpg,24663,1,5313,0,2_1,1119,225,69,77
4,1280,720,rtsd-frames/autosave23_10_2012_09_03_01_1.jpg,45945,1,624,0,2_1,1006,313,26,24
...,...,...,...,...,...,...,...,...,...,...,...,...
7,1280,720,rtsd-frames/autosave24_10_2013_11_23_19_2.jpg,57701,154,2340,0,7_14,767,306,39,60
8,1280,720,rtsd-frames/autosave24_10_2013_11_23_20_1.jpg,57703,154,16912,0,7_14,1143,101,112,151
0,1280,720,rtsd-frames/autosave24_10_2013_11_29_00_2.jpg,57824,155,3648,0,8_23,1203,283,76,48
1,1280,720,rtsd-frames/autosave24_10_2013_11_29_00_0.jpg,57822,155,612,0,8_23,762,367,36,17


In [97]:
# Files referenced by the full table but by no row of the reduced table.
unused_files = set(df['file_name']) - set(df_result['file_name'])

# Keep the second '/'-separated component, i.e. the bare file name for
# entries of the form 'rtsd-frames/<name>'.
files_to_drop = [file_name.split('/')[1] for file_name in unused_files]

['rtsd-frames/autosave01_02_2012_09_28_01_0.8794150510117291.jpg',
 'rtsd-frames/autosave24_10_2012_10_38_51_1.jpg',
 'rtsd-frames/autosave09_10_2012_14_04_29_1_0.8618184723759348.jpg',
 'rtsd-frames/autosave09_10_2012_13_17_48_0.jpg',
 'rtsd-frames/autosave16_04_2013_14_52_54_2_0.481617925019181.jpg',
 'rtsd-frames/autosave09_11_2012_09_34_35_0_0.4535132293067088.jpg',
 'rtsd-frames/autosave13_04_2013_09_58_33_0_0.6219367067135131.jpg',
 'rtsd-frames/autosave23_10_2012_11_13_20_1_0.3381856559581883.jpg',
 'rtsd-frames/autosave10_10_2012_09_40_11_1_0.5602914673825458.jpg',
 'rtsd-frames/autosave02_10_2012_12_15_58_2_0.458516878415713.jpg',
 'rtsd-frames/autosave09_10_2012_09_54_14_0.jpg',
 'rtsd-frames/autosave09_10_2012_07_56_38_1_0.07259765182907585.jpg',
 'rtsd-frames/autosave13_04_2013_09_57_00_0_0.20733226428615537.jpg',
 'rtsd-frames/autosave02_10_2012_12_42_37_2_0.7755345589591811.jpg',
 'rtsd-frames/autosave16_04_2013_11_42_47_1_0.14048468186331875.jpg',
 'rtsd-frames/autosave2

In [102]:
path = './data/rtsd-frames/test/'

# Delete the unused frames with os.remove instead of shelling out to `rm`:
# no shell parsing of the file name, works on every platform, and failures are
# visible in the notebook instead of being lost with the shell's exit status.
for file_name in files_to_drop:
    try:
        os.remove(os.path.join(path, file_name))
    except FileNotFoundError:
        # Already absent -- nothing to do (keeps the loop best-effort like `rm`).
        continue
    except OSError as err:
        print(f"Could not remove {file_name}: {err}")

## Аугментация

In [84]:
# Directory the source frames are read from during augmentation
# (rebinds `path`, which an earlier cell pointed at the output directory).
path = './data/rtsd-frames/rtsd-frames/'
# Directory the augmented copies are written to.
new_path = './data/rtsd-frames/test/'

In [85]:
# ia.seed(1)  # left disabled, as in the original cell
p = 0.3

# (probability, augmenter) pairs. Every augmenter is wrapped in iaa.Sometimes,
# so each one fires independently with its own probability.
_augmenter_specs = [
    # blur
    (p, iaa.GaussianBlur(sigma=(0.0, 1))),
    (p, iaa.MotionBlur(k=(3, 6))),
    # contrast
    (p, iaa.pillike.Equalize()),
    (p, iaa.pillike.Autocontrast()),
    # noise / weather-like corruptions
    (p, iaa.imgcorruptlike.GaussianNoise(severity=1)),
    (p, iaa.imgcorruptlike.Fog(severity=(1, 2))),
    (0.2, iaa.imgcorruptlike.Frost(severity=1)),
    (0.2, iaa.imgcorruptlike.Snow(severity=1)),
    (0.2, iaa.imgcorruptlike.Spatter(severity=(1, 2))),
]

seq = iaa.Sequential(
    [iaa.Sometimes(probability, augmenter) for probability, augmenter in _augmenter_specs],
    random_order=True,  # apply augmenters in random order
)

In [86]:
# Number of rows every category should reach after augmentation.
TARGET_PER_CATEGORY = 500


def _augment_row(row, n_copies):
    """Save ``n_copies`` augmented variants of the image referenced by ``row``.

    Reads the image from ``path``, runs it through ``seq`` once per copy and
    writes each result to ``new_path`` under a new randomly suffixed name.

    Returns a list with one copy of ``row`` per saved image, whose
    ``file_name`` holds the new file name (without directory prefix).
    """
    image_name = row['file_name'].split('/')[1]

    # Close the source file as soon as its pixels are in memory.
    with Image.open(path + image_name) as source:
        image = np.array(source, dtype=np.uint8)

    stem = image_name.split('.jpg')[0]
    new_rows = []
    for _ in range(n_copies):
        # The random suffix keeps several variants of one frame from
        # overwriting each other.
        new_image_name = stem + '_' + str(random.random()) + '.jpg'
        Image.fromarray(seq(image=image)).save(new_path + new_image_name)

        new_row = row.copy()
        new_row['file_name'] = new_image_name
        new_rows.append(new_row)
    return new_rows


categories = df_result['category_name'].unique()
augmented_rows = []

for category in categories:
    df_cat = df_result[df_result['category_name'] == category]
    count = df_cat.shape[0]
    missing = TARGET_PER_CATEGORY - count

    # Nothing to add for full categories; an empty selection (e.g. a NaN
    # category name) has no source image to augment.
    if missing <= 0 or count == 0:
        continue

    # Use as many distinct source rows as possible and spread the missing
    # samples evenly over them, so the category ends at exactly
    # TARGET_PER_CATEGORY rows (rounding every row up would overshoot).
    n_sources = min(count, missing)
    base, extra = divmod(missing, n_sources)
    df_temp = df_cat.sample(n_sources)

    for position, (_, row) in enumerate(df_temp.iterrows()):
        n_copies = base + (1 if position < extra else 0)
        augmented_rows.extend(_augment_row(row, n_copies))

# Build the frame once instead of concatenating row by row inside the loops.
temp = pd.DataFrame(augmented_rows)

OSError: [Errno 28] No space left on device

In [89]:
# Give the augmented rows the same 'rtsd-frames/' prefix the other rows carry.
# Names that already have it are left alone, so re-running the cell does not
# stack the prefix.
temp['file_name'] = temp['file_name'].apply(
    lambda name: name if name.startswith('rtsd-frames/') else 'rtsd-frames/' + name
)

In [132]:
# Number of augmented rows produced per category, most frequent first.
grouped_data = (
    temp.groupby('category_name')
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)

# Bar chart of the class distribution among the augmented rows only.
fig = px.bar(
    grouped_data,
    x='category_name',
    y='count',
    labels={'category_name': 'category_name'},
    color_discrete_sequence=['blue'],
    title='Distribution of sign_class',
)
fig.show()

In [90]:
# Append the augmented rows to the reduced table.
# Rows whose file_name already occurs in `temp` were appended by a previous run
# of this cell; they are filtered out first so re-running does not duplicate
# them. (temp.get covers an empty `temp` that has no file_name column.)
already_added = df_result['file_name'].isin(temp.get('file_name', []))
df_result = pd.concat([df_result[~already_added], temp], axis=0)

In [103]:
# Per-category counts after the augmented rows were appended, most frequent first.
grouped_data = (
    df_result.groupby('category_name')
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)

# Bar chart of the final class distribution.
fig = px.bar(
    grouped_data,
    x='category_name',
    y='count',
    labels={'category_name': 'category_name'},
    color_discrete_sequence=['blue'],
    title='Distribution of sign_class',
)
fig.show()

In [105]:
# Persist the combined (sampled + augmented) annotation table.
# The DataFrame index is written too, as the first, unnamed CSV column.
df_result.to_csv('./data/rtsd-frames/data_signs.csv')