In [2]:
from os.path import split, splitext, join

import cv2 as cv
import json
import numpy as np
import pandas as pd
from glob import glob
from shutil import copy
from tqdm.notebook import tqdm

from utils import map_data_frame_from_json, map_data_frame_to_json

# Hard hats

In [2]:
label_path_base = '../1_helmet/labels'
output_folder_path = '../1_helmet/out'
name_list = ['head', 'helmet', 'person']
data_list = []
for img_path in tqdm(glob('../1_helmet/images/*')[:]):
    img = cv.imread(img_path)
    h, w = img.shape[:2]
    img_name = split(img_path)[1]
    img_name_base = splitext(img_name)[0]
    label_path = join(label_path_base, f'{img_name_base}.txt')
    with open(label_path) as f:
        label_lines = f.readlines()

    label_lines_splitted = [l.split() for l in label_lines]
    label_lines_splitted = [line for line in label_lines_splitted if line[0] == '1']
    class_list = [name_list[int(line[0])] for line in label_lines_splitted]
    bbox_list = [[float(coord) for coord in line_splitted[1:]]
                 for line_splitted in label_lines_splitted
                 if line_splitted[0] == '1']

    for class_name, bbox in zip(class_list, bbox_list):
        x, y, width, height = bbox
        width, height = width * w, height * h
        x, y = x * w, y * h
        x0, y0 = int(x - width / 2), int(y - height / 2)
        x1, y1 = int(x + width / 2), int(y + height / 2)

        data_list.append((
            img_name,
            w,
            h,
            w * h,
            class_name,
            (int(x0), int(y0), int(x1), int(y1)),
            (x1 - x0) * (y1 - y0),
            round((x1 - x0) * (y1 - y0) / (w * h), 6),
        ))

        cv.rectangle(img, (x0, y0), (x1, y1), [0, 0, 255], 2)

    if len(bbox_list) > 0:
        cv.imwrite(join(output_folder_path, img_name), img)

100%|██████████| 100/100 [00:00<00:00, 445.16it/s]


In [34]:
df = pd.DataFrame(data_list, columns=['filename', 'width', 'height', 'size', 'name', 'bbox', 'area', 'area_p'])
df.head()

Unnamed: 0,filename,width,height,size,name,bbox,area,area_p
0,001396_jpg.rf.215f6ba3dd727dfb58d53fba61410963...,500,333,166500,helmet,"(361, 116, 401, 161)",1800,0.010811
1,001396_jpg.rf.215f6ba3dd727dfb58d53fba61410963...,500,333,166500,helmet,"(71, 122, 106, 164)",1470,0.008829
2,001396_jpg.rf.215f6ba3dd727dfb58d53fba61410963...,500,333,166500,helmet,"(150, 126, 183, 165)",1287,0.00773
3,001396_jpg.rf.215f6ba3dd727dfb58d53fba61410963...,500,333,166500,helmet,"(280, 116, 315, 157)",1435,0.008619
4,001396_jpg.rf.215f6ba3dd727dfb58d53fba61410963...,500,333,166500,helmet,"(230, 128, 259, 161)",957,0.005748


In [35]:
df_out = df.copy()
df_out = map_data_frame_to_json(df_out, column_list=['bbox'])

In [36]:
df_out.to_csv('../1_helmet/1_original_statistics.csv')

# bdd100k

In [6]:
with open('../3_bdd100k/det_train.json') as f:
    json_train = json.load(f)

with open('../3_bdd100k/det_val.json') as f:
    json_val = json.load(f)

json_dataset = json_train + json_val

In [None]:
image_base_path_train = '../3_bdd100k/images/'
output_path_base = '../3_bdd100k/out'
data_list = []
for idx, sample in tqdm(enumerate(json_dataset[:])):
    if 'labels' not in sample:
        continue

    filename = sample['name']
    file_path = join(image_base_path_train, filename)
    img = cv.imread(file_path)
    h, w = img.shape[:2]
    attributes = sample['attributes']
    weather = attributes['weather']
    timeofday = attributes['timeofday']
    scene = attributes['scene']
    to_save = False
    for label in sample['labels']:
        attr = label['attributes']
        occluded = attr['occluded']
        truncated = attr['truncated']
        traffic_light_color = attr['trafficLightColor']
        category = label['category']
        bbox = label['box2d']
        x0, y0, x1, y1 = bbox['x1'], bbox['y1'], bbox['x2'], bbox['y2'],
        x0, y0, x1, y1 = int(x0), int(y0), int(x1), int(y1)

        data_list.append((
            filename,
            w,
            h,
            w * h,
            category,
            (int(x0), int(y0), int(x1), int(y1)),
            int((x1 - x0) * (y1 - y0)),
            round((x1 - x0) * (y1 - y0) / (w * h), 2),
            weather,
            timeofday,
            scene,
            occluded,
            truncated,
            traffic_light_color,
        ))

    if to_save:
        cv.imwrite(join(output_path_base, filename), img)

In [65]:
df = pd.DataFrame(data_list,
                  columns=['filename', 'width', 'height', 'size', 'name', 'bbox', 'area', 'area_p', 'weather',
                           'timeofday', 'scene', 'occluded', 'truncated', 'trafic_light_color'])
df.head()

Unnamed: 0,filename,width,height,size,name,bbox,area,area_p,weather,timeofday,scene,occluded,truncated,trafic_light_color
0,0000f77c-6257be58.jpg,1280,720,921600,traffic light,"(1125, 133, 1156, 210)",2414.35378,0.0,clear,daytime,city street,False,False,G
1,0000f77c-6257be58.jpg,1280,720,921600,traffic light,"(1156, 136, 1191, 210)",2563.388185,0.0,clear,daytime,city street,False,False,G
2,0000f77c-6257be58.jpg,1280,720,921600,traffic sign,"(1105, 211, 1170, 233)",1461.583957,0.0,clear,daytime,city street,False,False,
3,0000f77c-6257be58.jpg,1280,720,921600,traffic sign,"(0, 0, 100, 122)",12304.688432,0.01,clear,daytime,city street,False,True,
4,0000f77c-6257be58.jpg,1280,720,921600,car,"(49, 254, 357, 487)",71964.026347,0.08,clear,daytime,city street,False,False,


In [58]:
df_out = df.copy()
df_out = map_data_frame_to_json(df_out, column_list=['bbox'])

In [59]:
df_out.to_csv('../3_bdd100k/1_original_statistics.csv')

# Playing cards

In [2]:
name_list = ['10C', '10D', '10H', '10S', '2C', '2D', '2H', '2S', '3C', '3D', '3H', '3S', '4C', '4D', '4H', '4S', '5C',
             '5D', '5H', '5S', '6C', '6D', '6H', '6S', '7C', '7D', '7H', '7S', '8C', '8D', '8H', '8S', '9C', '9D', '9H',
             '9S', 'AC', 'AD', 'AH', 'AS', 'JC', 'JD', 'JH', 'JS', 'KC', 'KD', 'KH', 'KS', 'QC', 'QD', 'QH', 'QS']
len(name_list)

52

In [5]:
output_folder_path = '../2_playing_cards/out'
data_list = []
for img_path in tqdm(glob('../2_playing_cards/images/*')[:]):
    img = cv.imread(img_path)
    h, w = img.shape[:2]
    img_path_base, img_name = split(img_path)
    img_name_base = splitext(img_name)[0]
    label_path = join(join(split(img_path_base)[0], 'labels'), f'{img_name_base}.txt')
    label_list = np.loadtxt(label_path)

    for lbl_row in label_list:
        name = name_list[int(lbl_row[0])]
        x_center, y_center = lbl_row[1], lbl_row[2]
        width, height = lbl_row[3], lbl_row[4]
        width, height = width * w, height * h
        x_center, y_center = x_center * w, y_center * h
        x0, y0 = int(x_center - width / 2), int(y_center - height / 2)
        x1, y1 = int(x_center + width / 2), int(y_center + height / 2)

        data_list.append((
            img_name,
            w,
            h,
            w * h,
            name,
            (int(x0), int(y0), int(x1), int(y1)),
            (x1 - x0) * (y1 - y0),
            round((x1 - x0) * (y1 - y0) / (w * h), 6),
        ))

        cv.rectangle(img, (x0, y0), (x1, y1), [0, 0, 255], 2)
    if len(label_list) > 0:
        cv.imwrite(join(output_folder_path, img_name), img)

  0%|          | 0/24233 [00:00<?, ?it/s]

In [6]:
df = pd.DataFrame(data_list, columns=['filename', 'width', 'height', 'size', 'name', 'bbox', 'area', 'area_p'])
display(df.head())

Unnamed: 0,filename,width,height,size,name,bbox,area,area_p
0,000056694_jpg.rf.132f49ccfd4cc9e72ad7ceb43d845...,640,640,409600,9C,"(175, 367, 221, 399)",1472,0.003594
1,000056694_jpg.rf.132f49ccfd4cc9e72ad7ceb43d845...,640,640,409600,2D,"(190, 330, 235, 366)",1620,0.003955
2,000056694_jpg.rf.132f49ccfd4cc9e72ad7ceb43d845...,640,640,409600,8C,"(219, 290, 262, 330)",1720,0.004199
3,000056694_jpg.rf.132f49ccfd4cc9e72ad7ceb43d845...,640,640,409600,8C,"(405, 293, 447, 334)",1722,0.004204
4,000056694_jpg.rf.d3044032743b059a02077deb4bbf4...,640,640,409600,9C,"(426, 276, 470, 304)",1232,0.003008


In [7]:
df_out = df.copy()
df_out = map_data_frame_to_json(df_out, column_list=['bbox'])

In [8]:
df_out.to_csv('../2_playing_cards/1_original_statistics.csv', index=False)

# Identify selection criteria

In [3]:
df = pd.read_csv('../2_playing_cards/1_original_statistics.csv')
df = map_data_frame_from_json(df, column_list=['bbox'])
df.head()

Unnamed: 0,filename,width,height,size,name,bbox,area,area_p
0,000056694_jpg.rf.132f49ccfd4cc9e72ad7ceb43d845...,640,640,409600,9C,"[175, 367, 221, 399]",1472,0.003594
1,000056694_jpg.rf.132f49ccfd4cc9e72ad7ceb43d845...,640,640,409600,2D,"[190, 330, 235, 366]",1620,0.003955
2,000056694_jpg.rf.132f49ccfd4cc9e72ad7ceb43d845...,640,640,409600,8C,"[219, 290, 262, 330]",1720,0.004199
3,000056694_jpg.rf.132f49ccfd4cc9e72ad7ceb43d845...,640,640,409600,8C,"[405, 293, 447, 334]",1722,0.004204
4,000056694_jpg.rf.d3044032743b059a02077deb4bbf4...,640,640,409600,9C,"[426, 276, 470, 304]",1232,0.003008


In [4]:
def is_on_edge(row):
    x0, y0, x1, y1 = row['bbox']
    w, h = row['width'], row['height']
    margin = min(w, h) * 0.01

    return x0 < margin or y0 < margin or (w - x1) < margin or (h - y1) < margin


df['is_on_edge'] = df.apply(is_on_edge, axis=1)
print(f'{df["is_on_edge"].sum()}/{len(df)}')

993/96909


In [9]:
df_groups = df.groupby(by=['filename'])
df_groups_filtered = [df_group for _, df_group in df_groups if not any(df_group['is_on_edge'])]
len(df_groups_filtered), len(df_groups)

  df_groups_filtered = [df_group for _, df_group in df_groups if not any(df_group['is_on_edge'])]


(23240, 23240)

In [6]:
df = pd.concat(df_groups_filtered, axis=0, ignore_index=True)
len(df)

92937

In [7]:
groups = df.groupby(by='name')
stat = []
for name, g in groups:
    number, card_type = name[:-1], name[-1]
    stat.append(
            (name,
             number,
             card_type,
             g['filename'].nunique(),
             len(g),
             g['area'].min(),
             g['area'].max(),
             g['area_p'].min(),
             g['area_p'].max(),)
    )
df_stat = pd.DataFrame(stat, columns=['name', 'number', 'card_type',
                                      'items_u', 'items', 'area min',
                                      'area max', 'area_p min', 'area_p max'])
df_stat = df_stat.sort_values(by='items_u', ignore_index=True, ascending=False)
df_stat

Unnamed: 0,name,number,card_type,items_u,items,area min,area max,area_p min,area_p max
0,QS,Q,S,1405,1921,738,4284,0.001802,0.010459
1,4D,4,D,1400,1895,756,3894,0.001846,0.009507
2,5C,5,C,1399,1936,756,4692,0.001846,0.011455
3,4H,4,H,1395,1898,779,3685,0.001902,0.008997
4,7S,7,S,1381,1855,774,4970,0.00189,0.012134
5,5D,5,D,1378,1872,731,4620,0.001785,0.011279
6,QH,Q,H,1372,1832,798,5183,0.001948,0.012654
7,QC,Q,C,1372,1828,817,4410,0.001995,0.010767
8,3C,3,C,1372,1865,798,5256,0.001948,0.012832
9,9D,9,D,1365,1849,714,4160,0.001743,0.010156


In [8]:
key = 'area'
# group_sorted = df[df['name'] == '4D'].sort_values(by=key, ascending=True)
group_sorted = df.sort_values(by=key, ascending=True)
# group_sorted = group_sorted[group_sorted['area_p'] > 0.1]
# group_sorted = group_sorted[group_sorted['area'] > 1500]
group_size = 2000
min_diff = 100000
min_idx = -1
for start_idx in range(len(group_sorted) - group_size):
    diff = group_sorted.iloc[start_idx + group_size][key] - group_sorted.iloc[start_idx][key]
    if diff < min_diff:
        min_idx = start_idx
        min_diff = diff

print(min_diff)
print(group_sorted.iloc[min_idx][key], group_sorted.iloc[min_idx + group_size][key])
size_list = group_sorted.iloc[min_idx: min_idx + group_size].groupby(by='filename').size()
print((size_list != 1).sum(), size_list[size_list != 1].unique())

21
1344 1365
220 [2 3]


In [None]:
destination_dir = '../2_playing_cards/out'
for filename in tqdm(group_sorted.iloc[min_idx: min_idx + 100]['filename']):
    copy(f'../2_playing_cards/images/{filename}', destination_dir)