In [1]:
import json
from glob import glob
from os.path import split, splitext, join
from shutil import copy

import cv2 as cv
import pandas as pd
from tqdm import tqdm
from utils import map_data_frame_from_json, map_data_frame_to_json

In [9]:
df = pd.read_csv('../2_playing_cards/1_original_statistics.csv')
df = map_data_frame_from_json(df, column_list=['bbox'])
df.head()

Unnamed: 0.1,Unnamed: 0,filename,width,height,size,name,bbox,area,area_p
0,0,284679715_jpg.rf.3a9046f136b511793d38233c9feb2...,640,640,409600,8S,"[468, 376, 505, 422]",1702,0.004155
1,1,284679715_jpg.rf.3a9046f136b511793d38233c9feb2...,640,640,409600,7S,"[427, 356, 458, 404]",1488,0.003633
2,2,284679715_jpg.rf.3a9046f136b511793d38233c9feb2...,640,640,409600,5S,"[394, 351, 420, 397]",1196,0.00292
3,3,284679715_jpg.rf.3a9046f136b511793d38233c9feb2...,640,640,409600,5S,"[259, 478, 285, 524]",1196,0.00292
4,4,207635979_jpg.rf.2781e90031c92657513837e9f36cf...,640,640,409600,3S,"[423, 265, 492, 316]",3519,0.008591


In [10]:
def is_on_edge(row):
    x0, y0, x1, y1 = row['bbox']
    w, h = row['width'], row['height']
    margin = min(w, h) * 0.01

    return x0 < margin or y0 < margin or (w - x1) < margin or (h - y1) < margin

df['is_on_edge'] = df.apply(is_on_edge, axis=1)
print(f'{df["is_on_edge"].sum()}/{len(df)}')

963/88829


In [11]:
df_groups = df.groupby(by=['filename'])
df_groups_filtered = [df_group for _, df_group in df_groups if not any(df_group['is_on_edge'])]
len(df_groups_filtered), len(df_groups)

(21250, 22213)

In [12]:
df = pd.concat(df_groups_filtered, axis=0, ignore_index=True)
len(df)

84977

In [13]:
groups = df.groupby(by='name')
stat = []
for name, g in groups:
    stat.append(
        (name,
         g['filename'].nunique(),
         len(g),
         g['area'].min(),
         g['area'].max(),
         g['area_p'].min(),
         g['area_p'].max(),)
    )
df_stat = pd.DataFrame(stat, columns=['name','items_u', 'items', 'area min', 'area max', 'area_p min', 'area_p max'])
df_stat = df_stat.sort_values(by='items_u', ignore_index=True, ascending=False)
df_stat

Unnamed: 0,name,items_u,items,area min,area max,area_p min,area_p max
0,4D,1307,1776,774,3894,0.00189,0.009507
1,QS,1294,1763,756,4284,0.001846,0.010459
2,4H,1284,1753,798,3685,0.001948,0.008997
3,5C,1275,1765,798,4692,0.001948,0.011455
4,QC,1273,1688,860,4410,0.0021,0.010767
5,7S,1268,1709,798,4970,0.001948,0.012134
6,JC,1263,1722,798,4200,0.001948,0.010254
7,9D,1262,1713,714,4160,0.001743,0.010156
8,QH,1257,1680,798,5183,0.001948,0.012654
9,3C,1250,1689,840,5256,0.002051,0.012832


In [15]:
key = 'area'
# group_sorted = df[df['name'] == '4D'].sort_values(by=key, ascending=True)
group_sorted = df.sort_values(by=key, ascending=True)
# group_sorted = group_sorted[group_sorted['area_p'] > 0.1]
# group_sorted = group_sorted[group_sorted['area'] > 1500]
group_size = 2000
min_diff = 100000
min_idx = -1
for start_idx in range(len(group_sorted) - group_size):
    diff = group_sorted.iloc[start_idx + group_size][key] - group_sorted.iloc[start_idx][key]
    if diff < min_diff:
        min_idx = start_idx
        min_diff = diff

print(min_diff)
print(group_sorted.iloc[min_idx][key], group_sorted.iloc[min_idx + group_size][key])
size_list = group_sorted.iloc[min_idx: min_idx+group_size].groupby(by='filename').size()
print((size_list != 1).sum(), size_list[size_list != 1].unique())

21
1500 1521
221 [2 3 4]


In [22]:
destination_dir = '../Playing Cards.v4-fastmodel-resized640-aug3x.yolov8/out'
for filename in tqdm(group_sorted.iloc[min_idx: min_idx+100]['filename']):
    file_path = glob(f'../Playing Cards.v4-fastmodel-resized640-aug3x.yolov8/**/images/{filename}', recursive=True)[0]
    copy(file_path, destination_dir)

100%|██████████| 100/100 [00:02<00:00, 34.93it/s]


# Hard hats

In [2]:
label_path_base = '../1_helmet/labels'
output_folder_path = '../1_helmet/out'
name_list = ['head', 'helmet', 'person']
data_list = []
for img_path in tqdm(glob('../1_helmet/images/*')[:100]):
    img = cv.imread(img_path)
    h, w = img.shape[:2]
    img_name = split(img_path)[1]
    img_name_base = splitext(img_name)[0]
    label_path = join(label_path_base, f'{img_name_base}.txt')
    with open(label_path) as f:
        label_lines = f.readlines()

    label_lines_splitted = [l.split() for l in label_lines]
    label_lines_splitted = [line for line in label_lines_splitted if line[0] == '1']
    class_list = [name_list[int(line[0])] for line in label_lines_splitted]
    bbox_list = [[float(coord) for coord in line_splitted[1:]] for line_splitted in label_lines_splitted if line_splitted[0] == '1']

    for class_name, bbox in zip(class_list, bbox_list):
        x, y, width, height = bbox
        width, height = width * w, height * h
        x, y = x * w, y * h
        x0, y0 = int(x - width / 2), int(y - height / 2)
        x1, y1 = int(x + width / 2), int(y + height / 2)

        data_list.append((
            img_name,
            w,
            h,
            w*h,
            class_name,
            (int(x0), int(y0), int(x1), int(y1)),
            (x1 - x0) * (y1 - y0),
            round((x1 - x0) * (y1 - y0) / (w * h), 6),
        ))

        cv.rectangle(img, (x0, y0), (x1, y1), [0, 0, 255], 2)

    if len(bbox_list) > 0:
        cv.imwrite(join(output_folder_path, img_name), img)

100%|██████████| 100/100 [00:00<00:00, 445.16it/s]


In [34]:
df = pd.DataFrame(data_list, columns=['filename', 'width', 'height', 'size', 'name', 'bbox', 'area', 'area_p'])
df.head()

Unnamed: 0,filename,width,height,size,name,bbox,area,area_p
0,001396_jpg.rf.215f6ba3dd727dfb58d53fba61410963...,500,333,166500,helmet,"(361, 116, 401, 161)",1800,0.010811
1,001396_jpg.rf.215f6ba3dd727dfb58d53fba61410963...,500,333,166500,helmet,"(71, 122, 106, 164)",1470,0.008829
2,001396_jpg.rf.215f6ba3dd727dfb58d53fba61410963...,500,333,166500,helmet,"(150, 126, 183, 165)",1287,0.00773
3,001396_jpg.rf.215f6ba3dd727dfb58d53fba61410963...,500,333,166500,helmet,"(280, 116, 315, 157)",1435,0.008619
4,001396_jpg.rf.215f6ba3dd727dfb58d53fba61410963...,500,333,166500,helmet,"(230, 128, 259, 161)",957,0.005748


In [35]:
df_out = df.copy()
df_out = map_data_frame_to_json(df_out, column_list=['bbox'])

In [36]:
df_out.to_csv('../Hard Hat Workers.v2-raw_75-25_traintestsplit.yolov8/meta.csv')

# Pistoles

In [39]:
label_path_base = '../Pistols.v1-resize-416x416.yolov8/export/labels'
output_folder_path = '../Pistols.v1-resize-416x416.yolov8/export/out'
data_list = []
for img_path in tqdm(glob('../Pistols.v1-resize-416x416.yolov8/export/images/*')):
    img = cv.imread(img_path)
    h, w = img.shape[:2]
    img_name = split(img_path)[1]
    img_name_base = splitext(img_name)[0]
    label_path = join(label_path_base, f'{img_name_base}.txt')
    with open(label_path) as f:
        label_lines = f.readlines()

    label_lines_splitted = [l.split() for l in label_lines]
    hard_hat_bboxes = [[float(coord) for coord in line_splitted[1:]] for line_splitted in label_lines_splitted]

    for bbox in hard_hat_bboxes:
        x, y, width, height = bbox
        width, height = width * w, height * h
        x, y = x * w, y * h
        x0, y0 = int(x - width / 2), int(y - height / 2)
        x1, y1 = int(x + width / 2), int(y + height / 2)

        data_list.append((
            img_name,
            w,
            h,
            w*h,
            'pistole',
            (int(x0), int(y0), int(x1), int(y1)),
            (x1 - x0) * (y1 - y0),
            round((x1 - x0) * (y1 - y0) / (w * h), 2),
        ))

        cv.rectangle(img, (x0, y0), (x1, y1), [0, 0, 255], 2)
    if len(hard_hat_bboxes) > 0:
        cv.imwrite(join(output_folder_path, img_name), img)

100%|██████████| 2971/2971 [00:03<00:00, 978.76it/s]


In [40]:
df = pd.DataFrame(data_list, columns=['filename', 'width', 'height', 'size', 'name', 'bbox', 'area', 'area_p'])
df.head()

Unnamed: 0,filename,width,height,size,name,bbox,area,area_p
0,armas (1737)_jpg.rf.3ec531d9667acc4d2dd18f0e42...,416,416,173056,pistole,"(29, 28, 403, 260)",86768,0.5
1,armas (2860)_jpg.rf.c363cecf4e6d456a23dd2ffb35...,416,416,173056,pistole,"(180, 170, 225, 317)",6615,0.04
2,armas (2065)_jpg.rf.fc313cf05a248176dca96eafac...,416,416,173056,pistole,"(38, 99, 138, 151)",5200,0.03
3,armas (2047)_jpg.rf.d7760b5478ecabd0d3ce975579...,416,416,173056,pistole,"(17, 13, 391, 297)",106216,0.61
4,armas (2593)_jpg.rf.ed32f57b78f1c80435e231a80b...,416,416,173056,pistole,"(20, 3, 400, 409)",154280,0.89


In [41]:
df_out = df.copy()
df_out = map_data_frame_to_json(df_out, column_list=['bbox'])

In [42]:
df_out.to_csv('../Pistols.v1-resize-416x416.yolov8/meta.csv')

# bdd100k

In [6]:
with open('../3_bdd100k/det_train.json') as f:
    json_train = json.load(f)

with open('../3_bdd100k/det_val.json') as f:
    json_val = json.load(f)

json_dataset = json_train + json_val

In [None]:
image_base_path_train = '../3_bdd100k/images/'
output_path_base = '../3_bdd100k/out'
data_list = []
for idx, sample in tqdm(enumerate(json_dataset[:100])):
    if 'labels' not in sample:
        continue

    filename = sample['name']
    file_path = join(image_base_path_train, filename)
    img = cv.imread(file_path)
    h, w = img.shape[:2]
    attributes = sample['attributes']
    weather = attributes['weather']
    timeofday = attributes['timeofday']
    scene = attributes['scene']
    to_save = False
    for label in sample['labels']:
        attr = label['attributes']
        occluded = attr['occluded']
        truncated = attr['truncated']
        traffic_light_color = attr['trafficLightColor']
        category = label['category']
        bbox = label['box2d']
        x0, y0, x1, y1 = bbox['x1'], bbox['y1'], bbox['x2'], bbox['y2'],
        x0, y0, x1, y1 = int(x0), int(y0), int(x1), int(y1)

        data_list.append((
            filename,
            w,
            h,
            w*h,
            category,
            (int(x0), int(y0), int(x1), int(y1)),
            int((x1 - x0) * (y1 - y0)),
            round((x1 - x0) * (y1 - y0) / (w * h), 2),
            weather,
            timeofday,
            scene,
            occluded,
            truncated,
            traffic_light_color,
        ))

    if to_save:
        cv.imwrite(join(output_path_base, filename), img)

In [65]:
df = pd.DataFrame(data_list, columns=['filename', 'width', 'height', 'size', 'name', 'bbox', 'area', 'area_p', 'weather', 'timeofday', 'scene', 'occluded', 'truncated', 'trafic_light_color'])
df.head()

Unnamed: 0,filename,width,height,size,name,bbox,area,area_p,weather,timeofday,scene,occluded,truncated,trafic_light_color
0,0000f77c-6257be58.jpg,1280,720,921600,traffic light,"(1125, 133, 1156, 210)",2414.35378,0.0,clear,daytime,city street,False,False,G
1,0000f77c-6257be58.jpg,1280,720,921600,traffic light,"(1156, 136, 1191, 210)",2563.388185,0.0,clear,daytime,city street,False,False,G
2,0000f77c-6257be58.jpg,1280,720,921600,traffic sign,"(1105, 211, 1170, 233)",1461.583957,0.0,clear,daytime,city street,False,False,
3,0000f77c-6257be58.jpg,1280,720,921600,traffic sign,"(0, 0, 100, 122)",12304.688432,0.01,clear,daytime,city street,False,True,
4,0000f77c-6257be58.jpg,1280,720,921600,car,"(49, 254, 357, 487)",71964.026347,0.08,clear,daytime,city street,False,False,


In [58]:
df_out = df.copy()
df_out = map_data_frame_to_json(df_out, column_list=['bbox'])

In [59]:
df_out.to_csv('../bdd100k 2/images/100k/meta.csv')

In [80]:
df = pd.read_csv('../bdd100k 2/images/100k/meta.csv', index_col=0)
df.head()

Unnamed: 0,filename,width,height,size,name,bbox,area,area_p,weather,timeofday,scene,occluded,truncated,trafic_light_color
0,0000f77c-6257be58.jpg,1280,720,921600,traffic light,"[1125, 133, 1156, 210]",2414.35378,0.0,clear,daytime,city street,False,False,G
1,0000f77c-6257be58.jpg,1280,720,921600,traffic light,"[1156, 136, 1191, 210]",2563.388185,0.0,clear,daytime,city street,False,False,G
2,0000f77c-6257be58.jpg,1280,720,921600,traffic sign,"[1105, 211, 1170, 233]",1461.583957,0.0,clear,daytime,city street,False,False,
3,0000f77c-6257be58.jpg,1280,720,921600,traffic sign,"[0, 0, 100, 122]",12304.688432,0.01,clear,daytime,city street,False,True,
4,0000f77c-6257be58.jpg,1280,720,921600,car,"[49, 254, 357, 487]",71964.026347,0.08,clear,daytime,city street,False,False,


# Playing cards

In [3]:
name_list = ['10C', '10D', '10H', '10S', '2C', '2D', '2H', '2S', '3C', '3D', '3H', '3S', '4C', '4D', '4H', '4S', '5C', '5D', '5H', '5S', '6C', '6D', '6H', '6S', '7C', '7D', '7H', '7S', '8C', '8D', '8H', '8S', '9C', '9D', '9H', '9S', 'AC', 'AD', 'AH', 'AS', 'JC', 'JD', 'JH', 'JS', 'KC', 'KD', 'KH', 'KS', 'QC', 'QD', 'QH', 'QS']
len(name_list)

52

In [4]:
output_folder_path = '../2_playing_cards/out'
data_list = []
for img_path in tqdm(
        glob('../2_playing_cards/images/*')[:100]):
    img = cv.imread(img_path)
    h, w = img.shape[:2]
    img_path_base, img_name = split(img_path)
    img_name_base = splitext(img_name)[0]
    label_path = join(join(split(img_path_base)[0], 'labels'), f'{img_name_base}.txt')
    with open(label_path) as f:
        label_lines = f.readlines()

    label_lines_splitted = [l.split() for l in label_lines]
    class_list = [name_list[int(line_splitted[0])] for line_splitted in label_lines_splitted]
    bbox_list = [[float(coord) for coord in line_splitted[1:]] for line_splitted in label_lines_splitted]

    for bbox, name in zip(bbox_list, class_list):
        x, y, width, height = bbox
        width, height = width * w, height * h
        x, y = x * w, y * h
        x0, y0 = int(x - width / 2), int(y - height / 2)
        x1, y1 = int(x + width / 2), int(y + height / 2)

        data_list.append((
            img_name,
            w,
            h,
            w * h,
            name,
            (int(x0), int(y0), int(x1), int(y1)),
            (x1 - x0) * (y1 - y0),
            round((x1 - x0) * (y1 - y0) / (w * h), 6),
        ))

        cv.rectangle(img, (x0, y0), (x1, y1), [0, 0, 255], 2)
    if len(bbox_list) > 0:
        cv.imwrite(join(output_folder_path, img_name), img)

100%|██████████| 100/100 [00:00<00:00, 236.47it/s]


In [13]:
df = pd.DataFrame(data_list, columns=['filename', 'width', 'height', 'size', 'name', 'bbox', 'area', 'area_p'])
display(df.head())

Unnamed: 0,filename,width,height,size,name,bbox,area,area_p
0,284679715_jpg.rf.3a9046f136b511793d38233c9feb2...,640,640,409600,8S,"(468, 376, 505, 422)",1702,0.004155
1,284679715_jpg.rf.3a9046f136b511793d38233c9feb2...,640,640,409600,7S,"(427, 356, 458, 404)",1488,0.003633
2,284679715_jpg.rf.3a9046f136b511793d38233c9feb2...,640,640,409600,5S,"(394, 351, 420, 397)",1196,0.00292
3,284679715_jpg.rf.3a9046f136b511793d38233c9feb2...,640,640,409600,5S,"(259, 478, 285, 524)",1196,0.00292
4,207635979_jpg.rf.2781e90031c92657513837e9f36cf...,640,640,409600,3S,"(423, 265, 492, 316)",3519,0.008591


In [11]:
df_out = df.copy()
df_out = map_data_frame_to_json(df_out, column_list=['bbox'])

In [12]:
df_out.to_csv('../Playing Cards.v4-fastmodel-resized640-aug3x.yolov8/meta.csv')