In [1]:
import json
import pandas as pd
from tqdm import tqdm_notebook
from IPython.display import HTML, display
import tabulate
import sqlite3
from sqlite3 import Error
import random
import copy
import math


# Tuple of person-related nouns.
# NOTE(review): not referenced anywhere else in the visible notebook — confirm it is still needed.
human_labels = ('woman', 'men', 'boy', 'kid', 'child', 'guy', 'man', 'person', 'girl', 'lady', 'people')

In [2]:
def create_connection(db_file):
    """Open the SQLite database stored at ``db_file``.

    :param db_file: path of the database file
    :return: an open connection, or ``None`` when connecting fails
             (the error is printed instead of being raised)
    """
    try:
        return sqlite3.connect(db_file)
    except Error as exc:
        # Best-effort helper: report the problem and hand back None.
        print(exc)
        return None

In [3]:
# Load the `cp_val.json` annotation file ('categories' / 'images' / 'annotations'
# keys) and expose the image and annotation records as DataFrames.
cp_val = '/home/test_pc/mmlab/mmdetection/datasets/cp_val.json'
# The context manager closes the handle as soon as parsing is done; the
# original left the file open for the lifetime of the kernel.
with open(cp_val) as f:
    cp_val_data = json.load(f)
cp_val_cat = cp_val_data['categories']
cp_val_images = cp_val_data['images']
cp_val_annotations = cp_val_data['annotations']
cp_val_img_df = pd.DataFrame(cp_val_images)
cp_val_ann_df = pd.DataFrame(cp_val_annotations)

In [4]:
# Load the `cp_train.json` annotation file and expose the image and
# annotation records as DataFrames.
cp_train = '/home/test_pc/mmlab/mmdetection/datasets/cp_train.json'
# Close the file right after parsing instead of leaking the handle.
with open(cp_train) as f:
    cp_train_data = json.load(f)
# cp_train_licenses = cp_train_data['licenses']
cp_train_cat = cp_train_data['categories']
cp_train_images = cp_train_data['images']
cp_train_annotations = cp_train_data['annotations']
cp_train_img_df = pd.DataFrame(cp_train_images)
cp_train_ann_df = pd.DataFrame(cp_train_annotations)

In [5]:
# Load the `aug_test.json` annotation file and expose the image and
# annotation records as DataFrames.
aug_test = '/home/test_pc/mmlab/mmdetection/datasets/aug_test.json'
# Close the file right after parsing instead of leaking the handle.
with open(aug_test) as f:
    aug_test_data = json.load(f)
# aug_test_licenses = aug_test_data['licenses']
aug_test_cat = aug_test_data['categories']
aug_test_images = aug_test_data['images']
aug_test_annotations = aug_test_data['annotations']
aug_test_img_df = pd.DataFrame(aug_test_images)
aug_test_ann_df = pd.DataFrame(aug_test_annotations)

In [6]:
def concat_df(dataset='EuroCity'):
    """Collect the base file names of one source dataset across the three loaded splits.

    Rows of the module-level ``cp_train_img_df``, ``cp_val_img_df`` and
    ``aug_test_img_df`` whose ``file_name`` contains ``dataset`` are selected,
    and everything up to the last ``/`` is stripped from each name.

    NOTE: a later cell defines a different function under this same name
    (``concat_df(df_list)``); once that cell has run, this version is shadowed.

    :param dataset: substring identifying the source dataset inside ``file_name``
    :return: base names from train, then val, then test, in one list
    """
    def _base_names(img_df):
        # Filter one split by dataset marker, then keep only the part after the last '/'.
        matching = img_df[img_df['file_name'].str.contains(dataset)]
        return [name.rsplit('/', 1)[1] for name in list(matching['file_name'])]

    ec_tr = _base_names(cp_train_img_df)
    ec_v = _base_names(cp_val_img_df)
    ec_ts = _base_names(aug_test_img_df)
    out_list = ec_tr + ec_v + ec_ts
    print(f'len train: {len(ec_tr)}, len val: {len(ec_v)}, len test: {len(ec_ts)}')
    print(f'len all of togather is: {len(out_list)}')
    return out_list

In [7]:
ec_list = concat_df(dataset='EuroCity')

len train: 871, len val: 168, len test: 155
len all of togather is: 1194


In [8]:
wp_list = concat_df(dataset='WiderPerson')

len train: 351, len val: 76, len test: 75
len all of togather is: 502


In [9]:
ch_list = concat_df(dataset='CrowdHuman')


len train: 436, len val: 94, len test: 94
len all of togather is: 624


In [10]:
def setA_setB(A, B):
    """Split the rows of ``A`` by whether their triple also occurs in ``B``.

    A row of ``A`` counts as "common" when some row of ``B`` has the same
    ``Object_1``, ``Relation`` and ``Object_2`` values.

    :param A: DataFrame with ``Object_1``, ``Relation`` and ``Object_2`` columns
    :param B: DataFrame with the same three columns
    :return: ``(diff_cp, com_cp)`` — rows of ``A`` absent from ``B`` (set A - set B)
             and rows of ``A`` present in ``B``; both keep ``A``'s row order and index
    """
    diff_positions = []
    comm_positions = []
    # Track row *positions*: the original passed index labels to ``.iloc``,
    # which is only correct when ``A`` has a default RangeIndex.
    for pos, (_, row) in enumerate(A.iterrows()):
        found_in_b = (
            (B['Object_1'] == row['Object_1'])
            & (B['Relation'] == row['Relation'])
            & (B['Object_2'] == row['Object_2'])
        ).any()
        if found_in_b:
            comm_positions.append(pos)
        else:
            diff_positions.append(pos)
    diff_cp = A.iloc[diff_positions]
    com_cp = A.iloc[comm_positions]
    return diff_cp, com_cp

def find_quantile(diff_set,q=0.95, max_im=20):
    """Keep the rows whose ``Count`` is above the ``q``-quantile and cap their image lists.

    The selected rows are sorted by ``Count`` (descending) and re-indexed from 0.
    A ``count_down`` column is initialised to ``max_im``; for each row, an
    ``images`` entry longer than ``count_down`` is replaced by a random sample
    of ``count_down`` items, and ``count_down`` is then reduced by the number
    of images kept.

    Assumes each ``images`` cell holds a list — TODO confirm against the caller.
    Sampling uses the global ``random`` state, so results vary between runs
    unless the caller seeds it.

    :param diff_set: DataFrame with ``Count`` and ``images`` columns
    :param q: quantile of ``Count`` used as the (exclusive) lower bound
    :param max_im: maximum number of images kept per row
    :return: the filtered, sorted frame with capped ``images`` and a ``count_down`` column
    """
    # Strictly-greater comparison: rows sitting exactly on the quantile are dropped.
    quantile_95 = diff_set[diff_set.Count > diff_set.Count.quantile(q)]
    quantile_95 = quantile_95.sort_values('Count', ascending=False)
    quantile_95.reset_index(drop=True, inplace=True)
    quantile_95['count_down'] = max_im
#     im_to_add = []
    for index, row in tqdm_notebook(quantile_95.iterrows(), desc ="update the r_cityperson_df"):
        if len(quantile_95.at[index, 'images']) > quantile_95.at[index, 'count_down']:
            # Too many images for this row: keep a random subset of the allowed size.
            s_images = random.sample(quantile_95.at[index, 'images'], quantile_95.at[index, 'count_down'])
            quantile_95.at[index, 'images'] = s_images
        else:
            s_images = quantile_95.at[index, 'images']
        # Remaining budget for this row after the kept images are accounted for.
        quantile_95.at[index, 'count_down'] = quantile_95.at[index, 'count_down'] - len(s_images)
#         im_to_add.extend(s_images)

#     im_to_add = list(set(im_to_add))
#     return im_to_add, quantile_95
    return quantile_95

def splite_data(data_df, train_size=50, val_size=10, test_size=10, dt=None, examption_list=None):
    """Draw three disjoint random samples (train / val / test) of image names.

    :param data_df: either a list of image names, or a DataFrame with a
                    ``file_name`` column
    :param train_size: number of names to draw for the train sample
    :param val_size: number of names to draw for the validation sample
    :param test_size: number of names to draw for the test sample
    :param dt: when falsy and ``data_df`` is a DataFrame, names are reduced to
               the part after the last ``/``; when truthy the full
               ``file_name`` is used
    :param examption_list: names that must never be drawn (``None`` means no exclusions)
    :return: ``(train_images, val_images, test_images)``, three disjoint lists
    :raises ValueError: if fewer distinct candidate names remain than requested

    Sampling uses the global ``random`` state. The candidate pool keeps the
    input order (duplicates removed), so seeding ``random`` before the call
    makes the split reproducible — the original sampled from set-ordered
    lists, whose order changes between interpreter runs.
    """
    if isinstance(data_df, list):
        all_images = list(data_df)
    elif not dt:
        # ``[-1]`` also handles names that contain no directory part
        # (``[1]`` raised IndexError for those).
        all_images = [el.rsplit('/', 1)[-1] for el in data_df['file_name'].to_list()]
    else:
        all_images = data_df['file_name'].to_list()

    excluded = set(examption_list) if examption_list else set()
    # De-duplicate while keeping first-seen order, then drop excluded names.
    pool = [name for name in dict.fromkeys(all_images) if name not in excluded]

    requested = train_size + val_size + test_size
    if requested > len(pool):
        raise ValueError(
            f'requested {requested} images but only {len(pool)} candidates are available'
        )

    train_images = random.sample(pool, train_size)
    taken = set(train_images)
    remaining = [name for name in pool if name not in taken]

    val_images = random.sample(remaining, val_size)
    taken = set(val_images)
    remaining = [name for name in remaining if name not in taken]

    test_images = random.sample(remaining, test_size)

    # Each sample is drawn from what the previous ones left over, so the three
    # lists are disjoint by construction.
    return train_images, val_images, test_images

def concat_df(df_list):
    """Stack a list of DataFrames into one frame with a fresh 0..n-1 index.

    NOTE: this definition replaces the earlier ``concat_df(dataset=...)``
    helper of the same name; after this cell has run, the earlier cells that
    call ``concat_df(dataset=...)`` can no longer be re-executed without
    re-running the first definition.

    :param df_list: non-empty list of DataFrames
    :return: a new DataFrame; the inputs are left untouched. With a single
             input the result is a copy that keeps the input's index.
    :raises IndexError: if ``df_list`` is empty
    """
    bigdata = copy.deepcopy(df_list[0])
    if len(df_list) > 1:
        # ``DataFrame.append`` was removed in pandas 2.0; one ``pd.concat``
        # call gives the same result without re-copying the frame per element.
        bigdata = pd.concat([bigdata, *df_list[1:]], ignore_index=True)
    return bigdata

def find_index_of_images(img_df, image_name_list=None):
    """Return, for each image name, the index label of its row in ``img_df``.

    A row matches when its ``file_name`` contains the name as a literal
    substring. The original passed the name to ``str.contains`` as a regular
    expression, so characters such as ``.`` matched any character and could
    produce false matches.

    :param img_df: DataFrame with ``file_name`` and ``id`` columns
    :param image_name_list: names to look up (``None`` is treated as empty)
    :return: list of index labels, one per name, in input order
    :raises AssertionError: if a name matches no row or more than one row
    """
    index_id_list = []
    for el in image_name_list or []:
        index_ids = img_df.index[img_df['file_name'].str.contains(el, regex=False)].tolist()
        if len(index_ids) != 1:
            # Report the ids of the offending rows, as the original printout did.
            image_id = [img_df.at[idx, 'id'] for idx in index_ids]
            print(image_id)
            raise AssertionError(
                f'expected exactly one image matching {el!r}, found ids {image_id}'
            )
        index_id_list.append(index_ids[0])
    return index_id_list

def create_ann_df(ann_df, index_list):
    """Gather the annotations that belong to the selected images.

    Each entry of ``index_list`` is an image-row index label; the matching
    annotations are those whose ``image_id`` equals ``label + 1`` (this relies
    on image ids being ``index + 1`` in the source image frame).

    :param ann_df: DataFrame with an ``image_id`` column
    :param index_list: image-row index labels, in the desired output order
    :return: the matching annotation rows, grouped in ``index_list`` order,
             keeping their original index labels; empty when ``index_list`` is empty
    """
    if len(index_list) == 0:
        return ann_df.iloc[0:0]
    # Collect all pieces first and concatenate once; growing the frame inside
    # the loop re-copied the accumulated rows on every iteration.
    pieces = [ann_df[ann_df['image_id'].isin([el + 1])] for el in index_list]
    return pd.concat(pieces)

def create_img_df(img_df, index_list):
    """Select image rows by index label, in the order given.

    :param img_df: image DataFrame
    :param index_list: index labels of the rows to keep
    :return: a new DataFrame holding those rows (original labels preserved);
             empty when ``index_list`` is empty
    :raises KeyError: if a label is not present in ``img_df``
    """
    # One label-based lookup replaces the row-by-row ``pd.concat`` loop,
    # which re-copied the accumulated frame for every image.
    return img_df.loc[list(index_list)]

def update_img_ids(aug_img_df, row=None, dataset='other'):
    """Append one image row to ``aug_img_df`` and give it the next free id.

    The new id is the ``id`` of the current last row plus one. Only the last
    row of the result is renumbered, so ``row`` is expected to hold exactly
    one image.

    :param aug_img_df: non-empty image DataFrame with an ``id`` column
    :param row: single-row DataFrame to append
    :param dataset: unused; kept for call compatibility
    :return: ``(new_id, combined_df)`` where ``combined_df`` has a fresh 0..n-1 index
    :raises ValueError: if ``row`` is ``None`` or empty
    """
    # ``pd.concat`` silently drops ``None``; without this guard the id of the
    # last *existing* image would be overwritten.
    if row is None or len(row) == 0:
        raise ValueError('`row` must be a non-empty DataFrame holding the image to append')
    last_id = aug_img_df.at[aug_img_df.index[-1], 'id']
    # Local name chosen so it does not shadow the notebook's ``concat_df`` function.
    combined = pd.concat([aug_img_df, row])
    combined.reset_index(drop=True, inplace=True)
    combined.at[combined.index[-1], 'id'] = last_id + 1
#     if dataset == 'ch':
#         combined.at[combined.index[-1], 'file_name'] = 'CrowdHuman/Images/' + combined.at[combined.index[-1], 'file_name']
    return last_id + 1, combined

def uodate_id_in_image_ann(selected_img_df, aug_img_df, selected_ann_df, dt='other'):
    """Append the selected images to ``aug_img_df`` and re-point their annotations.

    Images are appended in ascending ``id`` order and receive consecutive new
    ids starting after the last ``id`` in ``aug_img_df``. Annotations are
    matched to an image through ``image_id == image_row_index_label + 1`` (the
    same key ``create_ann_df`` uses) and get the image's new id.

    The remapping is computed from the *original* ``image_id`` values in one
    step. The previous implementation rewrote ``image_id`` image by image, so
    an annotation that had just received new id ``k`` was rewritten again when
    a later image's old key was also ``k``, merging the annotations of two
    different images.

    :param selected_img_df: images to add (``id`` column; ``height`` / ``width``
                            are also read when ``dt == 'ch'``)
    :param aug_img_df: image DataFrame to extend; not modified, a new frame is returned
    :param selected_ann_df: annotations of the selected images; updated in
                            place (as before) and also returned
    :param dt: ``'ch'`` additionally writes each image's ``height`` and ``width``
               onto its annotations (``""`` for annotations with no matching image)
    :return: ``(extended_img_df, selected_ann_df)``
    """
    selected_img_df = selected_img_df.sort_values('id')
    is_ch = dt == 'ch'
    if is_ch:
        selected_ann_df['height'] = ""
        selected_ann_df['width'] = ""
    if selected_img_df.empty:
        return aug_img_df, selected_ann_df

    last_id = aug_img_df.at[aug_img_df.index[-1], 'id']
    new_ids = [last_id + offset for offset in range(1, len(selected_img_df) + 1)]
    # Key under which annotations currently reference each selected image.
    old_keys = [idx + 1 for idx in selected_img_df.index]
    id_map = dict(zip(old_keys, new_ids))

    # Append all images at once instead of one ``pd.concat`` per image.
    appended = selected_img_df.copy()
    appended['id'] = new_ids
    aug_img_df = pd.concat([aug_img_df, appended]).reset_index(drop=True)

    if is_ch or 'image_id' in selected_ann_df.columns:
        # Snapshot of the original references; every lookup below uses it.
        old_image_ids = selected_ann_df['image_id'].tolist()
        if is_ch:
            height_of = dict(zip(old_keys, selected_img_df['height'].tolist()))
            width_of = dict(zip(old_keys, selected_img_df['width'].tolist()))
            selected_ann_df['height'] = pd.Series(
                [height_of.get(key, "") for key in old_image_ids],
                index=selected_ann_df.index, dtype=object,
            )
            selected_ann_df['width'] = pd.Series(
                [width_of.get(key, "") for key in old_image_ids],
                index=selected_ann_df.index, dtype=object,
            )
        selected_ann_df['image_id'] = [id_map.get(key, key) for key in old_image_ids]

    return aug_img_df, selected_ann_df

def concat_ann_df(orig_df, add_df):
    """Stack two annotation frames and renumber ``id`` as 1..n.

    :param orig_df: existing annotations
    :param add_df: annotations to append
    :return: a new DataFrame with a fresh 0..n-1 index whose ``id`` column
             equals ``index + 1`` for every row
    """
    ann_df_c = pd.concat([orig_df, add_df], ignore_index=True)
    # Vectorised renumbering; the row-by-row ``iterrows`` loop produced the
    # same ids but took time proportional to a Python-level pass over every row.
    ann_df_c['id'] = range(1, len(ann_df_c) + 1)
    return ann_df_c

In [11]:
def add_area(df):
    """Return a copy of ``df`` with ``area`` set to bbox width times height.

    Each ``bbox`` is read as a sequence whose elements 2 and 3 are multiplied.
    The area is computed per row directly from that row's ``bbox``. The
    previous version looked rows up through ``id == index_label + 1``, which
    scanned the whole frame once per row and raised (or used the wrong box)
    whenever index labels and ids were not aligned that way.

    :param df: annotation DataFrame with a ``bbox`` column
    :return: a new DataFrame with an ``area`` column; the input is not modified
    """
    df = df.copy()
    df['area'] = [bb[2] * bb[3] for bb in df['bbox']]
    return df

# create annotation file
#### selected data frames :


In [12]:
# Load `cityperson_train.json`; its images and annotations are the base that
# the other datasets are appended to further down.
cp_file = '/home/test_pc/mmlab/mmdetection/datasets/cityperson_train.json'
# Close the file right after parsing instead of leaking the handle.
with open(cp_file) as f:
    cp_data = json.load(f)
# cp_licenses = cp_data['licenses']
cp_categories = cp_data['categories']
cp_images = cp_data['images']
cp_annotations = cp_data['annotations']
cp_img_df = pd.DataFrame(cp_images)
cp_ann_df = pd.DataFrame(cp_annotations)

In [13]:
print(cp_data['categories'])

[{'id': 1, 'name': 'pedestrain', 'supercategory': 'pedestrain'}]


In [14]:
# for idx, row in cp_img_df.iterrows():
#     cp_img_df.at[cp_img_df.index[idx], 'file_name'] = 'CityPersons/' + cp_img_df.at[cp_img_df.index[idx], 'file_name']
aug_img_df = copy.deepcopy(cp_img_df)
aug_img_df

Unnamed: 0,id,file_name,width,height,date_captured,license,coco_url,flickr_url
0,1,CityPersons/leftImg8bit_trainvaltest//leftImg8...,2048,1024,2019-07-25 11:20:43.195846,1,,
1,2,CityPersons/leftImg8bit_trainvaltest//leftImg8...,2048,1024,2019-07-25 11:20:43.195846,1,,
2,3,CityPersons/leftImg8bit_trainvaltest//leftImg8...,2048,1024,2019-07-25 11:20:43.195846,1,,
3,4,CityPersons/leftImg8bit_trainvaltest//leftImg8...,2048,1024,2019-07-25 11:20:43.195846,1,,
4,5,CityPersons/leftImg8bit_trainvaltest//leftImg8...,2048,1024,2019-07-25 11:20:43.195846,1,,
...,...,...,...,...,...,...,...,...
2524,2525,CityPersons/leftImg8bit_trainvaltest//leftImg8...,2048,1024,2019-07-25 11:20:43.195846,1,,
2525,2526,CityPersons/leftImg8bit_trainvaltest//leftImg8...,2048,1024,2019-07-25 11:20:43.195846,1,,
2526,2527,CityPersons/leftImg8bit_trainvaltest//leftImg8...,2048,1024,2019-07-25 11:20:43.195846,1,,
2527,2528,CityPersons/leftImg8bit_trainvaltest//leftImg8...,2048,1024,2019-07-25 11:20:43.195846,1,,


In [15]:
# cp_ann_df = add_area(cp_ann_df)
cp_ann_df

Unnamed: 0,id,image_id,category_id,iscrowd,bbox,width,height,area
0,1,1,1,True,"[1034.0, 366.0, 31.0, 75.0]",2048,1024,2325.0
1,2,1,1,True,"[1911.0, 359.0, 10.0, 54.0]",2048,1024,540.0
2,3,1,1,True,"[1137.0, 307.0, 8.0, 9.0]",2048,1024,72.0
3,4,2,1,False,"[602.0, 413.0, 60.0, 146.0]",2048,1024,8760.0
4,5,2,1,False,"[281.0, 381.0, 89.0, 216.0]",2048,1024,19224.0
...,...,...,...,...,...,...,...,...
23424,23425,2529,1,True,"[1570.0, 376.0, 20.0, 48.0]",2048,1024,960.0
23425,23426,2529,1,True,"[416.0, 300.0, 15.0, 58.0]",2048,1024,870.0
23426,23427,2529,1,True,"[1679.0, 361.0, 27.0, 59.0]",2048,1024,1593.0
23427,23428,2529,1,False,"[1662.0, 366.0, 26.0, 63.0]",2048,1024,1638.0


In [16]:
cp_info = copy.deepcopy(cp_data['info'])
cp_cat = copy.deepcopy(cp_data['categories'])
cp_data['categories']

[{'id': 1, 'name': 'pedestrain', 'supercategory': 'pedestrain'}]

In [17]:

# aug_data_train['info'] = cp_info
# aug_data_train['categories'] = cp_cat
# aug_data_train['images'] = aug_img_df.to_dict('records')
# aug_data_train['annotations'] = cp_ann_df.to_dict('records')
# with open('/home/test_pc/mmlab/mmdetection/datasets/cityperson_train.json', 'w') as fp:
#     json.dump(aug_data_train, fp)

## EuroCity Persons (ECP)

In [18]:
# Load the EuroCity `day_train_all.json` annotation file into DataFrames.
ecp_file = 'json_files/EuroCity/day_train_all.json'
# Close the file right after parsing instead of leaking the handle.
with open(ecp_file) as f:
    ecp_data = json.load(f)
ecp_licenses = ecp_data['licenses']
ecp_categories = ecp_data['categories']
ecp_images = ecp_data['images']
ecp_annotations = ecp_data['annotations']
ecp_img_df = pd.DataFrame(ecp_images)
ecp_ann_df = pd.DataFrame(ecp_annotations)
ecp_img_df.shape[0]

23892

In [19]:
print(ecp_data['categories'])

[{'id': 1, 'name': 'pedestrain', 'supercategory': 'pedestrain'}]


In [20]:
# Prefix every file name with the dataset folder so names from different
# sources can be told apart. Vectorised, and safe to re-run: names that
# already carry the prefix are left alone (the row-wise loop added the
# prefix again on every execution of the cell).
ecp_img_df['file_name'] = ecp_img_df['file_name'].where(
    ecp_img_df['file_name'].str.startswith('EuroCity/', na=False),
    'EuroCity/' + ecp_img_df['file_name'],
)
ecp_img_df

Unnamed: 0,id,file_name,width,height,date_captured,license,coco_url,flickr_url
0,1,EuroCity/ECP/day/img/train/wuerzburg/wuerzburg...,1920,1024,2019-11-03 07:44:18.143034,1,,
1,2,EuroCity/ECP/day/img/train/wuerzburg/wuerzburg...,1920,1024,2019-11-03 07:44:18.143034,1,,
2,3,EuroCity/ECP/day/img/train/wuerzburg/wuerzburg...,1920,1024,2019-11-03 07:44:18.143034,1,,
3,4,EuroCity/ECP/day/img/train/wuerzburg/wuerzburg...,1920,1024,2019-11-03 07:44:18.143034,1,,
4,5,EuroCity/ECP/day/img/train/wuerzburg/wuerzburg...,1920,1024,2019-11-03 07:44:18.143034,1,,
...,...,...,...,...,...,...,...,...
23887,23888,EuroCity/ECP/day/img/train/potsdam/potsdam_004...,1920,1024,2019-11-03 07:44:18.143034,1,,
23888,23889,EuroCity/ECP/day/img/train/potsdam/potsdam_000...,1920,1024,2019-11-03 07:44:18.143034,1,,
23889,23890,EuroCity/ECP/day/img/train/potsdam/potsdam_005...,1920,1024,2019-11-03 07:44:18.143034,1,,
23890,23891,EuroCity/ECP/day/img/train/potsdam/potsdam_003...,1920,1024,2019-11-03 07:44:18.143034,1,,


In [21]:
# Draw a new random train/val/test sample of EuroCity image names, excluding
# the names already collected in `ec_list`. The sizes equal the per-split
# counts printed by `concat_df(dataset='EuroCity')` above (871 / 168 / 155).
# NOTE(review): no random seed is set in the visible cells, so every run
# selects different images — confirm whether that is intended.
train_img_ecp, val_img_ecp, test_ecp_img = splite_data(ecp_img_df, train_size=871, val_size=168, test_size=155, examption_list=ec_list)
# train_img_ecp

In [22]:
train_img_ecp

['budapest_00163.png',
 'barcelona_00775.png',
 'marseille_01047.png',
 'berlin_00002.png',
 'barcelona_00707.png',
 'barcelona_00620.png',
 'brno_00954.png',
 'amsterdam_00596.png',
 'pisa_00040.png',
 'ulm_00191.png',
 'roma_00718.png',
 'toulouse_00323.png',
 'pisa_00085.png',
 'bratislava_00750.png',
 'ulm_00239.png',
 'nuernberg_00012.png',
 'ulm_00607.png',
 'torino_00711.png',
 'bologna_00575.png',
 'montpellier_00256.png',
 'barcelona_00239.png',
 'prague_01081.png',
 'hamburg_00049.png',
 'montpellier_00041.png',
 'ljubljana_00837.png',
 'marseille_01183.png',
 'milano_00827.png',
 'barcelona_00585.png',
 'amsterdam_00158.png',
 'zagreb_00128.png',
 'marseille_00319.png',
 'toulouse_00618.png',
 'potsdam_00197.png',
 'marseille_01297.png',
 'bologna_00086.png',
 'amsterdam_00100.png',
 'amsterdam_00138.png',
 'torino_00490.png',
 'firenze_00566.png',
 'brno_00287.png',
 'firenze_00389.png',
 'milano_00574.png',
 'budapest_00454.png',
 'bologna_00425.png',
 'potsdam_00466.png',

In [23]:
index_id_list = find_index_of_images(ecp_img_df, image_name_list=train_img_ecp)

selected_ecp_img_df = create_img_df(ecp_img_df, index_id_list)
selected_ecp_ann_df = create_ann_df(ecp_ann_df, index_id_list)
print(selected_ecp_ann_df.shape[0])
print(selected_ecp_img_df.shape[0])

7603
871


In [24]:
train_img_ecp

['budapest_00163.png',
 'barcelona_00775.png',
 'marseille_01047.png',
 'berlin_00002.png',
 'barcelona_00707.png',
 'barcelona_00620.png',
 'brno_00954.png',
 'amsterdam_00596.png',
 'pisa_00040.png',
 'ulm_00191.png',
 'roma_00718.png',
 'toulouse_00323.png',
 'pisa_00085.png',
 'bratislava_00750.png',
 'ulm_00239.png',
 'nuernberg_00012.png',
 'ulm_00607.png',
 'torino_00711.png',
 'bologna_00575.png',
 'montpellier_00256.png',
 'barcelona_00239.png',
 'prague_01081.png',
 'hamburg_00049.png',
 'montpellier_00041.png',
 'ljubljana_00837.png',
 'marseille_01183.png',
 'milano_00827.png',
 'barcelona_00585.png',
 'amsterdam_00158.png',
 'zagreb_00128.png',
 'marseille_00319.png',
 'toulouse_00618.png',
 'potsdam_00197.png',
 'marseille_01297.png',
 'bologna_00086.png',
 'amsterdam_00100.png',
 'amsterdam_00138.png',
 'torino_00490.png',
 'firenze_00566.png',
 'brno_00287.png',
 'firenze_00389.png',
 'milano_00574.png',
 'budapest_00454.png',
 'bologna_00425.png',
 'potsdam_00466.png',

In [25]:
pd.set_option('display.max_colwidth', None)
selected_ecp_img_df.file_name

3296       EuroCity/ECP/day/img/train/budapest/budapest_00163.png
19576    EuroCity/ECP/day/img/train/barcelona/barcelona_00775.png
12594    EuroCity/ECP/day/img/train/marseille/marseille_01047.png
4347           EuroCity/ECP/day/img/train/berlin/berlin_00002.png
19954    EuroCity/ECP/day/img/train/barcelona/barcelona_00707.png
                                   ...                           
7095         EuroCity/ECP/day/img/train/firenze/firenze_00974.png
14150        EuroCity/ECP/day/img/train/hamburg/hamburg_00641.png
9118               EuroCity/ECP/day/img/train/roma/roma_00869.png
22630    EuroCity/ECP/day/img/train/nuernberg/nuernberg_00303.png
16379          EuroCity/ECP/day/img/train/prague/prague_01279.png
Name: file_name, Length: 871, dtype: object

In [26]:
print(f'add #{selected_ecp_img_df.shape[0]} images to #{aug_img_df.shape[0]} train set')
aug_img_df, selected_ecp_ann_df = uodate_id_in_image_ann(selected_ecp_img_df, aug_img_df, selected_ecp_ann_df)
print(f'train img df after aug shape: {aug_img_df.shape[0]}')


add #871 images to #2529 train set
train img df after aug shape: 3400


In [27]:
selected_ecp_ann_df = add_area(selected_ecp_ann_df)
print(f'add #{selected_ecp_ann_df.shape[0]} annotations to #{cp_ann_df.shape[0]} train set')
ann_df_c = concat_ann_df(cp_ann_df, selected_ecp_ann_df)
for idx, row in ann_df_c.iterrows():
    assert idx + 1 == row['id']
print(f'train annotations #{ann_df_c.shape[0]} for train set after augmentation')

add #7603 annotations to #23429 train set
train annotations #31032 for train set after augmentation


### CrowdHuman

In [28]:
# Load the CrowdHuman `train.json` annotation file into DataFrames.
ch_file = 'json_files/CrowdHuman/train.json'

# Close the file right after parsing instead of leaking the handle.
with open(ch_file) as f:
    ch_data = json.load(f)
# ch_licenses = ch_data['licenses']
ch_categories = ch_data['categories']
ch_images = ch_data['images']
ch_annotations = ch_data['annotations']
ch_img_df = pd.DataFrame(ch_images)
ch_ann_df = pd.DataFrame(ch_annotations)

In [29]:
print(ch_data['categories'])

[{'supercategory': 'none', 'id': 1, 'name': 'person'}, {'supercategory': 'none', 'id': 2, 'name': 'mask'}]


In [30]:
# Keep only the annotations with category_id == 1 (named 'person' in the
# categories printed by the previous cell); category 2 ('mask') is dropped.
# `ch_ann_df` is rebound to an independent copy of the filtered rows.
bb_df = copy.deepcopy(ch_ann_df[ch_ann_df.category_id == 1])
ch_ann_df = bb_df.copy()
ch_ann_df

Unnamed: 0,area,iscrowd,image_id,bbox,hbox,vbox,category_id,id,ignore
0,86523,0,1,"[61, 123, 191, 453]","[123, 129, 63, 64]","[62, 126, 154, 446]",1,1,0
1,92378,0,1,"[165, 95, 187, 494]","[214, 97, 58, 74]","[175, 95, 140, 487]",1,2,0
2,96135,0,1,"[236, 104, 195, 493]","[318, 109, 58, 68]","[260, 106, 170, 487]",1,3,0
3,85852,0,1,"[452, 110, 169, 508]","[486, 119, 61, 74]","[455, 113, 141, 501]",1,4,0
4,62103,0,1,"[520, 95, 163, 381]","[559, 105, 53, 57]","[553, 98, 70, 118]",1,5,0
...,...,...,...,...,...,...,...,...,...
438781,286612,0,15000,"[3393, 590, 316, 907]","[3518, 595, 114, 133]","[3393, 590, 316, 907]",1,438782,0
438782,81890,0,15000,"[3448, 607, 190, 431]","[3511, 610, 93, 104]","[3452, 608, 184, 430]",1,438783,0
438783,240570,0,15000,"[3684, 554, 297, 810]","[3824, 559, 96, 105]","[3795, 559, 156, 794]",1,438784,0
438784,234855,0,15000,"[3797, 536, 255, 921]","[3909, 539, 89, 113]","[3889, 535, 112, 169]",1,438785,0


In [31]:
# Prefix every file name with the dataset folder. Vectorised, and safe to
# re-run: names that already carry the prefix are left alone (the row-wise
# loop added the prefix again on every execution of the cell).
ch_img_df['file_name'] = ch_img_df['file_name'].where(
    ch_img_df['file_name'].str.startswith('CrowdHuman/Images/', na=False),
    'CrowdHuman/Images/' + ch_img_df['file_name'],
)
ch_img_df

Unnamed: 0,file_name,height,width,id
0,"CrowdHuman/Images/284193,faa9000f2678b5e.jpg",683,1024,1
1,"CrowdHuman/Images/273275,cd061000af95f691.jpg",480,600,2
2,"CrowdHuman/Images/273278,8d231000e09fc133.jpg",1017,1300,3
3,"CrowdHuman/Images/283554,2cd4d0007833968d.jpg",954,1300,4
4,"CrowdHuman/Images/283554,37ba1000a6ca8c4d.jpg",533,800,5
...,...,...,...,...
14995,"CrowdHuman/Images/273275,874d9000417e16ed.jpg",1192,1800,14996
14996,"CrowdHuman/Images/273278,d329e000260a8cc2.jpg",600,1800,14997
14997,"CrowdHuman/Images/282555,c5fd20007faf5f84.jpg",575,1024,14998
14998,"CrowdHuman/Images/273275,10b78d0006d7d7b9c.jpg",462,1838,14999


In [32]:
# Draw a new random train/val/test sample of CrowdHuman image names, excluding
# the names already collected in `ch_list`. The sizes equal the per-split
# counts printed by `concat_df(dataset='CrowdHuman')` above (436 / 94 / 94).
# NOTE(review): no random seed is set in the visible cells, so every run
# selects different images.
train_img_ch, val_img_ch, test_img_ch = splite_data(ch_img_df, train_size=436, val_size=94, test_size=94, examption_list=ch_list)

In [33]:
index_id_list = find_index_of_images(ch_img_df, image_name_list=train_img_ch)
selected_ch_img_df = create_img_df(ch_img_df, index_id_list)
selected_ch_ann_df = create_ann_df(ch_ann_df, index_id_list)
selected_ch_img_df

Unnamed: 0,file_name,height,width,id
14404,"CrowdHuman/Images/273278,1339ba0003a171bf2.jpg",600,450,14405
13855,"CrowdHuman/Images/273278,1001db000895beed8.jpg",447,640,13856
3591,"CrowdHuman/Images/273278,5a9ea000ba06ed2b.jpg",467,700,3592
4175,"CrowdHuman/Images/284193,25adc0004771188c.jpg",480,852,4176
12240,"CrowdHuman/Images/273275,b69d0008c7203d2.jpg",552,980,12241
...,...,...,...,...
7258,"CrowdHuman/Images/273278,12dd0000060b74783.jpg",1080,1920,7259
6416,"CrowdHuman/Images/273278,1190b7000c4ef9ce4.jpg",848,1200,6417
12623,"CrowdHuman/Images/282555,e2e15000a97c1b22.jpg",956,1300,12624
9086,"CrowdHuman/Images/282555,6668d00018c6d6f1.jpg",866,1300,9087


In [34]:
print(selected_ch_img_df.shape[0])
selected_ch_ann_df.shape[0]

436


9483

In [35]:
print(f'add #{selected_ch_img_df.shape[0]} images to #{aug_img_df.shape[0]} train set')
aug_img_df, selected_ch_ann_df = uodate_id_in_image_ann(selected_ch_img_df, aug_img_df, selected_ch_ann_df, dt='ch')
print(f'train img df after aug shape: {aug_img_df.shape[0]}')

add #436 images to #3400 train set
train img df after aug shape: 3836


In [36]:
selected_ch_ann_df.shape[0]

9483

In [37]:
selected_ch_ann_df = add_area(selected_ch_ann_df)
print(f'add #{selected_ch_ann_df.shape[0]} annotations to #{ann_df_c.shape[0]} train set')
ann_df_c = concat_ann_df(ann_df_c, selected_ch_ann_df)
ann_df_c = ann_df_c.drop(['hbox', 'vbox'], axis = 1)
print(f'train annotations #{ann_df_c.shape[0]} for train set after augmentation')

add #9483 annotations to #31032 train set
train annotations #40515 for train set after augmentation


### wheelchair

In [38]:
# Load the `selected_oi/train.json` annotation file into DataFrames.
wheel_file = '/home/test_pc/mmlab/mmdetection/datasets/selected_oi/train.json'

# Close the file right after parsing instead of leaking the handle.
with open(wheel_file) as f:
    wheel_data = json.load(f)
wheel_categories = wheel_data['categories']
wheel_images = wheel_data['images']
wheel_annotations = wheel_data['annotations']
wheel_img_df = pd.DataFrame(wheel_images)
wheel_ann_df = pd.DataFrame(wheel_annotations)
print(wheel_img_df.shape[0])

20


In [39]:
wheel_img_df

Unnamed: 0,id,file_name,width,height,date_captured,license,coco_url,flickr_url
0,1,selected_oi/images/train/0001e595b536c9ec.jpg,1024,768,2021-04-18 13:58:02.810552,1,,
1,2,selected_oi/images/train/0001fa6ab562fd2a.jpg,1022,1024,2021-04-18 13:58:02.810552,1,,
2,3,selected_oi/images/train/0000b4b26ef88376.jpg,1024,717,2021-04-18 13:58:02.810552,1,,
3,4,selected_oi/images/train/0001c626b9afb50c.jpg,1024,675,2021-04-18 13:58:02.810552,1,,
4,5,selected_oi/images/train/0000a90019e380dc.jpg,1024,731,2021-04-18 13:58:02.810552,1,,
5,6,selected_oi/images/train/0000c4f95a9d5a54.jpg,1011,1024,2021-04-18 13:58:02.810552,1,,
6,7,selected_oi/images/train/00006bdb1eb5cd74.jpg,1024,681,2021-04-18 13:58:02.810552,1,,
7,8,selected_oi/images/train/0000bcb094764718.jpg,1024,576,2021-04-18 13:58:02.810552,1,,
8,9,selected_oi/images/train/0001b46b0b82ee29.jpg,1024,683,2021-04-18 13:58:02.810552,1,,
9,10,selected_oi/images/train/0000f53faa4d14c3.jpg,1024,683,2021-04-18 13:58:02.810552,1,,


In [40]:
print(wheel_data['categories'])

[{'id': 1, 'name': 'pedestrain', 'supercategory': 'pedestrain'}]


In [41]:
wheel_ann_df = add_area(wheel_ann_df)

print(wheel_ann_df.shape[0])

46


In [42]:
selected_img_df = copy.deepcopy(wheel_img_df)
selected_ann_df = copy.deepcopy(wheel_ann_df)

In [43]:
print(f'add #{selected_img_df.shape[0]} images to #{aug_img_df.shape[0]} train set')
aug_img_df, selected_ann_df = uodate_id_in_image_ann(selected_img_df, aug_img_df, selected_ann_df)
print(f'train img df after aug shape: {aug_img_df.shape[0]}')


add #20 images to #3836 train set
train img df after aug shape: 3856


In [44]:
print(f'add #{selected_ann_df.shape[0]} annotations to #{ann_df_c.shape[0]} train set')
ann_df_c = concat_ann_df(ann_df_c, selected_ann_df)
print(f'train annotations #{ann_df_c.shape[0]} for train set after augmentation')

add #46 annotations to #40515 train set
train annotations #40561 for train set after augmentation


In [45]:
for idx, row in ann_df_c.iterrows():
    assert idx + 1 == row['id']

### WiderPerson (WP)

In [46]:
# Load the WiderPerson `train.json` annotation file into DataFrames.
wp1_file = 'json_files/WiderPerson/train.json'

# Close the file right after parsing instead of leaking the handle.
with open(wp1_file) as f:
    wp_data = json.load(f)
wp_categories = wp_data['categories']
wp_images = wp_data['images']
wp_annotations = wp_data['annotations']
wp_img_df = pd.DataFrame(wp_images)
wp_ann_df = pd.DataFrame(wp_annotations)
wp_img_df.shape[0]

8000

In [47]:
wp_ann_df

Unnamed: 0,id,image_id,category_id,iscrowd,bbox,width,height
0,1,1,1,False,"[45.0, 235.0, 34.0, 83.0]",550,413
1,2,1,1,False,"[60.0, 209.0, 60.0, 147.0]",550,413
2,3,1,1,False,"[119.0, 214.0, 49.0, 122.0]",550,413
3,4,1,1,False,"[94.0, 220.0, 42.0, 106.0]",550,413
4,5,1,1,False,"[213.0, 201.0, 74.0, 180.0]",550,413
...,...,...,...,...,...,...,...
241804,241805,8000,1,False,"[233.0, 83.0, 15.0, 18.0]",385,257
241805,241806,8000,1,False,"[34.0, 88.0, 18.0, 40.0]",385,257
241806,241807,8000,1,False,"[66.0, 87.0, 15.0, 27.0]",385,257
241807,241808,8000,1,False,"[161.0, 79.0, 16.0, 21.0]",385,257


In [48]:
print(wp_data['categories'])

[{'id': 1, 'name': 'pedestrain', 'supercategory': 'pedestrain'}]


In [49]:
# Draw a new random train/val/test sample of WiderPerson image names, excluding
# the names already collected in `wp_list`, then resolve the train names to
# row index labels of `wp_img_df`.
# NOTE(review): val_size is 75 here, while `concat_df(dataset='WiderPerson')`
# above reported 76 validation images — confirm which number is intended.
# NOTE(review): no random seed is set in the visible cells, so every run
# selects different images.
train_img_wp, val_img_wp, test_img_wp = splite_data(wp_img_df, train_size=351, val_size=75,test_size=75, examption_list=wp_list)
index_id_list = find_index_of_images(wp_img_df, image_name_list=train_img_wp)

In [50]:
selected_wp_img_df = create_img_df(wp_img_df, index_id_list)
selected_wp_img_df.shape[0]

351

In [51]:
selected_wp_ann_df = create_ann_df(wp_ann_df, index_id_list)
selected_wp_ann_df = add_area(selected_wp_ann_df)
selected_wp_ann_df.shape[0]

10843

In [52]:
print(f'add #{selected_wp_img_df.shape[0]} images to #{aug_img_df.shape[0]} train set')
aug_img_df, selected_wp_ann_df = uodate_id_in_image_ann(selected_wp_img_df, aug_img_df, selected_wp_ann_df)
print(f'train img df after aug shape: {aug_img_df.shape[0]}')


add #351 images to #3856 train set
train img df after aug shape: 4207


In [53]:
selected_wp_ann_df.shape[0]

10843

In [54]:
print(f'add #{selected_wp_ann_df.shape[0]} annotations to #{ann_df_c.shape[0]} train set')

ann_df_c = concat_ann_df(ann_df_c, selected_wp_ann_df)

for idx, row in ann_df_c.iterrows():
    assert idx + 1 == row['id']
print(f'train annotations #{ann_df_c.shape[0]} for train set after augmentation')

add #10843 annotations to #40561 train set
train annotations #51404 for train set after augmentation


In [55]:
ann_df_c.shape[0]

51404

In [56]:
ann_df_c

Unnamed: 0,id,image_id,category_id,iscrowd,bbox,width,height,area,ignore
0,1,1,1,1,"[1034.0, 366.0, 31.0, 75.0]",2048,1024,2325,
1,2,1,1,1,"[1911.0, 359.0, 10.0, 54.0]",2048,1024,540,
2,3,1,1,1,"[1137.0, 307.0, 8.0, 9.0]",2048,1024,72,
3,4,2,1,0,"[602.0, 413.0, 60.0, 146.0]",2048,1024,8760,
4,5,2,1,0,"[281.0, 381.0, 89.0, 216.0]",2048,1024,19224,
...,...,...,...,...,...,...,...,...,...
51399,51400,3966,1,0,"[314.0, 140.0, 27.0, 34.0]",550,366,918,
51400,51401,3966,1,0,"[315.0, 105.0, 16.0, 32.0]",550,366,512,
51401,51402,3966,1,0,"[256.0, 105.0, 28.0, 18.0]",550,366,504,
51402,51403,3966,1,0,"[227.0, 107.0, 20.0, 18.0]",550,366,360,


#### selected caltech

In [57]:
# Load the `rand_cal/train.json` annotation file into DataFrames.
ca_file = '/home/test_pc/mmlab/mmdetection/datasets/rand_cal/train.json'

# Close the file right after parsing instead of leaking the handle.
with open(ca_file) as f:
    cl_data = json.load(f)
cl_categories = cl_data['categories']
cl_images = cl_data['images']
cl_annotations = cl_data['annotations']
cl_img_df = pd.DataFrame(cl_images)
cl_ann_df = pd.DataFrame(cl_annotations)
cl_img_df

Unnamed: 0,id,file_name,width,height,date_captured,license,coco_url,flickr_url
0,1,rand_cal/images/train/set02_V001_1309.png,640,480,2021-04-18 13:30:31.472309,1,,
1,2,rand_cal/images/train/set00_V004_1946.png,640,480,2021-04-18 13:30:31.472309,1,,
2,3,rand_cal/images/train/set04_V010_995.png,640,480,2021-04-18 13:30:31.472309,1,,
3,4,rand_cal/images/train/set00_V006_1897.png,640,480,2021-04-18 13:30:31.472309,1,,
4,5,rand_cal/images/train/set05_V008_1082.png,640,480,2021-04-18 13:30:31.472309,1,,
...,...,...,...,...,...,...,...,...
196,197,rand_cal/images/train/set06_V000_936.png,640,480,2021-04-18 13:30:31.472309,1,,
197,198,rand_cal/images/train/set04_V006_1005.png,640,480,2021-04-18 13:30:31.472309,1,,
198,199,rand_cal/images/train/set00_V002_1266.png,640,480,2021-04-18 13:30:31.472309,1,,
199,200,rand_cal/images/train/set03_V004_643.png,640,480,2021-04-18 13:30:31.472309,1,,


In [58]:
cl_ann_df = add_area(cl_ann_df)

In [59]:
selected_img_df = copy.deepcopy(cl_img_df)
selected_ann_df = copy.deepcopy(cl_ann_df)
aug_img_df.shape[0] + selected_img_df.shape[0]

4408

In [60]:
print(f'add #{selected_img_df.shape[0]} images to #{aug_img_df.shape[0]} train set')

aug_img_df, selected_ann_df = uodate_id_in_image_ann(selected_img_df, aug_img_df, selected_ann_df)

print(f'train img df after aug shape: {aug_img_df.shape[0]}')


add #201 images to #4207 train set
train img df after aug shape: 4408


In [61]:
aug_img_df.shape[0]

4408

In [62]:
selected_ann_df.shape[0] + ann_df_c.shape[0]

51613

In [63]:
print(f'add #{selected_ann_df.shape[0]} annotations to #{ann_df_c.shape[0]} train set')

ann_df_c = concat_ann_df(ann_df_c, selected_ann_df)

for idx, row in ann_df_c.iterrows():
    assert idx + 1 == row['id']

print(f'train annotations #{ann_df_c.shape[0]} for train set after augmentation')

add #209 annotations to #51404 train set
train annotations #51613 for train set after augmentation


In [64]:
# Normalise `iscrowd` to booleans (the merged sources mix 1/0 with True/False)
# and fill the `ignore` column, which is missing for most sources, with 0.
# The result is assigned back to the column: calling
# `.replace(..., inplace=True)` on `ann_df_c["iscrowd"]` acts on a selected
# Series and is not guaranteed to update the frame under pandas copy-on-write.
ann_df_c["iscrowd"] = ann_df_c["iscrowd"].replace({1: True, 0: False})
ann_df_c['ignore'] = ann_df_c['ignore'].fillna(0)
# ann_df_c = ann_df_c.drop(['hbox', 'vbox'], axis = 1)
ann_df_c

Unnamed: 0,id,image_id,category_id,iscrowd,bbox,width,height,area,ignore
0,1,1,1,True,"[1034.0, 366.0, 31.0, 75.0]",2048,1024,2325,0.0
1,2,1,1,True,"[1911.0, 359.0, 10.0, 54.0]",2048,1024,540,0.0
2,3,1,1,True,"[1137.0, 307.0, 8.0, 9.0]",2048,1024,72,0.0
3,4,2,1,False,"[602.0, 413.0, 60.0, 146.0]",2048,1024,8760,0.0
4,5,2,1,False,"[281.0, 381.0, 89.0, 216.0]",2048,1024,19224,0.0
...,...,...,...,...,...,...,...,...,...
51608,51609,4402,1,False,"[544, 190, 22, 53]",640,480,1166,0.0
51609,51610,4402,1,False,"[493, 195, 22, 42]",640,480,924,0.0
51610,51611,4403,1,False,"[512, 171, 28, 72]",640,480,2016,0.0
51611,51612,4403,1,False,"[542, 170, 95, 72]",640,480,6840,0.0


In [65]:
ann_df_c[ann_df_c.category_id != 1]

Unnamed: 0,id,image_id,category_id,iscrowd,bbox,width,height,area,ignore


In [66]:
# Normalise `iscrowd` to booleans. Assigned back to the column rather than
# using `inplace=True` on the selected Series, which is not guaranteed to
# update the frame under pandas copy-on-write.
# NOTE(review): an earlier cell applies the same replacement; if that cell
# has run, this one changes nothing.
ann_df_c["iscrowd"] = ann_df_c["iscrowd"].replace({1: True, 0: False})
# ann_df_c['ignore'] = ann_df_c['ignore'].fillna(0)
ann_df_c

Unnamed: 0,id,image_id,category_id,iscrowd,bbox,width,height,area,ignore
0,1,1,1,True,"[1034.0, 366.0, 31.0, 75.0]",2048,1024,2325,0.0
1,2,1,1,True,"[1911.0, 359.0, 10.0, 54.0]",2048,1024,540,0.0
2,3,1,1,True,"[1137.0, 307.0, 8.0, 9.0]",2048,1024,72,0.0
3,4,2,1,False,"[602.0, 413.0, 60.0, 146.0]",2048,1024,8760,0.0
4,5,2,1,False,"[281.0, 381.0, 89.0, 216.0]",2048,1024,19224,0.0
...,...,...,...,...,...,...,...,...,...
51608,51609,4402,1,False,"[544, 190, 22, 53]",640,480,1166,0.0
51609,51610,4402,1,False,"[493, 195, 22, 42]",640,480,924,0.0
51610,51611,4403,1,False,"[512, 171, 28, 72]",640,480,2016,0.0
51611,51612,4403,1,False,"[542, 170, 95, 72]",640,480,6840,0.0


In [67]:
aug_img_df

Unnamed: 0,id,file_name,width,height,date_captured,license,coco_url,flickr_url
0,1,CityPersons/leftImg8bit_trainvaltest//leftImg8bit/train/weimar/weimar_000117_000019_leftImg8bit.png,2048,1024,2019-07-25 11:20:43.195846,1.0,,
1,2,CityPersons/leftImg8bit_trainvaltest//leftImg8bit/train/weimar/weimar_000080_000019_leftImg8bit.png,2048,1024,2019-07-25 11:20:43.195846,1.0,,
2,3,CityPersons/leftImg8bit_trainvaltest//leftImg8bit/train/weimar/weimar_000128_000019_leftImg8bit.png,2048,1024,2019-07-25 11:20:43.195846,1.0,,
3,4,CityPersons/leftImg8bit_trainvaltest//leftImg8bit/train/weimar/weimar_000103_000019_leftImg8bit.png,2048,1024,2019-07-25 11:20:43.195846,1.0,,
4,5,CityPersons/leftImg8bit_trainvaltest//leftImg8bit/train/weimar/weimar_000063_000019_leftImg8bit.png,2048,1024,2019-07-25 11:20:43.195846,1.0,,
...,...,...,...,...,...,...,...,...
4403,4404,rand_cal/images/train/set06_V000_936.png,640,480,2021-04-18 13:30:31.472309,1.0,,
4404,4405,rand_cal/images/train/set04_V006_1005.png,640,480,2021-04-18 13:30:31.472309,1.0,,
4405,4406,rand_cal/images/train/set00_V002_1266.png,640,480,2021-04-18 13:30:31.472309,1.0,,
4406,4407,rand_cal/images/train/set03_V004_643.png,640,480,2021-04-18 13:30:31.472309,1.0,,


In [68]:
# Display the annotation table built so far (pandas shows only the head and tail rows).
ann_df_c

Unnamed: 0,id,image_id,category_id,iscrowd,bbox,width,height,area,ignore
0,1,1,1,True,"[1034.0, 366.0, 31.0, 75.0]",2048,1024,2325,0.0
1,2,1,1,True,"[1911.0, 359.0, 10.0, 54.0]",2048,1024,540,0.0
2,3,1,1,True,"[1137.0, 307.0, 8.0, 9.0]",2048,1024,72,0.0
3,4,2,1,False,"[602.0, 413.0, 60.0, 146.0]",2048,1024,8760,0.0
4,5,2,1,False,"[281.0, 381.0, 89.0, 216.0]",2048,1024,19224,0.0
...,...,...,...,...,...,...,...,...,...
51608,51609,4402,1,False,"[544, 190, 22, 53]",640,480,1166,0.0
51609,51610,4402,1,False,"[493, 195, 22, 42]",640,480,924,0.0
51610,51611,4403,1,False,"[512, 171, 28, 72]",640,480,2016,0.0
51611,51612,4403,1,False,"[542, 170, 95, 72]",640,480,6840,0.0


In [69]:
# Report how many images of each source dataset are present in `aug_img_df`.
# Each pair is (label to print, substring searched for in `file_name`); the
# `selected_caltech` and `wheelchair` labels are matched through the `rand_cal`
# and `selected_oi` path tokens respectively.
for label, path_token in (
    ('CityPersons', 'CityPersons'),
    ('selected_caltech', 'rand_cal'),
    ('WiderPerson', 'WiderPerson'),
    ('wheelchair', 'selected_oi'),
    ('CrowdHuman', 'CrowdHuman'),
    ('EuroCity', 'EuroCity'),
):
    # Summing the boolean mask counts the matches without building a filtered copy.
    n_images = aug_img_df['file_name'].str.contains(path_token).sum()
    print(f"{label}: #{n_images}")

CityPersons: #2529
selected_caltech: #201
WiderPerson: #351
wheelchair: #20
CrowdHuman: #436
EuroCity: #871


### Save to JSON

In [70]:
# Assemble the dataset dictionary for the train split: `info` and `categories`
# are taken from `cp_data`, while images and annotations come from the merged
# frames, serialised as one dict per row.
aug_data_train = {
    'info': cp_data['info'],
    'categories': cp_data['categories'],
    'images': aug_img_df.to_dict('records'),
    'annotations': ann_df_c.to_dict('records'),
}

In [71]:
# Persist the assembled train split; the context manager closes the file after the dump.
with open('/home/test_pc/mmlab/mmdetection/datasets/random_train.json', mode='w') as fp:
    json.dump(obj=aug_data_train, fp=fp)

In [72]:
# Read the freshly written train JSON back and re-count the images per dataset
# to confirm the file round-trips.
# NOTE: this rebinds the `cp_val*` names that In [3] loaded from cp_val.json.
cp_val = '/home/test_pc/mmlab/mmdetection/datasets/random_train.json'
# Context manager so the handle is closed after parsing (it was left open before).
with open(cp_val) as f:
    cp_val_data = json.load(f)
# cp_val_licenses = cp_val_data['licenses']
cp_val_cat = cp_val_data['categories']
cp_val_images = cp_val_data['images']
cp_val_annotations = cp_val_data['annotations']
cp_val_img_df = pd.DataFrame(cp_val_images)
cp_val_ann_df = pd.DataFrame(cp_val_annotations)

# Each pair is (label to print, substring searched for in `file_name`).
for label, path_token in (
    ('CityPersons', 'CityPersons'),
    ('selected_caltech', 'rand_cal'),
    ('WiderPerson', 'WiderPerson'),
    ('wheelchair', 'selected_oi'),
    ('CrowdHuman', 'CrowdHuman'),
    ('EuroCity', 'EuroCity'),
):
    # Summing the boolean mask counts the matches without building a filtered copy.
    n_images = cp_val_img_df['file_name'].str.contains(path_token).sum()
    print(f"{label}: #{n_images}")

CityPersons: #2529
selected_caltech: #201
WiderPerson: #351
wheelchair: #20
CrowdHuman: #436
EuroCity: #871


## Validation

In [73]:
# Load the CityPersons validation annotations; they seed the validation split
# that the following cells extend.
cp_file = '/home/test_pc/mmlab/mmdetection/datasets/cityperson_val.json'
# Context manager so the handle is closed after parsing (it was left open before).
with open(cp_file) as f:
    cp_data = json.load(f)
# cp_licenses = cp_data['licenses']
cp_categories = cp_data['categories']
cp_images = cp_data['images']
cp_annotations = cp_data['annotations']
cp_img_df = pd.DataFrame(cp_images)
cp_ann_df = pd.DataFrame(cp_annotations)

In [74]:
# The loop below is disabled: it prefixed every file_name with the CityPersons
# val folder. The frame displayed here already carries that prefix.
# for idx, row in cp_img_df.iterrows():
#     cp_img_df.at[cp_img_df.index[idx], 'file_name'] = 'CityPersons/leftImg8bit_trainvaltest/leftImg8bit/val_all_in_folder/' + cp_img_df.at[cp_img_df.index[idx], 'file_name']
cp_img_df


Unnamed: 0,id,file_name,height,width
0,1,CityPersons/leftImg8bit_trainvaltest/leftImg8bit/val_all_in_folder/frankfurt_000000_000294_leftImg8bit.png,1024,2048
1,2,CityPersons/leftImg8bit_trainvaltest/leftImg8bit/val_all_in_folder/frankfurt_000000_000576_leftImg8bit.png,1024,2048
2,3,CityPersons/leftImg8bit_trainvaltest/leftImg8bit/val_all_in_folder/frankfurt_000000_001016_leftImg8bit.png,1024,2048
3,4,CityPersons/leftImg8bit_trainvaltest/leftImg8bit/val_all_in_folder/frankfurt_000000_001236_leftImg8bit.png,1024,2048
4,5,CityPersons/leftImg8bit_trainvaltest/leftImg8bit/val_all_in_folder/frankfurt_000000_001751_leftImg8bit.png,1024,2048
...,...,...,...,...
420,421,CityPersons/leftImg8bit_trainvaltest/leftImg8bit/val_all_in_folder/munster_000168_000019_leftImg8bit.png,1024,2048
421,422,CityPersons/leftImg8bit_trainvaltest/leftImg8bit/val_all_in_folder/munster_000169_000019_leftImg8bit.png,1024,2048
422,423,CityPersons/leftImg8bit_trainvaltest/leftImg8bit/val_all_in_folder/munster_000170_000019_leftImg8bit.png,1024,2048
423,424,CityPersons/leftImg8bit_trainvaltest/leftImg8bit/val_all_in_folder/munster_000172_000019_leftImg8bit.png,1024,2048


In [75]:
# cp_ann_df['height'] = ""
# cp_ann_df['width'] = ""
# for idx, row in cp_img_df.iterrows():            
#     cp_ann_df.loc[cp_ann_df['image_id'] == idx + 1, 'height'] = cp_img_df.at[idx, 'height']
#     cp_ann_df.loc[cp_ann_df['image_id'] == idx + 1, 'width'] = cp_img_df.at[idx, 'width']

In [76]:
# The two steps below are disabled; the frame displayed here already has an
# `area` column and no `vis_bbox` / `vis_ratio` columns.
# cp_ann_df = cp_ann_df.drop(['vis_bbox', 'vis_ratio'], axis = 1)
# cp_ann_df = add_area(cp_ann_df)
cp_ann_df

Unnamed: 0,id,image_id,category_id,iscrowd,ignore,bbox,height,width,area
0,1,1,1,0,1,"[1839, 382, 31, 71]",1024,2048,2201
1,2,1,1,0,0,"[947, 406, 17, 40]",1024,2048,680
2,3,1,1,0,1,"[990, 374, 14, 17]",1024,2048,238
3,4,1,1,0,1,"[1914, 73, 88, 90]",1024,2048,7920
4,5,1,1,0,1,"[1834, 101, 65, 58]",1024,2048,3770
...,...,...,...,...,...,...,...,...,...
4726,4727,423,1,0,1,"[1095, 419, 14, 22]",1024,2048,308
4727,4728,424,1,0,1,"[877, 435, 28, 17]",1024,2048,476
4728,4729,424,1,0,1,"[1370, 391, 26, 18]",1024,2048,468
4729,4730,424,1,0,1,"[1468, 338, 11, 13]",1024,2048,143


In [77]:
# Seed the validation accumulators with independent copies of the CityPersons
# frames, so later merges leave `cp_img_df` / `cp_ann_df` untouched.
aug_img_df = cp_img_df.copy(deep=True)
ann_df_c = cp_ann_df.copy(deep=True)

In [78]:

# aug_data_train['info'] = cp_info
# aug_data_train['categories'] = cp_cat
# aug_data_train['images'] = aug_img_df.to_dict('records')
# aug_data_train['annotations'] = ann_df_c.to_dict('records')
# with open('/home/test_pc/mmlab/mmdetection/datasets/cityperson_val.json', 'w') as fp:
#     json.dump(aug_data_train, fp)

### ECP

In [79]:
# Pick the EuroCity images named in `val_img_ecp` together with their annotations.
# NOTE(review): the helpers are defined elsewhere in the notebook; presumably
# `find_index_of_images` returns the ids of the matching rows - confirm.
index_id_list = find_index_of_images(ecp_img_df, image_name_list=val_img_ecp)
selected_img_df = create_img_df(ecp_img_df, index_id_list)
# Build the annotation subset and pass it straight through `add_area`.
selected_ann_df = add_area(create_ann_df(ecp_ann_df, index_id_list))
selected_ann_df

Unnamed: 0,id,image_id,category_id,iscrowd,bbox,width,height,area
173911,173912,21301,1,False,"[1183.0, 377.0, 26.0, 70.0]",1920,1024,1820
173912,173913,21301,1,False,"[752.0, 438.0, 15.0, 39.0]",1920,1024,585
173913,173914,21301,1,True,"[626.0, 448.0, 31.0, 49.0]",1920,1024,1519
33957,33958,5368,1,False,"[51.0, 373.0, 16.0, 60.0]",1920,1024,960
33958,33959,5368,1,False,"[353.0, 381.0, 13.0, 29.0]",1920,1024,377
...,...,...,...,...,...,...,...,...
10277,10278,1427,1,True,"[1135.0, 554.0, 8.0, 25.0]",1920,1024,200
91217,91218,11302,1,False,"[1203.0, 443.0, 27.0, 62.0]",1920,1024,1674
91218,91219,11302,1,False,"[1548.0, 435.0, 18.0, 67.0]",1920,1024,1206
91219,91220,11302,1,False,"[1562.0, 439.0, 29.0, 66.0]",1920,1024,1914


In [80]:
# Display the EuroCity image rows selected in the previous cell.
selected_img_df

Unnamed: 0,id,file_name,width,height,date_captured,license,coco_url,flickr_url
21300,21301,EuroCity/ECP/day/img/train/toulouse/toulouse_00536.png,1920,1024,2019-11-03 07:44:18.143034,1,,
5367,5368,EuroCity/ECP/day/img/train/stuttgart/stuttgart_00329.png,1920,1024,2019-11-03 07:44:18.143034,1,,
9111,9112,EuroCity/ECP/day/img/train/roma/roma_00435.png,1920,1024,2019-11-03 07:44:18.143034,1,,
3649,3650,EuroCity/ECP/day/img/train/budapest/budapest_00705.png,1920,1024,2019-11-03 07:44:18.143034,1,,
9441,9442,EuroCity/ECP/day/img/train/roma/roma_00140.png,1920,1024,2019-11-03 07:44:18.143034,1,,
...,...,...,...,...,...,...,...,...
13980,13981,EuroCity/ECP/day/img/train/hamburg/hamburg_00097.png,1920,1024,2019-11-03 07:44:18.143034,1,,
10855,10856,EuroCity/ECP/day/img/train/milano/milano_00155.png,1920,1024,2019-11-03 07:44:18.143034,1,,
12956,12957,EuroCity/ECP/day/img/train/ljubljana/ljubljana_00020.png,1920,1024,2019-11-03 07:44:18.143034,1,,
1426,1427,EuroCity/ECP/day/img/train/torino/torino_00419.png,1920,1024,2019-11-03 07:44:18.143034,1,,


In [81]:
# Append the selected EuroCity images to the running image table and report the
# size before and after.
# NOTE(review): `uodate_id_in_image_ann` is defined elsewhere; as used here it
# returns the enlarged image frame plus the annotation frame (presumably with
# ids remapped to the merged table - confirm).
# NOTE(review): the messages say "train" although this is the Validation section.
print(f'add #{selected_img_df.shape[0]} images to #{aug_img_df.shape[0]} train set')
aug_img_df, selected_ann_df = uodate_id_in_image_ann(selected_img_df, aug_img_df, selected_ann_df)
print(f'train img df after aug shape: {aug_img_df.shape[0]}')

add #168 images to #425 train set
train img df after aug shape: 593


In [82]:
# Append the EuroCity annotations to the running annotation table.
# NOTE(review): the messages say "train" although this is the Validation section.
print(f'add #{selected_ann_df.shape[0]} annotations to #{ann_df_c.shape[0]} train set')
ann_df_c = concat_ann_df(ann_df_c, selected_ann_df)
# Invariant: every annotation id equals its row index + 1. Checked with a single
# vectorised comparison instead of a Python-level iterrows() loop.
assert (ann_df_c['id'].to_numpy() == ann_df_c.index.to_numpy() + 1).all()

print(f'train annotations #{ann_df_c.shape[0]} for train set after augmentation')

add #1523 annotations to #4731 train set
train annotations #6254 for train set after augmentation


### CH

In [83]:
# Select the CrowdHuman images named in `val_img_ch`, with their annotations,
# and merge both into the running validation tables.
# NOTE(review): the helpers are defined elsewhere in the notebook; the meaning
# of `dt='ch'` cannot be seen from this cell - confirm against the helper.
index_id_list = find_index_of_images(ch_img_df, image_name_list=val_img_ch)

selected_img_df = create_img_df(ch_img_df, index_id_list)
selected_ann_df = create_ann_df(ch_ann_df, index_id_list)

selected_ann_df = add_area(selected_ann_df)

print(f'add #{selected_img_df.shape[0]} images to #{aug_img_df.shape[0]} val set')

aug_img_df, selected_ann_df = uodate_id_in_image_ann(selected_img_df, aug_img_df, selected_ann_df, dt='ch')

print(f'train img df after aug shape: {aug_img_df.shape[0]}')

print(f'add #{selected_ann_df.shape[0]} annotations to #{ann_df_c.shape[0]} val set')
ann_df_c = concat_ann_df(ann_df_c, selected_ann_df)
# Invariant: every annotation id equals its row index + 1. Checked with a single
# vectorised comparison instead of a Python-level iterrows() loop.
assert (ann_df_c['id'].to_numpy() == ann_df_c.index.to_numpy() + 1).all()
print(f'val annotations #{ann_df_c.shape[0]} for val set after augmentation')

add #94 images to #593 val set
train img df after aug shape: 687
add #2412 annotations to #6254 val set
val annotations #8666 for val set after augmentation


In [84]:
# Remove the `hbox` / `vbox` columns (presumably brought in by the CrowdHuman
# annotations merged above - confirm). `errors='ignore'` keeps the cell safe to
# re-run: a second execution no longer raises KeyError once the columns are gone.
ann_df_c = ann_df_c.drop(columns=['hbox', 'vbox'], errors='ignore')
ann_df_c

Unnamed: 0,id,image_id,category_id,iscrowd,ignore,bbox,height,width,area
0,1,1,1,0,1.0,"[1839, 382, 31, 71]",1024,2048,2201
1,2,1,1,0,0.0,"[947, 406, 17, 40]",1024,2048,680
2,3,1,1,0,1.0,"[990, 374, 14, 17]",1024,2048,238
3,4,1,1,0,1.0,"[1914, 73, 88, 90]",1024,2048,7920
4,5,1,1,0,1.0,"[1834, 101, 65, 58]",1024,2048,3770
...,...,...,...,...,...,...,...,...,...
8661,8662,648,1,0,0.0,"[333, 375, 86, 171]",870,580,14706
8662,8663,648,1,0,0.0,"[304, 373, 53, 96]",870,580,5088
8663,8664,648,1,0,0.0,"[72, 439, 165, 259]",870,580,42735
8664,8665,648,1,0,0.0,"[105, 396, 134, 241]",870,580,32294


In [85]:
# Number of images accumulated so far.
len(aug_img_df)

687

### wheelchair

In [86]:
# Load the wheelchair validation subset (stored under `selected_oi`).
wheel_file = '/home/test_pc/mmlab/mmdetection/datasets/selected_oi/val.json'

# Context manager so the handle is closed after parsing (it was left open before).
with open(wheel_file) as f:
    wheel_data = json.load(f)
wheel_categories = wheel_data['categories']
wheel_images = wheel_data['images']
wheel_annotations = wheel_data['annotations']
wheel_img_df = pd.DataFrame(wheel_images)
wheel_ann_df = pd.DataFrame(wheel_annotations)
# Show the number of images in this subset.
wheel_img_df.shape[0]

11

In [87]:
# Merge every wheelchair image into the running validation image table.
# NOTE(review): `add_area` and `uodate_id_in_image_ann` are defined elsewhere;
# `add_area` rebinds `wheel_ann_df`, so re-running this cell applies it again.
wheel_ann_df = add_area(wheel_ann_df)
# Work on deep copies so the loaded wheelchair frames are not modified by the merge.
selected_img_df = copy.deepcopy(wheel_img_df)
selected_ann_df = copy.deepcopy(wheel_ann_df)


print(f'add #{selected_img_df.shape[0]} images to #{aug_img_df.shape[0]} val set')


aug_img_df, selected_ann_df = uodate_id_in_image_ann(selected_img_df, aug_img_df, selected_ann_df)
print(f'val img df after aug shape: {aug_img_df.shape[0]}')

add #11 images to #687 val set
val img df after aug shape: 698


In [88]:
# Append the wheelchair annotations to the running annotation table.
print(f'add #{selected_ann_df.shape[0]} annotations to #{ann_df_c.shape[0]} val set')
ann_df_c = concat_ann_df(ann_df_c, selected_ann_df)

# Invariant: every annotation id equals its row index + 1. Checked with a single
# vectorised comparison instead of a Python-level iterrows() loop.
assert (ann_df_c['id'].to_numpy() == ann_df_c.index.to_numpy() + 1).all()

print(f'val annotations #{ann_df_c.shape[0]} for val set after augmentation')

add #36 annotations to #8666 val set
val annotations #8702 for val set after augmentation


### WP

In [89]:
# Look up the WiderPerson images named in `val_img_wp`.
# NOTE(review): `find_index_of_images` is defined elsewhere; presumably it
# returns the ids of the matching rows - confirm.
index_id_list = find_index_of_images(wp_img_df, image_name_list=val_img_wp)

In [90]:
# Build the WiderPerson image/annotation subsets for the ids found above.
selected_wp_img_df = create_img_df(wp_img_df, index_id_list)
# Create the annotation subset and pass it straight through `add_area`.
selected_wp_ann_df = add_area(create_ann_df(wp_ann_df, index_id_list))
# Image count is printed; the annotation count is the cell's displayed value.
print(selected_wp_img_df.shape[0])
selected_wp_ann_df.shape[0]

75


2409

In [91]:
# Append the selected WiderPerson images to the running validation image table
# and report the size before and after.
# NOTE(review): `uodate_id_in_image_ann` is defined elsewhere; as used here it
# returns the enlarged image frame plus the annotation frame (presumably with
# ids remapped to the merged table - confirm).
print(f'add #{selected_wp_img_df.shape[0]} images to #{aug_img_df.shape[0]} val set')
aug_img_df, selected_wp_ann_df = uodate_id_in_image_ann(selected_wp_img_df, aug_img_df, selected_wp_ann_df)
print(f'val img df after aug shape: {aug_img_df.shape[0]}')


add #75 images to #698 val set
val img df after aug shape: 773


In [92]:
# Append the WiderPerson annotations to the running annotation table.
print(f'add #{selected_wp_ann_df.shape[0]} annotations to #{ann_df_c.shape[0]} val set')
ann_df_c = concat_ann_df(ann_df_c, selected_wp_ann_df)

# Invariant: every annotation id equals its row index + 1. Checked with a single
# vectorised comparison instead of a Python-level iterrows() loop.
assert (ann_df_c['id'].to_numpy() == ann_df_c.index.to_numpy() + 1).all()
print(f'val annotations #{ann_df_c.shape[0]} for val set after augmentation')

add #2409 annotations to #8702 val set
val annotations #11111 for val set after augmentation


### selected_caltech

In [93]:
# Load the Caltech validation subset (stored under `rand_cal`) and print its
# image and annotation counts.
ca_file = '/home/test_pc/mmlab/mmdetection/datasets/rand_cal/val.json'

# Context manager so the handle is closed after parsing (it was left open before).
with open(ca_file) as f:
    cl_data = json.load(f)
cl_categories = cl_data['categories']
cl_images = cl_data['images']
cl_annotations = cl_data['annotations']
cl_img_df = pd.DataFrame(cl_images)
cl_ann_df = pd.DataFrame(cl_annotations)
print(cl_img_df.shape[0])
# NOTE(review): `add_area` is defined elsewhere in the notebook.
cl_ann_df = add_area(cl_ann_df)

print(cl_ann_df.shape[0])

45
29


In [94]:
# Merge every Caltech image into the running validation image table.
# Deep copies keep the loaded `cl_*` frames untouched by the merge.
selected_img_df = copy.deepcopy(cl_img_df)
selected_ann_df = copy.deepcopy(cl_ann_df)
print(f'add #{selected_img_df.shape[0]} images to #{aug_img_df.shape[0]} val set')
# NOTE(review): `uodate_id_in_image_ann` is defined elsewhere; as used here it
# returns the enlarged image frame plus the annotation frame (presumably with
# ids remapped to the merged table - confirm).
aug_img_df, selected_ann_df = uodate_id_in_image_ann(selected_img_df, aug_img_df, selected_ann_df)
print(f'val img df after aug shape: {aug_img_df.shape[0]}')

add #45 images to #773 val set
val img df after aug shape: 818


In [95]:
# Append the Caltech annotations, then normalise the flag columns of the merged table.
print(f'add #{selected_ann_df.shape[0]} annotations to #{ann_df_c.shape[0]} val set')
ann_df_c = concat_ann_df(ann_df_c, selected_ann_df)

# Invariant: every annotation id equals its row index + 1. Checked with a single
# vectorised comparison instead of a Python-level iterrows() loop.
assert (ann_df_c['id'].to_numpy() == ann_df_c.index.to_numpy() + 1).all()
print(f'val annotations #{ann_df_c.shape[0]} for val set after augmentation')
# Assign the replaced column back: `replace(..., inplace=True)` on a column
# selection is not guaranteed to update `ann_df_c` (no-op under copy-on-write).
ann_df_c["iscrowd"] = ann_df_c["iscrowd"].replace({1: True, 0: False})
# Rows from sources without an `ignore` value get 0.
ann_df_c['ignore'] = ann_df_c['ignore'].fillna(0)
ann_df_c

add #29 annotations to #11111 val set
val annotations #11140 for val set after augmentation


Unnamed: 0,id,image_id,category_id,iscrowd,ignore,bbox,height,width,area
0,1,1,1,False,1.0,"[1839, 382, 31, 71]",1024,2048,2201
1,2,1,1,False,0.0,"[947, 406, 17, 40]",1024,2048,680
2,3,1,1,False,1.0,"[990, 374, 14, 17]",1024,2048,238
3,4,1,1,False,1.0,"[1914, 73, 88, 90]",1024,2048,7920
4,5,1,1,False,1.0,"[1834, 101, 65, 58]",1024,2048,3770
...,...,...,...,...,...,...,...,...,...
11135,11136,815,1,False,0.0,"[489, 162, 15, 45]",480,640,675
11136,11137,815,1,False,0.0,"[440, 163, 22, 47]",480,640,1034
11137,11138,815,1,False,0.0,"[414, 168, 11, 33]",480,640,363
11138,11139,816,1,False,0.0,"[54, 151, 18, 80]",480,640,1440


In [96]:
# Display the merged validation image table (pandas shows only the head and tail rows).
aug_img_df

Unnamed: 0,id,file_name,height,width,date_captured,license,coco_url,flickr_url
0,1,CityPersons/leftImg8bit_trainvaltest/leftImg8bit/val_all_in_folder/frankfurt_000000_000294_leftImg8bit.png,1024,2048,,,,
1,2,CityPersons/leftImg8bit_trainvaltest/leftImg8bit/val_all_in_folder/frankfurt_000000_000576_leftImg8bit.png,1024,2048,,,,
2,3,CityPersons/leftImg8bit_trainvaltest/leftImg8bit/val_all_in_folder/frankfurt_000000_001016_leftImg8bit.png,1024,2048,,,,
3,4,CityPersons/leftImg8bit_trainvaltest/leftImg8bit/val_all_in_folder/frankfurt_000000_001236_leftImg8bit.png,1024,2048,,,,
4,5,CityPersons/leftImg8bit_trainvaltest/leftImg8bit/val_all_in_folder/frankfurt_000000_001751_leftImg8bit.png,1024,2048,,,,
...,...,...,...,...,...,...,...,...
813,814,rand_cal/images/val/set03_V004_1095.png,480,640,2021-04-18 13:30:22.721543,1.0,,
814,815,rand_cal/images/val/set00_V007_1255.png,480,640,2021-04-18 13:30:22.721543,1.0,,
815,816,rand_cal/images/val/set00_V006_1590.png,480,640,2021-04-18 13:30:22.721543,1.0,,
816,817,rand_cal/images/val/set02_V005_1664.png,480,640,2021-04-18 13:30:22.721543,1.0,,


In [97]:
# Overwrite the metadata columns with empty strings for every row, so all
# source datasets end up with the same (blank) values.
for blank_col in ('date_captured', 'license', 'coco_url', 'flickr_url'):
    aug_img_df[blank_col] = ""
aug_img_df

Unnamed: 0,id,file_name,height,width,date_captured,license,coco_url,flickr_url
0,1,CityPersons/leftImg8bit_trainvaltest/leftImg8bit/val_all_in_folder/frankfurt_000000_000294_leftImg8bit.png,1024,2048,,,,
1,2,CityPersons/leftImg8bit_trainvaltest/leftImg8bit/val_all_in_folder/frankfurt_000000_000576_leftImg8bit.png,1024,2048,,,,
2,3,CityPersons/leftImg8bit_trainvaltest/leftImg8bit/val_all_in_folder/frankfurt_000000_001016_leftImg8bit.png,1024,2048,,,,
3,4,CityPersons/leftImg8bit_trainvaltest/leftImg8bit/val_all_in_folder/frankfurt_000000_001236_leftImg8bit.png,1024,2048,,,,
4,5,CityPersons/leftImg8bit_trainvaltest/leftImg8bit/val_all_in_folder/frankfurt_000000_001751_leftImg8bit.png,1024,2048,,,,
...,...,...,...,...,...,...,...,...
813,814,rand_cal/images/val/set03_V004_1095.png,480,640,,,,
814,815,rand_cal/images/val/set00_V007_1255.png,480,640,,,,
815,816,rand_cal/images/val/set00_V006_1590.png,480,640,,,,
816,817,rand_cal/images/val/set02_V005_1664.png,480,640,,,,


In [98]:
# Sanity check: show every annotation whose category is not 1; an empty frame
# means all annotations carry category_id 1.
ann_df_c.loc[ann_df_c['category_id'] != 1]

Unnamed: 0,id,image_id,category_id,iscrowd,ignore,bbox,height,width,area


In [99]:
# Normalise the flag columns of the merged validation annotations.
# Assign the replaced column back: `replace(..., inplace=True)` on a column
# selection is not guaranteed to update `ann_df_c` (no-op under copy-on-write).
ann_df_c["iscrowd"] = ann_df_c["iscrowd"].replace({1: True, 0: False})
# Rows without an `ignore` value get 0.
ann_df_c['ignore'] = ann_df_c['ignore'].fillna(0)
ann_df_c

Unnamed: 0,id,image_id,category_id,iscrowd,ignore,bbox,height,width,area
0,1,1,1,False,1.0,"[1839, 382, 31, 71]",1024,2048,2201
1,2,1,1,False,0.0,"[947, 406, 17, 40]",1024,2048,680
2,3,1,1,False,1.0,"[990, 374, 14, 17]",1024,2048,238
3,4,1,1,False,1.0,"[1914, 73, 88, 90]",1024,2048,7920
4,5,1,1,False,1.0,"[1834, 101, 65, 58]",1024,2048,3770
...,...,...,...,...,...,...,...,...,...
11135,11136,815,1,False,0.0,"[489, 162, 15, 45]",480,640,675
11136,11137,815,1,False,0.0,"[440, 163, 22, 47]",480,640,1034
11137,11138,815,1,False,0.0,"[414, 168, 11, 33]",480,640,363
11138,11139,816,1,False,0.0,"[54, 151, 18, 80]",480,640,1440


In [100]:
# Report how many images of each source dataset are present in `aug_img_df`.
# Each pair is (label to print, substring searched for in `file_name`); the
# `selected_caltech` and `wheelchair` labels are matched through the `rand_cal`
# and `selected_oi` path tokens respectively.
for label, path_token in (
    ('CityPersons', 'CityPersons'),
    ('selected_caltech', 'rand_cal'),
    ('WiderPerson', 'WiderPerson'),
    ('wheelchair', 'selected_oi'),
    ('CrowdHuman', 'CrowdHuman'),
    ('EuroCity', 'EuroCity'),
):
    # Summing the boolean mask counts the matches without building a filtered copy.
    n_images = aug_img_df['file_name'].str.contains(path_token).sum()
    print(f"{label}: #{n_images}")

CityPersons: #425
selected_caltech: #45
WiderPerson: #75
wheelchair: #11
CrowdHuman: #94
EuroCity: #168


### Save json file

In [101]:
# Assemble and write the validation split. The dictionary keeps the name
# `aug_data_train` even though it now holds validation data.
# NOTE(review): `cp_info` is not defined in the cells visible here - confirm it
# is still bound when running top to bottom.
aug_data_train = {
    'info': cp_info,
    'categories': cp_data['categories'],
    'images': aug_img_df.to_dict('records'),
    'annotations': ann_df_c.to_dict('records'),
}
with open('/home/test_pc/mmlab/mmdetection/datasets/random_val.json', mode='w') as fp:
    json.dump(obj=aug_data_train, fp=fp)

In [102]:
# Read the freshly written validation JSON back and re-count the images per
# dataset to confirm the file round-trips.
cp_val = '/home/test_pc/mmlab/mmdetection/datasets/random_val.json'
# Context manager so the handle is closed after parsing (it was left open before).
with open(cp_val) as f:
    cp_val_data = json.load(f)
# cp_val_licenses = cp_val_data['licenses']
cp_val_cat = cp_val_data['categories']
cp_val_images = cp_val_data['images']
cp_val_annotations = cp_val_data['annotations']
cp_val_img_df = pd.DataFrame(cp_val_images)
cp_val_ann_df = pd.DataFrame(cp_val_annotations)

# Each pair is (label to print, substring searched for in `file_name`).
for label, path_token in (
    ('CityPersons', 'CityPersons'),
    ('selected_caltech', 'rand_cal'),
    ('WiderPerson', 'WiderPerson'),
    ('wheelchair', 'selected_oi'),
    ('CrowdHuman', 'CrowdHuman'),
    ('EuroCity', 'EuroCity'),
):
    # Summing the boolean mask counts the matches without building a filtered copy.
    n_images = cp_val_img_df['file_name'].str.contains(path_token).sum()
    print(f"{label}: #{n_images}")

CityPersons: #425
selected_caltech: #45
WiderPerson: #75
wheelchair: #11
CrowdHuman: #94
EuroCity: #168


# data leakage check!

In [163]:
# Annotation files compared in the leakage check: the generated random train
# split versus the augmented test split.
random_train = '/home/test_pc/mmlab/mmdetection/datasets/random_train.json'
sem_test = '/home/test_pc/mmlab/mmdetection/datasets/aug_test.json'

In [164]:
# Load the random train split into image/annotation frames.
# Context manager so the handle is closed after parsing (it was left open before).
with open(random_train) as f_rtr:
    random_train_data = json.load(f_rtr)
random_train_cat = random_train_data['categories']
random_train_images = random_train_data['images']
random_train_annotations = random_train_data['annotations']
random_train_img_df = pd.DataFrame(random_train_images)
random_train_ann_df = pd.DataFrame(random_train_annotations)

In [165]:
# Load the test split into image/annotation frames.
# Context manager so the handle is closed after parsing (it was left open before).
with open(sem_test) as f_st:
    sem_test_data = json.load(f_st)
sem_test_cat = sem_test_data['categories']
sem_test_images = sem_test_data['images']
sem_test_annotations = sem_test_data['annotations']
sem_test_img_df = pd.DataFrame(sem_test_images)
sem_test_ann_df = pd.DataFrame(sem_test_annotations)

In [166]:
# CityPersons rows of the train and test image tables, selected by path substring.
random_train_cp = random_train_img_df.loc[lambda frame: frame['file_name'].str.contains('CityPersons')]
sem_test_cp = sem_test_img_df.loc[lambda frame: frame['file_name'].str.contains('CityPersons')]

In [167]:
# Compare bare file names (directory part stripped) between train and test;
# an empty intersection means no CityPersons image is shared.
# `rsplit('/', 1)[-1]` also copes with a name that has no '/', where `[1]` raised IndexError.
cp_tr = {el.rsplit('/', 1)[-1] for el in random_train_cp['file_name']}
cp_ts = {el.rsplit('/', 1)[-1] for el in sem_test_cp['file_name']}
print("set1 intersection set2 : ", cp_tr.intersection(cp_ts))

set1 intersection set2 :  set()


In [168]:
# Caltech rows of the train and test image tables, selected by path substring.
# NOTE(review): train is matched on 'rand_cal' but test on 'selected_caltech';
# if the test paths do not contain that token the check passes trivially - confirm.
random_train_cl = random_train_img_df.loc[lambda frame: frame['file_name'].str.contains('rand_cal')]
sem_test_cl = sem_test_img_df.loc[lambda frame: frame['file_name'].str.contains('selected_caltech')]

In [169]:
# Compare bare file names (directory part stripped) between train and test;
# an empty intersection means no Caltech image is shared.
# `rsplit('/', 1)[-1]` also copes with a name that has no '/', where `[1]` raised IndexError.
cl_tr = {el.rsplit('/', 1)[-1] for el in random_train_cl['file_name']}
cl_ts = {el.rsplit('/', 1)[-1] for el in sem_test_cl['file_name']}
print("set1 intersection set2 : ", cl_tr.intersection(cl_ts))

set1 intersection set2 :  set()


In [170]:
# WiderPerson rows of the train and test image tables, selected by path substring.
random_train_wp = random_train_img_df.loc[lambda frame: frame['file_name'].str.contains('WiderPerson')]
sem_test_wp = sem_test_img_df.loc[lambda frame: frame['file_name'].str.contains('WiderPerson')]

In [171]:
# Compare bare file names (directory part stripped) between train and test;
# an empty intersection means no WiderPerson image is shared.
# `rsplit('/', 1)[-1]` also copes with a name that has no '/', where `[1]` raised IndexError.
wp_tr = {el.rsplit('/', 1)[-1] for el in random_train_wp['file_name']}
wp_ts = {el.rsplit('/', 1)[-1] for el in sem_test_wp['file_name']}
print("set1 intersection set2 : ", wp_tr.intersection(wp_ts))

set1 intersection set2 :  set()


In [172]:
# Wheelchair rows of the train and test image tables, selected by path substring.
# NOTE(review): train is matched on 'selected_oi' but test on 'wheelchair';
# if the test paths do not contain that token the check passes trivially - confirm.
random_train_wl = random_train_img_df.loc[lambda frame: frame['file_name'].str.contains('selected_oi')]
sem_test_wl = sem_test_img_df.loc[lambda frame: frame['file_name'].str.contains('wheelchair')]

In [173]:
# Compare bare file names (directory part stripped) between train and test;
# an empty intersection means no wheelchair image is shared.
# `rsplit('/', 1)[-1]` also copes with a name that has no '/', where `[1]` raised IndexError.
wl_tr = {el.rsplit('/', 1)[-1] for el in random_train_wl['file_name']}
wl_ts = {el.rsplit('/', 1)[-1] for el in sem_test_wl['file_name']}
print("set1 intersection set2 : ", wl_tr.intersection(wl_ts))

set1 intersection set2 :  set()


In [174]:
# CrowdHuman rows of the train and test image tables, selected by path substring.
random_train_ch = random_train_img_df.loc[lambda frame: frame['file_name'].str.contains('CrowdHuman')]
sem_test_ch = sem_test_img_df.loc[lambda frame: frame['file_name'].str.contains('CrowdHuman')]

In [175]:
# Compare bare file names (directory part stripped) between train and test;
# an empty intersection means no CrowdHuman image is shared.
# `rsplit('/', 1)[-1]` also copes with a name that has no '/', where `[1]` raised IndexError.
ch_tr = {el.rsplit('/', 1)[-1] for el in random_train_ch['file_name']}
ch_ts = {el.rsplit('/', 1)[-1] for el in sem_test_ch['file_name']}
print("set1 intersection set2 : ", ch_tr.intersection(ch_ts))

set1 intersection set2 :  set()


In [176]:
# EuroCity rows of the train and test image tables, selected by path substring.
random_train_ec = random_train_img_df.loc[lambda frame: frame['file_name'].str.contains('EuroCity')]
sem_test_ec = sem_test_img_df.loc[lambda frame: frame['file_name'].str.contains('EuroCity')]

In [177]:
# Compare bare file names (directory part stripped) between train and test;
# an empty intersection means no EuroCity image is shared.
# `rsplit('/', 1)[-1]` also copes with a name that has no '/', where `[1]` raised IndexError.
ec_tr = {el.rsplit('/', 1)[-1] for el in random_train_ec['file_name']}
ec_ts = {el.rsplit('/', 1)[-1] for el in sem_test_ec['file_name']}
print("set1 intersection set2 : ", ec_tr.intersection(ec_ts))

set1 intersection set2 :  set()
