# Dataset preparation from Berkeley DeepDrive

In [None]:
# basic
import os
import pickle
import shutil
import sys
from tqdm import tqdm

# data
from collections import Counter
import json
import pandas as pd

## Config variables

In [None]:
# Base directory that contains the raw BDD100K download and receives the
# generated dataset folders.
root = '.'

# Source image folders, one per split, all living under the same parent.
_images_root = os.path.join(root, 'bdd100k_images', 'bdd100k', 'images', '100k')
train_images_path, val_images_path, test_images_path = (
    os.path.join(_images_root, split) for split in ('train', 'val', 'test')
)

# Output folders for the filtered subset, again one per split.
dataset_path = os.path.join(root, 'dataset')
train_dataset_path, val_dataset_path, test_dataset_path = (
    os.path.join(dataset_path, split) for split in ('train', 'val', 'test')
)

# Only images whose attributes match these values are kept by the
# filtering cells that read these two lists.
target_attribute_weathers = ['clear']
target_attribute_timeofdays = ['daytime']

## Load the train and val set

In [None]:
# Parse the det_20 training annotation file (det_train.json) into memory.
train_labels_file = os.path.join(
    root, 'bdd100k_det_20_labels_trainval', 'bdd100k', 'labels', 'det_20', 'det_train.json'
)
with open(train_labels_file, 'rb') as train_file:
    train_set_all = json.load(train_file)

# with open(os.path.join(root, 'bdd100k_det_20_labels_trainval', 'bdd100k', 'labels', 'det_20', 'det_val.json'), 'rb') as val_file:
#     val_set_all = json.load(val_file)

## Analyzing weather and timeofday attributes

In [None]:
# Collect the attribute dicts of every entry that has one, then tally how
# often each weather and time-of-day value occurs.
annotated_attributes = [entry['attributes'] for entry in train_set_all if 'attributes' in entry]
weathers = [attributes['weather'] for attributes in annotated_attributes]
timeofdays = [attributes['timeofday'] for attributes in annotated_attributes]

print(Counter(weathers))
print(Counter(timeofdays))

## Defining our target labels

In [None]:
# Flatten the category of every labelled instance across all entries that
# carry a 'labels' list, then count occurrences per category.
all_labels = [
    instance['category']
    for entry in train_set_all
    if 'labels' in entry
    for instance in entry['labels']
]

label_distribution = Counter(all_labels)
print(f'Label distribution: {label_distribution}')

# Categories the rest of the notebook filters on and reports.
target_labels = ['car', 'traffic sign', 'pedestrian']

## Populate train set

In [None]:
# Build `train_set`: images that exist on disk, match the target weather and
# time of day, and contain at least one target-category instance that is
# neither occluded nor truncated. Selection stops as soon as every target
# category appears in more than `threshold_instances` selected images.

# Number of selected images containing each target label, aligned
# index-for-index with `target_labels`.
label_train_instances = [0] * len(target_labels)
threshold_instances = 300
train_set = []


def _matches_target_conditions(image):
    """Return True if the image is labelled, on disk, and shot in the target conditions."""
    if 'labels' not in image or 'attributes' not in image:
        return False
    if not os.path.exists(os.path.join(train_images_path, image['name'])):
        return False
    attributes = image['attributes']
    return (attributes['weather'] in target_attribute_weathers
            and attributes['timeofday'] in target_attribute_timeofdays)


def _has_clear_target_instance(image):
    """Return True if a target-category instance is neither occluded nor truncated."""
    for label in image['labels']:
        if label['category'] not in target_labels:
            continue
        attributes = label['attributes']
        # The previous check (`occluded and truncated`) was inverted: it kept
        # only images whose instance was BOTH occluded and truncated.
        if not attributes['occluded'] and not attributes['truncated']:
            return True
    return False


for image in train_set_all:
    if _matches_target_conditions(image) and _has_clear_target_instance(image):
        # A clear target instance guarantees at least one target label is
        # present, so no separate "any target label" check is needed.
        present_categories = {label['category'] for label in image['labels']}
        for index, target_label in enumerate(target_labels):
            if target_label in present_categories:
                label_train_instances[index] += 1

        train_set.append(image)

    # Stop once every target label occurs in more than `threshold_instances`
    # selected images.
    if all(count > threshold_instances for count in label_train_instances):
        break

print(f'train set size: {len(train_set)}')

## Populate val set

In [None]:
# label_val_instances = [0, 0, 0]
# threshold_instances = 40
# val_set = []

# for image in val_set_all:
#     # making sure this image actually exists
#     if os.path.exists(os.path.join(val_images_path, image['name'])) and 'labels' in image:
#         if 'attributes' in image:
#             # filter images with our required weather and timeofday attributes
#             if image['attributes']['weather'] in target_attribute_weathers and image['attributes']['timeofday'] in target_attribute_timeofdays:
#                 target_labels_presence = [True if target_label in [label['category'] for label in image['labels']] else False for target_label in target_labels]
#                 if any(target_labels_presence):
#                     if target_labels_presence[0]:
#                         label_val_instances[0] += 1
#                     if target_labels_presence[1]:
#                         label_val_instances[1] += 1
#                     if target_labels_presence[2]:
#                         label_val_instances[2] += 1
#                     val_set.append(image)
    
#     # if we have atleast 20 images for each label, break.
#     if all(True if count > threshold_instances else False for count in label_val_instances):
#         break
        
# print(f'val set size: {len(val_set)}')

## Instance distribution in train and val set

In [None]:
# Gather the category of every labelled instance in the selected train set
# and report how many instances each target label has.
subset_labels = []
for entry in train_set:
    subset_labels.extend(instance['category'] for instance in entry.get('labels', ()))

label_distribution = Counter(subset_labels)
print("Train set label instance distribution:")
for wanted_label in target_labels:
    instance_count = label_distribution[wanted_label]
    print(f"{wanted_label} : {instance_count}")

# subset_labels = []
# for image in val_set:
#     if 'labels' in image:
#         for label in image['labels']:
#             subset_labels.append(label['category'])
            
# label_distribution = Counter(subset_labels)
# print(f"Val set label instance distribution:")
# for target_label in target_labels:
#     print(f"{target_label} : {label_distribution[target_label]}")

## Copy the shortlisted images into our dataset folder for easier access

In [None]:
# Copy every shortlisted training image into <dataset_path>/train so the
# selected subset sits in one folder.
destination = os.path.join(dataset_path, 'train')

# shutil.copy does not create missing directories. Without this, a missing
# folder makes the copy fail, or makes every image overwrite a single file
# named 'train', after which os.listdir below raises.
os.makedirs(destination, exist_ok=True)

for image in train_set:
    source = os.path.join(train_images_path, image['name'])
    shutil.copy(source, destination)

#print(os.listdir(destination))
print("train images folder size:", len(os.listdir(destination)))
    
# destination = os.path.join(dataset_path, 'val')
# for image in val_set:
#     source = os.path.join(val_images_path, image['name'])
#     dest = shutil.copy(source, destination)
    
# #print(os.listdir(destination))
# print("val images folder size:", len(os.listdir(destination)))

## Dump the train and val sets to file

In [None]:
# Index the selected annotations by image file name so they can be looked up
# directly, then persist the mapping with pickle.
train_set_hash = {entry['name']: entry for entry in train_set}
val_set_hash = {}

with open('train_set.pkl', 'wb') as pickle_file:
    pickle.dump(train_set_hash, pickle_file)
    
# for image in val_set:
#     val_set_hash[image['name']] = image
    
# with open('val_set.pkl', 'wb') as f:
#     pickle.dump(val_set_hash, f)

In [None]:
# with open('train_set.pkl', 'rb') as f:
#     train_set_loaded = pickle.load(f)
    
# with open('val_set.pkl', 'rb') as f:
#     val_set_loaded = pickle.load(f)

In [None]:
# NOTE(review): `img` is not defined in any cell visible here, so this loop
# raises NameError unless `img` is bound elsewhere. Presumably each image was
# meant to be loaded (e.g. from its 'name' entry) before printing its size —
# TODO confirm the intended loader and bind `img` inside the loop.
for image in train_set:
    print(img.size)