In [3]:
import os
import random
import pickle

import numpy as np

from PIL import Image

In [21]:
def split_accident_dataset(root, train_ratio=0.4, validation_ratio=0.2):
    
    random.seed(42)

    all_data_dir = os.path.join(root, 'all_data')
    no_accident_dir = os.path.join(all_data_dir, '0_no_accident')
    accident_dir = os.path.join(all_data_dir, '1_accident')

    
    # Read the data from the accident and no_accident directories
    accident_data = []
    for filename in os.listdir(accident_dir):
        img_path = os.path.join(accident_dir, filename)
        with Image.open(img_path) as img:
            np_img = np.array(img)
            accident_data.append((np_img, 1))

    no_accident_data = []
    for filename in os.listdir(no_accident_dir):
        img_path = os.path.join(no_accident_dir, filename)
        with Image.open(img_path) as img:
            np_img = np.array(img)
            no_accident_data.append((np_img, 0))
    
    print("No Accident :: 0")
    print("Accident :: 1" )
    
    
    all_data = accident_data + no_accident_data
    random.shuffle(all_data)

    # Split the data into train, validation and test sets
    train_split_index = int(len(all_data) * train_ratio)
    validation_split_index = train_split_index + int(len(all_data) * validation_ratio)
        
    train_data = all_data[ : train_split_index]
    validation_data = all_data[train_split_index : validation_split_index]
    test_data = all_data[validation_split_index : ]

    # Save the datasets as pickle files
    with open(os.path.join(root, 'accident_train_data.pkl'), 'wb') as f:
        pickle.dump(train_data, f)

    with open(os.path.join(root, 'accident_validation_data.pkl'), 'wb') as f:
        pickle.dump(validation_data, f)
        
    with open(os.path.join(root, 'accident_test_data.pkl'), 'wb') as f:
        pickle.dump(test_data, f)
        
    
    return train_data, validation_data, test_data
        
def split_cifar_dataset(root, total_cnt=7000, train_ratio=0.6, validation_ratio=0.2):
    
    random.seed(42)

    cifar_dir = os.path.join(root, 'all_data/2_cifar7')

    label = 2
    cifar_data = []
    for folder in os.listdir(cifar_dir):
        
        folder_dir = os.path.join(cifar_dir, folder)
        
        if(os.path.isdir(folder_dir) == False):
            continue
        
        print(folder_dir, " :: ", label)
        
        for filename in os.listdir(folder_dir):
            img_path = os.path.join(folder_dir, filename)
            with Image.open(img_path) as img:
                np_img = np.array(img)
                cifar_data.append((np_img, label))
        label += 1

    random.shuffle(cifar_data)
    
    cifar_data = cifar_data[:total_cnt]
    

    # Split the data into train, validation and test sets
    train_split_index = int(len(cifar_data) * train_ratio)
    validation_split_index = train_split_index + int(len(cifar_data) * validation_ratio)

    train_data = cifar_data[:train_split_index]
    validation_data = cifar_data[train_split_index : validation_split_index]
    test_data = cifar_data[validation_split_index:]

    # Save the datasets as pickle files
    with open(os.path.join(root, 'cifar_train_data.pkl'), 'wb') as f:
        pickle.dump(train_data, f)

    with open(os.path.join(root, 'cifar_validation_data.pkl'), 'wb') as f:
        pickle.dump(validation_data, f)
    
    with open(os.path.join(root, 'cifar_test_data.pkl'), 'wb') as f:
        pickle.dump(test_data, f)
        
    return train_data, validation_data, test_data


In [22]:
accident_train_data, accident_validation_data, accident_test_data = split_accident_dataset(root="crime_canary_dataset", train_ratio=0.6, validation_ratio=0.2)

No Accident :: 0
Accident :: 1


In [23]:
tot_cifar_data = 7000
tot_accident_data = len(accident_train_data) + len(accident_validation_data) + len(accident_test_data)

cifar_train_ratio = (len(accident_train_data))/tot_accident_data
cifar_validation_ratio = (len(accident_validation_data))/tot_accident_data

cifar_train_data, cifar_validation_data, cifar_test_data = split_cifar_dataset(root="crime_canary_dataset", total_cnt=int(tot_cifar_data), train_ratio=cifar_train_ratio, validation_ratio=cifar_validation_ratio)

crime_canary_dataset/all_data/2_cifar7/cat  ::  2
crime_canary_dataset/all_data/2_cifar7/dog  ::  3
crime_canary_dataset/all_data/2_cifar7/bird  ::  4
crime_canary_dataset/all_data/2_cifar7/ship  ::  5
crime_canary_dataset/all_data/2_cifar7/frog  ::  6
crime_canary_dataset/all_data/2_cifar7/horse  ::  7
crime_canary_dataset/all_data/2_cifar7/deer  ::  8


In [5]:
def split_3_cat_datasets(root_dir, train_ratio=0.6, val_ratio=0.2):
    
    random.seed(42)
    
    main_folder = 'accident_dataset'
    
    all_datasets = {}
    folders = ['accident_detection', 'accident_severity', 'vehicles_in_accidents']
    for folder in folders:
        path = os.path.join(root_dir, main_folder)
        path = os.path.join(path, folder)
        subfolders = sorted(os.listdir(path))
        labels = {subfolder: i for i, subfolder in enumerate(subfolders)}
        
        print(folder)
        print("------------------------")
        print(labels)
        print()
    
        train_data = []
        val_data = []
        test_data = []

        for subfolder, label in labels.items():
            subfolder_path = os.path.join(path, subfolder)
         
            files = os.listdir(subfolder_path)
            random.shuffle(files)

            n = len(files)
            n_train = int(train_ratio * n)
            n_val = int(val_ratio * n)
            n_test = n - n_train - n_val

            # Create the train dataset
            for file in files[:n_train]:
                img_path = os.path.join(subfolder_path, file)
                with Image.open(img_path) as img:
                    np_img = np.array(img)
                    train_data.append((np_img, label))

            # Create the validation dataset
            for file in files[n_train:n_train + n_val]:
                img_path = os.path.join(subfolder_path, file)
                with Image.open(img_path) as img:
                    np_img = np.array(img)
                    val_data.append((np_img, label))

            # Create the test dataset
            for file in files[n_train + n_val:]:
                img_path = os.path.join(subfolder_path, file)
                with Image.open(img_path) as img:
                    np_img = np.array(img)
                    test_data.append((np_img, label))

        with open(os.path.join(root_dir, folder + '_train.pkl'), 'wb') as f:
            random.shuffle(train_data)
            pickle.dump(train_data, f)
        with open(os.path.join(root_dir, folder + '_validation.pkl'), 'wb') as f:
            random.shuffle(val_data)
            pickle.dump(val_data, f)
        with open(os.path.join(root_dir, folder + '_test.pkl'), 'wb') as f:
            random.shuffle(test_data)
            pickle.dump(test_data, f)

        all_datasets[folder + '_train'] = train_data
        all_datasets[folder + '_validation'] = val_data
        all_datasets[folder + '_test'] = test_data

    return all_datasets

In [6]:
all_datasets = split_3_cat_datasets(root_dir="datasets_for_comparison", train_ratio=0.6, val_ratio=0.2)

accident_detection
------------------------
{'0_no_accident': 0, '1_accident': 1}

accident_severity
------------------------
{'0_low': 0, '1_medium': 1, '2_high': 2}

vehicles_in_accidents
------------------------
{'0_motorcycle': 0, '1_light': 1, '2_heavy': 2}

