In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import time
import shutil
import cv2
import os

from sklearn.model_selection import train_test_split

In [2]:
# Load the labeled car annotations from the CSV into the notebook-level dataframe `car_df`.
# NOTE(review): the preview below shows an "Unnamed: 0" column, which usually means the CSV
# was written with its index included — confirm whether that column is needed.
car_df = pd.read_csv("./data/labeled_car_data.csv")

# Display the first few rows as a quick sanity check of the loaded columns.
car_df.head()

Unnamed: 0,filename,min_x,min_y,max_x,max_y,target,unknown,car_names
0,000001.jpg,112,7,853,717,1,0,AM General Hummer SUV 2000
1,000002.jpg,48,24,441,202,1,0,AM General Hummer SUV 2000
2,000003.jpg,7,4,277,180,1,0,AM General Hummer SUV 2000
3,000004.jpg,33,50,197,150,1,0,AM General Hummer SUV 2000
4,000005.jpg,5,8,83,58,1,0,AM General Hummer SUV 2000


In [3]:
# Preview how many rows each of the planned split proportions corresponds to.
total_rows = len(car_df.index)

print("\n".join(
    f"{label} percent: {total_rows * share}"
    for label, share in (("65", 0.65), ("20", 0.20), ("15", 0.15))
))

65 percent: 10520.25
20 percent: 3237.0
15 percent: 2427.75


In [4]:
# Sorted list of the distinct values found in the 'target' column of car_df.
# NOTE(review): no other cell visible in this notebook reads this notebook-level variable;
# the helper functions below compute their own local copies.
unique_classes = list(np.unique(car_df['target'].to_numpy()))

In [5]:
# ============================================================================================================================
# Later, we will use the keras image_dataset_from_directory function to efficiently read in the image files into
# tensorflow datasets that will be used for model training. This function has an option to automatically infer the class
# labels based on a directory structure. When this option is used, it is very important that the folders for each class be
# labeled such that their alpha numeric ordering is the same as the final class labels you want them to have.
#
# This means if we want our classes to remain labeled in the same order as the .csv file we created in the previous notebook, 
# we need to make sure the folder for class 1 is the first folder found alpha-numerically, and the folder for class 196 is the
# last alpha-numerically.
#
# This function is used to create a dictionary mapping each target class to the zero-padded
# suffix (001 - 196) used in the folder names class_001 - class_196, to fulfill the above mentioned requirements.
# ============================================================================================================================
def make_class_num_to_folder_ext_map(df, target_col='target'):
    """Map each class label found in ``df`` to a zero-padded folder suffix.

    Labels are converted to strings and left-padded with zeros to a width of
    three characters (1 -> "001", 12 -> "012", 150 -> "150") so that the
    alphanumeric ordering of the resulting folder names matches the numeric
    ordering of the labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe containing the class label column.
    target_col : str, default 'target'
        Name of the column holding the class labels.

    Returns
    -------
    dict
        ``{class_label: zero_padded_suffix}`` with keys in sorted label order.
    """
    # Read the labels from the dataframe that was passed in. The previous version
    # ignored its argument and read the notebook-level ``car_df`` global instead.
    unique_classes = np.unique(df[target_col].to_numpy())

    # zfill(3) pads one- and two-character labels and leaves longer ones untouched.
    return {class_num: str(class_num).zfill(3) for class_num in unique_classes}

In [6]:
# ============================================================================================================================
# This function is used to create training, validation and test directories, each of which contains
# one subfolder per target class (196 for this dataset), labeled class_001 to class_196.
# ============================================================================================================================
def make_class_directories(df, base_destination_folder='./data/organized/'):
    """Create the train/val/test directory trees with one sub folder per class.

    For each of the ``train``, ``val`` and ``test`` folders under
    ``base_destination_folder``, a ``class_<suffix>`` sub folder is created for
    every entry returned by ``make_class_num_to_folder_ext_map(df)``.
    Folders that already exist are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe passed to ``make_class_num_to_folder_ext_map`` to obtain the
        class folder suffixes.
    base_destination_folder : str, default './data/organized/'
        Root folder under which the ``train``, ``val`` and ``test`` folders
        are created.
    """
    # The class list comes from the mapping built for ``df``; the previous version
    # read the notebook-level ``car_df`` global here instead of its argument.
    folder_ext_map = make_class_num_to_folder_ext_map(df)

    for dataset_type in ('train', 'val', 'test'):
        for folder_ext in folder_ext_map.values():
            class_folder = os.path.join(base_destination_folder, dataset_type, f"class_{folder_ext}")
            os.makedirs(class_folder, exist_ok=True)

In [7]:
# ============================================================================================================================
# This function is used to create three dataframes: train_df, val_df and test_df, which contain the Stanford Dataset
# car image filenames split according to the desired training, validation and test proportions.
#
# Default proportions are 65% train, 20% validation, 15% test.
# ============================================================================================================================
def create_train_val_test_dfs(df, target_column = 'target', train_pct=0.65, val_pct=0.20, stratify_target=True):
    """Split ``df`` into training, validation and test dataframes.

    The split is done in two steps: ``df`` is first split into a training part
    and a remainder, then the remainder is split into validation and test parts.
    Whatever is left after ``train_pct`` and ``val_pct`` becomes the test set.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to split.
    target_column : str, default 'target'
        Column used for stratification when ``stratify_target`` is True.
    train_pct : float, default 0.65
        Fraction of ``df`` assigned to the training set.
    val_pct : float, default 0.20
        Fraction of ``df`` assigned to the validation set.
    stratify_target : bool, default True
        When True, both splits are stratified on ``target_column``.

    Returns
    -------
    tuple
        ``(train_df, val_df, test_df)``.

    Raises
    ------
    ValueError
        If the proportions are not each strictly between 0 and 1, or if they
        leave nothing for the test set.
    """
    # Fail early with a readable message instead of an opaque error from the
    # second train_test_split call.
    if not (0 < train_pct < 1) or not (0 < val_pct < 1) or train_pct + val_pct >= 1:
        raise ValueError(
            "train_pct and val_pct must each be between 0 and 1 and sum to less than 1 "
            f"(got train_pct={train_pct}, val_pct={val_pct})"
        )

    # Fraction of the data that remains after allocating the training set.
    remaining_pct = 1 - train_pct

    # Fraction of that remainder that goes to validation in the second split.
    # The 3-decimal rounding is kept as-is so the split stays identical to earlier runs.
    val_share_of_remainder = round(val_pct / remaining_pct, 3)

    # Split into a training dataframe and a validation + test dataframe.
    # stratify=None is the same as not stratifying, so one code path serves both modes.
    train_df, val_test_df = train_test_split(
        df,
        train_size=train_pct,
        stratify=df[target_column] if stratify_target else None,
        random_state=42,
    )

    # Split the validation + test dataframe into separate validation and test dataframes.
    val_df, test_df = train_test_split(
        val_test_df,
        train_size=val_share_of_remainder,
        stratify=val_test_df[target_column] if stratify_target else None,
        random_state=2,
    )

    return train_df, val_df, test_df

In [8]:
# ============================================================================================================================
# This function returns a dictionary that maps each target class to a list of filenames that contain examples of cars 
# from that class.
# ============================================================================================================================
def build_class_filename_dict(df, target_col = 'target'):
    """Group the image filenames in ``df`` by class label.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe with a ``'filename'`` column and a class label column.
    target_col : str, default 'target'
        Name of the column holding the class labels.

    Returns
    -------
    dict
        ``{class_label: [filename, ...]}`` with keys in sorted label order and
        filenames in their original row order.
    """
    # Use the caller-supplied label column; the previous version accepted
    # ``target_col`` but always read the hard-coded 'target' column.
    labels = df[target_col]

    return {
        target_class: list(df.loc[labels == target_class, 'filename'].to_numpy())
        for target_class in np.unique(labels.to_numpy())
    }

In [9]:
# ============================================================================================================================
# This function will take as input the train_df, test_df or val_df dataframe, and will copy the associated image files
# to the correct folders.
# ============================================================================================================================
def copy_files_to_organized_folders(df, dataset_type, target_col, base_source_folder, base_destination_folder):
    """Copy the image files listed in ``df`` into their per-class folders.

    Each file is copied from ``base_source_folder`` to
    ``<base_destination_folder>/<dataset_type>/class_<suffix>/``, where the
    suffix comes from ``make_class_num_to_folder_ext_map``.

    Parameters
    ----------
    df : pandas.DataFrame
        The train, validation or test dataframe whose files should be copied.
    dataset_type : str
        One of ``'train'``, ``'val'`` or ``'test'``.
    target_col : str
        Name of the class label column, forwarded to ``build_class_filename_dict``.
    base_source_folder : str
        Folder that currently holds the image files.
    base_destination_folder : str
        Root folder that contains (or will contain) the dataset_type folders.

    Returns
    -------
    int or None
        ``-1`` (after printing an error banner) when ``dataset_type`` is not
        valid, otherwise ``None``.
    """
    valid_dataset_types = ['train', 'test', 'val']

    if dataset_type not in valid_dataset_types:
        print("\n================================ Error =======================================")
        print(f"{dataset_type} is an invalid dataset_type")
        # The accepted value is 'val', not 'validation'; the message now says so.
        print("dataset_type parameter must be one of 'train', 'val', or 'test'")
        print("=======================================================================\n")
        return -1

    # Base path for the train, val or test set. os.path.join works whether or not
    # the caller included a trailing slash on the destination folder.
    base_destination_path = os.path.join(base_destination_folder, dataset_type)

    # Dictionary mapping each target class to the filenames in this dataset for that class.
    filename_dict = build_class_filename_dict(df, target_col = target_col)

    # Dictionary mapping each target class to its zero-padded folder suffix.
    class_folder_ext_map = make_class_num_to_folder_ext_map(df)

    for class_num, filenames in filename_dict.items():
        class_folder = os.path.join(base_destination_path, f"class_{class_folder_ext_map[class_num]}")

        # Make sure the class folder exists so the copy also works for a destination
        # whose directory tree has not been created yet.
        os.makedirs(class_folder, exist_ok=True)

        for filename in filenames:
            shutil.copy(os.path.join(base_source_folder, filename), class_folder)

    return

In [10]:
# ============================================================================================================================
# All other functions in this notebook are used to support this function. 
# 
# This function executes the following process:
#
# 1. Make directories for training, validation and test data. Each directory containing 196 sub folders labeled
# class_001 to class_196 to contain examples of images from each of the 196 target classes
#
# 2. Split the data into training, validation and test sets with the desired proportions.
#
# 3. Copy images files to the appropriate folders.
#
# ============================================================================================================================
def create_file_structure(df, verbose=False, target_col='target', base_image_source_folder="./data/car_ims/",
                          train_pct=0.65, val_pct=0.20, stratify_target=True, base_destination_folder = "./data/organized/"): 
    """Run the full file-organization pipeline and return the three splits.

    Steps, in order:
      1. Create the class directories via ``make_class_directories(df)``.
      2. Split ``df`` with ``create_train_val_test_dfs``.
      3. Copy the train, val and test images with ``copy_files_to_organized_folders``.

    When ``verbose`` is True a banner is printed before each step and a final
    banner reports the elapsed wall-clock time.

    Note: ``make_class_directories`` is called with only ``df``, so the class
    folders are created under that function's own base paths rather than under
    ``base_destination_folder``.

    Returns
    -------
    tuple
        ``(train_df, val_df, test_df)`` as produced by ``create_train_val_test_dfs``.
    """

    def show_banner(*messages):
        # Print the messages framed by the separator lines used throughout this notebook.
        print("\n==============================================================")
        for message in messages:
            print(message)
        print("==============================================================\n")

    start_time = time.time()

    if verbose:
        show_banner("Creating train, val and test data directories",
                    "Each folder has a sub folder for each of the target classes.")

    make_class_directories(df)

    if verbose:
        show_banner("Splitting the data into training, validation and test sets...",
                    f"Training: {round(train_pct * 100, 2)}%",
                    f"Validation: {round(val_pct * 100, 2)}%",
                    f"Test: {round((1 - (train_pct + val_pct))* 100, 2)}%")

    train_df, val_df, test_df = create_train_val_test_dfs(df, target_column = target_col,
                                                          train_pct=train_pct, val_pct=val_pct,
                                                          stratify_target=stratify_target)

    # One entry per split: (dataframe, dataset_type folder, progress message).
    copy_stages = (
        (train_df, 'train', "Copying the training files to the appropriate folders..."),
        (val_df, 'val', "Copying the validation files to the appropriate folders..."),
        (test_df, 'test', "Copying the test files to the appropriate folders..."),
    )

    for split_df, split_name, progress_message in copy_stages:
        if verbose:
            show_banner(progress_message)

        copy_files_to_organized_folders(split_df,
                                        dataset_type=split_name,
                                        target_col=target_col,
                                        base_source_folder = base_image_source_folder,
                                        base_destination_folder = base_destination_folder)

    if verbose:
        show_banner("Finished file organization!",
                    f"Total Elapsed time: {time.time() - start_time}")

    return train_df, val_df, test_df

In [11]:
# Run the full pipeline on car_df with the default proportions and paths, printing progress banners.
# This copies image files on disk; the returned dataframes hold the rows assigned to each split.
train_df, val_df, test_df = create_file_structure(car_df, verbose=True)


Creating train, val and test data directories
Each folder has a sub folder for each of the target classes.


Splitting the data into training, validation and test sets...
Training: 65.0%
Validation: 20.0%
Test: 15.0%


Copying the training files to the appropriate folders...


Copying the validation files to the appropriate folders...


Copying the test files to the appropriate folders...


Finished file organization!
Total Elapsed time: 16.46264338493347

