In [15]:
from pathlib import Path
import os
import shutil
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedGroupKFold

# Preprocessing

The dataset can be downloaded [here](https://www.kaggle.com/datasets/sshikamaru/udacity-self-driving-car-dataset).  Once downloaded, set your file path.

In [16]:
# Your file path here
folder_path = Path(r"C:\self-driving-car\data")

We can now read in the dataframe that contains bounding boxes and class labels for each image.

In [17]:
# annotations
an_df = pd.read_csv(folder_path / "_annotations.csv")

an_df.head()

Unnamed: 0,filename,width,height,class,xmin,ymin,xmax,ymax
0,1478900859981702684_jpg.rf.6830635c7d919747563...,512,512,car,291,247,370,331
1,1478900859981702684_jpg.rf.6830635c7d919747563...,512,512,pedestrian,270,235,293,321
2,1478900859981702684_jpg.rf.6830635c7d919747563...,512,512,car,0,266,13,327
3,1478900859981702684_jpg.rf.6830635c7d919747563...,512,512,car,25,258,106,304
4,1478900859981702684_jpg.rf.6830635c7d919747563...,512,512,car,111,259,135,289


In [18]:
class_list = an_df['class'].unique().tolist()
trafficLight_list = [item for item in class_list if 'trafficLight' in item]

print(f"The classes are: {class_list}")
print()
print(f"Of the {len(class_list)} total classes, {len(trafficLight_list)} are traffic lights.")

The classes are: ['car', 'pedestrian', 'biker', 'truck', 'trafficLight-Red', 'trafficLight', 'trafficLight-Green', 'trafficLight-RedLeft', 'trafficLight-GreenLeft', 'trafficLight-Yellow', 'trafficLight-YellowLeft']

Of the 11 total classes, 7 are traffic lights.


To make classification easier, lets convert all of the traffic light labels to 'trafficLight'.

In [19]:
to_trafficLight_list = ['trafficLight-Green', 'trafficLight-Red', 'trafficLight-GreenLeft',
                        'trafficLight-Yellow', 'trafficLight-RedLeft', 'trafficLight-YellowLeft']

an_df['class'] = an_df['class'].replace(to_trafficLight_list, 'trafficLight')

print(an_df['class'].value_counts())

class
car             127873
trafficLight     34277
pedestrian       21491
truck             7194
biker             3704
Name: count, dtype: int64


Next, notice there are many images with background (i.e. no object present).  These images are not represented in the dataframe.  We will create a new dataframe that contains all images.  This new dataframe will be important as it will be used to create a stratified train/test split.

In [20]:
# image location
image_path = folder_path / "export"

# get all file names
folder_files = set(os.listdir(image_path))

# file names in an_df
df_files = set(an_df['filename'])

# missing files
missing_files = folder_files - df_files

print(f"There is a total of {len(folder_files)} images but the dataframe only contains the {len(df_files)} images that contain an object.  These {len(missing_files)} images will be added.")

There is a total of 29800 images but the dataframe only contains the 26300 images that contain an object.  These 3500 images will be added.


In [21]:
# Create dataframe containing background images
missing_df = pd.DataFrame(missing_files, columns=['filename'])

missing_df['width'] = 512
missing_df['height'] = 512
missing_df['class'] = "empty"

missing_df.head()

Unnamed: 0,filename,width,height,class
0,1478895295619915949_jpg.rf.f772e0ae3974183c1bc...,512,512,empty
1,1478900697736227479_jpg.rf.8404d9f31ff24c23456...,512,512,empty
2,1478898162351724560_jpg.rf.ba26fdb7b7990f23a22...,512,512,empty
3,1478898754207135490_jpg.rf.YHqVVsjBc3Qo6GgL4q8...,512,512,empty
4,1478895557841565024_jpg.rf.vgOJRsGlh3MzaqvYIPy...,512,512,empty


In [22]:
# Concatenate the two dataframes to get a complete dataframe containing all images
df_full = pd.concat([an_df, missing_df]).reset_index(drop=True)

df_full

Unnamed: 0,filename,width,height,class,xmin,ymin,xmax,ymax
0,1478900859981702684_jpg.rf.6830635c7d919747563...,512,512,car,291.0,247.0,370.0,331.0
1,1478900859981702684_jpg.rf.6830635c7d919747563...,512,512,pedestrian,270.0,235.0,293.0,321.0
2,1478900859981702684_jpg.rf.6830635c7d919747563...,512,512,car,0.0,266.0,13.0,327.0
3,1478900859981702684_jpg.rf.6830635c7d919747563...,512,512,car,25.0,258.0,106.0,304.0
4,1478900859981702684_jpg.rf.6830635c7d919747563...,512,512,car,111.0,259.0,135.0,289.0
...,...,...,...,...,...,...,...,...
198034,1478895577836747420_jpg.rf.21a96e931edea44c63a...,512,512,empty,,,,
198035,1478895816064189218_jpg.rf.4e4c92626bc811e05b7...,512,512,empty,,,,
198036,1478898542830459757_jpg.rf.192c1ccaa5013aa03b4...,512,512,empty,,,,
198037,1478896035439710748_jpg.rf.a232d0403973df947f4...,512,512,empty,,,,


We will now create a stratified train/test split of the images.

In [23]:
# Need to set groups by file name otherwise an image can end up in both the train and test split.
groups = df_full['filename']

X = df_full.drop(columns=['class'])
y = df_full['class']

sgkf = StratifiedGroupKFold(n_splits=3, shuffle=True, random_state=724)
# Take the first fold as test set
tr_idx, te_idx = next(sgkf.split(X, y, groups=groups))
X_tr, X_te = X.iloc[tr_idx], X.iloc[te_idx]
y_tr, y_te = y.iloc[tr_idx], y.iloc[te_idx]


print(f"Train size: {len(tr_idx)}")
print(f"Test size: {len(te_idx)}")

# Verify no group overlap
print("Shared groups (should be empty):", set(groups[tr_idx]) & set(groups[te_idx]))

Train size: 131990
Test size: 66049
Shared groups (should be empty): set()


Now create train/test folders with the corresponding images and dataframe!

In [24]:
# Create location for train/test files
train_path = folder_path / 'train'
test_path = folder_path / 'test'

# create folders if they don't exist
os.makedirs(train_path, exist_ok=True)
os.makedirs(test_path, exist_ok=True)

# create training/testing annotated files
df_full.iloc[tr_idx].to_csv(train_path / 'train_annotate.csv', index=False)
df_full.iloc[te_idx].to_csv(test_path / 'test_annotate.csv', index=False)

# copy files to train folder
for fname in X_tr['filename']:
    src_path = os.path.join(image_path, fname)
    dst_path = os.path.join(train_path, fname)

    if os.path.exists(src_path):
        shutil.copy2(src_path, dst_path)   # preserves metadata
    else:
        print(f"WARNING: {fname} not found in source folder")


# copy files to test folder
for fname in X_te['filename']: 
    src_path = os.path.join(image_path, fname)
    dst_path = os.path.join(test_path, fname)

    if os.path.exists(src_path):
        shutil.copy2(src_path, dst_path)   # preserves metadata
    else:
        print(f"WARNING: {fname} not found in source folder")