## 1. Imports

In [1]:
import os

# Step one directory up so that the relative "data/..." paths used by the
# following cells resolve.
# NOTE: this is relative to the current working directory, so executing this
# cell a second time in the same kernel moves up one more level.
os.chdir(os.pardir)

In [1]:
import ast
import shutil
from typing import Dict, List

import pandas as pd

## 2. Define necessary variables

In [2]:
# Image folders (train / val), relative to the current working directory.
train_images_dir, val_images_dir = (
    os.path.join("data", "bdd100k_images_100k", "bdd100k", "images", "100k", split)
    for split in ("train", "val")
)

# Text-label folders (train / val).
train_labels_dir, val_labels_dir = (
    os.path.join("data", "bdd100k_labels_txt", "bdd100k", "labels", split)
    for split in ("train", "val")
)

# Label tables (train / val) loaded from CSV.
train_label_df, val_label_df = (
    pd.read_csv(os.path.join("data", "bdd100k_labels_csv", f"{split}_labels.csv"))
    for split in ("train", "val")
)

# Attribute values to sample from, keyed by category.
# In `save_data`, every key except "label" is used as a column name whose
# values are compared against the listed strings; the "label" entries are
# used as names of boolean flag columns.
attributes = dict(
    weather=[
        "clear",
        "rainy",
        "undefined",
        "snowy",
        "overcast",
        "partly cloudy",
        "foggy",
    ],
    scene=[
        "city street",
        "highway",
        "residential",
        "parking lot",
        "undefined",
        "tunnel",
        "gas stations",
    ],
    timeofday=["daytime", "dawn/dusk", "night", "undefined"],
    label=["occluded", "truncated", "small", "medium", "large", "uncertain"],
)

## 3. Save up to `n_samples` images (and their labels) per attribute value to disk (used later with FiftyOne)

In [3]:
def _select_sub_category_rows(
    df: pd.DataFrame, category: str, sub_category: str
) -> pd.DataFrame:
    """Return the rows of ``df`` that belong to ``category`` / ``sub_category``."""
    if category != "label":
        # Plain attribute column: keep rows whose value equals the sub-category.
        return df[df[category] == sub_category]

    # "label" sub-categories are boolean flag columns on each row.
    flagged = df[sub_category].eq(True)
    if sub_category == "uncertain":
        return df[flagged]
    # Every other flag only counts when the row is not marked uncertain.
    return df[flagged & df["uncertain"].eq(False)]


def save_data(
    df: pd.DataFrame,
    image_dir: str,
    label_dir: str,
    attributes: Dict[str, List[str]],
    save_path: str,
    n_samples: int = 100,
    random_state: int = 42,
) -> None:
    """Copy a sample of images per attribute value and write their label files.

    For every ``category`` / ``sub_category`` pair in ``attributes`` the
    matching rows of ``df`` are selected, up to ``n_samples`` distinct
    ``file_name`` values are drawn, and for each drawn file:

    * the image is copied from ``image_dir`` to
      ``<save_path>/images/<category>/<sub_category>/<file_name>``;
    * a text file with one ``"<label> <bbox values...>"`` line per matching
      row is written to ``<save_path>/labels/<category>/<sub_category>/``.

    Sub-categories without any matching row are skipped. A ``/`` inside a
    sub-category name is replaced by ``_`` in the folder name
    (``"dawn/dusk"`` -> ``"dawn_dusk"``).

    Args:
        df: Label table with at least the columns ``file_name``, ``label``,
            ``bbox``, every non-"label" key of ``attributes``, and (when the
            "label" category is requested) one boolean column per listed
            sub-category plus ``uncertain``.
        image_dir: Directory that contains the source images.
        label_dir: Unused; kept so existing callers keep working.
        attributes: Mapping from category name to its sub-category values.
        save_path: Root directory for the copied images and label files.
        n_samples: Maximum number of distinct images per sub-category.
        random_state: Seed used when drawing the sample.

    Raises:
        FileNotFoundError: If a sampled image is missing from ``image_dir``.
        ValueError: If a string ``bbox`` is not a plain Python literal.
    """
    for category, sub_categories in attributes.items():
        for sub_category in sub_categories:
            sub_category_df = _select_sub_category_rows(df, category, sub_category)
            if sub_category_df.empty:
                continue

            # Sample distinct files without replacement so that exactly
            # min(n_samples, available files) images are exported.
            unique_files = pd.Series(sub_category_df["file_name"].unique())
            file_names = unique_files.sample(
                n=min(n_samples, len(unique_files)), random_state=random_state
            )

            # "/" would otherwise create an unintended nested directory.
            folder_name = sub_category.replace("/", "_")
            images_out_dir = os.path.join(save_path, "images", category, folder_name)
            labels_out_dir = os.path.join(save_path, "labels", category, folder_name)
            os.makedirs(images_out_dir, exist_ok=True)
            os.makedirs(labels_out_dir, exist_ok=True)

            # One pass over the sampled rows instead of one query per file.
            sampled_df = sub_category_df[sub_category_df["file_name"].isin(file_names)]
            for file_name, file_df in sampled_df.groupby("file_name", sort=False):
                shutil.copyfile(
                    os.path.join(image_dir, file_name),
                    os.path.join(images_out_dir, file_name),
                )

                label_file_name = os.path.splitext(file_name)[0] + ".txt"
                with open(os.path.join(labels_out_dir, label_file_name), "w") as f:
                    for label, bbox in zip(file_df["label"], file_df["bbox"]):
                        if isinstance(bbox, str):
                            # Boxes read back from CSV are strings; parse them
                            # as literals only, never as arbitrary code.
                            bbox = ast.literal_eval(bbox)
                        f.write(f"{label} " + " ".join(map(str, bbox)) + "\n")

In [4]:
# train
# Training split: export the sampled images and labels.
train_save_path = os.path.join("data", "bdd100k_samples", "train")
save_data(
    df=train_label_df,
    image_dir=train_images_dir,
    label_dir=train_labels_dir,
    attributes=attributes,
    save_path=train_save_path,
    n_samples=100,
)

# Validation split: same export, written next to the training samples.
val_save_path = os.path.join("data", "bdd100k_samples", "val")
save_data(
    df=val_label_df,
    image_dir=val_images_dir,
    label_dir=val_labels_dir,
    attributes=attributes,
    save_path=val_save_path,
    n_samples=100,
)