# Exploratory data analysis (EDA)

In this notebook, we download a sample of the BDD100K semantic segmentation dataset and use W&B Artifacts and Tables to version and analyze our data.

In [1]:
from fastai.vision.all import *
import wandb

import params

## Download data

Use the `untar_data` function from fastai to download and unzip the dataset.

`path.ls()` shows that the dataset folder contains the `images` and `labels` subfolders and a licence file.

In [2]:
# Archive holding the BDD100K sample used in this notebook (images, labels and a licence file)
URL = 'https://storage.googleapis.com/wandb_course/bdd_simple_1k.zip'

In [3]:
# Download and unzip the archive; `path` is the extracted dataset folder.
# NOTE(review): force_download=True presumably fetches the archive again on every run
# instead of reusing a cached copy — confirm this is intended, as it slows re-runs.
path = Path(untar_data(URL, force_download=True))

In [4]:
# List the top-level contents of the dataset folder
path.ls()

(#3) [Path('/Users/davidoc/.fastai/data/bdd_simple_1k/images'),Path('/Users/davidoc/.fastai/data/bdd_simple_1k/labels'),Path('/Users/davidoc/.fastai/data/bdd_simple_1k/LICENSE.txt')]

# Artifacts

Start a new `W&B run` and put everything into a raw `Artifact`.

In [5]:
# Login to W&B (in terminal)
# !wandb login 

In [6]:
# Open a W&B run dedicated to uploading the raw dataset
run = wandb.init(
    project=params.WANDB_PROJECT,
    entity=params.ENTITY,
    job_type="upload",
)

# Artifact that will collect the licence file, images, labels and the EDA table
raw_data_artifact = wandb.Artifact(
    params.RAW_DATA_AT,
    type="raw_data",
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33md-oliver-cort[0m ([33mdoc93[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [7]:
# Add the licence file to the artifact (raw_data_artifact), stored under the name 'LICENSE.txt'
raw_data_artifact.add_file(path/'LICENSE.txt', name='LICENSE.txt')

<wandb.sdk.artifacts.artifact_manifest_entry.ArtifactManifestEntry at 0x173cce910>

In [8]:
# Add the dataset's image and label folders to the artifact (raw_data_artifact),
# each stored under the same name it has on disk
for subdir in ('images', 'labels'):
    raw_data_artifact.add_dir(path/subdir, name=subdir)

[34m[1mwandb[0m: Adding directory to artifact (/Users/davidoc/.fastai/data/bdd_simple_1k/images)... Done. 0.3s
[34m[1mwandb[0m: Adding directory to artifact (/Users/davidoc/.fastai/data/bdd_simple_1k/labels)... Done. 0.3s


# Tables

In [9]:
# Set this flag to True to use a small subset of the data for testing
DEBUG = False

In [10]:
# Collect the image files that sit directly in <dataset>/images (no recursion into
# subfolders), using fastai's `get_image_files`
image_files = get_image_files(path/"images", recurse=False)

# When DEBUG is on, keep only the first 10 files so the rest of the notebook runs quickly
if DEBUG:
    image_files = image_files[:10]

In [13]:
# Functions to help us process the data and upload it as a `Table` to W&B. 

def label_func(fname):
    """Return the path of the mask file that belongs to image `fname`.

    The mask is looked up in a `labels` folder that is a sibling of the image's
    own folder, under the name `<image stem>_mask.png`.
    """
    dataset_root = fname.parent.parent
    mask_name = f"{fname.stem}_mask.png"
    return dataset_root / "labels" / mask_name

def get_classes_per_image(mask_data, class_labels):
    """Flag which classes occur in a segmentation mask.

    Args:
        mask_data: Array of per-pixel class ids.
        class_labels: Mapping of class id -> class name.

    Returns:
        Dict of class name -> 1 if that class id appears in `mask_data`, else 0.
    """
    present_ids = list(np.unique(mask_data))
    return {
        class_name: int(class_id in present_ids)
        for class_id, class_name in class_labels.items()
    }

def _create_table(image_files, class_labels):
    """Build a W&B Table with one row per image: file name parts, the image with its mask, and per-class flags.

    Args:
        image_files: Sized collection of image paths (`pathlib.Path`-like). The mask for
            each image is located with `label_func`.
        class_labels: Mapping of mask pixel value -> class name.

    Returns:
        A `wandb.Table` with the columns File_Name, File_Name_P1, File_Name_P2, Images,
        Dataset and Split, followed by one 0/1 column per class.
    """
    # Keep the raw names for dictionary lookups and the stringified names for column
    # headers, so a non-string class name cannot cause a KeyError further down.
    class_names = list(class_labels.values())
    labels = [str(_name) for _name in class_names]
    print('labels:',labels)

    # Create a W&B table (to summarise dataset) and define its columns
    # - Useful to add a "Dataset" column, to be able to get dataset statistics when doing W&B table report
    # - eg. File_Name:    a59131a5-00000000.jpg
    # - eg. File_Name_P1: a59131a5
    # - eg. File_Name_P2: 00000000
    table = wandb.Table(columns=["File_Name", "File_Name_P1", "File_Name_P2", "Images", "Dataset", "Split"] + labels)

    for image_file in progress_bar(image_files, total=len(image_files)):
        # NOTE(review): the image handle is left open because it is handed to wandb.Image;
        # confirm whether it is safe to close it once the row has been added.
        image = Image.open(image_file)

        # np.array copies the pixel data, so the mask file can be closed straight away.
        with Image.open(label_func(image_file)) as mask_file:
            mask_data = np.array(mask_file)
        class_in_image = get_classes_per_image(mask_data, class_labels)

        # Split "<P1>-<P2>" file stems; a stem without a hyphen gets an empty P2
        # instead of raising IndexError.
        stem_parts = image_file.stem.split('-')
        file_name_p1 = stem_parts[0]
        file_name_p2 = stem_parts[1] if len(stem_parts) > 1 else ""

        # Add data to W&B table (must match table columns defined above)
        table.add_data(
            str(image_file.name),
            file_name_p1,
            file_name_p2,
            wandb.Image(
                    image,
                    masks={
                        "predictions": {
                            "mask_data": mask_data,
                            "class_labels": class_labels,
                        }
                    }
            ),
            "bdd1k",
            "None", # we don't have a dataset split yet
            *[class_in_image[_name] for _name in class_names]
        )

    return table

In [14]:
# Use the above functions to create a W&B Table containing the dataset;
# params.BDD_CLASSES is passed as the class-id -> class-name mapping
table = _create_table(image_files, params.BDD_CLASSES)

labels: ['background', 'road', 'traffic light', 'traffic sign', 'person', 'vehicle', 'bicycle']


In [15]:
# Add the `Table` to the `Artifact` under the name "eda_table"
raw_data_artifact.add(table, "eda_table")

<wandb.sdk.artifacts.artifact_manifest_entry.ArtifactManifestEntry at 0x29ff34400>

# Log artifact to W&B and finish `run`. 

In [16]:
# Log the artifact (licence file, images, labels and the EDA table added above) to W&B
run.log_artifact(raw_data_artifact)
# Finish the run now that the upload job is complete
run.finish()

