In [3]:
# !pip install wandb 

## EDA

In this notebook, we download a sample semantic segmentation dataset and use W&B Artifacts and Tables to version and analyse our data.

In [1]:
from fastai.vision.all import * # to download and unzip our datasets
import params
import wandb

In [1]:
# /params.py
# WANDB_PROJECT = "mlops-course-001"
# ENTITY = None # set this to team name if working in a team
# BDD_CLASSES = {i:c for i,c in enumerate(['background', 'road', 'traffic light', 'traffic sign', 'person', 'vehicle', 'bicycle'])}
# RAW_DATA_AT = 'bdd_simple_1k'
# PROCESSED_DATA_AT = 'bdd_simple_1k_split'

In [4]:
# Zip archive with the sample dataset; it is passed to `untar_data` in the next cell
URL = "https://storage.googleapis.com/wandb_course/bdd_simple_1k.zip"

In [5]:
# Fetch and extract the archive; `path` is the root folder of the extracted dataset.
# NOTE(review): force_download=True presumably re-downloads the archive on every run — confirm this is intended.
path = Path(untar_data(URL, force_download=True))

In [6]:
# Peek at the contents of the labels folder (one `<stem>_mask.png` file per image)
(path/"labels").ls()

(#1001) [Path('/home/l3gion/.fastai/data/bdd_simple_1k/labels/7efc19f4-18c1ca3f_mask.png'),Path('/home/l3gion/.fastai/data/bdd_simple_1k/labels/10a3e362-576d9181_mask.png'),Path('/home/l3gion/.fastai/data/bdd_simple_1k/labels/3fb02950-68ad8961_mask.png'),Path('/home/l3gion/.fastai/data/bdd_simple_1k/labels/47406c52-6f9c5dbf_mask.png'),Path('/home/l3gion/.fastai/data/bdd_simple_1k/labels/8e74dd69-c75b794b_mask.png'),Path('/home/l3gion/.fastai/data/bdd_simple_1k/labels/0d207cff-6d499379_mask.png'),Path('/home/l3gion/.fastai/data/bdd_simple_1k/labels/9a888ffa-7b310001_mask.png'),Path('/home/l3gion/.fastai/data/bdd_simple_1k/labels/22f27dc2-4525d946_mask.png'),Path('/home/l3gion/.fastai/data/bdd_simple_1k/labels/1ade61c7-b865e37e_mask.png'),Path('/home/l3gion/.fastai/data/bdd_simple_1k/labels/0512a400-d2fa24da_mask.png')...]

In [7]:
import numpy as np

In [8]:
def label_func(fname):
    "Return the path of the mask file that belongs to the image `fname`"
    # Masks live in a `labels` folder that is a sibling of the image's own folder.
    labels_dir = fname.parent.parent / "labels"
    mask_name = f"{fname.stem}_mask.png"
    return labels_dir / mask_name

def get_classes_per_image(mask_data, class_labels):
    "Map every class name to 1 if its id occurs in `mask_data`, else 0"
    # Distinct values found in the mask; class ids are looked up against these.
    ids_in_mask = list(np.unique(mask_data))
    return {
        class_name: int(class_id in ids_in_mask)
        for class_id, class_name in class_labels.items()
    }

In [9]:
def _create_table(image_files, class_labels):
    """Create a W&B table with one row per image of the dataset.

    Each row holds the file stem, the first two '-'-separated parts of the
    stem, the image with its mask attached, the dataset name and one 0/1
    column per class telling whether that class id occurs in the mask.

    NOTE: a later cell defines `_create_table` again; that later definition
    replaces this one once its cell has been run.

    Args:
        image_files: paths of the image files; each mask is located with `label_func`.
        class_labels: dict mapping the class ids used in the masks to class names.

    Returns:
        The populated `wandb.Table`.
    """
    # One column per class, in the order of `class_labels`. The raw names are kept
    # for the lookup because `get_classes_per_image` keys its result by them.
    class_names = list(class_labels.values())
    table = wandb.Table(
        columns=["File_Name", "P1", "P2", "Images", "Dataset"] + [str(name) for name in class_names]
    )

    for image_file in progress_bar(image_files, total=len(image_files)):
        image = Image.open(image_file)
        mask_data = np.array(Image.open(label_func(image_file)))
        class_in_image = get_classes_per_image(mask_data, class_labels)
        # File stems are expected to look like "<P1>-<P2>".
        stem_parts = image_file.stem.split('-')
        table.add_data(
            image_file.stem,
            stem_parts[0],
            stem_parts[1],
            wandb.Image(
                image,
                masks={
                    "predictions": {
                        "mask_data": mask_data,
                        "class_labels": class_labels,
                    }
                },
            ),
            "bdd1k",
            *[class_in_image[name] for name in class_names]
        )
    return table


We define several functions to help us process the data and upload it as a Table to W&B.

In [17]:
def label_func(fname):
    "Locate the mask of image `fname`: `<grandparent folder>/labels/<stem>_mask.png`"
    dataset_root = fname.parent.parent
    return dataset_root.joinpath("labels", f"{fname.stem}_mask.png")

def get_classes_per_image(mask_data, class_labels):
    "Flag, for every class name, whether its id shows up in `mask_data` (1) or not (0)"
    ids_in_mask = list(np.unique(mask_data))
    flags = {}
    for class_id, class_name in class_labels.items():
        flags[class_name] = 1 if class_id in ids_in_mask else 0
    return flags

def _create_table(image_files, class_labels):
    "Build a `wandb.Table` with one row per image: file name, image with mask, split and a 0/1 column per class"
    class_columns = [str(class_name) for class_name in class_labels.values()]
    table = wandb.Table(columns=["File_Name", "Images", "Split"] + class_columns)

    for _idx, img_path in progress_bar(enumerate(image_files), total=len(image_files)):
        image = Image.open(img_path)
        mask_data = np.array(Image.open(label_func(img_path)))
        presence = get_classes_per_image(mask_data, class_labels)

        # The mask is attached to the image under the "predictions" key.
        overlay = {
            "predictions": {
                "mask_data": mask_data,
                "class_labels": class_labels,
            }
        }
        row = [
            str(img_path.name),
            wandb.Image(image, masks=overlay),
            "None",  # we don't have a dataset split yet
        ]
        row.extend(presence[column] for column in class_columns)
        table.add_data(*row)

    return table
     

We will start a new W&B **run** and put everything into a raw-data Artifact.

In [11]:
# Start a W&B run; project and entity are read from params.py and job_type is set to "upload"
run = wandb.init(project = params.WANDB_PROJECT, entity = params.ENTITY, job_type = "upload")

[34m[1mwandb[0m: Currently logged in as: [33mtwelvve[0m. Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016675117866664854, max=1.0…

In [12]:
# Create the artifact that will collect the dataset files.
# Its name is taken from params.RAW_DATA_AT and its type is 'raw_data'.
artifact = wandb.Artifact(params.RAW_DATA_AT, type='raw_data' )

In [13]:
# Add the dataset's LICENSE.txt to the artifact, stored under the same name
artifact.add_file(path/'LICENSE.txt', name="LICENSE.txt")

ArtifactManifestEntry(path='LICENSE.txt', digest='X+6ZFkDOlnKesJCNt20yRg==', ref=None, birth_artifact_id=None, size=1594, extra={}, local_path='/home/l3gion/.local/share/wandb/artifacts/staging/tmphnuu63k7')

Let's add the images and label masks.

In [14]:
# Add the image and label folders to the artifact, each stored under its own folder name
for folder in ('images', 'labels'):
    artifact.add_dir(path/folder, name=folder)

[34m[1mwandb[0m: Adding directory to artifact (/home/l3gion/.fastai/data/bdd_simple_1k/images)... Done. 1.0s
[34m[1mwandb[0m: Adding directory to artifact (/home/l3gion/.fastai/data/bdd_simple_1k/labels)... Done. 0.3s


Let's get the file names of the images in our dataset.

In [15]:
# Collect the image files in the images folder (recurse=False: presumably sub-folders are not searched)
image_files = get_image_files(path/"images", recurse = False)

In [18]:
# Build the table from the image files and the class mapping in params.BDD_CLASSES
table = _create_table(image_files, params.BDD_CLASSES)

In [19]:
# Add the table to the artifact under the name "eda_table"
artifact.add(table, "eda_table")

ArtifactManifestEntry(path='eda_table.table.json', digest='vkzAykfgjTCf0C2MTMfdXg==', ref=None, birth_artifact_id=None, size=588824, extra={}, local_path='/home/l3gion/.local/share/wandb/artifacts/staging/tmpylofk11v')

In [20]:
# Log the artifact (license, images, labels and table) with the run
run.log_artifact(artifact)

<wandb.sdk.wandb_artifacts.Artifact at 0x7f7a0d31f490>

In [21]:
# Finish the run now that the artifact has been logged
run.finish()