<a href="https://colab.research.google.com/github/Armandpl/wandb_jetracer/blob/master/wandb_jetracer_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<img src="https://i.imgur.com/gb6B4ig.png" width="400" alt="Weights & Biases" />

# 🔥 = W&B ➕ PyTorch ➕ Nvidia jetracer

In [22]:
!pip install wandb

Reason for being yanked: Version contained a bug that could cause files to be synced to the wrong run if runs were executed in parallel[0m
Collecting wandb==0.10
[?25l  Downloading https://files.pythonhosted.org/packages/38/fc/dab806861936fb2bcf3ac5de6543c06e54e08b8cd01e3a32b4f77ce61229/wandb-0.10.0-py2.py3-none-any.whl (1.6MB)
[K     |████████████████████████████████| 1.6MB 7.6MB/s 
Collecting watchdog>=0.8.3
[?25l  Downloading https://files.pythonhosted.org/packages/f2/5b/36b3b11e557830de6fc1dc06e9aa3ee274119b8cea9cc98175dbbf72cf87/watchdog-2.1.2-py3-none-manylinux2014_x86_64.whl (74kB)
[K     |████████████████████████████████| 81kB 11.6MB/s 
Installing collected packages: watchdog, wandb
  Found existing installation: wandb 0.10.30
    Uninstalling wandb-0.10.30:
      Successfully uninstalled wandb-0.10.30
Successfully installed wandb-0.10.0 watchdog-2.1.2


In [2]:
!wget -O xy_dataset.py https://raw.githubusercontent.com/Armandpl/wandb_jetracer/master/utils/xy_dataset.py
!pip install pytorch-lightning torchmetrics

--2021-05-24 13:25:38--  https://raw.githubusercontent.com/Armandpl/wandb_jetracer/master/utils/xy_dataset.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2618 (2.6K) [text/plain]
Saving to: ‘xy_dataset.py’


2021-05-24 13:25:38 (42.2 MB/s) - ‘xy_dataset.py’ saved [2618/2618]

Collecting pytorch-lightning
[?25l  Downloading https://files.pythonhosted.org/packages/32/b9/59ce5be6679884579c276f5f208587c3312e8323bd7ce27be278b7af98b3/pytorch_lightning-1.3.2-py3-none-any.whl (805kB)
[K     |████████████████████████████████| 808kB 8.9MB/s 
[?25hCollecting torchmetrics
[?25l  Downloading https://files.pythonhosted.org/packages/3b/e8/513cd9d0b1c83dc14cd8f788d05cd6a34758d4fd7e4f9e5ecd5d7d599c95/torchmetrics-0.3.2-py3-none-any.whl (274kB)
[K     |██████████████

In [1]:
import math
import os

import cv2
import numpy as np
import PIL
import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
import torchmetrics
import wandb

from xy_dataset import XYDataset

# 1. Pre-process dataset(s)

In [None]:
def make_dirs(output_dataset):
    out_dirs = [os.path.join(output_dataset, split) for split in ["train", "val", "test"]]
    [os.makedirs(curr_dir) for curr_dir in out_dirs]

    return out_dirs

In [None]:
def split_list_by_pct(data, pcts):
    sizes = [int(pct*len(data)) for pct in pcts]

    it = iter(data)
    return [[next(it) for _ in range(size)] for size in sizes]

In [None]:
import random

# Manually set pytorch seed to get the same dataset split everytime
torch.manual_seed(42)

config = dict(
    datasets=["suzuka:latest", "nurburgring:latest"],
    output_dataset="mix_ready",
    split_pcts=[0.7, 0.2, 0.1],
)

with wandb.init(project="racecar", config=config, entity="wandb", job_type="pre-process-dataset") as run:
    config = run.config

    # TODO: maybe make a tmp dir and delete afterwards, to only keep artifacts in artifacts dir
    # and to be able to run this cell more than once for the same output_dataset value
    out_dirs = make_dirs(config.output_dataset)

    # make sure the train/val/test pct are coherent
    assert math.fsum(config.split_pcts) == 1

    for dataset in config.datasets:
        artifact = run.use_artifact(dataset)
        artifact_dir = artifact.download()
        
        all_fnames = os.listdir(artifact_dir)
        random.shuffle(all_fnames)

        train, val, test = split_list_by_pct(all_fnames, config.split_pcts)
        sets = {
            "train": train,
            "val": val,
            "test": test
        }
        
        for out_dir, split in zip(out_dirs, ["train", "val", "test"]):
            for fname in sets[split]:
                source = os.path.join(artifact_dir, fname)
                dest = os.path.join(out_dir, fname)
                os.rename(source, dest)
    
    # upload artifacct
    artifact = wandb.Artifact(config.output_dataset, type='dataset')
    artifact.add_dir(config.output_dataset)
    run.log_artifact(artifact)

[34m[1mwandb[0m: Adding directory to artifact (./mix_ready)... Done. 0.7s


VBox(children=(Label(value=' 36.95MB of 36.95MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.…

# 2. Train

In [2]:
def show_label(image, coordinates, color=(0, 255, 0)):
    x, y = coordinates

    x = int((x + 1) / 2 * 224)
    y = int((y + 1) / 2 * 224)

    cv2.circle(image, (x, y), 5, color, 2)

    return image

In [3]:
class RoadRegression(pl.LightningModule):

    def __init__(self, config):
        super().__init__()
        self.config = config

        self.mse = torchmetrics.MeanSquaredError()
        self.mae = torchmetrics.MeanAbsoluteError()

        self.model = torchvision.models.__dict__[config.architecture](pretrained=config.pretrained)
        self.model.fc = nn.Linear(self.model.fc.in_features, 2)

    def forward(self, x):
        # in lightning, forward defines the prediction/inference actions
        return self.model(x)

    def training_step(self, batch, batch_idx):
        # training_step defines the train loop. It is independent of forward
        images, targets = batch
        preds = self.forward(images)
        loss = getattr(F, self.config.loss)(preds, targets)

        # log
        self.log_dict({
            "train/mse": loss,
            "train/mae": self.mae(preds, targets)    
        })

        return loss

    def trainin_epoch_end(self):
        self.mae.reset()
        self.mse.reset()

    def validation_step(self, batch, batch_idx):
        images, targets = batch
        preds = self.forward(images)
        self.mse(preds, targets)
        self.mae(preds, targets)

    def validation_epoch_end(self, val_step_outputs):
        self.log_dict({
            "valid/mse": self.mse.compute(),
            "valid/mae": self.mae.compute()
        })
        self.mse.reset()
        self.mae.reset()
    
    def test_step(self, batch, batch_idx):
        images, targets = batch
        preds = self.forward(images)
        return (images, preds, targets)

    def test_epoch_end(self, test_step_outputs):
        print("test epoch end")
        images, predictions, targets = test_step_outputs[0]
        for i in range(1, len(test_step_outputs)):
            imgs, preds, targs = test_step_outputs[i]

            images = torch.cat((images, imgs), dim=0)
            predictions = torch.cat((predictions, preds), dim=0)
            targets = torch.cat((targets, targs), dim=0)

        # TODO: fetch the loss from config and add other metrics
        losses = F.mse_loss(predictions, targets, reduction='none')

        # display preds and targets on images
        images_with_preds = []
        for idx, image in enumerate(images):
            img = image.permute(1, 2, 0).cpu().numpy()*255

            img = cv2.cvtColor(
                img, cv2.COLOR_BGR2RGB
            )

            img = show_label(img, targets[i])
            img = show_label(img, predictions[i], (0, 0, 255))

            # TODO: check why the colors are weird?
            # because we normalize them in the dataset class?? but then the
            # circles colors are weird too?
            images_with_preds.append(img)

        my_data = [
            [wandb.Image(img), pred, target, loss.sum()] 
            for img, pred, target, loss
            in zip(images_with_preds, predictions, targets, losses)
        ]

        # create a wandb.Table() with corresponding columns
        columns= ["image", "prediction", "target", "loss"]
        test_table = wandb.Table(data=my_data, columns=columns)
        
        artifact = wandb.Artifact("test_predictions", type="test")
        artifact.add(test_table, "test_table")
        wandb.log_artifact(artifact)

        # TODO: use trainer.logger.expriment.log ?
        wandb.log_artifact({"test/examples", test_table})

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.config.learning_rate)
        return optimizer

In [4]:
def make_loaders(config):
    # Pull the dataset
    artifact = wandb.use_artifact(config.dataset)
    artifact_dir = artifact.download()

    datasets = [XYDataset(os.path.join(artifact_dir, split)) for split in ["train", "val", "test"]]

    # TODO: turn off shuffle for val and test dataloaders
    train, val, test = [make_loader(current_set, config.batch_size) for current_set in datasets]

    return train, val, test

In [5]:
def make_loader(dataset, batch_size):
    loader = torch.utils.data.DataLoader(dataset=dataset,
                                         batch_size=batch_size, 
                                         shuffle=True,
                                         pin_memory=True, num_workers=2)
    return loader

In [6]:
config = dict(
    epochs=5,
    architecture="resnet34",
    pretrained=True,
    batch_size=64,
    learning_rate=1e-4,
    dataset="mix_ready:latest",
    test_dataset=None, # bypasses the dataset test split to use another artifact TODO: implement that
    train_augs=False,
    loss="mse_loss"
    )

with wandb.init(project="racecar", config=config, job_type="train", entity="wandb") as run:
    config = run.config

    train, val, test = make_loaders(config)

    road_regression = RoadRegression(config)
    print(road_regression)

    wandb_logger = WandbLogger()
    trainer = pl.Trainer(logger=wandb_logger, gpus=1, max_epochs=config.epochs)
    trainer.fit(road_regression, train, val)

    trainer.test(test_dataloaders=test)

    # finally we log the models to wandb
    trainer.save_checkpoint("model.pth")
    artifact = wandb.Artifact('model', type='model')
    artifact.add_file('model.pth')
    run.log_artifact(artifact)

[34m[1mwandb[0m: Currently logged in as: [33marmandpl[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.10.30 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
[34m[1mwandb[0m: Tracking run with wandb version 0.10.0
[34m[1mwandb[0m: Run data is saved locally in wandb/run-20210524_142058-3mb1w155
[34m[1mwandb[0m: Syncing run [33mlively-sound-257[0m





GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type              | Params
--------------------------------------------
0 | mse   | MeanSquaredError  | 0     
1 | mae   | MeanAbsoluteError | 0     
2 | model | ResNet            | 21.3 M
--------------------------------------------
21.3 M    Trainable params
0         Non-trainable params
21.3 M    Total params
85.143    Total estimated model params size (MB)


RoadRegression(
  (mse): MeanSquaredError()
  (mae): MeanAbsoluteError()
  (model): ResNet(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): B

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…





HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…




LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…

test epoch end


[34m[1mwandb[0m: Waiting for W&B process to finish, PID 1689
[34m[1mwandb[0m: Program failed with code 1.  Press ctrl-c to abort syncing.





[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: Find user logs for this run at: wandb/run-20210524_142058-3mb1w155/logs/debug.log
[34m[1mwandb[0m: Find internal logs for this run at: wandb/run-20210524_142058-3mb1w155/logs/debug-internal.log
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:             valid/mse 0.021809009835124016
[34m[1mwandb[0m:             valid/mae 0.10912871360778809
[34m[1mwandb[0m:                 epoch 4
[34m[1mwandb[0m:   trainer/global_step 119
[34m[1mwandb[0m:                 _step 6
[34m[1mwandb[0m:              _runtime 86
[34m[1mwandb[0m:            _timestamp 1621866145
[34m[1mwandb[0m:             train/mse 0.01781730353832245
[34m[1mwandb[0m:             train/mae 0.10602302849292755
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:             valid/mse █▄▂▂▁
[34m[1mwandb[0m:             valid/mae █▄▂▂▁
[34m[1mwandb[0m:                 epoc

AttributeError: ignored