In [1]:
## Load Libs

In [2]:
!pip install s3fs



In [3]:
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from pl_bolts.datamodules import CIFAR10DataModule
from pl_bolts.transforms.dataset_normalizations import cifar10_normalization
from pytorch_lightning import LightningModule, seed_everything, Trainer
from pytorch_lightning.callbacks import LearningRateMonitor
from pytorch_lightning.loggers import TensorBoardLogger, MLFlowLogger
from torch.optim.lr_scheduler import OneCycleLR
from torch.optim.swa_utils import AveragedModel, update_bn
from torchmetrics.functional import accuracy

In [4]:
# Fix the global random seed so repeated runs start from the same state.
seed_everything(seed=7)

Global seed set to 7


7

In [14]:
# Root folder for datasets; overridable through the PATH_DATASETS environment variable.
PATH_DATASETS = os.environ.get('PATH_DATASETS', '.')
# 1 when at least one CUDA device is visible, otherwise 0.
AVAIL_GPUS = min(1, torch.cuda.device_count())
# Larger batches when a GPU is present, smaller ones otherwise.
BATCH_SIZE = 256 if AVAIL_GPUS else 64
# Half of the CPUs for data loading. os.cpu_count() may return None when the
# count cannot be determined, so fall back to a single CPU instead of raising.
NUM_WORKERS = (os.cpu_count() or 1) // 2

# Set Logging

Configure the loggers used for experiment tracking.

## Tensorboard
Log training metrics to TensorBoard, with the log directory on S3-compatible storage (MinIO).

In [6]:
import fsspec
import os

# Endpoint URL assembled from the host and port supplied through the environment.
minio_url = f"http://{os.environ['MINIO_ENDPOINT']}:{os.environ['MINIO_PORT']}"

# Default options for fsspec's 's3' protocol: authenticated access
# (credentials from the environment) against the endpoint built above.
fsspec.config.conf['s3'] = dict(
    anon=False,
    key=os.environ['MINIO_USER'],
    secret=os.environ['MINIO_PASS'],
    client_kwargs=dict(endpoint_url=minio_url),
)

In [12]:
# TensorBoard logger writing under the given S3 location, with 'resnet' as the run name.
tf_logger = TensorBoardLogger(save_dir='s3://dl-logs/test-model/', name='resnet')

## MLFlow Integration

In [17]:
# Address of the MLflow tracking server.
mlflow_tracker_url = 'http://mlflow-service.mlflow.svc.cluster.local:5000'

# Logger that records runs under the 'test-model' experiment on that server.
mlflow_logger = MLFlowLogger(
    tracking_uri=mlflow_tracker_url,
    experiment_name='test-model',
)


## Data Loaders

In [18]:
# Training pipeline: random padded crop and horizontal flip for augmentation,
# then tensor conversion and CIFAR-10 normalization.
train_transforms = torchvision.transforms.Compose([
    torchvision.transforms.RandomCrop(32, padding=4),
    torchvision.transforms.RandomHorizontalFlip(),
    torchvision.transforms.ToTensor(),
    cifar10_normalization(),
])

# Evaluation pipeline: no augmentation, only tensor conversion and normalization.
test_transforms = torchvision.transforms.Compose([
    torchvision.transforms.ToTensor(),
    cifar10_normalization(),
])

cifar10_dm = CIFAR10DataModule(
    data_dir=PATH_DATASETS,
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    train_transforms=train_transforms,
    test_transforms=test_transforms,
    # Validation must be preprocessed like the test set. When this argument is
    # omitted the data module falls back to its own default transforms, which
    # presumably skip the normalization step unless normalize=True is passed
    # (verify against the installed pl_bolts version), so validation metrics
    # would be computed on differently scaled inputs.
    val_transforms=test_transforms,
)

  rank_zero_deprecation(
  rank_zero_deprecation(


In [19]:
def create_model():
    """Build a torchvision ResNet-18 with 10 output classes and a modified stem.

    The first convolution is swapped for a 3x3, stride-1, padding-1 layer
    without bias, and the max-pooling layer is replaced by an identity.

    Returns:
        The modified ResNet-18, created without pretrained weights.
    """
    net = torchvision.models.resnet18(pretrained=False, num_classes=10)
    stem_conv = nn.Conv2d(
        in_channels=3,
        out_channels=64,
        kernel_size=(3, 3),
        stride=(1, 1),
        padding=(1, 1),
        bias=False,
    )
    net.conv1 = stem_conv
    net.maxpool = nn.Identity()
    return net

## Create Lightning Module

In [20]:
class LitResnet(LightningModule):
    """LightningModule that trains the network from ``create_model`` as a classifier.

    The forward pass returns log-probabilities, and the loss in every stage is
    the negative log-likelihood of the target class. Optimization uses SGD
    with a OneCycleLR schedule stepped once per batch.
    """

    def __init__(self, lr=0.05):
        """Store hyperparameters and build the network.

        Args:
            lr: Learning rate passed to the SGD optimizer; saved in
                ``self.hparams`` by ``save_hyperparameters``.
        """
        super().__init__()

        self.save_hyperparameters()
        self.model = create_model()

    def forward(self, x):
        """Return log-softmax (over dim 1) of the network output for ``x``."""
        out = self.model(x)
        return F.log_softmax(out, dim=1)

    def training_step(self, batch, batch_idx):
        """Compute, log (as ``train_loss``) and return the NLL loss for one batch."""
        x, y = batch
        log_probs = self(x)
        loss = F.nll_loss(log_probs, y)
        self.log('train_loss', loss)
        return loss

    def evaluate(self, batch, stage=None):
        """Compute loss and accuracy for ``batch`` and log them when ``stage`` is set.

        Args:
            batch: An ``(inputs, targets)`` pair.
            stage: Prefix for the logged metric names (``'{stage}_loss'`` and
                ``'{stage}_acc'``). When falsy, nothing is logged.
        """
        x, y = batch
        log_probs = self(x)
        loss = F.nll_loss(log_probs, y)
        preds = torch.argmax(log_probs, dim=1)
        acc = accuracy(preds, y)

        if stage:
            self.log(f'{stage}_loss', loss, prog_bar=True)
            self.log(f'{stage}_acc', acc, prog_bar=True)

    def validation_step(self, batch, batch_idx):
        """Evaluate one validation batch, logging ``val_loss`` and ``val_acc``."""
        self.evaluate(batch, 'val')

    def test_step(self, batch, batch_idx):
        """Evaluate one test batch, logging ``test_loss`` and ``test_acc``."""
        self.evaluate(batch, 'test')

    def _train_steps_per_epoch(self):
        """Return the number of training batches per epoch for the LR schedule.

        Uses the length of the attached datamodule's training dataloader when
        one is available, so the schedule follows the real train split and
        batch size. A hard-coded ``45000 // BATCH_SIZE`` assumes a fixed split
        size and floors the batch count. Without a datamodule, that former
        estimate is kept as the fallback.

        NOTE(review): gradient accumulation and per-device sharding are not
        accounted for; confirm if either is enabled.
        """
        datamodule = getattr(self.trainer, 'datamodule', None)
        if datamodule is None:
            return 45000 // BATCH_SIZE
        return len(datamodule.train_dataloader())

    def configure_optimizers(self):
        """Create the SGD optimizer and a per-step OneCycleLR scheduler.

        Returns:
            A dict with the ``optimizer`` and an ``lr_scheduler`` config whose
            ``interval`` is ``'step'``.
        """
        optimizer = torch.optim.SGD(
            self.parameters(),
            lr=self.hparams.lr,
            momentum=0.9,
            weight_decay=5e-4,
        )
        scheduler_dict = {
            'scheduler': OneCycleLR(
                optimizer,
                # Peak learning rate of the one-cycle schedule.
                max_lr=0.1,
                epochs=self.trainer.max_epochs,
                steps_per_epoch=self._train_steps_per_epoch(),
            ),
            'interval': 'step',
        }
        return {'optimizer': optimizer, 'lr_scheduler': scheduler_dict}

## Train Loop

In [21]:
# Display the detected GPU count (0 or 1) before training starts.
AVAIL_GPUS

0

In [23]:
model = LitResnet(lr=0.05)
# NOTE(review): likely redundant with passing the datamodule to fit()/test()
# below; confirm before removing.
model.datamodule = cifar10_dm

trainer = Trainer(
    progress_bar_refresh_rate=10,
    max_epochs=30,
    # Request the GPU detected in the config cell (0 keeps training on CPU), so
    # the device actually used matches the GPU-dependent BATCH_SIZE choice.
    gpus=AVAIL_GPUS,
    # Send metrics to both TensorBoard and MLflow.
    logger=[tf_logger, mlflow_logger],
    # Record the learning rate at every step.
    callbacks=[LearningRateMonitor(logging_interval='step')],
)

# Train, then report metrics on the test split.
trainer.fit(model, cifar10_dm)
trainer.test(model, datamodule=cifar10_dm)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


Files already downloaded and verified
Files already downloaded and verified


Experiment with name test-model not found. Creating it.

  | Name  | Type   | Params
---------------------------------
0 | model | ResNet | 11.2 M
---------------------------------
11.2 M    Trainable params
0         Non-trainable params
11.2 M    Total params
44.696    Total estimated model params size (MB)


                                                              

Global seed set to 7


Epoch 0:  81%|████████  | 630/782 [05:54<01:25,  1.78it/s, loss=1.39, v_num=0066]
Validating: 0it [00:00, ?it/s][A
Validating:   0%|          | 0/157 [00:00<?, ?it/s][A
Epoch 0:  82%|████████▏ | 640/782 [05:56<01:18,  1.80it/s, loss=1.39, v_num=0066]
Epoch 0:  83%|████████▎ | 650/782 [05:57<01:12,  1.82it/s, loss=1.39, v_num=0066]
Epoch 0:  84%|████████▍ | 660/782 [05:58<01:06,  1.84it/s, loss=1.39, v_num=0066]
Epoch 0:  86%|████████▌ | 670/782 [06:00<01:00,  1.86it/s, loss=1.39, v_num=0066]
Epoch 0:  87%|████████▋ | 680/782 [06:01<00:54,  1.88it/s, loss=1.39, v_num=0066]
Epoch 0:  88%|████████▊ | 690/782 [06:03<00:48,  1.90it/s, loss=1.39, v_num=0066]
Epoch 0:  90%|████████▉ | 700/782 [06:04<00:42,  1.92it/s, loss=1.39, v_num=0066]
Epoch 0:  91%|█████████ | 710/782 [06:05<00:37,  1.94it/s, loss=1.39, v_num=0066]
Epoch 0:  92%|█████████▏| 720/782 [06:07<00:31,  1.96it/s, loss=1.39, v_num=0066]
Epoch 0:  93%|█████████▎| 730/782 [06:08<00:26,  1.98it/s, loss=1.39, v_num=0066]
Epoch 0: 

Traceback (most recent call last):
  File "/opt/conda/envs/computer_vision/lib/python3.8/multiprocessing/queues.py", line 245, in _feed
    send_bytes(obj)
  File "/opt/conda/envs/computer_vision/lib/python3.8/multiprocessing/connection.py", line 200, in send_bytes
    self._send_bytes(m[offset:offset + size])
  File "/opt/conda/envs/computer_vision/lib/python3.8/multiprocessing/connection.py", line 411, in _send_bytes
    self._send(header + buf)
Traceback (most recent call last):
  File "/opt/conda/envs/computer_vision/lib/python3.8/multiprocessing/queues.py", line 245, in _feed
    send_bytes(obj)
  File "/opt/conda/envs/computer_vision/lib/python3.8/multiprocessing/connection.py", line 200, in send_bytes
    self._send_bytes(m[offset:offset + size])
  File "/opt/conda/envs/computer_vision/lib/python3.8/multiprocessing/connection.py", line 411, in _send_bytes
    self._send(header + buf)
Traceback (most recent call last):
  File "/opt/conda/envs/computer_vision/lib/python3.8/multipr

[{'test_loss': 1.3836634159088135, 'test_acc': 0.5184000134468079}]