In [1]:
import numpy as np

import torch
from torch.nn import functional as F
from torch import nn
from torch.utils.data import DataLoader, random_split

import torchvision
from torchvision.datasets import MNIST
from torchvision import transforms


# remove slow mirror from list of MNIST mirrors
MNIST.mirrors = [mirror for mirror in MNIST.mirrors
                 if not mirror.startswith("http://yann.lecun.com")]

In [2]:
import pytorch_lightning as pl
from torchmetrics.functional import accuracy
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.loggers import TensorBoardLogger
import wandb

# pl.seed_everything(hash("setting random seeds") % 2**32 - 1)
wandb.login()

2022-04-25 18:44:15.196177: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-04-25 18:44:15.196365: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
[34m[1mwandb[0m: Currently logged in as: [33mdminn[0m (use `wandb login --relogin` to force relogin)


True

In [3]:
CONFIG = {
    'num_epochs': 3,
    'batch_size': 128,
    'learning_rate': 2e-5,
}

In [4]:
# transforms
transform = transforms.Compose(
    [transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))])

# datasets
trainset = MNIST('./data',
    download=True,
    train=True,
    transform=transform)

testset = MNIST('./data',
    download=True,
    train=False,
    transform=transform)

# dataloaders
trainloader = torch.utils.data.DataLoader(trainset, batch_size=CONFIG['batch_size'],
                                        shuffle=True, num_workers=2)

testloader = torch.utils.data.DataLoader(testset, batch_size=CONFIG['batch_size'],
                                        shuffle=False, num_workers=2)

In [5]:
class LitConvNet(pl.LightningModule):
    def __init__(self, **config):
        super(LitConvNet, self).__init__()
        self.conv1 = nn.Conv2d(1, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 4 * 4, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)
        
        self.save_hyperparameters()

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 16 * 4 * 4)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
    
    def loss(self, images, labels):
        outputs = self(images)
        loss = F.cross_entropy(outputs, labels)
        logits = F.log_softmax(outputs, dim=1)
        return logits, loss
    
    def training_step(self, batch, batch_index):
        images, labels = batch
        logits, loss = self.loss(images, labels)
        preds = torch.argmax(logits, 1)
        
        # log metrics
        self.log('train/loss', loss, on_epoch = True)
        self.log('train/acc', accuracy(preds, labels), on_epoch = True)
        
        # log images
#         img_grid = torchvision.utils.make_grid(images, on_)
#         self.logger.experiment.add_image('train/images', img_grid)
        return {'loss': loss}

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.hparams['learning_rate'])

In [6]:
logger = TensorBoardLogger("tensorboard_logs", name="LitConvNet")
trainer = pl.Trainer(logger = logger,
                     log_every_n_steps = 50,
#                      gpus = 1,
                     max_epochs = CONFIG['num_epochs'],
                    )
model = LitConvNet(**CONFIG)
trainer.fit(model, train_dataloaders=trainloader, val_dataloaders=testloader)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn("You passed in a `val_dataloader` but have no `validation_step`. Skipping val loop.")

  | Name  | Type      | Params
------------------------------------
0 | conv1 | Conv2d    | 156   
1 | pool  | MaxPool2d | 0     
2 | conv2 | Conv2d    | 2.4 K 
3 | fc1   | Linear    | 30.8 K
4 | fc2   | Linear    | 10.2 K
5 | fc3   | Linear    | 850   
------------------------------------
44.4 K    Trainable params
0         Non-trainable params
44.4 K    Total params
0.178     Total estimated model params size (MB)
  rank_zero_warn(


Epoch 0:  90%|████████████████████████████████▏   | 420/469 [00:16<00:01, 25.87it/s, loss=2.06, v_num=3]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [7]:
# # write to tensorboard
# writer.add_image('four_fashion_mnist_images', img_grid)

# # add computation graph
# writer.add_graph(net, images)

# # log loss
# writer.add_scalar('training loss', running_loss / 1000, epoch * len(trainloader) + i)

# # log predictions
# writer.add_figure('predictions vs. actuals', plot_classes_preds(net, inputs, labels),...