<a href="https://colab.research.google.com/github/01PrathamS/MLOps/blob/main/logging_with_tensorboard.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torchvision
from torchvision import transforms
from torchvision import datasets
from torch.utils.data import DataLoader, random_split

from torch import nn

In [2]:
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, ), (0.5, )),
])

train_dataset = datasets.MNIST(root="./data", download=True, train=True, transform=transform)
test_dataset = datasets.MNIST(root="./data", download=True, train=False, transform=transform)

class_names = train_dataset.classes

train_dataset, val_dataset = random_split(train_dataset, [50000, 10000])


Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz to ./data/MNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 9912422/9912422 [00:01<00:00, 5071941.81it/s]


Extracting ./data/MNIST/raw/train-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz to ./data/MNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 28881/28881 [00:00<00:00, 129682.83it/s]


Extracting ./data/MNIST/raw/train-labels-idx1-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 1648877/1648877 [00:01<00:00, 1245479.11it/s]


Extracting ./data/MNIST/raw/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 4542/4542 [00:00<00:00, 3981301.73it/s]

Extracting ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw






In [3]:
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [4]:
class MNISTModel(nn.Module):

  def __init__(self, input_shape: int,
               hidden_units: int,
               output_shape: int):
    super().__init__()
    self.block_1 = nn.Sequential(
        nn.Conv2d(in_channels=input_shape,
                  out_channels=hidden_units,
                  kernel_size=3,
                  stride=1,
                  padding=1),
        nn.ReLU(),
        nn.Conv2d(in_channels=hidden_units,
                  out_channels=hidden_units,
                  kernel_size=3,
                  stride=1,
                  padding=1),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2,
                     stride=2)
    )

    self.block_2 = nn.Sequential(
        nn.Conv2d(hidden_units,
                  hidden_units,
                  3,
                  padding=1),
        nn.ReLU(),
        nn.Conv2d(hidden_units,
                  hidden_units,
                  3,
                  padding=1),
        nn.ReLU(),
        nn.MaxPool2d(2)
    )

    self.classifier = nn.Sequential(
        nn.Flatten(),
        nn.Linear(in_features=hidden_units*7*7,
                  out_features=output_shape),
    )

  def forward(self, x: torch.Tensor):
    x = self.block_1(x)
    x = self.block_2(x)
    x = self.classifier(x)
    return x


In [5]:
device = "cuda" if torch.cuda.is_available() else "cpu"

torch.manual_seed(42)
model = MNISTModel(input_shape=1,
                   hidden_units=10,
                   output_shape=len(class_names)).to(device)
model

MNISTModel(
  (block_1): Sequential(
    (0): Conv2d(1, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
    (2): Conv2d(10, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU()
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (block_2): Sequential(
    (0): Conv2d(10, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
    (2): Conv2d(10, 10, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU()
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (classifier): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Linear(in_features=490, out_features=10, bias=True)
  )
)

In [6]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(params=model.parameters(),
                            lr=0.01)

In [7]:
def accuracy_fn(y_true, y_pred):
  correct = torch.eq(y_true, y_pred).sum().item()
  acc = (correct / len(y_pred)) * 100
  return acc

In [12]:
def train_step(model: torch.nn.Module,
               data_loader: torch.utils.data.DataLoader,
               loss_fn: torch.nn.Module,
               optimizer: torch.optim.Optimizer,
               accuracy_fn,
               device: torch.device = device):
  train_loss, train_acc = 0, 0
  model.to(device)
  for batch, (X, y) in enumerate(data_loader):
    X, y = X.to(device), y.to(device)
    y_pred = model(X)

    loss = loss_fn(y_pred, y)
    train_loss += loss
    train_acc += accuracy_fn(y_true=y,
                             y_pred=y_pred.argmax(dim=1))

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

  train_loss /= len(data_loader)
  train_acc /= len(data_loader)
  print(f"Train loss: {train_loss:.5f} | Train accuracy: {train_acc:.2f}%")

  return train_loss, train_acc

In [9]:
def test_step(data_loader: torch.utils.data.DataLoader,
              model: torch.nn.Module,
              loss_fn: torch.nn.Module,
              accuracy_fn,
              device: torch.device = device):

  test_loss, test_acc = 0, 0
  model.to(device)
  model.eval()
  with torch.inference_mode():
    for X, y in data_loader:
      X, y = X.to(device), y.to(device)

      test_pred = model(X)

      test_loss += loss_fn(test_pred, y)
      test_acc += accuracy_fn(y_true=y,
                              y_pred=test_pred.argmax(dim=1))

    test_loss /= len(data_loader)
    test_acc /= len(data_loader)

    print(f"Test loss: {test_loss:.5f} | Test accuracy: {test_acc:.2f}%\n")

  return test_loss, test_acc

In [10]:
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter()

In [14]:
import time
from tqdm import tqdm

start_time = time.time()

results = {
    "train_loss": [],
    "train_acc" : [],
    "test_loss": [],
    "test_acc": []
}

epochs = 10

for epoch in tqdm(range(epochs)):
  print(f"Epoch: {epochs}\n------------")
  train_loss, train_acc = train_step(data_loader=train_dataloader,
                                     model=model,
                                     loss_fn=loss_fn,
                                     optimizer=optimizer,
                                     accuracy_fn=accuracy_fn,
                                     device=device
                                     )

  test_loss, test_acc = test_step(data_loader=test_dataloader,
                                  model=model,
                                  loss_fn=loss_fn,
                                  accuracy_fn=accuracy_fn,
                                  device=device)

  print(
        f"Epoch: {epoch+1} | "
        f"train_loss: {train_loss:.4f} | "
        f"train_acc: {train_acc:.4f} | "
        f"test_loss: {test_loss:.4f} | "
        f"test_acc: {test_acc:.4f}"
      )

  results["train_loss"].append(train_loss)
  results["train_acc"].append(train_loss)
  results["test_loss"].append(test_loss)
  results["test_acc"].append(test_acc)

  writer.add_scalars(main_tag="Loss",
                     tag_scalar_dict={"train_loss": train_loss,
                                      "test_loss": test_acc},
                     global_step=epoch)

  writer.add_scalars(main_tag="Accuracy",
                     tag_scalar_dict={"train_acc": train_acc,
                                      "test_acc": test_acc},
                     global_step=epoch)

  writer.add_graph(model=model,
                   input_to_model=torch.randn(10, 1, 28, 28).to(device))

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch: 10
------------
Train loss: 0.12126 | Train accuracy: 96.22%
Test loss: 0.17483 | Test accuracy: 94.25%

Epoch: 1 | train_loss: 0.1213 | train_acc: 96.2212 | test_loss: 0.1748 | test_acc: 94.2492


 10%|█         | 1/10 [00:17<02:39, 17.73s/it]

Epoch: 10
------------
Train loss: 0.09694 | Train accuracy: 96.98%


 20%|██        | 2/10 [00:35<02:22, 17.76s/it]

Test loss: 0.10394 | Test accuracy: 96.54%

Epoch: 2 | train_loss: 0.0969 | train_acc: 96.9790 | test_loss: 0.1039 | test_acc: 96.5355
Epoch: 10
------------
Train loss: 0.08198 | Train accuracy: 97.45%


 30%|███       | 3/10 [00:53<02:04, 17.78s/it]

Test loss: 0.06882 | Test accuracy: 97.94%

Epoch: 3 | train_loss: 0.0820 | train_acc: 97.4548 | test_loss: 0.0688 | test_acc: 97.9433
Epoch: 10
------------
Train loss: 0.07294 | Train accuracy: 97.71%


 40%|████      | 4/10 [01:10<01:45, 17.57s/it]

Test loss: 0.06452 | Test accuracy: 97.93%

Epoch: 4 | train_loss: 0.0729 | train_acc: 97.7127 | test_loss: 0.0645 | test_acc: 97.9333
Epoch: 10
------------
Train loss: 0.06639 | Train accuracy: 97.98%


 50%|█████     | 5/10 [01:28<01:28, 17.77s/it]

Test loss: 0.05561 | Test accuracy: 98.22%

Epoch: 5 | train_loss: 0.0664 | train_acc: 97.9846 | test_loss: 0.0556 | test_acc: 98.2228
Epoch: 10
------------
Train loss: 0.06069 | Train accuracy: 98.13%


 60%|██████    | 6/10 [01:45<01:10, 17.54s/it]

Test loss: 0.05155 | Test accuracy: 98.28%

Epoch: 6 | train_loss: 0.0607 | train_acc: 98.1266 | test_loss: 0.0516 | test_acc: 98.2827
Epoch: 10
------------
Train loss: 0.05636 | Train accuracy: 98.29%


 70%|███████   | 7/10 [02:03<00:52, 17.48s/it]

Test loss: 0.04809 | Test accuracy: 98.45%

Epoch: 7 | train_loss: 0.0564 | train_acc: 98.2945 | test_loss: 0.0481 | test_acc: 98.4525
Epoch: 10
------------
Train loss: 0.05263 | Train accuracy: 98.35%


 80%|████████  | 8/10 [02:21<00:35, 17.67s/it]

Test loss: 0.04690 | Test accuracy: 98.34%

Epoch: 8 | train_loss: 0.0526 | train_acc: 98.3545 | test_loss: 0.0469 | test_acc: 98.3427
Epoch: 10
------------
Train loss: 0.05008 | Train accuracy: 98.49%


 90%|█████████ | 9/10 [02:38<00:17, 17.55s/it]

Test loss: 0.04839 | Test accuracy: 98.48%

Epoch: 9 | train_loss: 0.0501 | train_acc: 98.4905 | test_loss: 0.0484 | test_acc: 98.4824
Epoch: 10
------------
Train loss: 0.04702 | Train accuracy: 98.59%


100%|██████████| 10/10 [02:56<00:00, 17.66s/it]

Test loss: 0.04286 | Test accuracy: 98.54%

Epoch: 10 | train_loss: 0.0470 | train_acc: 98.5865 | test_loss: 0.0429 | test_acc: 98.5423





In [16]:
# %load_ext tensorboard
# %tensorboard --logdir runs

In [18]:
def create_writer(experiment_name: str,
                  model_name: str,
                  extra: str=None) -> torch.utils.tensorboard.SummaryWriter():

    from datetime import datetime
    import os

    timestamp = datetime.now().strftime("%Y-%m-%d")
    if extra:
      log_dir = os.path.join("runs", timestamp, experiment_name, model_name, extra)
    else:
      log_dir = os.path.join("runs", timestamp, experiment_name, model_name)

    print(f"[INFO] Created SummaryWriter, saving to: {log_dir}...")

    return SummaryWriter(log_dir=log_dir)


[INFO] Created SummaryWriter, saving to: runs/2024-05-14/data_10_percent/effnetb0/5_epochs...
