# Recitation 0: Weights and Biases

In this recitation, you will learn about the importance of performance visualization and model tracking using [WandB](https://wandb.ai/), a tool for performance visualization, model and data version control, and hyperparameter tuning.

## Installation and Libraries

In [None]:
## Installing WandB
!pip install wandb -qqq

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets
from torchvision.transforms import ToTensor

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Device: ", device)

from tqdm import tqdm

Device:  cuda


In [None]:
import wandb, os
# Store your WandB API key in the environment before logging in.
# NOTE(review): presumably wandb.login() picks the key up from WANDB_API_KEY — confirm.
# Avoid leaving a real key in a notebook that gets shared or committed.
os.environ['WANDB_API_KEY'] = "" #your key here
wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mnzafloris[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

## Helper functions and Model

In [None]:
# CIFAR-10 train and test splits, stored under ./data (downloaded when requested);
# ToTensor() is applied to every image as the transform.
data_train, data_test = (
    datasets.CIFAR10(root='data', train=is_train, transform=ToTensor(), download=True)
    for is_train in (True, False)
)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to data/cifar-10-python.tar.gz


100%|██████████| 170M/170M [00:03<00:00, 46.9MB/s]


Extracting data/cifar-10-python.tar.gz to data
Files already downloaded and verified


In [None]:
def build_data(batch_size, data_train, data_test):
    """Wrap the train and test datasets in DataLoaders.

    Args:
        batch_size: number of samples per batch, used for both loaders.
        data_train: training dataset; its loader shuffles the samples.
        data_test: test dataset; its loader keeps the original order.

    Returns:
        A (train_loader, test_loader) tuple.
    """
    make_loader = torch.utils.data.DataLoader
    loaders = (
        make_loader(data_train, batch_size=batch_size, shuffle=True),
        make_loader(data_test, batch_size=batch_size, shuffle=False),
    )
    return loaders

In [None]:
class Network(nn.Module):
    """Small CNN classifier: one conv/batch-norm/ReLU block, average pooling, linear head.

    The linear head takes 576 flattened features (64 channels x 3 x 3 spatial
    positions for 32x32 inputs) and produces 10 output scores.
    """

    def __init__(self):
        super().__init__()

        feature_layers = [
            nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, stride=1, padding=2),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.AvgPool2d(kernel_size=9),
            nn.Flatten(),
        ]
        # Attribute names are part of the checkpoint format (state_dict keys).
        self.CNN = nn.Sequential(*feature_layers)
        self.classification = nn.Linear(576, 10)

    def forward(self, x):
        """Apply the feature extractor, then the linear head, and return the scores."""
        return self.classification(self.CNN(x))

# Instantiate the network, move it to the selected device, and print its layer structure
model = Network().to(device)
print(model)

Network(
  (CNN): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): AvgPool2d(kernel_size=9, stride=9, padding=0)
    (4): Flatten(start_dim=1, end_dim=-1)
  )
  (classification): Linear(in_features=576, out_features=10, bias=True)
)


In [None]:
# Sanity check: build loaders with batch size 64, take a single batch,
# and look at the shape of the model output for it
train_loader, test_loader = build_data(64, data_train, data_test)

# Only the first batch is needed, so stop after one iteration
for x, y in train_loader:
  break
model(x.to(device)).shape

torch.Size([64, 10])

In [None]:
def get_optim(optimizer, learning_rate, model):
  """Create an optimizer over the model's parameters.

  Args:
      optimizer: 'sgd' selects SGD; any other value selects Adam.
      learning_rate: learning rate passed to the optimizer as `lr`.
      model: module whose parameters are optimized.

  Returns:
      The constructed optimizer instance.
  """
  optimizer_cls = optim.SGD if optimizer == 'sgd' else optim.Adam
  return optimizer_cls(model.parameters(), lr=learning_rate)

In [None]:
def train_epoch(model, loader, optimizer, criterion, scaler):
    """Train `model` for one pass over `loader` and return it with the mean batch loss.

    Args:
        model: network to optimize; batches are moved to the device of its parameters.
        loader: iterable of (inputs, targets) batches that supports len().
        optimizer: optimizer bound to the model's parameters.
        criterion: loss function, called as criterion(outputs, targets).
        scaler: gradient scaler used for the backward pass and the optimizer step.

    Returns:
        (model, ep_loss), where ep_loss is the sum of the per-batch losses
        divided by len(loader).
    """
    # Follow the model's own device instead of hard-coding CUDA, so the function
    # also runs on CPU-only machines.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')
    use_amp = target_device.type == 'cuda'

    total_loss = 0.0

    for x, y in loader:
        optimizer.zero_grad()

        x = x.to(target_device)
        y = y.to(target_device)

        # Mixed precision is only switched on for CUDA; otherwise the context is disabled.
        with torch.autocast(device_type='cuda', enabled=use_amp):
            outputs = model(x)
            loss = criterion(outputs, y)

        total_loss += float(loss)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

    ep_loss = float(total_loss / len(loader))

    return model, ep_loss

In [None]:
def train(model, finish= True):
    """Train `model` for run_config['epochs'] epochs, logging metrics and checkpoints to WandB.

    Relies on notebook-level globals that must be set beforehand: run_config,
    train_loader, optimizer, criterion, scaler, and the active WandB `run`.

    After each epoch the train loss, train accuracy and learning rate are logged.
    Whenever the train accuracy improves, the model and optimizer states are saved
    to "Model.pth" and uploaded both as an artifact and as a plain file.

    Args:
        model: network to train; batches are moved to the device of its parameters.
        finish: when True, wandb.finish() is called after the last epoch. Pass
            False to keep the run open (e.g. to use the logged artifact afterwards).
    """

    # Don't worry about all this, you'll be very familiar with it after HW1

    # Follow the model's own device instead of hard-coding CUDA, so this also
    # runs on CPU-only machines.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')
    use_amp = target_device.type == 'cuda'

    best_acc = 0

    for epoch in range(run_config['epochs']):
        batch_bar = tqdm(total=len(train_loader), dynamic_ncols=True, leave=False, position=0, desc='Train')

        num_correct = 0
        num_seen = 0
        total_loss = 0

        for i, (x, y) in enumerate(train_loader):
            optimizer.zero_grad()

            x = x.to(target_device)
            y = y.to(target_device)

            # Mixed precision is only switched on for CUDA; otherwise the context is disabled.
            with torch.autocast(device_type='cuda', enabled=use_amp):
                outputs = model(x)
                loss = criterion(outputs, y)

            num_correct += int((torch.argmax(outputs, axis=1) == y).sum())
            # Count the samples actually seen: the last batch can be smaller than
            # batch_size, so (number of batches * batch_size) would over-count.
            num_seen += int(y.shape[0])
            total_loss += float(loss)

            batch_bar.set_postfix(
                acc="{:.04f}%".format(100 * num_correct / num_seen),
                loss="{:.04f}".format(float(total_loss / (i + 1))),
                num_correct=num_correct,
                lr="{:.04f}".format(float(optimizer.param_groups[0]['lr'])))

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            batch_bar.update()
        batch_bar.close()

        train_loss = float(total_loss / len(train_loader))
        train_acc = 100 * num_correct / num_seen
        lr = float(optimizer.param_groups[0]['lr'])

        print("Epoch {}/{}: Train Acc {:.04f}%, Train Loss {:.04f}, Learning Rate {:.04f}".format(
            epoch + 1,
            run_config['epochs'],
            train_acc ,
            train_loss,
            lr
            )
        )

        # What to log

        metrics = {
            "train_loss":train_loss,
            "train_acc": train_acc,
            'lr': lr
        }

        # Log to run
        wandb.log(metrics)

        # Updating the model version (only when the train accuracy improves)

        if train_acc > best_acc:
            best_acc = train_acc

            # Saving the model and optimizer states

            torch.save({
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict()
                }, "Model.pth")

            # ALTERNATIVE 1: Saving Files as Artifacts
            # Creating Artifact
            model_artifact = wandb.Artifact(run_config['model'], type='model')

            # Adding model file to Artifact
            model_artifact.add_file("Model.pth")

            # Saving Artifact to WandB
            run.log_artifact(model_artifact)

            # ALTERNATIVE 2: Saving Files as Files
            wandb.save("Model.pth")

    if finish:
        wandb.finish()

## Simple Usage

You can run the training function and log the performance metrics of your choice to the WandB GUI. This simple method lets you monitor trends within a specific run configuration as well as compare different runs.

In [None]:
# Hyperparameters for this run; the same dict is later passed to wandb.init as the run config
run_config = dict(
    model='1-2dcnn',
    optimizer='sgd',
    lr=2e-3,
    batch_size=64,
    epochs=5,
)

# Objects that train() reads as notebook-level globals
train_loader, test_loader = build_data(
    batch_size=run_config['batch_size'],
    data_train=data_train,
    data_test=data_test,
)
optimizer = get_optim(
    optimizer=run_config['optimizer'],
    learning_rate=run_config['lr'],
    model=model,
)
criterion = nn.CrossEntropyLoss()
scaler = torch.cuda.amp.GradScaler()

  scaler = torch.cuda.amp.GradScaler()


In [None]:
# Start a WandB run named after the model, with run_config attached as its config.
# The returned `run` object is used by train() to log the model artifact.
# NOTE(review): entity is hard-coded to "wandb-starter" — presumably this needs to be
# your own WandB user or team name; confirm before running.
run = wandb.init(
    entity="wandb-starter",
    project="wandb-quickstart",
    job_type="model-training",
    name=run_config['model'],
    config=run_config
    )

[34m[1mwandb[0m: Currently logged in as: [33mnzafloris[0m ([33mwandb-starter[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# Train with the globals set up above; finish defaults to True, so train() calls wandb.finish() at the end
train(model)

  with torch.cuda.amp.autocast():


Epoch 1/5: Train Acc 38.2393%, Train Loss 1.7846, Learning Rate 0.0020




Epoch 2/5: Train Acc 39.0046%, Train Loss 1.7628, Learning Rate 0.0020




Epoch 3/5: Train Acc 39.7438%, Train Loss 1.7442, Learning Rate 0.0020




Epoch 4/5: Train Acc 40.4092%, Train Loss 1.7260, Learning Rate 0.0020




Epoch 5/5: Train Acc 40.8608%, Train Loss 1.7105, Learning Rate 0.0020


0,1
lr,▁▁▁▁▁
train_acc,▁▃▅▇█
train_loss,█▆▄▂▁

0,1
lr,0.002
train_acc,40.86077
train_loss,1.71053


## Resume a previous run

In [None]:
RESUME_LOGGING = True ### Set to True to run the code below that resumes an existing run; False skips it

In [None]:
# Re-open a previous run by its id; the run's local files directory is printed afterwards
if RESUME_LOGGING:
  run_id = "4jtvb0pu" ### Replace with the id string of the run you want to resume
  run = wandb.init(
      id     = run_id, ### Id of the previous run to resume
      resume = "must", ### You need this to resume previous runs, but comment out reinit = True when using this
      project = "wandb-quickstart", ### Project should be created in your wandb account
  )

  print(run.dir)


/content/wandb/run-20241219_162020-n8cxgzyc/files


In [None]:
### Test code to try appending metrics to previously logged metrics in the run
### Uncomment to try out

# test_new_metrics = {
#       "train_loss":1.5,
#       "train_acc": 40,
#       'lr': 0.001
#   }

# wandb.log(test_new_metrics)

## HyperParameter Sweeps


[Sweeps](https://docs.wandb.ai/guides/sweeps) are a way of automating hyperparameter tuning in deep learning models. You specify the values you want the sweep to try and then examine the effect of each hyperparameter value on the model.

In [None]:
# Initialize the sweep config and set the search method ('grid', 'random' or 'bayes')

sweep_config = {
    'method': 'random'
    }

In [None]:
# Objective of the sweep: which logged metric to optimize and in which direction
# (e.g. minimize loss, maximize accuracy).
# The name 'loss' matches the key that train_sweep logs with wandb.log.

metric = {
    'name':'loss',
    'goal':'minimize'
}
sweep_config['metric'] = metric

In [None]:
# Hyperparameters to work with.
# The keys here (optimizer, learning_rate, batch_size, epochs) are the attributes
# that train_sweep reads from the run config.

parameters_dict = {
    # Discrete choice between the two optimizers that get_optim supports
    'optimizer':{
        'values': ['sgd', 'adam']
    },
    # Sampled from a uniform distribution between min and max
    'learning_rate':{
        'distribution':'uniform',
        'min':2e-4,
        'max':1e-1
    },
    # NOTE(review): presumably log-uniform between min and max, quantized to
    # multiples of q — confirm against the sweep configuration docs
    'batch_size': {
        'distribution': 'q_log_uniform_values',
        'q':4,
        'min': 16,
        'max': 128
    },
    # Single fixed value, not searched over
    'epochs':{
        'value': 5
    }
}
sweep_config['parameters'] = parameters_dict

In [None]:
# Initializing the sweep; the returned sweep id is what wandb.agent is given later

sweep_id = wandb.sweep(sweep_config, project="CIFAR-Sweep2")

Create sweep with ID: tz3x5p3x
Sweep URL: https://wandb.ai/nzafloris/CIFAR-Sweep2/sweeps/tz3x5p3x


In [None]:
def train_sweep(config = None):
    """Run one sweep trial inside its own WandB run.

    Builds the data loaders, a fresh Network, the optimizer, the loss and the
    gradient scaler from the run's config, trains for config.epochs epochs and
    logs the epoch loss under the key 'loss' after every epoch.

    Args:
        config: optional configuration forwarded to wandb.init.
    """
    with wandb.init(config=config) as run:
        # Name the run after the hyperparameters it uses
        run.name=f"Jeel_{wandb.config.learning_rate}_{wandb.config.batch_size}_{wandb.config.optimizer}"
        config = wandb.config

        # Only the training loader is used during the sweep
        train_loader, _ = build_data(config.batch_size, data_train, data_test)

        model = Network().to(device)
        criterion = nn.CrossEntropyLoss()
        optimizer = get_optim(config.optimizer, config.learning_rate, model)
        scaler = torch.cuda.amp.GradScaler()

        for _ in range(config.epochs):
            model, epoch_loss = train_epoch(model, train_loader, optimizer, criterion, scaler)
            wandb.log({'loss': epoch_loss})

In [None]:
# Running the sweep: the agent is given the sweep id and the train_sweep function.
# NOTE(review): count=2 presumably caps the agent at two runs — confirm against the wandb.agent docs

wandb.agent(sweep_id, train_sweep, count=2)

## Artifact and Model Versioning

Artifacts are a method of managing versions of data and models. You can use artifacts to keep and compare versions of your model during training, which makes it easier to share data and models between team members.

In [None]:
# Hyperparameters for the artifact demo run; the same dict is later passed to wandb.init
run_config = dict(
    model='1-2dcnn',
    optimizer='adam',
    lr=5e-3,
    batch_size=20,
    epochs=5,
)

# Objects that train() reads as notebook-level globals
train_loader, test_loader = build_data(
    batch_size=run_config['batch_size'],
    data_train=data_train,
    data_test=data_test,
)
optimizer = get_optim(
    optimizer=run_config['optimizer'],
    learning_rate=run_config['lr'],
    model=model,
)
criterion = nn.CrossEntropyLoss()
scaler = torch.cuda.amp.GradScaler()

In [None]:
# Start a new WandB run for the artifact demo, named after the model and
# carrying run_config as its config. `run` is used by train() to log the
# artifact and afterwards to retrieve it.
run = wandb.init(
    project="wandb-quickstart",
    job_type="model-training",
    name=run_config['model'],
    config=run_config
    )

In [None]:
train(model,finish= False) # keep the run open (finish=False): run.use_artifact is called on it afterwards

In [None]:
## Retrieving the model

# Getting the latest version of the artifact
artifact = run.use_artifact('{}:latest'.format(run_config['model']))
# Downloading the artifact; the result is used as the directory holding its files
artifact_dir = artifact.download()
# Loading the checkpoint. train() adds the file to the artifact as "Model.pth",
# so it has to be loaded under that same name. map_location moves the tensors
# to the device selected at the top of the notebook.
model_dict = torch.load(os.path.join(artifact_dir, 'Model.pth'), map_location=device)

# Loading weights
model.load_state_dict(model_dict['model_state_dict'])
# Loading optimizer state
optimizer.load_state_dict(model_dict['optimizer_state_dict'])

In [None]:
# Finishing runs: close the run that was kept open by train(model, finish=False)
wandb.finish()