In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0, 1, 2, 3, 4, 5, 6, 7"
import sys
import numpy as np
import subprocess
import torch
import torch.nn as nn
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.utils.data.sampler import SubsetRandomSampler
from torchvision import datasets
from torchvision import transforms as tt
from torchvision import models
from torchinfo import summary

# Resolve the parent of the current working directory and put it first on the
# module search path. Presumably this is where the project-local modules
# imported right after this (dataloading, models, utils) live -- confirm.
main_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.insert(0, main_dir)

from dataloading import data_loader
from models import VGG11
from utils import get_free_gpus

In [2]:
import wandb

# Authenticate with Weights & Biases before a run is created.
wandb.login()

# ---- Hyperparameters --------------------------------------------------------
num_classes = 100             # number of target classes
num_epochs = 25               # passes over the training loader
batch_size = 64
learning_rate = 0.005         # SGD learning rate
ngpu = 4                      # GPUs requested; > 1 enables the multi-GPU path
parallelism = "DataParallel"
weight_decay = 0.005
momentum = 0.9
num_workers = 4               # not referenced by any other visible cell
# Architecture selector. The model-construction cell also recognises
# "vgg11_code", "vgg19" and "resnet152".
model_name = "resnet18"

# Settings recorded with the run (num_workers is not part of the logged config).
run_config = {
    "num_classes": num_classes,
    "num_epochs": num_epochs,
    "batch_size": batch_size,
    "learning_rate": learning_rate,
    "ngpu": ngpu,
    "parallelism": parallelism,
    "weight_decay": weight_decay,
    "momentum": momentum,
    "model": model_name,
}

wandb.init(project="pmp_testing", config=run_config)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33masbjorn-lorenzen[0m ([33mmaagedak[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
# Device configuration: choose the torch.device the model and batches go to.
if ngpu > 1:
    # Multi-GPU request: ask the project helper for `ngpu` GPU indices and use
    # the first one as the primary device.
    selected_gpus = get_free_gpus(ngpu)
    if torch.cuda.is_available():
        device = torch.device("cuda:{}".format(selected_gpus[0]))
    else:
        device = torch.device("cpu")
elif torch.cuda.is_available() and ngpu > 0:
    # Single-GPU request: default CUDA device.
    device = torch.device("cuda")
else:
    # No CUDA, or ngpu <= 0: stay on the CPU.
    device = torch.device("cpu")


[0, 0, 0, 0, 0, 0, 0, 0]
Available GPUs are: [0, 1, 2, 3, 4, 5, 6, 7]
[4, 5, 6, 7]


In [4]:

# Arguments shared by both data_loader calls.
loader_kwargs = dict(data_dir='./data', batch_size=batch_size)

# Without `test`, the project helper's result is unpacked into a training and
# a validation loader.
train_loader, valid_loader = data_loader(**loader_kwargs)

# With test=True the single returned value is used as the test loader.
test_loader = data_loader(**loader_kwargs, test=True)

Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified


In [None]:

# Build the network selected by `model_name`. Every architecture is sized by
# the `num_classes` hyperparameter rather than a hard-coded 100, so changing
# the hyperparameter actually changes the classifier head.
if model_name == "vgg11_code":
    model = VGG11(ngpu, num_classes)
elif model_name == "vgg19":
    model = models.vgg19(progress=True, num_classes=num_classes)
elif model_name == "resnet152":
    model = models.resnet152(num_classes=num_classes)
elif model_name == "resnet18":
    model = models.resnet18(num_classes=num_classes)
else:
    # Fail loudly: otherwise `model` is undefined on a fresh kernel, or -- worse --
    # silently remains whatever an earlier execution of this cell created.
    raise ValueError(
        "Unknown model_name {!r}; expected one of "
        "'vgg11_code', 'vgg19', 'resnet152', 'resnet18'".format(model_name)
    )

# Move parameters and buffers to the primary device *before* any wrapping:
# DataParallel requires them to be on device_ids[0] when forward() runs.
model = model.to(device)

# Handle multi-GPU if desired
if (device.type == 'cuda') and (ngpu > 1) and parallelism == "DataParallel":
    model = nn.DataParallel(model, device_ids=selected_gpus)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(
    model.parameters(),
    lr=learning_rate,
    weight_decay=weight_decay,
    momentum=momentum,
)

# Number of batches per epoch (used by the training progress message).
total_step = len(train_loader)

In [6]:
# Number of batches per epoch; only used in the progress message.
total_step = len(train_loader)

for epoch in range(num_epochs):
    # ---------------- Training ----------------
    total_correct = 0
    total_samples = 0

    model.train()
    for i, (images, labels) in enumerate(train_loader):
        # Move tensors to the configured device
        images = images.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Running train accuracy; detach() keeps this bookkeeping out of autograd.
        _, predicted = torch.max(outputs.detach(), 1)
        total_correct += (predicted == labels).sum().item()
        total_samples += labels.size(0)

    train_accuracy = 100 * total_correct / total_samples
    # NOTE: this is the loss of the final training batch of the epoch, not an
    # epoch average (unchanged from the previous behaviour).
    train_loss = loss.item()

    print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Acc: {} %'
          .format(epoch + 1, num_epochs, i + 1, total_step, train_loss, train_accuracy))

    # ---------------- Validation ----------------
    model.eval()
    with torch.no_grad():
        correct = 0
        total = 0
        val_loss_sum = 0.0
        for images, labels in valid_loader:
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)

            n_batch = labels.size(0)
            # Weight each batch loss by its size so the epoch value is a
            # per-sample average. Assumes `criterion` uses mean reduction
            # (the nn.CrossEntropyLoss default).
            val_loss_sum += criterion(outputs, labels).item() * n_batch

            _, predicted = torch.max(outputs, 1)
            total += n_batch
            correct += (predicted == labels).sum().item()
            del images, labels, outputs

    val_accuracy = 100 * correct / total
    val_loss = val_loss_sum / total

    # Report the number of images actually evaluated instead of a fixed 5000.
    print('Accuracy of the network on the {} validation images: {} %'.format(total, val_accuracy))

    wandb.log(
        {
            "train/accuracy": train_accuracy,
            "train/loss": train_loss,
            "validation/accuracy": val_accuracy,
            # Previously this logged the *training* loss under the validation key.
            "validation/loss": val_loss,
        }
    )

RuntimeError: module must have its parameters and buffers on device cuda:4 (device_ids[0]) but found one of them on device: cpu

In [None]:
# Final evaluation on the test loader.
# Switch to inference mode explicitly so this cell does not depend on the
# training cell having left the model in eval mode.
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        # Predicted class = index of the largest output along dim 1.
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        del images, labels, outputs

    # Report the number of images actually evaluated instead of a fixed 10000.
    print('Accuracy of the network on the {} test images: {} %'.format(total, 100 * correct / total))