In [None]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0, 1, 2, 3, 4, 5, 6, 7"
import numpy as np
import subprocess
import torch
import torch.nn as nn
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.utils.data.sampler import SubsetRandomSampler
from torchvision import datasets
from torchvision import transforms as tt
from torchvision import models
from torchinfo import summary

In [None]:
import wandb

wandb.login()

# Hyperparameters for this run. They stay as module-level names because the
# later cells (data loading, optimizer, training loop) read them directly.
num_classes = 100
num_epochs = 25
batch_size = 16

# optimizer settings
learning_rate = 0.005
momentum = 0.9
weight_decay = 0.005

# hardware / parallelism settings recorded with the run
ngpu = 4
parallelism = "DataParallel"

# Start the tracking run and attach every hyperparameter to it.
wandb.init(
    project="pmp_testing",
    config=dict(
        num_classes=num_classes,
        num_epochs=num_epochs,
        batch_size=batch_size,
        learning_rate=learning_rate,
        ngpu=ngpu,
        parallelism=parallelism,
        weight_decay=weight_decay,
        momentum=momentum,
    ),
)

In [None]:
# Use the GPU when one is usable; otherwise fall back to the CPU so that the
# `.to(device)` calls in later cells do not crash on a machine without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
def get_free_gpus(threshold=10):
    """Return the indices of GPUs whose utilization is below ``threshold``.

    Runs ``nvidia-smi`` to read the utilization percentage of every GPU and
    prints the raw list before filtering it.

    Args:
        threshold: Utilization percentage. A GPU counts as free when its
            utilization is strictly below this value.

    Returns:
        A list of GPU indices, in the order ``nvidia-smi`` reports them.
        The list is empty when the query fails; the error is printed rather
        than raised so the notebook keeps running.
    """
    try:
        result = subprocess.run(
            ['nvidia-smi', '--query-gpu=utilization.gpu', '--format=csv,noheader,nounits'],
            stdout=subprocess.PIPE,
            text=True,
            check=True,
        )

        # One integer per non-empty output line.
        utilizations = [int(line.strip()) for line in result.stdout.splitlines() if line.strip()]
        print(utilizations)

        # Honour the caller's threshold (it was previously ignored in favour of a literal 10).
        return [i for i, util in enumerate(utilizations) if util < threshold]

    except Exception as e:
        # Include the exception type: some exceptions have an empty message,
        # which made the printed line uninformative.
        print(f"something went wrong getting free gpus: {type(e).__name__}: {e}")
        # Always return a list so callers can slice or iterate the result.
        return []


# Pick the GPUs to work with: the last `ngpu` entries of the idle list.
free_gpus = get_free_gpus()
# Always defined, so later code can reference it even when no GPU is idle.
selected_gpus = []
if free_gpus:
    print(f"Available GPUs are: {free_gpus}")
    # Guard ngpu <= 0: free_gpus[-0:] would select every GPU instead of none.
    selected_gpus = free_gpus[-ngpu:] if ngpu > 0 else []
    print(selected_gpus)

    for gpu_idx in selected_gpus:
        # torch.cuda.device(...) is a context manager, so printing it only shows
        # an object repr; print the device string and its properties instead.
        print(f"cuda:{gpu_idx}")
        print(torch.cuda.get_device_properties(gpu_idx))

# device = torch.device(f"cuda:{selected_gpus[0]}" if (torch.cuda.is_available() and ngpu > 0) else "cpu")


something went wrong getting free gpus: 


In [None]:
# from https://pytorch.org/tutorials/intermediate/dist_tuto.html

def run(rank, size):
    """Placeholder for the per-process distributed workload; does nothing yet."""
    return None

def init_process(rank, size, fn, backend='gloo'):
    """Initialize the distributed environment, run ``fn``, then tear it down.

    Args:
        rank: Rank of the calling process within the group.
        size: Total number of processes in the group (world size).
        fn: Callable invoked as ``fn(rank, size)`` once the group is ready.
        backend: Name of the ``torch.distributed`` backend to use.
    """
    # Rendezvous address shared by all processes on this machine.
    os.environ['MASTER_ADDR'] = '127.0.0.1'
    os.environ['MASTER_PORT'] = '29500'
    dist.init_process_group(backend, rank=rank, world_size=size)
    try:
        fn(rank, size)
    finally:
        # Release the process group even when fn raises, so its resources are
        # not left dangling for the lifetime of the process.
        dist.destroy_process_group()


# Launch one worker process per GPU and wait for all of them to finish.
world_size = ngpu
processes = []
# force=True keeps this cell re-runnable: without it, a second call in the same
# kernel raises RuntimeError because the start method has already been set.
mp.set_start_method("spawn", force=True)
for rank in range(world_size):
    p = mp.Process(target=init_process, args=(rank, world_size, run))
    p.start()
    processes.append(p)

for p in processes:
    p.join()

# A crashed worker only shows up in its exit code, so report any non-zero one
# instead of letting the failure pass silently.
failed_workers = [(rank, p.exitcode) for rank, p in enumerate(processes) if p.exitcode != 0]
if failed_workers:
    print(f"distributed workers exited with errors (rank, exitcode): {failed_workers}")

In [None]:
def data_loader(data_dir,
                batch_size,
                random_seed=42,
                valid_size=0.1,
                shuffle=True,
                test=False):
    """Build CIFAR-100 data loaders.

    Args:
        data_dir: Directory the dataset is stored in (downloaded there if missing).
        batch_size: Number of samples per batch for every returned loader.
        random_seed: Seed used to shuffle the train/validation index split.
        valid_size: Fraction of the training set held out for validation.
        shuffle: For the train/validation case, whether the indices are shuffled
            before splitting; for the test case, passed to the test ``DataLoader``.
        test: When True, return only the test loader.

    Returns:
        The test ``DataLoader`` when ``test`` is True, otherwise a
        ``(train_loader, valid_loader)`` tuple.
    """
    normalize = tt.Normalize(
        mean=[0.4914, 0.4822, 0.4465],
        std=[0.2023, 0.1994, 0.2010],
    )

    # Random augmentation is applied to training samples only.
    train_transform = tt.Compose([
        tt.RandomCrop(32, padding=4, padding_mode='reflect'),
        tt.RandomHorizontalFlip(),
        tt.Resize((224, 224)),
        tt.ToTensor(),
        normalize,
    ])

    # Validation and test images get the deterministic part of the pipeline, so
    # the reported accuracy is not perturbed by random crops and flips.
    eval_transform = tt.Compose([
        tt.Resize((224, 224)),
        tt.ToTensor(),
        normalize,
    ])

    if test:
        test_dataset = datasets.CIFAR100(
            root=data_dir, train=False,
            download=True, transform=eval_transform,
        )

        # Named test_loader so the local does not shadow this function's own name.
        test_loader = torch.utils.data.DataLoader(
            test_dataset, batch_size=batch_size, shuffle=shuffle
        )

        return test_loader

    # Two views of the same training split: augmented for training, plain for validation.
    train_dataset = datasets.CIFAR100(
        root=data_dir, train=True, download=True, transform=train_transform)

    valid_dataset = datasets.CIFAR100(
        root=data_dir, train=True, download=True, transform=eval_transform)

    num_train = len(train_dataset)
    indices = list(range(num_train))
    split = int(np.floor(valid_size * num_train))

    if shuffle:
        # A private RandomState yields the same permutation as seeding the global
        # NumPy generator, without resetting random state used elsewhere.
        np.random.RandomState(random_seed).shuffle(indices)

    # The first `split` indices form the validation set, the rest the training set.
    train_idx, valid_idx = indices[split:], indices[:split]
    train_sampler = SubsetRandomSampler(train_idx)
    valid_sampler = SubsetRandomSampler(valid_idx)

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, sampler=train_sampler, pin_memory=True)

    valid_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=batch_size, sampler=valid_sampler, pin_memory=True)

    return (train_loader, valid_loader)


# CIFAR-100 dataset: train/validation loaders from the training split,
# plus a separate loader for the held-out test split.
train_loader, valid_loader = data_loader(data_dir='./data',
                                         batch_size=batch_size)

test_loader = data_loader(data_dir='./data',
                          batch_size=batch_size,
                          test=True)

Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified


First, we build the VGG-19 model and set up the loss function and optimizer.

In [None]:
# Build VGG-19 (no `weights` argument is passed) with one output per class.
# The head size comes from the `num_classes` hyperparameter so that it stays in
# sync with the configuration logged for the run.
model = models.vgg19(progress=True, num_classes=num_classes)

# NOTE(review): the logged config sets parallelism="DataParallel" and ngpu=4, but
# the model is moved to a single device and never wrapped in a parallel module.
# Confirm which behaviour is intended.
model.to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(
    model.parameters(),
    lr=learning_rate,
    weight_decay=weight_decay,
    momentum=momentum,
)

# Number of batches per training epoch; used in the progress message.
total_step = len(train_loader)

In [None]:
def _evaluate(model, loader, device):
    """Count correct top-1 predictions of ``model`` over ``loader``.

    The model is put in evaluation mode for the duration of the pass (so layers
    such as dropout use their inference behaviour) and its previous mode is
    restored afterwards. Gradients are not tracked.

    Returns:
        A ``(correct, total)`` tuple of integer counts.
    """
    was_training = model.training
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in loader:
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)
            # Index of the largest logit per sample is the predicted class.
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    model.train(was_training)
    return correct, total


for epoch in range(num_epochs):
    # Make sure training-only behaviour is active for the optimisation pass.
    model.train()
    for i, (images, labels) in enumerate(train_loader):
        # Move tensors to the configured device
        images = images.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Reports the loss of the last batch of the epoch.
    print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                   .format(epoch+1, num_epochs, i+1, total_step, loss.item()))

    # Validation: the image count in the message is the number actually evaluated.
    correct, total = _evaluate(model, valid_loader, device)
    print('Accuracy of the network on the {} validation images: {} %'.format(total, 100 * correct / total))

# Final evaluation on the held-out test set.
correct, total = _evaluate(model, test_loader, device)
print('Accuracy of the network on the {} test images: {} %'.format(total, 100 * correct / total))

Epoch [1/25], Step [704/704], Loss: 4.3102
Accuracy of the network on the 5000 validation images: 2.92 %
Epoch [2/25], Step [704/704], Loss: 4.5064
Accuracy of the network on the 5000 validation images: 4.44 %
Epoch [3/25], Step [704/704], Loss: 4.2271
Accuracy of the network on the 5000 validation images: 6.02 %
Epoch [4/25], Step [704/704], Loss: 3.2166
Accuracy of the network on the 5000 validation images: 11.36 %
Epoch [5/25], Step [704/704], Loss: 3.7912
Accuracy of the network on the 5000 validation images: 14.88 %
Epoch [6/25], Step [704/704], Loss: 2.1552
Accuracy of the network on the 5000 validation images: 17.56 %
Epoch [7/25], Step [704/704], Loss: 3.4424
Accuracy of the network on the 5000 validation images: 20.84 %
Epoch [8/25], Step [704/704], Loss: 3.5063
Accuracy of the network on the 5000 validation images: 18.32 %
Epoch [9/25], Step [704/704], Loss: 2.1956
Accuracy of the network on the 5000 validation images: 22.6 %
Epoch [10/25], Step [704/704], Loss: 2.7161
Accura