In [34]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0, 1, 2, 3, 4, 5, 6, 7"
import sys
import numpy as np
import subprocess
import torch
import torch.nn as nn
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.utils.data.sampler import SubsetRandomSampler
from torchvision import datasets
from torchvision import transforms as tt
from torchvision import models
from torchinfo import summary

main_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))
sys.path.insert(0, main_dir)
torch.set_num_threads(4)
from dataloading import data_loader
from models import VGG11
from utils import get_free_gpus

In [None]:
import wandb

wandb.login()

# define hyperparameters
num_classes = 100
num_epochs = 25
batch_size = 64
learning_rate = 0.005
ngpu = 4
# parallelism = "DataParallel"
parallelism = "Sync_pmp"
weight_decay = 0.005
momentum = 0.9
num_workers = 4
model_name="vgg11_code"
# model_name="resnet18"

wandb.init(
    project="pmp_testing",
    config={
        "num_classes": num_classes,
        "num_epochs": num_epochs,
        "batch_size": batch_size,
        "learning_rate": learning_rate,
        "ngpu": ngpu,
        "parallelism": parallelism,
        "weight_decay": weight_decay,
        "momentum": momentum,
        "model": model_name
    },
)

os.environ["WANDB_PROJECT"] = wandb.run.project
os.environ["WANDB_RUN_ID"] = wandb.run.id



In [43]:
# Device configuration
# Decide which device we want to run on
if ngpu > 1:
    selected_gpus = get_free_gpus(ngpu)
    device = torch.device(f"cuda:{selected_gpus[0]}" if (torch.cuda.is_available()) else "cpu")
else: 
    device = torch.device(f"cuda" if (torch.cuda.is_available() and ngpu > 0) else "cpu")


[0, 0, 0, 0, 0, 0, 0, 0]
Available GPUs are: [0, 1, 2, 3, 4, 5, 6, 7]
[4, 5, 6, 7]


In [44]:

train_loader, valid_loader = data_loader(data_dir='./data',
                                         batch_size=batch_size,
                                         num_workers=num_workers)

test_loader = data_loader(data_dir='./data',
                              batch_size=batch_size,
                              num_workers=num_workers,
                              test=True)

Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified


In [None]:

if model_name == "vgg11_code":
    model = models.vgg11(progress=True, num_classes=100)#.to(device)
    # model = VGG11(ngpu,num_classes) #.to(device)

# if model_name == "vgg19":
#     model = models.vgg19(progress=True, num_classes=100)#.to(device)

# if model_name == "resnet152":
#     model = models.resnet152(num_classes=100)

# if model_name == "resnet18":
#     model = models.resnet18(num_classes=100)

# Handle multi-GPU if desired
if (device.type == 'cuda') and (ngpu > 1) and parallelism == "DataParallel":
    model = nn.DataParallel(model, selected_gpus).to(device)
else:
    model.to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, weight_decay=weight_decay, momentum=momentum)  


# Train the model
total_step = len(train_loader)

In [None]:
total_step = len(train_loader)

for epoch in range(num_epochs):
    total_correct = 0
    total_samples = 0

    model.train()
    for i, (images, labels) in enumerate(train_loader):  
        # Move tensors to the configured device
        images = images.to(device)
        labels = labels.to(device)
        
        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Get train accuracy
        _, predicted = torch.max(outputs.data, 1)
        total_correct += (predicted == labels).sum().item()
        total_samples += labels.size(0)

    print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Acc: {} %' 
                   .format(epoch+1, num_epochs, i+1, total_step, loss.item(), 100*total_correct/total_samples))
            
    model.eval()
    # Validation
    with torch.no_grad():
        correct = 0
        total = 0
        for images, labels in valid_loader:
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)
            val_loss = criterion(outputs, labels)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            del images, labels, outputs
    
        print('Accuracy of the network on the {} validation images: {} %'.format(5000, 100 * correct / total)) 
    
    wandb.log(
        {
            "train/accuracy": 100 * total_correct/total_samples,
            "train/loss": loss.item(),
            "validation/accuracy": 100 * correct / total,
            "validation/loss": loss.item(),
        }
    )

Epoch [1/25], Step [704/704], Loss: 3.7034, Acc: 2.2288888888888887 %
Accuracy of the network on the 5000 validation images: 4.62 %
Epoch [2/25], Step [704/704], Loss: 4.1296, Acc: 6.066666666666666 %
Accuracy of the network on the 5000 validation images: 7.04 %
Epoch [3/25], Step [704/704], Loss: 4.1430, Acc: 8.733333333333333 %


KeyboardInterrupt: 

In [37]:
with torch.no_grad():
    correct = 0 
    total = 0
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        del images, labels, outputs

    print('Accuracy of the network on the {} test images: {} %'.format(10000, 100 * correct / total))   

KeyboardInterrupt: 

In [47]:
named_layers = dict(model.named_modules())
print(named_layers)
print(model.layers)

{'': VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): ReLU(inplace=True)
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU(inplace=True)
    (8): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU(inplace=True)
    (10): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (11): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (12): ReLU(inplace=True)
    (13): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (14): ReLU(inplace=True)
    (15): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=Fal

AttributeError: 'VGG' object has no attribute 'layers'

In [None]:
# assume model is vgg11

def get_model_stages(model, stage_index):
    if stage_index == 0:
        model_stage = nn.Sequential(*list(model.features.children())[:6])
    if stage_index == 1:
        model_stage = nn.Sequential(*list(model.features.children())[6:11])
    if stage_index == 2:
        model_stage = nn.Sequential(*list(model.features.children())[11:])
    if stage_index == 3:
        model_stage = nn.Sequential(model.avgpool, *list(model.classifier.children()))
    
    print(f"Model stage {stage_index} contains:")
    for m in model_stage.children():
        print(m)

def manual_model_split(model):
    selected_gpus = get_free_gpus(ngpu)
    devices = [torch.device(f"cuda:{selected_gpus[i]}") for i in selected_gpus]
    print(f"Devices: {devices}")
    stage_0 = get_model_stages(model, 0).to(devices[0])
    stage_1 = get_model_stages(model, 1).to(devices[1])
    stage_2 = get_model_stages(model, 2).to(devices[2])
    stage_3 = get_model_stages(model, 3).to(devices[3])
    


In [58]:
new_stage = manual_model_split(model, 0)

Model stage 0 contains:
Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
ReLU(inplace=True)
MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
ReLU(inplace=True)
MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)


In [53]:
[print(m) for m in model.features.children()]

Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
ReLU(inplace=True)
MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
ReLU(inplace=True)
MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
ReLU(inplace=True)
Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
ReLU(inplace=True)
MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
ReLU(inplace=True)
Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
ReLU(inplace=True)
MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
ReLU(inplace=True)
Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
ReLU(inplace=True)
MaxPool2

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [51]:
l = [module for module in model.modules() if not isinstance(module, nn.Sequential)]
print(l)

[VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): ReLU(inplace=True)
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU(inplace=True)
    (8): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU(inplace=True)
    (10): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (11): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (12): ReLU(inplace=True)
    (13): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (14): ReLU(inplace=True)
    (15): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
