In [2]:
import time
import torch
import torchvision
import model as baseline
 
from torchvision import transforms
 
def get_train_loader(image_size, batch_size, num_worker):
    """Build the DataLoader that feeds the distributed training loop.

    Images are read with ``ImageFolder``, run through a random-augmentation
    pipeline, and partitioned with a ``DistributedSampler``. The sampler is
    created without an explicit rank / replica count; in this file the
    function is called from ``main_worker`` after the process group has been
    initialised.

    Args:
        image_size: Target size handed to ``RandomResizedCrop``.
        batch_size: Batch size of the returned loader.
        num_worker: Number of DataLoader worker processes.

    Returns:
        A ``torch.utils.data.DataLoader`` over the augmented image folder.
    """
    augmentation_steps = [
        transforms.RandomResizedCrop(image_size),
        transforms.RandomHorizontalFlip(),
        transforms.RandomVerticalFlip(),
        transforms.RandomRotation(45),
        transforms.RandomAffine(45),
        transforms.ColorJitter(),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
    pipeline = transforms.Compose(augmentation_steps)

    # NOTE(review): the training loader reads from 'data/test' — confirm this
    # directory is the intended training set.
    image_folder = torchvision.datasets.ImageFolder(
        root='data/test',
        transform=pipeline,
    )
    dist_sampler = torch.utils.data.distributed.DistributedSampler(image_folder)

    # shuffle stays False because ordering is delegated to the sampler.
    return torch.utils.data.DataLoader(
        dataset=image_folder,
        batch_size=batch_size,
        shuffle=False,
        sampler=dist_sampler,
        num_workers=num_worker,
        pin_memory=True,
    )

ModuleNotFoundError: No module named 'model'

In [None]:
def main():
    """Launch one ``main_worker`` process per visible CUDA device.

    Each spawned process receives its process index as the first argument
    (supplied by ``torch.multiprocessing.spawn``) and the total GPU count as
    the second.

    Raises:
        RuntimeError: If no CUDA device is visible. Without this check
            ``spawn`` would be asked to start zero processes and the call
            would return without training anything.
    """
    ngpus_per_node = torch.cuda.device_count()
    if ngpus_per_node < 1:
        raise RuntimeError(
            'No CUDA devices found; distributed training requires at least one GPU.')

    torch.multiprocessing.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, ))


In [None]:
def main_worker(gpu, ngpus_per_node):
    """Run the training loop for one process of the distributed job.

    Args:
        gpu: Index of this process. It is used both as the CUDA device index
            and as the rank passed to ``init_process_group``.
        ngpus_per_node: Total number of processes. It is used as the world
            size and to split the global batch size and worker count.
    """
    image_size = 224
    batch_size = 512
    num_worker = 8
    epochs = 1

    # Divide the global batch size / worker budget evenly between processes.
    batch_size = batch_size // ngpus_per_node
    num_worker = num_worker // ngpus_per_node

    torch.distributed.init_process_group(
            backend='nccl',
            init_method='tcp://127.0.0.1:3456',
            world_size=ngpus_per_node,
            rank=gpu)
    try:
        model = baseline.ResnetModel()
        torch.cuda.set_device(gpu)
        model = model.cuda(gpu)
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu])

        train_loader = get_train_loader(
            image_size=image_size,
            batch_size=batch_size,
            num_worker=num_worker)

        optimizer = torch.optim.SGD(
            params=model.parameters(),
            lr=0.001,
            momentum=0.9)
        criterion = torch.nn.CrossEntropyLoss().to(gpu)

        model.train()
        for epoch in range(epochs):
            # DistributedSampler derives its shuffle order from the epoch
            # number; without this call every epoch reuses the same ordering.
            train_loader.sampler.set_epoch(epoch)

            start_time = time.time()
            for j, (images, labels) in enumerate(train_loader):
                images, labels = images.to(gpu), labels.to(gpu)

                optimizer.zero_grad()
                # The model is unpacked as a 3-tuple; only the first element
                # (logits) is used for the loss.
                logits, _, _ = model(images)
                loss = criterion(logits, labels)
                loss.backward()
                optimizer.step()

                print(f'epoch : {epoch} | step : {j} / {len(train_loader)} | mp : {gpu}')
            end_time = time.time()
            print('total time :', end_time - start_time)
    finally:
        # Release the process group even if training raised.
        torch.distributed.destroy_process_group()

---