In [None]:
import torch
import torch.nn as nn
import torchvision.models as models
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, DistributedSampler
import torch.optim as optim


class ResNet18ModelParallel(nn.Module):
    """ResNet-18 split across two devices (naive model parallelism).

    ``part1`` holds every child of the torchvision ResNet-18 except the last
    two (global average pooling and the fully connected classifier) and lives
    on ``first_device``. ``part2`` holds pooling, flattening and the
    classifier and lives on ``second_device``.

    Args:
        first_device: Device for the convolutional part. Defaults to
            ``'cuda:0'``.
        second_device: Device for the pooling/classifier part. Defaults to
            ``'cuda:1'``.
    """

    def __init__(self, first_device='cuda:0', second_device='cuda:1'):
        super().__init__()
        self.first_device = torch.device(first_device)
        self.second_device = torch.device(second_device)

        original_model = models.resnet18(pretrained=True)
        children = list(original_model.children())

        # Everything before the final (avgpool, fc) pair goes on the first device.
        self.part1 = nn.Sequential(*children[:-2]).to(self.first_device)

        # The last two children are the model's own average pooling and its
        # fully connected layer. Flatten must sit *between* them: the pooling
        # layer needs the 4-D feature map, the linear layer needs 2-D input.
        # (Pooling again after flattening fails, because adaptive average
        # pooling rejects 2-D tensors.)
        avgpool, fc = children[-2:]
        self.part2 = nn.Sequential(
            avgpool,
            nn.Flatten(),
            fc,
        ).to(self.second_device)

    def forward(self, x):
        """Run ``x`` through both parts, moving it between devices.

        Args:
            x: Input batch; it is moved to ``first_device`` before use.

        Returns:
            The output of ``part2``, located on ``second_device``.
        """
        x = self.part1(x.to(self.first_device))
        # Hand the intermediate activations over to the second device.
        x = x.to(self.second_device)
        return self.part2(x)

def main():
    """Train the two-GPU ResNet-18 on the CIFAR-10 training split.

    Builds the preprocessing pipeline and data loader, creates the
    model-parallel network, and runs 10 epochs of SGD, printing the loss
    after every batch.

    Raises:
        RuntimeError: If fewer than two CUDA devices are visible.
    """
    # Fail fast with a clear message instead of a low-level CUDA error that
    # would otherwise only appear after the dataset has been downloaded.
    gpu_count = torch.cuda.device_count()
    if gpu_count < 2:
        raise RuntimeError(
            'This script needs at least 2 CUDA GPUs (cuda:0 and cuda:1); '
            f'found {gpu_count}.'
        )

    transform = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    train_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=4, pin_memory=True)

    model = ResNet18ModelParallel()
    # NOTE(review): the model keeps the pretrained ResNet-18 classifier head,
    # which presumably emits 1000 ImageNet logits while CIFAR-10 has 10
    # classes. Training still runs, but consider replacing the final layer.

    criterion = nn.CrossEntropyLoss().to('cuda:1')  # Loss is computed where the outputs live
    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

    model.train()
    for epoch in range(10):
        for inputs, labels in train_loader:
            # The loader yields pinned host tensors (pin_memory=True), so the
            # copies can be asynchronous with respect to the host.
            inputs = inputs.to('cuda:0', non_blocking=True)
            labels = labels.to('cuda:1', non_blocking=True)

            optimizer.zero_grad()

            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            print(f'Loss: {loss.item()}')

    print('Finished Training')

# Run the training loop only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()


`cuda:0`: The first part of the ResNet18 model — the convolutional backbone, i.e. every layer before the final average pooling — is placed on the first GPU. This device runs the initial stages of the model's forward pass.  

`cuda:1`: The second part of the model — the adaptive average pooling, the flattening step, and the fully connected layer — is placed on the second GPU. This device completes the forward pass and computes the loss; backpropagation starts here and then continues back through the layers on `cuda:0`.