In [1]:
# DDP, Multi-processing example. Distributed Sampler
# https://yangkky.github.io/2019/07/08/distributed-pytorch-tutorial.html

In [2]:
import os
import numpy as np
import torch
from torch import nn
from torch.nn import functional as F
import torchvision.transforms as transforms

In [3]:
class MLPLazy(nn.Module):
    """Multi-layer perceptron: ReLU hidden layers followed by a linear scorer.

    Args:
        nx: Number of input features.
        hidden_layer_dims: Widths of the hidden layers, in order. May be empty,
            in which case the network is a single linear scoring layer.
        ny: Number of output scores per example.
        gpu: Where to place the whole network. An ``int`` is treated as a CUDA
            ordinal, a ``str`` / ``torch.device`` is used as given, and ``None``
            (the default) does not move the parameters at all.
    """

    def __init__(self, nx, hidden_layer_dims, ny, gpu=None):
        super(MLPLazy, self).__init__()
        self.hidden_layer_dims = hidden_layer_dims

        linear_layers = []
        last_dim = nx
        for next_dim in hidden_layer_dims:
            linear_layers.append(nn.Linear(last_dim, next_dim))
            last_dim = next_dim
        # ModuleList registers the layers as sub-modules, so parameters(),
        # .to() and the optimizer all see them (a plain list would not).
        self.linear_layers = nn.ModuleList(linear_layers)

        self.scorer = nn.Linear(last_dim, ny)

        # Move every layer in one go. Previously only the hidden layers were
        # moved, leaving `scorer` behind on a different device.
        if gpu is not None:
            device = torch.device('cuda', gpu) if isinstance(gpu, int) else torch.device(gpu)
            self.to(device)

    def forward(self, X):
        '''
        Compute class scores for a batch.

        Args:
            X: tensor of shape (m, nx)

        Returns:
            (z, a): ``z`` holds the raw scores (logits) and ``a`` the softmax of
            ``z`` over dim 1; both have shape (m, ny).
        '''
        last_X = X
        for linear_layer in self.linear_layers:
            # shape (m, self.hidden_layer_dims[i]) after the i-th layer + ReLU
            last_X = torch.relu(linear_layer(last_X))
        # shape (m, ny)
        z = self.scorer(last_X)
        # shape (m, ny); each row sums to 1
        a = torch.softmax(z, dim=1)
        return z, a

In [4]:
def run_train(model, train_loader, valid_loader, loss_criterion, optimizer, args, gpu):
    '''
    Train ``model`` for ``args.epochs`` epochs and record metrics per epoch.

    Args:
        model: module whose forward returns ``(logits, activations)``.
        train_loader: iterable of dict batches with keys ``'X'`` and ``'y'``.
        valid_loader: same batch format; evaluated after every epoch via
            ``run_pred``.
        loss_criterion: callable ``(logits, y) -> scalar loss tensor``.
        optimizer: optimizer already bound to ``model``'s parameters.
        args: namespace; only ``args.epochs`` is read here.
        gpu: target device. An ``int`` is treated as a CUDA ordinal; a ``str``
            or ``torch.device`` is used as given.

    Returns:
        dict with keys ``'train_losses'``, ``'valid_losses'`` and
        ``'valid_accuracy'``, each a list of Python floats (one per epoch).

    Raises:
        ValueError: if ``train_loader`` yields no batches.
    '''
    device = torch.device('cuda', gpu) if isinstance(gpu, int) else torch.device(gpu)

    history = {
        'train_losses': [],
        'valid_losses': [],
        'valid_accuracy': [],
    }

    for _ in range(args.epochs):

        # train
        model.train()
        sum_batch_losses = torch.zeros(1, dtype=torch.float, device=device)
        num_batches = 0
        for batch_data in train_loader:
            batch_X = batch_data['X'].to(device, non_blocking=True)
            batch_y = batch_data['y'].to(device, non_blocking=True)
            logits, _activations = model(batch_X)
            loss = loss_criterion(logits, batch_y)  # scalar tensor
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # Accumulate a detached copy: adding `loss` itself would attach the
            # running sum to the autograd graph of every batch in the epoch.
            # Still no .item() here, so no host/device synchronisation point.
            sum_batch_losses += loss.detach()
            num_batches += 1
        if num_batches == 0:
            raise ValueError('train_loader yielded no batches')
        # append tensor (shape (1,)); converted to float after the last epoch
        history['train_losses'].append(sum_batch_losses / num_batches)

        # validate
        val_sum_batch_losses, val_sum_batch_accuracies, val_num_batches = run_pred(
            model, valid_loader, loss_criterion, gpu)
        history['valid_losses'].append(val_sum_batch_losses / val_num_batches)
        history['valid_accuracy'].append(val_sum_batch_accuracies / val_num_batches)

    # Use pinned memory in dataloader for faster data transfer
    # do not add synchronization point until training loop has ended
    # https://discuss.pytorch.org/t/should-we-set-non-blocking-to-true/38234/4
    # https://developer.nvidia.com/blog/how-optimize-data-transfers-cuda-cc/
    return {
        name: [tensor_val.item() for tensor_val in values]
        for name, values in history.items()
    }

In [5]:
@torch.no_grad()
def run_pred(model, test_loader, loss_criterion, gpu):
    '''
    Propagate forward on a dev or test set and accumulate loss and accuracy.

    The model is switched to eval mode and is left in eval mode on return.

    Args:
        model: module whose forward returns ``(logits, activations)``.
        test_loader: iterable of dict batches with keys ``'X'`` and ``'y'``.
        loss_criterion: callable ``(logits, y) -> scalar loss tensor``.
        gpu: target device. An ``int`` is treated as a CUDA ordinal; a ``str``
            or ``torch.device`` is used as given.

    Returns:
        ``(sum_batch_losses, sum_batch_accuracies, num_batches)`` where the two
        sums are tensors of shape (1,) on the target device (sum over batches of
        the per-batch loss / per-batch accuracy) and ``num_batches`` is a float.

    Raises:
        ValueError: if ``test_loader`` yields no batches.
    '''
    device = torch.device('cuda', gpu) if isinstance(gpu, int) else torch.device(gpu)

    # evaluate
    model.eval()
    sum_batch_losses = torch.zeros(1, dtype=torch.float, device=device)
    sum_batch_accuracies = torch.zeros(1, dtype=torch.float, device=device)
    num_batches = 0
    for batch_data in test_loader:
        batch_X = batch_data['X'].to(device, non_blocking=True)
        batch_y = batch_data['y'].to(device, non_blocking=True)
        logits, _activations = model(batch_X)
        sum_batch_losses += loss_criterion(logits, batch_y)
        predictions = logits.argmax(dim=1)
        # .float() keeps the result on `device`; casting with torch.FloatTensor
        # would copy it to the CPU and synchronise on every batch.
        sum_batch_accuracies += predictions.eq(batch_y).float().mean()
        num_batches += 1
    if num_batches == 0:
        raise ValueError('test_loader yielded no batches')
    return sum_batch_losses, sum_batch_accuracies, float(num_batches)

In [6]:
from torch.utils.data import Dataset, DataLoader

class ToyDataset(Dataset):
    """Toy dataset read from two ``.npy`` files and held fully in memory."""

    def __init__(self, data_dir):
        """
        Args:
            data_dir (string): Path to the directory containing
                ``features.npy`` and ``labels.npy``.
        """
        # shape (m, nx)
        self.X = np.load(os.path.join(data_dir, 'features.npy'))
        # one label per row of X (the notebook's data prints shape (m,))
        self.y = np.load(os.path.join(data_dir, 'labels.npy'))

    def __len__(self):
        """Number of examples, i.e. rows of ``X``."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return ``{'X': float32 tensor, 'y': int64 tensor}`` for ``idx``.

        ``idx`` may be an int, a list of ints, or a tensor of indices; tensors
        are converted to plain Python indices before indexing the arrays.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()
        # Built unconditionally: the sample used to be created only in the
        # non-tensor branch, so tensor indices failed on an unbound `sample`.
        X = torch.from_numpy(self.X[idx, :]).float()
        y = torch.tensor(self.y[idx], dtype=torch.long)
        return {'X': X, 'y': y}

In [7]:
from datetime import datetime

def main_train(gpu, args):
    """Load the toy dataset, build an ``MLPLazy`` on one GPU and train it.

    Args:
        gpu: CUDA device ordinal to train on.
        args: namespace providing ``seed``, ``data_dir``, ``dataset_dir``,
            ``batch_size``, ``hidden_layer_dims``, ``lr`` and ``epochs``.

    Returns:
        The history dict produced by ``run_train``.
    """
    torch.manual_seed(args.seed)
    # Select the device before anything is allocated on it.
    torch.cuda.set_device(gpu)

    ################################################################
    # load datasets
    dataset_root = os.path.join(args.data_dir, args.dataset_dir)

    training_set = ToyDataset(data_dir=os.path.join(dataset_root, 'train'))
    training_generator = torch.utils.data.DataLoader(dataset=training_set,
                                                     batch_size=args.batch_size,
                                                     shuffle=True,
                                                     num_workers=0,
                                                     pin_memory=True)

    validation_set = ToyDataset(data_dir=os.path.join(dataset_root, 'valid'))
    validation_generator = torch.utils.data.DataLoader(dataset=validation_set,
                                                       batch_size=args.batch_size,
                                                       shuffle=True,
                                                       num_workers=0,
                                                       pin_memory=True)

    nx = training_set.X.shape[1]
    # Labels are class indices, so the class count is the largest label + 1.
    # ndarray.max() is vectorised, and int() hands the layer a plain Python
    # int rather than a NumPy scalar.
    ny = int(training_set.y.max()) + 1

    print('Train set X shape:', training_set.X.shape)
    print('Train set y shape:', training_set.y.shape)
    print('Valid set X shape:', validation_set.X.shape)
    print('Valid set y shape:', validation_set.y.shape)
    ################################################################

    model = MLPLazy(nx, args.hidden_layer_dims, ny, gpu)
    # Ensures every parameter ends up on `gpu`, whatever the constructor did.
    model.to(device=gpu)
    ################################################################
    loss_criterion = nn.CrossEntropyLoss(reduction='mean')
    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr)

    start = datetime.now()
    history = run_train(model, training_generator, validation_generator, loss_criterion, optimizer, args, gpu)

    # Only the process driving GPU 0 reports the wall-clock time.
    if gpu == 0:
        print("Training complete in: " + str(datetime.now() - start))

    return history

In [8]:
from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser

# Declare every setting on the parser so that defaults, types and help text
# live in one place and each value can be overridden by passing an argv list.
parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
parser.add_argument('--data_dir', default='/datadrive',
                    help='root directory that holds the datasets')
parser.add_argument('--dataset_dir', default='toy_mlp_1',
                    help='dataset sub-directory with train/ and valid/ folders')
parser.add_argument('--seed', type=int, default=123,
                    help='seed passed to torch.manual_seed')
parser.add_argument('--batch_size', type=int, default=1000,
                    help='examples per batch')
# https://stackoverflow.com/questions/15753701/how-can-i-pass-a-list-as-a-command-line-argument-with-argparse
parser.add_argument('--hidden_layer_dims', type=int, nargs='+', default=[10, 10],
                    help='widths of the hidden layers, in order')
parser.add_argument('--lr', type=float, default=0.01,
                    help='SGD learning rate')
parser.add_argument('--epochs', type=int, default=20,
                    help='number of training epochs')

# Parse an explicit empty argument list so that the defaults above are used.
args = parser.parse_args([])

In [9]:
!nvidia-smi

Tue Oct 27 04:29:49 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.36.06    Driver Version: 450.36.06    CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           On   | 00006C84:00:00.0 Off |                    0 |
| N/A   69C    P8    30W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla K80           On   | 0000AA21:00:00.0 Off |                    0 |
| N/A   37C    P8    33W / 149W |      0MiB / 11441MiB |      0%      Default |
|       

In [10]:
# Handles for the two GPUs listed in the nvidia-smi output above.
cuda0 = torch.device('cuda:0')
cuda1 = torch.device('cuda:1') 
# Allocator statistics for GPU 0 before any training has run (baseline).
print(torch.cuda.memory_summary(cuda0))

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |       0 B  |       0 B  |       0 B  |       0 B  |
|       from large pool |       0 B  |       0 B  |       0 B  |       0 B  |
|       from small pool |       0 B  |       0 B  |       0 B  |       0 B  |
|---------------------------------------------------------------------------|
| Active memory         |       0 B  |       0 B  |       0 B  |       0 B  |
|       from large pool |       0 B  |       0 B  |       0 B  |       0 B  |
|       from small pool |       0 B  |       0 B  |       0 B  |       0 B  |
|---------------------------------------------------------------

In [11]:
# Train on GPU 0 with the settings in `args`; the returned history dict
# (per-epoch losses and validation accuracy) is displayed as the cell output.
main_train(0, args)

Train set X shape: (9000, 10)
Train set y shape: (9000,)
Valid set X shape: (500, 10)
Valid set y shape: (500,)
Training complete in: 0:00:03.985048


{'train_losses': [1.6582450866699219,
  1.6510987281799316,
  1.644874095916748,
  1.6394009590148926,
  1.634544849395752,
  1.6302032470703125,
  1.6262813806533813,
  1.6227096319198608,
  1.6194416284561157,
  1.6164271831512451,
  1.6136350631713867,
  1.6110327243804932,
  1.608588457107544,
  1.6062806844711304,
  1.6041074991226196,
  1.602038025856018,
  1.600063681602478,
  1.5981789827346802,
  1.596370816230774,
  1.5946385860443115],
 'valid_losses': [1.6564011573791504,
  1.6499884128570557,
  1.6443711519241333,
  1.6393944025039673,
  1.6349437236785889,
  1.630926251411438,
  1.6272802352905273,
  1.6239466667175293,
  1.6208895444869995,
  1.6180613040924072,
  1.6154333353042603,
  1.6129724979400635,
  1.610644817352295,
  1.6084356307983398,
  1.6063393354415894,
  1.6043424606323242,
  1.6024408340454102,
  1.6006133556365967,
  1.5988612174987793,
  1.597172498703003],
 'valid_accuracy': [0.1899999976158142,
  0.1860000044107437,
  0.18199999630451202,
  0.180000

In [12]:
!nvidia-smi

Tue Oct 27 04:29:55 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.36.06    Driver Version: 450.36.06    CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           On   | 00006C84:00:00.0 Off |                    0 |
| N/A   69C    P0    62W / 149W |    281MiB / 11441MiB |      3%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla K80           On   | 0000AA21:00:00.0 Off |                    0 |
| N/A   37C    P8    33W / 149W |      3MiB / 11441MiB |      0%      Default |
|       

In [13]:
# Device handles are re-created here, so this cell can be run on its own.
cuda0 = torch.device('cuda:0')
cuda1 = torch.device('cuda:1') 
# Allocator statistics for GPU 0 after training, to compare with the
# pre-training summary printed earlier in the notebook.
print(torch.cuda.memory_summary(cuda0))

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |       0 B  |  351232 B  |   87133 KB |   87133 KB |
|       from large pool |       0 B  |       0 B  |       0 KB |       0 KB |
|       from small pool |       0 B  |  351232 B  |   87133 KB |   87133 KB |
|---------------------------------------------------------------------------|
| Active memory         |       0 B  |  351232 B  |   87133 KB |   87133 KB |
|       from large pool |       0 B  |       0 B  |       0 KB |       0 KB |
|       from small pool |       0 B  |  351232 B  |   87133 KB |   87133 KB |
|---------------------------------------------------------------

In [14]:
# GPU 1 was not passed to main_train in this notebook; its summary is printed
# to check that training on GPU 0 allocated nothing on the other device.
print(torch.cuda.memory_summary(cuda1))

|                  PyTorch CUDA memory summary, device ID 1                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |       0 B  |       0 B  |       0 B  |       0 B  |
|       from large pool |       0 B  |       0 B  |       0 B  |       0 B  |
|       from small pool |       0 B  |       0 B  |       0 B  |       0 B  |
|---------------------------------------------------------------------------|
| Active memory         |       0 B  |       0 B  |       0 B  |       0 B  |
|       from large pool |       0 B  |       0 B  |       0 B  |       0 B  |
|       from small pool |       0 B  |       0 B  |       0 B  |       0 B  |
|---------------------------------------------------------------