In [1]:
import sys
sys.path.append("../")

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.autograd.profiler as profiler

import os
import numpy as np
from sklearn.datasets import make_classification
from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser

from utils.moduleCodeProfiler import rankByCriteria

In [2]:
!nvidia-smi

Sun Nov 22 17:34:45 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.32.00    Driver Version: 455.32.00    CUDA Version: 11.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           On   | 000047DD:00:00.0 Off |                    0 |
| N/A   64C    P8    29W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

In [3]:
cuda0 = torch.device('cuda:0') 

In [4]:
parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
args = parser.parse_args('')

args.data_dir = '~/datadrive'
args.dataset_dir = 'toy_mlp_1'
args.seed = 123
args.batch_size = 1000
# https://stackoverflow.com/questions/15753701/how-can-i-pass-a-list-as-a-command-line-argument-with-argparse
args.hidden_layer_dims = [10, 10, 10, 10]
args.lr = 0.01
args.epochs = 20

## Toy Data Generation

In [5]:
# construct and save toydataset

m_train = 9000
m_total = m_train

X, y = make_classification(n_samples=m_total, n_features=10, n_informative=10, n_redundant=0, n_repeated=0, n_classes=5, n_clusters_per_class=2, weights=None, flip_y=0.01, class_sep=1.0, hypercube=True, shift=0.0, scale=1.0, shuffle=True, random_state=args.seed)
# y = np.expand_dims(y, -1)

np.random.seed(args.seed)
permutation = np.random.permutation(m_total)
print('First 10 training indices', permutation[:10])
print('X shape', X.shape)
print('y shape', y.shape)

train_indices = permutation[0:m_train]

dataset_dir = 'toy_mlp_1'
os.makedirs(os.path.join(args.data_dir, dataset_dir, 'train'), mode = 0o777, exist_ok = True) 

np.save(os.path.join(args.data_dir, dataset_dir, 'train', 'features.npy'), X[train_indices])
np.save(os.path.join(args.data_dir, dataset_dir, 'train', 'labels.npy'), y[train_indices])

First 10 training indices [1603 8472 2213  498 1038 8399 3324 7535 1519 1959]
X shape (9000, 10)
y shape (9000,)


In [6]:
class ToyDataset(Dataset):
    """Toy dataset construction."""

    def __init__(self, data_dir):
        """
        Args:
            data_dir (string): Path to the directory with data files.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        # shape (m, nx)
        self.X = np.load(os.path.join(data_dir, 'features.npy'))
        # shape (m, ny=1)
        self.y = np.load(os.path.join(data_dir, 'labels.npy'))
        

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        else:
            X = torch.from_numpy(self.X[idx, :]).type(torch.FloatTensor)
            y = torch.tensor(self.y[idx], dtype=torch.long)
#             y = torch.from_numpy(self.y[idx, :]).type(torch.FloatTensor)
            sample = {'X': X, 'y': y}

        return sample

## Model

In [7]:
class MLPLazy(nn.Module):

    def __init__(self, nx, hidden_layer_dims, ny):
        super(MLPLazy, self).__init__()
        self.hidden_layer_dims = hidden_layer_dims
        
        linear_layers = []
        last_dim = nx
        for next_dim in hidden_layer_dims:
            linear_layer = nn.Linear(last_dim, next_dim)
            linear_layers.append(linear_layer)
            last_dim = next_dim
        # should push to ModuleList so that params stay on cuda
        self.linear_layers = nn.ModuleList(linear_layers)
        self.scorer = nn.Linear(last_dim, ny)

    def forward(self, X):
        '''
        X has shape (m, nx)
        '''
        last_X = X
        for i, linear_layer in enumerate(self.linear_layers):
            # shape (m, self.hidden_layer_dims[i])
            last_X = linear_layer(last_X)
            # shape (m, self.hidden_layer_dims[i])
            last_X = torch.relu(last_X)
        # shape (m, ny)
        z = self.scorer(last_X)
        # shape (m, ny)
        a = torch.softmax(z, dim=1)
        return z, a

## Worflow

In [8]:
def check_weights_precision(model):
    '''specific to checking MLP'''
    for i, layer in enumerate(model.linear_layers):
        print(f'layer {i}, weight dtype {layer.weight.dtype}')
        print(f'layer {i}, bias dtype {layer.bias.dtype}')
    print(f'scorer weight dtype {model.scorer.weight.dtype}')
    print(f'scorer bias dtype {model.scorer.bias.dtype}')

In [9]:
def get_max_memory_alloc():
    devices_max_memory_alloc = {}
    for i in range(torch.cuda.device_count()):
        device = torch.device(f'cuda:{i}')
        devices_max_memory_alloc[device] = torch.cuda.max_memory_allocated(device) / 1e6
        torch.cuda.reset_max_memory_allocated(device)
    return devices_max_memory_alloc

In [10]:
def main_train(args, gpu=0, debug=False):

    torch.manual_seed(args.seed)
    
    ################################################################
    # load datasets
    training_set = ToyDataset(data_dir=os.path.join(args.data_dir, args.dataset_dir, 'train'))
    training_generator = torch.utils.data.DataLoader(dataset=training_set, 
                                                        batch_size=args.batch_size, 
                                                        shuffle=True, 
                                                        num_workers=0, 
                                                        pin_memory=True)

    nx = training_set.X.shape[1]
    ny = max(training_set.y) + 1
    ################################################################

    model = MLPLazy(nx, args.hidden_layer_dims, ny)  # single
    loss_criterion = nn.CrossEntropyLoss(reduction='mean')
    torch.cuda.set_device(gpu)
    model.to(device=gpu)    

    opt = torch.optim.SGD(model.parameters(), lr=args.lr)  # half
    if debug:
        print('\nmodel weights at init')
        check_weights_precision(model)

    history = {'train_losses': [], 'max_memory_allocation':[]}

    for e in range(2):
        model.train()
        sum_batch_losses = torch.tensor([0.], dtype=torch.float, device=gpu)
        batch_max_memory_alloc = []
        for batch_i, batch_data in enumerate(training_generator):
            batch_max_memory_alloc.append(get_max_memory_alloc())

            batch_X = batch_data['X'].cuda(gpu, non_blocking=True) # single
            batch_y = batch_data['y'].cuda(gpu, non_blocking=True) # long
            logits, activations = model(batch_X) # single

            loss = loss_criterion(logits, batch_y)  # single

            opt.zero_grad()
            loss.backward()  # single
            opt.step()
                            
            sum_batch_losses += loss

        num_batches = batch_i + 1.
        history['train_losses'].append(sum_batch_losses/num_batches)
        history['max_memory_allocation'] += batch_max_memory_alloc
    
    itemize = lambda x: [tensor_val.item() for tensor_val in x]
    history['train_losses'] = itemize(history['train_losses'])    

    return history, model

# Train

In [11]:
!nvidia-smi

Sun Nov 22 17:34:48 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.32.00    Driver Version: 455.32.00    CUDA Version: 11.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           On   | 000047DD:00:00.0 Off |                    0 |
| N/A   63C    P8    29W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

In [12]:
with profiler.profile(profile_memory=True, record_shapes=True, use_cuda=False, with_stack=True) as prof:
    with profiler.record_function("forward"):
        history, model = main_train(args, debug=True)


model weights at init
layer 0, weight dtype torch.float32
layer 0, bias dtype torch.float32
layer 1, weight dtype torch.float32
layer 1, bias dtype torch.float32
layer 2, weight dtype torch.float32
layer 2, bias dtype torch.float32
layer 3, weight dtype torch.float32
layer 3, bias dtype torch.float32
scorer weight dtype torch.float32
scorer bias dtype torch.float32




# Profiler Results

In [13]:
rankByCriteria(prof, model, criteria='cuda_memory_usage', per_thread=False, per_inp_shapes=False, include_external=False)

Ranked by cuda_memory_usage

43.98 Mb
##############################################
model, aten::empty, forward, (26) last_X = torch.relu(last_X)
5.55 Mb
##############################################
model, aten::addmm, forward, (24) last_X = linear_layer(last_X)
2.78 Mb
##############################################
model.scorer, aten::addmm, forward, (93) return F.linear(input, self.weight, self.bias)
model, aten::addmm, forward, (24) last_X = linear_layer(last_X)
2.78 Mb
##############################################
model, aten::resize_, forward, (24) last_X = linear_layer(last_X)
2.78 Mb
##############################################
model.scorer, aten::resize_, forward, (93) return F.linear(input, self.weight, self.bias)
model, aten::resize_, forward, (24) last_X = linear_layer(last_X)
2.78 Mb
##############################################
model, aten::relu, forward, (26) last_X = torch.relu(last_X)
2.78 Mb
##############################################
model, aten::threshold, 

# Training Results

In [13]:
history

{'train_losses': [1.613153338432312,
  1.6128677129745483,
  1.6125847101211548,
  1.6123082637786865,
  1.6120353937149048,
  1.611763834953308,
  1.6114978790283203,
  1.6112381219863892,
  1.6109875440597534,
  1.6107460260391235,
  1.6105140447616577,
  1.6102869510650635,
  1.610068917274475,
  1.6098575592041016,
  1.6096538305282593,
  1.609453797340393,
  1.6092605590820312,
  1.6090681552886963,
  1.6088788509368896,
  1.6086915731430054],
 'max_memory_allocation': [{device(type='cuda', index=0): 0.005632},
  {device(type='cuda', index=0): 0.481792},
  {device(type='cuda', index=0): 0.486912},
  {device(type='cuda', index=0): 0.486912},
  {device(type='cuda', index=0): 0.486912},
  {device(type='cuda', index=0): 0.486912},
  {device(type='cuda', index=0): 0.486912},
  {device(type='cuda', index=0): 0.486912},
  {device(type='cuda', index=0): 0.486912},
  {device(type='cuda', index=0): 0.486912},
  {device(type='cuda', index=0): 0.487424},
  {device(type='cuda', index=0): 0.487

# Nvidia Results

In [14]:
# Ran without profiler
!nvidia-smi

Sun Nov 22 16:31:42 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.32.00    Driver Version: 455.32.00    CUDA Version: 11.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           On   | 000047DD:00:00.0 Off |                    0 |
| N/A   44C    P0    56W / 149W |    338MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

In [15]:
# Ran without profiler
print(torch.cuda.memory_summary(cuda0))

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |       0 B  |  496640 B  |  141505 KB |  141505 KB |
|       from large pool |       0 B  |       0 B  |       0 KB |       0 KB |
|       from small pool |       0 B  |  496640 B  |  141505 KB |  141505 KB |
|---------------------------------------------------------------------------|
| Active memory         |       0 B  |  496640 B  |  141505 KB |  141505 KB |
|       from large pool |       0 B  |       0 B  |       0 KB |       0 KB |
|       from small pool |       0 B  |  496640 B  |  141505 KB |  141505 KB |
|---------------------------------------------------------------