In [2]:
import sys
sys.path.append("../")

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.autograd.profiler as profiler

import apex.fp16_utils as fp16

import os
import time, gc
from progressbar import progressbar
import numpy as np
from sklearn.datasets import make_classification
from collections import defaultdict
from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser
import matplotlib.pyplot as plt
%matplotlib inline

from utils.moduleCodeProfiler import rankByCriteria

In [3]:
!nvidia-smi

Wed Nov 25 22:48:38 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.32.00    Driver Version: 455.32.00    CUDA Version: 11.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-PCIE...  On   | 0000BCA9:00:00.0 Off |                  Off |
| N/A   30C    P0    26W / 250W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

In [4]:
cuda0 = torch.device('cuda:0') 

In [5]:
parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
args = parser.parse_args('')

args.data_dir = '~/datadrive'
args.dataset_dir = 'toy_mlp_1'
args.seed = 123
args.batch_size = 1000
# Large
# args.hidden_layer_dims = [5000, 5000, 5000, 5000, 5000, 5000, 5000]
# Medium
# args.hidden_layer_dims = [500, 500, 500, 500, 500, 500, 500]
# Small
args.hidden_layer_dims = [50, 50, 50, 50, 50, 50, 50]
args.lr = 0.01
args.epochs = 2000

# Model

In [8]:
class MLPLazy(nn.Module):

    def __init__(self, nx, hidden_layer_dims, ny):
        super(MLPLazy, self).__init__()
        self.hidden_layer_dims = hidden_layer_dims
        
        linear_layers = []
        last_dim = nx
        for next_dim in hidden_layer_dims:
            linear_layer = nn.Linear(last_dim, next_dim)
            linear_layers.append(linear_layer)
            last_dim = next_dim
        # should push to ModuleList so that params stay on cuda
        self.linear_layers = nn.ModuleList(linear_layers)
        self.scorer = nn.Linear(last_dim, ny)

    def forward(self, X):
        '''
        X has shape (m, nx)
        '''
        last_X = X
        for i, linear_layer in enumerate(self.linear_layers):
            # shape (m, self.hidden_layer_dims[i])
            last_X = linear_layer(last_X)
            # shape (m, self.hidden_layer_dims[i])
            last_X = torch.relu(last_X)
        # shape (m, ny)
        z = self.scorer(last_X)
        # shape (m, ny)
        a = torch.softmax(z, dim=1)
        return z, a

# Small Experiments

In [24]:
mlp = MLPLazy(100, [5000, 5000, 5000, 5000, 5000, 5000, 5000], 1)
torch.cuda.set_device('cuda:0')
mlp.to(device='cuda:0') 
mlp.half()

torch.manual_seed(42)
X = torch.randn((500, 100), dtype=torch.float16, device='cuda:0')
start_time = time.time()
for i in range(5000):
    y_hat, _ = mlp(X)
    loss = torch.sum(10 - y_hat)

torch.cuda.synchronize()
print(time.time() - start_time)

12.939019203186035


In [23]:
mlp32 = MLPLazy(100, [5000, 5000, 5000, 5000, 5000, 5000, 5000], 1)
torch.cuda.set_device('cuda:0')
mlp32.to(device='cuda:0')

torch.manual_seed(42)
X = torch.randn((500, 100), device='cuda:0')
start_time = time.time()
for i in range(5000):
    y_hat, _ = mlp32(X)
    loss = torch.sum(10 - y_hat)
torch.cuda.synchronize()
print(time.time() - start_time)

62.056530475616455


In [1]:
# with sad k80 16-bit emulation rather than native 16bit!
import torch
import time
X = torch.randn((2000,2000)).cuda()
Y = torch.randn((2000,2000)).cuda()
t0 = time.time()
for i in range(5000):
	X@Y
t1 = time.time()
torch.cuda.synchronize()
print(t1 - t0)
X = torch.randn((2000,2000)).cuda().half()
Y = torch.randn((2000,2000)).cuda().half()
t0 = time.time()
for i in range(5000):
	X@Y
t1 = time.time()
torch.cuda.synchronize()
print(t1 - t0)

31.640761137008667
52.30562925338745


In [16]:
# with V100!
import torch
import time
X = torch.randn((2000,2000)).cuda()
Y = torch.randn((2000,2000)).cuda()
t0 = time.time()
for i in range(5000):
	X@Y
t1 = time.time()
torch.cuda.synchronize()
print(t1 - t0)
X = torch.randn((2000,2000)).cuda().half()
Y = torch.randn((2000,2000)).cuda().half()
t0 = time.time()
for i in range(5000):
	X@Y
t1 = time.time()
torch.cuda.synchronize()
print(t1 - t0)

5.22260594367981
0.993229866027832
