In [93]:
import time
import gc 
from pathlib import Path
import torch 
from torch import nn, optim
from torch.utils.data import DataLoader
import torch.utils.benchmark as benchmark
from torchvision import datasets, transforms

import contextlib
from torch.cuda import Stream

from mnist_model import NeuralNetwork
from mnist_train import train_epoch, evaluate

import matplotlib.pyplot as plt
import numpy as np

In [94]:
# use available device

device = torch.device("cuda:2")

In [None]:
#perform inference on cuda 
#import trained model 

model =NeuralNetwork().to(device)

print(model)


NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (net): Sequential(
    (0): Linear(in_features=784, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=10, bias=True)
  )
)


In [None]:
model.load_state_dict(torch.load('mnist_model.pth')) # Load the saved model weights


<All keys matched successfully>

In [None]:
model.eval() # Set the model to evaluation mode


NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (net): Sequential(
    (0): Linear(in_features=784, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=10, bias=True)
  )
)

In [None]:
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)


In [None]:
torch.manual_seed(42)

<torch._C.Generator at 0x7f75983bc3f0>

In [None]:
s=Stream()


In [None]:
def inference_profiling_async():
    tensors= [torch.randn(1,28,28,pin_memory=True) for _ in range(10000)]

    # perform inference on data in gpu
    start_async = time.time()


    s=Stream()

    with torch.cuda.stream(s):
        # perform async copy 
        
        result= []
        for tensor in tensors:
            tensor=tensor.to(device,non_blocking=True)
            result.append(tensor)
            

        tensors_h2d_event=s.record_event() # record event when copy is done

    #meaning we have no gradients to compute. ie only doing forward pass
    with torch.no_grad():
        for tensor in result:
            output_sync=model(tensor)
        
        model_infer_event=torch.cuda.current_stream().record_event() # record event when inference is done

    tensors_h2d_event.synchronize() # wait for all kernels in all streams to complete
    model_infer_event.synchronize() # wait for all kernels in all streams to complete

    end_async = time.time()
    async_time = end_async - start_async
    print(f"Asynchronous copy and inference time: {async_time:.6f} seconds")


In [None]:
def inference_profiling_sync():
    #perform sync copy (called explicitly)
    tensors= [torch.randn(1,28,28,pin_memory=False) for _ in range(10000)]

    start_sync = time.time()
    #meaning we have no gradients to compute. ie only doing forward pass
    with torch.no_grad():
        
        result= []
        # send tensors to GPU 
        for tensor in tensors:
            tensor=tensor.to(device,non_blocking=False)
            result.append(tensor)
       
        for tensor in result:
            output_sync=model(tensor)
    end_sync = time.time()
    sync_time = end_sync - start_sync
    print(f"Synchronous copy and inference time: {sync_time:.6f} seconds")


In [None]:
t2=benchmark.Timer(
    stmt="inference_profiling_async()",
    setup="from __main__ import inference_profiling_async",
    num_threads=torch.get_num_threads(),
    label="ASYNC copy + inference",
    sub_label="transfer to GPU + inference",
)

t3=benchmark.Timer(
    stmt="inference_profiling_sync()",
    setup="from __main__ import inference_profiling_sync",
    num_threads=torch.get_num_threads(),
    label="SYNC copy + inference",
    sub_label="transfer to GPU + inference",
)

In [None]:
print(t2.timeit(10))
print(t3.timeit(10))

Asynchronous copy and inference time: 1.575929 seconds
Asynchronous copy and inference time: 1.468861 seconds
Asynchronous copy and inference time: 1.455780 seconds
Asynchronous copy and inference time: 1.457217 seconds
Asynchronous copy and inference time: 1.455045 seconds
Asynchronous copy and inference time: 1.455118 seconds
Asynchronous copy and inference time: 1.454524 seconds
Asynchronous copy and inference time: 1.456453 seconds
Asynchronous copy and inference time: 1.457391 seconds
Asynchronous copy and inference time: 1.457894 seconds
Asynchronous copy and inference time: 1.453143 seconds
Asynchronous copy and inference time: 1.452112 seconds
<torch.utils.benchmark.utils.common.Measurement object at 0x7f76b9a058b0>
ASYNC copy + inference: transfer to GPU + inference
setup: from __main__ import inference_profiling_async
  1.60 s
  1 measurement, 10 runs , 24 threads
Synchronous copy and inference time: 1.483008 seconds
Synchronous copy and inference time: 1.477060 seconds
Synch

In [None]:
#implicit synchronization
start_implicit = time.time()

tensors= [torch.randn(1,1,28,28) for _ in range(1000)]
with torch.no_grad():
    for tensor in tensors:
        tensor=tensor.to("cuda") #implicit synchronization here 
        output_sync=model(tensor)
end_implicit = time.time()
implicit_time = end_implicit - start_implicit
print(f"Implicit synchronization copy and inference time: {implicit_time:.6f} seconds")

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:2 and cuda:0! (when checking argument for argument mat1 in method wrapper_CUDA_addmm)