In [None]:
# https://pytorch.org/docs/stable/notes/cuda.html
# Async Execution Section

In [1]:
import os
import torch

In [2]:
cuda0 = torch.device('cuda:0') # same as cuda0 = torch.device('cuda:0') or cuda = torch.device('cuda')
cuda1 = torch.device('cuda:1') # same as cuda1 = torch.device('cuda:1')

In [3]:
# try to do something on cuda0, and cuda1
# see if there's speed up compared to calling torch.cuda.synchronize() in the middle

In [4]:
# https://discuss.pytorch.org/t/gpu-operations-seem-not-asynchronous/63330/4
os.environ["CUDA_LAUNCH_BLOCKING"] = '0'

torch.cuda.synchronize() 
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
start_event.record()

torch.manual_seed(123)
for i in range(10):
    wtx = torch.randn(30000, dtype=torch.float).cuda(cuda0).unsqueeze(-1)
    x = torch.randn(30000, dtype=torch.float).cuda(cuda0).unsqueeze(0)
    dotx = torch.mm(wtx, x)
    wty = torch.randn(30000, dtype=torch.float).cuda(cuda1).unsqueeze(-1)
    y = torch.randn(30000, dtype=torch.float).cuda(cuda1).unsqueeze(0)
    doty = torch.mm(wty, y)
    
    
end_event.record()
torch.cuda.synchronize()  # Wait for the events to be recorded!
elapsed_time_ms = start_event.elapsed_time(end_event)
print(elapsed_time_ms)

1702.708740234375


In [3]:
# https://discuss.pytorch.org/t/gpu-operations-seem-not-asynchronous/63330/4
os.environ["CUDA_LAUNCH_BLOCKING"] = '0'

torch.cuda.synchronize() 
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
start_event.record()

torch.manual_seed(123)
for i in range(10):
    wtx = torch.randn(30000, dtype=torch.float).cuda(cuda0).unsqueeze(-1)
    torch.cuda.synchronize()
    x = torch.randn(30000, dtype=torch.float).cuda(cuda0).unsqueeze(0)
    torch.cuda.synchronize()
    dotx = torch.mm(wtx, x)
    torch.cuda.synchronize()
    wty = torch.randn(30000, dtype=torch.float).cuda(cuda1).unsqueeze(-1)
    torch.cuda.synchronize()
    y = torch.randn(30000, dtype=torch.float).cuda(cuda1).unsqueeze(0)
    torch.cuda.synchronize()
    doty = torch.mm(wty, y)
    torch.cuda.synchronize()
    
    
end_event.record()
torch.cuda.synchronize()  # Wait for the events to be recorded!
elapsed_time_ms = start_event.elapsed_time(end_event)
print(elapsed_time_ms)

1793.8739013671875


In [3]:
# https://discuss.pytorch.org/t/gpu-operations-seem-not-asynchronous/63330/4
os.environ["CUDA_LAUNCH_BLOCKING"] = '0'

torch.cuda.synchronize() 
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
start_event.record()

torch.manual_seed(123)
for i in range(10):
    wtx = torch.randn(30000, dtype=torch.float).cuda(cuda0).unsqueeze(-1)
    torch.cuda.synchronize()
    x = torch.randn(30000, dtype=torch.float).cuda(cuda0).unsqueeze(0)
    torch.cuda.synchronize()
    dotx = torch.mm(wtx, x)
    torch.cuda.synchronize()
    wty = torch.randn(30000, dtype=torch.float).cuda(cuda0).unsqueeze(-1)
    torch.cuda.synchronize()
    y = torch.randn(30000, dtype=torch.float).cuda(cuda0).unsqueeze(0)
    torch.cuda.synchronize()
    doty = torch.mm(wty, y)
    torch.cuda.synchronize()
    
    
end_event.record()
torch.cuda.synchronize()  # Wait for the events to be recorded!
elapsed_time_ms = start_event.elapsed_time(end_event)
print(elapsed_time_ms)

769.6156616210938


In [3]:
# https://discuss.pytorch.org/t/gpu-operations-seem-not-asynchronous/63330/4
os.environ["CUDA_LAUNCH_BLOCKING"] = '1'

torch.cuda.synchronize() 
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
start_event.record()

torch.manual_seed(123)
for i in range(10):
    wtx = torch.randn(30000, dtype=torch.float).cuda(cuda0).unsqueeze(-1)
    torch.cuda.synchronize()
    x = torch.randn(30000, dtype=torch.float).cuda(cuda0).unsqueeze(0)
    torch.cuda.synchronize()
    dotx = torch.mm(wtx, x)
    torch.cuda.synchronize()
    wty = torch.randn(30000, dtype=torch.float).cuda(cuda0).unsqueeze(-1)
    torch.cuda.synchronize()
    y = torch.randn(30000, dtype=torch.float).cuda(cuda0).unsqueeze(0)
    torch.cuda.synchronize()
    doty = torch.mm(wty, y)
    torch.cuda.synchronize()
    
    
end_event.record()
torch.cuda.synchronize()  # Wait for the events to be recorded!
elapsed_time_ms = start_event.elapsed_time(end_event)
print(elapsed_time_ms)

759.4615478515625


In [3]:
# https://discuss.pytorch.org/t/gpu-operations-seem-not-asynchronous/63330/4
os.environ["CUDA_LAUNCH_BLOCKING"] = '1'

torch.cuda.synchronize() 
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
start_event.record()

torch.manual_seed(123)
for i in range(10):
    wtx = torch.randn(30000, dtype=torch.float).cuda(cuda0).unsqueeze(-1)
    x = torch.randn(30000, dtype=torch.float).cuda(cuda0).unsqueeze(0)
    dotx = torch.mm(wtx, x)
    wty = torch.randn(30000, dtype=torch.float).cuda(cuda1).unsqueeze(-1)
    y = torch.randn(30000, dtype=torch.float).cuda(cuda1).unsqueeze(0)
    doty = torch.mm(wty, y)
    
    
end_event.record()
torch.cuda.synchronize()  # Wait for the events to be recorded!
elapsed_time_ms = start_event.elapsed_time(end_event)
print(elapsed_time_ms)

2047.1427001953125


In [3]:
# https://discuss.pytorch.org/t/gpu-operations-seem-not-asynchronous/63330/4
os.environ["CUDA_LAUNCH_BLOCKING"] = '1'

torch.cuda.synchronize() 
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
start_event.record()

torch.manual_seed(123)
for i in range(10):
    wtx = torch.randn(30000, dtype=torch.float).cuda(cuda0).unsqueeze(-1)
    x = torch.randn(30000, dtype=torch.float).cuda(cuda0).unsqueeze(0)
    dotx = torch.mm(wtx, x)
    wty = torch.randn(30000, dtype=torch.float).cuda(cuda1).unsqueeze(-1)
    y = torch.randn(30000, dtype=torch.float).cuda(cuda1).unsqueeze(0)
    doty = torch.mm(wty, y)
    torch.cuda.synchronize()
    

end_event.record()
torch.cuda.synchronize()  # Wait for the events to be recorded!
elapsed_time_ms = start_event.elapsed_time(end_event)
print(elapsed_time_ms)

2133.711181640625


In [1]:
import os
import torch
cuda0 = torch.device('cuda:0') # same as cuda0 = torch.device('cuda:0') or cuda = torch.device('cuda')
cuda1 = torch.device('cuda:1') # same as cuda1 = torch.device('cuda:1')

In [2]:
cuda0 = torch.device('cuda:0') # same as cuda0 = torch.device('cuda:0') or cuda = torch.device('cuda')
cuda1 = torch.device('cuda:1') # same as cuda1 = torch.device('cuda:1')

In [9]:
# https://discuss.pytorch.org/t/gpu-operations-seem-not-asynchronous/63330/4
os.environ["CUDA_LAUNCH_BLOCKING"] = '0'

torch.cuda.synchronize() 
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
start_event.record()

J = 20
torch.manual_seed(123)
for i in range(10):
    x = torch.randn((3000, 1), dtype=torch.float).cuda(cuda0)
    for j in range(J):
        wtx = torch.randn((3000, 3000), dtype=torch.float).cuda(cuda0)
        x = torch.mm(wtx, x)

end_event.record()
torch.cuda.synchronize()  # Wait for the events to be recorded!
elapsed_time_ms = start_event.elapsed_time(end_event)
print(elapsed_time_ms)

20298.763671875


In [3]:
# https://discuss.pytorch.org/t/gpu-operations-seem-not-asynchronous/63330/4
os.environ["CUDA_LAUNCH_BLOCKING"] = '0'

torch.cuda.synchronize() 
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
start_event.record()

J = 20
torch.manual_seed(123)
for i in range(10):
    x = torch.randn((3000, 1), dtype=torch.float).cuda(cuda0)
    for j in range(J):
        wtx = torch.randn((3000, 3000), dtype=torch.float).cuda(cuda0)
        x = torch.mm(wtx, x)
        
    y = torch.randn((3000, 1), dtype=torch.float).cuda(cuda0)
    for j in range(J):
        wty = torch.randn((3000, 3000), dtype=torch.float).cuda(cuda0)
        y = torch.mm(wty, y)

end_event.record()
torch.cuda.synchronize()  # Wait for the events to be recorded!
elapsed_time_ms = start_event.elapsed_time(end_event)
print(elapsed_time_ms)

40677.71875


In [3]:
# https://discuss.pytorch.org/t/gpu-operations-seem-not-asynchronous/63330/4
os.environ["CUDA_LAUNCH_BLOCKING"] = '0'

torch.cuda.synchronize() 
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
start_event.record()

J = 20
torch.manual_seed(123)
for i in range(10):
    x = torch.randn((3000, 1), dtype=torch.float).cuda(cuda0)
    for j in range(J):
        wtx = torch.randn((3000, 3000), dtype=torch.float).cuda(cuda0)
        x = torch.mm(wtx, x)
        
    y = torch.randn((3000, 1), dtype=torch.float).cuda(cuda1)
    for j in range(J):
        wty = torch.randn((3000, 3000), dtype=torch.float).cuda(cuda1)
        y = torch.mm(wty, y)

end_event.record()
torch.cuda.synchronize()  # Wait for the events to be recorded!
elapsed_time_ms = start_event.elapsed_time(end_event)
print(elapsed_time_ms)

42232.28515625


In [3]:
# https://discuss.pytorch.org/t/gpu-operations-seem-not-asynchronous/63330/4
os.environ["CUDA_LAUNCH_BLOCKING"] = '0'

torch.cuda.synchronize() 
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
start_event.record()

J = 20
torch.manual_seed(123)
for i in range(10):
    x = torch.randn((3000, 1), dtype=torch.float, device=cuda0)
    y = torch.randn((3000, 1), dtype=torch.float, device=cuda1)
    
    for j in range(J):
        wtx = torch.randn((3000, 3000), dtype=torch.float, device=cuda0)
        x = torch.mm(wtx, x)
        
        wty = torch.randn((3000, 3000), dtype=torch.float, device=cuda1)
        y = torch.mm(wty, y)

end_event.record()
torch.cuda.synchronize()  # Wait for the events to be recorded!
elapsed_time_ms = start_event.elapsed_time(end_event)
print(elapsed_time_ms)

1599.0179443359375


In [1]:
import os
import torch
cuda0 = torch.device('cuda:0') # same as cuda0 = torch.device('cuda:0') or cuda = torch.device('cuda')
cuda1 = torch.device('cuda:1') # same as cuda1 = torch.device('cuda:1')


os.environ["CUDA_LAUNCH_BLOCKING"] = '0'

torch.cuda.synchronize() 
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
start_event.record()

J = 20
torch.manual_seed(123)
for i in range(10):
    x = torch.randn((3000, 1), dtype=torch.float, device=cuda0)
    for j in range(J):
        wtx = torch.randn((3000, 3000), dtype=torch.float, device=cuda0)
        x = torch.mm(wtx, x)
        
    y = torch.randn((3000, 1), dtype=torch.float, device=cuda1)
    for j in range(J):
        wty = torch.randn((3000, 3000), dtype=torch.float, device=cuda1)
        y = torch.mm(wty, y)

end_event.record()
torch.cuda.synchronize()  # Wait for the events to be recorded!
elapsed_time_ms = start_event.elapsed_time(end_event)
print(elapsed_time_ms)

1649.1849365234375


## For Synchronize with CUDA_LAUNCH_BLOCKING (2 GPU)

In [1]:
import os
import torch
cuda0 = torch.device('cuda:0') # same as cuda0 = torch.device('cuda:0') or cuda = torch.device('cuda')
cuda1 = torch.device('cuda:1') # same as cuda1 = torch.device('cuda:1')


os.environ["CUDA_LAUNCH_BLOCKING"] = '1'

torch.cuda.synchronize() 
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
start_event.record()

J = 20000
torch.manual_seed(123)
for i in range(10):
    x = torch.randn((3000, 1), dtype=torch.float, device=cuda0)
    for j in range(J):
        wtx = torch.randn((3000, 3000), dtype=torch.float, device=cuda0)
        x = torch.mm(wtx, x)
        
    y = torch.randn((3000, 1), dtype=torch.float, device=cuda1)
    for j in range(J):
        wty = torch.randn((3000, 3000), dtype=torch.float, device=cuda1)
        y = torch.mm(wty, y) 

end_event.record()
torch.cuda.synchronize()  # Wait for the events to be recorded!
elapsed_time_ms = start_event.elapsed_time(end_event)
print(elapsed_time_ms)

333257.34375


## Manually add Sync Points (2 GPU)

In [1]:
import os
import torch
cuda0 = torch.device('cuda:0') # same as cuda0 = torch.device('cuda:0') or cuda = torch.device('cuda')
cuda1 = torch.device('cuda:1') # same as cuda1 = torch.device('cuda:1')


os.environ["CUDA_LAUNCH_BLOCKING"] = '0'

torch.cuda.synchronize() 
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
start_event.record()

J = 20000
torch.manual_seed(123)
for i in range(10):
    x = torch.randn((3000, 1), dtype=torch.float, device=cuda0)
    for j in range(J):
        wtx = torch.randn((3000, 3000), dtype=torch.float, device=cuda0)
        x = torch.mm(wtx, x)
        torch.cuda.synchronize() 
        
    y = torch.randn((3000, 1), dtype=torch.float, device=cuda1)
    for j in range(J):
        wty = torch.randn((3000, 3000), dtype=torch.float, device=cuda1)
        y = torch.mm(wty, y)
        torch.cuda.synchronize() 

end_event.record()
torch.cuda.synchronize()  # Wait for the events to be recorded!
elapsed_time_ms = start_event.elapsed_time(end_event)
print(elapsed_time_ms)

314793.5625


## For Synchronize with CUDA_LAUNCH_BLOCKING (1 GPU)

In [1]:
import os
import torch
cuda0 = torch.device('cuda:0') # same as cuda0 = torch.device('cuda:0') or cuda = torch.device('cuda')
cuda1 = torch.device('cuda:1') # same as cuda1 = torch.device('cuda:1')


os.environ["CUDA_LAUNCH_BLOCKING"] = '1'

torch.cuda.synchronize() 
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
start_event.record()

J = 20000
torch.manual_seed(123)
for i in range(10):
    x = torch.randn((3000, 1), dtype=torch.float, device=cuda0)
    for j in range(J):
        wtx = torch.randn((3000, 3000), dtype=torch.float, device=cuda0)
        x = torch.mm(wtx, x)
        
    y = torch.randn((3000, 1), dtype=torch.float, device=cuda0)
    for j in range(J):
        wty = torch.randn((3000, 3000), dtype=torch.float, device=cuda0)
        y = torch.mm(wty, y) 

end_event.record()
torch.cuda.synchronize()  # Wait for the events to be recorded!
elapsed_time_ms = start_event.elapsed_time(end_event)
print(elapsed_time_ms)

330762.9375


## Manually add Sync Points (1 GPU)

In [1]:
import os
import torch
cuda0 = torch.device('cuda:0') # same as cuda0 = torch.device('cuda:0') or cuda = torch.device('cuda')
cuda1 = torch.device('cuda:1') # same as cuda1 = torch.device('cuda:1')


os.environ["CUDA_LAUNCH_BLOCKING"] = '0'

torch.cuda.synchronize() 
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
start_event.record()

J = 20000
torch.manual_seed(123)
for i in range(10):
    x = torch.randn((3000, 1), dtype=torch.float, device=cuda0)
    for j in range(J):
        wtx = torch.randn((3000, 3000), dtype=torch.float, device=cuda0)
        x = torch.mm(wtx, x)
        torch.cuda.synchronize() 
        
    y = torch.randn((3000, 1), dtype=torch.float, device=cuda0)
    for j in range(J):
        wty = torch.randn((3000, 3000), dtype=torch.float, device=cuda0)
        y = torch.mm(wty, y)
        torch.cuda.synchronize() 

end_event.record()
torch.cuda.synchronize()  # Wait for the events to be recorded!
elapsed_time_ms = start_event.elapsed_time(end_event)
print(elapsed_time_ms)

317634.40625


## Async (2 GPU)

In [1]:
import os
import torch
cuda0 = torch.device('cuda:0') # same as cuda0 = torch.device('cuda:0') or cuda = torch.device('cuda')
cuda1 = torch.device('cuda:1') # same as cuda1 = torch.device('cuda:1')


os.environ["CUDA_LAUNCH_BLOCKING"] = '0'

torch.cuda.synchronize() 
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
start_event.record()

J = 20000
torch.manual_seed(123)
for i in range(10):
    x = torch.randn((3000, 1), dtype=torch.float, device=cuda0)
    for j in range(J):
        wtx = torch.randn((3000, 3000), dtype=torch.float, device=cuda0)
        x = torch.mm(wtx, x)
        
    y = torch.randn((3000, 1), dtype=torch.float, device=cuda1)
    for j in range(J):
        wty = torch.randn((3000, 3000), dtype=torch.float, device=cuda1)
        y = torch.mm(wty, y)

end_event.record()
torch.cuda.synchronize()  # Wait for the events to be recorded!
elapsed_time_ms = start_event.elapsed_time(end_event)
print(elapsed_time_ms)

305783.21875


## Async (1 GPU)

In [1]:
import os
import torch
cuda0 = torch.device('cuda:0') # same as cuda0 = torch.device('cuda:0') or cuda = torch.device('cuda')
cuda1 = torch.device('cuda:1') # same as cuda1 = torch.device('cuda:1')


os.environ["CUDA_LAUNCH_BLOCKING"] = '0'

torch.cuda.synchronize() 
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
start_event.record()

J = 20000
torch.manual_seed(123)
for i in range(10):
    x = torch.randn((3000, 1), dtype=torch.float, device=cuda0)
    for j in range(J):
        wtx = torch.randn((3000, 3000), dtype=torch.float, device=cuda0)
        x = torch.mm(wtx, x) 
        
    y = torch.randn((3000, 1), dtype=torch.float, device=cuda0)
    for j in range(J):
        wty = torch.randn((3000, 3000), dtype=torch.float, device=cuda0)
        y = torch.mm(wty, y)

end_event.record()
torch.cuda.synchronize()  # Wait for the events to be recorded!
elapsed_time_ms = start_event.elapsed_time(end_event)
print(elapsed_time_ms)

309210.46875


## Async (2 GPU) Big Data Set

In [1]:
import os
import torch
cuda0 = torch.device('cuda:0') # same as cuda0 = torch.device('cuda:0') or cuda = torch.device('cuda')
cuda1 = torch.device('cuda:1') # same as cuda1 = torch.device('cuda:1')


os.environ["CUDA_LAUNCH_BLOCKING"] = '0'

torch.cuda.synchronize() 
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
start_event.record()

J = 20000
m = 30000 # 40000 will raise CUDA memory error 
m = 3000 # 40000 will raise CUDA memory error 
torch.manual_seed(123)
for i in range(1):
    x = torch.randn((m, 1), dtype=torch.float, device=cuda0)
    for j in range(J):
        wtx = torch.randn((m, m), dtype=torch.float, device=cuda0)
        x = torch.mm(wtx, x)
        
    y = torch.randn((m, 1), dtype=torch.float, device=cuda1)
    for j in range(J):
        wty = torch.randn((m, m), dtype=torch.float, device=cuda1)
        y = torch.mm(wty, y)

    
end_event.record()
torch.cuda.synchronize()  # Wait for the events to be recorded!
elapsed_time_ms = start_event.elapsed_time(end_event)
print(elapsed_time_ms)

KeyboardInterrupt: 

## Async (1 GPU) Big Data Set

In [None]:
import os
import torch
cuda0 = torch.device('cuda:0') # same as cuda0 = torch.device('cuda:0') or cuda = torch.device('cuda')
cuda1 = torch.device('cuda:1') # same as cuda1 = torch.device('cuda:1')


os.environ["CUDA_LAUNCH_BLOCKING"] = '0'

torch.cuda.synchronize() 
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
start_event.record()

J = 20000
m = 50500 # 40000 will raise CUDA memory error 
torch.manual_seed(123)
for i in range(10):
    x = torch.randn((m, 1), dtype=torch.float, device=cuda0)
    for j in range(J):
        wtx = torch.randn((m, m), dtype=torch.float, device=cuda0)
        x = torch.mm(wtx, x) 
        
    y = torch.randn((m, 1), dtype=torch.float, device=cuda0)
    for j in range(J):
        wty = torch.randn((m, m), dtype=torch.float, device=cuda0)
        y = torch.mm(wty, y)

end_event.record()
torch.cuda.synchronize()  # Wait for the events to be recorded!
elapsed_time_ms = start_event.elapsed_time(end_event)
print(elapsed_time_ms)

In [None]:
# J = 2000
# async two devices 23476.904296875

# sync two devices  29139.279296875
# async one device  30838.9453125
# sync one device   31943.115234375