In [1]:
from matdeeplearn.common import data
import os
import torch
import matplotlib.pyplot as plt
from pprint import pprint

from matdeeplearn.preprocessor.helpers import *
from matdeeplearn.preprocessor.transforms import *
from matdeeplearn.common.graph_data import CustomData

from torch_geometric.loader import DataLoader
from ase import neighborlist

%pprint
%load_ext autoreload
%autoreload 2

torch.set_printoptions(profile="full")

  from .autonotebook import tqdm as notebook_tqdm


Pretty printing has been turned OFF


In [2]:
from torch.profiler import profile, record_function, ProfilerActivity

In [3]:
# Dataset location. Defaults to the path this notebook was originally run
# with; set the MDL_DATASET_PATH environment variable to point at a copy of
# the data on another machine instead of editing this cell.
dataset = data.get_dataset(
    os.environ.get(
        "MDL_DATASET_PATH",
        "/nethome/sbaskaran31/projects/Sidharth/hMOF/raw_5k/mdl",
    )
)

In [72]:
# First entry of the dataset; used as the single benchmark input for the
# profiling and timing cells below.
sample: CustomData = dataset[0]

In [5]:
def calculate_all_neighbor_edges(data: CustomData, device: torch.device):
    """Run ``calculate_edges_master`` on one structure with fixed benchmark settings.

    Args:
        data: structure whose ``structure_id``, ``cell``, ``pos`` and ``z``
            attributes are forwarded to the edge calculation.
        device: torch device passed through as the ``device`` keyword.

    Returns:
        Whatever ``calculate_edges_master`` returns, unchanged.
    """
    # NOTE(review): the meaning of these positional values is inferred from
    # the VirtualNodes keyword names used in this notebook (edge_calc_method,
    # all_neighbors, cutoff_radius, n_neighbors, num_offsets) -- confirm
    # against the helper's actual signature.
    fixed_settings = ("ocp", True, 5.0, 250, 1)
    structure_fields = (data.structure_id, data.cell, data.pos, data.z)
    # The helper's two trailing positional flags are both switched off.
    trailing_flags = (False, False)
    return calculate_edges_master(
        *fixed_settings,
        *structure_fields,
        *trailing_flags,
        device=device,
    )


In [78]:
# VirtualNodes transform configured to run on the CPU; used by the CPU
# profiling cells. Keyword values are passed directly rather than through
# an unpacked dict literal.
ts1 = VirtualNodes(
    device="cpu",
    virtual_box_increment=3,
    attrs=["rr", "rv"],
    rr_cutoff=5.0,
    rv_cutoff=5.0,
    cutoff_radius=5.0,
    n_neighbors=50,
    edge_calc_method="ocp",
    num_offsets=1,
    edge_steps=25,
    all_neighbors=True,
    use_degree=False,
)

CPU calculation benchmarks


In [79]:
# Profile one application of the CPU transform `ts1` to `sample`, recording
# operator input shapes and memory usage; results are kept in `prof_cpu`.
with profile(
    activities=[ProfilerActivity.CPU], record_shapes=True, profile_memory=True
) as prof_cpu:
    ts1(sample)

STAGE:2023-04-02 17:10:00 1729111:1729111 ActivityProfilerController.cpp:294] Completed Stage: Warm Up
[W CPUAllocator.cpp:231] Memory block of unknown size was allocated before the profiling started, profiler results will not include the deallocation event
STAGE:2023-04-02 17:10:00 1729111:1729111 ActivityProfilerController.cpp:300] Completed Stage: Collection


In [81]:
import cProfile

In [82]:
# Python-level profile of a single `ts1(sample)` call, sorted by cumulative
# time so the most expensive call chains appear first.
cProfile.run("ts1(sample)", sort="cumtime")

         12906 function calls (12878 primitive calls) in 1.963 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    1.965    1.965 {built-in method builtins.exec}
        1    0.000    0.000    1.965    1.965 <string>:1(<module>)
        1    0.000    0.000    1.965    1.965 transforms.py:76(__call__)
        2    0.000    0.000    1.604    0.802 helpers.py:54(calculate_edges_master)
        2    0.196    0.098    1.327    0.663 helpers.py:1029(radius_graph_pbc)
       16    0.618    0.039    0.618    0.039 {built-in method torch.masked_select}
        2    0.206    0.103    0.277    0.139 helpers.py:1292(get_pbc_distances)
        2    0.221    0.110    0.221    0.111 helpers.py:188(get_mask)
        6    0.129    0.022    0.129    0.022 {method 'repeat' of 'torch._C._TensorBase' objects}
        2    0.000    0.000    0.101    0.050 helpers.py:727(custom_edge_feats)
        2    0.000    0.000 

In [80]:
# Summarise `prof_cpu`: top 10 operators by total CPU time, then top 10 by
# CPU memory usage.
print(prof_cpu.key_averages().table(sort_by="cpu_time_total", row_limit=10))
print(prof_cpu.key_averages().table(sort_by="cpu_memory_usage", row_limit=10))

----------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                              Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls  
----------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
               aten::masked_select        46.33%      54.986ms        55.11%      65.417ms       4.089ms       1.02 Mb    -298.85 Mb            16  
                       aten::index        13.84%      16.430ms        13.98%      16.590ms     535.161us       2.29 Mb       2.17 Mb            31  
                       aten::copy_        12.16%      14.434ms        12.16%      14.434ms     148.804us           0 b           0 b            97  
           aten::repeat_interleave         2.56%       3.044ms         8.29%       9.834ms     378.231us  

In [73]:
# Autograd profiler run (CUDA disabled) of the CPU transform, wrapped in a
# "VN_Transform_CPU" range so the whole call shows up as one labelled row.
with torch.autograd.profiler.profile(use_cuda=False) as prof:
    with record_function("VN_Transform_CPU"):
        ts1(sample)
# Prints the profiler's full (non-aggregated) event listing.
print(prof)


STAGE:2023-04-02 17:08:21 1729111:1729111 ActivityProfilerController.cpp:294] Completed Stage: Warm Up


----------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                              Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
----------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                       aten::zeros         0.00%      11.000us         0.01%      28.000us      28.000us             1  
                       aten::empty         0.00%      16.000us         0.00%      16.000us      16.000us             1  
                       aten::zero_         0.00%       1.000us         0.00%       1.000us       1.000us             1  
                  VN_Transform_CPU         2.03%      10.169ms        99.99%     500.809ms     500.809ms             1  
                       aten::empty         0.00%      22.000us         0.00%      22.000us      22.000us             1  
                      aten::aran

STAGE:2023-04-02 17:08:22 1729111:1729111 ActivityProfilerController.cpp:300] Completed Stage: Collection


In [None]:
# Profile the standalone all-neighbor edge calculation on the CPU.
# Note: this rebinds `prof_cpu`, replacing the profile of `ts1(sample)`
# captured earlier in the notebook.
with profile(
    activities=[ProfilerActivity.CPU], record_shapes=True, profile_memory=True
) as prof_cpu:
    # The range is labelled after what is actually being measured (the edge
    # calculation, not the VirtualNodes transform), mirroring the
    # "OCP_All_Neighbor_GPU" label used for the GPU run.
    with record_function("OCP_All_Neighbor_CPU"):
        calculate_all_neighbor_edges(sample, torch.device("cpu"))

GPU calculation benchmarks


In [38]:
# Profile the all-neighbor edge calculation on the GPU, collecting both CPU
# and CUDA activity, for the first 10 dataset entries under one labelled range.
with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    record_shapes=True,
    profile_memory=True,
) as prof_gpu:
    with record_function("OCP_All_Neighbor_GPU"):
        # The device index is hard-coded to GPU 7 of the original machine.
        for i in range(10):
            calculate_all_neighbor_edges(dataset[i], torch.device("cuda:7"))

STAGE:2023-04-02 13:58:59 1729111:1729111 ActivityProfilerController.cpp:294] Completed Stage: Warm Up
STAGE:2023-04-02 13:58:59 1729111:1729111 ActivityProfilerController.cpp:300] Completed Stage: Collection


In [None]:
# Summarise `prof_gpu`: top 10 operators by total CUDA time.
print(prof_gpu.key_averages().table(sort_by="cuda_time_total", row_limit=10))


In [75]:
# VirtualNodes transform with the same settings as `ts1`, except that it
# targets the GPU ("cuda:7"); used by the GPU profiling cell.
ts2 = VirtualNodes(
    device="cuda:7",
    virtual_box_increment=3,
    attrs=["rr", "rv"],
    rr_cutoff=5.0,
    rv_cutoff=5.0,
    cutoff_radius=5.0,
    n_neighbors=50,
    edge_calc_method="ocp",
    num_offsets=1,
    edge_steps=25,
    all_neighbors=True,
    use_degree=False,
)

In [76]:
# Autograd profiler run (CUDA enabled) of the GPU transform `ts2` on `sample`.
# NOTE(review): the saved output of this cell is a RuntimeError from a bmm
# call reporting tensors on both cpu and cuda:7. Presumably `sample` (or a
# tensor built inside the transform) is still on the CPU -- confirm which
# operand is misplaced before trusting any GPU numbers from this cell.
with torch.autograd.profiler.profile(use_cuda=True) as prof:
    with record_function("OCP_All_Neighbor_GPU"):
        ts2(sample)
print(prof)

STAGE:2023-04-02 17:09:09 1729111:1729111 ActivityProfilerController.cpp:294] Completed Stage: Warm Up
STAGE:2023-04-02 17:09:10 1729111:1729111 ActivityProfilerController.cpp:300] Completed Stage: Collection


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:7! (when checking argument for argument mat2 in method wrapper_bmm)

In [49]:
# Time one GPU edge calculation with CUDA events.
#
# Events are recorded on the *current* device's current stream, and
# `torch.cuda.current_stream()` without an argument also refers to the
# current device. The work below runs on cuda:7, so the timing must be
# scoped to that device; otherwise the events and the synchronisation act on
# a different GPU's stream than the one doing the work.
gpu_device = torch.device("cuda:7")

with torch.cuda.device(gpu_device):
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)

    # Drain any work already queued on this device so it is not billed to
    # the measured region.
    torch.cuda.synchronize()

    start.record()
    calculate_all_neighbor_edges(sample, gpu_device)
    end.record()

    # Wait until the end event has actually been reached on the GPU;
    # elapsed_time() is only valid once both events have completed.
    end.synchronize()

# Elapsed time between the two events, in milliseconds.
print(start.elapsed_time(end))


790.1875


In [45]:
from time import time


In [50]:
# Wall-clock timing of one CPU edge calculation.
# perf_counter() is monotonic and uses the highest-resolution clock
# available, so it is the appropriate timer for short benchmarks; time() can
# jump if the system clock is adjusted. Imported here so this cell does not
# depend on any other cell being changed.
from time import perf_counter

# Note: `start` / `end` are rebound to floats (seconds) here.
start = perf_counter()
calculate_all_neighbor_edges(sample, torch.device("cpu"))
end = perf_counter()


In [51]:
# Elapsed time of the CPU run, computed from the `start` / `end`
# timestamps taken in the previous cell.
print(end - start)


0.7506468296051025
