# Latency and memory

## How to run this file on the lisa cluster

Start by pulling the lastest version of the repository, if necessary re-add the ssh-key to the agent.  ([See here](https://docs.github.com/en/authentication/connecting-to-github-with-ssh/generating-a-new-ssh-key-and-adding-it-to-the-ssh-agent?platform=linux)), i.e.

```sh
eval "$(ssh-agent -s)"
ssh-add ~/.ssh/id_ed25519
```

Then activate the correct modules (`2022` and `Anaconda3/2022.05`) and activate the `gvp` source.  Finally, actually run the notebook with 

```sh
jupyter nbconvert "latency and memory.ipynb" --to notebook --execute --inplace --allow-errors
```

Make sure the environment contains the following packages: `ipywidgets`, `jupyter`, `notebook`, `torch`, `gvp`

## Setup

* Import things necessary for running GVP and SMLP, 
* Setup which task we analyse, 
* ...

### Imports

In [1]:
# Supresses the following warning:
#   UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class.
#   This should only matter to you if you are using storages directly.
#   To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
import time
import numpy as np
import torch
# import torch.nn as nn

from e3nn.o3 import Irreps
from torch.profiler import profile, ProfilerActivity

from run_atom3d import get_datasets, get_model
from steerable_mlp import ConvModel

### Task setup

In [3]:
DATA_DIR = "../data/"

device = 'cuda' if torch.cuda.is_available() else 'cpu'
if device != "cuda":
    exit()

TASK = "LBA"
"""Task to test for model latency."""
LBA_SPLIT = 30
# SPLIT = 60

DATASET_SPLIT = "train"
"""Dataset partition from which the sample is chosen."""
# Select from these values
DATASET_SPLIT = ["train", "val", "test"].index(DATASET_SPLIT)
SAMPLE_INDEX = 10
"""Dataset index to use for testing the latency and memory."""

BEST_GVP_MODEL = "./best_models/LBA_lba-split=30_47.pt"
"""Path to the best trained relevant model."""
BEST_SMLP_MODEL = "./sMLPmodels/LBA-lba_split=30-epoch=10.ckpt"
"""Path to the best trained relevant model."""
USE_DENSE = False

In [4]:
datasets = get_datasets(TASK, DATA_DIR, LBA_SPLIT)

gvp_model = get_model(TASK).to(device)
# gvp_model = nn.DataParallel(gvp_model)  # Add parallel to fix OOM issues?
gvp_model.load_state_dict(torch.load(BEST_GVP_MODEL), strict=False)

dataset_sample = datasets[DATASET_SPLIT][SAMPLE_INDEX].to(device)
# This is required for taking the scattermean of the graph
dataset_sample.batch = torch.zeros_like(dataset_sample.atoms)
# Fix for our steerable.
dataset_sample.z = dataset_sample.atoms
dataset_sample.pos = dataset_sample.x

print(dataset_sample)

Data(x=[551, 3], edge_index=[2, 13194], atoms=[551], edge_s=[13194, 16], edge_v=[13194, 1, 3], label=6.85, lig_flag=[551], batch=[551], z=[551], pos=[551, 3])


In [5]:
def balanced_irreps(hidden_features: int, lmax: int) -> Irreps:
    """Divide subspaces equally over the feature budget"""
    N = int(hidden_features / (lmax + 1))
    irreps = []
    for l, irrep in enumerate(Irreps.spherical_harmonics(lmax)):
        n = int(N / (2 * l + 1))
        irreps.append(str(n) + "x" + str(irrep[1]))
    irreps = "+".join(irreps)
    irreps = Irreps(irreps)
    gap = hidden_features - irreps.dim
    if gap > 0:
        irreps = Irreps("{}x0e".format(gap)) + irreps
        irreps = irreps.simplify()
    return irreps

irreps_in = Irreps("32x0e")
irreps_hidden = balanced_irreps(128, 1)
irreps_edge = Irreps.spherical_harmonics(1)
irreps_out = Irreps("16x0e") if USE_DENSE else Irreps("1x0e")
smlp_model = ConvModel(
    irreps_in=irreps_in,
    irreps_hidden=irreps_hidden,
    irreps_edge=irreps_edge,
    irreps_out=irreps_out,
    depth=3,
    dense=USE_DENSE,
).to(device)
smlp_model.load_state_dict(torch.load(BEST_SMLP_MODEL), strict=False)

_IncompatibleKeys(missing_keys=['embedder.weight', 'layers.0.conv.tp.weight', 'layers.0.conv.tp.output_mask', 'layers.0.conv.radial_net.net.0.freq', 'layers.0.conv.radial_net.net.1.weight', 'layers.0.conv.radial_net.net.1.bias', 'layers.0.conv.radial_net.net.3.weight', 'layers.0.conv.radial_net.net.3.bias', 'layers.0.gate.mul.weight', 'layers.0.gate.mul.output_mask', 'layers.1.conv.tp.weight', 'layers.1.conv.tp.output_mask', 'layers.1.conv.radial_net.net.0.freq', 'layers.1.conv.radial_net.net.1.weight', 'layers.1.conv.radial_net.net.1.bias', 'layers.1.conv.radial_net.net.3.weight', 'layers.1.conv.radial_net.net.3.bias', 'layers.1.gate.mul.weight', 'layers.1.gate.mul.output_mask', 'layers.2.conv.tp.weight', 'layers.2.conv.tp.output_mask', 'layers.2.conv.radial_net.net.0.freq', 'layers.2.conv.radial_net.net.1.weight', 'layers.2.conv.radial_net.net.1.bias', 'layers.2.conv.radial_net.net.3.weight', 'layers.2.conv.radial_net.net.3.bias'], unexpected_keys=['epoch', 'global_step', 'pytorch-li

In [6]:
# Burn a few times to prevent extra overhead for first CUDA profiling
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
             profile_memory=True) as prof:
    for _ in range(30):
        gvp_model(dataset_sample)
        smlp_model(dataset_sample)

STAGE:2023-05-26 13:58:53 16604:16604 ActivityProfilerController.cpp:311] Completed Stage: Warm Up
STAGE:2023-05-26 13:58:56 16604:16604 ActivityProfilerController.cpp:317] Completed Stage: Collection
STAGE:2023-05-26 13:58:56 16604:16604 ActivityProfilerController.cpp:321] Completed Stage: Post Processing


In [7]:
def test_latency_memory(model, dataset_sample):
    memory_start = torch.cuda.memory_allocated()
    latencies = []
    for _ in range(100):
        latency_start = time.perf_counter()
        out = model(dataset_sample)
        latency_end = time.perf_counter()
        latencies.append(latency_end - latency_start)
    memory_end = torch.cuda.memory_allocated()
    return latencies, memory_end - memory_start

## GVP

In [8]:
gvp_model.train()
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
             profile_memory=True) as prof_train:
    withgrad_latency, withgrad_memory = test_latency_memory(gvp_model, dataset_sample)
    # out = gvp_model(dataset_sample)


gvp_model.eval()
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
            profile_memory=True) as prof_test:
    with torch.no_grad():
        gradless_latency, gradless_memory = test_latency_memory(gvp_model, dataset_sample)
        # out = gvp_model(dataset_sample)

print(f"{np.mean(withgrad_latency)=}")
print(f"{np.std(withgrad_latency)=}")
print(f"{np.mean(gradless_latency)=}")
print(f"{np.std(gradless_latency)=}")
print()
print(f"{withgrad_memory=}")
print(f"{gradless_memory=}")

STAGE:2023-05-26 13:59:04 16604:16604 ActivityProfilerController.cpp:311] Completed Stage: Warm Up
STAGE:2023-05-26 13:59:07 16604:16604 ActivityProfilerController.cpp:317] Completed Stage: Collection
STAGE:2023-05-26 13:59:07 16604:16604 ActivityProfilerController.cpp:321] Completed Stage: Post Processing
STAGE:2023-05-26 13:59:26 16604:16604 ActivityProfilerController.cpp:311] Completed Stage: Warm Up
STAGE:2023-05-26 13:59:28 16604:16604 ActivityProfilerController.cpp:317] Completed Stage: Collection
STAGE:2023-05-26 13:59:29 16604:16604 ActivityProfilerController.cpp:321] Completed Stage: Post Processing


np.mean(withgrad_latency)=0.024698157098609955
np.std(withgrad_latency)=0.0011156942033995645
np.mean(gradless_latency)=0.01930151361506432
np.std(gradless_latency)=0.002001638903043534

withgrad_memory=486084096
gradless_memory=512


In [9]:
print(f"{withgrad_latency=}")
print(f"{gradless_latency=}")

withgrad_latency=[0.026040896074846387, 0.030528782168403268, 0.02490377682261169, 0.024250203976407647, 0.024611416971310973, 0.023828176083043218, 0.024133834056556225, 0.025092604802921414, 0.025831161998212337, 0.02391382586210966, 0.024131011916324496, 0.024148999946191907, 0.0241327250842005, 0.02377885510213673, 0.024043763056397438, 0.024459959007799625, 0.027284455951303244, 0.02391892788000405, 0.024027460953220725, 0.02448016102425754, 0.02417381713166833, 0.023691244889050722, 0.024085141951218247, 0.02426473516970873, 0.02725474303588271, 0.02418029890395701, 0.025153805036097765, 0.02556224400177598, 0.025014425860717893, 0.02392437681555748, 0.024515022058039904, 0.024441973073408008, 0.027083539171144366, 0.02395139099098742, 0.026500088861212134, 0.02435100614093244, 0.024559972807765007, 0.023913527838885784, 0.024194865953177214, 0.02421195711940527, 0.027140071149915457, 0.024123532930389047, 0.024464973947033286, 0.024243619991466403, 0.024427304044365883, 0.024026

In [10]:
print(prof_train.key_averages().table(sort_by="cuda_time_total", row_limit=10, top_level_events_only=True))
print(prof_train.key_averages().table(sort_by="cuda_memory_usage", row_limit=10, top_level_events_only=True))

This report only display top-level ops statistics
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                           aten::linear         2.78%      49.864ms        34.58%     619.394ms      56.825us       0.000us         0.00%     568.331ms      52.140us     

In [11]:
print(prof_test.key_averages().table(sort_by="cuda_time_total", row_limit=10, top_level_events_only=True))
print(prof_test.key_averages().table(sort_by="cuda_memory_usage", row_limit=10, top_level_events_only=True))

This report only display top-level ops statistics
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                           aten::linear         3.04%      42.559ms        37.00%     517.518ms      47.479us       0.000us         0.00%     471.874ms      43.291us     

## Steerable

In [12]:
gvp_model.train()
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
             profile_memory=True) as prof_train:
    withgrad_latency, withgrad_memory = test_latency_memory(smlp_model, dataset_sample)
    # out = gvp_model(dataset_sample)


gvp_model.eval()
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
            profile_memory=True) as prof_test:
    with torch.no_grad():
        gradless_latency, gradless_memory = test_latency_memory(smlp_model, dataset_sample)
        # out = gvp_model(dataset_sample)

print(f"{np.mean(withgrad_latency)=}")
print(f"{np.std(withgrad_latency)=}")
print(f"{np.mean(gradless_latency)=}")
print(f"{np.std(gradless_latency)=}")
print()
print(f"{withgrad_memory=}")
print(f"{gradless_memory=}")

STAGE:2023-05-26 14:00:14 16604:16604 ActivityProfilerController.cpp:311] Completed Stage: Warm Up
STAGE:2023-05-26 14:00:15 16604:16604 ActivityProfilerController.cpp:317] Completed Stage: Collection
STAGE:2023-05-26 14:00:15 16604:16604 ActivityProfilerController.cpp:321] Completed Stage: Post Processing
STAGE:2023-05-26 14:00:24 16604:16604 ActivityProfilerController.cpp:311] Completed Stage: Warm Up
STAGE:2023-05-26 14:00:25 16604:16604 ActivityProfilerController.cpp:317] Completed Stage: Collection
STAGE:2023-05-26 14:00:25 16604:16604 ActivityProfilerController.cpp:321] Completed Stage: Post Processing


np.mean(withgrad_latency)=0.006889338770415634
np.std(withgrad_latency)=0.0018428532743030978
np.mean(gradless_latency)=0.005986865872982889
np.std(gradless_latency)=0.00036821399083455513

withgrad_memory=706749952
gradless_memory=512


In [13]:
print(f"{withgrad_latency=}")
print(f"{gradless_latency=}")

withgrad_latency=[0.024855504045262933, 0.008508198894560337, 0.006764301098883152, 0.006735835922881961, 0.006660043029114604, 0.0066592448856681585, 0.006657347083091736, 0.006586449919268489, 0.006643848028033972, 0.006682778941467404, 0.006601940840482712, 0.006634454010054469, 0.006640654988586903, 0.006620875094085932, 0.0066335059236735106, 0.006654773838818073, 0.006619638996198773, 0.006591200130060315, 0.00656069815158844, 0.006580753950402141, 0.006620119092985988, 0.006598950829356909, 0.006587903015315533, 0.006698202108964324, 0.00660217204131186, 0.006719080964103341, 0.0066456361673772335, 0.007667589001357555, 0.006654075114056468, 0.006620369851589203, 0.006602704990655184, 0.006595744052901864, 0.006620327010750771, 0.006526427110657096, 0.00806512008421123, 0.00676415185444057, 0.006743261124938726, 0.006637979997321963, 0.006595023209229112, 0.006599198095500469, 0.006670881062746048, 0.006608510855585337, 0.006728142965584993, 0.006622391054406762, 0.0067000319249

In [14]:
print(prof_train.key_averages().table(sort_by="cuda_time_total", row_limit=10, top_level_events_only=True))
print(prof_train.key_averages().table(sort_by="cuda_memory_usage", row_limit=10, top_level_events_only=True))

This report only display top-level ops statistics
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                forward         6.03%      31.595ms        52.15%     273.027ms     390.039us       0.000us         0.00%     202.191ms     288.844us     

In [15]:
print(prof_test.key_averages().table(sort_by="cuda_time_total", row_limit=10, top_level_events_only=True))
print(prof_test.key_averages().table(sort_by="cuda_memory_usage", row_limit=10, top_level_events_only=True))

This report only display top-level ops statistics
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                forward         6.60%      30.224ms        50.75%     232.283ms     331.833us       0.000us         0.00%     201.313ms     287.590us     

## Conclusion

It seems that the profiling does not take into account the fact that gradients have to be stored.  We think this, because the additional memory allocated to CUDA is more than 450 MB and more than 700 MB respectively when storing the gradients, but the profiler only gives 250 MB and 630 MB (per iteration) at most.