## Imports

In [2]:
import torch
import torch_sparse
from torchsparsegradutils import sparse_mm
import math

In [3]:
!nvidia-smi

Thu Feb  1 16:07:51 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.86.01    Driver Version: 515.86.01    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100 80G...  On   | 00000000:83:00.0 Off |                    0 |
| N/A   41C    P0    73W / 300W |    489MiB / 81920MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Parameters

In [4]:
batch_size = 1
num_neurons = 100000
synapses_per_neuron = 100
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
# Create a dense vector
indices = []
for i in range(num_neurons):
    synapses = torch.randint(0, num_neurons, (synapses_per_neuron,))
    synapse_root = torch.ones_like(synapses) * i
    indices.append(torch.stack((synapses, synapse_root)))
indices = torch.cat(indices, dim=1).to(device)
values = torch.randn(num_neurons * synapses_per_neuron).to(device)

indices, values = torch_sparse.coalesce(
    indices, values, num_neurons, num_neurons
)

coo_matrix = (
    torch.sparse_coo_tensor(indices, values, (num_neurons, num_neurons))
    .coalesce()
    .to(device)
)
csr_matrix = coo_matrix.to_sparse_csr().to(device)
# dense_matrix = coo_matrix.to_dense().to(device)
dense_vector_batched = torch.randn(num_neurons, batch_size).to(device)
bias = torch.randn(num_neurons, 1).to(device)

  csr_matrix = coo_matrix.to_sparse_csr().to(device)


## On GPU

In [9]:
%timeit torch.mm(coo_matrix, dense_vector_batched) + bias

349 µs ± 129 ns per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [10]:
%timeit torch.addmm(bias, coo_matrix, dense_vector_batched)

345 µs ± 63.7 ns per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [11]:
%timeit torch.mm(csr_matrix, dense_vector_batched) + bias

163 µs ± 151 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [12]:
%timeit torch.addmm(bias, csr_matrix, dense_vector_batched)

164 µs ± 106 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [9]:
%timeit torch.sparse.mm(coo_matrix, dense_vector_batched) + bias

349 µs ± 58.3 ns per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [8]:
%timeit torch.sparse.addmm(bias, coo_matrix, dense_vector_batched)

344 µs ± 36.8 ns per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [7]:
%timeit torch.sparse.mm(csr_matrix, dense_vector_batched) + bias

162 µs ± 149 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [17]:
%timeit torch.sparse.addmm(bias, csr_matrix, dense_vector_batched)

164 µs ± 190 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [13]:
%timeit sparse_mm(coo_matrix, dense_vector_batched) + bias

349 µs ± 49.9 ns per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [14]:
%timeit sparse_mm(csr_matrix, dense_vector_batched) + bias

162 µs ± 98.1 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [15]:
%timeit torch_sparse.spmm(indices, values, num_neurons, num_neurons, dense_vector_batched) + bias

782 µs ± 154 ns per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [None]:
# %timeit torch.mv(dense_matrix, dense_vector_batched)

In [18]:
del(weight, out)
torch.cuda.empty_cache()

NameError: name 'weight' is not defined

In [30]:
%%time
weight = coo_matrix.clone().requires_grad_(True)
out = torch.sparse.mm(weight, dense_vector_batched) + bias
out.sum().backward()

CPU times: user 1.73 ms, sys: 0 ns, total: 1.73 ms
Wall time: 1.79 ms


In [27]:
%%time
weight = csr_matrix.clone().requires_grad_(True)
out = torch.sparse.mm(weight, dense_vector_batched) + bias
out.sum().backward()

CPU times: user 1.77 ms, sys: 0 ns, total: 1.77 ms
Wall time: 1.86 ms


In [23]:
weight

tensor(crow_indices=tensor([      0,     108,     214,  ..., 9994846,
                            9994958, 9995039]),
       col_indices=tensor([  917,  1160,  2078,  ..., 95933, 95997, 98054]),
       values=tensor([-0.6188, -2.0300, -0.6389,  ...,  1.2913, -0.5155,
                       0.7060]), device='cuda:0', size=(100000, 100000),
       nnz=9995039, layout=torch.sparse_csr, requires_grad=True)

## On CPU

In [16]:
coo_matrix = coo_matrix.to("cpu")
csr_matrix = csr_matrix.to("cpu")
# dense_matrix = dense_matrix.to('cpu')
dense_vector_batched = dense_vector_batched.to("cpu")
indices = indices.to("cpu")
values = values.to("cpu")

In [17]:
%timeit torch.mm(coo_matrix, dense_vector_batched)

1.46 s ± 12.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
%timeit torch.mm(csr_matrix, dense_vector_batched)

1.23 ms ± 18.6 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [None]:
%timeit torch.sparse.mm(coo_matrix, dense_vector_batched)

18.3 ms ± 133 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
%timeit torch.sparse.mm(csr_matrix, dense_vector_batched)

2.04 ms ± 110 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
%timeit torch_sparse.spmm(indices, values, num_neurons, num_neurons, dense_vector_batched)

8 ms ± 111 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
# %timeit torch.mv(dense_matrix, dense_vector_batched)

## Varying sparsity pattern with same sparsity ratio

In [30]:
def idx_1D_to_2D(x, sheet_size):
    return torch.stack((x // sheet_size[1], x % sheet_size[1]))


def idx_2D_to_1D(x, sheet_size):
    return x[0] * sheet_size[1] + x[1]

In [31]:
batch_size = 16
num_neurons = 10000
synapses_per_neuron = 100
sheet_size = (100, 100)
connectivity_std = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

x = torch.randn(num_neurons, batch_size).to(device)
bias = torch.randn(num_neurons, 1).to(device)

In [32]:
# Create a sparse tensor for the weight matrix
indices = []

# Create adjacency matrix with normal distribution randomized weights
for i in range(num_neurons):
    synapses = torch.randint(0, num_neurons, (synapses_per_neuron,))
    synapse_root = torch.full_like(synapses, i)
    indices.append(torch.stack((synapses, synapse_root)))
indices = torch.cat(indices, dim=1)
# Xavier initialization of values (synapses_per_neuron is the fan-in/out)
values = torch.randn(num_neurons * synapses_per_neuron) * math.sqrt(
    1 / synapses_per_neuron
)

random_weight = (
    torch.sparse_coo_tensor(
        indices, values, (num_neurons, num_neurons), check_invariants=True
    )
    .coalesce()
    .to(device)
)

In [39]:
# Create a sparse tensor for the weight matrix
indices = []

# Create adjacency matrix with normal distribution randomized weights
for i in range(sheet_size[0]):
    for j in range(sheet_size[1]):
        synapses = (
            torch.randn(2, synapses_per_neuron)
            * torch.tensor((connectivity_std, connectivity_std))[:, None]
            + torch.tensor((i, j))[:, None]
        ).long()
        synapses = synapses.clamp(
            torch.tensor((0, 0))[:, None],
            torch.tensor((sheet_size[0] - 1, sheet_size[1] - 1))[:, None],
        )
        synapses = idx_2D_to_1D(synapses, sheet_size)
        synapse_root = torch.full_like(
            synapses, idx_2D_to_1D(torch.tensor((i, j)), sheet_size)
        )
        indices.append(torch.stack((synapses, synapse_root)))
indices = torch.cat(indices, dim=1)
# Sort indices by synapses
# indices = indices[:, torch.argsort(indices[0])]
# Xavier initialization of values (synapses_per_neuron is the fan-in/out)
values = torch.randn(indices.shape[1]) * math.sqrt(1 / synapses_per_neuron)

topographic_weight = (
    torch.sparse_coo_tensor(indices, values, (num_neurons, num_neurons))
    .coalesce()
    .to(device)
)

In [40]:
%timeit torch.sparse.addmm(bias, random_weight, x)

145 µs ± 85 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [41]:
%timeit torch.sparse.addmm(bias, topographic_weight, x)

132 µs ± 44.7 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [36]:
random_weight._nnz(), topographic_weight._nnz()

(995005, 908210)