In [1]:
import torch.sparse as sparse
import numpy.random as random
import numpy as np
from torchviz import make_dot
import torch
from torch import autograd
from d2l import torch as d2l

In [2]:
def synthetic_data(w, b, num_examples):  #@save
    """Generate y = Xw + b + noise."""
    X = torch.normal(0, 1, (num_examples, len(w)))
    y = torch.matmul(X, w)
    y += torch.normal(0, 0.01, y.shape)
    return X, y.reshape((-1, 1))

true_w = torch.tensor([2, -3.4, 4.9, -1, 3.2, 0.35])
true_b = 4.2
features, labels = synthetic_data(true_w, true_b, 1000)

In [3]:
def data_iter(batch_size, features, labels):
    num_examples = len(features)
    indices = list(range(num_examples))
    # The examples are read at random, in no particular order
    random.shuffle(indices)
    for i in range(0, num_examples, batch_size):
        batch_indices = torch.tensor(indices[i:min(i +
                                                   batch_size, num_examples)])
        yield features[batch_indices], labels[batch_indices]

In [4]:
def forward(features, W_hidden, w_out):
    return torch.matmul(torch.matmul(features, W_hidden), w_out)

In [5]:
def forward_sparse(features, W_hidden, w_out):
    # import pdb; pdb.set_trace()
    hidden = sparse.mm(W_hidden, features.t())
    return sparse.mm(w_out.t(), hidden)
    # sparse.mm is very specific which matrices it gets. The first needs to sparse, the second needs to be strided/dense. So one needs to do a bunch of .t().

In [6]:
def squared_loss(y_hat, y):  #@save
    """Squared loss."""
    return (y_hat - y.reshape(y_hat.shape))**2 / 2

In [7]:
def top_kast_forward(w, kast = 0.5):
    """Selects 50% largest coefficients, not exactly like in paper, but for testing it should be fine."""
    # import pdb; pdb.set_trace()
    threshold = np.quantile(w.detach().numpy().reshape(-1), 0.5)
    mask = w < threshold
    w[mask] = 0
    return w.to_sparse().requires_grad_(True)

In [19]:
def compute_mask(w, kast = 0.5):
    # import pdb; pdb.set_trace()
    if w.is_sparse:
        threshold = np.quantile(w.values().detach(), kast)
        mask = w.values().detach() > threshold
    else:
        threshold = np.quantile(w.reshape(-1).detach(), kast)
        mask = w.reshape(-1).detach() > threshold
    return mask.reshape(w.shape)

In [9]:
def sgd(params, lr, batch_size):  #@save
    """Minibatch stochastic gradient descent."""
    with torch.no_grad():
        for param in params:
            # import pdb; pdb.set_trace()
            param -= lr * param.grad / batch_size
            param.grad.zero_()

In [14]:
lr = 0.003
num_epochs = 10
net = forward_sparse
loss = squared_loss
batch_size = 10

W_hidden = torch.normal(0, 0.01, size=(6, 6), requires_grad = False)
w_out = torch.normal(0, 0.01, size=(6, 1), requires_grad = False)
# W_hidden.retain_grad() # .to_sparse() makes parameters not leave parameters anymore, so to train those parameters you need to retain the grad.
# w_out.retain_grad()

In [20]:
for epoch in range(num_epochs):
    for X, y in data_iter(batch_size, features, labels):
        # Make sparse forward pass
        mask_hidden_forward = compute_mask(W_hidden)
        mask_out_forward = compute_mask(w_out)

        W_hidden_forward = torch.tensor(W_hidden * mask_hidden_forward, requires_grad = True).to_sparse()
        W_hidden_forward.retain_grad()
        w_out_forward = torch.tensor(w_out * mask_out_forward, requires_grad = True).to_sparse()
        w_out_forward.retain_grad()

        # mask_hidden_backward = compute_mask(W_hidden, kast=0.7).to_sparse()
        # mask_out_backward = compute_mask(w_out, kast=0.7).to_sparse()
        # W_hidden_backward = (W_hidden * mask_hidden_backward).retain_grad()
        # w_out_backward = (w_out * mask_out_backward).retain_grad()
        # import pdb; pdb.set_trace()
        y_hat = net(X, W_hidden_forward, w_out_forward)
        l = loss(y_hat, y)  # Minibatch loss in `X` and `y`
        l.sum().backward()
        # import pdb; pdb.set_trace()
        sgd([W_hidden_forward, w_out_forward], lr, batch_size)  # Update parameters using their gradient

        # Inverse the masks. So that you can change the values in dense Parameter Matrix
        mask_hidden_forward = mask_hidden_forward == False
        mask_out_forward = mask_out_forward == False

        W_hidden = (W_hidden * mask_hidden_forward + W_hidden_forward).detach()
        w_out = (w_out * mask_out_forward + w_out_forward).detach()
    with torch.no_grad():
        train_l = loss(net(features, W_hidden, w_out), labels)
        print(f'epoch {epoch + 1}, loss {float(train_l.mean()):f}')

  W_hidden_forward = torch.tensor(W_hidden * mask_hidden_forward, requires_grad = True).to_sparse()
  w_out_forward = torch.tensor(w_out * mask_out_forward, requires_grad = True).to_sparse()
> <ipython-input-20-3d7f25cdb87b>(21)<module>()
-> sgd([W_hidden_forward, w_out_forward], lr, batch_size)  # Update parameters using their gradient
tensor(indices=tensor([[0, 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4, 4, 5, 5, 5, 5],
                       [0, 1, 4, 1, 2, 0, 2, 4, 5, 1, 5, 0, 1, 4, 0, 2, 4, 5]]),
       values=tensor([ 1.0873e-02,  5.3343e-02, -2.9064e-03,  5.0835e-02,
                      -9.5721e-04,  7.0614e-01,  1.8443e+00,  1.2701e+00,
                       1.3572e-01,  4.8863e-02,  1.3810e-02,  4.3484e-03,
                       5.3805e-02,  3.5719e-03,  2.8024e-01,  7.5755e-01,
                       3.4717e-01,  5.7720e-02]),
       size=(6, 6), nnz=18, layout=torch.sparse_coo, grad_fn=<ToSparseBackward>)
tensor(indices=tensor([[0, 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4, 4, 5, 5, 5,

BdbQuit: 

In [None]:
class layer(nn.Module):

    dense = parameter
    mask = mask

    def masking():
    

    def update_dense():


In [4]:
help(autograd.grad)

Help on function grad in module torch.autograd:

grad(outputs: Union[torch.Tensor, Sequence[torch.Tensor]], inputs: Union[torch.Tensor, Sequence[torch.Tensor]], grad_outputs: Union[torch.Tensor, Sequence[torch.Tensor], NoneType] = None, retain_graph: Union[bool, NoneType] = None, create_graph: bool = False, only_inputs: bool = True, allow_unused: bool = False) -> Tuple[torch.Tensor, ...]
    Computes and returns the sum of gradients of outputs w.r.t. the inputs.
    
    ``grad_outputs`` should be a sequence of length matching ``output``
    containing the "vector" in Jacobian-vector product, usually the pre-computed
    gradients w.r.t. each of the outputs. If an output doesn't require_grad,
    then the gradient can be ``None``).
    
    If ``only_inputs`` is ``True``, the function will only return a list of gradients
    w.r.t the specified inputs. If it's ``False``, then gradient w.r.t. all remaining
    leaves will still be computed, and will be accumulated into their ``.grad``
 