In [13]:
import torch.nn as nn
import torch
from functools import reduce
from operator import mul

"""Implements the EmbeddingMul class
Author: Noémien Kocher
Date: Fall 2018
Unit test: embedding_mul_test.py
"""



class EmbeddingMul(nn.Module):
    """This class implements a custom embedding mudule which uses matrix
    multiplication instead of a lookup. The method works in the functional
    way.
    Note: this class accepts the arguments from the original pytorch module
    but only with values that have no effects, i.e set to False, None or -1.
    """

    def __init__(self, depth, device):
        super(EmbeddingMul, self).__init__()
        # i.e the dictionnary size
        self.depth = depth
        self.device = device
        self.ones = torch.eye(depth, requires_grad=True, device=self.device)
#         self._requires_grad = True
        # "oh" means One Hot
        self.last_oh = None
        self.last_weight = None

    @property
    def requires_grad(self):
        return self._requires_grad

    @requires_grad.setter
    def requires_grad(self, value):
        self._requires_grad = value
        logger.info(
            f"(embedding mul) requires_grad set to {self.requires_grad}. ")

    def forward(self, input, weight, padding_idx=None, max_norm=None,
                norm_type=2., scale_grad_by_freq=False, sparse=False):
        """Declares the same arguments as the original pytorch implementation
        but only for backward compatibility. Their values must be set to have
        no effects.
        Args:
            - input: of shape (bptt, bsize)
            - weight: of shape (dict_size, emsize)
        Returns:
            - result: of shape (bptt, bsize, dict_size)
        """
        # ____________________________________________________________________
        # Checks if unsupported argument are used
        if padding_idx != -1:
            raise NotImplementedError(
                f"padding_idx must be -1, not {padding_idx}")
        if max_norm is not None:
            raise NotImplementedError(f"max_norm must be None, not {max_norm}")
        if scale_grad_by_freq:
            raise NotImplementedError(f"scale_grad_by_freq must be False, "
                                      f"not {scale_grad_by_freq}")
        if sparse:
            raise NotImplementedError(f"sparse must be False, not {sparse}")
        # ____________________________________________________________________

        if self.last_oh is not None:
            del self.last_oh
        self.last_oh = self.to_one_hot(input)

        with torch.set_grad_enabled(self.requires_grad):
            result = torch.stack(
                [torch.mm(batch.float(), weight)
                 for batch in self.last_oh], dim=0)
        self.last_weight = weight.clone()
        return result

    def to_one_hot(self, input):
        # Returns a new tensor that doesn't share memory
        result = torch.index_select(
            self.ones, 0, input.view(-1).long()).view(input.size()+(self.depth,))
        result.requires_grad = True
        return result

    def __repr__(self):
        return self.__class__.__name__ + "({})".format(self.depth)

In [14]:
device = "cuda:0"
# model = model.to(device)

In [15]:

input = torch.tensor([[1, 2, 0], [3, 4, 5]]).to(device)
dim = 10
mod = EmbeddingMul(dim, device)
emmatrix = torch.rand(10, 5).to(device)
print(emmatrix)
output = mod(input, emmatrix, -1)
print(output.shape)

tensor([[0.9185, 0.0536, 0.5641, 0.7005, 0.5286],
        [0.1699, 0.3948, 0.0829, 0.2631, 0.3625],
        [0.9101, 0.5090, 0.7041, 0.6809, 0.4297],
        [0.9429, 0.4435, 0.4642, 0.6201, 0.0467],
        [0.3296, 0.6656, 0.7738, 0.2088, 0.3047],
        [0.6299, 0.4530, 0.4670, 0.9081, 0.3913],
        [0.8439, 0.4800, 0.0558, 0.5635, 0.7674],
        [0.6071, 0.2989, 0.0061, 0.1715, 0.9280],
        [0.6510, 0.4545, 0.7552, 0.3135, 0.3300],
        [0.0050, 0.8038, 0.9484, 0.9613, 0.0032]], device='cuda:0')


RuntimeError: you can only change requires_grad flags of leaf variables.

In [28]:
class HotEmbedding(torch.nn.Module):
    def __init__(self, max_val, embedding_dim, eps=1e-2):
        super(HotEmbedding, self).__init__()
        self.A = torch.arange(max_val, requires_grad=False)
        self.B = torch.randn((max_val, embedding_dim), requires_grad=True)
        self.eps = eps

    def forward(self, x):
        return 1/((x.unsqueeze(1)**2 - self.A**2)+self.eps) @ self.B

In [29]:
layer = HotEmbedding(10, 5)
x = torch.tensor([1.,2.,3.,1.,2.,3.], requires_grad=True)git

y = layer(x)
z = y.sum()

RuntimeError: The size of tensor a (6) must match the size of tensor b (10) at non-singleton dimension 2

In [23]:
y

tensor([[-168.8604,  -76.3864,   64.0532,   86.7358, -189.1458],
        [-138.6388,   32.7710,   58.9512,   57.2478, -170.6701],
        [ -40.8857,  -26.4066,  -17.7006,  103.6303,   71.7719],
        [-168.8604,  -76.3864,   64.0532,   86.7358, -189.1458],
        [-138.6388,   32.7710,   58.9512,   57.2478, -170.6701],
        [ -40.8857,  -26.4066,  -17.7006,  103.6303,   71.7719]],
       grad_fn=<MmBackward>)

In [22]:
torch.autograd.grad(z, x)

(tensor([ 56734.0000,  63754.9609, -54358.4766,  56734.0000,  63754.9609,
         -54358.4766]),)