In [14]:
# Check what the expected convolution is. In the definition he wants, zeros are appended to the end of the input and then the convolution is applied. Let's verify this manually with an example.

import torch
import torch.nn as nn

# Length-6 signal followed by two explicit zeros, viewed as (1, 1, 8).
I = torch.tensor([1, 2, 4, 1, 1, 6, 0, 0], dtype=torch.float).view(1, 1, 8)

# Conv1d with 1 input channel, 1 output channel, kernel width 3, no bias, no built-in padding.
m = nn.Conv1d(1, 1, 3, bias=False, padding=0)
# Replace the randomly initialised weights with a fixed kernel so the output can be checked by hand.
m.weight.data = torch.tensor([[[2, 3, 1]]], dtype=torch.float)

# Show the output together with its shape (length 8 - 3 + 1 = 6).
m(I), m(I).shape

(tensor([[[12., 17., 12., 11., 20., 12.]]], grad_fn=<ConvolutionBackward0>),
 torch.Size([1, 1, 6]))

In [15]:
# Backpropagate the upstream gradient D = [0, 1, 2, 3, 4, 5] through the conv output.
a = m(I)
a.backward(torch.arange(6).view(1, 1, 6).float())

In [7]:
torch.arange(6)

tensor([0, 1, 2, 3, 4, 5])

In [16]:
m.weight.grad

tensor([[[47., 33., 21.]]])

In [10]:
# hand check of m.weight.grad[0] = sum_i D[i] * I[i] with D = [0..5]: 1*2 + 2*4 + 3*1 + 4*1 + 5*6
2 + 4 * 2 + 3 + 4 + 6 * 5

47

In [11]:
# hand check of m.weight.grad[1] = sum_i D[i] * I[i + 1] with D = [0..5]: 1*4 + 2*1 + 3*1 + 4*6 (+ 5*0)
4 + 2 + 3 + 6 * 4

33

In [13]:
# and the last one works out the same way: 1 + 2 + 3 * 6 = 21

Parameter containing:
tensor([[[2., 3., 1.]]], requires_grad=True)

In [None]:
# ahh on the ipad I got all those exact same answers!!
# so the backward is: convolve D with I, but pad two 0s (or however many we want) on the left. Let's write a loop for this

In [37]:
# Hand-rolled convolution with implicit zero padding on the right, for a
# single batch entry and a single channel.
I = torch.tensor([[[1, 2, 4, 1, 1, 6]]], dtype=torch.float)
K = torch.tensor([[[2, 3, 1]]], dtype=torch.float)
O = torch.zeros(I.shape)
length = I.shape[2]
for i in range(length):
    # Use all 3 taps while the window fits; near the right edge use only the
    # taps that still overlap real samples (the rest would hit padding zeros).
    num_elem = min(3, length - i)
    O[0, 0, i] = I[0, 0, i : i + num_elem].dot(K[0, 0, :num_elem])

In [39]:
O  # perfect match!!

tensor([[[12., 17., 12., 11., 20., 12.]]])

In [42]:
# let's make it general and take in any size input in a function
def oned_1channel_conv(I, K):
    """Single-channel 1-D convolution with implicit zero padding on the right.

    For every position ``i`` of the input, the kernel is laid over
    ``I[0, 0, i : i + klen]``. Where the window runs past the end of the
    input, the missing samples count as zeros, so only the overlapping
    kernel taps contribute. The output therefore has the same length as
    the input.

    Args:
        I: 3-D tensor indexed as ``I[0, 0, :]``. Only batch 0 / channel 0 is
            read; any other batch or channel entries are ignored.
        K: 3-D tensor indexed as ``K[0, 0, :]`` holding the kernel taps. Must
            have the same dtype as ``I`` (required by ``Tensor.dot``).

    Returns:
        Tensor with the shape, dtype and device of ``I``. Only ``[0, 0, :]``
        is written; every other entry stays zero.
    """
    length, klen = I.shape[2], K.shape[2]
    # Allocate with the input's dtype/device: a bare torch.zeros(I.shape) is
    # always default-dtype on CPU, which silently downcasts float64 inputs.
    O = torch.zeros(I.shape, dtype=I.dtype, device=I.device)
    for i in range(length):
        # Number of kernel taps that still overlap real input samples.
        num_elem = min(klen, length - i)
        O[0, 0, i] = I[0, 0, i : i + num_elem].dot(K[0, 0, :num_elem])
    return O


# Run the same worked example (signal [1, 2, 4, 1, 1, 6], kernel [2, 3, 1]) through the function.
I = torch.tensor([1, 2, 4, 1, 1, 6], dtype=torch.float).view(1, 1, -1)
K = torch.tensor([2, 3, 1], dtype=torch.float).view(1, 1, -1)
O = oned_1channel_conv(I, K)
O

tensor([[[12., 17., 12., 11., 20., 12.]]])

In [43]:
# now the hard part is what if we have multiple input and output channels!!
# first see what it should be

# Zero-extended signal again, shaped (1, 1, 8).
I = torch.tensor([1, 2, 4, 1, 1, 6, 0, 0], dtype=torch.float).view(1, 1, -1)
# 1 input channel, 2 output channels, kernel width 3; weights are left at their random initial values.
m = nn.Conv1d(1, 2, 3, bias=False, padding=0)
# Inspect the weight layout: (out_channels, in_channels, kernel_size).
m.weight.shape

torch.Size([2, 1, 3])

In [44]:
# Conv1d now holds a separate width-3 kernel for each of the 2 output channels, so its output is 2 x 6.
# Reproduce it manually by running the single-channel function once per output-channel kernel.
# Note: I already ends in two explicit zeros and the function keeps the input length, so each manual result carries two extra trailing zeros.
oned_1channel_conv(I, m.weight[0:1, :, :]), oned_1channel_conv(I, m.weight[1:2, :, :])

(tensor([[[-0.4687,  1.5251,  0.4510, -1.5939,  2.6247,  0.5719,  0.0000,
            0.0000]]], grad_fn=<CopySlices>),
 tensor([[[0.6844, 1.9922, 2.0244, 0.2135, 2.2116, 2.7310, 0.0000, 0.0000]]],
        grad_fn=<CopySlices>))

In [45]:
m(I)  # again exactly what we expect, so can loop over output channels

tensor([[[-0.4687,  1.5251,  0.4510, -1.5939,  2.6247,  0.5719],
         [ 0.6844,  1.9922,  2.0244,  0.2135,  2.2116,  2.7310]]],
       grad_fn=<ConvolutionBackward0>)

In [46]:
# the real hard part is if we have multiple input channels
I = torch.tensor([1, 2, 4, 1, 1, 6, 0, 0], dtype=torch.float).view(1, 1, -1)
# duplicate the input channel by concatenating along dim=1 (the channel axis)
I = torch.cat((I, I), dim=1)
I.shape

torch.Size([1, 2, 8])

In [48]:
a = torch.randn(2, 8)
a * a  # elementwise product only; a dot product is this multiplication followed by a sum, so we can think of it as a zip and reduce, which is exactly what we talked about in class

tensor([[9.9274e-01, 9.2779e-01, 2.1114e+00, 1.8464e-03, 3.1480e-01, 8.7532e-01,
         8.0751e-01, 4.8517e-02],
        [1.3354e-02, 1.0712e+00, 1.9772e+00, 4.9204e-01, 1.3674e-01, 1.3449e-03,
         7.4082e-01, 4.4926e+00]])

In [None]:
# or we just implement it ourselves: loop over the length of the input, and for each position loop over the input channels and the output channels, taking the dot product of the input channel with the kernel for that output channel

"""
Basics of the implementation
The primary outer loop runs over the input length; each iteration corresponds to one output position across the entire out-channel dimension. This is the loop we parallelize.
We then move over the weights and do the dot products; that takes a lot of sums though...
"""

In [51]:
# so we made a basic function, let's test it
import minitorch

# Same signal and kernel as the torch example, without the explicit trailing zeros.
t = minitorch.tensor([1, 2, 4, 1, 1, 6]).view(1, 1, 6)
t2 = minitorch.tensor([[2, 3, 1]]).view(1, 1, 3)
# Forward pass through the minitorch conv under test.
out = minitorch.Conv1dFun.apply(t, t2)

In [52]:
out


[
	[
		[12.00 17.00 12.00 11.00 20.00 12.00]]]

In [58]:
# seems to work, now let's see if reverse works
t = minitorch.tensor([1, 2, 4, 1, 1, 6]).view(1, 1, 6)
# t.requires_grad_(True)
t2 = minitorch.tensor([[2, 3, 1]]).view(1, 1, 3)
# Only the kernel tracks gradients in this cell.
t2.requires_grad_(True)
out = minitorch.Conv1dFun.apply(t, t2)
# Same upstream gradient [0..5] as the torch check, where the weight grad was [47, 33, 21].
out.backward(minitorch.tensor([0, 1, 2, 3, 4, 5]).view(1, 1, 6))

In [59]:
t2.grad


[
	[
		[47.00 33.00 21.00]]]

In [1]:
# yeah it seems to be working really well actually... huh I'll take it, so it works for 1 channel, the real test is if it works for multiple channels
# try it for 2 output channels, but make their weights identical
import minitorch

t = minitorch.tensor([1, 2, 4, 1, 1, 6]).view(1, 1, 6)
# Two output channels with identical kernels, viewed as (2, 1, 3).
t2 = minitorch.tensor([[2, 3, 1], [2, 3, 1]]).view(2, 1, 3)
t2.requires_grad_(True)
t2, t2.shape

(
 [
 	[
 		[2.00 3.00 1.00]]
 	[
 		[2.00 3.00 1.00]]],
 (2, 1, 3))

In [7]:
t


[
	[
		[1.00 2.00 4.00 1.00 1.00 6.00]]]

In [2]:
out = minitorch.Conv1dFun.apply(t, t2)
out


[
	[
		[12.00 17.00 12.00 11.00 20.00 12.00]
		[12.00 17.00 12.00 11.00 20.00 12.00]]]

In [3]:
out.shape

(1, 2, 6)

In [4]:
# Upstream gradient: the same [0..5] ramp for each of the two output channels, viewed as (1, 2, 6).
d = minitorch.tensor([[0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5]]).view(1, 2, 6)
d


[
	[
		[0.00 1.00 2.00 3.00 4.00 5.00]
		[0.00 1.00 2.00 3.00 4.00 5.00]]]

In [5]:
out.backward(d)

In [6]:
t2.grad


[
	[
		[47.00 33.00 21.00]]
	[
		[60.00 39.00 21.00]]]

In [10]:
# let's try it again
# Rebuild everything from scratch, this time tracking gradients on both the input and the kernel.
t = minitorch.tensor([1, 2, 4, 1, 1, 6]).view(1, 1, 6)
t.requires_grad_(True)
t2 = minitorch.tensor([[2, 3, 1], [2, 3, 1]]).view(2, 1, 3)
t2.requires_grad_(True)
out = minitorch.Conv1dFun.apply(t, t2)
# Identical upstream gradient for both output channels.
d = minitorch.tensor([[0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5]]).view(1, 2, 6)
out.backward(d)

In [11]:
t2.grad


[
	[
		[47.00 33.00 21.00]]
	[
		[60.00 39.00 21.00]]]

In [12]:
t.grad


[
	[
		[0.00 4.00 14.00 26.00 38.00 50.00]]]

In [19]:
import torch
import torch.nn as nn

# Torch reference: zero-extended input, now tracking gradients so I.grad can be inspected too.
I = torch.tensor([1, 2, 4, 1, 1, 6, 0, 0], dtype=torch.float).view(1, 1, 8)
I.requires_grad_(True)

m = nn.Conv1d(1, 1, 3, bias=False, padding=0)
# Fixed kernel [2, 3, 1] instead of the random initial weights.
m.weight.data = torch.tensor([[[2, 3, 1]]], dtype=torch.float)
a = m(I)
# Upstream gradient [0..5], one value per output position.
a.backward(torch.arange(6).view(1, 1, 6).float())
m.weight.grad

tensor([[[47., 33., 21.]]])

In [20]:
I.grad

tensor([[[ 0.,  2.,  7., 13., 19., 25., 19.,  5.]]])

In [22]:
# wait don't trust this I.grad, because those 0s at the end...
# Same signal, but padded with ten explicit zeros instead of two (16 samples in total).
I = torch.tensor(
    [1, 2, 4, 1, 1, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=torch.float
).view(1, 1, -1)
I.requires_grad_(True)

m = nn.Conv1d(1, 1, 3, bias=False, padding=0)
m.weight.data = torch.tensor([[[2, 3, 1]]], dtype=torch.float)
a = m(I)
# The upstream gradient is sized to the output length (I.shape[2] - 2), so here it is arange(14), not arange(6).
a.backward(torch.arange(I.shape[2] - 2).view(1, 1, -1).float())
m.weight.grad

tensor([[[47., 33., 21.]]])

In [23]:
I.grad  # we clearly see there's issues with this by adding a bunch of zeros, still changes the gradient because we arbitrarily set this thing, so let's manually calculate it

tensor([[[ 0.,  2.,  7., 13., 19., 25., 31., 37., 43., 49., 55., 61., 67., 73.,
          51., 13.]]])

In [25]:
# Unpadded signal and kernel, both tracking gradients so the manual conv can be differentiated.
I = torch.tensor([1, 2, 4, 1, 1, 6], dtype=torch.float).view(1, 1, -1)
I.requires_grad_(True)
K = torch.tensor([2, 3, 1], dtype=torch.float).view(1, 1, -1)
K.requires_grad_(True)


def oned_1channel_conv(I, K):
    """Single-channel 1-D convolution with implicit zero padding on the right.

    For every position ``i`` of the input, the kernel is laid over
    ``I[0, 0, i : i + klen]``. Where the window runs past the end of the
    input, the missing samples count as zeros, so only the overlapping
    kernel taps contribute. The output therefore has the same length as
    the input.

    Args:
        I: 3-D tensor indexed as ``I[0, 0, :]``. Only batch 0 / channel 0 is
            read; any other batch or channel entries are ignored.
        K: 3-D tensor indexed as ``K[0, 0, :]`` holding the kernel taps. Must
            have the same dtype as ``I`` (required by ``Tensor.dot``).

    Returns:
        Tensor with the shape, dtype and device of ``I``. Only ``[0, 0, :]``
        is written; every other entry stays zero.
    """
    length, klen = I.shape[2], K.shape[2]
    # Allocate with the input's dtype/device: a bare torch.zeros(I.shape) is
    # always default-dtype on CPU, which silently downcasts float64 inputs.
    O = torch.zeros(I.shape, dtype=I.dtype, device=I.device)
    for i in range(length):
        # Number of kernel taps that still overlap real input samples.
        num_elem = min(klen, length - i)
        O[0, 0, i] = I[0, 0, i : i + num_elem].dot(K[0, 0, :num_elem])
    return O


O = oned_1channel_conv(I, K)
O

tensor([[[12., 17., 12., 11., 20., 12.]]], grad_fn=<CopySlices>)

In [29]:
# Upstream gradient [0..5], one value per output position of the manual conv.
O.backward(torch.arange(I.shape[2]).view(1, 1, -1).float())

In [30]:
K.grad

tensor([[[47., 33., 21.]]])

In [31]:
I.grad

tensor([[[ 0.,  2.,  7., 13., 19., 25.]]])

In [3]:
# let's see if minitorch gets it in the original way
import minitorch

# Track gradients on the input only; the kernel's requires_grad_ call is commented out below.
t = minitorch.tensor([1, 2, 4, 1, 1, 6]).view(1, 1, 6)
t.requires_grad_(True)
# t.requires_grad_(True)
t2 = minitorch.tensor([[2, 3, 1]]).view(1, 1, 3)
# t2.requires_grad_(True)
out = minitorch.Conv1dFun.apply(t, t2)
# out.backward(minitorch.tensor([0,1,2,3,4,5]).view(1,1,6))
out


[
	[
		[12.00 17.00 12.00 11.00 20.00 12.00]]]

In [4]:
out.backward(minitorch.tensor([0, 1, 2, 3, 4, 5]).view(1, 1, 6))

In [5]:
t.grad


[
	[
		[0.00 2.00 7.00 13.00 19.00 25.00]]]

In [6]:
t2.grad

In [7]:
# but if we try to get grad of both, it crashes? It worked this time, this seems weird and random... ok I guess??
import minitorch

# Single output channel, with gradients tracked on both the input and the kernel.
t = minitorch.tensor([1, 2, 4, 1, 1, 6]).view(1, 1, 6)
t.requires_grad_(True)
t2 = minitorch.tensor([[2, 3, 1]]).view(1, 1, 3)
t2.requires_grad_(True)
out = minitorch.Conv1dFun.apply(t, t2)
# out.backward(minitorch.tensor([0,1,2,3,4,5]).view(1,1,6))
out


[
	[
		[12.00 17.00 12.00 11.00 20.00 12.00]]]

In [8]:
out.backward(minitorch.tensor([0, 1, 2, 3, 4, 5]).view(1, 1, 6))

In [9]:
t.grad


[
	[
		[0.00 2.00 7.00 13.00 19.00 25.00]]]

In [10]:
t2.grad


[
	[
		[47.00 33.00 21.00]]]

In [11]:
# but it still fails if we have 2 output channels: the kernels and the upstream gradients are identical
# for both channels, so both rows of t2.grad should be equal, yet the second row comes out different
t = minitorch.tensor([1, 2, 4, 1, 1, 6]).view(1, 1, 6)
t.requires_grad_(True)
t2 = minitorch.tensor([[2, 3, 1], [2, 3, 1]]).view(2, 1, 3)
t2.requires_grad_(True)
out = minitorch.Conv1dFun.apply(t, t2)
d = minitorch.tensor([[0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5]]).view(1, 2, 6)
out.backward(d)

In [12]:
t2.grad


[
	[
		[47.00 33.00 21.00]]
	[
		[60.00 39.00 21.00]]]

# made some changes, test again

In [1]:
# we changed the order of operations, let's test it again
import minitorch

# Same two-output-channel setup as before: identical kernels and identical upstream gradients.
t = minitorch.tensor([1, 2, 4, 1, 1, 6]).view(1, 1, 6)
t.requires_grad_(True)
t2 = minitorch.tensor([[2, 3, 1], [2, 3, 1]]).view(2, 1, 3)
t2.requires_grad_(True)
out = minitorch.Conv1dFun.apply(t, t2)
d = minitorch.tensor([[0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5]]).view(1, 2, 6)
out.backward(d)
# Both rows of the kernel gradient should be equal if the backward pass is right.
t2.grad


[
	[
		[47.00 33.00 21.00]]
	[
		[60.00 39.00 21.00]]]

In [10]:
t.grad  # indeed this is correct, shows there's some other fundamental problem


[
	[
		[0.00 4.00 14.00 26.00 38.00 50.00]]]

In [4]:
# let's compare to pytorch version
import torch
import torch.nn as nn

# Torch reference for the two-output-channel case, with the zero-extended input tracking gradients.
I = torch.tensor([1, 2, 4, 1, 1, 6, 0, 0], dtype=torch.float).view(1, 1, 8)
I.requires_grad_(True)
m = nn.Conv1d(1, 2, 3, bias=False, padding=0)
print(m.weight.shape)
# Give both output channels the same kernel [2, 3, 1]; the view restores the (2, 1, 3) weight layout.
m.weight.data = torch.tensor([[[2, 3, 1], [2, 3, 1]]], dtype=torch.float).view(2, 1, 3)
print(m.weight.shape)
a = m(I)
a

torch.Size([2, 1, 3])
torch.Size([2, 1, 3])


tensor([[[12., 17., 12., 11., 20., 12.],
         [12., 17., 12., 11., 20., 12.]]], grad_fn=<ConvolutionBackward0>)

In [7]:
# Upstream gradient [0..5] for a single channel, shaped (1, 1, 6).
d = torch.arange(6).view(1, 1, 6).float()
# stack it along the channel axis so both output channels get the same gradient
d = torch.cat((d, d), dim=1)
a.backward(d)

In [8]:
m.weight.grad

tensor([[[47., 33., 21.]],

        [[47., 33., 21.]]])

In [9]:
I.grad

tensor([[[ 0.,  4., 14., 26., 38., 50., 38., 10.]]])

In [1]:
# let's remove prange and try again
import minitorch

# Repeat the two-output-channel gradient check after the change to the conv implementation.
t = minitorch.tensor([1, 2, 4, 1, 1, 6]).view(1, 1, 6)
t.requires_grad_(True)
t2 = minitorch.tensor([[2, 3, 1], [2, 3, 1]]).view(2, 1, 3)
t2.requires_grad_(True)
out = minitorch.Conv1dFun.apply(t, t2)
d = minitorch.tensor([[0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5]]).view(1, 2, 6)
out.backward(d)
# The torch reference gives [47, 33, 21] for both rows of the kernel gradient.
t2.grad

The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see https://numba.readthedocs.io/en/stable/user/parallel.html#diagnostics for help.
[1m
File "minitorch/fast_conv.py", line 36:[0m
[1m
[1mdef _tensor_conv1d(
[0m[1m^[0m[0m
[0m



[
	[
		[47.00 33.00 21.00]]
	[
		[60.00 39.00 21.00]]]