In [14]:
import torch
from torch import nn
import numpy as np

# (1,1) Linear Neural Net

Create a (1,1) linear "neural net"
It has one input and one output and calculates
y = weight * x + bias

In [15]:
net = nn.Linear(1,1)
net.weight, net.bias

(Parameter containing:
 tensor([[-0.1086]], requires_grad=True),
 Parameter containing:
 tensor([0.3745], requires_grad=True))

throw some input on the "neural net"
we make a 2 x 1 tensor; i.e. two different inputs

In [16]:
x = torch.tensor([[1.0], [2.0]])

In [17]:
y = net(x)
y

tensor([[0.2659],
        [0.1573]], grad_fn=<AddmmBackward>)

Taking weight & bias from above and calculate output from "neural net" by hand...

In [18]:
-0.3517*1+0.9578, -0.3517*2+0.9578

(0.6061, 0.25439999999999996)

# (2,1) Linear Neural Network

Two inputs, one output

$y = weight[0]*x[0] + weight[1]*x[1] + bias$


In [19]:
net = nn.Linear(2,1)
net.weight, net.bias

(Parameter containing:
 tensor([[-0.6589,  0.3121]], requires_grad=True),
 Parameter containing:
 tensor([-0.1624], requires_grad=True))

For fun, we also calculate the gradient of y wrt x
To this end, we need `x.requires_grad_(True)` and a `detach()` in all x expressions further below

In [20]:
x = torch.tensor([1.0, 2.0])
x.requires_grad_(True)
y = net(x)
y

tensor([-0.1970], grad_fn=<AddBackward0>)

In [21]:
net.weight.detach().numpy(), x.detach().numpy()*net.weight.detach().numpy()

(array([[-0.6588896 ,  0.31214994]], dtype=float32),
 array([[-0.6588896,  0.6242999]], dtype=float32))

In [22]:
np.sum(x.detach().numpy()*net.weight.detach().numpy()) + net.bias.detach().numpy()

array([-0.1969704], dtype=float32)

In [23]:
list(net.named_parameters())

[('weight',
  Parameter containing:
  tensor([[-0.6589,  0.3121]], requires_grad=True)),
 ('bias',
  Parameter containing:
  tensor([-0.1624], requires_grad=True))]

Now we calculate the gradient of y wrt to x. The gradient is equal the weights.

In [24]:
y.backward()
print(x.grad)

tensor([-0.6589,  0.3121])


# (2,1) Network with non-linear activation function (tanh)

In [25]:
net = nn.Sequential(nn.Linear(2, 1), nn.Tanh())
x = torch.tensor([1.0, 2.0])
y = net(x)
y

tensor([-0.7648], grad_fn=<TanhBackward>)

What happens? The neural network calculates

$y = \tanh ( weight[0]*x[0] + weight[1]*x[1] + bias )$

I.e. it takes the output from the first layer (which is a linear network) and applies the second layer (tanh) to it.

In [26]:
np.tanh(
    np.sum(
        x.numpy()*net[0].weight.detach().numpy())
    + net[0].bias.detach().numpy())

array([-0.76476896], dtype=float32)

# (2,2) Network with non-linear activation function

That's now a network with 2 inputs and 2 outputs. Each output is tanh'd

In [27]:
net = nn.Sequential(nn.Linear(2, 2), nn.Tanh())
x = torch.tensor([1.0, 2.0])
y = net(x)
y

tensor([-0.5356,  0.9079], grad_fn=<TanhBackward>)

In [31]:
net[0].weight, net[0].bias

(Parameter containing:
 tensor([[ 0.2782, -0.1811],
         [ 0.2763,  0.4370]], requires_grad=True),
 Parameter containing:
 tensor([-0.5139,  0.3649], requires_grad=True))

In [38]:
x.numpy()*net[0].weight.detach().numpy()

array([[ 0.2782038 , -0.36226952],
       [ 0.2763092 ,  0.87395585]], dtype=float32)

In [39]:
# the only tricky thing here is that we sum along the rows; this is achieved by the rows=1 keyword. 
# Default behaviour is summing along columns which is NOT correct in this case
np.sum(x.numpy()*net[0].weight.detach().numpy(), axis=1)

array([-0.08406574,  1.150265  ], dtype=float32)

In [40]:
np.sum(x.numpy()*net[0].weight.detach().numpy(), axis=1) + net[0].bias.detach().numpy()

array([-0.59801096,  1.5151753 ], dtype=float32)

In [41]:
np.tanh(np.sum(x.numpy()*net[0].weight.detach().numpy(), axis=1) + net[0].bias.detach().numpy())

array([-0.5356327,  0.9078532], dtype=float32)