In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):

    def __init__(self):
        super(Net, self).__init__()
        # 1 input image channel, 6 output channels, 3x3 square convolution
        # kernel
        self.conv1 = nn.Conv2d(1, 6, 3)
        self.conv2 = nn.Conv2d(6, 16, 3)
        # an affine operation: y = Wx + b
        self.fc1 = nn.Linear(16 * 6 * 6, 120)  # 6*6 from image dimension
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        # Max pooling over a (2, 2) window
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        # If the size is a square you can only specify a single number
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        x = x.view(-1, self.num_flat_features(x))
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

    def num_flat_features(self, x):
        size = x.size()[1:]  # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features

net = Net()
print(net)

Net(
  (conv1): Conv2d(1, 6, kernel_size=(3, 3), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(3, 3), stride=(1, 1))
  (fc1): Linear(in_features=576, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)


In [2]:
list(net.parameters())[0]

Parameter containing:
tensor([[[[-0.0783, -0.1777,  0.3286],
          [ 0.1280, -0.1422,  0.1537],
          [ 0.3057,  0.1259,  0.3002]]],


        [[[ 0.1072, -0.2581,  0.0357],
          [ 0.2387,  0.1159, -0.1828],
          [ 0.1671, -0.1369,  0.2941]]],


        [[[-0.1339, -0.1289,  0.0084],
          [-0.2539, -0.0133, -0.3316],
          [ 0.1135, -0.2145,  0.0456]]],


        [[[ 0.0004, -0.1548,  0.2371],
          [ 0.1209, -0.1726, -0.2389],
          [-0.2637, -0.3155,  0.0104]]],


        [[[-0.2016,  0.2132, -0.3194],
          [ 0.3062, -0.2212,  0.1448],
          [-0.1464,  0.2660,  0.1532]]],


        [[[ 0.2740, -0.2365, -0.0249],
          [ 0.1758,  0.0049, -0.0948],
          [ 0.2339,  0.0532,  0.1053]]]], requires_grad=True)

In [9]:
inp = torch.randn(1,1,32,32)
out = net(inp)
out

tensor([[-0.0405,  0.1092, -0.0095,  0.0579, -0.0506, -0.1690, -0.1357, -0.0206,
         -0.1228, -0.0069]], grad_fn=<AddmmBackward>)

In [18]:
print(list(net.parameters())[0].grad)

None


In [19]:
net.zero_grad()
print(list(net.parameters())[0].grad)


None


In [23]:
out.backward(torch.randn(1, 10))
print(list(net.parameters())[0].grad)


tensor([[[[-0.1126, -0.1494,  0.0310],
          [-0.0322, -0.1467,  0.0929],
          [ 0.0210,  0.0844,  0.0066]]],


        [[[ 0.0129,  0.0668, -0.0654],
          [-0.0799,  0.0328, -0.1382],
          [-0.0206,  0.0084,  0.1162]]],


        [[[ 0.0312,  0.0235,  0.0566],
          [ 0.0345, -0.1269, -0.0261],
          [ 0.0453,  0.0543,  0.0100]]],


        [[[ 0.0739, -0.1061, -0.0844],
          [ 0.0124, -0.1925,  0.1466],
          [ 0.0403,  0.0466,  0.1065]]],


        [[[-0.0239,  0.0114,  0.0576],
          [ 0.0430,  0.0811, -0.1137],
          [ 0.0393,  0.0197,  0.0424]]],


        [[[ 0.0236,  0.0008, -0.1074],
          [-0.0530,  0.0173,  0.0455],
          [-0.0294,  0.0035,  0.0688]]]])


In [44]:
output = net(inp)
target = torch.randn(10)  # a dummy target, for example
target = target.view(1, -1)  # make it the same shape as output

criterion = nn.MSELoss()
loss = criterion(output, target)
print(loss)

tensor(0.6730, grad_fn=<MseLossBackward>)


In [46]:
net.zero_grad()     # zeroes the gradient buffers of all parameters

print('conv1.bias.grad before backward')
print(net.conv1.bias.grad)

loss.backward()

print('conv1.bias.grad after backward')
print(net.conv1.bias.grad)

conv1.bias.grad before backward
tensor([0., 0., 0., 0., 0., 0.])
conv1.bias.grad after backward
tensor([ 0.0006, -0.0201, -0.0059,  0.0094, -0.0216, -0.0148])


In [181]:
import torch.optim as optim

# create your optimizer
optimizer = optim.SGD(net.parameters(), lr=0.01)

# in your training loop:
optimizer.zero_grad()   # zero the gradient buffers
print(net.conv1.bias.grad)
output = net(inp)
loss = criterion(output, target)
loss.backward()

optimizer.step()    # Does the update


print(net.conv1.bias.grad)

tensor([0., 0., 0., 0., 0., 0.])
tensor([-4.2362e-04,  1.9571e-05,  0.0000e+00,  9.8025e-04,  0.0000e+00,
         0.0000e+00])


In [4]:
class NLP(nn.Module):
    def __init__(self):
        super(NLP, self).__init__()

        self.model = nn.Sequential(
            nn.Linear(784, 200),
            nn.ReLU(inplace=True),
            nn.Linear(200, 200),
            nn.ReLU(inplace=True),
            nn.Linear(200, 10),
            nn.ReLU(inplace=True)
        )

    def forward(self, x):
        x = self.model(x)
        return x

In [29]:
x = torch.randn(1,10)
x

tensor([[-0.0049,  1.8757,  3.1802, -0.1569, -2.1356, -0.0187,  1.0742,  1.6761,
         -0.8035, -0.4543]])

In [30]:
F.relu(x)

tensor([[0.0000, 1.8757, 3.1802, 0.0000, 0.0000, 0.0000, 1.0742, 1.6761, 0.0000,
         0.0000]])

In [31]:
x

tensor([[-0.0049,  1.8757,  3.1802, -0.1569, -2.1356, -0.0187,  1.0742,  1.6761,
         -0.8035, -0.4543]])

In [32]:
nn.ReLU()(x)

tensor([[0.0000, 1.8757, 3.1802, 0.0000, 0.0000, 0.0000, 1.0742, 1.6761, 0.0000,
         0.0000]])

In [33]:
import torchvision

train_loader = torch.utils.data.DataLoader(
    torchvision.datasets.MNIST('data', train=True,
                               download=True,
                               transform=torchvision.transforms.ToTensor())
)

test_loader = torch.utils.data.DataLoader(
    torchvision.datasets.MNIST('data', train=False,
                               download=True,
                               transform=torchvision.transforms.ToTensor())
)

In [36]:
net = NLP()
optimizer = torch.optim.SGD(net.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

for epoch in range(2):
    for batch_index, (data, target) in enumerate(train_loader):
        data = data.view(-1, 28*28)
        logits = net(data)
        loss = criterion(logits, target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch_index%10000 ==0 :
            print("epoch:{} item:{} Loss:{}".format(epoch,batch_index,loss.item()))

epoch:0 item:0 Loss:2.3296942710876465
epoch:0 item:10000 Loss:2.5667803287506104
epoch:0 item:20000 Loss:0.07751094549894333
epoch:0 item:30000 Loss:2.3025851249694824


KeyboardInterrupt: 

In [None]:
test_loss = 0
correct = 0
for data,target in test_loader:
    data = data.view(-1, 28*28)
    logits = net(data)
    test_loss = criterion(logits, target).item()
    pred = logits.data.max(1)[1]
    correct += pred.eq(target.data).sum()
test_loss /= len(test_loader)
print("loss:{} accuracy:{}/{} ({})".format(test_loss, correct, len(test_loader), correct/len(test_loader)))