In [2]:
import torch
import numpy as np
import torch.nn as nn

torch.__version__

'1.6.0'

In [2]:
# Create uninitialized tensor
x = torch.FloatTensor(2,3)
print(x)
#Initialize to zeros
x.zero_()
print(x)

tensor([[ 0.0000e+00, -1.0842e-19,  0.0000e+00],
        [-1.0842e-19,  9.8091e-45,  0.0000e+00]])
tensor([[0., 0., 0.],
        [0., 0., 0.]])


In [3]:
# Create random tensor (seed for repeatability)
torch.manual_seed(123)
x=torch.randn(2,3)
print(x)
# export to numpy array
x_np = x.numpy()
print(x_np)

tensor([[-0.1115,  0.1204, -0.3696],
        [-0.2404, -1.1969,  0.2093]])
[[-0.11146712  0.12036294 -0.3696345 ]
 [-0.24041797 -1.1969243   0.20926936]]


In [None]:
print(torch.eye(3))
print(torch.ones(3,3))
print(torch.zeros(2,3)) # make matrix
print(torch.arange(1,3)) # make array

All tensor have a size and type.

In [11]:
x=torch.FloatTensor(3,4)
print(x.size(), x.type())

torch.Size([3, 4]) torch.FloatTensor


Math, Linear Algebra, and Indexing (review )

In [22]:
x = torch.arange(0., 5.)
y = torch.exp(x)
xnt = y.type(dtype=torch.IntTensor)
print(x)
print(torch.sum(x))
print(torch.exp(x))
print(torch.sum(torch.exp(x)))
print(torch.mean(x))

tensor([1., 2., 3., 4.])
tensor(10.)
tensor([ 2.7183,  7.3891, 20.0855, 54.5982])
tensor(84.7910)
tensor(2.5000)

tensor([ 2,  7, 20, 54], dtype=torch.int32)


In [23]:
# create a tensor
x = torch.rand(3,2)
# copy to GPU
y = x.cuda()
# copy back to CPU
z = y.cpu()
# get CPU tensor as numpy array
# cannot get GPU tensor as numpy array directly
try:
    y.numpy()
except RuntimeError as e:
    print(e)

AssertionError: Torch not compiled with CUDA enabled

In [24]:
x = torch.rand(3,5)  # CPU tensor
y = torch.rand(5,4).cuda()  # GPU tensor
try:
    torch.mm(x,y)  # Operation between CPU and GPU fails
except TypeError as e:
    print(e)

AssertionError: Torch not compiled with CUDA enabled

In [26]:
nvcc --version

NameError: name 'nvcc' is not defined

In [25]:
# Put tensor on CUDA if available
x = torch.rand(3,2)
if torch.cuda.is_available():
    x = x.cuda()
    print(x, x.dtype)
    
# Do some calculations
y = x ** 2 
print(y)

# Copy to CPU if on GPU
if y.is_cuda:
    y = y.cpu()
    print(y, y.dtype)

tensor([[0.4805, 0.1896],
        [0.0012, 0.0364],
        [0.8589, 0.2808]])


x1 = torch.rand(3, 2)
x2 = x1.new(1,2)
print(x2)


In [6]:
x1 = torch.rand(3, 2) 
x2 = x1.new(1, 2) 
print(x1)
print(x2)

tensor([[0.1610, 0.5661],
        [0.2441, 0.7232],
        [0.4741, 0.2356]])
tensor([[0., 2.]])


In [9]:
from timeit import timeit
# Create random data
x = torch.rand(1000, 64)
y = torch.rand(64, 32)
num = 10000 #number of iterations.

def square():
    z = torch.mm(x, y)
print('CPU : {}ms'.format(timeit(square, number = num)*1000))

CPU : 467.3991639999713ms


In [20]:
# Create differentiable tensor
# x = torch.tensor(torch.arange(0,4), requires_grad=False)
x = torch.tensor(torch.ones(2,4), requires_grad=True)
# print(x.dtype)
# x.grad.data.zero_()
print(x)
torch.sum(x**2).backward()
y = x**2
print('1', y)
y.sum().backward()
print('2', y)
print(x.grad)

tensor([[1., 1., 1., 1.],
        [1., 1., 1., 1.]], requires_grad=True)
1 tensor([[1., 1., 1., 1.],
        [1., 1., 1., 1.]], grad_fn=<PowBackward0>)
2 tensor([[1., 1., 1., 1.],
        [1., 1., 1., 1.]], grad_fn=<PowBackward0>)
tensor([[4., 4., 4., 4.],
        [4., 4., 4., 4.]])


  This is separate from the ipykernel package so we can avoid doing imports until


In [26]:
x = torch.tensor(torch.arange(0,4), requires_grad=False)
y = x**2
z = y**2
print('0', x,'\n1', y, '\n2', z)
z.detach().numpy()
print(z)

0 tensor([0, 1, 2, 3]) 
1 tensor([0, 1, 4, 9]) 
2 tensor([ 0,  1, 16, 81])
tensor([ 0,  1, 16, 81])


  """Entry point for launching an IPython kernel.


In [27]:
x = torch.arange(0, 32)
net = torch.nn.Linear(32, 10)
y = net(x)
print(y)

RuntimeError: Expected object of scalar type Long but got scalar type Float for argument #3 'mat2' in call to _th_addmm_out

In [29]:
# create a simple seqyential network(nn.Module; object) from layers (other 'nn.Module'; object)
# Here a MLP with 2 layers and sigmoid activation.
net = torch.nn.Sequential(
    torch.nn.Linear(32, 128),
    torch.nn.Sigmoid(),
    torch.nn.Linear(128, 10))
print(net)

Sequential(
  (0): Linear(in_features=32, out_features=128, bias=True)
  (1): Sigmoid()
  (2): Linear(in_features=128, out_features=10, bias=True)
)


In [16]:
# Create a more customizable network module (equibalent here)
class MyNetwork(torch.nn.Module):
    def __init__(self, inputSize, hiddenSize, outputSize):
        super().__init__()
        self.layer1 = torch.nn.Linear(inputSize, hiddenSize)
        self.layer2 = torch.nn.Sigmoid()
        self.layer3 = torch.nn.Linear(hiddenSize, outputSize)
        
    def forward(self, inputVal):
        h = inputVal
        h = self.layer1(h)
#         print('layer1', h)
        h = self.layer2(h)
#         print('layer2', h)
        h = self.layer3(h)
#         print('layer3', h)
        return h

net = MyNetwork(32, 128, 10)
print(net)

MyNetwork(
  (layer1): Linear(in_features=32, out_features=128, bias=True)
  (layer2): Sigmoid()
  (layer3): Linear(in_features=128, out_features=10, bias=True)
)


In [7]:
for param in net.parameters():
    print(param)

Parameter containing:
tensor([[-0.0134, -0.2918, -0.1799,  ..., -1.0561,  0.3473,  1.2354],
        [ 0.5572, -0.8590, -0.6649,  ..., -0.5015,  0.2164,  0.9878],
        [ 0.4380,  0.8587, -1.5056,  ..., -0.5404, -1.0242,  0.7898],
        ...,
        [ 0.6764,  0.7673,  0.6491,  ..., -0.7212,  1.2370,  0.9538],
        [ 1.0513,  2.3330,  0.5287,  ...,  0.8888,  0.9517, -0.3849],
        [ 1.0079, -0.0903, -0.5072,  ..., -0.5454,  1.1342,  0.2860]],
       requires_grad=True)
Parameter containing:
tensor([-0.3493, -0.6155,  0.0049,  0.7552,  0.6222,  1.0016, -0.9386, -0.5945,
         1.0814, -0.5820, -1.0319, -0.5961, -0.2457, -0.9504, -0.0978, -1.0557,
         0.5539,  0.1254,  0.2426,  0.4270,  0.0608, -0.6844,  1.4274,  0.6769,
         0.2477,  0.3832, -0.8889,  1.0568,  2.2727,  0.8648,  2.4866, -0.4655,
         1.1351,  0.1416, -0.5216,  0.5953,  2.0680,  0.2783, -0.1957, -0.5393,
        -1.5547, -0.5468, -1.0406,  0.8948, -0.0853, -2.1782,  0.7632, -1.2043,
         0.2252

In [8]:
class MyNetworkWithParmas(nn.Module):
    def __init__(self, inputSize, hiddenSize, outputSize):
        super(MyNetworkWithParmas, self).__init__()
        self.layer1_weights = nn.Parameter(torch.randn(inputSize, hiddenSize))
        self.layer1_bias = nn.Parameter(torch.randn(hiddenSize))
        self.layer2_weights = nn.Parameter(torch.randn(hiddenSize, outputSize))
        self.layer2_bias = nn.Parameter(torch.randn(outputSize))
    def forward(self, x):
        h1 = torch.matmul(x, self.layer1_weights) + self.layer1_bias
        h1_act = torch.max(h1, torch.zeros(h1.size())) # ReLU
#         print('activatefunction', h1_act)
        output = torch.matmul(hl_act, self.layer2_weights) + self.layer2_bias
        return output
    
net = MyNetworkWithParmas(32, 128, 10)
print(net)

MyNetworkWithParmas()


In [9]:
net = MyNetwork(32, 128, 10)

In [18]:
x = torch.tensor([np.arange(32), np.zeros(32), np.ones(32)]).float()
y = torch.tensor([0,3,9])
criterion = nn.CrossEntropyLoss()

output = net(x)
loss = criterion(output, y)
print(loss)


tensor(2.4306, grad_fn=<NllLossBackward>)


In [19]:
# equivalent
criterion2 = nn.NLLLoss()
sf = nn.LogSoftmax()
output = net(x)
loss = criterion2(sf(output), y)
loss

  """


tensor(2.4306, grad_fn=<NllLossBackward>)

In [50]:
loss.backward()
# Check that the parameters now have gradients.
for param in net.parameters():
    print(param.grad)

tensor([[-0.0154, -0.0153, -0.0152,  ..., -0.0127, -0.0126, -0.0125],
        [ 0.0180,  0.0180,  0.0180,  ...,  0.0180,  0.0180,  0.0180],
        [-0.0135, -0.0206, -0.0277,  ..., -0.2191, -0.2262, -0.2333],
        ...,
        [ 0.0174,  0.0131,  0.0089,  ..., -0.1065, -0.1107, -0.1150],
        [-0.0052, -0.0052, -0.0052,  ..., -0.0051, -0.0051, -0.0051],
        [ 0.0144,  0.0144,  0.0144,  ...,  0.0142,  0.0142,  0.0142]])
tensor([-0.0336,  0.0271,  0.0025,  0.0085, -0.0094, -0.0125,  0.0183, -0.0241,
        -0.0104, -0.0140, -0.0109, -0.0102,  0.0216, -0.0137,  0.0044, -0.0216,
        -0.0278,  0.0141,  0.0200,  0.0034,  0.0185, -0.0081, -0.0167, -0.0099,
        -0.0241, -0.0052,  0.0278,  0.0119, -0.0266,  0.0139,  0.0022, -0.0226,
         0.0305, -0.0035, -0.0032, -0.0279,  0.0238, -0.0145,  0.0012,  0.0141,
        -0.0076, -0.0043,  0.0227, -0.0162, -0.0045,  0.0183,  0.0107, -0.0054,
         0.0108, -0.0098, -0.0283, -0.0170,  0.0244,  0.0088,  0.0158,  0.0196,
      

In [12]:
# if I forward prop and backward prop again, gradients accmulate :
output = net(x)
loss = criterion(output, y)
loss.backward()
for param in net.parameters():
    print(param.grad)
print("############################")
# you can remove this behavior by reinitializing the gradients in your network parameters :
net.zero_grad()
output = net(x)
loss = criterion(output, y)
loss.backward()
for param in net.parameters():
    print(param.grad)

layer1 tensor([[-5.1618e+00, -4.2152e+00, -8.4670e+00,  1.4110e+01,  4.8130e+00,
         -6.6926e+00, -1.2622e+01,  1.4639e+01, -1.2471e+01, -6.9453e+00,
          4.4591e+00,  8.3486e+00,  1.1296e+01, -3.0892e+00,  4.0617e+00,
         -6.6144e+00, -2.4907e+00,  7.0176e+00, -1.0662e+00,  4.5190e+00,
         -2.3595e+01,  3.5734e+00,  1.3358e+00,  4.4969e+00, -2.3691e+00,
         -3.8123e-01, -5.0096e+00,  4.7195e+00, -1.5410e+01, -3.9049e+00,
          4.9635e+00,  1.1775e+01, -7.1133e+00, -3.9520e+00, -8.7899e-01,
          1.7951e+01,  6.2363e+00,  7.5335e+00,  1.3576e+01,  4.9625e+00,
          6.7328e+00, -7.2595e+00,  1.0612e+01, -9.8447e+00,  1.9025e-01,
         -1.2046e+01, -7.7834e+00, -6.0830e+00,  1.2308e+01, -7.1781e+00,
         -1.8929e+01,  6.3393e+00, -1.1182e+01,  4.6790e-01, -4.3229e+00,
          7.8729e+00, -1.5258e+01,  9.7835e+00,  3.8424e+00,  2.2110e+00,
          6.3909e+00, -1.9023e+01,  1.4249e+01,  1.0482e+00,  1.2440e+01,
         -7.8790e+00,  1.6814e+

# start p.11 gradient descent

In [20]:
optimizer = torch.optim.SGD(net.parameters(), lr=0.01)

print("Parameters before gradient descent :")
for param in net.parameters():
    print(param)
optimizer.step()

print("\n\nParameters after gradient descetn :")
for param in net.parameters():
    print(param)

Parameters before gradient descent :
Parameter containing:
tensor([[-8.1703e-05,  8.2688e-02,  6.4926e-02,  ..., -7.4851e-02,
         -2.7852e-02, -5.8414e-02],
        [-1.7089e-01,  1.6789e-02, -1.0587e-02,  ..., -1.1577e-01,
          1.6688e-01, -1.0969e-01],
        [-6.7976e-03, -2.0125e-02, -1.0821e-01,  ...,  1.1288e-01,
          9.5346e-02, -8.3861e-02],
        ...,
        [-1.5774e-01, -8.1721e-02, -4.1515e-02,  ...,  1.2808e-01,
          9.6737e-02,  9.0107e-02],
        [-1.0061e-01,  3.3354e-02,  2.7259e-02,  ..., -6.6138e-02,
          6.8566e-02,  1.5990e-01],
        [ 1.6996e-03, -2.1842e-02, -4.8077e-02,  ..., -1.7191e-01,
         -1.7266e-01, -9.6207e-02]], requires_grad=True)
Parameter containing:
tensor([-0.0221, -0.0085, -0.0262, -0.1374, -0.0975,  0.1403, -0.0947,  0.1231,
         0.0443, -0.0099, -0.0896,  0.1179, -0.0602, -0.0484,  0.1250, -0.0122,
        -0.1419,  0.0642,  0.0364, -0.0696,  0.1424, -0.1057, -0.0374, -0.1748,
         0.1676,  0.0943,  

In [21]:
# In a training loop, we should perform many GD iterations.
nIter = 1000
for i in range(nIter):
    optimizer.zero_grad() # equivalent to net.zero_grad()
    output = net(x)
    loss = criterion(output, y)
    loss.backward()
    optimizer.step()
    print(loss)

tensor(13.5678, grad_fn=<NllLossBackward>)
tensor(13.0338, grad_fn=<NllLossBackward>)
tensor(12.5004, grad_fn=<NllLossBackward>)
tensor(11.9675, grad_fn=<NllLossBackward>)
tensor(11.4354, grad_fn=<NllLossBackward>)
tensor(10.9041, grad_fn=<NllLossBackward>)
tensor(10.3737, grad_fn=<NllLossBackward>)
tensor(9.8444, grad_fn=<NllLossBackward>)
tensor(9.3162, grad_fn=<NllLossBackward>)
tensor(8.7893, grad_fn=<NllLossBackward>)
tensor(8.2638, grad_fn=<NllLossBackward>)
tensor(7.7399, grad_fn=<NllLossBackward>)
tensor(7.2177, grad_fn=<NllLossBackward>)
tensor(6.6975, grad_fn=<NllLossBackward>)
tensor(6.1796, grad_fn=<NllLossBackward>)
tensor(5.6645, grad_fn=<NllLossBackward>)
tensor(5.1531, grad_fn=<NllLossBackward>)
tensor(4.6469, grad_fn=<NllLossBackward>)
tensor(4.1499, grad_fn=<NllLossBackward>)
tensor(3.6714, grad_fn=<NllLossBackward>)
tensor(3.2332, grad_fn=<NllLossBackward>)
tensor(2.8752, grad_fn=<NllLossBackward>)
tensor(2.6218, grad_fn=<NllLossBackward>)
tensor(2.4357, grad_fn=<Nll

tensor(0.0308, grad_fn=<NllLossBackward>)
tensor(0.0307, grad_fn=<NllLossBackward>)
tensor(0.0306, grad_fn=<NllLossBackward>)
tensor(0.0305, grad_fn=<NllLossBackward>)
tensor(0.0304, grad_fn=<NllLossBackward>)
tensor(0.0303, grad_fn=<NllLossBackward>)
tensor(0.0302, grad_fn=<NllLossBackward>)
tensor(0.0301, grad_fn=<NllLossBackward>)
tensor(0.0300, grad_fn=<NllLossBackward>)
tensor(0.0299, grad_fn=<NllLossBackward>)
tensor(0.0298, grad_fn=<NllLossBackward>)
tensor(0.0297, grad_fn=<NllLossBackward>)
tensor(0.0296, grad_fn=<NllLossBackward>)
tensor(0.0295, grad_fn=<NllLossBackward>)
tensor(0.0295, grad_fn=<NllLossBackward>)
tensor(0.0294, grad_fn=<NllLossBackward>)
tensor(0.0293, grad_fn=<NllLossBackward>)
tensor(0.0292, grad_fn=<NllLossBackward>)
tensor(0.0291, grad_fn=<NllLossBackward>)
tensor(0.0290, grad_fn=<NllLossBackward>)
tensor(0.0289, grad_fn=<NllLossBackward>)
tensor(0.0288, grad_fn=<NllLossBackward>)
tensor(0.0288, grad_fn=<NllLossBackward>)
tensor(0.0287, grad_fn=<NllLossBac

tensor(0.0165, grad_fn=<NllLossBackward>)
tensor(0.0165, grad_fn=<NllLossBackward>)
tensor(0.0165, grad_fn=<NllLossBackward>)
tensor(0.0164, grad_fn=<NllLossBackward>)
tensor(0.0164, grad_fn=<NllLossBackward>)
tensor(0.0164, grad_fn=<NllLossBackward>)
tensor(0.0164, grad_fn=<NllLossBackward>)
tensor(0.0163, grad_fn=<NllLossBackward>)
tensor(0.0163, grad_fn=<NllLossBackward>)
tensor(0.0163, grad_fn=<NllLossBackward>)
tensor(0.0163, grad_fn=<NllLossBackward>)
tensor(0.0162, grad_fn=<NllLossBackward>)
tensor(0.0162, grad_fn=<NllLossBackward>)
tensor(0.0162, grad_fn=<NllLossBackward>)
tensor(0.0161, grad_fn=<NllLossBackward>)
tensor(0.0161, grad_fn=<NllLossBackward>)
tensor(0.0161, grad_fn=<NllLossBackward>)
tensor(0.0161, grad_fn=<NllLossBackward>)
tensor(0.0160, grad_fn=<NllLossBackward>)
tensor(0.0160, grad_fn=<NllLossBackward>)
tensor(0.0160, grad_fn=<NllLossBackward>)
tensor(0.0160, grad_fn=<NllLossBackward>)
tensor(0.0159, grad_fn=<NllLossBackward>)
tensor(0.0159, grad_fn=<NllLossBac

tensor(0.0123, grad_fn=<NllLossBackward>)
tensor(0.0122, grad_fn=<NllLossBackward>)
tensor(0.0122, grad_fn=<NllLossBackward>)
tensor(0.0122, grad_fn=<NllLossBackward>)
tensor(0.0122, grad_fn=<NllLossBackward>)
tensor(0.0122, grad_fn=<NllLossBackward>)
tensor(0.0122, grad_fn=<NllLossBackward>)
tensor(0.0122, grad_fn=<NllLossBackward>)
tensor(0.0121, grad_fn=<NllLossBackward>)
tensor(0.0121, grad_fn=<NllLossBackward>)
tensor(0.0121, grad_fn=<NllLossBackward>)
tensor(0.0121, grad_fn=<NllLossBackward>)
tensor(0.0121, grad_fn=<NllLossBackward>)
tensor(0.0121, grad_fn=<NllLossBackward>)
tensor(0.0121, grad_fn=<NllLossBackward>)
tensor(0.0120, grad_fn=<NllLossBackward>)
tensor(0.0120, grad_fn=<NllLossBackward>)
tensor(0.0120, grad_fn=<NllLossBackward>)
tensor(0.0120, grad_fn=<NllLossBackward>)
tensor(0.0120, grad_fn=<NllLossBackward>)
tensor(0.0120, grad_fn=<NllLossBackward>)
tensor(0.0120, grad_fn=<NllLossBackward>)
tensor(0.0119, grad_fn=<NllLossBackward>)
tensor(0.0119, grad_fn=<NllLossBac

# Saving and Loading

In [24]:
# get dictionary of keys to weights using 'state_dict'
net = torch.nn.Sequential(
    torch.nn.Linear(28*28, 256),
    torch.nn.Sigmoid(),
    torch.nn.Linear(256, 10)
)
print(net.state_dict().keys())


odict_keys(['0.weight', '0.bias', '2.weight', '2.bias'])


In [25]:
# save a dictionary
torch.save(net.state_dict(), 'test.t10')
# load a dictionary
net.load_state_dict(torch.load('test.t10'))

<All keys matched successfully>

# Type mismatch

In [26]:
net = nn.Linear(4, 2)
x = torch.tensor([1,2,3,4])
y = net(x)
print(y)

RuntimeError: Expected object of scalar type Long but got scalar type Float for argument #3 'mat2' in call to _th_addmm_out

In [28]:
x = x.float()
x = torch.tensor([1.,2.,3.,4.])
y = net(x)
print(y)
x = 2 * torch.ones(2, 2)
y = 3 * torch.ones(2, 2)
print(x)
print(y)
print(x * y)
print(x.matmul(y))
# print(x.dot(y))

tensor([ 0.5646, -0.3501], grad_fn=<AddBackward0>)
tensor([[2., 2.],
        [2., 2.]])
tensor([[3., 3.],
        [3., 3.]])
tensor([[6., 6.],
        [6., 6.]])
tensor([[12., 12.],
        [12., 12.]])


In [29]:
x = torch.ones(4, 5)
y = torch.arange(5)
print(x)
print(y)
print(x+y)
y = torch.arange(4).view(-1,1)
print(y)
print(x+y)
y = torch.arange(4)
print(y)
print(x+y)

tensor([[1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.]])
tensor([0, 1, 2, 3, 4])
tensor([[1., 2., 3., 4., 5.],
        [1., 2., 3., 4., 5.],
        [1., 2., 3., 4., 5.],
        [1., 2., 3., 4., 5.]])
tensor([[0],
        [1],
        [2],
        [3]])
tensor([[1., 1., 1., 1., 1.],
        [2., 2., 2., 2., 2.],
        [3., 3., 3., 3., 3.],
        [4., 4., 4., 4., 4.]])
tensor([0, 1, 2, 3])


RuntimeError: The size of tensor a (5) must match the size of tensor b (4) at non-singleton dimension 1

In [33]:
x = torch.tensor([[1,2,3], [4,5,6]])
print(x)
print(x.t())
print(x.view(3,2))
print(x.view(2,3))

tensor([[1, 2, 3],
        [4, 5, 6]])
tensor([[1, 4],
        [2, 5],
        [3, 6]])
tensor([[1, 2],
        [3, 4],
        [5, 6]])
tensor([[1, 2, 3],
        [4, 5, 6]])


In [None]:
class MyNet(nn.Module):
    def __init__(self, nHiddenLayers):
        super(MyNet, self).__init__()
        self.nHiddenLayers=nHiddenLayers
        self.finalLayer = nn.Linear(128, 10)
        self.act = nn.ReLU()
        self.hidden = []
        for i in range(nHiddenLayers):
            self.hidden.append(nn.Linear(128, 128))
    
    def forward(self, x):
        h = x
        for i in range(self.nHiddenLayers):
            h = self.hidden[i](h)
            h = self.act(h)
        out = self.finalLayer(h)
        return out