# PyTorch


In [25]:
import torch
torch.__version__

'1.11.0'

## PyTorch basics   
**Tensor** is a fundamental structure in PyTorch which is very similar to an array or matrix. Tensors are used to encode the inputs and outputs of a model, as well as the model’s parameters. 
##### Create Tensors

In [26]:
# Two uniformly random 2x3 tensors plus an all-zeros tensor of the same shape.
shape = (2, 3)
x, y = torch.rand(shape), torch.rand(shape)
z = torch.zeros(*shape)


#### Addition

In [37]:
# Show both operands, then their element-wise sum.
for label, operand in (("x = :", x), ("y = :", y)):
    print(label)
    print(operand)

print("\n")

z = torch.add(x, y)
print("x + y = :")
print(z)

x = :
tensor([[0.1506, 0.9966, 0.0523],
        [0.2807, 0.2023, 0.9067]])
y = :
tensor([[0.5453, 0.2876, 0.7285],
        [0.3665, 0.1064, 0.8783]])


x + y = :
tensor([[0.6960, 1.2842, 0.7808],
        [0.6472, 0.3087, 1.7850]])


#### Reshape

In [38]:
# View the same six values as 3 rows x 2 columns instead of 2 x 3.
print(z, z.shape, sep="\n")

z = z.reshape(3, 2)
print(z, z.shape, sep="\n")

tensor([[0.6960, 1.2842, 0.7808],
        [0.6472, 0.3087, 1.7850]])
torch.Size([2, 3])
tensor([[0.6960, 1.2842],
        [0.7808, 0.6472],
        [0.3087, 1.7850]])
torch.Size([3, 2])


#### Flatten

In [41]:
# Collapse a random 2x3 tensor into a 1-D tensor of six elements.
z = torch.rand(2, 3)
print(z, z.shape, "\n", sep="\n")

z = z.flatten()
print("z and its shape after the flatten operation:")
print(z, z.shape, "\n", sep="\n")

tensor([[0.1675, 0.5401, 0.4135],
        [0.8770, 0.6943, 0.0997]])
torch.Size([2, 3])


z and its shape after the flatten operation:
tensor([0.1675, 0.5401, 0.4135, 0.8770, 0.6943, 0.0997])
torch.Size([6])




#### Transpose

In [49]:
# Swap dimensions 0 and 1 of a random 2x3 tensor, giving a 3x2 tensor.
z = torch.rand(2, 3)
print(z, z.shape, "\n", sep="\n")

z = z.transpose(0, 1)
print("z and its shape after the transpose operation:")
print(z, z.shape, "\n", sep="\n")


tensor([[0.4091, 0.8527, 0.5423],
        [0.8766, 0.4200, 0.8596]])
torch.Size([2, 3])


z and its shape after the transpose operation:
tensor([[0.4091, 0.8766],
        [0.8527, 0.4200],
        [0.5423, 0.8596]])
torch.Size([3, 2])




#### Permutation

In [52]:
# Permute the dimensions of z according to `order` (exchanging axes):
# output dimension i is input dimension order[i], so a tensor of shape
# (3, 5, 1) permuted with order [2, 0, 1] ends up with shape (1, 3, 5).

# Create a new tensor
shape = (3, 5, 1)
z = torch.rand(shape)
order = [2, 0, 1]
print(z)
print(z.shape)

# Use `order` rather than repeating the literal, so that changing it above
# actually changes the permutation. torch.permute returns a view of the
# original tensor with its dimensions permuted.
z = torch.permute(z, tuple(order))

print("z and its shape after permutation")
print(z)
print(z.shape)

tensor([[[0.4926],
         [0.5932],
         [0.9819],
         [0.1368],
         [0.4672]],

        [[0.8629],
         [0.8223],
         [0.9654],
         [0.2195],
         [0.9839]],

        [[0.8768],
         [0.5181],
         [0.9418],
         [0.0854],
         [0.0494]]])
torch.Size([3, 5, 1])
z and its shape after permutation
tensor([[[0.4926, 0.5932, 0.9819, 0.1368, 0.4672],
         [0.8629, 0.8223, 0.9654, 0.2195, 0.9839],
         [0.8768, 0.5181, 0.9418, 0.0854, 0.0494]]])
torch.Size([1, 3, 5])


#### Dot Product

In [55]:
# Create two vectors
# Inner product of two integer vectors: 2*2 + 3*1 = 7.
v = torch.tensor([2, 3])
u = torch.tensor([2, 1])

result = torch.dot(v, u)

print("The dot product of u and v:")
print(result.item())

The dot product of u and v:
7


#### Concatenation

In [59]:
# Join two random 2x3 tensors side by side (along dim 1) into one 2x6 tensor.
shape = (2, 3)
x, y = torch.rand(shape), torch.rand(shape)
print(x, y, sep="\n")

z = torch.cat([x, y], dim=1)

print("The Concatenated tensor z of (x, y)")
print(z)

tensor([[0.6335, 0.7177, 0.3232],
        [0.4099, 0.0371, 0.5935]])
tensor([[0.4223, 0.0444, 0.9044],
        [0.4609, 0.6270, 0.7179]])
The Concatenated tensor z of (x, y)
tensor([[0.6335, 0.7177, 0.3232, 0.4223, 0.0444, 0.9044],
        [0.4099, 0.0371, 0.5935, 0.4609, 0.6270, 0.7179]])


## Typical Structure of a PyTorch Program

```python
# Create neural network according to model specification
net = MyModel().to(device)  # CPU or GPU

# Prepare to load the training and test data
train_loader = torch.utils.data.DataLoader(...)
test_loader = torch.utils.data.DataLoader(...)

# Choose an optimizer: SGD, Adam, or others.
# parameters() must be *called*: the optimizer needs the iterable of tensors
# it returns, not the bound method object itself.
optimizer = torch.optim.SGD(net.parameters(), ...)

# Training loop: epochs are numbered 1..epochs inclusive, so exactly
# `epochs` passes are made over the training data.
for epoch in range(1, epochs + 1):
    train(params, net, device, train_loader, optimizer)
    # Periodically evaluate the network on the test data
    if epoch % 10 == 0:
        test(params, net, device, test_loader)
```


## Defining a model

```python
class MyModel(torch.nn.Module):
    """Skeleton of a PyTorch model: declare layers in __init__, use them in forward."""

    def __init__(self):
        super(MyModel, self).__init__()
        # define the structure of the network here

    def forward(self, input):
        # apply network and return output
        # Placeholder body so the skeleton is valid Python until it is filled in.
        raise NotImplementedError("forward() must be implemented by the model")
```


## Defining a Custom Model    

Example: define a function $f(x, y) \rightarrow Ax\log(y) + By^2$

```python
class MyModel(torch.nn.Module):
    """Custom model f(x, y) = A * x * log(y) + B * y**2 with learnable scalars A and B.

    `forward` reads x from column 0 and y from column 1 of its input and
    returns one value per row.
    """

    def __init__(self):
        super(MyModel, self).__init__()
        # Spelled as torch.nn (not a bare `nn`) so the snippet runs with only
        # `import torch` in scope, matching the other model definitions.
        self.A = torch.nn.Parameter(torch.randn((1), requires_grad=True))
        self.B = torch.nn.Parameter(torch.randn((1), requires_grad=True))

    def forward(self, input):
        x = input[:, 0]
        y = input[:, 1]
        # torch.log(y) is only finite for y > 0.
        output = self.A * x * torch.log(y) + self.B * y * y
        return output
```


## Building a Net from Individual Components

```python
class MyModel(torch.nn.Module):
    """Network built from individual layers: 2 inputs -> 2 tanh hidden units -> 1 sigmoid output."""

    def __init__(self):
        super().__init__()
        self.in_to_hid = torch.nn.Linear(in_features=2, out_features=2)
        self.hid_to_out = torch.nn.Linear(in_features=2, out_features=1)

    def forward(self, input):
        # Hidden activations, then the squashed output, as one chain per layer.
        hidden = self.in_to_hid(input).tanh()
        return self.hid_to_out(hidden).sigmoid()
```


## Defining a Sequential Network

```python
class MyModel(torch.nn.Module):
    """Feed-forward network expressed as one torch.nn.Sequential pipeline.

    Args:
        num_input: number of input features.
        num_hid: number of tanh hidden units.
        num_out: number of sigmoid output units.
    """

    def __init__(self, num_input, num_hid, num_out):
        super(MyModel, self).__init__()
        # Every layer is spelled torch.nn.* (not a bare `nn`) so the class
        # works with only `import torch` in scope, like its base class does.
        self.main = torch.nn.Sequential(
            torch.nn.Linear(num_input, num_hid),
            torch.nn.Tanh(),
            torch.nn.Linear(num_hid, num_out),
            torch.nn.Sigmoid(),
        )

    def forward(self, input):
        output = self.main(input)
        return output
```

#### Sequential Components

Network Layers:
* `nn.Linear()`
* `nn.Conv2d()`

Intermediate Operators:
* `nn.Dropout()`
* `nn.BatchNorm1d()` / `nn.BatchNorm2d()`

Activation Functions:
* `nn.Sigmoid()`
* `nn.Tanh()`
* `nn.ReLU()`


## Declaring Data Explicitly

```python
import torch.utils.data

# XOR truth table: each row of `input` is one (a, b) pair and the matching
# row of `target` holds a XOR b as a single-element row.
input = torch.Tensor([
    [0, 0],
    [0, 1],
    [1, 0],
    [1, 1],
])
target = torch.Tensor([
    [0],
    [1],
    [1],
    [0],
])

# Wrap the tensors as a dataset and serve all four examples in one batch.
xdata = torch.utils.data.TensorDataset(input, target)
train_loader = torch.utils.data.DataLoader(dataset=xdata, batch_size=4)
```


## Loading Data from a .csv File

```python
import pandas as pd

# Bind the loaded table to `df`. Assigning it to `pd` would shadow the pandas
# module and leave `df` undefined for the lines below.
# NOTE(review): if the CSV has no header row, pass header=None here - confirm
# against the actual file.
df = pd.read_csv('sonar.all-data.csv')
# pre-process raw data: encode the class labels 'R' -> 0 and 'M' -> 1
df = df.replace({'R': 0, 'M': 1})
# Convert to torch tensor
data = torch.tensor(df.values, dtype=torch.float32)
# Split feature and target: the last column is the target, the rest are features
num_input = data.shape[1] - 1
features = data[:, 0:num_input]
# Slice (rather than index) the last column so target keeps shape (N, 1)
target = data[:, num_input:num_input+1]
# Create dataset
dataset = torch.utils.data.TensorDataset(features, target)
```


## Loading Custom Datasets

```python
# NOTE(review): `data` is presumably a project-local module providing
# ImageFolder - confirm where it comes from.
from data import ImageFolder
# load images from a specified directory
# (`folder` and `transform` are not defined in this snippet; supply them first)
dataset = ImageFolder(folder, transform)

from torchvision import datasets
# download popular image datasets remotely
# (each `...` is a placeholder for that dataset's constructor arguments)
mnist = datasets.MNIST(...)
cifar10 = datasets.CIFAR10(...)
celebA = datasets.CelebA(...)
... 
```


## Choosing an Optimizer

```python
# Two alternative optimizer set-ups for the same network `net`. Both assign to
# `optimizer`, so if this snippet is run as-is the Adam instance replaces the
# SGD one - keep only the one you want.

# Stochastic Gradient Descent
optimizer = torch.optim.SGD(
    net.parameters(),
    lr=0.01,
    momentum=0.9,
    weight_decay=0.0001
)
# Adam = Adaptive Moment Estimation
optimizer = torch.optim.Adam(
    net.parameters(), 
    eps=0.000001,
    lr=0.01,
    betas=(0.5, 0.999),
    weight_decay=0.0001 
)
```


## Training 

```python
import torch.nn.functional as F
def train(args, net, device, train_loader, optimizer):
    """Template for one pass over `train_loader`, taking one optimizer step per batch.

    `args` and `device` are accepted but not used in this template body.
    The loss is left as a placeholder (`...`) to be replaced with a real loss.
    """
    for batch_idx, (data, target) in enumerate(train_loader):
        optimizer.zero_grad()  # zero gradients
        output = net(data)     # apply network
        loss = ...             # calculate loss (placeholder - fill in a loss function)
        loss.backward()        # backpropagate: compute gradients
        optimizer.step()       # update weights
```

#### Common Loss Functions

* `loss = torch.sum((output-target)*(output-target))`
* `loss = F.nll_loss(output,target)`             
* `loss = F.binary_cross_entropy(output,target) `
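
Output transformations (these are not loss functions; they are applied to the network output before a loss is computed):

* `output = F.softmax(output,dim=1)`
* `output = F.log_softmax(output,dim=1)` (pair with `F.nll_loss`, which expects log-probabilities)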


### Computational Graphs

PyTorch automatically builds a computational graph, enabling it to backpropagate derivatives.

Every parameter has `.data` and `.grad` components, e.g.:

```python
A.data
A.grad
```

* `optimizer.zero_grad()` sets all `.grad` components to zero.
* `loss.backward()` updates the `.grad` component of all parameters by backpropagating gradients through the computational graph.
* `optimizer.step()` updates the `.data` components.

### Controlling the Computational Graph

If we need to stop the gradients from being backpropagated through a certain variable (or expression) A, we can exclude it from the computational graph by using `A.detach()`

By default, `loss.backward()` discards the computational graph after computing the gradients. If needed, we can force it to keep the computational graph by `loss.backward(retain_graph=True)`


## Testing

```python
def test(args, net, device, test_loader):
    """Template for evaluating `net` on `test_loader` and printing the accumulated loss.

    `args` and `device` are accepted but not used in this template body.
    The per-batch loss is left as a placeholder (`...`).
    """
    with torch.no_grad():       # don't calculate gradients
        net.eval()              # switch dropout, batch norm to evaluation mode
        test_loss = 0
        for data, target in test_loader:
            output = net(data)
            test_loss += ...    # placeholder - add this batch's loss
        print(test_loss)
        net.train()             # switch dropout, batch norm back to training mode
```


## Run PyTorch  

Toy example 1: train $f(x)=Ax$ s.t. $f(1) = 1$


In [1]:
import torch.utils.data
import numpy as np


class MyModel(torch.nn.Module):
    """One-parameter linear model f(X) = A * X, with A initialised to zero."""

    def __init__(self):
        super().__init__()
        initial_weight = torch.zeros(1, requires_grad=True)
        self.A = torch.nn.Parameter(initial_weight)

    def forward(self, X):
        return self.A * X


# A single training example (x = 1, y = 1), i.e. the constraint f(1) = 1.
X = torch.Tensor([[1]])
Y = torch.Tensor([[1]])

# One example per batch, so each epoch performs exactly one optimizer step.
dataset = torch.utils.data.TensorDataset(X, Y)
train_loader = torch.utils.data.DataLoader(dataset, batch_size=1)


In [9]:
def train(m, optimizer, epochs, verbose=False, loader=None):
    """Train `m` on half the mean squared error until convergence or divergence.

    Args:
        m: model to train. When `verbose` is set it must expose its weight
            as `m.A`, which is what the trace prints.
        optimizer: optimizer already bound to the parameters of `m`.
        epochs: maximum number of passes over the data.
        verbose: if true, print A.grad, A.data and the loss around every step.
        loader: iterable of (x, y) batches. Defaults to the module-level
            `train_loader`; pass it explicitly to avoid depending on whatever
            that global is currently bound to.

    Returns:
        The 1-based epoch in which training stopped because the loss fell
        below 1e-9 or became NaN, or `epochs` if neither happened.
    """
    if loader is None:
        loader = train_loader
    epoch = 0  # returned unchanged when epochs <= 0
    for epoch in range(1, epochs + 1):
        for x, y in loader:
            optimizer.zero_grad()
            pred = m(x)
            loss = 0.5 * torch.mean((pred - y) * (pred - y))
            if verbose:
                # A.grad is None until the first backward() call.
                if m.A.grad is None:
                    print('Ep%3d: zero_grad(): A.grad=  None  A.data=%7.4f loss=%7.4f'
                          % (epoch, m.A.data, loss))
                else:
                    print('Ep%3d: zero_grad(): A.grad=%7.4f A.data=%7.4f loss=%7.4f'
                          % (epoch, m.A.grad, m.A.data, loss))
            loss.backward()       # compute gradients
            optimizer.step()      # update weights
            if verbose:
                print('            step(): A.grad=%7.4f A.data=%7.4f'
                      % (m.A.grad, m.A.data))
            # `loss` was computed before this step, so convergence is
            # detected one iteration after the weights reach the optimum.
            if loss < 0.000000001 or torch.isnan(loss):
                return epoch
    return epoch


# Experiment: sweep the learning rate for plain SGD (momentum 0) and report
# how many epochs train() ran and the final value of A for each setting.
mom = 0
lr_lst = [0.01, 0.1, 0.5, 1.0, 1.5, 1.9, 2.0, 2.1]
# lr_lst = [1.0]
epochs = 1000

for lr in lr_lst:
    # Fresh model per setting, so every run starts again from A = 0.
    m = MyModel().to('cpu')
    optimizer = torch.optim.SGD(m.parameters(), lr=lr, momentum=mom)
    epoch = train(m, optimizer, epochs)
    print(f'{lr=}  \t{epoch=}  \t{m.A.data=}')


lr=0.01  	epoch=998  	m.A.data=tensor([1.0000])
lr=0.1  	epoch=97  	m.A.data=tensor([1.0000])
lr=0.5  	epoch=16  	m.A.data=tensor([1.0000])
lr=1.0  	epoch=2  	m.A.data=tensor([1.])
lr=1.5  	epoch=16  	m.A.data=tensor([1.0000])
lr=1.9  	epoch=97  	m.A.data=tensor([1.0000])
lr=2.0  	epoch=1000  	m.A.data=tensor([0.])
lr=2.1  	epoch=927  	m.A.data=tensor([nan])


In [13]:
# Experiment: hold the learning rate at 1.9 and sweep the SGD momentum,
# reporting the number of epochs train() ran and the final A for each value.
lr = 1.9
mom_lst = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
epochs = 1000

for mom in mom_lst:
    # Fresh model per setting, so every run starts again from A = 0.
    m = MyModel().to('cpu')
    optimizer = torch.optim.SGD(m.parameters(), lr=lr, momentum=mom)
    epoch = train(m, optimizer, epochs)
    print(f'{mom=}  \t{epoch=}  \t{m.A.data=}')

mom=0.1  	epoch=25  	m.A.data=tensor([1.0000])
mom=0.2  	epoch=14  	m.A.data=tensor([1.0000])
mom=0.3  	epoch=13  	m.A.data=tensor([1.0005])
mom=0.4  	epoch=22  	m.A.data=tensor([0.9999])
mom=0.5  	epoch=30  	m.A.data=tensor([1.0000])
mom=0.6  	epoch=37  	m.A.data=tensor([1.0001])
mom=0.7  	epoch=48  	m.A.data=tensor([0.9997])
mom=0.8  	epoch=71  	m.A.data=tensor([1.0005])
mom=0.9  	epoch=192  	m.A.data=tensor([1.0000])


In [14]:
# Verbose run with momentum = 1.0. `lr` and `epochs` are not set here; they
# keep whatever values earlier cells last assigned (lr = 1.9, epochs = 1000
# in the cell directly above).
m = MyModel().to('cpu')
optimizer = torch.optim.SGD(m.parameters(), lr=lr, momentum=1.0)
epoch = train(m, optimizer, epochs, True)

Ep  1: zero_grad(): A.grad=  None  A.data= 0.0000 loss= 0.5000
            step(): A.grad=-1.0000 A.data= 1.9000
Ep  2: zero_grad(): A.grad= 0.0000 A.data= 1.9000 loss= 0.4050
            step(): A.grad= 0.9000 A.data= 2.0900
Ep  3: zero_grad(): A.grad= 0.0000 A.data= 2.0900 loss= 0.5940
            step(): A.grad= 1.0900 A.data= 0.2090
Ep  4: zero_grad(): A.grad= 0.0000 A.data= 0.2090 loss= 0.3128
            step(): A.grad=-0.7910 A.data=-0.1691
Ep  5: zero_grad(): A.grad= 0.0000 A.data=-0.1691 loss= 0.6834
            step(): A.grad=-1.1691 A.data= 1.6741
Ep  6: zero_grad(): A.grad= 0.0000 A.data= 1.6741 loss= 0.2272
            step(): A.grad= 0.6741 A.data= 2.2365
Ep  7: zero_grad(): A.grad= 0.0000 A.data= 2.2365 loss= 0.7645
            step(): A.grad= 1.2365 A.data= 0.4496
Ep  8: zero_grad(): A.grad= 0.0000 A.data= 0.4496 loss= 0.1515
            step(): A.grad=-0.5504 A.data=-0.2916
Ep  9: zero_grad(): A.grad= 0.0000 A.data=-0.2916 loss= 0.8341
            step(): A.grad=-1.291

In [15]:
# Verbose run with momentum = 1.1. `lr` and `epochs` are not set here; they
# keep whatever values earlier cells last assigned.
m = MyModel().to('cpu')
optimizer = torch.optim.SGD(m.parameters(), lr=lr, momentum=1.1)
epoch = train(m, optimizer, epochs, True)

Ep  1: zero_grad(): A.grad=  None  A.data= 0.0000 loss= 0.5000
            step(): A.grad=-1.0000 A.data= 1.9000
Ep  2: zero_grad(): A.grad= 0.0000 A.data= 1.9000 loss= 0.4050
            step(): A.grad= 0.9000 A.data= 2.2800
Ep  3: zero_grad(): A.grad= 0.0000 A.data= 2.2800 loss= 0.8192
            step(): A.grad= 1.2800 A.data= 0.2660
Ep  4: zero_grad(): A.grad= 0.0000 A.data= 0.2660 loss= 0.2694
            step(): A.grad=-0.7340 A.data=-0.5548
Ep  5: zero_grad(): A.grad= 0.0000 A.data=-0.5548 loss= 1.2087
            step(): A.grad=-1.5548 A.data= 1.4964
Ep  6: zero_grad(): A.grad= 0.0000 A.data= 1.4964 loss= 0.1232
            step(): A.grad= 0.4964 A.data= 2.8096
Ep  7: zero_grad(): A.grad= 0.0000 A.data= 2.8096 loss= 1.6373
            step(): A.grad= 1.8096 A.data= 0.8158
Ep  8: zero_grad(): A.grad= 0.0000 A.data= 0.8158 loss= 0.0170
            step(): A.grad=-0.1842 A.data=-1.0274
Ep  9: zero_grad(): A.grad= 0.0000 A.data=-1.0274 loss= 2.0551
            step(): A.grad=-2.027

Toy example 2: train a two-layer NN to fit XOR.

In [22]:
import random
import torch.nn.functional as F


class XORModel(torch.nn.Module):
    """2-2-1 network: a tanh hidden layer followed by a single sigmoid output unit."""

    def __init__(self) -> None:
        super().__init__()
        self.in_hid = torch.nn.Linear(in_features=2, out_features=2)
        self.hid_out = torch.nn.Linear(in_features=2, out_features=1)

    def forward(self, X):
        hidden = self.in_hid(X).tanh()
        return self.hid_out(hidden).sigmoid()


# XOR truth table: four (a, b) input pairs and their a XOR b targets.
X = torch.Tensor([[0, 0], [0, 1], [1, 0], [1, 1]])
Y = torch.Tensor([[0], [1], [1], [0]])

# NOTE: this rebinds the module-level `train_loader` used by the toy-example-1
# cells above; re-running those cells afterwards will see the XOR data.
xor_dataset = torch.utils.data.TensorDataset(X, Y)
train_loader = torch.utils.data.DataLoader(xor_dataset, batch_size=4)

# A single shared model instance; train() below re-initialises it per run.
xor_m = XORModel().to('cpu')


def train(lr, mon, init, epochs=10000, verbose=False):
    """Re-initialise the shared `xor_m` and train it on XOR with SGD.

    Args:
        lr: SGD learning rate.
        mon: SGD momentum.
        init: standard deviation of the zero-mean normal distribution used
            to initialise both weight matrices.
        epochs: maximum number of passes over `train_loader`.
        verbose: if true, print the loss every 100 epochs.

    Prints the epoch and loss as soon as the loss drops below 0.01, otherwise
    prints the final loss once the epoch budget is used up. Returns None.
    """
    # init weights
    torch.manual_seed(random.randint(0, 100000))
    with torch.no_grad():
        for layer in (xor_m.in_hid, xor_m.hid_out):
            # The model is shared across calls. reset_parameters() redraws the
            # bias too; without it each run would start from the biases the
            # previous run ended with, so runs would not be independent.
            layer.reset_parameters()
            layer.weight.normal_(0, init)
    optimizer = torch.optim.SGD(xor_m.parameters(), lr=lr, momentum=mon)
    loss = None  # stays None if no batch is ever processed
    # Epochs are numbered 1..epochs inclusive, so exactly `epochs` passes run.
    for epoch in range(1, epochs + 1):
        for x, y in train_loader:
            optimizer.zero_grad()
            pred = xor_m(x)
            loss = F.binary_cross_entropy(pred, y)
            loss.backward()
            optimizer.step()
            if verbose and epoch % 100 == 0:
                print('ep%3d: loss = %7.4f' % (epoch, loss.item()))
            if loss < 0.01:
                print(f"{epoch=}, Global Minimum, loss = {loss.item()}")
                return
    if loss is not None:
        # Reported whenever the 0.01 threshold was not reached within the
        # budget; the run may be stuck or may simply be converging slowly.
        print(f"Local Minimum, loss = {loss.item()}")

# Ten independent runs with no momentum and weights drawn with std 1.0.
lr = 0.1
mon = .0
init = 1.
for _ in range(10):
    train(lr, mon, init)

Local Minimum, loss = 0.34804069995880127
Local Minimum, loss = 0.3483721911907196
Local Minimum, loss = 0.4790840446949005
Local Minimum, loss = 0.01830323413014412
epoch=6981, Global Minimum, loss = 0.009997597895562649
epoch=3758, Global Minimum, loss = 0.009998379275202751
epoch=3459, Global Minimum, loss = 0.009998274967074394
epoch=3216, Global Minimum, loss = 0.009996936656534672
epoch=3538, Global Minimum, loss = 0.009998416528105736
Local Minimum, loss = 0.4790799021720886


In [24]:
# Ten runs with momentum 0.9 and small initial weights (std 0.01).
# `lr` is not set here; it keeps the value assigned in the earlier cell (0.1).
mon = 0.9
init = 0.01
for _ in range(10):
    train(lr, mon, init)

epoch=650, Global Minimum, loss = 0.00997321866452694
epoch=472, Global Minimum, loss = 0.009986438788473606
epoch=419, Global Minimum, loss = 0.009983159601688385
epoch=429, Global Minimum, loss = 0.00999308843165636
epoch=2459, Global Minimum, loss = 0.009996993467211723
epoch=901, Global Minimum, loss = 0.00997856818139553
epoch=498, Global Minimum, loss = 0.009987319819629192
epoch=424, Global Minimum, loss = 0.00999538879841566
epoch=422, Global Minimum, loss = 0.00997464545071125
epoch=431, Global Minimum, loss = 0.00997532531619072
