# <center>**Chapter 10 : Building Neural Nwtworks with Pytorch**</center>

## **Pytorch Fundamentals**



In [1]:
import torch


In [2]:
X = torch.tensor([[1.0 , 4.0 , 7.0] , [2.0 , 3.0 , 6.0]])
X

tensor([[1., 4., 7.],
        [2., 3., 6.]])

In [3]:
X.shape , X.dtype

(torch.Size([2, 3]), torch.float32)

In [4]:
# indexing works just like numpy arrays
X[0,1] , X[:,1]

(tensor(4.), tensor([4., 3.]))

In [5]:
import numpy as np
X.numpy()

array([[1., 4., 7.],
       [2., 3., 6.]], dtype=float32)

In [6]:
torch.tensor(np.array(np.array([[1 , 4 , 7] , [ 2 , 3 , 6]])))

tensor([[1, 4, 7],
        [2, 3, 6]])

In [7]:
#pytorch's API also provides many inplace operations

X.relu_()

tensor([[1., 4., 7.],
        [2., 3., 6.]])

## **Hardware Acceleration**


In [8]:
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

In [9]:
M = torch.tensor([[1. , 2. , 3.] , [4.,5.,6.]])
M = M.to(device)

In [10]:
M.device

device(type='cuda', index=0)

In [11]:
R = M @ M.T # this runs on the GPU
R

tensor([[14., 32.],
        [32., 77.]], device='cuda:0')

In [12]:
M = torch.rand(1000 , 1000) # on the CPU
%timeit M @ M.T

6.27 ms ± 855 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [13]:
M = torch.rand((1000 , 1000) , device = "cuda")
%timeit M @ M.T

471 μs ± 83.8 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


## **Autograd**


In [14]:
x = torch.tensor(5.0 , requires_grad=True)
f = x ** 2
f
print(f)

tensor(25., grad_fn=<PowBackward0>)


In [15]:
f.backward()
x.grad

tensor(10.)

In [16]:
learning_rate = 0.1
with torch.no_grad():
    x -= learning_rate * x.grad  # this is the gradient decent step

In [17]:
# another way to avoid gradient computation

# detach method createds a new tensor detached from the computation graph with requires_grad = False, but still pointing to same data in memory
# this can be effective when you need to run some computations on a tensor without affecting the gradients ( eg : evaluation) , or when you need fine grained control over which operations should contribute to gradient computation.

x_detached = x.detach()
x_detached -= learning_rate * x.grad

### **Warning : If you forget to zero out the gradients at each training iteration, the backward() method will just accumulate them, causing incorrect gradient descent updates. Since, there wont be any explicit error, just low performance this issue may be hard to debug**

In [18]:
x.grad.zero_()

tensor(0.)

In [19]:
# putting  everything together, the whole training loop looks like this:

learning_rate = 0.1
x = torch.tensor(5.0 , requires_grad= True)
for iteration in range(100):
    f = x ** 2
    f.backward()
    with torch.no_grad():
        x -= learning_rate * x.grad  # this is the gradient descent step

    x.grad.zero_()

#### 1.  **Some oprerations - such as exp(), relu(), rsqrt(), sigmoid(), sqrt(), tan(), and tanh() - save their outputs in the computation graph during the forward pass, then use these outputs to compute the gradients during the backward pass. This means that you must not modify such an operations output in place, or you will get an error during the backward pass**

#### 2.  **Other operations  such as abs(), cos() , log() , sin(), square(), and var() save their inputs instead of their oputputs. Such an operation doesnot care if you modify its output in place, but you must not modify its inputs in place before the backward pass.**


## **Implementing Linear Regression**

In [20]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

housing = fetch_california_housing()
X_train_full , X_test ,y_train_full , y_test = train_test_split(housing.data , housing.target , random_state = 42)
X_train , X_valid , y_train , y_valid = train_test_split(X_train_full , y_train_full , random_state=42)

In [21]:
# converting to tensors

X_train = torch.FloatTensor(X_train)
X_valid = torch.FloatTensor(X_valid)
X_test = torch.FloatTensor(X_test)
mean = X_train.mean(dim = 0 , keepdims= True)
stds = X_train.std(dim = 0 , keepdims= True)
X_train = (X_train - mean)/stds
X_valid = (X_valid - mean)/stds
X_test = (X_test - mean)/stds

In [22]:
# converting the target column to tensors

y_train = torch.FloatTensor(y_train).reshape(-1,1)
y_valid = torch.FloatTensor(y_valid).reshape(-1,1)
y_test = torch.FloatTensor(y_test).reshape(-1,1)


In [23]:
# parameters of our linear regression model

torch.manual_seed(42)
n_features = X_train.shape[1] # there are 8 input features
w = torch.randn((n_features , 1), requires_grad=True)
b = torch.tensor(0. , requires_grad=True)

In [24]:
# using batch gradient descent

learning_rate = 0.4   # first we define the learning rate
n_epochs = 20    # run the loop for 20 epochs
for epoch in range(n_epochs):
    y_pred = X_train @ w + b    # running the forward pass
    loss = ((y_pred - y_train) ** 2).mean()  # calculating the mean squared error
    loss.backward()  # we run this to compute the gradients of the loss
    with torch.no_grad():   # performing the gradient descent step
        b -= learning_rate * b.grad
        w -= learning_rate * w.grad
        b.grad.zero_()
        w.grad.zero_()
    print(f"Epoch {epoch + 1}/{n_epochs} , Loss: {loss.item()}")

Epoch 1/20 , Loss: 16.158456802368164
Epoch 2/20 , Loss: 4.879366397857666
Epoch 3/20 , Loss: 2.2552270889282227
Epoch 4/20 , Loss: 1.3307628631591797
Epoch 5/20 , Loss: 0.9680694937705994
Epoch 6/20 , Loss: 0.8142679929733276
Epoch 7/20 , Loss: 0.7417047023773193
Epoch 8/20 , Loss: 0.7020702362060547
Epoch 9/20 , Loss: 0.6765919923782349
Epoch 10/20 , Loss: 0.6577966213226318
Epoch 11/20 , Loss: 0.6426153182983398
Epoch 12/20 , Loss: 0.6297224760055542
Epoch 13/20 , Loss: 0.6184942722320557
Epoch 14/20 , Loss: 0.608596920967102
Epoch 15/20 , Loss: 0.5998217463493347
Epoch 16/20 , Loss: 0.5920187830924988
Epoch 17/20 , Loss: 0.5850691795349121
Epoch 18/20 , Loss: 0.578873336315155
Epoch 19/20 , Loss: 0.573345422744751
Epoch 20/20 , Loss: 0.5684100985527039


In [25]:
# testing the model on new unseen data

X_new = X_test[:3] # pretend these are new instances
with torch.no_grad():
    y_pred = X_new @ w + b # using the trained parameter to make pedictions

y_pred

tensor([[0.8916],
        [1.6480],
        [2.6577]])

### **TIP : It's best to use ```torch.no_grad``` context during inference. Pytorch will consume less RAM and run faster since it wont have to keep track of the computation graph**

## **Linear Regression using pytorch's High level API**

In [26]:
import torch.nn as nn  # conventional way to import the model

torch.manual_seed(42)
model = nn.Linear(in_features=n_features , out_features= 1)


In [27]:
model.bias

Parameter containing:
tensor([0.3117], requires_grad=True)

In [28]:
model.weight

Parameter containing:
tensor([[ 0.2703,  0.2935, -0.0828,  0.3248, -0.0775,  0.0713, -0.1721,  0.2076]],
       requires_grad=True)

In [29]:
for param in model.named_parameters():
    [...]  # do something with each parameter

In [30]:
model(X_train[:2])

tensor([[-0.4718],
        [ 0.1131]], grad_fn=<AddmmBackward0>)

#### **when we use a module as a function, pytorch internally calls the module's ```forward()``` method. in the case of the ```nn.Linear``` module, the forward() method computes X @ self.weight.T + self.bias.**

In [31]:
# now that we have our model, we need to create an optimizer to update the model parameters, and we must also choose a loss function

optimizer = torch.optim.SGD(model.parameters() , lr = learning_rate)
mse = nn.MSELoss()

In [32]:
# a small function to train our model

def train_bgd(model, optimizer, criterion, X_train, y_train, n_epochs):
    for epoch in range(n_epochs):
        y_pred = model(X_train)
        loss = criterion(y_pred, y_train)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        print(f"Epoch {epoch + 1}/{n_epochs} , Loss : {loss.item()}")

- ##### **in pytorch the loss function object is commonly referred to as the criterion, to distinguish it from the loss value itself.**
- ##### **the ```optimizer.step()``` line corresponds to the two lines that updated b and w in our earlier code**

In [35]:
# function call to train our model

train_bgd(model, optimizer, mse, X_train , y_train, n_epochs)

Epoch 1/20 , Loss : 4.3378496170043945
Epoch 2/20 , Loss : 0.780293345451355
Epoch 3/20 , Loss : 0.6253840327262878
Epoch 4/20 , Loss : 0.6060433387756348
Epoch 5/20 , Loss : 0.5956299304962158
Epoch 6/20 , Loss : 0.5873566269874573
Epoch 7/20 , Loss : 0.5802990198135376
Epoch 8/20 , Loss : 0.5741382241249084
Epoch 9/20 , Loss : 0.5687100887298584
Epoch 10/20 , Loss : 0.5639079213142395
Epoch 11/20 , Loss : 0.5596510767936707
Epoch 12/20 , Loss : 0.5558737516403198
Epoch 13/20 , Loss : 0.5525193810462952
Epoch 14/20 , Loss : 0.5495391488075256
Epoch 15/20 , Loss : 0.5468899011611938
Epoch 16/20 , Loss : 0.544533908367157
Epoch 17/20 , Loss : 0.5424376130104065
Epoch 18/20 , Loss : 0.5405715703964233
Epoch 19/20 , Loss : 0.5389096736907959
Epoch 20/20 , Loss : 0.5374288558959961


In [37]:
X_new = X_test[0:3]    # pretend these are new instances
with torch.no_grad():
    y_pred = model(X_new)    # use the trained model to make predictions

y_pred

tensor([[0.8061],
        [1.7116],
        [2.6973]])

##  **Implementing a Regression MLP**

- ##### **pytorch provides a helpful ```nn.Sequential``` module that chains multiple modules: when you call this module with some inputs, it feeds these inputs to the first module, then feeds the output of the first module to the second module and so on**

In [None]:
torch.manual_seed(42)
model = nn.Sequential(
    nn.Linear(n_features,50),      # n_features = 8, number of outputs = 50
    nn.ReLU(),                     # ReLu activation function
    nn.Linear(50,40),
    nn.ReLU(),
    nn.Linear(40,1)
)

In [40]:
# training model

learning_rate = 0.1
optimizer = torch.optim.SGD(model.parameters(),lr = learning_rate)
mse = nn.MSELoss()
train_bgd(model , optimizer , mse , X_train, y_train , n_epochs)

Epoch 1/20 , Loss : 5.045480251312256
Epoch 2/20 , Loss : 2.0523128509521484
Epoch 3/20 , Loss : 1.0039883852005005
Epoch 4/20 , Loss : 0.8570139408111572
Epoch 5/20 , Loss : 0.7740675210952759
Epoch 6/20 , Loss : 0.7225848436355591
Epoch 7/20 , Loss : 0.6893726587295532
Epoch 8/20 , Loss : 0.6669033765792847
Epoch 9/20 , Loss : 0.6507738828659058
Epoch 10/20 , Loss : 0.6383934617042542
Epoch 11/20 , Loss : 0.6281994581222534
Epoch 12/20 , Loss : 0.6193399429321289
Epoch 13/20 , Loss : 0.6113173365592957
Epoch 14/20 , Loss : 0.6038705706596375
Epoch 15/20 , Loss : 0.5968308448791504
Epoch 16/20 , Loss : 0.5901119112968445
Epoch 17/20 , Loss : 0.5836468935012817
Epoch 18/20 , Loss : 0.5774064064025879
Epoch 19/20 , Loss : 0.5713555216789246
Epoch 20/20 , Loss : 0.565444827079773


## **Implementing Mini-Batch Gradient Descent using Dataloaders**

- ##### **to implement mini batch GD, pytorch provides a class named Dataloader. It can efficiently load batches of data of the desired size and shuffle the data at each epoch if we want to**
- ##### **we first need to wrap X_train and y_train tensors in a dataset object with the required API, to help with this, pytorch provides a TensorDataset class.**

In [41]:
from torch.utils.data import TensorDataset, DataLoader

train_dataset = TensorDataset(X_train , y_train)
train_loader = DataLoader(train_dataset , batch_size= 32 , shuffle=True)

In [43]:
torch.manual_seed(42)
model = nn.Sequential(
    nn.Linear(n_features, 50),
    nn.ReLU(),
    nn.Linear(50 , 40),
    nn.ReLU(),
    nn.Linear(40,1)
)
model = model.to(device)


In [47]:
#N lets create a train function to implement mini batch GD

def train(model, optimizer , critereion , train_loader , n_epochs):
    model.train()
    for epoch in range(n_epochs):
        total_loss = 0.
        for X_batch , y_batch in train_loader:
            X_batch , y_batch = X_batch.to(device), y_batch.to(device)
            y_pred = model(X_batch)
            loss = critereion(y_pred , y_batch)
            total_loss += loss.item()
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

        mean_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch + 1}/{n_epochs} , Loss : {mean_loss : .4f}")

In [48]:
train(model , optimizer , mse , train_loader , n_epochs)

Epoch 1/20 , Loss :  5.0443
Epoch 2/20 , Loss :  5.0456
Epoch 3/20 , Loss :  5.0457
Epoch 4/20 , Loss :  5.0450
Epoch 5/20 , Loss :  5.0452
Epoch 6/20 , Loss :  5.0449
Epoch 7/20 , Loss :  5.0449
Epoch 8/20 , Loss :  5.0459
Epoch 9/20 , Loss :  5.0460
Epoch 10/20 , Loss :  5.0456
Epoch 11/20 , Loss :  5.0449
Epoch 12/20 , Loss :  5.0461
Epoch 13/20 , Loss :  5.0452
Epoch 14/20 , Loss :  5.0456
Epoch 15/20 , Loss :  5.0454
Epoch 16/20 , Loss :  5.0455
Epoch 17/20 , Loss :  5.0460
Epoch 18/20 , Loss :  5.0451
Epoch 19/20 , Loss :  5.0456
Epoch 20/20 , Loss :  5.0445
