<a href="https://colab.research.google.com/github/AnirudhVIyer/NLP/blob/main/Pytorch_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Synthetic regression data: y = 1 + 2x + Gaussian noise (true a=1, b=2)
import numpy as np
np.random.seed(99)
x = np.random.rand(100, 1)
y = 1 + 2 * x + .1 * np.random.randn(100, 1)

# Random ordering of the 100 row indices
idx = np.arange(100)
np.random.shuffle(idx)

# 80/20 split of the shuffled indices: first 80 -> train, remaining 20 -> validation
train_idx, val_idx = idx[:80], idx[80:]

# Materialize both subsets by indexing with the shuffled positions
x_train, x_val = x[train_idx], x[val_idx]
y_train, y_val = y[train_idx], y[val_idx]

We know the true parameters are a = 1 and b = 2; let's check that each approach below recovers values close to them.

In [2]:
# Random starting point for the two parameters (seeded so the run is repeatable)
np.random.seed(42)
a = np.random.randn(1)
b = np.random.randn(1)

print(a, b)

# Hyper-parameters: gradient-descent step size and number of full-batch updates
lr = 1e-1
n_epochs = 1000

for epoch in range(n_epochs):
    # Forward pass: the linear model's prediction for every training point
    yhat = a + b * x_train

    # Residuals and their mean square (MSE), the regression loss
    error = (y_train - yhat)
    loss = np.mean(error ** 2)

    # Hand-derived gradients of the MSE with respect to "a" and "b"
    a_grad = -2 * np.mean(error)
    b_grad = -2 * np.mean(x_train * error)

    # Gradient-descent step: move each parameter against its gradient
    a, b = a - lr * a_grad, b - lr * b_grad

print(a, b)

# Sanity check: a scikit-learn LinearRegression fitted on the same training
# data should report (almost) the same intercept and slope as the loop above
from sklearn.linear_model import LinearRegression
linr = LinearRegression().fit(x_train, y_train)
print(linr.intercept_, linr.coef_[0])

[0.49671415] [-0.1382643]
[1.07891506] [1.86215196]
[1.07891418] [1.8621536]


Reference: https://towardsdatascience.com/understanding-pytorch-with-an-example-a-step-by-step-tutorial-81fc5f8c4e8e#40de

In [4]:
import torch
import torch.optim as optim
import torch.nn as nn
!pip install torchviz
from torchviz import make_dot

# Pick the GPU when PyTorch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# The training data lives in Numpy arrays: wrap each one as a tensor, cast it
# to 32-bit float and move it to the chosen device
x_train_tensor, y_train_tensor = (
    torch.from_numpy(arr).float().to(device) for arr in (x_train, y_train)
)

# Python's type() only distinguishes ndarray from Tensor; Tensor.type() is more
# informative because its name also reveals where the tensor is stored
print(type(x_train), type(x_train_tensor), x_train_tensor.type())

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
<class 'numpy.ndarray'> <class 'torch.Tensor'> torch.cuda.FloatTensor


In [4]:
# ATTEMPT 1 - gradient-tracking tensors, but CPU only
# Gradient descent needs gradients for "a" and "b", so both are created with
# requires_grad=True. No device is given, so they are ordinary CPU tensors.
a = torch.randn(1, dtype=torch.float, requires_grad=True)
b = torch.randn(1, dtype=torch.float, requires_grad=True)
print(a, b)

# ATTEMPT 2 - create with requires_grad=True, THEN move to the device
# When the device is a GPU the printout shows grad_fn=<ToCopyBackward0> rather
# than requires_grad=True: what we hold is the output of a copy operation, not
# the trainable tensor we created, so this is not what we want to optimize.
a = torch.randn(1, requires_grad=True, dtype=torch.float)
a = a.to(device)
b = torch.randn(1, requires_grad=True, dtype=torch.float)
b = b.to(device)
print(a, b)

# ATTEMPT 3 - move first, THEN switch gradient tracking on
# Plain tensors are sent to the device (as was done for the data) and only
# afterwards flagged in place as requiring gradients.
a = torch.randn(1, dtype=torch.float).to(device).requires_grad_()
b = torch.randn(1, dtype=torch.float).to(device).requires_grad_()
print(a, b)

## attempt 1 is CPU-only, attempt 2 loses the trainable tensor on a GPU,
## attempt 3 works but needs an extra step

tensor([0.2318], requires_grad=True) tensor([0.8427], requires_grad=True)
tensor([0.9739], device='cuda:0', grad_fn=<ToCopyBackward0>) tensor([-0.8145], device='cuda:0', grad_fn=<ToCopyBackward0>)
tensor([0.1432], device='cuda:0', requires_grad=True) tensor([0.6174], device='cuda:0', requires_grad=True)


In [5]:
## better way
## create each tensor directly on the target device with gradient tracking
## enabled, all in a single call
a, b = (
    torch.randn(1, requires_grad=True, dtype=torch.float, device=device)
    for _ in range(2)
)
print(a, b)

tensor([-1.1342], device='cuda:0', requires_grad=True) tensor([-0.1530], device='cuda:0', requires_grad=True)


In [6]:
lr = 1e-1
n_epochs = 1000

torch.manual_seed(42)
a = torch.randn(1, requires_grad=True, dtype=torch.float, device=device)
b = torch.randn(1, requires_grad=True, dtype=torch.float, device=device)

for epoch in range(n_epochs):
    # Forward pass and MSE loss, written exactly like the Numpy version
    yhat = a + b * x_train_tensor
    error = y_train_tensor - yhat
    loss = error.pow(2).mean()

    # Autograd replaces the hand-derived formulas
    #     a_grad = -2 * error.mean()
    #     b_grad = -2 * (x_train_tensor * error).mean()
    # backward() walks back from the loss and fills a.grad / b.grad.
    loss.backward()

    # Updating the parameters needs some care:
    #   1) a = a - lr * a.grad
    #      rebinds "a" to a new tensor; the a.grad.zero_() below then fails with
    #      AttributeError: 'NoneType' object has no attribute 'zero_'
    #   2) a -= lr * a.grad   (outside no_grad)
    #      RuntimeError: a leaf Variable that requires grad has been used in an
    #      in-place operation.
    #   3) the same in-place update wrapped in torch.no_grad() works, because
    #      the update itself is kept out of the gradient computation.
    with torch.no_grad():
        a.sub_(lr * a.grad)
        b.sub_(lr * b.grad)

    # Gradients are kept between backward() calls, so clear them before the
    # next iteration
    a.grad.zero_()
    b.grad.zero_()

print(a, b)

tensor([1.0789], device='cuda:0', requires_grad=True) tensor([1.8621], device='cuda:0', requires_grad=True)


In [8]:
## using optimizers and loss
torch.manual_seed(42)
a = torch.randn(1, requires_grad=True, dtype=torch.float, device=device)
b = torch.randn(1, requires_grad=True, dtype=torch.float, device=device)
print(a, b)

lr = 1e-1
n_epochs = 1000

# Mean-squared-error loss, averaged over all points
loss_fn = nn.MSELoss(reduction='mean')

# SGD optimizer that owns the update rule for "a" and "b"
optimizer = optim.SGD([a, b], lr=lr)
# Adam example: optimizer = optim.Adam([a, b], lr=lr)
for epoch in range(n_epochs):
    yhat = a + b * x_train_tensor

    # PyTorch losses are called as loss_fn(input, target): prediction first,
    # label second. MSE is symmetric, so the value is the same either way, but
    # keeping the convention makes it safe to swap in another loss later.
    loss = loss_fn(yhat, y_train_tensor)

    loss.backward()

    # Replaces the manual update:
    #     with torch.no_grad():
    #         a -= lr * a.grad
    #         b -= lr * b.grad
    optimizer.step()

    # Replaces the manual reset: a.grad.zero_() / b.grad.zero_()
    optimizer.zero_grad()

print(a, b)


tensor([0.1940], device='cuda:0', requires_grad=True) tensor([0.1391], device='cuda:0', requires_grad=True)
tensor([1.0789], device='cuda:0', requires_grad=True) tensor([1.8621], device='cuda:0', requires_grad=True)


In [5]:
## creating a model class

class customModel(nn.Module):
    """Linear model ``a + b * x`` with two learnable one-element parameters."""

    def __init__(self):
        super().__init__()
        # Wrapping a tensor in nn.Parameter registers it with the module and
        # marks it trainable, so no explicit requires_grad flag is needed.
        # `device` is the notebook-level device string chosen earlier.
        self.a = nn.Parameter(torch.randn(1, dtype=torch.float, device=device))
        self.b = nn.Parameter(torch.randn(1, dtype=torch.float, device=device))

    def forward(self, x):
        """Return the prediction ``a + b * x`` for the input tensor ``x``."""
        scaled = self.b * x
        return self.a + scaled






In [20]:
torch.manual_seed(42)

## create a model and send it to the device
new_model = customModel().to(device)

lr = 1e-1
n_epochs = 1000

# Mean-squared-error loss, averaged over all points
loss_fn = nn.MSELoss(reduction='mean')

# SGD optimizer over every parameter the model registered
optimizer = optim.SGD(new_model.parameters(), lr=lr)
# Adam example: optimizer = optim.Adam(new_model.parameters(), lr=lr)

# Training mode is set once: nothing inside the loop switches the model to
# eval mode, so repeating the call every epoch would be redundant.
new_model.train()

for epoch in range(n_epochs):
    yhat = new_model(x_train_tensor)

    # PyTorch losses are called as loss_fn(input, target): prediction first,
    # label second.
    loss = loss_fn(yhat, y_train_tensor)
    loss.backward()

    # Replaces the manual update:
    #     with torch.no_grad():
    #         a -= lr * a.grad
    #         b -= lr * b.grad
    optimizer.step()

    # Replaces the manual reset: a.grad.zero_() / b.grad.zero_()
    optimizer.zero_grad()

print(new_model.state_dict())


OrderedDict([('a', tensor([1.0789], device='cuda:0')), ('b', tensor([1.8621], device='cuda:0'))])


In [9]:
def make_train_step(model, lossfn, optimizer):
    """Build a function that performs one optimization step.

    Args:
        model: module called as ``model(x)`` to produce predictions.
        lossfn: callable invoked as ``lossfn(prediction, target)`` that returns
            a scalar tensor.
        optimizer: optimizer that updates ``model``'s parameters.

    Returns:
        A function ``train_step(x, y)`` that runs a single training step and
        returns the loss as a Python float.
    """
    def train_step(x, y):
        # Make sure the model is in training mode for this step
        model.train()
        yhat = model(x)
        # Prediction first, target second: this is the argument order PyTorch
        # losses expect, and it matters for any loss that is not symmetric.
        loss = lossfn(yhat, y)
        loss.backward()
        optimizer.step()
        # Clear the gradients so the next step starts from zero
        optimizer.zero_grad()
        return loss.item()

    return train_step


# Hyper-parameters
lr = 1e-1
n_epochs = 1000

# Mean-squared-error loss, averaged over all points
loss_fn = nn.MSELoss(reduction='mean')

# Seed first, then build a fresh model and move it to the device
torch.manual_seed(42)
new_model = customModel().to(device)

# SGD over the model's parameters
optimizer = optim.SGD(new_model.parameters(), lr=lr)
# Adam example: optimizer = optim.Adam(new_model.parameters(), lr=lr)

# Bundle model, loss and optimizer into a single "do one step" function
train_step = make_train_step(new_model, loss_fn, optimizer)
losses = []

for epoch in range(n_epochs):
    # One full-batch training step; keep the returned loss for later inspection
    loss = train_step(x_train_tensor, y_train_tensor)
    losses.append(loss)

#print(losses[-1])
print(new_model.state_dict())


OrderedDict([('a', tensor([1.0789], device='cuda:0')), ('b', tensor([1.8621], device='cuda:0'))])


In [10]:
## Builing Dataset and Dataloader

from torch.utils.data import Dataset, TensorDataset

class CustomDataset(Dataset):
    """Dataset that pairs a feature tensor with a label tensor, row by row.

    Args:
        x_tensor: features, indexed along the first dimension.
        y_tensor: labels, indexed along the first dimension.

    Raises:
        ValueError: if the two tensors do not have the same length.
    """

    def __init__(self, x_tensor, y_tensor):
        # Fail early: with mismatched lengths, indexing would either raise
        # late or silently ignore the trailing labels.
        if len(x_tensor) != len(y_tensor):
            raise ValueError(
                f"x_tensor and y_tensor must have the same length, "
                f"got {len(x_tensor)} and {len(y_tensor)}"
            )
        self.x = x_tensor
        self.y = y_tensor

    def __getitem__(self, index):
        """Return the ``(features, label)`` tuple stored at ``index``."""
        return (self.x[index], self.y[index])

    def __len__(self):
        """Return the number of samples."""
        return len(self.x)

# Note there is no .to(device) here: the tensors are deliberately left on the
# CPU, so the dataset built from them is stored on the CPU as well
x_train_tensor = torch.from_numpy(x_train).to(torch.float)
y_train_tensor = torch.from_numpy(y_train).to(torch.float)

# Hand-written dataset: indexing goes through CustomDataset.__getitem__
train_data = CustomDataset(x_train_tensor, y_train_tensor)
print(train_data[0])

# The built-in TensorDataset wraps the same two tensors
train_data = TensorDataset(x_train_tensor, y_train_tensor)
print(train_data[0])

(tensor([0.5543]), tensor([2.1181]))
(tensor([0.5543]), tensor([2.1181]))


In [11]:
## use a dataloader class to wrap this dataset to serve mini batches

from torch.utils.data import DataLoader

# Serve the training set in shuffled mini-batches of 16 samples
train_loader = DataLoader(train_data, batch_size=16, shuffle=True)
train_step = make_train_step(new_model, loss_fn, optimizer)
losses = []

for epoch in range(n_epochs):
    for x_batch, y_batch in train_loader:
        # The dataset is stored on the CPU, so every mini-batch comes out on
        # the CPU too; move it to the device the model is on before using it
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)

        loss = train_step(x_batch, y_batch)
        losses.append(loss)

print(new_model.state_dict())

OrderedDict([('a', tensor([1.0796], device='cuda:0')), ('b', tensor([1.8619], device='cuda:0'))])


In [12]:
# with random splitter 
from torch.utils.data.dataset import random_split

# Full data set (all 100 points) as CPU float tensors
x_tensor = torch.from_numpy(x).to(torch.float)
y_tensor = torch.from_numpy(y).to(torch.float)

dataset = TensorDataset(x_tensor, y_tensor)

# Let PyTorch draw the 80/20 train/validation split
train_dataset, val_dataset = random_split(dataset, lengths=[80, 20])

# One loader per subset: mini-batches of 16 for training, a single batch of
# 20 for validation
train_loader = DataLoader(train_dataset, batch_size=16)
val_loader = DataLoader(val_dataset, batch_size=20)

In [15]:
### with validation evaluation

losses = []
val_losses = []
train_step = make_train_step(new_model, loss_fn, optimizer)

for epoch in range(n_epochs):
    # Training: one optimizer step per mini-batch of the training loader
    for x_batch, y_batch in train_loader:
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)

        loss = train_step(x_batch, y_batch)
        losses.append(loss)

    # Validation: switch to eval mode once per epoch (train_step switches back
    # to training mode on its next call) and disable gradient tracking, since
    # these losses are recorded for reference only and never update parameters.
    new_model.eval()
    with torch.no_grad():
        # Batch variables get their own names so they do not overwrite the
        # Numpy arrays x_val / y_val created in the data-generation cell.
        for x_val_batch, y_val_batch in val_loader:
            x_val_batch = x_val_batch.to(device)
            y_val_batch = y_val_batch.to(device)

            yhat = new_model(x_val_batch)
            # Prediction first, target second: PyTorch's loss argument order
            val_loss = loss_fn(yhat, y_val_batch)
            val_losses.append(val_loss.item())

print(new_model.state_dict())

OrderedDict([('a', tensor([1.0515], device='cuda:0')), ('b', tensor([1.9164], device='cuda:0'))])


In [16]:
# Seed PyTorch's global RNG before the model further down in this cell is constructed
torch.manual_seed(42)



class ManualLinearRegression(nn.Module):
    """Linear regression model backed by a single ``nn.Linear`` layer.

    Args:
        in_features: size of each input sample (default 1).
        out_features: size of each output sample (default 1).

    The defaults reproduce the original one-feature, one-output model, so
    ``ManualLinearRegression()`` behaves exactly as before.
    """

    def __init__(self, in_features=1, out_features=1):
        super().__init__()
        # nn.Linear holds the weight and bias, which therefore appear in the
        # state dict as "linear.weight" and "linear.bias".
        self.linear = nn.Linear(in_features, out_features)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the prediction."""
        return self.linear(x)

def make_train_step(model, loss_fn, optimizer):
    """Build a function that performs one optimization step.

    Args:
        model: module called as ``model(x)`` to produce predictions.
        loss_fn: callable invoked as ``loss_fn(prediction, target)`` that
            returns a scalar tensor.
        optimizer: optimizer that updates ``model``'s parameters.

    Returns:
        A function ``train_step(x, y)`` that runs a single training step and
        returns the loss as a Python float.
    """
    def train_step(x, y):
        # Make sure the model is in training mode for this step
        model.train()
        yhat = model(x)
        # Prediction first, target second: this is the argument order PyTorch
        # losses expect, and it matters for any loss that is not symmetric.
        loss = loss_fn(yhat, y)
        loss.backward()
        optimizer.step()
        # Clear the gradients so the next step starts from zero
        optimizer.zero_grad()
        return loss.item()
    return train_step


model = ManualLinearRegression().to(device) # model = nn.Sequential(nn.Linear(1, 1)).to(device)
loss_fn = nn.MSELoss(reduction='mean')
optimizer = optim.SGD(model.parameters(), lr=1e-1)
train_step = make_train_step(model, loss_fn, optimizer)

n_epochs = 100
training_losses = []
validation_losses = []
# Parameters before training, for comparison with the final state below
print(model.state_dict())

for epoch in range(n_epochs):
    # Training: one optimizer step per mini-batch; the epoch's training loss
    # is the mean of the per-batch losses
    batch_losses = []
    for x_batch, y_batch in train_loader:
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)
        loss = train_step(x_batch, y_batch)
        batch_losses.append(loss)
    training_loss = np.mean(batch_losses)
    training_losses.append(training_loss)

    # Validation: switch to eval mode once per epoch (train_step switches back
    # to training mode on its next call) and disable gradient tracking.
    model.eval()
    with torch.no_grad():
        val_losses = []
        # Batch variables get their own names so they do not overwrite the
        # Numpy arrays x_val / y_val created in the data-generation cell.
        for x_val_batch, y_val_batch in val_loader:
            x_val_batch = x_val_batch.to(device)
            y_val_batch = y_val_batch.to(device)
            yhat = model(x_val_batch)
            # Prediction first, target second: PyTorch's loss argument order
            val_loss = loss_fn(yhat, y_val_batch).item()
            val_losses.append(val_loss)
        validation_loss = np.mean(val_losses)
        validation_losses.append(validation_loss)

    print(f"[{epoch+1}] Training loss: {training_loss:.3f}\t Validation loss: {validation_loss:.3f}")

print(model.state_dict())

OrderedDict([('linear.weight', tensor([[0.7645]], device='cuda:0')), ('linear.bias', tensor([0.8300], device='cuda:0'))])
[1] Training loss: 0.354	 Validation loss: 0.071
[2] Training loss: 0.071	 Validation loss: 0.038
[3] Training loss: 0.051	 Validation loss: 0.033
[4] Training loss: 0.046	 Validation loss: 0.029
[5] Training loss: 0.041	 Validation loss: 0.026
[6] Training loss: 0.037	 Validation loss: 0.023
[7] Training loss: 0.033	 Validation loss: 0.021
[8] Training loss: 0.030	 Validation loss: 0.018
[9] Training loss: 0.027	 Validation loss: 0.016
[10] Training loss: 0.025	 Validation loss: 0.015
[11] Training loss: 0.023	 Validation loss: 0.014
[12] Training loss: 0.021	 Validation loss: 0.012
[13] Training loss: 0.019	 Validation loss: 0.011
[14] Training loss: 0.018	 Validation loss: 0.011
[15] Training loss: 0.017	 Validation loss: 0.010
[16] Training loss: 0.016	 Validation loss: 0.009
[17] Training loss: 0.015	 Validation loss: 0.009
[18] Training loss: 0.014	 Validation