# PyTorch Cheatsheet

[Pytorch Documentation](https://pytorch.org/docs/stable/index.html)

In [None]:
# !python -m pip install torch==2.0.0 torchvision torchaudio

In [1]:
import torch

## Tensors

**Creating tensors**

In [2]:
# Create a Torch tensor
elements = [[1, 2, 3], 
            [4, 5, 6]]
t = torch.Tensor(elements)
t

tensor([[1., 2., 3.],
        [4., 5., 6.]])

In [3]:
# Construct a 5x3 matrix filled with zeros, of dtype long (torch.int64):
x = torch.zeros(5, 3, dtype=torch.long)
print(x)

tensor([[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]])


In [4]:
# Create tensor from normal distribution randoms
t = torch.randn(3, 3)
t

tensor([[ 0.0528, -0.0225, -0.7702],
        [-1.1994,  0.1754, -0.0061],
        [-0.2733,  1.1535,  1.8019]])

**Inspecting tensors**

In [5]:
# Create a tensor
# a. fill it with the values of "elements" defined above
# b. give it the integer dtype 'long' (torch.int64)
# torch.tensor(..., dtype=...) replaces the legacy torch.LongTensor constructor; the
# trailing .long() call was a no-op because the tensor was already of type long.
x = torch.tensor(elements, dtype=torch.long)
x, x.type()

(tensor([[1, 2, 3],
         [4, 5, 6]]),
 'torch.LongTensor')

In [6]:
# Tensor indexing: extract the value of the first row and second column of the tensor
x[0, 1]

tensor(2)

In [7]:
# Tensor indexing: extract the second row of the tensor (index 1; indexing is zero-based)
x[1]

tensor([4, 5, 6])

In [8]:
# Some tensor info
# a. create a tensor of size 3,4,2 filled with random values drawn from normal distribution
t = torch.randn(3, 4, 2)
# b. getting some information about the created tensor
print('Tensor shape: \t\t', t.shape)       # Note: t.size() gives the same
print('Number of dimensions: \t', t.dim()) # number of dimensions the tensor has
print('Tensor type: \t\t', t.type())       # Note: there are other types (https://pytorch.org/docs/2.0/tensors.html?highlight=tensor+types)

Tensor shape: 		 torch.Size([3, 4, 2])
Number of dimensions: 	 3
Tensor type: 		 torch.FloatTensor


In [9]:
# c. more methods to get information about the created tensor
print('Tensor dtype: \t\t', t.dtype)       # type of the data contained within the tensor.
print('Device tensor on: \t', t.device)    # where tensor computations will be performed (CPU or GPU)
print('Tensor layout: \t\t', t.layout)     # how the tensor is stored in memory
print('Number of elements: \t', t.numel()) # number of elements contained within the tensor

Tensor dtype: 		 torch.float32
Device tensor on: 	 cpu
Tensor layout: 		 torch.strided
Number of elements: 	 24


**Reshaping Tensors**

Reshaping changes the tensor's shape but not the underlying data

In [10]:
# Create a tensor of size (3,4) with random values
t = torch.randn(3, 4)
t, t.numel()

(tensor([[-0.7397, -0.1561,  0.0840,  0.8304],
         [ 1.1975, -0.4931,  1.3313, -2.1158],
         [ 0.7900, -0.2853, -1.1461,  0.0483]]),
 12)

In [11]:
# reshape from (3,4) to (2,6). new shape must give same numel as initial
t_26 = t.reshape([2,6])
t_26, t_26.numel()

(tensor([[-0.7397, -0.1561,  0.0840,  0.8304,  1.1975, -0.4931],
         [ 1.3313, -2.1158,  0.7900, -0.2853, -1.1461,  0.0483]]),
 12)

In [12]:
# reshape from (3,4) to (6,2)
t_62 = t.reshape([6,2])
t_62, t_62.numel()

(tensor([[-0.7397, -0.1561],
         [ 0.0840,  0.8304],
         [ 1.1975, -0.4931],
         [ 1.3313, -2.1158],
         [ 0.7900, -0.2853],
         [-1.1461,  0.0483]]),
 12)

In [13]:
# reshape from (3,4) to (3,4). nothing changes
t.reshape([3,4])

tensor([[-0.7397, -0.1561,  0.0840,  0.8304],
        [ 1.1975, -0.4931,  1.3313, -2.1158],
        [ 0.7900, -0.2853, -1.1461,  0.0483]])

In [14]:
# reshape with an added dimension 
t.reshape(2,2,3)

tensor([[[-0.7397, -0.1561,  0.0840],
         [ 0.8304,  1.1975, -0.4931]],

        [[ 1.3313, -2.1158,  0.7900],
         [-0.2853, -1.1461,  0.0483]]])

**Resizing using torch.view**

In [15]:
x = torch.randn(4, 4)   
y = x.view(16)          # change tensor of size (4,4) to tensor of size (16)
z = x.view(-1, 8)       # Note: the size where -1 is, is inferred from other dimensions
print(x.size(), y.size(), z.size())

torch.Size([4, 4]) torch.Size([16]) torch.Size([2, 8])


In [16]:
# View again...
x = t.view(6,2)
x.size()

torch.Size([6, 2])

In [17]:
y = x.view(3, 2, 2)
y

tensor([[[-0.7397, -0.1561],
         [ 0.0840,  0.8304]],

        [[ 1.1975, -0.4931],
         [ 1.3313, -2.1158]],

        [[ 0.7900, -0.2853],
         [-1.1461,  0.0483]]])

In [18]:
y.view(6, -1)

tensor([[-0.7397, -0.1561],
        [ 0.0840,  0.8304],
        [ 1.1975, -0.4931],
        [ 1.3313, -2.1158],
        [ 0.7900, -0.2853],
        [-1.1461,  0.0483]])

**Squeezing and unsqueezing**

* Squeezing a tensor removes the dimensions or axes that have a length of one.
* Unsqueezing a tensor adds a dimension with a length of one.
* These functions allow us to expand or shrink the rank (number of dimensions) of our tensor

In [19]:
print(t.reshape([1,12]).shape)
print(t.reshape([1,12]).squeeze())
print(t.reshape([1,12]).squeeze().shape)

torch.Size([1, 12])
tensor([-0.7397, -0.1561,  0.0840,  0.8304,  1.1975, -0.4931,  1.3313, -2.1158,
         0.7900, -0.2853, -1.1461,  0.0483])
torch.Size([12])


In [20]:
y = torch.randn(3,2,3)

In [21]:
print(y.size())
y3 = y.unsqueeze(2)
print(y3.size())
print(y3.squeeze().size())

torch.Size([3, 2, 3])
torch.Size([3, 2, 1, 3])
torch.Size([3, 2, 3])


**Transpose**

In [22]:
x = torch.randn(3,2)
y = x.transpose(0, 1)
print('X shape: ', x.shape)
print('Y shape: ', y.shape)
x, y

X shape:  torch.Size([3, 2])
Y shape:  torch.Size([2, 3])


(tensor([[ 0.6847, -0.4057],
         [-1.5375,  0.4401],
         [-0.3087, -0.0034]]),
 tensor([[ 0.6847, -1.5375, -0.3087],
         [-0.4057,  0.4401, -0.0034]]))

**Broadcasting**

See the [PyTorch 2.0 notes on broadcasting semantics](https://pytorch.org/docs/2.0/notes/broadcasting.html)

In [23]:
# broadcasting
x = torch.rand(3, 1)
y = torch.rand(3, 2)
print('X shape: ', x.shape)
print('Y shape: ', y.shape) # Notice that x's dimension 1 (aka 2nd dim, aka last dim) is of size 1, while y's is of size 2
print(x)
print(y)
print(x + y)
print(torch.equal(x+y, y+x)) # check equality of 2 tensors

X shape:  torch.Size([3, 1])
Y shape:  torch.Size([3, 2])
tensor([[0.9411],
        [0.8422],
        [0.9816]])
tensor([[0.7207, 0.0050],
        [0.2998, 0.8244],
        [0.8220, 0.1230]])
tensor([[1.6618, 0.9462],
        [1.1420, 1.6667],
        [1.8036, 1.1047]])
True


**Basic Tensor Operations**

**Matrix product**

In [24]:
# Compute matrix product
x = torch.rand(3, 2)
y = torch.rand(2, 3)
x.matmul(y)

tensor([[0.6203, 0.4387, 0.5037],
        [0.4763, 0.2250, 0.2514],
        [0.2042, 0.2217, 0.2593]])

In [25]:
# Compute batch matrix product
# a. in x and y below, think of the first dim (of size 2 in both) as the batch size
x = torch.rand(2, 3, 2)
y = torch.rand(2, 2, 3)

# b. bmm() computes the matrix product of each pair (x[i], y[i]) of the batch:
#    (2,3,2) x (2,2,3) -> (2,3,3)
xy = x.bmm(y)
print("Result of x.bmm(y) operation: ", xy) # Note: torch.bmm(x, y) is equivalent to x.bmm(y)

# c. show that the 1st elem along dim=0 of bmm(x,y) matches x[0].matmul(y[0])
# The two results come from different routines, so they are compared with allclose
# (equal up to floating-point tolerance) rather than with exact, bit-for-bit torch.equal.
xy_0 = xy[0]
x0_y0 = torch.matmul(x[0], y[0])
print("Check equality: ", torch.allclose(xy_0, x0_y0))

Result of x.bmm(y) operation:  tensor([[[0.2272, 0.3952, 0.6442],
         [0.0824, 0.1391, 0.2336],
         [0.2350, 0.4159, 0.6666]],

        [[1.1697, 1.0601, 0.6435],
         [1.3658, 1.1937, 0.8389],
         [0.4688, 0.3894, 0.3283]]])
Check equality:  True


In [26]:
# Sequence of operations
# a. create a tensor of size (2,2)
x = torch.Tensor([[2, 4], 
                  [5, 10]])
# b. create a tensor of size (2,1)
y = torch.Tensor([[1], 
                  [1]])
# c. use broadcasting to add 1 to every element of x using y: (2,2) + (2,1) -> (2,2)
# d. matrix multiply the result of c with a tensor of size (2,1): (2,2) x (2,1) -> (2,1)
t = (x + y).mm(torch.Tensor([[10], [20]]))
t

tensor([[130.],
        [280.]])

**Concatenating tensors**

We combine tensors using the cat() function, and the resulting tensor will have a shape that depends on the shape of the two input tensors.

In [27]:
t1 = torch.tensor([[1,2],
                   [3,4]])
t2 = torch.tensor([[5,6],
                   [7,8]])
print(t1)
print(t2)

tensor([[1, 2],
        [3, 4]])
tensor([[5, 6],
        [7, 8]])


In [28]:
# Combine row-wise 
torch.cat((t1, t2), dim=0)

tensor([[1, 2],
        [3, 4],
        [5, 6],
        [7, 8]])

In [29]:
# Combine column-wise (dim=1): the columns of t2 are appended to the right of those of t1
torch.cat((t1, t2), dim=1)

tensor([[1, 2, 5, 6],
        [3, 4, 7, 8]])

**Stacking tensors**

In [30]:
t1 = torch.tensor([
    [1,1,1,1],
    [1,1,1,1],
    [1,1,1,1],
    [1,1,1,1]
])

t2 = torch.tensor([
    [2,2,2,2],
    [2,2,2,2],
    [2,2,2,2],
    [2,2,2,2]
])

t3 = torch.tensor([
    [3,3,3,3],
    [3,3,3,3],
    [3,3,3,3],
    [3,3,3,3]
])

In [31]:
t = torch.stack((t1, t2, t3))
t.shape, t1.shape, t2.shape, t3.shape

(torch.Size([3, 4, 4]),
 torch.Size([4, 4]),
 torch.Size([4, 4]),
 torch.Size([4, 4]))

**Extracting Tensor Values** 

In [32]:
# Slicing
t = torch.Tensor([[1, 2, 3], 
                  [4, 5, 6], 
                  [7, 8, 9]])

# Every row, only the last column
print(t[:, -1])

# First 2 rows, all columns
print(t[:2, :])

# Lower right most corner
print(t[-1:, -1:])


tensor([3., 6., 9.])
tensor([[1., 2., 3.],
        [4., 5., 6.]])
tensor([[9.]])


In [33]:
# If you have a one element tensor, use .item() to get the value as a Python number

x = torch.randn(1)
print(x)
print(x.item())

tensor([-0.2931])
-0.29308679699897766


**Max value**

In [34]:
# use arange to create a vector with the integer values 0..9 (the end value 10 is excluded)
# reshape the vector of size (10) into a matrix of size (2,5)
# reshape() is used instead of the in-place resize_(): resize_() is a low-level method that
# reinterprets the underlying storage, and the PyTorch docs recommend view()/reshape() instead
x = torch.arange(0, 10).reshape(2, 5)

# Return the largest values and their positions
# k: is the number of top values to return
# dim: the dimension to obtain the top-k values from
topk, indices = torch.topk(x, k=2, dim=1)

print(x)
print(topk)
print(indices)

tensor([[0, 1, 2, 3, 4],
        [5, 6, 7, 8, 9]])
tensor([[4, 3],
        [9, 8]])
tensor([[4, 3],
        [4, 3]])


**PyTorch Tensor To and From Numpy ndarray**

You can easily create a tensor from an ndarray and vice versa (`torch.from_numpy(a)` and, for CPU tensors, `t.numpy()`). These operations are fast because the tensor and the ndarray share the same underlying memory, so no data is copied. As a consequence, modifying one of them in place also changes the other.


In [35]:
# Numpy ndarray <--> PyTorch tensor
import numpy as np

# ndarray to tensor
a = np.random.randn(3, 5)
t = torch.from_numpy(a)
print(a)
print(t)
print(type(a))
print(type(t))

[[ 0.24169915 -0.23481165  0.39476747  1.06707743  2.17999221]
 [ 0.12580694  0.20014081  3.6688598   1.98841113  0.64940067]
 [-0.74333261  1.33172357  1.55088031  0.291102   -0.38484785]]
tensor([[ 0.2417, -0.2348,  0.3948,  1.0671,  2.1800],
        [ 0.1258,  0.2001,  3.6689,  1.9884,  0.6494],
        [-0.7433,  1.3317,  1.5509,  0.2911, -0.3848]], dtype=torch.float64)
<class 'numpy.ndarray'>
<class 'torch.Tensor'>



### GPUs

PyTorch tensors have built-in GPU support. Storing tensors in GPU memory and running tensor computations on CUDA cores is easy: `torch.cuda.is_available()` reports whether a CUDA GPU can be used, and a tensor's `cuda()` method (or, equivalently, `to('cuda')`) returns a copy of that tensor on the GPU.


In [None]:
# Check whether a CUDA GPU is available and pick the device accordingly, so that this cell
# also runs on machines without an Nvidia GPU or without a CUDA-enabled PyTorch build
# (calling t.cuda() unconditionally raises an error there).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Move the tensor to the selected device: a copy on the GPU when one is available,
# otherwise the tensor simply stays on the CPU
# Note: if your laptop does not have an Nvidia GPU, you don't need to install the CUDA-enabled version of PyTorch
t.to(device)

## Shuffling and batching data

In [36]:
# Create some dummy data for a binary classification task
# a. the dataset (X, Y) has 200 examples; it is split into training and validation sets below
# b. each example in X is described by 250 features (for e.g. 250 word vocab from a TFIDFVectorizer)
# c. the target to predict has values of 0 or 1. i.e. binary classification
#    (Y has shape (200, 1): the first 100 rows are ones, the last 100 rows are zeros)
X, Y = torch.randn(200, 250), torch.cat((torch.ones(100,1), torch.zeros(100,1)))
from sklearn.model_selection import train_test_split

# Shuffle the examples and hold out 20% of them for validation
X_train, X_valid,Y_train, Y_valid = train_test_split(X, Y, test_size=0.2, shuffle=True)

In [37]:
from torch.utils.data import TensorDataset, DataLoader
# TensorDataset is a ready-to-use class that wraps tensors as a dataset:
# item i is the tuple (features[i], label[i]).
# Note that the input features and labels must have the same length along the first dimension
train_set = TensorDataset(X_train, Y_train)
valid_set = TensorDataset(X_valid, Y_valid)

# DataLoader batches the data (32 examples per batch here) and, with shuffle=True,
# reshuffles it at every epoch. It can also load batches in parallel worker processes
# when num_workers > 0; num_workers is not set here.
# The validation loader is created without shuffling.
train_loader = DataLoader(train_set, batch_size=32, shuffle=True)
valid_loader = DataLoader(valid_set, batch_size=32)

## Specifying a neural network


Within the `__init__` method we define the layers of the module.

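The RNN model defined in the RNN section further below has three kinds of layers: an embedding layer, the RNN itself, and a linear layer (the simpler linear and MLP models that come first only use `nn.Linear` layers). All layers have their parameters initialized to random values, unless explicitly specified.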

The embedding layer is used to transform our sparse one-hot vector (sparse as most of the elements are 0) into a dense embedding vector (dense as the dimensionality is a lot smaller and all the elements are real numbers). This embedding layer is simply a single fully connected layer. As well as reducing the dimensionality of the input to the RNN, there is the theory that words which have a similar impact on the sentiment of the review are mapped close together in this dense vector space.

In [38]:
from torch import tensor
from torch import nn
from torch import sigmoid
import torch.nn.functional as F
import torch.optim as optim

class LinearModel(nn.Module):
    """A single linear layer followed by a sigmoid (a logistic-regression style model)."""

    def __init__(self, model_dimension):
        """
        Initialise the base nn.Module and create the model's only layer.

        model_dimension: number of input features of each example.
        """
        super().__init__()
        # model_dimension features in, one value out
        self.linear = nn.Linear(in_features=model_dimension, out_features=1)

    def forward(self, x):
        """
        Apply the linear layer to the input tensor x (whose last dimension has
        size model_dimension) and squash the result with a sigmoid, giving one
        value between 0 and 1 per example.
        """
        logits = self.linear(x)
        return sigmoid(logits)


# our model
# NOTE: using the dummy data created above (X, Y) of sizes (200,250) and (200,1)
linear_model = LinearModel(model_dimension = X.size(-1))

# Construct our loss function and an Optimizer. The call to linear_model.parameters()
# in the SGD constructor returns the learnable parameters (weight and bias) of the single
# nn.Linear module which is a member of the model.
# BCELoss is the binary cross-entropy loss; it is applied to the model's sigmoid output.
criterion = nn.BCELoss()
optimizer = optim.SGD(linear_model.parameters(), lr=0.01)

# Training loop
num_epochs = 10
# Put the model in training mode
linear_model.train()
for epoch in range(num_epochs):
    for bn, batch in enumerate(train_loader):
        # Each batch is a (features, labels) pair coming from the TensorDataset
        x, y = batch
        # Forward pass: Compute predicted y by passing x to the model
        y_pred = linear_model(x)

        # Compute and print loss
        loss = criterion(y_pred, y)
        print(f'Epoch {epoch + 1}/{num_epochs}, batch num {bn} | Loss: {loss.item():.4f}')

        # Zero gradients, perform a backward pass, and update the weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

print('#'*50)
# Compute loss on validation set
# Switch to evaluation mode and disable gradient tracking: no weights are updated here
linear_model.eval()
with torch.no_grad():
    # The whole validation set is passed through the model in a single call
    y_pred_test = linear_model(X_valid)
    print('Validation set loss: ', criterion(y_pred_test, Y_valid))

Epoch 1/10, batch num 0 | Loss: 0.6784
Epoch 1/10, batch num 1 | Loss: 0.7026
Epoch 1/10, batch num 2 | Loss: 0.6872
Epoch 1/10, batch num 3 | Loss: 0.6870
Epoch 1/10, batch num 4 | Loss: 0.7241
Epoch 2/10, batch num 0 | Loss: 0.6812
Epoch 2/10, batch num 1 | Loss: 0.6298
Epoch 2/10, batch num 2 | Loss: 0.6870
Epoch 2/10, batch num 3 | Loss: 0.6861
Epoch 2/10, batch num 4 | Loss: 0.6951
Epoch 3/10, batch num 0 | Loss: 0.6249
Epoch 3/10, batch num 1 | Loss: 0.6239
Epoch 3/10, batch num 2 | Loss: 0.6888
Epoch 3/10, batch num 3 | Loss: 0.6874
Epoch 3/10, batch num 4 | Loss: 0.6697
Epoch 4/10, batch num 0 | Loss: 0.6501
Epoch 4/10, batch num 1 | Loss: 0.5580
Epoch 4/10, batch num 2 | Loss: 0.6909
Epoch 4/10, batch num 3 | Loss: 0.6208
Epoch 4/10, batch num 4 | Loss: 0.6857
Epoch 5/10, batch num 0 | Loss: 0.5902
Epoch 5/10, batch num 1 | Loss: 0.6743
Epoch 5/10, batch num 2 | Loss: 0.6358
Epoch 5/10, batch num 3 | Loss: 0.6297
Epoch 5/10, batch num 4 | Loss: 0.5915
Epoch 6/10, batch num 0 |

### MLP

In [40]:
import torch
from torch import nn
from torch import sigmoid
import torch.nn.functional as F
import torch.optim as optim


class MLPModel(nn.Module):
    """
    Multi-layer perceptron for binary classification:
    input -> Linear -> ReLU -> Linear -> sigmoid.
    """

    def __init__(self, model_dimension, hidden_dimension=64):
        """
        In the constructor we instantiate the nn.Linear modules.

        model_dimension:  number of input features of each example
        hidden_dimension: number of units in the hidden layer (optional, so existing
                          calls of the form MLPModel(model_dimension) keep working)
        """
        super(MLPModel, self).__init__()
        # Hidden layer: without it (and its non-linearity) the model would only be a
        # single linear layer, i.e. not a multi-layer perceptron
        self.hidden = nn.Linear(model_dimension, hidden_dimension)
        # Output layer: hidden_dimension in and one out
        self.linear = nn.Linear(hidden_dimension, 1)

    def forward(self, x):
        """
        Accept a batch of inputs of shape (batch_size, model_dimension) and return
        predictions of shape (batch_size, 1) with values between 0 and 1.
        """
        # Non-linear hidden representation
        hidden = torch.relu(self.hidden(x))
        # Squash the single output unit into a probability-like value
        y_pred = sigmoid(self.linear(hidden))
        return y_pred

# our model
# NOTE: uses the dummy data (X, X_valid, Y_valid) and the train_loader created in the
# "Shuffling and batching data" section
mlp_model = MLPModel(model_dimension = X.size(-1))
# Binary cross-entropy loss and plain SGD over the model's learnable parameters
criterion = nn.BCELoss()
optimizer = optim.SGD(mlp_model.parameters(), lr=0.01)

# Training loop
num_epochs = 10
# Put the model in training mode
mlp_model.train()
for epoch in range(num_epochs):
    for bn, batch in enumerate(train_loader):
        # Each batch is a (features, labels) pair coming from the TensorDataset
        x, y = batch
        # Forward pass: Compute predicted y by passing x to the model
        y_pred = mlp_model(x)

        # Compute and print loss
        loss = criterion(y_pred, y)
        print(f'Epoch {epoch + 1}/{num_epochs}, batch num {bn} | Loss: {loss.item():.4f}')

        # Zero gradients, perform a backward pass, and update the weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

print('#'*50)
# Compute loss on validation set
# Switch to evaluation mode and disable gradient tracking: no weights are updated here
mlp_model.eval()
with torch.no_grad():
    # The whole validation set is passed through the model in a single call
    y_pred_test = mlp_model(X_valid)
    print('Validation set loss: ', criterion(y_pred_test, Y_valid))

Epoch 1/10, batch num 0 | Loss: 0.8113
Epoch 1/10, batch num 1 | Loss: 0.7186
Epoch 1/10, batch num 2 | Loss: 0.7229
Epoch 1/10, batch num 3 | Loss: 0.6390
Epoch 1/10, batch num 4 | Loss: 0.7212
Epoch 2/10, batch num 0 | Loss: 0.6905
Epoch 2/10, batch num 1 | Loss: 0.7101
Epoch 2/10, batch num 2 | Loss: 0.7361
Epoch 2/10, batch num 3 | Loss: 0.6733
Epoch 2/10, batch num 4 | Loss: 0.6912
Epoch 3/10, batch num 0 | Loss: 0.7093
Epoch 3/10, batch num 1 | Loss: 0.6753
Epoch 3/10, batch num 2 | Loss: 0.6598
Epoch 3/10, batch num 3 | Loss: 0.6487
Epoch 3/10, batch num 4 | Loss: 0.7043
Epoch 4/10, batch num 0 | Loss: 0.7528
Epoch 4/10, batch num 1 | Loss: 0.6081
Epoch 4/10, batch num 2 | Loss: 0.7039
Epoch 4/10, batch num 3 | Loss: 0.5931
Epoch 4/10, batch num 4 | Loss: 0.6410
Epoch 5/10, batch num 0 | Loss: 0.6600
Epoch 5/10, batch num 1 | Loss: 0.6425
Epoch 5/10, batch num 2 | Loss: 0.6014
Epoch 5/10, batch num 3 | Loss: 0.6101
Epoch 5/10, batch num 4 | Loss: 0.6933
Epoch 6/10, batch num 0 |

### RNN

In [41]:
class RNN(nn.Module):
    """
    Sequence classifier: embedding -> single-layer unidirectional GRU -> dropout -> linear.

    Maps a batch of token-index sequences to one score (logit) per label.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, num_labels):
        """
        vocab_size:  number of rows of the embedding table; must be larger than the
                     largest token index that will be looked up
        embed_size:  size of each token embedding
        hidden_size: size of the GRU hidden state
        num_labels:  number of output classes
        """
        super().__init__()

        # Here we define the network layers

        # An embedding layer: a lookup table of vocab_size vectors of size embed_size
        # Assigns to each word in the vocabulary an embedding vector of size embed_size
        self.embed = nn.Embedding(vocab_size, embed_size)

        # A recurrent (GRU) layer to process each input token (represented by its embedding)
        # The GRU network takes as input the embedding (of size embed_size) of the current word
        # and the previous hidden state (of size hidden_size)
        self.rnn = nn.GRU(embed_size, hidden_size, num_layers=1, bidirectional=False, batch_first=True)

        # Drop out layer for regularisation
        self.dropout = nn.Dropout(0.3)

        # Fully connected layer mapping
        # the last hidden state to a vector of size the number of classes
        self.decision = nn.Linear(hidden_size, num_labels)

    def forward(self, x):
        """
        x: tensor of token indices of shape (batch_size, max sent length).
        Returns a tensor of shape (batch_size, num_labels).
        """
        # Here we say how the layers are connected

        # for each token in the input, retrieve the corresponding embeddings
        # x = [batch size, max sent length]
        embed = self.embed(x)

        # Run the RNN on the input embeddings
        # embed = [batch size, sent len, emb dim]
        # output is the sequence of hidden states produced by the RNN
        # hidden is the last hidden state produced
        output, hidden = self.rnn(embed)

        # Because the GRU is created with batch_first=True:
        # output = [batch size, sent len, hidden size]
        # hidden = [num_layers * num_directions, batch size, hidden size] = [1, batch size, hidden size]

        # Apply dropout (for regularisation)
        drop = self.dropout(hidden)

        # Apply the fully connected layer to the output of the dropout
        # drop = [num_layers * num_directions, batch_size, hidden_size]
        # Expected output shape: (batch_size, num_labels)
        # squeeze(0) removes only the leading layer/direction dimension; a bare squeeze()
        # would also remove the batch dimension whenever the batch contains a single example
        return self.decision(drop.squeeze(0)) # or:  self.decision(drop.view(x.size(0), -1))

# NOTE: we will be using a different set of dummy training data from above (X,Y)
# NOTE: X here is of size (200, 250) which corresponds to (number of examples, sequence length)
# NOTE: create X as a randint tensor of size (200,250) with token indices in [0, vocab_size)
vocab_size = 3000
X, Y = torch.randint(vocab_size, (200, 250), dtype=torch.long), torch.cat((torch.ones(100, dtype=torch.long), 
                                                                     torch.zeros(100, dtype=torch.long)))
X_train, X_valid,Y_train, Y_valid = train_test_split(X, Y, test_size=0.2, shuffle=True)
train_set = TensorDataset(X_train, Y_train)
valid_set = TensorDataset(X_valid, Y_valid)

train_loader = DataLoader(train_set, batch_size=32, shuffle=True)
valid_loader = DataLoader(valid_set, batch_size=32)


# 1. instantiate the RNN with the required parameter values
# NOTE: hidden_size and embed_size are the dimensions for the internal layers of the network
# NOTE: the embedding table needs one row per possible token index, so vocab_size is passed
#       directly. Counting the distinct values that happen to occur in X would give a smaller
#       number if some index was never drawn, and looking up a larger index would then fail.
rnn_model = RNN(vocab_size = vocab_size, embed_size= 100, hidden_size = 75, num_labels = len(Y.unique()))
# 2. check for cuda availablilty. set gpu/cpu device usage accordingly
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# 3. set the model to the device to use for computation 
rnn_model.to(device)
print(rnn_model)

# CrossEntropyLoss takes the raw scores produced by the model and integer class labels
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(rnn_model.parameters(), lr=0.01)

# Training loop
rnn_model.train()
num_epochs = 10
for epoch in range(num_epochs):
    for bn, batch in enumerate(train_loader):
        x, y = batch
        # The inputs must live on the same device as the model, otherwise the forward
        # pass fails with a device mismatch when a GPU is used
        x, y = x.to(device), y.to(device)
        # Forward pass: Compute predicted y by passing x to the model
        y_pred = rnn_model(x)

        # Compute and print loss
        loss = criterion(y_pred, y)
        print(f'Epoch {epoch + 1}/{num_epochs}, batch num {bn} | Loss: {loss.item():.4f}')

        # Zero gradients, perform a backward pass, and update the weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

print('#'*50)
# Compute loss on validation set
rnn_model.eval()
with torch.no_grad():
    # Copies of the validation tensors on the model's device (X_valid / Y_valid are left untouched)
    X_valid_on_device, Y_valid_on_device = X_valid.to(device), Y_valid.to(device)
    y_pred_test = rnn_model(X_valid_on_device)
    print('Validation set loss: ', criterion(y_pred_test, Y_valid_on_device))

RNN(
  (embed): Embedding(3000, 100)
  (rnn): GRU(100, 75, batch_first=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (decision): Linear(in_features=75, out_features=2, bias=True)
)
Epoch 1/10, batch num 0 | Loss: 0.7213
Epoch 1/10, batch num 1 | Loss: 0.6757
Epoch 1/10, batch num 2 | Loss: 0.6960
Epoch 1/10, batch num 3 | Loss: 0.7255
Epoch 1/10, batch num 4 | Loss: 0.7369
Epoch 2/10, batch num 0 | Loss: 0.6966
Epoch 2/10, batch num 1 | Loss: 0.7453
Epoch 2/10, batch num 2 | Loss: 0.6703
Epoch 2/10, batch num 3 | Loss: 0.6641
Epoch 2/10, batch num 4 | Loss: 0.6971
Epoch 3/10, batch num 0 | Loss: 0.6648
Epoch 3/10, batch num 1 | Loss: 0.6935
Epoch 3/10, batch num 2 | Loss: 0.7063
Epoch 3/10, batch num 3 | Loss: 0.6829
Epoch 3/10, batch num 4 | Loss: 0.7083
Epoch 4/10, batch num 0 | Loss: 0.6844
Epoch 4/10, batch num 1 | Loss: 0.6834
Epoch 4/10, batch num 2 | Loss: 0.7069
Epoch 4/10, batch num 3 | Loss: 0.6667
Epoch 4/10, batch num 4 | Loss: 0.6949
Epoch 5/10, batch num 0 | Loss: 0.

### Embedding layer

* The nn.Embedding module holds a Tensor of dimension (vocab_size, embedding_size), i.e. of the size of the vocabulary x the dimension of each vector embedding, and a method for retrieving the embedding of a word. 


In [42]:
# Create an embedding layer
# Parameters: (vocab_size, embedding_size)
embedding = nn.Embedding(1000,128)
# Print out the embedding of the token represented by index 3
embedding(torch.LongTensor([3]))

tensor([[-0.8339, -0.8401, -0.4141,  0.7291, -0.3189, -0.7835,  1.1149,  1.6854,
          0.8369, -0.5857, -0.8683, -1.6540,  1.0870, -1.0315, -0.4118,  1.0056,
          0.3092, -1.1671,  0.2132,  0.6444, -1.4036,  1.1835,  2.1626, -1.8322,
         -0.6601,  0.5682, -0.7845, -0.7873, -1.8262, -0.5982, -1.9202, -0.2246,
         -1.1471,  0.4236, -0.2028,  1.4375,  1.2629,  0.8292, -1.4450, -0.3609,
          1.6184,  0.2269, -0.0455, -2.0148, -0.6915, -1.1114,  1.2323, -0.0207,
         -0.5399, -0.4170, -0.4824, -0.4848,  0.4060,  1.1731, -0.6320, -1.4604,
          0.5150, -0.5422, -1.4209, -0.3815, -0.0864, -1.4695,  0.3017,  0.8218,
         -0.0975,  0.8161, -0.1343,  0.2470,  0.9689,  0.4680,  1.6157,  1.0490,
         -0.3120,  0.7378, -0.9465, -0.7687,  0.8759, -1.0310, -0.8165, -1.1310,
          0.6545, -0.8943,  0.4720, -0.6477, -0.8004, -0.0692,  0.1378,  0.7362,
          0.6140,  1.5090, -0.7292, -2.0434, -0.2622, -0.6968,  1.3669, -1.1981,
          0.7943,  1.6262, -

### Linear Layer

Parameters:

* in_features – size of each input sample
* out_features – size of each output sample

In [43]:
import torch
import torch.nn as nn

# A batch of three input samples, each described by two features
x = torch.tensor([
    [1.0, -1.0],
    [0.0, 1.0],
    [0.0, 0.0],
])

# Layer sizes: in_features is read off the input (here 2); one output value per sample
in_features, out_features = x.size(1), 1

# The layer maps inputs of shape (batch_size, in_features)
# to outputs of shape (batch_size, out_features)
m = nn.Linear(in_features=in_features, out_features=out_features)

# Apply the layer to the batch and display the result
y = m(x)
y

tensor([[ 0.6191],
        [-0.3373],
        [-0.1708]], grad_fn=<AddmmBackward0>)