In [1]:
import torch.nn as nn

# Build a single-layer RNN that maps 4 input features to a 3-unit hidden state
rnn = nn.RNN(4, 3, num_layers=1)

# Show the freshly initialised weight matrices, one labelled printout each
for label, weights in (
    ("Input weights W_{hx}:\n", rnn.weight_ih_l0),   # input-to-hidden weights
    ("Hidden weights W_{hh}:\n", rnn.weight_hh_l0),  # hidden-to-hidden weights
):
    print(label, weights)


Input weights W_{hx}:
 Parameter containing:
tensor([[-0.4996,  0.3494,  0.2941,  0.4295],
        [ 0.5082,  0.3466,  0.3223, -0.5622],
        [ 0.2575,  0.0025,  0.1807,  0.1331]], requires_grad=True)
Hidden weights W_{hh}:
 Parameter containing:
tensor([[-0.4459, -0.3941,  0.5222],
        [-0.2701,  0.0718,  0.5033],
        [-0.4235,  0.1081,  0.4351]], requires_grad=True)


## Custom Weight Initialization

In [2]:
import torch
# Materialise the (name, parameter) pairs once so both passes see the same entries
named_params = list(rnn.named_parameters())

# First pass: show that every entry yielded by named_parameters() is a tuple
for entry in named_params:
    print(type(entry))

# Second pass: show the tuples themselves, i.e. (parameter name, Parameter)
for entry in named_params:
    print(entry)


<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
('weight_ih_l0', Parameter containing:
tensor([[-0.4996,  0.3494,  0.2941,  0.4295],
        [ 0.5082,  0.3466,  0.3223, -0.5622],
        [ 0.2575,  0.0025,  0.1807,  0.1331]], requires_grad=True))
('weight_hh_l0', Parameter containing:
tensor([[-0.4459, -0.3941,  0.5222],
        [-0.2701,  0.0718,  0.5033],
        [-0.4235,  0.1081,  0.4351]], requires_grad=True))
('bias_ih_l0', Parameter containing:
tensor([-0.4059,  0.1711,  0.1784], requires_grad=True))
('bias_hh_l0', Parameter containing:
tensor([-0.2112, -0.3734,  0.5158], requires_grad=True))


In [3]:
# Unpack each (name, parameter) pair directly and stop after the first one
for name, param in rnn.named_parameters():
    print(name, param)
    break

weight_ih_l0 Parameter containing:
tensor([[-0.4996,  0.3494,  0.2941,  0.4295],
        [ 0.5082,  0.3466,  0.3223, -0.5622],
        [ 0.2575,  0.0025,  0.1807,  0.1331]], requires_grad=True)


In [4]:
import torch

# Re-draw every weight matrix from the Xavier/Glorot uniform distribution.
# Bias vectors are skipped and keep their default initialisation.
for name, param in rnn.named_parameters():
    if 'weight' not in name:
        continue
    nn.init.xavier_uniform_(param)  # in-place re-initialisation
    print(param)


Parameter containing:
tensor([[ 0.2148,  0.8306,  0.4785, -0.2067],
        [ 0.3863,  0.7520, -0.5327, -0.3851],
        [-0.0748,  0.2377, -0.2322, -0.5978]], requires_grad=True)
Parameter containing:
tensor([[ 0.9204,  0.7638,  0.4592],
        [-0.2544, -0.9840,  0.4678],
        [-0.7433,  0.2672, -0.8830]], requires_grad=True)


### Benefits of Initializing Weights with a Uniform Distribution

Initializing weights using a **uniform distribution** offers several important benefits, particularly in the context of neural networks. Let’s break down the main advantages:

#### 1. Breaking Symmetry

One of the primary reasons for initializing weights with a uniform distribution (or any random distribution) is to **break symmetry** between the neurons. If all weights are initialized to the same value (e.g., zeros), each neuron in a layer will receive the same gradients and update in the same way during training, effectively making them learn the same features. Random initialization with a uniform distribution ensures that neurons start with different weights, allowing them to learn different aspects of the data.

- **Benefit**: Different neurons can learn different features, leading to a more expressive and powerful network.

#### 2. Ensuring Appropriate Scale of Weights

A uniform distribution allows control over the range of the initial weights, which can help keep the initial activations and gradients at a manageable scale, preventing problems like vanishing or exploding gradients. Proper initialization, particularly with small random values, helps ensure that the input signals neither shrink nor grow too much as they propagate through the network.

- **Benefit**: Reduces the risk of vanishing or exploding gradients, leading to more stable and efficient training.

#### 3. Faster Convergence

Random weight initialization using a uniform distribution, especially when coupled with techniques like **Xavier/Glorot** or **He/Kaiming** initialization, helps the network converge faster by providing weights that are well-scaled for the specific activation functions in use (e.g., sigmoid, ReLU). These initialization techniques are designed to keep the variance of the outputs consistent across layers, which can significantly improve the training speed and convergence.

- **Benefit**: Improved training speed and faster convergence to an optimal solution.

#### 4. Flexibility

Using a uniform distribution offers flexibility in controlling the range of initial weights. By specifying the bounds (e.g., `[-a, a]`), you can ensure the initial weights are not too large or too small, which can help avoid large fluctuations or vanishing signals during forward and backward propagation.

- **Benefit**: Control over the range of initial weights, reducing the risk of extreme initial values that can destabilize training.

#### 5. Good for Large Networks

Uniform initialization works well in practice for large networks, as it ensures that each neuron starts with a different weight but within a controlled range. This is especially important in deep neural networks, where improper weight initialization can cause problems as the signals pass through many layers.

- **Benefit**: Uniform initialization provides a practical and scalable solution for initializing weights in large, deep neural networks.

### Common Methods Based on Uniform Distribution

Several commonly used weight initialization techniques are based on uniform distributions. These methods adjust the range of the uniform distribution to account for the size of the input and output layers:

1. **Xavier/Glorot Uniform Initialization**:
   - Uses a uniform distribution with the range dependent on the number of input and output neurons.
   - Designed for use with sigmoid and tanh activation functions.
   
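      $$
      W \sim \mathcal{U}\left( -\sqrt{\frac{6}{n_{\text{in}} + n_{\text{out}}}},\ \sqrt{\frac{6}{n_{\text{in}} + n_{\text{out}}}} \right)
      $$
      where $n_{\text{in}}$ and $n_{\text{out}}$ are the numbers of input and output units of the layer. This is the bound used by `nn.init.xavier_uniform_` with the default gain of 1.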

2. **He/Kaiming Uniform Initialization**:
   - Uses a uniform distribution with the range adjusted for ReLU and variants.
   
   $$
   W \sim \mathcal{U}\left( -\sqrt{\frac{6}{n}}, \sqrt{\frac{6}{n}} \right)
   $$
   where $n$ is the number of input units in the layer.

These methods ensure that the variance of the activations remains consistent across layers, which is critical for efficient training.

### Summary of Benefits

1. **Breaking Symmetry**: Ensures neurons have unique weights, allowing them to learn different features.
2. **Appropriate Weight Scale**: Helps avoid vanishing or exploding gradients by controlling the range of initial weights.
3. **Faster Convergence**: Properly initialized weights lead to faster and more stable convergence during training.
4. **Flexibility**: You can control the bounds of the uniform distribution to fit the network’s needs.
5. **Scalable to Large Networks**: Uniform initialization is practical for large, deep networks and can be tailored for different activation functions.

In conclusion, initializing weights with a uniform distribution ensures that the neural network can learn efficiently from the start, prevents various common issues during training, and improves convergence speed.


In [5]:
import torch
import torch.nn as nn
# Random input for the nn.RNN built in the following cells. nn.RNN defaults to
# batch_first=False, so the layout is (seq_len, batch, input_size) = (5, 3, 10),
# not (batch, row, column).
# NOTE(review): `input` shadows the Python builtin of the same name.
input = torch.randn(5, 3,10)
input

tensor([[[ 1.1499, -0.3657, -0.7299, -0.1267, -0.7077, -0.2641, -2.6540,
          -0.1504, -1.1138,  0.5935],
         [ 0.4498, -0.1167,  1.1558,  1.1256, -0.2923,  1.4538,  0.1645,
          -0.7498,  1.7855,  0.7669],
         [-1.0417,  0.5994, -0.7215,  0.8563,  1.1804, -0.5962,  0.1644,
           1.0475, -0.2483, -0.2710]],

        [[-0.5920,  1.2169,  0.9833,  0.1513,  0.2068, -1.1294, -1.5255,
          -1.4176,  0.9134, -1.3590],
         [ 0.3466,  0.7718, -0.1248,  0.4561, -0.9916,  0.2479, -1.5459,
          -0.3620, -0.1774, -0.7092],
         [-0.5474, -0.7459, -0.4226, -0.5895,  0.0923, -0.4525,  0.3769,
           1.0127, -0.1297,  0.3434]],

        [[ 0.9408, -0.5802,  0.5878,  0.4247, -0.0800, -0.3107,  0.2853,
          -0.2015, -0.3247, -2.3850],
         [-0.1829,  1.1241,  0.5454, -0.7379, -1.5108, -0.7592, -1.0533,
          -0.5271, -0.0379,  1.5924],
         [ 0.1004, -0.4802, -0.9359,  0.6083,  0.7245, -0.2818,  1.2587,
          -0.9187, -1.0562,  0.5173

In [6]:
# Two stacked RNN layers: 10 input features per step, 20 hidden units per layer
rnn = nn.RNN(input_size=10, hidden_size=20, num_layers=2)

In [7]:
# Display the module repr to confirm its configuration
rnn

RNN(10, 20, num_layers=2)

In [8]:
# Random initial hidden state, laid out as (num_layers, batch, hidden_size)
h0 = torch.randn((2, 3, 20))

In [7]:
# Display the initial hidden state tensor
h0

tensor([[[ 1.5996,  0.2440, -0.5369,  1.4706, -0.0229,  1.6173,  1.5830,
          -0.1642, -0.0112, -0.3083,  0.4081,  0.3066,  1.3025, -0.8954,
          -1.3332,  0.6373,  0.1867, -0.6263,  0.9040,  0.6838],
         [-1.3037, -0.7867,  0.1945,  0.2044,  1.7548, -1.0303, -1.3641,
           0.6604,  0.2993,  1.7027, -0.3290,  0.0372, -1.1426,  0.1028,
          -0.5673,  0.2967,  1.3292, -1.8374, -0.1199,  0.9783],
         [-0.9179,  0.6312,  0.1442, -0.1914, -0.1751, -0.1297, -0.3323,
          -0.3465,  0.5814, -0.2075,  0.4553, -0.2455, -0.2824,  0.0291,
          -2.5605, -0.1180,  0.7881,  0.7495,  1.2218, -0.8857]],

        [[ 2.6250, -1.0780,  0.1782, -1.7311,  0.4933, -0.8868, -0.0901,
           0.0348, -0.0034,  1.5636, -0.8918, -0.1658,  1.1308,  0.4559,
           1.1487, -0.2816,  0.6065, -1.7440,  1.1548, -0.5164],
         [-0.7567,  0.3737, -0.5100, -0.8625, -0.0104,  0.8632,  1.7742,
          -0.0511, -1.0769, -0.2410, -0.2307, -0.2506, -0.3763, -0.1372,
        

In [9]:
# Run the whole sequence through the RNN starting from h0.
# `output` holds the last layer's hidden state at every time step;
# `hn` holds the final hidden state of each layer.
output, hn = rnn(input, h0)

In [10]:
# (seq_len, batch, hidden_size): one 20-d hidden state per time step and sequence
output.shape

torch.Size([5, 3, 20])

In [10]:
# nn.Dropout(p=0.2) zeroes each element independently with probability 0.2 while
# the module is in training mode (the default for a freshly constructed module)
# and rescales the surviving elements by 1 / (1 - p) = 1.25.
# NOTE(review): this cell rebinds `input` and `output` used by the RNN cells above.
m = nn.Dropout(p=0.2)
input = torch.randn(20, 16)
print(input)
output = m(input)
# Compare with the printed input: kept entries are 1.25x larger, dropped ones are 0.
output
# Side note for the DecoderRNN defined later in this notebook (unrelated to this
# dropout demo): its nn.Linear layer is a fully connected (dense) layer that maps
# each LSTM output to unnormalised scores (logits) over the vocabulary; the
# softmax that turns them into probabilities is applied inside CrossEntropyLoss.

tensor([[-4.1692e-01,  3.3775e-01,  1.0347e+00, -5.0556e-01,  6.3558e-01,
         -2.2188e-01, -2.9829e-01,  2.1201e-01, -1.5060e-01, -1.4928e+00,
          1.0425e+00, -6.3973e-01, -3.1927e+00,  4.4534e-01, -9.0239e-01,
          9.9398e-02],
        [-1.4698e+00,  4.5913e-01, -1.1272e+00,  2.7887e-01,  5.8610e-01,
         -1.1392e+00, -8.3770e-01, -7.2372e-01,  6.0096e-01,  4.3577e-01,
          6.3487e-01, -1.4629e+00,  5.6206e-01, -2.1030e-01, -9.3288e-01,
         -8.2281e-01],
        [-9.7664e-01, -7.6996e-01, -8.7964e-01,  3.4571e-01, -3.5144e-01,
         -2.9372e-02, -5.8434e-01, -1.4216e+00, -3.0299e-01, -1.5231e+00,
         -2.1749e-02,  1.6886e-01, -7.8691e-01, -5.1211e-01,  4.5315e-01,
         -6.5712e-01],
        [-1.7187e-01,  1.1224e-01,  2.2634e+00,  4.5089e-01,  6.6332e-01,
         -5.6883e-01, -2.0975e+00, -2.8912e-01,  1.8391e+00, -2.1161e-01,
         -1.3850e-01, -2.9966e-01, -3.2739e-01, -8.4278e-03, -6.9303e-01,
          2.3019e+00],
        [-1.2751e+00

tensor([[-5.2115e-01,  4.2219e-01,  0.0000e+00, -0.0000e+00,  7.9448e-01,
         -2.7735e-01, -3.7287e-01,  0.0000e+00, -1.8824e-01, -1.8660e+00,
          0.0000e+00, -7.9966e-01, -3.9909e+00,  5.5667e-01, -1.1280e+00,
          1.2425e-01],
        [-1.8373e+00,  5.7391e-01, -1.4090e+00,  3.4859e-01,  7.3263e-01,
         -1.4239e+00, -0.0000e+00, -9.0464e-01,  7.5120e-01,  0.0000e+00,
          0.0000e+00, -1.8286e+00,  7.0258e-01, -2.6288e-01, -1.1661e+00,
         -1.0285e+00],
        [-1.2208e+00, -9.6245e-01, -1.0996e+00,  4.3214e-01, -4.3929e-01,
         -3.6715e-02, -0.0000e+00, -1.7770e+00, -3.7874e-01, -1.9039e+00,
         -0.0000e+00,  2.1108e-01, -9.8364e-01, -0.0000e+00,  5.6643e-01,
         -8.2140e-01],
        [-0.0000e+00,  1.4030e-01,  2.8292e+00,  5.6361e-01,  8.2915e-01,
         -7.1104e-01, -2.6219e+00, -0.0000e+00,  2.2988e+00, -0.0000e+00,
         -0.0000e+00, -3.7457e-01, -4.0924e-01, -1.0535e-02, -8.6629e-01,
          2.8774e+00],
        [-0.0000e+00

In [None]:
import torch
import torch.nn as nn

# NOTE(review): unfinished draft. forward() stops after the embedding lookup and
# implicitly returns None. A complete DecoderRNN with the same name is defined in
# a later cell of this notebook and replaces this class once that cell runs.
class DecoderRNN(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers=1):
        super(DecoderRNN, self).__init__()
        # Word index -> dense vector of size embed_size
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # Sequence model over embed_size-wide inputs; batch is the first dimension
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        # Maps each LSTM output vector to one score per vocabulary word
        self.fc = nn.Linear(hidden_size, vocab_size)
        self.hidden_size = hidden_size
        # Defined but not used anywhere in this draft
        self.dropout = nn.Dropout(0.5)
    def forward(self, features, captions):
        """
        Forward pass of the decoder (incomplete in this draft).

        Arguments:
        - features: Tensor of shape (batch_size, feature_size=512); not used yet
        - captions: Tensor of shape (batch_size, max_caption_length), word indices

        Returns:
        - Intended: Tensor of shape (batch_size, max_caption_length, vocab_size)
          with word predictions. Currently nothing is returned (None).
        """
        # Embed every caption token except the last one of each row
        # (the <end> token when captions are not padded)
        embedding = self.embedding(captions[:, :-1])
        


In [1]:
import torch
import torch.nn as nn

class DecoderRNN(nn.Module):
    """LSTM decoder that scores caption words given an image feature vector.

    The image feature vector is fed to the LSTM as the first time step and is
    followed by the embedded caption tokens (teacher forcing). Every LSTM input
    step therefore has to be ``embed_size`` wide.

    Args:
        embed_size: Width of the word embeddings and of every LSTM input step.
        hidden_size: Width of the LSTM hidden state.
        vocab_size: Number of words in the vocabulary.
        num_layers: Number of stacked LSTM layers.
        feature_size: Width of the incoming image features. ``None`` (default)
            means the features are already ``embed_size`` wide and are used
            unchanged. Any other value different from ``embed_size`` adds a
            learned linear projection ``feature_size -> embed_size`` so that
            e.g. 512-wide CNN features can be combined with 256-wide embeddings.
    """

    def __init__(self, embed_size, hidden_size, vocab_size, num_layers=1, feature_size=None):
        super().__init__()

        # Embedding layer: converts word indices into dense vectors of size embed_size
        self.embedding = nn.Embedding(vocab_size, embed_size)

        # LSTM over embed_size-wide steps (image feature step + embedded tokens)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)

        # Fully connected layer mapping each LSTM output to vocab_size logits
        self.fc = nn.Linear(hidden_size, vocab_size)

        # Sizes kept for reference and for shape validation in forward()
        self.hidden_size = hidden_size
        self.embed_size = embed_size

        # NOTE(review): this dropout layer is defined but not applied in forward();
        # it is kept so the module's attributes stay unchanged.
        self.dropout = nn.Dropout(0.5)

        # Created last so the initial values of the layers above do not depend on
        # whether a projection is requested. Identity adds no parameters, so the
        # default configuration has the same state_dict keys as before.
        if feature_size is None or feature_size == embed_size:
            self.feature_proj = nn.Identity()
        else:
            self.feature_proj = nn.Linear(feature_size, embed_size)

    def forward(self, features, captions):
        """
        Forward pass of the decoder.

        Arguments:
        - features: Tensor of shape (batch_size, feature_size), where feature_size
          is the constructor's ``feature_size`` (or ``embed_size`` when it was
          left as None)
        - captions: Tensor of shape (batch_size, max_caption_length), word indices

        Returns:
        - outputs: Tensor of shape (batch_size, max_caption_length, vocab_size)
          holding unnormalised word scores. Position 0 is produced from the image
          features, position t > 0 from caption token t - 1, so position t is the
          prediction for caption token t.

        Raises:
        - RuntimeError: if ``features`` is not 2-D or its width does not match
          the width this decoder was configured for.
        """
        if features.dim() != 2:
            raise RuntimeError(
                f"features must have shape (batch_size, feature_size), got {tuple(features.shape)}"
            )

        # Embed every caption token except the last one of each row: the last
        # token is only ever a prediction target, never an input.
        embeddings = self.embedding(captions[:, :-1])

        # Bring the image features to the embedding width (no-op by default)
        first_step = self.feature_proj(features)
        if first_step.size(-1) != self.embed_size:
            raise RuntimeError(
                f"image features are {first_step.size(-1)} wide but LSTM inputs must be "
                f"{self.embed_size} wide; pass feature_size={first_step.size(-1)} to "
                f"DecoderRNN to add a projection"
            )

        # The image features become the input of the first time step
        first_step = first_step.unsqueeze(1)  # (batch_size, 1, embed_size)
        lstm_input = torch.cat((first_step, embeddings), dim=1)  # (batch_size, max_caption_length, embed_size)

        # Run the sequence through the LSTM; the final (h, c) state is not needed
        lstm_out, _ = self.lstm(lstm_input)

        # Map every time step's LSTM output to vocabulary logits
        outputs = self.fc(lstm_out)

        return outputs


In [1]:
import torch.optim as optim

# NOTE(review): this cell relies on names that are not defined anywhere in this
# notebook: `vocab`, `pretrained_resnet_model` and `data_loader` (the recorded
# output is a NameError for `vocab`). It also uses `torch` and `nn`, which are
# imported in earlier cells. Define or load these before running the cell.

# Hyperparameters
embed_size = 256
hidden_size = 512
vocab_size = len(vocab)  # Vocabulary size
num_epochs = 10
learning_rate = 0.001
log_interval = 10  # Log every 10 batches

# Initialize the model, loss function, and optimizer
decoder = DecoderRNN(embed_size, hidden_size, vocab_size, num_layers=1)
# CrossEntropyLoss takes raw logits and integer class targets (multi-class classification)
# NOTE(review): `lengths` from the data loader is unused, so if captions are
# padded the padded positions also contribute to the loss; consider
# CrossEntropyLoss(ignore_index=<padding index>) once that index is known.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)

# Assume you have a pretrained CNN (e.g., ResNet) model as your feature extractor
cnn_model = pretrained_resnet_model  # The CNN for image feature extraction

# Set the CNN to evaluation mode (fixes dropout / batch-norm behaviour; gradient
# tracking is disabled separately with torch.no_grad() below)
cnn_model.eval()

# Training loop
for epoch in range(num_epochs):
    decoder.train()  # Set the decoder to training mode

    for i, (images, captions, lengths) in enumerate(data_loader):

        # Forward pass: pass the images through the CNN to get features
        with torch.no_grad():  # No need to compute gradients for the CNN
            # NOTE(review): presumably (batch_size, feature_size=512). DecoderRNN
            # concatenates the features with embed_size-wide token embeddings, so
            # the feature width has to equal embed_size (256 here) - confirm.
            features = cnn_model(images)

        # Zero the gradients for the optimizer
        optimizer.zero_grad()

        # Forward pass: pass the features and captions through the RNN
        outputs = decoder(features, captions)  # (batch_size, max_caption_length, vocab_size)

        # The decoder emits one prediction per caption position: position 0 comes
        # from the image features and position t from caption token t - 1, so the
        # target for position t is caption token t. The targets are therefore the
        # full captions; slicing off the first token (as before) left
        # batch_size * (max_caption_length - 1) targets for
        # batch_size * max_caption_length predictions, which CrossEntropyLoss rejects.
        targets = captions.reshape(-1)
        loss = criterion(outputs.reshape(-1, vocab_size), targets)

        # Backpropagation
        loss.backward()

        # Update the model's parameters
        optimizer.step()

        # Logging the loss every log_interval
        if i % log_interval == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(data_loader)}], Loss: {loss.item():.4f}")


NameError: name 'vocab' is not defined