# prerun

In [2]:
# imports
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.utils.rnn as rnn_utils
from torch.utils.data import Dataset, DataLoader, TensorDataset
import torchvision.models as models
from torchvision import transforms

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import cv2

import plotly.graph_objects as go
import plotly.graph_objs as go
from plotly.subplots import make_subplots


import plotly.io as pio
# Set the renderer to 'notebook'
pio.renderers.default = 'notebook'


from collections import Counter
import itertools


# downloading dataset

In [None]:
#download dataset
from google.colab import files
files.upload()

!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

!kaggle datasets download -d tomtillo/satellite-image-caption-generation

!unzip satellite-image-caption-generation.zip


# prepare x and y tensors

In [3]:
#creating images tensor and descriptions tensor
def load_images_and_descriptions(csv_file):
    df = csv_file
    images = []
    descriptions = []

    for _, row in df.iterrows():
        captions = str(row['captions'])
        img_path = f"/kaggle/input/satellite-image-caption-generation/{row['filepath']}"
        img = cv2.imread(img_path)
        if img is None:
            print(f"Warning: Image {img_path} not found or cannot be loaded.")
            continue
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = cv2.resize(img, (50,50))

        img_tensor = torch.tensor(img, dtype=torch.float32).permute(2, 0, 1)

        #adding normalization phase
        normalize = transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
        img_tensor = normalize(img_tensor)

        images.append(img_tensor)
        descriptions.append(captions)

    images_tensor = torch.stack(images)
    descriptions_tensor = descriptions


    return images_tensor, descriptions_tensor


In [23]:
#for training
csv_file = '/kaggle/input/satellite-image-caption-generation/train.csv'
df = pd.read_csv(csv_file)
df = df.head(400)

images_tensor, descriptions_tensor = load_images_and_descriptions(df)

#check the shapes
print(images_tensor.shape)
print(len(descriptions_tensor))

#for validation
csv_file2 = '/kaggle/input/satellite-image-caption-generation/valid.csv'
df2 = pd.read_csv(csv_file2)
df2 = df2.head(100)

images_val, descriptions_val = load_images_and_descriptions(df2)

#check the shapes
print(images_val.shape)
print(len(descriptions_val))



torch.Size([400, 3, 50, 50])
400
torch.Size([100, 3, 50, 50])
100


# prepare vocab

In [5]:
# Define vocabulary class (to store all of our words)
class Vocabulary:
    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 0

    def add_word(self, word):
        if word not in self.word2idx:
            self.word2idx[word] = self.idx
            self.idx2word[self.idx] = word
            self.idx += 1

    def __call__(self, word):
        if word not in self.word2idx:
            return self.word2idx['<unk>']
        return self.word2idx[word]

    def __len__(self):
        return len(self.word2idx)

def build_vocab(captions, threshold=1):
    counter = Counter(itertools.chain(*[caption.split() for caption in captions]))
    words = [word for word, cnt in counter.items() if cnt >= threshold]

    vocab = Vocabulary()
    vocab.add_word('<pad>')
    vocab.add_word('<start>')
    vocab.add_word('<end>')
    vocab.add_word('<unk>')
    for word in words:
        vocab.add_word(word)

    return vocab


# Build the vocabulary instance
vocab = build_vocab(descriptions_tensor)

# Tokenize captions
def tokenize_caption(caption, vocab):
    tokens = [vocab('<start>')] + [vocab(word) for word in caption.split()] + [vocab('<end>')]
    return tokens

tokenized_captions = [tokenize_caption(caption, vocab) for caption in descriptions_tensor]

max_length = max(len(caption) for caption in tokenized_captions)
padded_captions = [caption + [vocab('<pad>')] * (max_length - len(caption)) for caption in tokenized_captions]
captions_tensor = torch.tensor(padded_captions, dtype=torch.long)

# Tokenize and pad validation captions (trim long captions)
val_tokenized_captions = [tokenize_caption(caption, vocab) for caption in descriptions_val]
padded_val_captions = [caption + [vocab('<pad>')] * (max_length - len(caption))
                       if len(caption) < max_length
                       else caption[:max_length] # Trim captions longer than max_length
                       for caption in val_tokenized_captions]
captions_val_tensor = torch.tensor(padded_val_captions, dtype=torch.long)



# Model1 (base model with LSTM)

In [None]:
#classic model arch

# Prepare data loaders
train_dataset = TensorDataset(images_tensor, captions_tensor)
val_dataset = TensorDataset(images_val, captions_val_tensor)

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=2, shuffle=False)

# Define the model
class ImageCaptioningModel(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers):
        super(ImageCaptioningModel, self).__init__()
        self.resnet = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)
        for param in self.resnet.parameters():
            param.requires_grad = False
        self.resnet.fc = nn.Linear(self.resnet.fc.in_features, embed_size)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, images, captions):
        features = self.resnet(images)
        embeddings = self.embedding(captions)
        embeddings = torch.cat((features.unsqueeze(1), embeddings), 1)
        hiddens, _ = self.lstm(embeddings)
        outputs = self.fc(hiddens)
        return outputs


# Hyperparameters
embed_size = 256
hidden_size = 512
num_layers = 1
vocab_size = len(vocab)
num_epochs = 10

# Initialize model, loss function, and optimizer
model = ImageCaptioningModel(embed_size, hidden_size, vocab_size, num_layers)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training and validation
train_losses = []
val_losses = []
train_accuracies = []
val_accuracies = []

for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    correct = 0
    total = 0
    for images, captions in train_loader:
        optimizer.zero_grad()
        outputs = model(images, captions[:, :-1])
        loss = criterion(outputs[:, 1:].reshape(-1, vocab_size), captions[:, 1:].reshape(-1))
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

        _, predicted = outputs.max(2)
        total += captions[:, 1:].numel()
        correct += (predicted[:, 1:] == captions[:, 1:]).sum().item()

    train_losses.append(train_loss / len(train_loader))
    train_accuracies.append(100. * correct / total)

    model.eval()
    val_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for images, captions in val_loader:
            outputs = model(images, captions[:, :-1])
            loss = criterion(outputs[:, 1:].reshape(-1, vocab_size), captions[:, 1:].reshape(-1))
            val_loss += loss.item()

            _, predicted = outputs.max(2)
            total += captions[:, 1:].numel()
            correct += (predicted[:, 1:] == captions[:, 1:]).sum().item()

    val_losses.append(val_loss / len(val_loader))
    val_accuracies.append(100. * correct / total)

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {train_loss:.4f}, Accuracy: {train_accuracies[-1]:.2f}%, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracies[-1]:.2f}%')

# Plotting the results with Plotly
fig = make_subplots(rows=1, cols=2, subplot_titles=('Loss', 'Accuracy'))

# Plot Loss
fig.add_trace(go.Scatter(x=list(range(num_epochs)), y=train_losses, mode='lines', name='Train Loss'), row=1, col=1)
fig.add_trace(go.Scatter(x=list(range(num_epochs)), y=val_losses, mode='lines', name='Val Loss'), row=1, col=1)

# Plot Accuracy
fig.add_trace(go.Scatter(x=list(range(num_epochs)), y=train_accuracies, mode='lines', name='Train Accuracy'), row=1, col=2)
fig.add_trace(go.Scatter(x=list(range(num_epochs)), y=val_accuracies, mode='lines', name='Val Accuracy'), row=1, col=2)

fig.update_layout(title='Training and Validation Metrics', xaxis_title='Epoch', yaxis_title='Value')
fig.show()


# Model2 (base model with LSTM)

In [12]:
# Define the model
class ImageCaptioningModel(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers):
        super(ImageCaptioningModel, self).__init__()
        self.resnet = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)
        for param in self.resnet.parameters():
            param.requires_grad = False
        self.resnet.fc = nn.Linear(self.resnet.fc.in_features, embed_size)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.gru = nn.GRU(embed_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, images, captions):
        features = self.resnet(images)
        embeddings = self.embedding(captions)
        embeddings = torch.cat((features.unsqueeze(1), embeddings), 1)
        hiddens, _ = self.gru(embeddings)
        outputs = self.fc(hiddens)
        return outputs

# Hyperparameters
embed_size = 256
hidden_size = 512
num_layers = 1
vocab_size = len(vocab)
num_epochs = 20

# Initialize model, loss function, and optimizer
model = ImageCaptioningModel(embed_size, hidden_size, vocab_size, num_layers)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training and validation
train_losses = []
val_losses = []
train_accuracies = []
val_accuracies = []

for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    correct = 0
    total = 0
    for images, captions in train_loader:
        optimizer.zero_grad()
        outputs = model(images, captions[:, :-1])
        loss = criterion(outputs[:, 1:].reshape(-1, vocab_size), captions[:, 1:].reshape(-1))
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

        _, predicted = outputs.max(2)
        total += captions[:, 1:].numel()
        correct += (predicted[:, 1:] == captions[:, 1:]).sum().item()

    train_losses.append(train_loss / len(train_loader))
    train_accuracies.append(100. * correct / total)

    model.eval()
    val_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for images, captions in val_loader:
            outputs = model(images, captions[:, :-1])
            loss = criterion(outputs[:, 1:].reshape(-1, vocab_size), captions[:, 1:].reshape(-1))
            val_loss += loss.item()

            _, predicted = outputs.max(2)
            total += captions[:, 1:].numel()
            correct += (predicted[:, 1:] == captions[:, 1:]).sum().item()

    val_losses.append(val_loss / len(val_loader))
    val_accuracies.append(100. * correct / total)

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {train_loss:.4f}, Accuracy: {train_accuracies[-1]:.2f}%, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracies[-1]:.2f}%')

# Plotting the results with Plotly
fig = make_subplots(rows=1, cols=2, subplot_titles=('Loss', 'Accuracy'))

# Plot Loss
fig.add_trace(go.Scatter(x=list(range(num_epochs)), y=train_losses, mode='lines', name='Train Loss'), row=1, col=1)
fig.add_trace(go.Scatter(x=list(range(num_epochs)), y=val_losses, mode='lines', name='Val Loss'), row=1, col=1)

# Plot Accuracy
fig.add_trace(go.Scatter(x=list(range(num_epochs)), y=train_accuracies, mode='lines', name='Train Accuracy'), row=1, col=2)
fig.add_trace(go.Scatter(x=list(range(num_epochs)), y=val_accuracies, mode='lines', name='Val Accuracy'), row=1, col=2)

fig.update_layout(title='Training and Validation Metrics', xaxis_title='Epoch', yaxis_title='Value')
fig.show()


Epoch [1/10], Loss: 621.8923, Accuracy: 68.31%, Val Loss: 176.9488, Val Accuracy: 66.52%

Epoch [2/10], Loss: 350.2467, Accuracy: 77.72%, Val Loss: 175.5991, Val Accuracy: 67.84%

Epoch [3/10], Loss: 258.7711, Accuracy: 82.13%, Val Loss: 178.5295, Val Accuracy: 68.48%

Epoch [4/10], Loss: 196.0293, Accuracy: 85.78%, Val Loss: 182.2837, Val Accuracy: 68.78%

Epoch [5/10], Loss: 151.5640, Accuracy: 88.59%, Val Loss: 184.9597, Val Accuracy: 69.27%

Epoch [6/10], Loss: 120.9619, Accuracy: 91.15%, Val Loss: 189.5154, Val Accuracy: 69.75%

Epoch [7/10], Loss: 101.1327, Accuracy: 92.53%, Val Loss: 198.8513, Val Accuracy: 69.22%

Epoch [8/10], Loss: 84.8167, Accuracy: 93.72%, Val Loss: 205.4597, Val Accuracy: 69.86%

Epoch [9/10], Loss: 74.5751, Accuracy: 94.57%, Val Loss: 207.8507, Val Accuracy: 69.87%

Epoch [10/10], Loss: 69.4296, Accuracy: 94.94%, Val Loss: 212.7086, Val Accuracy: 69.47%


# Model3
*increasing accuracy* 

*   2LSTM
*   Drop outs
*   batch normalizations
*   lr = 0.0001





In [18]:
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Prepare data loaders
train_dataset = TensorDataset(images_tensor, captions_tensor)
val_dataset = TensorDataset(images_val, captions_val_tensor)

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=2, shuffle=False)

# Define the model
class ImageCaptioningModel(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers, dropout=0.3):
        super(ImageCaptioningModel, self).__init__()
        self.resnet = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)
        for param in self.resnet.parameters():
            param.requires_grad = False
        self.resnet.fc = nn.Linear(self.resnet.fc.in_features, embed_size)

        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True, dropout=dropout)

        self.fc = nn.Linear(hidden_size, vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.batch_norm1 = nn.BatchNorm1d(embed_size)
        self.batch_norm2 = nn.BatchNorm1d(hidden_size)

    def forward(self, images, captions):
        features = self.resnet(images)
        features = self.batch_norm1(features)

        embeddings = self.embedding(captions)
        embeddings = torch.cat((features.unsqueeze(1), embeddings), 1)
        embeddings = self.dropout(embeddings)

        hiddens, _ = self.lstm(embeddings)

        hiddens = hiddens.transpose(1, 2)
        hiddens = self.batch_norm2(hiddens)

        hiddens = hiddens.transpose(1, 2)
        hiddens = self.dropout(hiddens)

        outputs = self.fc(hiddens)
        return outputs

# Hyperparameters
embed_size = 256
hidden_size = 512
num_layers = 2
vocab_size = len(vocab)
num_epochs = 20
learning_rate = 0.0001
dropout = 0.3

# Initialize model, loss function, and optimizer
model = ImageCaptioningModel(embed_size, hidden_size, vocab_size, num_layers, dropout).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training and validation
train_losses = []
val_losses = []
train_accuracies = []
val_accuracies = []

for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    correct = 0
    total = 0
    for images, captions in train_loader:
        images, captions = images.to(device), captions.to(device)
        optimizer.zero_grad()
        outputs = model(images, captions[:, :-1])
        loss = criterion(outputs[:, 1:].reshape(-1, vocab_size), captions[:, 1:].reshape(-1))
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

        _, predicted = outputs.max(2)
        total += captions[:, 1:].numel()
        correct += (predicted[:, 1:] == captions[:, 1:]).sum().item()

    train_losses.append(train_loss / len(train_loader))
    train_accuracies.append(100. * correct / total)

    model.eval()
    val_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for images, captions in val_loader:
            images, captions = images.to(device), captions.to(device)
            outputs = model(images, captions[:, :-1])
            loss = criterion(outputs[:, 1:].reshape(-1, vocab_size), captions[:, 1:].reshape(-1))
            val_loss += loss.item()

            _, predicted = outputs.max(2)
            total += captions[:, 1:].numel()
            correct += (predicted[:, 1:] == captions[:, 1:]).sum().item()

    val_losses.append(val_loss / len(val_loader))
    val_accuracies.append(100. * correct / total)

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {train_loss:.4f}, Accuracy: {train_accuracies[-1]:.2f}%, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracies[-1]:.2f}%')

# Plotting the results with Plotly
fig = make_subplots(rows=1, cols=2, subplot_titles=('Loss', 'Accuracy'))

# Plot Loss
fig.add_trace(go.Scatter(x=list(range(num_epochs)), y=train_losses, mode='lines', name='Train Loss'), row=1, col=1)
fig.add_trace(go.Scatter(x=list(range(num_epochs)), y=val_losses, mode='lines', name='Val Loss'), row=1, col=1)

# Plot Accuracy
fig.add_trace(go.Scatter(x=list(range(num_epochs)), y=train_accuracies, mode='lines', name='Train Accuracy'), row=1, col=2)
fig.add_trace(go.Scatter(x=list(range(num_epochs)), y=val_accuracies, mode='lines', name='Val Accuracy'), row=1, col=2)

fig.update_layout(title='Training and Validation Metrics', xaxis_title='Epoch', yaxis_title='Value')
fig.show()


Epoch [1/20], Loss: 530.6340, Accuracy: 55.84%, Val Loss: 103.9975, Val Accuracy: 62.59%
Epoch [2/20], Loss: 329.5927, Accuracy: 67.88%, Val Loss: 92.8582, Val Accuracy: 65.59%
Epoch [3/20], Loss: 275.1061, Accuracy: 71.80%, Val Loss: 89.5889, Val Accuracy: 67.14%
Epoch [4/20], Loss: 243.0804, Accuracy: 73.72%, Val Loss: 87.1720, Val Accuracy: 67.82%
Epoch [5/20], Loss: 221.2857, Accuracy: 74.92%, Val Loss: 85.6500, Val Accuracy: 68.31%
Epoch [6/20], Loss: 202.9956, Accuracy: 76.31%, Val Loss: 85.6310, Val Accuracy: 68.51%
Epoch [7/20], Loss: 189.6923, Accuracy: 77.24%, Val Loss: 85.4069, Val Accuracy: 69.03%
Epoch [8/20], Loss: 177.9456, Accuracy: 78.37%, Val Loss: 83.3614, Val Accuracy: 69.41%
Epoch [9/20], Loss: 167.8100, Accuracy: 79.17%, Val Loss: 85.2541, Val Accuracy: 69.29%
Epoch [10/20], Loss: 157.7190, Accuracy: 80.23%, Val Loss: 84.7215, Val Accuracy: 70.25%
Epoch [11/20], Loss: 149.1948, Accuracy: 80.75%, Val Loss: 84.6831, Val Accuracy: 70.31%
Epoch [12/20], Loss: 141.0497

# Model4
*decreasing overfitting and creating effecient epochs curve* 
* LR scheduler
* regulization


In [16]:
#L1 + scheduler

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Prepare data loaders
train_dataset = TensorDataset(images_tensor, captions_tensor)
val_dataset = TensorDataset(images_val, captions_val_tensor)

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=2, shuffle=False)

# Define the model
class ImageCaptioningModel(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers, dropout=0.3):
        super(ImageCaptioningModel, self).__init__()
        self.resnet = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)
        for param in self.resnet.parameters():
            param.requires_grad = False
        self.resnet.fc = nn.Linear(self.resnet.fc.in_features, embed_size)

        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True, dropout=dropout)

        self.fc = nn.Linear(hidden_size, vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.batch_norm1 = nn.BatchNorm1d(embed_size)
        self.batch_norm2 = nn.BatchNorm1d(hidden_size)

    def forward(self, images, captions):
        features = self.resnet(images)
        features = self.batch_norm1(features)

        embeddings = self.embedding(captions)
        embeddings = torch.cat((features.unsqueeze(1), embeddings), 1)
        embeddings = self.dropout(embeddings)

        hiddens, _ = self.lstm(embeddings)

        hiddens = hiddens.transpose(1, 2)
        hiddens = self.batch_norm2(hiddens)

        hiddens = hiddens.transpose(1, 2)
        hiddens = self.dropout(hiddens)

        outputs = self.fc(hiddens)
        return outputs

# Hyperparameters
embed_size = 256
hidden_size = 512
num_layers = 2
vocab_size = len(vocab)
num_epochs = 20
learning_rate = 0.01
dropout = 0.3
lr_decay = 0.9
l1_lambda = 0.0001

# Initialize model, loss function, and optimizer
model = ImageCaptioningModel(embed_size, hidden_size, vocab_size, num_layers, dropout).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=lr_decay)

# Training and validation
train_losses = []
val_losses = []
train_accuracies = []
val_accuracies = []

for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    correct = 0
    total = 0
    for images, captions in train_loader:
        images, captions = images.to(device), captions.to(device)
        optimizer.zero_grad()
        outputs = model(images, captions[:, :-1])
        loss = criterion(outputs[:, 1:].reshape(-1, vocab_size), captions[:, 1:].reshape(-1))

        # L1 regularization
        l1_norm = sum(p.abs().sum() for p in model.parameters())
        loss += l1_lambda * l1_norm

        loss.backward()
        optimizer.step()
        train_loss += loss.item()

        _, predicted = outputs.max(2)
        total += captions[:, 1:].numel()
        correct += (predicted[:, 1:] == captions[:, 1:]).sum().item()

    train_losses.append(train_loss / len(train_loader))
    train_accuracies.append(100. * correct / total)

    model.eval()
    val_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for images, captions in val_loader:
            images, captions = images.to(device), captions.to(device)
            outputs = model(images, captions[:, :-1])
            loss = criterion(outputs[:, 1:].reshape(-1, vocab_size), captions[:, 1:].reshape(-1))

            # L1 regularization
            l1_norm = sum(p.abs().sum() for p in model.parameters())
            loss += l1_lambda * l1_norm

            val_loss += loss.item()

            _, predicted = outputs.max(2)
            total += captions[:, 1:].numel()
            correct += (predicted[:, 1:] == captions[:, 1:]).sum().item()

    val_losses.append(val_loss / len(val_loader))
    val_accuracies.append(100. * correct / total)

    # Step the scheduler
    scheduler.step()

    # Print learning rate
    print(f'Epoch [{epoch+1}/{num_epochs}], LR: {scheduler.get_last_lr()[0]:.6f}, '
          f'Loss: {train_loss:.4f}, Accuracy: {train_accuracies[-1]:.2f}%, '
          f'Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracies[-1]:.2f}%')

# Plotting the results with Plotly
fig = make_subplots(rows=1, cols=2, subplot_titles=('Loss', 'Accuracy'))

# Plot Loss
fig.add_trace(go.Scatter(x=list(range(num_epochs)), y=train_losses, mode='lines', name='Train Loss'), row=1, col=1)
fig.add_trace(go.Scatter(x=list(range(num_epochs)), y=val_losses, mode='lines', name='Val Loss'), row=1, col=1)

# Plot Accuracy
fig.add_trace(go.Scatter(x=list(range(num_epochs)), y=train_accuracies, mode='lines', name='Train Accuracy'), row=1, col=2)
fig.add_trace(go.Scatter(x=list(range(num_epochs)), y=val_accuracies, mode='lines', name='Val Accuracy'), row=1, col=2)

fig.update_layout(title='Training and Validation Metrics', xaxis_title='Epoch', yaxis_title='Value')
fig.show()


Epoch [1/20], LR: 0.009000, Loss: 14291.8877, Accuracy: 62.10%, Val Loss: 3315.0925, Val Accuracy: 63.92%
Epoch [2/20], LR: 0.008100, Loss: 12806.7478, Accuracy: 67.62%, Val Loss: 3149.1565, Val Accuracy: 67.56%
Epoch [3/20], LR: 0.007290, Loss: 12394.4344, Accuracy: 69.70%, Val Loss: 3099.7537, Val Accuracy: 67.67%
Epoch [4/20], LR: 0.006561, Loss: 12274.4917, Accuracy: 70.78%, Val Loss: 3080.5010, Val Accuracy: 68.08%
Epoch [5/20], LR: 0.005905, Loss: 12221.0388, Accuracy: 71.23%, Val Loss: 3074.8877, Val Accuracy: 68.23%
Epoch [6/20], LR: 0.005314, Loss: 12187.9378, Accuracy: 71.91%, Val Loss: 3064.4877, Val Accuracy: 68.46%
Epoch [7/20], LR: 0.004783, Loss: 12160.1006, Accuracy: 72.51%, Val Loss: 3061.6598, Val Accuracy: 68.22%
Epoch [8/20], LR: 0.004305, Loss: 12146.5278, Accuracy: 72.50%, Val Loss: 3057.0014, Val Accuracy: 69.88%
Epoch [9/20], LR: 0.003874, Loss: 12125.6606, Accuracy: 73.28%, Val Loss: 3054.4662, Val Accuracy: 69.97%
Epoch [10/20], LR: 0.003487, Loss: 12111.0882,

In [15]:
#L2 + changing the schedul

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Prepare data loaders
train_dataset = TensorDataset(images_tensor, captions_tensor)
val_dataset = TensorDataset(images_val, captions_val_tensor)

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=2, shuffle=False)

# Define the model
class ImageCaptioningModel(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers, dropout=0.3):
        super(ImageCaptioningModel, self).__init__()
        self.resnet = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)
        for param in self.resnet.parameters():
            param.requires_grad = False
        self.resnet.fc = nn.Linear(self.resnet.fc.in_features, embed_size)

        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True, dropout=dropout)

        self.fc = nn.Linear(hidden_size, vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.batch_norm1 = nn.BatchNorm1d(embed_size)
        self.batch_norm2 = nn.BatchNorm1d(hidden_size)

    def forward(self, images, captions):
        features = self.resnet(images)
        features = self.batch_norm1(features)

        embeddings = self.embedding(captions)
        embeddings = torch.cat((features.unsqueeze(1), embeddings), 1)
        embeddings = self.dropout(embeddings)

        hiddens, _ = self.lstm(embeddings)

        hiddens = hiddens.transpose(1, 2)
        hiddens = self.batch_norm2(hiddens)

        hiddens = hiddens.transpose(1, 2)
        hiddens = self.dropout(hiddens)

        outputs = self.fc(hiddens)
        return outputs

# Hyperparameters
embed_size = 256
hidden_size = 512
num_layers = 2
vocab_size = len(vocab)
num_epochs = 20
learning_rate = 0.1
dropout = 0.3
lr_decay =0.7
weight_decay = 0.0001  # L2 regularization factor

# Initialize model, loss function, and optimizer
model = ImageCaptioningModel(embed_size, hidden_size, vocab_size, num_layers, dropout).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=lr_decay)

# Training and validation
train_losses = []
val_losses = []
train_accuracies = []
val_accuracies = []

for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    correct = 0
    total = 0
    for images, captions in train_loader:
        images, captions = images.to(device), captions.to(device)
        optimizer.zero_grad()
        outputs = model(images, captions[:, :-1])
        loss = criterion(outputs[:, 1:].reshape(-1, vocab_size), captions[:, 1:].reshape(-1))
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

        _, predicted = outputs.max(2)
        total += captions[:, 1:].numel()
        correct += (predicted[:, 1:] == captions[:, 1:]).sum().item()

    train_losses.append(train_loss / len(train_loader))
    train_accuracies.append(100. * correct / total)

    model.eval()
    val_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for images, captions in val_loader:
            images, captions = images.to(device), captions.to(device)
            outputs = model(images, captions[:, :-1])
            loss = criterion(outputs[:, 1:].reshape(-1, vocab_size), captions[:, 1:].reshape(-1))
            val_loss += loss.item()

            _, predicted = outputs.max(2)
            total += captions[:, 1:].numel()
            correct += (predicted[:, 1:] == captions[:, 1:]).sum().item()

    val_losses.append(val_loss / len(val_loader))
    val_accuracies.append(100. * correct / total)

    # Step the scheduler
    scheduler.step()

    # Print learning rate
    print(f'Epoch [{epoch+1}/{num_epochs}], LR: {scheduler.get_last_lr()[0]:.6f}, '
          f'Loss: {train_loss:.4f}, Accuracy: {train_accuracies[-1]:.2f}%, '
          f'Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracies[-1]:.2f}%')

# Plotting the results with Plotly
fig = make_subplots(rows=1, cols=2, subplot_titles=('Loss', 'Accuracy'))

# Plot Loss
fig.add_trace(go.Scatter(x=list(range(num_epochs)), y=train_losses, mode='lines', name='Train Loss'), row=1, col=1)
fig.add_trace(go.Scatter(x=list(range(num_epochs)), y=val_losses, mode='lines', name='Val Loss'), row=1, col=1)

# Plot Accuracy
fig.add_trace(go.Scatter(x=list(range(num_epochs)), y=train_accuracies, mode='lines', name='Train Accuracy'), row=1, col=2)
fig.add_trace(go.Scatter(x=list(range(num_epochs)), y=val_accuracies, mode='lines', name='Val Accuracy'), row=1, col=2)

fig.update_layout(title='Training and Validation Metrics', xaxis_title='Epoch', yaxis_title='Value')
fig.show()


Epoch [1/20], LR: 0.070000, Loss: 680.6621, Accuracy: 43.47%, Val Loss: 1582.1782, Val Accuracy: 2.45%
Epoch [2/20], LR: 0.049000, Loss: 587.0707, Accuracy: 46.05%, Val Loss: 968.1508, Val Accuracy: 1.92%
Epoch [3/20], LR: 0.034300, Loss: 572.6540, Accuracy: 46.83%, Val Loss: 408.5000, Val Accuracy: 47.09%
Epoch [4/20], LR: 0.024010, Loss: 525.4732, Accuracy: 48.22%, Val Loss: 477.5589, Val Accuracy: 0.19%
Epoch [5/20], LR: 0.016807, Loss: 459.0459, Accuracy: 52.53%, Val Loss: 176.9468, Val Accuracy: 55.67%
Epoch [6/20], LR: 0.011765, Loss: 401.0101, Accuracy: 57.96%, Val Loss: 120.0285, Val Accuracy: 59.90%
Epoch [7/20], LR: 0.008235, Loss: 351.5846, Accuracy: 61.29%, Val Loss: 103.4555, Val Accuracy: 61.28%
Epoch [8/20], LR: 0.005765, Loss: 298.8965, Accuracy: 65.82%, Val Loss: 93.1298, Val Accuracy: 66.10%
Epoch [9/20], LR: 0.004035, Loss: 258.7226, Accuracy: 69.40%, Val Loss: 82.9832, Val Accuracy: 68.32%
Epoch [10/20], LR: 0.002825, Loss: 226.2539, Accuracy: 72.65%, Val Loss: 80.6

# Model5

* elastic net (L1+L2)
* Drop out = 0.5
* weight_decay_l2 = 0.00005, weight_decay_l1 = 0.00005  
* scheduler step_size=0.5


In [28]:
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Prepare data loaders
train_dataset = TensorDataset(images_tensor, captions_tensor)
val_dataset = TensorDataset(images_val, captions_val_tensor)

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=2, shuffle=False)

# Define the model
class ImageCaptioningModel(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers, dropout=0.5):
        super(ImageCaptioningModel, self).__init__()
        self.resnet = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)
        for param in self.resnet.parameters():
            param.requires_grad = False
        self.resnet.fc = nn.Linear(self.resnet.fc.in_features, embed_size)

        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True, dropout=dropout)

        self.fc = nn.Linear(hidden_size, vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.batch_norm1 = nn.BatchNorm1d(embed_size)
        self.batch_norm2 = nn.BatchNorm1d(hidden_size)

    def forward(self, images, captions):
        features = self.resnet(images)
        features = self.batch_norm1(features)

        embeddings = self.embedding(captions)
        embeddings = torch.cat((features.unsqueeze(1), embeddings), 1)
        embeddings = self.dropout(embeddings)

        hiddens, _ = self.lstm(embeddings)

        hiddens = hiddens.transpose(1, 2)
        hiddens = self.batch_norm2(hiddens)

        hiddens = hiddens.transpose(1, 2)
        hiddens = self.dropout(hiddens)

        outputs = self.fc(hiddens)
        return outputs

# Hyperparameters
embed_size = 256
hidden_size = 512
num_layers = 2
vocab_size = len(vocab)
num_epochs = 20
learning_rate = 0.1
dropout = 0.5
lr_decay = 0.7
weight_decay_l2 = 0.00005  # L2 regularization factor
weight_decay_l1 = 0.00005  # L1 regularization factor

# Initialize model, loss function, and optimizer
model = ImageCaptioningModel(embed_size, hidden_size, vocab_size, num_layers, dropout).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay_l2)

def l1_l2_regularization(model, l1_factor, l2_factor):
    l1_norm, l2_norm = 0.0, 0.0
    for name, param in model.named_parameters():
        if param.requires_grad:
            l1_norm += torch.norm(param, p=1)
            l2_norm += torch.norm(param, p=2)**2
    return l1_factor * l1_norm + l2_factor * l2_norm

scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=0.5, gamma=lr_decay)

# Training and validation
train_losses = []
val_losses = []
train_accuracies = []
val_accuracies = []

for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    correct = 0
    total = 0
    for images, captions in train_loader:
        images, captions = images.to(device), captions.to(device)
        optimizer.zero_grad()
        outputs = model(images, captions[:, :-1])
        loss = criterion(outputs[:, 1:].reshape(-1, vocab_size), captions[:, 1:].reshape(-1))
        
        # Apply L1 + L2 regularization
        l1_l2_loss = l1_l2_regularization(model, l1_factor=weight_decay_l1, l2_factor=weight_decay_l2)
        total_loss = loss + l1_l2_loss
        
        total_loss.backward()
        optimizer.step()
        train_loss += loss.item()

        _, predicted = outputs.max(2)
        total += captions[:, 1:].numel()
        correct += (predicted[:, 1:] == captions[:, 1:]).sum().item()

    train_losses.append(train_loss / len(train_loader))
    train_accuracies.append(100. * correct / total)

    model.eval()
    val_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for images, captions in val_loader:
            images, captions = images.to(device), captions.to(device)
            outputs = model(images, captions[:, :-1])
            loss = criterion(outputs[:, 1:].reshape(-1, vocab_size), captions[:, 1:].reshape(-1))
            val_loss += loss.item()

            _, predicted = outputs.max(2)
            total += captions[:, 1:].numel()
            correct += (predicted[:, 1:] == captions[:, 1:]).sum().item()

    val_losses.append(val_loss / len(val_loader))
    val_accuracies.append(100. * correct / total)

    # Step the scheduler
    scheduler.step()

    # Print learning rate
    print(f'Epoch [{epoch+1}/{num_epochs}], LR: {scheduler.get_last_lr()[0]:.6f}, '
          f'Loss: {train_loss:.4f}, Accuracy: {train_accuracies[-1]:.2f}%, '
          f'Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracies[-1]:.2f}%')

# Plotting the results with Plotly
fig = make_subplots(rows=1, cols=2, subplot_titles=('Loss', 'Accuracy'))

# Plot Loss
fig.add_trace(go.Scatter(x=list(range(num_epochs)), y=train_losses, mode='lines', name='Train Loss'), row=1, col=1)
fig.add_trace(go.Scatter(x=list(range(num_epochs)), y=val_losses, mode='lines', name='Val Loss'), row=1, col=1)

# Plot Accuracy
fig.add_trace(go.Scatter(x=list(range(num_epochs)), y=train_accuracies, mode='lines', name='Train Accuracy'), row=1, col=2)
fig.add_trace(go.Scatter(x=list(range(num_epochs)), y=val_accuracies, mode='lines', name='Val Accuracy'), row=1, col=2)

fig.update_layout(title='Training and Validation Metrics', xaxis_title='Epoch', yaxis_title='Value')
fig.show()


Epoch [1/20], LR: 0.070000, Loss: 701.3862, Accuracy: 44.19%, Val Loss: 305.2428, Val Accuracy: 46.28%
Epoch [2/20], LR: 0.049000, Loss: 602.0969, Accuracy: 46.01%, Val Loss: 207.7646, Val Accuracy: 51.82%
Epoch [3/20], LR: 0.034300, Loss: 555.7430, Accuracy: 47.78%, Val Loss: 152.9004, Val Accuracy: 53.61%
Epoch [4/20], LR: 0.024010, Loss: 478.5887, Accuracy: 52.95%, Val Loss: 119.3681, Val Accuracy: 56.38%
Epoch [5/20], LR: 0.016807, Loss: 380.4332, Accuracy: 60.49%, Val Loss: 102.3968, Val Accuracy: 61.86%
Epoch [6/20], LR: 0.011765, Loss: 326.3386, Accuracy: 64.25%, Val Loss: 88.5633, Val Accuracy: 64.85%
Epoch [7/20], LR: 0.008235, Loss: 295.8099, Accuracy: 66.46%, Val Loss: 84.3803, Val Accuracy: 66.71%
Epoch [8/20], LR: 0.005765, Loss: 275.1791, Accuracy: 68.23%, Val Loss: 80.7271, Val Accuracy: 67.92%
Epoch [9/20], LR: 0.004035, Loss: 260.4217, Accuracy: 69.82%, Val Loss: 79.7512, Val Accuracy: 68.34%
Epoch [10/20], LR: 0.002825, Loss: 247.8265, Accuracy: 70.96%, Val Loss: 80.7

In [40]:
# batch_size=4, learning_rate = 0.003, lr_decay = 0.9, weight_decay_l2 = 0.0001, weight_decay_l1 = 0.000005 

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Prepare data loaders
train_dataset = TensorDataset(images_tensor, captions_tensor)
val_dataset = TensorDataset(images_val, captions_val_tensor)

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False)

# Define the model
class ImageCaptioningModel(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers, dropout=0.5):
        super(ImageCaptioningModel, self).__init__()
        self.resnet = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)
        for param in self.resnet.parameters():
            param.requires_grad = False
        self.resnet.fc = nn.Linear(self.resnet.fc.in_features, embed_size)

        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True, dropout=dropout)

        self.fc = nn.Linear(hidden_size, vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.batch_norm1 = nn.BatchNorm1d(embed_size)
        self.batch_norm2 = nn.BatchNorm1d(hidden_size)

    def forward(self, images, captions):
        features = self.resnet(images)
        features = self.batch_norm1(features)

        embeddings = self.embedding(captions)
        embeddings = torch.cat((features.unsqueeze(1), embeddings), 1)
        embeddings = self.dropout(embeddings)

        hiddens, _ = self.lstm(embeddings)

        hiddens = hiddens.transpose(1, 2)
        hiddens = self.batch_norm2(hiddens)

        hiddens = hiddens.transpose(1, 2)
        hiddens = self.dropout(hiddens)

        outputs = self.fc(hiddens)
        return outputs

# Hyperparameters
embed_size = 256
hidden_size = 512
num_layers = 2
vocab_size = len(vocab)
num_epochs = 20
learning_rate = 0.003
dropout = 0.5
lr_decay = 0.9
weight_decay_l2 = 0.0001  # L2 regularization factor
weight_decay_l1 = 0.000005  # L1 regularization factor

# Initialize model, loss function, and optimizer
model = ImageCaptioningModel(embed_size, hidden_size, vocab_size, num_layers, dropout).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay_l2)

def l1_l2_regularization(model, l1_factor, l2_factor):
    l1_norm, l2_norm = 0.0, 0.0
    for name, param in model.named_parameters():
        if param.requires_grad:
            l1_norm += torch.norm(param, p=1)
            l2_norm += torch.norm(param, p=2)**2
    return l1_factor * l1_norm + l2_factor * l2_norm

scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=0.5, gamma=lr_decay)

# Training and validation
train_losses = []
val_losses = []
train_accuracies = []
val_accuracies = []

for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    correct = 0
    total = 0
    for images, captions in train_loader:
        images, captions = images.to(device), captions.to(device)
        optimizer.zero_grad()
        outputs = model(images, captions[:, :-1])
        loss = criterion(outputs[:, 1:].reshape(-1, vocab_size), captions[:, 1:].reshape(-1))
        
        # Apply L1 + L2 regularization
        l1_l2_loss = l1_l2_regularization(model, l1_factor=weight_decay_l1, l2_factor=weight_decay_l2)
        total_loss = loss + l1_l2_loss
        
        total_loss.backward()
        optimizer.step()
        train_loss += loss.item()

        _, predicted = outputs.max(2)
        total += captions[:, 1:].numel()
        correct += (predicted[:, 1:] == captions[:, 1:]).sum().item()

    train_losses.append(train_loss / len(train_loader))
    train_accuracies.append(100. * correct / total)

    model.eval()
    val_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for images, captions in val_loader:
            images, captions = images.to(device), captions.to(device)
            outputs = model(images, captions[:, :-1])
            loss = criterion(outputs[:, 1:].reshape(-1, vocab_size), captions[:, 1:].reshape(-1))
            val_loss += loss.item()

            _, predicted = outputs.max(2)
            total += captions[:, 1:].numel()
            correct += (predicted[:, 1:] == captions[:, 1:]).sum().item()

    val_losses.append(val_loss / len(val_loader))
    val_accuracies.append(100. * correct / total)

    # Step the scheduler
    scheduler.step()

    # Print learning rate
    print(f'Epoch [{epoch+1}/{num_epochs}], LR: {scheduler.get_last_lr()[0]:.6f}, '
          f'Loss: {train_loss:.4f}, Accuracy: {train_accuracies[-1]:.2f}%, '
          f'Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracies[-1]:.2f}%')

# Plotting the results with Plotly
fig = make_subplots(rows=1, cols=2, subplot_titles=('Loss', 'Accuracy'))

# Plot Loss
fig.add_trace(go.Scatter(x=list(range(num_epochs)), y=train_losses, mode='lines', name='Train Loss'), row=1, col=1)
fig.add_trace(go.Scatter(x=list(range(num_epochs)), y=val_losses, mode='lines', name='Val Loss'), row=1, col=1)

# Plot Accuracy
fig.add_trace(go.Scatter(x=list(range(num_epochs)), y=train_accuracies, mode='lines', name='Train Accuracy'), row=1, col=2)
fig.add_trace(go.Scatter(x=list(range(num_epochs)), y=val_accuracies, mode='lines', name='Val Accuracy'), row=1, col=2)

fig.update_layout(title='Training and Validation Metrics', xaxis_title='Epoch', yaxis_title='Value')
fig.show()


Epoch [1/20], LR: 0.002700, Loss: 186.5710, Accuracy: 63.54%, Val Loss: 43.3956, Val Accuracy: 68.43%
Epoch [2/20], LR: 0.002430, Loss: 126.4570, Accuracy: 71.55%, Val Loss: 43.2076, Val Accuracy: 68.22%
Epoch [3/20], LR: 0.002187, Loss: 110.8226, Accuracy: 73.66%, Val Loss: 40.3922, Val Accuracy: 68.91%
Epoch [4/20], LR: 0.001968, Loss: 100.3305, Accuracy: 75.29%, Val Loss: 40.7468, Val Accuracy: 69.76%
Epoch [5/20], LR: 0.001771, Loss: 93.0310, Accuracy: 76.41%, Val Loss: 40.6791, Val Accuracy: 69.84%
Epoch [6/20], LR: 0.001594, Loss: 88.3123, Accuracy: 77.28%, Val Loss: 39.3075, Val Accuracy: 70.57%
Epoch [7/20], LR: 0.001435, Loss: 83.3953, Accuracy: 77.88%, Val Loss: 40.7316, Val Accuracy: 71.03%
Epoch [8/20], LR: 0.001291, Loss: 78.2574, Accuracy: 78.88%, Val Loss: 39.4648, Val Accuracy: 71.30%
Epoch [9/20], LR: 0.001162, Loss: 74.5864, Accuracy: 79.84%, Val Loss: 39.1767, Val Accuracy: 71.78%
Epoch [10/20], LR: 0.001046, Loss: 70.7607, Accuracy: 80.70%, Val Loss: 39.8527, Val Ac

In [69]:
# learning_rate = 0.0008, weight_decay_l2 = 0.0007, 4dropout layers, 3 types of dropouts with diff values

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Prepare data loaders
train_dataset = TensorDataset(images_tensor, captions_tensor)
val_dataset = TensorDataset(images_val, captions_val_tensor)

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False)

# Define the model
class ImageCaptioningModel(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers, dropout=0.5):
        super(ImageCaptioningModel, self).__init__()
        self.resnet = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)
        for param in self.resnet.parameters():
            param.requires_grad = False
        self.resnet.fc = nn.Linear(self.resnet.fc.in_features, embed_size)

        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True, dropout=dropout)

        self.fc = nn.Linear(hidden_size, vocab_size)
        
        self.dropout = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout+0.3)
        self.dropout3 = nn.Dropout(dropout+0.4)
            
        self.batch_norm1 = nn.BatchNorm1d(embed_size)
        self.batch_norm2 = nn.BatchNorm1d(hidden_size)

    def forward(self, images, captions):
        features = self.resnet(images)
        features = self.batch_norm1(features)
        

        embeddings = self.embedding(captions)
        embeddings = torch.cat((features.unsqueeze(1), embeddings), 1)
        embeddings = self.dropout3(embeddings)

        hiddens, _ = self.lstm(embeddings)
        hiddens = self.dropout(hiddens)
        
        

        hiddens = hiddens.transpose(1, 2)
        hiddens = self.batch_norm2(hiddens)
        hiddens = self.dropout2(hiddens)
        

        hiddens = hiddens.transpose(1, 2)
        hiddens = self.dropout(hiddens)
        

        outputs = self.fc(hiddens)
        return outputs

# Hyperparameters
embed_size = 256
hidden_size = 512
num_layers = 2
vocab_size = len(vocab)
num_epochs = 20
learning_rate = 0.0008
dropout = 0.5
lr_decay = 0.9
weight_decay_l2 = 0.0001  # L2 regularization factor
weight_decay_l1 = 0.0000005  # L1 regularization factor

# Initialize model, loss function, and optimizer
model = ImageCaptioningModel(embed_size, hidden_size, vocab_size, num_layers, dropout).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay_l2)

def l1_l2_regularization(model, l1_factor, l2_factor):
    l1_norm, l2_norm = 0.0, 0.0
    for name, param in model.named_parameters():
        if param.requires_grad:
            l1_norm += torch.norm(param, p=1)
            l2_norm += torch.norm(param, p=2)**2
    return l1_factor * l1_norm + l2_factor * l2_norm

scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=0.5, gamma=lr_decay)

# Training and validation
train_losses = []
val_losses = []
train_accuracies = []
val_accuracies = []

for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    correct = 0
    total = 0
    for images, captions in train_loader:
        images, captions = images.to(device), captions.to(device)
        optimizer.zero_grad()
        outputs = model(images, captions[:, :-1])
        loss = criterion(outputs[:, 1:].reshape(-1, vocab_size), captions[:, 1:].reshape(-1))
        
        # Apply L1 + L2 regularization
        l1_l2_loss = l1_l2_regularization(model, l1_factor=weight_decay_l1, l2_factor=weight_decay_l2)
        total_loss = loss + l1_l2_loss
        
        total_loss.backward()
        optimizer.step()
        train_loss += loss.item()

        _, predicted = outputs.max(2)
        total += captions[:, 1:].numel()
        correct += (predicted[:, 1:] == captions[:, 1:]).sum().item()

    train_losses.append(train_loss / len(train_loader))
    train_accuracies.append(100. * correct / total)

    model.eval()
    val_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for images, captions in val_loader:
            images, captions = images.to(device), captions.to(device)
            outputs = model(images, captions[:, :-1])
            loss = criterion(outputs[:, 1:].reshape(-1, vocab_size), captions[:, 1:].reshape(-1))
            val_loss += loss.item()

            _, predicted = outputs.max(2)
            total += captions[:, 1:].numel()
            correct += (predicted[:, 1:] == captions[:, 1:]).sum().item()

    val_losses.append(val_loss / len(val_loader))
    val_accuracies.append(100. * correct / total)

    # Step the scheduler
    scheduler.step()

    # Print learning rate
    print(f'Epoch [{epoch+1}/{num_epochs}], LR: {scheduler.get_last_lr()[0]:.6f}, '
          f'Loss: {train_loss:.4f}, Accuracy: {train_accuracies[-1]:.2f}%, '
          f'Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracies[-1]:.2f}%')

# Plotting the results with Plotly
fig = make_subplots(rows=1, cols=2, subplot_titles=('Loss', 'Accuracy'))

# Plot Loss
fig.add_trace(go.Scatter(x=list(range(num_epochs)), y=train_losses, mode='lines', name='Train Loss'), row=1, col=1)
fig.add_trace(go.Scatter(x=list(range(num_epochs)), y=val_losses, mode='lines', name='Val Loss'), row=1, col=1)

# Plot Accuracy
fig.add_trace(go.Scatter(x=list(range(num_epochs)), y=train_accuracies, mode='lines', name='Train Accuracy'), row=1, col=2)
fig.add_trace(go.Scatter(x=list(range(num_epochs)), y=val_accuracies, mode='lines', name='Val Accuracy'), row=1, col=2)

fig.update_layout(title='Training and Validation Metrics', xaxis_title='Epoch', yaxis_title='Value')
fig.show()


Epoch [1/20], LR: 0.000720, Loss: 363.5161, Accuracy: 39.81%, Val Loss: 63.5026, Val Accuracy: 53.67%
Epoch [2/20], LR: 0.000648, Loss: 279.1617, Accuracy: 46.34%, Val Loss: 62.7416, Val Accuracy: 55.00%
Epoch [3/20], LR: 0.000583, Loss: 265.4119, Accuracy: 47.84%, Val Loss: 61.1825, Val Accuracy: 56.63%
Epoch [4/20], LR: 0.000525, Loss: 253.1877, Accuracy: 49.75%, Val Loss: 59.7241, Val Accuracy: 57.57%
Epoch [5/20], LR: 0.000472, Loss: 241.1579, Accuracy: 51.87%, Val Loss: 57.1187, Val Accuracy: 58.58%
Epoch [6/20], LR: 0.000425, Loss: 230.8891, Accuracy: 53.50%, Val Loss: 55.2351, Val Accuracy: 60.40%
Epoch [7/20], LR: 0.000383, Loss: 219.9688, Accuracy: 55.38%, Val Loss: 54.2495, Val Accuracy: 60.72%
Epoch [8/20], LR: 0.000344, Loss: 212.3555, Accuracy: 56.90%, Val Loss: 53.3544, Val Accuracy: 61.52%
Epoch [9/20], LR: 0.000310, Loss: 206.1531, Accuracy: 57.90%, Val Loss: 52.8242, Val Accuracy: 61.82%
Epoch [10/20], LR: 0.000279, Loss: 200.6111, Accuracy: 58.77%, Val Loss: 51.9071, 

# Model 6
*discovering plateau* 

* more epoch 

In [75]:
# epoch = 60, drop out = 0.3, change the dropout 2 and 3 values

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Prepare data loaders
train_dataset = TensorDataset(images_tensor, captions_tensor)
val_dataset = TensorDataset(images_val, captions_val_tensor)

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False)

# Define the model
class ImageCaptioningModel(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers, dropout=0.5):
        super(ImageCaptioningModel, self).__init__()
        self.resnet = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)
        for param in self.resnet.parameters():
            param.requires_grad = False
        self.resnet.fc = nn.Linear(self.resnet.fc.in_features, embed_size)

        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True, dropout=dropout)

        self.fc = nn.Linear(hidden_size, vocab_size)
        
        self.dropout = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout+0.4)
        self.dropout3 = nn.Dropout(dropout+0.5)
            
        self.batch_norm1 = nn.BatchNorm1d(embed_size)
        self.batch_norm2 = nn.BatchNorm1d(hidden_size)

    def forward(self, images, captions):
        features = self.resnet(images)
        features = self.batch_norm1(features)
        

        embeddings = self.embedding(captions)
        embeddings = torch.cat((features.unsqueeze(1), embeddings), 1)
        embeddings = self.dropout3(embeddings)

        hiddens, _ = self.lstm(embeddings)
        hiddens = self.dropout2(hiddens)
        
        

        hiddens = hiddens.transpose(1, 2)
        hiddens = self.batch_norm2(hiddens)
        hiddens = self.dropout(hiddens)
        

        hiddens = hiddens.transpose(1, 2)
        hiddens = self.dropout(hiddens)
        

        outputs = self.fc(hiddens)
        return outputs

# Hyperparameters
embed_size = 256
hidden_size = 512
num_layers = 2
vocab_size = len(vocab)
num_epochs = 60
learning_rate = 0.0008
dropout = 0.3
lr_decay = 0.9
weight_decay_l2 = 0.0001  # L2 regularization factor
weight_decay_l1 = 0.0000005  # L1 regularization factor

# Initialize model, loss function, and optimizer
model = ImageCaptioningModel(embed_size, hidden_size, vocab_size, num_layers, dropout).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay_l2)

def l1_l2_regularization(model, l1_factor, l2_factor):
    l1_norm, l2_norm = 0.0, 0.0
    for name, param in model.named_parameters():
        if param.requires_grad:
            l1_norm += torch.norm(param, p=1)
            l2_norm += torch.norm(param, p=2)**2
    return l1_factor * l1_norm + l2_factor * l2_norm

scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=0.5, gamma=lr_decay)

# Training and validation
train_losses = []
val_losses = []
train_accuracies = []
val_accuracies = []

for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    correct = 0
    total = 0
    for images, captions in train_loader:
        images, captions = images.to(device), captions.to(device)
        optimizer.zero_grad()
        outputs = model(images, captions[:, :-1])
        loss = criterion(outputs[:, 1:].reshape(-1, vocab_size), captions[:, 1:].reshape(-1))
        
        # Apply L1 + L2 regularization
        l1_l2_loss = l1_l2_regularization(model, l1_factor=weight_decay_l1, l2_factor=weight_decay_l2)
        total_loss = loss + l1_l2_loss
        
        total_loss.backward()
        optimizer.step()
        train_loss += loss.item()

        _, predicted = outputs.max(2)
        total += captions[:, 1:].numel()
        correct += (predicted[:, 1:] == captions[:, 1:]).sum().item()

    train_losses.append(train_loss / len(train_loader))
    train_accuracies.append(100. * correct / total)

    model.eval()
    val_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for images, captions in val_loader:
            images, captions = images.to(device), captions.to(device)
            outputs = model(images, captions[:, :-1])
            loss = criterion(outputs[:, 1:].reshape(-1, vocab_size), captions[:, 1:].reshape(-1))
            val_loss += loss.item()

            _, predicted = outputs.max(2)
            total += captions[:, 1:].numel()
            correct += (predicted[:, 1:] == captions[:, 1:]).sum().item()

    val_losses.append(val_loss / len(val_loader))
    val_accuracies.append(100. * correct / total)

    # Step the scheduler
    scheduler.step()

    # Print learning rate
    print(f'Epoch [{epoch+1}/{num_epochs}], LR: {scheduler.get_last_lr()[0]:.6f}, '
          f'Loss: {train_loss:.4f}, Accuracy: {train_accuracies[-1]:.2f}%, '
          f'Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracies[-1]:.2f}%')

# Plotting the results with Plotly
fig = make_subplots(rows=1, cols=2, subplot_titles=('Loss', 'Accuracy'))

# Plot Loss
fig.add_trace(go.Scatter(x=list(range(num_epochs)), y=train_losses, mode='lines', name='Train Loss'), row=1, col=1)
fig.add_trace(go.Scatter(x=list(range(num_epochs)), y=val_losses, mode='lines', name='Val Loss'), row=1, col=1)

# Plot Accuracy
fig.add_trace(go.Scatter(x=list(range(num_epochs)), y=train_accuracies, mode='lines', name='Train Accuracy'), row=1, col=2)
fig.add_trace(go.Scatter(x=list(range(num_epochs)), y=val_accuracies, mode='lines', name='Val Accuracy'), row=1, col=2)

fig.update_layout(title='Training and Validation Metrics', xaxis_title='Epoch', yaxis_title='Value')
fig.show()


Epoch [1/60], LR: 0.000720, Loss: 292.4516, Accuracy: 47.91%, Val Loss: 57.6766, Val Accuracy: 57.61%
Epoch [2/60], LR: 0.000648, Loss: 206.5131, Accuracy: 59.49%, Val Loss: 50.8475, Val Accuracy: 62.54%
Epoch [3/60], LR: 0.000583, Loss: 176.1344, Accuracy: 63.69%, Val Loss: 47.9327, Val Accuracy: 64.16%
Epoch [4/60], LR: 0.000525, Loss: 159.2241, Accuracy: 66.14%, Val Loss: 45.6854, Val Accuracy: 66.41%
Epoch [5/60], LR: 0.000472, Loss: 148.6417, Accuracy: 67.61%, Val Loss: 44.6460, Val Accuracy: 67.06%
Epoch [6/60], LR: 0.000425, Loss: 139.9571, Accuracy: 69.25%, Val Loss: 43.5076, Val Accuracy: 67.38%
Epoch [7/60], LR: 0.000383, Loss: 133.1254, Accuracy: 69.98%, Val Loss: 43.5662, Val Accuracy: 67.68%
Epoch [8/60], LR: 0.000344, Loss: 128.6994, Accuracy: 70.52%, Val Loss: 42.8783, Val Accuracy: 67.61%
Epoch [9/60], LR: 0.000310, Loss: 125.3535, Accuracy: 71.25%, Val Loss: 41.9105, Val Accuracy: 68.90%
Epoch [10/60], LR: 0.000279, Loss: 122.1148, Accuracy: 71.77%, Val Loss: 42.4664, 

In [73]:
# epoch = 80, dropout = 0.5

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Prepare data loaders
train_dataset = TensorDataset(images_tensor, captions_tensor)
val_dataset = TensorDataset(images_val, captions_val_tensor)

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False)

# Define the model
class ImageCaptioningModel(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers, dropout=0.5):
        super(ImageCaptioningModel, self).__init__()
        self.resnet = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)
        for param in self.resnet.parameters():
            param.requires_grad = False
        self.resnet.fc = nn.Linear(self.resnet.fc.in_features, embed_size)

        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True, dropout=dropout)

        self.fc = nn.Linear(hidden_size, vocab_size)
        
        self.dropout = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout+0.3)
        self.dropout3 = nn.Dropout(dropout+0.4)
            
        self.batch_norm1 = nn.BatchNorm1d(embed_size)
        self.batch_norm2 = nn.BatchNorm1d(hidden_size)

    def forward(self, images, captions):
        features = self.resnet(images)
        features = self.batch_norm1(features)
        

        embeddings = self.embedding(captions)
        embeddings = torch.cat((features.unsqueeze(1), embeddings), 1)
        embeddings = self.dropout3(embeddings)

        hiddens, _ = self.lstm(embeddings)
        hiddens = self.dropout(hiddens)
        
        

        hiddens = hiddens.transpose(1, 2)
        hiddens = self.batch_norm2(hiddens)
        hiddens = self.dropout2(hiddens)
        

        hiddens = hiddens.transpose(1, 2)
        hiddens = self.dropout(hiddens)
        

        outputs = self.fc(hiddens)
        return outputs

# Hyperparameters
embed_size = 256
hidden_size = 512
num_layers = 2
vocab_size = len(vocab)
num_epochs = 80
learning_rate = 0.0008
dropout = 0.5
lr_decay = 0.9
weight_decay_l2 = 0.0001  # L2 regularization factor
weight_decay_l1 = 0.0000005  # L1 regularization factor

# Initialize model, loss function, and optimizer
model = ImageCaptioningModel(embed_size, hidden_size, vocab_size, num_layers, dropout).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay_l2)

def l1_l2_regularization(model, l1_factor, l2_factor):
    l1_norm, l2_norm = 0.0, 0.0
    for name, param in model.named_parameters():
        if param.requires_grad:
            l1_norm += torch.norm(param, p=1)
            l2_norm += torch.norm(param, p=2)**2
    return l1_factor * l1_norm + l2_factor * l2_norm

scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=0.5, gamma=lr_decay)

# Training and validation
train_losses = []
val_losses = []
train_accuracies = []
val_accuracies = []

for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    correct = 0
    total = 0
    for images, captions in train_loader:
        images, captions = images.to(device), captions.to(device)
        optimizer.zero_grad()
        outputs = model(images, captions[:, :-1])
        loss = criterion(outputs[:, 1:].reshape(-1, vocab_size), captions[:, 1:].reshape(-1))
        
        # Apply L1 + L2 regularization
        l1_l2_loss = l1_l2_regularization(model, l1_factor=weight_decay_l1, l2_factor=weight_decay_l2)
        total_loss = loss + l1_l2_loss
        
        total_loss.backward()
        optimizer.step()
        train_loss += loss.item()

        _, predicted = outputs.max(2)
        total += captions[:, 1:].numel()
        correct += (predicted[:, 1:] == captions[:, 1:]).sum().item()

    train_losses.append(train_loss / len(train_loader))
    train_accuracies.append(100. * correct / total)

    model.eval()
    val_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for images, captions in val_loader:
            images, captions = images.to(device), captions.to(device)
            outputs = model(images, captions[:, :-1])
            loss = criterion(outputs[:, 1:].reshape(-1, vocab_size), captions[:, 1:].reshape(-1))
            val_loss += loss.item()

            _, predicted = outputs.max(2)
            total += captions[:, 1:].numel()
            correct += (predicted[:, 1:] == captions[:, 1:]).sum().item()

    val_losses.append(val_loss / len(val_loader))
    val_accuracies.append(100. * correct / total)

    # Step the scheduler
    scheduler.step()

    # Print learning rate
    print(f'Epoch [{epoch+1}/{num_epochs}], LR: {scheduler.get_last_lr()[0]:.6f}, '
          f'Loss: {train_loss:.4f}, Accuracy: {train_accuracies[-1]:.2f}%, '
          f'Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracies[-1]:.2f}%')

# Plotting the results with Plotly
fig = make_subplots(rows=1, cols=2, subplot_titles=('Loss', 'Accuracy'))

# Plot Loss
fig.add_trace(go.Scatter(x=list(range(num_epochs)), y=train_losses, mode='lines', name='Train Loss'), row=1, col=1)
fig.add_trace(go.Scatter(x=list(range(num_epochs)), y=val_losses, mode='lines', name='Val Loss'), row=1, col=1)

# Plot Accuracy
fig.add_trace(go.Scatter(x=list(range(num_epochs)), y=train_accuracies, mode='lines', name='Train Accuracy'), row=1, col=2)
fig.add_trace(go.Scatter(x=list(range(num_epochs)), y=val_accuracies, mode='lines', name='Val Accuracy'), row=1, col=2)

fig.update_layout(title='Training and Validation Metrics', xaxis_title='Epoch', yaxis_title='Value')
fig.show()


Epoch [1/80], LR: 0.000720, Loss: 359.7484, Accuracy: 39.88%, Val Loss: 64.0126, Val Accuracy: 53.99%
Epoch [2/80], LR: 0.000648, Loss: 278.1447, Accuracy: 46.38%, Val Loss: 63.3516, Val Accuracy: 54.35%
Epoch [3/80], LR: 0.000583, Loss: 264.4993, Accuracy: 48.04%, Val Loss: 61.4151, Val Accuracy: 55.73%
Epoch [4/80], LR: 0.000525, Loss: 251.9715, Accuracy: 50.13%, Val Loss: 59.0032, Val Accuracy: 58.01%
Epoch [5/80], LR: 0.000472, Loss: 239.4423, Accuracy: 52.23%, Val Loss: 56.5635, Val Accuracy: 58.51%
Epoch [6/80], LR: 0.000425, Loss: 228.9902, Accuracy: 53.94%, Val Loss: 55.2757, Val Accuracy: 60.39%
Epoch [7/80], LR: 0.000383, Loss: 218.6120, Accuracy: 55.56%, Val Loss: 54.2423, Val Accuracy: 61.22%
Epoch [8/80], LR: 0.000344, Loss: 211.2248, Accuracy: 57.08%, Val Loss: 53.2726, Val Accuracy: 61.46%
Epoch [9/80], LR: 0.000310, Loss: 204.7912, Accuracy: 58.16%, Val Loss: 51.9075, Val Accuracy: 61.57%
Epoch [10/80], LR: 0.000279, Loss: 200.3589, Accuracy: 58.72%, Val Loss: 51.8421, 

# Model 7
*trying non-linear approach & increase complexity*
* relu
* more LSTM layers


In [80]:
# relu, diff distrubution of dropouts

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Prepare data loaders
train_dataset = TensorDataset(images_tensor, captions_tensor)
val_dataset = TensorDataset(images_val, captions_val_tensor)

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False)

# Define the model
class ImageCaptioningModel(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers, dropout=0.5):
        super(ImageCaptioningModel, self).__init__()
        self.resnet = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)
        for param in self.resnet.parameters():
            param.requires_grad = False
        self.resnet.fc = nn.Linear(self.resnet.fc.in_features, embed_size)

        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True, dropout=dropout)

        self.fc = nn.Linear(hidden_size, vocab_size)
        
        self.dropout = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout + 0.3)
        self.dropout3 = nn.Dropout(dropout + 0.4)
            
        self.batch_norm1 = nn.BatchNorm1d(embed_size)
        self.batch_norm2 = nn.BatchNorm1d(hidden_size)

    def forward(self, images, captions):
        features = self.resnet(images)
        features = self.batch_norm1(features)
         
        
        embeddings = self.embedding(captions)
        embeddings = torch.cat((features.unsqueeze(1), embeddings), 1)
        embeddings = self.dropout3(embeddings)

        hiddens, _ = self.lstm(embeddings)
        hiddens = self.dropout(hiddens)
        
        hiddens = hiddens.transpose(1, 2)
        hiddens = self.batch_norm2(hiddens)
        hiddens = torch.relu(hiddens)  
        hiddens = self.dropout(hiddens)
        
        hiddens = hiddens.transpose(1, 2)
        hiddens = self.dropout2(hiddens)
        
        outputs = self.fc(hiddens)
        return outputs
# Hyperparameters
embed_size = 256
hidden_size = 512
num_layers = 2
vocab_size = len(vocab)
num_epochs = 60
learning_rate = 0.0008
dropout = 0.5
lr_decay = 0.9
weight_decay_l2 = 0.0001  # L2 regularization factor
weight_decay_l1 = 0.0000005  # L1 regularization factor

# Initialize model, loss function, and optimizer
model = ImageCaptioningModel(embed_size, hidden_size, vocab_size, num_layers, dropout).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay_l2)

def l1_l2_regularization(model, l1_factor, l2_factor):
    l1_norm, l2_norm = 0.0, 0.0
    for name, param in model.named_parameters():
        if param.requires_grad:
            l1_norm += torch.norm(param, p=1)
            l2_norm += torch.norm(param, p=2)**2
    return l1_factor * l1_norm + l2_factor * l2_norm

scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=0.5, gamma=lr_decay)

# Training and validation
train_losses = []
val_losses = []
train_accuracies = []
val_accuracies = []

for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    correct = 0
    total = 0
    for images, captions in train_loader:
        images, captions = images.to(device), captions.to(device)
        optimizer.zero_grad()
        outputs = model(images, captions[:, :-1])
        loss = criterion(outputs[:, 1:].reshape(-1, vocab_size), captions[:, 1:].reshape(-1))
        
        # Apply L1 + L2 regularization
        l1_l2_loss = l1_l2_regularization(model, l1_factor=weight_decay_l1, l2_factor=weight_decay_l2)
        total_loss = loss + l1_l2_loss
        
        total_loss.backward()
        optimizer.step()
        train_loss += loss.item()

        _, predicted = outputs.max(2)
        total += captions[:, 1:].numel()
        correct += (predicted[:, 1:] == captions[:, 1:]).sum().item()

    train_losses.append(train_loss / len(train_loader))
    train_accuracies.append(100. * correct / total)

    model.eval()
    val_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for images, captions in val_loader:
            images, captions = images.to(device), captions.to(device)
            outputs = model(images, captions[:, :-1])
            loss = criterion(outputs[:, 1:].reshape(-1, vocab_size), captions[:, 1:].reshape(-1))
            val_loss += loss.item()

            _, predicted = outputs.max(2)
            total += captions[:, 1:].numel()
            correct += (predicted[:, 1:] == captions[:, 1:]).sum().item()

    val_losses.append(val_loss / len(val_loader))
    val_accuracies.append(100. * correct / total)

    # Step the scheduler
    scheduler.step()

    # Print learning rate
    print(f'Epoch [{epoch+1}/{num_epochs}], LR: {scheduler.get_last_lr()[0]:.6f}, '
          f'Loss: {train_loss:.4f}, Accuracy: {train_accuracies[-1]:.2f}%, '
          f'Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracies[-1]:.2f}%')

# Plotting the results with Plotly
fig = make_subplots(rows=1, cols=2, subplot_titles=('Loss', 'Accuracy'))

# Plot Loss
fig.add_trace(go.Scatter(x=list(range(num_epochs)), y=train_losses, mode='lines', name='Train Loss'), row=1, col=1)
fig.add_trace(go.Scatter(x=list(range(num_epochs)), y=val_losses, mode='lines', name='Val Loss'), row=1, col=1)

# Plot Accuracy
fig.add_trace(go.Scatter(x=list(range(num_epochs)), y=train_accuracies, mode='lines', name='Train Accuracy'), row=1, col=2)
fig.add_trace(go.Scatter(x=list(range(num_epochs)), y=val_accuracies, mode='lines', name='Val Accuracy'), row=1, col=2)

fig.update_layout(title='Training and Validation Metrics', xaxis_title='Epoch', yaxis_title='Value')
fig.show()


Epoch [1/60], LR: 0.000720, Loss: 332.4240, Accuracy: 40.87%, Val Loss: 64.7662, Val Accuracy: 52.35%
Epoch [2/60], LR: 0.000648, Loss: 271.8966, Accuracy: 46.25%, Val Loss: 63.6635, Val Accuracy: 53.67%
Epoch [3/60], LR: 0.000583, Loss: 265.0549, Accuracy: 46.91%, Val Loss: 62.6093, Val Accuracy: 57.05%
Epoch [4/60], LR: 0.000525, Loss: 259.3214, Accuracy: 47.73%, Val Loss: 61.3298, Val Accuracy: 56.83%
Epoch [5/60], LR: 0.000472, Loss: 253.7531, Accuracy: 49.00%, Val Loss: 60.0111, Val Accuracy: 57.09%
Epoch [6/60], LR: 0.000425, Loss: 247.2718, Accuracy: 50.33%, Val Loss: 59.0863, Val Accuracy: 57.95%
Epoch [7/60], LR: 0.000383, Loss: 240.4151, Accuracy: 51.51%, Val Loss: 58.0013, Val Accuracy: 58.68%
Epoch [8/60], LR: 0.000344, Loss: 234.5806, Accuracy: 52.25%, Val Loss: 56.9864, Val Accuracy: 58.69%
Epoch [9/60], LR: 0.000310, Loss: 229.8696, Accuracy: 53.31%, Val Loss: 56.6793, Val Accuracy: 59.18%
Epoch [10/60], LR: 0.000279, Loss: 225.5148, Accuracy: 53.89%, Val Loss: 56.9929, 

In [86]:
#diff distrubution of dropouts, num_layers = 4,

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Prepare data loaders
train_dataset = TensorDataset(images_tensor, captions_tensor)
val_dataset = TensorDataset(images_val, captions_val_tensor)

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False)

# Define the model
class ImageCaptioningModel(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers, dropout=0.5):
        super(ImageCaptioningModel, self).__init__()
        self.resnet = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)
        for param in self.resnet.parameters():
            param.requires_grad = False
        self.resnet.fc = nn.Linear(self.resnet.fc.in_features, embed_size)

        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True, dropout=dropout)

        self.fc = nn.Linear(hidden_size, vocab_size)
        
        self.dropout = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout + 0.3)
        self.dropout3 = nn.Dropout(dropout + 0.4)
            
        self.batch_norm1 = nn.BatchNorm1d(embed_size)
        self.batch_norm2 = nn.BatchNorm1d(hidden_size)

    def forward(self, images, captions):
        features = self.resnet(images)
        features = self.batch_norm1(features)
        features = torch.relu(features)
        features = self.dropout(features)
         
        
        embeddings = self.embedding(captions)
        embeddings = torch.cat((features.unsqueeze(1), embeddings), 1)
        embeddings = self.dropout2(embeddings)

        hiddens, _ = self.lstm(embeddings)
        
        
        hiddens = hiddens.transpose(1, 2)
        hiddens = self.batch_norm2(hiddens)

         
        hiddens = self.dropout3(hiddens)
        
        hiddens = hiddens.transpose(1, 2)
        hiddens = self.dropout(hiddens)
        
        outputs = self.fc(hiddens)
        return outputs
# Hyperparameters
embed_size = 256
hidden_size = 512
num_layers = 4
vocab_size = len(vocab)
num_epochs = 60
learning_rate = 0.0008
dropout = 0.5
lr_decay = 0.9
weight_decay_l2 = 0.0001  # L2 regularization factor
weight_decay_l1 = 0.0000005  # L1 regularization factor

# Initialize model, loss function, and optimizer
model = ImageCaptioningModel(embed_size, hidden_size, vocab_size, num_layers, dropout).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay_l2)

def l1_l2_regularization(model, l1_factor, l2_factor):
    l1_norm, l2_norm = 0.0, 0.0
    for name, param in model.named_parameters():
        if param.requires_grad:
            l1_norm += torch.norm(param, p=1)
            l2_norm += torch.norm(param, p=2)**2
    return l1_factor * l1_norm + l2_factor * l2_norm

scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=0.5, gamma=lr_decay)

# Training and validation
train_losses = []
val_losses = []
train_accuracies = []
val_accuracies = []

for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    correct = 0
    total = 0
    for images, captions in train_loader:
        images, captions = images.to(device), captions.to(device)
        optimizer.zero_grad()
        outputs = model(images, captions[:, :-1])
        loss = criterion(outputs[:, 1:].reshape(-1, vocab_size), captions[:, 1:].reshape(-1))
        
        # Apply L1 + L2 regularization
        l1_l2_loss = l1_l2_regularization(model, l1_factor=weight_decay_l1, l2_factor=weight_decay_l2)
        total_loss = loss + l1_l2_loss
        
        total_loss.backward()
        optimizer.step()
        train_loss += loss.item()

        _, predicted = outputs.max(2)
        total += captions[:, 1:].numel()
        correct += (predicted[:, 1:] == captions[:, 1:]).sum().item()

    train_losses.append(train_loss / len(train_loader))
    train_accuracies.append(100. * correct / total)

    model.eval()
    val_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for images, captions in val_loader:
            images, captions = images.to(device), captions.to(device)
            outputs = model(images, captions[:, :-1])
            loss = criterion(outputs[:, 1:].reshape(-1, vocab_size), captions[:, 1:].reshape(-1))
            val_loss += loss.item()

            _, predicted = outputs.max(2)
            total += captions[:, 1:].numel()
            correct += (predicted[:, 1:] == captions[:, 1:]).sum().item()

    val_losses.append(val_loss / len(val_loader))
    val_accuracies.append(100. * correct / total)

    # Step the scheduler
    scheduler.step()

    # Print learning rate
    print(f'Epoch [{epoch+1}/{num_epochs}], LR: {scheduler.get_last_lr()[0]:.6f}, '
          f'Loss: {train_loss:.4f}, Accuracy: {train_accuracies[-1]:.2f}%, '
          f'Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracies[-1]:.2f}%')

# Plotting the results with Plotly
fig = make_subplots(rows=1, cols=2, subplot_titles=('Loss', 'Accuracy'))

# Plot Loss
fig.add_trace(go.Scatter(x=list(range(num_epochs)), y=train_losses, mode='lines', name='Train Loss'), row=1, col=1)
fig.add_trace(go.Scatter(x=list(range(num_epochs)), y=val_losses, mode='lines', name='Val Loss'), row=1, col=1)

# Plot Accuracy
fig.add_trace(go.Scatter(x=list(range(num_epochs)), y=train_accuracies, mode='lines', name='Train Accuracy'), row=1, col=2)
fig.add_trace(go.Scatter(x=list(range(num_epochs)), y=val_accuracies, mode='lines', name='Val Accuracy'), row=1, col=2)

fig.update_layout(title='Training and Validation Metrics', xaxis_title='Epoch', yaxis_title='Value')
fig.show()


Epoch [1/60], LR: 0.000720, Loss: 375.2309, Accuracy: 42.18%, Val Loss: 63.7852, Val Accuracy: 54.71%
Epoch [2/60], LR: 0.000648, Loss: 288.6518, Accuracy: 46.74%, Val Loss: 62.0440, Val Accuracy: 55.65%
Epoch [3/60], LR: 0.000583, Loss: 262.6807, Accuracy: 48.71%, Val Loss: 60.4293, Val Accuracy: 57.78%
Epoch [4/60], LR: 0.000525, Loss: 245.3833, Accuracy: 50.51%, Val Loss: 57.4868, Val Accuracy: 57.98%
Epoch [5/60], LR: 0.000472, Loss: 232.2752, Accuracy: 52.48%, Val Loss: 56.3008, Val Accuracy: 59.57%
Epoch [6/60], LR: 0.000425, Loss: 222.4141, Accuracy: 54.37%, Val Loss: 54.8004, Val Accuracy: 60.61%
Epoch [7/60], LR: 0.000383, Loss: 211.8493, Accuracy: 56.30%, Val Loss: 53.5394, Val Accuracy: 61.12%
Epoch [8/60], LR: 0.000344, Loss: 203.1387, Accuracy: 57.75%, Val Loss: 52.9569, Val Accuracy: 61.95%
Epoch [9/60], LR: 0.000310, Loss: 197.7176, Accuracy: 58.93%, Val Loss: 51.4968, Val Accuracy: 62.35%
Epoch [10/60], LR: 0.000279, Loss: 191.3571, Accuracy: 59.94%, Val Loss: 51.0481, 