In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms

from model import EncoderCNN2DecoderRNN
from loader import get_loader

In [2]:
# Locations of the image folder and of the captions file.
data_dir = 'data/Images/'
captions_file = 'data/captions.txt'

# Preprocessing applied to every image: resize to 356x356, take a random
# 299x299 crop, convert to a tensor, then normalise each of the three
# channels with mean 0.5 and std 0.5.
transform = transforms.Compose([
    transforms.Resize(size=(356, 356)),
    transforms.RandomCrop(size=(299, 299)),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
])

# get_loader returns the loader together with the dataset it wraps; the
# dataset is kept because later cells read its vocabulary.
train_loader, train_dataset = get_loader(
    data_dir=data_dir,
    captions_file=captions_file,
    transform=transform,
    train_set=True,
)

In [3]:
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Let cuDNN benchmark its algorithms and pick one for the observed input sizes.
torch.backends.cudnn.benchmark = True

In [4]:
# Hyperparameters
# embed_size, hidden_size, vocab_size and num_layers are passed positionally
# to EncoderCNN2DecoderRNN when the model is built.
embed_size = 256
hidden_size = 256
# The vocabulary size is the number of known tokens, not the number of items
# in the dataset (which is what len(train_dataset) would give). The token ->
# index mapping is the same `stoi` table used to look up '<PAD>' for the loss.
# NOTE(review): assumes every token in the vocabulary has an entry in `stoi` — confirm.
vocab_size = len(train_dataset.vocab.stoi)
num_layers = 1
learning_rate = 1e-3
num_epochs = 5

In [5]:
# Build the captioning model and move it to the selected device.
model = EncoderCNN2DecoderRNN(embed_size, hidden_size, vocab_size, num_layers)
model = model.to(device)

# Cross-entropy loss; targets equal to the '<PAD>' index do not contribute.
criterion = nn.CrossEntropyLoss(ignore_index=train_dataset.vocab.stoi['<PAD>'])

# Adam over all model parameters.
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

In [6]:
def train_log(loss, example_ct, epoch):
    """Report one training data point to wandb and echo it to stdout.

    Args:
        loss: Loss value to report; printed with three decimals.
        example_ct: Number of examples seen so far; used as the wandb step.
        epoch: Epoch value logged alongside the loss.
    """
    metrics = {"epoch": epoch, "loss": loss}
    wandb.log(metrics, step=example_ct)

    # Zero-pad the counter to five characters so console lines align.
    padded_count = str(example_ct).zfill(5)
    print(f"Loss after {padded_count} examples: {loss:.3f}")

In [7]:
import wandb
import random

# Open a new wandb run for this notebook. `project` selects where the run is
# logged; `config` stores the hyperparameters and run metadata with the run.
wandb.init(
    project="Deep Learning Image Captioning Model",
    config=dict(
        learning_rate=learning_rate,
        architecture="CNNtoRNN",
        dataset="Flickr8k",
        epochs=num_epochs,
    ),
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
wandb: Currently logged in as: polmedinaarevalo (imgcaption_dl_project). Use `wandb login --relogin` to force relogin


In [8]:
from tqdm.auto import tqdm

# Run training and track with wandb.
# log="all" asks wandb to record gradients and parameters; log_freq=10 sets
# how often (in batches) it does so.
wandb.watch(model, criterion, log="all", log_freq=10)

total_batches = len(train_loader) * num_epochs
log_interval = 25  # report metrics every N-th batch
example_ct = 0  # number of examples seen
batch_ct = 0  # number of batches processed

model.train()

# NOTE(review): the output recorded under this cell ends with
# "relu(): argument 'input' ... must be Tensor, not InceptionOutputs".
# That error is raised inside the model, not in this loop; it presumably
# needs the encoder to use the `.logits` field of the Inception output in
# training mode — confirm in model.py.
try:
    for epoch in tqdm(range(num_epochs)):
        for images, captions in train_loader:
            images, captions = images.to(device), captions.to(device)

            # Forward pass.
            # NOTE(review): the decoder input trims `captions` along dim 0
            # (`captions[:-1]`) while the target trims along dim 1
            # (`captions[:, 1:]`). Only one of these can be the token axis:
            # if captions are (batch, seq) the input should probably be
            # `captions[:, :-1]`; if they are (seq, batch) the target should
            # probably be `captions[1:]`. Left unchanged because the caption
            # layout is not visible from this notebook — confirm against the
            # loader's collate function and the decoder.
            outputs = model(images, captions[:-1])
            # Flatten all leading dimensions so each row holds the scores for
            # one target position (outputs.shape[2] is kept as the class axis).
            loss = criterion(outputs.reshape(-1, outputs.shape[2]), captions[:, 1:].reshape(-1))

            # Backward pass.
            optimizer.zero_grad()
            loss.backward()

            # Step with optimizer.
            optimizer.step()

            example_ct += len(images)
            batch_ct += 1

            # batch_ct has already been incremented, so it equals the number
            # of batches processed; report on batches 25, 50, 75, ...
            if batch_ct % log_interval == 0:
                # Pass a plain float so the logger does not receive the tensor.
                train_log(loss.item(), example_ct, epoch)
except BaseException:
    # Close the run (marked as failed) even when training is interrupted or
    # raises, so it does not stay open in the kernel.
    wandb.finish(exit_code=1)
    raise
else:
    wandb.finish()

  from .autonotebook import tqdm as notebook_tqdm
  0%|          | 0/5 [00:13<?, ?it/s]


TypeError: relu(): argument 'input' (position 1) must be Tensor, not InceptionOutputs