In [1]:
import os
import random
import wandb

import numpy as np
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as T

from train import *
from test import *
from utils.utils import *
from models.models import *
import multiprocessing



# Global variables
global device

import os

# Setting CUDA ALLOC split size to 256 to avoid running out of memory
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
# Stopping wandb from creating symlinks
os.environ["WANDB_DISABLE_SYMLINKS"] = "true"

# Ensure deterministic behavior
torch.backends.cudnn.deterministic = True
random.seed(hash("setting random seeds") % 2**32 - 1)
np.random.seed(hash("improves reproducibility") % 2**32 - 1)
torch.manual_seed(hash("by removing stochasticity") % 2**32 - 1)
torch.cuda.manual_seed_all(hash("so runs are repeatable") % 2**32 - 1)

# Device configuration
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


def model_pipeline(cfg: dict):
    # tell wandb to get started
    with wandb.init(project="pytorch-demo", config=cfg):
        # access all HPs through wandb.config, so logging matches execution!
        config = wandb.config

        # Execute only once to create the dataset
        # generate_and_dump_dataset(config.root_dir, config.captions_file, config.transforms, cfg.DATA_LOCATION)

        # Generate Dataset
        dataset = make_dataset(config)

        # Get the data loaders
        train_loader, test_loader = make_dataloaders(config, dataset, 1)

        # Generate vocab
        vocab = dataset.vocab
        config.vocab_size = len(vocab)

        # Get the model
        my_model = make_model(config, device)

        # Define the loss and optimizer
        criterion = get_criterion(config.criterion, vocab.stoi["<PAD>"])
        criterion.ignore_index=vocab.stoi["<PAD>"]
        
        optimizer = get_optimizer(config.optimizer, my_model.parameters(), config.learning_rate)
        
        # Arrays to log data
        train_loss_arr_epoch, test_loss_arr_epoch, acc_arr_epoch  = [], [], [] # Epoch-wise
        train_loss_arr_batch, test_loss_arr_batch, acc_arr_batch = [], [], [] # Batch-wise
        train_execution_times, test_execution_times = [], [] # Execution times

        
        for epoch in tqdm(range(1, config.epochs + 1)):
            # Training
            my_model.train()
            train_loss_arr_aux, train_time = train(my_model, train_loader, criterion, optimizer, config, epoch)
            my_model.eval()

            # Testing
            acc_arr_aux, test_loss_arr_aux, test_time = test(my_model, test_loader, criterion, vocab, config, device)

            # Check how model performs
            test_model_performance(my_model, test_loader, device, vocab, epoch, config)
            
            # Logging data for vizz
            train_loss_arr_epoch.append(np.mean(train_loss_arr_aux)); test_loss_arr_epoch.append(np.mean(test_loss_arr_aux))
            train_loss_arr_batch += train_loss_arr_aux; test_loss_arr_batch += test_loss_arr_aux
            acc_arr_epoch.append(np.mean(acc_arr_aux)); acc_arr_batch += acc_arr_aux
            train_execution_times.append(train_time); test_execution_times.append(test_time)

            
        if config.save:
            export_data(train_loss_arr_epoch, test_loss_arr_epoch, acc_arr_epoch, train_execution_times, test_execution_times,
                   train_loss_arr_batch, acc_arr_batch, test_loss_arr_batch, config)
            
            save_model(my_model, config, config.DATA_LOCATION+'/logs'+'/EncoderDecorder_model.pth')

    return my_model


if __name__ == "__main__":
    wandb.login()

    print("Using: ", device)

    transforms = T.Compose([
        T.Resize(226),
        T.RandomCrop(224),
        T.ToTensor(),
        T.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    DATA_LOCATION = '../data'

    config = dict(
        # Paths
        root_dir=DATA_LOCATION+"/Images",
        captions_file=DATA_LOCATION+"/captions.txt",
        DATA_LOCATION=DATA_LOCATION,
        save=True,

        # Training data
        epochs=1,
        batch_size=50,
        train_size=0.1,
        
        # Model data
        optimizer='Adam',
        criterion='CrossEntropy',
        learning_rate=0.0001,
        device=device,
        encoder='ResNet50',
        transforms=transforms,
        embed_size=300,
        attention_dim=256,
        encoder_dim=2048,
        decoder_dim=512,
    )

    model = model_pipeline(config)


[34m[1mwandb[0m: Currently logged in as: [33mpau-ventr[0m ([33mgrup10[0m). Use [1m`wandb login --relogin`[0m to force relogin


Using:  cuda:0


  0%|          | 0/1 [00:00<?, ?it/s]

Loss after 00050 examples: 8.026
Loss after 00100 examples: 8.010
Loss after 00150 examples: 7.957
Loss after 00200 examples: 7.943
Loss after 00250 examples: 7.904
Loss after 00300 examples: 7.875
Loss after 00350 examples: 7.808
Loss after 00400 examples: 7.777
Loss after 00450 examples: 7.722
Loss after 00500 examples: 7.702
Loss after 00550 examples: 7.648
Loss after 00600 examples: 7.570
Loss after 00650 examples: 7.531
Loss after 00700 examples: 7.444
Loss after 00750 examples: 7.368
Loss after 00800 examples: 7.333
Loss after 00850 examples: 7.297
Loss after 00900 examples: 7.165
Loss after 00950 examples: 7.010
Loss after 01000 examples: 6.923
Loss after 01050 examples: 6.831
Loss after 01100 examples: 6.886
Loss after 01150 examples: 6.729
Loss after 01200 examples: 6.596
Loss after 01250 examples: 6.563
Loss after 01300 examples: 6.427
Loss after 01350 examples: 6.366
Loss after 01400 examples: 6.182
Loss after 01450 examples: 6.148
Loss after 01500 examples: 6.003
Loss after

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Mean BLEU score of the model on the 100 test images: 4.835781380720562e-155%


0,1
epoch,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss,████▇▇▇▇▆▆▅▅▅▄▄▄▃▃▃▃▂▂▂▂▂▂▂▁▂▂▂▁▁▁▁▁▁▁▁▁
test_mean_bleu,▁

0,1
epoch,1.0
loss,4.80721
test_mean_bleu,0.0


In [26]:
np.mean([1,2,4])

2.3333333333333335

In [24]:
b

[]

In [41]:
with wandb.init(project="pytorch-demo", config=config):
        # access all HPs through wandb.config, so logging matches execution!
        config = wandb.config

        # Execute only once to create the dataset
        # generate_and_dump_dataset(config.root_dir, config.captions_file, config.transforms, cfg.DATA_LOCATION)

        # Generate Dataset
        dataset = make_dataset(config)

        # Get the data loaders
        train_loader, test_loader = make_dataloaders(config, dataset, 1)

        # Generate vocab
        vocab = dataset.vocab
        config.vocab_size = len(vocab)

        # Get the model
        my_model = make_model(config, device)

        # Define the loss and optimizer
        criterion = config.criterion
        criterion.ignore_index=vocab.stoi["<PAD>"]
        
        optimizer = config.optimizer
        optimizer.parms = my_model.parameters()
        optimizer.lr = config.learning_rate

VBox(children=(Label(value='0.001 MB of 0.017 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.066979…

AttributeError: 'str' object has no attribute 'ignore_index'

In [None]:
config.criterion

In [9]:
crit.ignore_index = 4

In [18]:
a = torch.optim.Adam

In [21]:
a(params=[torch.zeros([4,2])])

Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.001
    maximize: False
    weight_decay: 0
)

In [20]:
a

torch.optim.adam.Adam

In [3]:
with wandb.init(project="pytorch-demo", config=config):
    config = wandb.config

    # Generate Dataset
    dataset = make_dataset(config)

    # make the data_loaders, and optimizer
    #train_loader, test_loader = make_dataloaders(config, dataset, 1)

In [4]:
data_list = preprocess_dataset(dataset)

In [5]:
data_list[:3]

[[tensor([[[-0.9019, -0.1315, -0.2341,  ..., -1.3301, -1.5527, -1.2783],
           [-1.0215, -0.1829, -0.2170,  ..., -1.4502, -1.5869, -1.6895],
           [-1.0566, -0.1656, -0.2000,  ..., -1.6211, -1.5527, -1.6387],
           ...,
           [ 1.6670,  1.4951,  0.4680,  ...,  1.5293,  0.6733,  0.7246],
           [ 0.8789,  0.1083,  0.0056,  ...,  1.4951,  0.6733,  0.6904],
           [ 0.6904,  1.5469,  0.9644,  ...,  1.4951,  0.7075,  0.6904]],
  
          [[-0.7925,  0.0301, -0.0049,  ..., -1.2832, -1.5107, -0.9502],
           [-0.9678, -0.0224,  0.0476,  ..., -1.4229, -1.5635, -1.5801],
           [-1.0029, -0.0049,  0.0301,  ..., -1.5459, -1.4404, -1.5459],
           ...,
           [ 1.2207,  0.7656, -0.6177,  ...,  1.8506,  1.2031,  1.1855],
           [-0.2500, -0.4775, -0.4602,  ...,  1.8330,  1.2207,  1.2031],
           [ 0.0476,  1.0459,  0.3103,  ...,  1.8154,  1.2031,  1.1855]],
  
          [[-0.6021,  0.1302,  0.0779,  ..., -1.2812, -1.4902, -1.2637],
           

In [None]:
my_iter = iter(train_loader)
img, cap = next(my_iter)

In [None]:
cap

In [15]:
len(img)

50

In [7]:
img2, cap2 = next(my_iter)

In [8]:
cap2

tensor([[   1,    4,    9,    7,   32,   10,  711,   27,  104, 2409,    5,    2,
            0,    0,    0,    0,    0,    0,    0,    0],
        [   1,    4,    9,    7,   32,   76,    4,  157, 2409,    5,    2,    0,
            0,    0,    0,    0,    0,    0,    0,    0],
        [   1,    4,    7,  316,   76,    4,  157,   74,    5,    2,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0],
        [   1,    4,    9,    7,    8,    4,  195,  151,  316,   76,    4,  157,
            3,    5,    2,    0,    0,    0,    0,    0],
        [   1,    4,   28,    8,    4,  195,  151,   17,   32,   67,    4,  353,
           11,  711,    8,   24,    3,  496,    5,    2],
        [   1,    4,    7,  316,   76,    4,  157,   74,    5,    2,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0],
        [   1,    4,    9,    7,   32,   10,  711,   27,  104, 2409,    5,    2,
            0,    0,    0,    0,    0,    0,    0,    0],
        [   1,    4,    9, 

In [17]:
image,captions = next(iter(train_loader))

In [21]:
image, captions = image.to(device), captions.to(device)

# Zero the gradients.
optimizer.zero_grad()

# Feed forward
outputs, attentions = my_model(image.to(torch.float32), captions)

# Calculate the batch loss.
targets = captions[:, 1:]
loss = criterion(outputs.view(-1, config.vocab_size), targets.reshape(-1))

# Backward pass.
loss.backward()

# Update the parameters in the optimizer.
optimizer.step()

In [27]:
outputs.view(-1, config.vocab_size)

tensor([[-0.0419, -0.0200, -0.0398,  ..., -0.0310,  0.0729,  0.0062],
        [ 0.1156, -0.0232, -0.1058,  ...,  0.0588,  0.0166, -0.1509],
        [ 0.0336,  0.0681, -0.1028,  ...,  0.0521,  0.0040, -0.1283],
        ...,
        [-0.2936,  0.1309,  0.0571,  ..., -0.1252,  0.2291, -0.0856],
        [-0.2734,  0.2516, -0.0050,  ...,  0.0603, -0.0179, -0.0015],
        [-0.2988,  0.1344, -0.0523,  ...,  0.0208, -0.0471,  0.0206]],
       device='cuda:0', grad_fn=<ViewBackward0>)

In [35]:
captions[:,:].shape

torch.Size([50, 20])

In [39]:
outputs.view(-1, config.vocab_size)

tensor([[-0.0419, -0.0200, -0.0398,  ..., -0.0310,  0.0729,  0.0062],
        [ 0.1156, -0.0232, -0.1058,  ...,  0.0588,  0.0166, -0.1509],
        [ 0.0336,  0.0681, -0.1028,  ...,  0.0521,  0.0040, -0.1283],
        ...,
        [-0.2936,  0.1309,  0.0571,  ..., -0.1252,  0.2291, -0.0856],
        [-0.2734,  0.2516, -0.0050,  ...,  0.0603, -0.0179, -0.0015],
        [-0.2988,  0.1344, -0.0523,  ...,  0.0208, -0.0471,  0.0206]],
       device='cuda:0', grad_fn=<ViewBackward0>)

In [None]:
# Run training and track with wandb
example_ct = 0  # number of examples seen
batch_ct = 0

loss_arr_batch = []  # Losses of the batches

for idx, (image, captions) in enumerate(iter(data_loader)):

    loss = train_batch(image.to(torch.float32), captions, model, config.vocab_size, optimizer, criterion, device=config.device)
    example_ct += len(image)
    batch_ct += 1

    loss_arr_batch.append(loss.tolist())

    # Report metrics every 1th batch
    if ((batch_ct + 1) % 1) == 0 and verbatim:
        train_log(loss, example_ct, epoch)


In [4]:
t0 = time.time()
my_iter = iter(train_loader)
t1 = time.time()
t0-t1
# bs 32 nw all

-5.296547889709473

In [5]:
a, b = next(my_iter)

In [8]:
train_loader.dataset.data[0//5][1][0%5]

tensor([  1,   4,  28,   8,   4, 195, 151,  17,  32,  67,   4, 353,  11, 711,
          8,  24,   3, 496,   5,   2], dtype=torch.int16)

In [11]:
b

tensor([[   1,    4,    9,    7,    8,    4,  195,  151,  316,   76,    4,  157,
            3,    5,    2,    0,    0,    0,    0,    0],
        [   1,    4,    9,    7,   32,   10,  711,   27,  104, 2409,    5,    2,
            0,    0,    0,    0,    0,    0,    0,    0],
        [   1,    4,    7,  316,   76,    4,  157,   74,    5,    2,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0],
        [   1,    4,    7,  316,   76,    4,  157,   74,    5,    2,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0],
        [   1,    4,    9,    7,   32,   76,    4,  157, 2409,    5,    2,    0,
            0,    0,    0,    0,    0,    0,    0,    0],
        [   1,    4,   28,    8,    4,  195,  151,   17,   32,   67,    4,  353,
           11,  711,    8,   24,    3,  496,    5,    2],
        [   1,    4,    9,    7,   32,   76,    4,  157, 2409,    5,    2,    0,
            0,    0,    0,    0,    0,    0,    0,    0],
        [   1,    4,    9, 

In [37]:
t0 = time.time()
my_iter = iter(train_loader)
t1 = time.time()
t0-t1
# bs 500 nw 1

-5.776714086532593

In [38]:
for a, b in my_iter:
    print(1)

1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1


In [21]:
iter_2 = deepcopy(my_iter)

NotImplementedError: ('{} cannot be pickled', '_MultiProcessingDataLoaderIter')