In [5]:
from torch.utils.data import Dataset, DataLoader
import sys
import os
import torch
from torch import optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.autograd import Variable
import time
from tqdm import tqdm 

sys.path.append('src/')
from data.dataset_utils import TripletDataLoader
#from model_architectures.googlenet.googtilenet import make_googtilenet
from model_architectures.googlenet.googtilenet_v3 import make_googtilenet
from training import train_triplet_epoch

# Pin this process to the first GPU before CUDA availability is queried.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
cuda = torch.cuda.is_available()

# Dataset selector handed to TripletDataLoader, and the network dimensions
# handed to make_googtilenet.
img_type = 'naip'
in_channels, z_dim = 4, 512

def prep_triplets(triplets, cuda):
    """
    Unpack a batch of triplets and move it to the GPU when requested.

    Args:
        triplets: mapping that provides the entries 'anchor', 'neighbor' and
            'distant'. Each entry must support ``.cuda()`` when ``cuda`` is
            truthy (presumably batched image tensors -- confirm against
            TripletDataLoader).
        cuda: if truthy, every entry is transferred with ``.cuda()``;
            otherwise the entries are returned where they already live.

    Returns:
        Tuple ``(anchor, neighbor, distant)``.
    """
    # torch.autograd.Variable has been a deprecated no-op wrapper since
    # PyTorch 0.4; plain tensors take part in autograd directly.
    a, n, d = (triplets[key] for key in ('anchor', 'neighbor', 'distant'))
    if cuda:
        a, n, d = a.cuda(), n.cuda(), d.cuda()
    return (a, n, d)


In [9]:
# Hyperparameters, grouped so they can be tuned in one place.
lr = 1e-3          # step size given to Adam below
margin = 10        # forwarded to net.loss(..., margin=...) by the training loop
l2 = 0.01          # forwarded to net.loss(..., l2=...) by the training loop
print_every=100    # progress-print interval used by the training loop

# Build the network, move it to the GPU when one is available, and switch it
# to training mode.
net = make_googtilenet(in_channels=in_channels, z_dim=z_dim)
if cuda:
    net.cuda()
net.train()
print('GoogTiLeNet set up complete.')

# Adam over all network parameters.
optimizer = optim.Adam(
    net.parameters(),
    lr=lr,
    betas=(0.5, 0.999),
    weight_decay=0.01,
)
# scheduler = ReduceLROnPlateau(optimizer, factor=0.1, patience=5)
print('Optimizer set up complete.')

# Triplet batches for the selected imagery type.
dataloader = TripletDataLoader(img_type, batch_size=64)
print('Dataset set up.')

# Run one pass over `dataloader`. After every backward pass and optimizer step
# the gradients / parameters are checked for NaN or infinite values, and the
# mean loss of the batches seen since the previous log line is printed
# whenever the number of samples processed is a multiple of `print_every`.
net.train()
sum_loss = 0.0         # running total of per-batch loss values (Python float)
print_sum_loss = 0.0   # value of sum_loss at the previous progress print
print_batch_count = 0  # number of batches already covered by a progress print
n_train, n_batches = len(dataloader.dataset), len(dataloader)
for idx, triplets in enumerate(tqdm(dataloader, desc="training loop within epoch")):
    p, n, d = prep_triplets(triplets, cuda)
    optimizer.zero_grad()
    loss, l_n, l_d, l_nd, l2_loss = net.loss(p, n, d, margin=margin, l2=l2)
    loss.backward()

    # Fail before the optimizer step so a bad gradient never reaches the
    # weights. Parameters whose .grad is None are skipped (torch.isfinite
    # cannot be applied to None).
    for name, param in net.named_parameters():
        if param.grad is not None and not torch.isfinite(param.grad).all():
            raise RuntimeError(
                "Gradient contains NaN or infinite values "
                "(parameter '{}', batch {}).".format(name, idx))

    optimizer.step()
    # scheduler.step(loss)

    for name, param in net.named_parameters():
        if not torch.isfinite(param).all():
            raise RuntimeError(
                "Parameter contains NaN or infinite values "
                "(parameter '{}', batch {}).".format(name, idx))

    # .item() yields a plain Python float, so the running sum does not keep a
    # tensor alive between iterations.
    sum_loss += loss.item()

    samples_seen = (idx + 1) * dataloader.batch_size
    if samples_seen % print_every == 0:
        # Average over the batches actually processed since the last print;
        # print_every / batch_size is not that count unless print_every is a
        # multiple of the batch size.
        batches_done = idx + 1
        print_avg_loss = (sum_loss - print_sum_loss) / (batches_done - print_batch_count)
        print('[{}/{} ], avg loss: {:0.4f}, last l2_loss: {:0.4f} '.format(
            samples_seen, n_train, print_avg_loss, float(l2_loss)))
        print_sum_loss = sum_loss
        print_batch_count = batches_done

# Summary for the whole pass (skipped when the loader yielded no batches).
if n_batches > 0:
    avg_loss = sum_loss / n_batches
    print('  Average loss: {:0.4f}'.format(avg_loss))

GoogTiLeNet set up complete.
Optimizer set up complete.
73894
Dataset set up.


training loop within epoch:   0%|          | 1/1155 [00:04<1:28:21,  4.59s/it]


OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 14.56 GiB total capacity; 2.06 GiB already allocated; 17.50 MiB free; 2.10 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF