In [1]:
from __future__ import print_function

import matplotlib; matplotlib.use('Agg')
import os
import os.path as osp
import argparse

from train import train 
from test import test
from test_beam import test_beam 

## 하이퍼 파라미터 설정

In [2]:
parser = argparse.ArgumentParser(description='PyTorch Convolutional Image Captioning Model')

parser.add_argument('model_dir', help='output directory to save models & results')

parser.add_argument('-g', '--gpu', type=int, default=0,\
                    help='gpu device id')

parser.add_argument('--coco_root', type=str, default= './data/coco/',\
                    help='directory containing coco dataset train2014, val2014, & annotations')

parser.add_argument('-t', '--is_train', type=int, default=1,\
                    help='use 1 to train model')

parser.add_argument('-e', '--epochs', type=int, default=30,\
                    help='number of training epochs')

parser.add_argument('-b', '--batchsize', type=int, default=32,\
                    help='number of images per training batch')

parser.add_argument('-c', '--ncap_per_img', type=int, default=5,\
                    help='ground-truth captions per image in training batch')

parser.add_argument('-n', '--num_layers', type=int, default=3,\
                    help='depth of convcap network')

parser.add_argument('-m', '--nthreads', type=int, default=4,\
                    help='pytorch data loader threads')

# parser.add_argument('-ft', '--finetune_after', type=int, default=8,\
#                     help='epochs after which vgg16 is fine-tuned')

parser.add_argument('-lr', '--learning_rate', type=float, default=5e-5,\
                    help='learning rate for convcap')

parser.add_argument('-st', '--lr_step_size', type=int, default=15,\
                    help='epochs to decay learning rate after')

parser.add_argument('-sc', '--score_select', type=str, default='CIDEr',\
                    help='metric to pick best model')

parser.add_argument('--beam_size', type=int, default=1, \
                    help='beam size to use for test') 

parser.add_argument('--attention', dest='attention', action='store_true', \
                    help='Use this for convcap with attention (by default set)')

parser.add_argument('--no-attention', dest='attention', action='store_false', \
                    help='Use this for convcap without attention')


_StoreFalseAction(option_strings=['--no-attention'], dest='attention', nargs=0, const=False, default=True, type=None, choices=None, help='Use this for convcap without attention', metavar=None)

In [3]:
parser.set_defaults(attention=True)

args, _ = parser.parse_known_args()

In [4]:
args.finetune_after = 8
args.model_dir = 'output'

In [5]:
import os
import os.path as osp
import argparse
import numpy as np 
import json
import time
 

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.autograd import Variable
from torch.utils.data import DataLoader

import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torchvision import models                                                                     

from coco_loader import coco_loader
from convcap import convcap
from vggfeats import Vgg16Feats
from tqdm import tqdm 
from test import test 

In [6]:
os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu)

In [7]:
if (args.is_train == 1):
    print('train')

train


In [8]:
t_start = time.time()
train_data = coco_loader(args.coco_root, split='train', ncap_per_img=args.ncap_per_img)
print('[DEBUG] Loading train data ... %f secs' % (time.time() - t_start))

Loading annotation file...
Found 113287 images in split: train
[DEBUG] #words in wordlist: 9221
[DEBUG] Loading train data ... 3.481802 secs


In [9]:
train_data_loader = DataLoader(dataset=train_data, num_workers=0, batch_size=args.batchsize, \
                               shuffle=True, drop_last=True)

In [10]:
model_imgcnn = Vgg16Feats()
model_imgcnn.cuda()
model_imgcnn.train(True)

Vgg16Feats(
  (features_nopool): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2, stride=2, padding

In [11]:
#Convcap model
model_convcap = convcap(train_data.numwords, args.num_layers, is_attention=args.attention)
model_convcap.cuda()
model_convcap.train(True)

convcap(
  (emb_0): Embedding(9221, 512, padding_idx=0)
  (emb_1): Linear(in_features=512, out_features=512, bias=True)
  (imgproj): Linear(in_features=4096, out_features=512, bias=True)
  (resproj): Linear(in_features=1024, out_features=512, bias=True)
  (convs): ModuleList(
    (0): Conv1d(1024, 1024, kernel_size=(5,), stride=(1,), padding=(4,))
    (1): Conv1d(512, 1024, kernel_size=(5,), stride=(1,), padding=(4,))
    (2): Conv1d(512, 1024, kernel_size=(5,), stride=(1,), padding=(4,))
  )
  (attention): ModuleList(
    (0): AttentionLayer(
      (in_projection): Linear(in_features=512, out_features=512, bias=True)
      (out_projection): Linear(in_features=512, out_features=512, bias=True)
    )
    (1): AttentionLayer(
      (in_projection): Linear(in_features=512, out_features=512, bias=True)
      (out_projection): Linear(in_features=512, out_features=512, bias=True)
    )
    (2): AttentionLayer(
      (in_projection): Linear(in_features=512, out_features=512, bias=True)
      

In [12]:
optimizer = optim.RMSprop(model_convcap.parameters(), lr=args.learning_rate)
scheduler = lr_scheduler.StepLR(optimizer, step_size=args.lr_step_size, gamma=.1)
img_optimizer = None

In [13]:
batchsize = args.batchsize
ncap_per_img = args.ncap_per_img
batchsize_cap = batchsize*ncap_per_img
max_tokens = train_data.max_tokens
nbatches = np.int_(np.floor((len(train_data.ids)*1.)/batchsize)) 
bestscore = .0

In [14]:
def repeat_img_per_cap(imgsfeats, imgsfc7, ncap_per_img):
    batchsize, featdim, feat_h, feat_w = imgsfeats.size()
    batchsize_cap = batchsize*ncap_per_img
    imgsfeats = imgsfeats.unsqueeze(1).expand(batchsize, ncap_per_img, featdim, feat_h, feat_w)
    imgsfeats = imgsfeats.contiguous().view(batchsize_cap, featdim, feat_h, feat_w)
    
    batchsize, featdim = imgsfc7.size()
    batchsize_cap = batchsize*ncap_per_img
    imgsfc7 = imgsfc7.unsqueeze(1).expand(batchsize, ncap_per_img, featdim)
    imgsfc7 = imgsfc7.contiguous().view(batchsize_cap, featdim)
    
    return imgsfeats, imgsfc7

In [15]:
args.epochs

30

In [16]:
#   for epoch in range(args.epochs):
# 코드가 잘 돌아가는지 확인하기 위해 2번만 돌려봤습니다. 전 30(args.epochs)번 돌렸습니다.

In [17]:
for epoch in range(2):
    loss_train = 0.
    
    if(epoch == args.finetune_after):
        img_optimizer = optim.RMSprop(model_imgcnn.parameters(), lr=1e-5)
        img_scheduler = lr_scheduler.StepLR(img_optimizer, step_size=args.lr_step_size, gamma=.1)

    scheduler.step()    
    if(img_optimizer):
        img_scheduler.step()

    #One epoch of train
    for batch_idx, (imgs, captions, wordclass, mask, _) in tqdm(enumerate(train_data_loader), total=nbatches):
        imgs = imgs.view(batchsize, 3, 224, 224)
        wordclass = wordclass.view(batchsize_cap, max_tokens)
        mask = mask.view(batchsize_cap, max_tokens)
        
        imgs_v = Variable(imgs).cuda()
        wordclass_v = Variable(wordclass).cuda()
        
        optimizer.zero_grad()
        if(img_optimizer):
            img_optimizer.zero_grad() 

        imgsfeats, imgsfc7 = model_imgcnn(imgs_v)
        imgsfeats, imgsfc7 = repeat_img_per_cap(imgsfeats, imgsfc7, ncap_per_img)
        _, _, feat_h, feat_w = imgsfeats.size()

        if(args.attention == True):
            wordact, attn = model_convcap(imgsfeats, imgsfc7, wordclass_v)
            attn = attn.view(batchsize_cap, max_tokens, feat_h, feat_w)
        else:
            wordact, _ = model_convcap(imgsfeats, imgsfc7, wordclass_v)
        
        wordact = wordact[:,:,:-1]
        wordclass_v = wordclass_v[:,1:]
        mask = mask[:,1:].contiguous()
        
        wordact_t = wordact.permute(0, 2, 1).contiguous().view(batchsize_cap*(max_tokens-1), -1)
        wordclass_t = wordclass_v.contiguous().view(batchsize_cap*(max_tokens-1), 1)
      
        maskids = torch.nonzero(mask.view(-1)).numpy().reshape(-1)

        if(args.attention == True):
            #Cross-entropy loss and attention loss of Show, Attend and Tell
            loss = F.cross_entropy(wordact_t[maskids, ...], \
            wordclass_t[maskids, ...].contiguous().view(maskids.shape[0])) \
            + (torch.sum(torch.pow(1. - torch.sum(attn, 1), 2)))\
            /(batchsize_cap*feat_h*feat_w)
        else:
            loss = F.cross_entropy(wordact_t[maskids, ...], \
            wordclass_t[maskids, ...].contiguous().view(maskids.shape[0]))

    loss_train = loss_train + loss.data

    loss.backward()

    optimizer.step()
    if(img_optimizer):
        img_optimizer.step()

    loss_train = (loss_train*1.)/(batch_idx)
    print('[DEBUG] Training epoch %d has loss %f' % (epoch, loss_train))

    modelfn = osp.join(args.model_dir, 'model.pth')

    if(img_optimizer):
        img_optimizer_dict = img_optimizer.state_dict()
    else:
        img_optimizer_dict = None

    torch.save({
        'epoch': epoch,
        'state_dict': model_convcap.state_dict(),
        'img_state_dict': model_imgcnn.state_dict(),
        'optimizer' : optimizer.state_dict(),
        'img_optimizer' : img_optimizer_dict,
      }, modelfn)

    #Run on validation and obtain score
    scores = test(args, 'val', model_convcap=model_convcap, model_imgcnn=model_imgcnn)
    score = scores[0][args.score_select]

    if(score > bestscore):
        bestscore = score
        print('[DEBUG] Saving model at epoch %d with %s score of %f'\
              % (epoch, args.score_select, score))
        bestmodelfn = osp.join(args.model_dir, 'bestmodel.pth')
        os.system('cp %s %s' % (modelfn, bestmodelfn))

  x = F.softmax(x.view(sz[0] * sz[1], sz[2]))
100%|██████████████████████████████████████████████████████████████████████████████| 3540/3540 [43:29<00:00,  1.36it/s]


[DEBUG] Training epoch 0 has loss 0.011605
Loading annotation file...
Found 5000 images in split: val
[DEBUG] #words in wordlist: 9221
[DEBUG] Loading val data ... 4.796083 secs
[DEBUG] Running inference on val with 156 batches


  wordprobs = F.softmax(wordact_t).cpu().data.numpy()
100%|████████████████████████████████████████████████████████████████████████████████| 156/156 [00:50<00:00,  3.07it/s]


loading annotations into memory...
Done (t=1.26s)
creating index...
index created!
Using 4992/4992 predictions
Loading and preparing results...
DONE (t=0.04s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 69851, 'reflen': 61712, 'guess': [69851, 64859, 59870, 54881], 'correct': [390, 0, 0, 0]}
ratio: 1.1318868291418018
Bleu_1: 0.006
Bleu_2: 0.000
Bleu_3: 0.000
Bleu_4: 0.000
computing METEOR score...
METEOR: 0.013
computing Rouge score...
ROUGE_L: 0.007
computing CIDEr score...
CIDEr: 0.001
computing SPICE score...


CalledProcessError: Command '['java', '-jar', '-Xmx8G', 'spice-1.0.jar', 'C:\\Users\\410\\anaconda3\\envs\\convcap\\lib\\site-packages\\pycocoevalcap\\spice\\tmp\\tmpfqip9rjt', '-cache', 'C:\\Users\\410\\anaconda3\\envs\\convcap\\lib\\site-packages\\pycocoevalcap\\spice\\cache', '-out', 'C:\\Users\\410\\anaconda3\\envs\\convcap\\lib\\site-packages\\pycocoevalcap\\spice\\tmp\\tmpr4namkiy', '-subset', '-silent']' returned non-zero exit status 1.

In [None]:
bestmodelfn = osp.join(args.model_dir, 'bestmodel.pth')

In [None]:
if (osp.exists(bestmodelfn)):
    print('if (osp.exists(bestmodelfn)):')
    
    if (args.beam_size == 1):
        print('if (args.beam_size == 1):')
        scores = test(args, 'test', modelfn=bestmodelfn)
    else:
        print('else:')
        scores = test_beam(args, 'test', modelfn=bestmodelfn)
        
    print('TEST set scores')
    for k, v in scores[0].items():
        print('%s: %f' % (k, v))
else:
    print('2 else')
    raise Exception('No checkpoint found %s' % bestmodelfn)

In [None]:
scores[0].items()