how to test another images? #36

Open · wants to merge 5 commits into master
7 changes: 7 additions & 0 deletions README.md
@@ -1,3 +1,10 @@
## Python 3 Version of Show, Attend and Tell using Tensorflow
This repo is a Python 3 version of [DeepRNN/image_captioning](https://github.com/DeepRNN/image_captioning), which implements "Show, Attend and Tell: Neural Image Caption Generation with Visual Attention" by Xu et al. (ICML 2015). Many thanks to [salaniz's COCO evaluation tool for Python 3](https://github.com/salaniz/pycocoevalcap). I am using:
- Python 3.6
- Tensorflow 1.8.0
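
A quick way to confirm that your environment matches these versions (a minimal check, not part of the repo):

```python
# Minimal environment check; the versions above are the ones this port was tested with.
import sys
import tensorflow as tf

print(sys.version)       # expect 3.6.x
print(tf.__version__)    # expect 1.8.0
```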

#### Original readme below

### Introduction
This neural system for image captioning is roughly based on the paper "Show, Attend and Tell: Neural Image Caption Generation with Visual Attention" by Xu et al. (ICML 2015). The input is an image, and the output is a sentence describing the content of the image. It uses a convolutional neural network to extract visual features from the image, and uses an LSTM recurrent neural network to decode these features into a sentence. A soft attention mechanism is incorporated to improve the quality of the caption. This project is implemented using the Tensorflow library, and allows end-to-end training of both CNN and RNN parts.
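
As a rough sketch of how these pieces fit together, one decoding step with soft attention can be written in TensorFlow 1.x roughly as below; the tensor shapes, layer names, and sizes are illustrative assumptions, not the configuration used in this repo.

```python
# One decoding step of a CNN-encoder / LSTM-decoder captioner with soft attention.
# All sizes below are illustrative assumptions.
import tensorflow as tf

num_regions, feat_dim, embed_dim, hidden_dim, vocab_size = 196, 512, 512, 1024, 5000

# CNN features: one vector per spatial region of the image (e.g. a 14x14 conv map).
features = tf.placeholder(tf.float32, [None, num_regions, feat_dim])
prev_word = tf.placeholder(tf.int32, [None])          # word generated at the previous step
embedding = tf.get_variable('embedding', [vocab_size, embed_dim])

lstm = tf.nn.rnn_cell.LSTMCell(hidden_dim)
state = lstm.zero_state(tf.shape(features)[0], tf.float32)

# Soft attention: score every region against the current hidden state and
# average the region features under the resulting softmax weights.
tiled_h = tf.tile(state.h[:, None, :], [1, num_regions, 1])
scores = tf.layers.dense(tf.concat([features, tiled_h], axis=-1), 1)
alpha = tf.nn.softmax(tf.squeeze(scores, axis=-1))    # [batch, num_regions]
context = tf.reduce_sum(features * alpha[:, :, None], axis=1)

# Feed the previous word embedding and the attended context into the LSTM,
# then predict a distribution over the next word.
lstm_input = tf.concat([tf.nn.embedding_lookup(embedding, prev_word), context], axis=-1)
output, state = lstm(lstm_input, state)
logits = tf.layers.dense(output, vocab_size)
```

Training maximizes the likelihood of the ground-truth caption word by word, and the gradients can also flow into the CNN, which is what makes the end-to-end training mentioned above possible.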

20 changes: 14 additions & 6 deletions base_model.py
@@ -2,8 +2,12 @@
import numpy as np
import pandas as pd
import tensorflow as tf

+import matplotlib
+matplotlib.use('agg')

import matplotlib.pyplot as plt
-import cPickle as pickle
+import pickle
import copy
import json
from tqdm import tqdm
@@ -66,6 +70,7 @@ def eval(self, sess, eval_gt_coco, eval_data, vocabulary):
config = self.config

results = []
+print('config.eval_result_dir:', config.eval_result_dir)
if not os.path.exists(config.eval_result_dir):
os.mkdir(config.eval_result_dir)

@@ -81,7 +86,7 @@ def eval(self, sess, eval_gt_coco, eval_data, vocabulary):
word_idxs = caption_data[l][0].sentence
score = caption_data[l][0].score
caption = vocabulary.get_sentence(word_idxs)
-results.append({'image_id': eval_data.image_ids[idx],
+results.append({'image_id': eval_data.image_ids[idx].item(),
'caption': caption})
idx += 1

@@ -97,7 +102,7 @@ def eval(self, sess, eval_gt_coco, eval_data, vocabulary):
plt.savefig(os.path.join(config.eval_result_dir,
image_name+'_result.jpg'))

-fp = open(config.eval_result_file, 'wb')
+fp = open(config.eval_result_file, 'w')
json.dump(results, fp)
fp.close()

@@ -259,7 +264,7 @@ def load(self, sess, model_file=None):
str(global_step)+".npy")

print("Loading the model from %s..." %save_path)
-data_dict = np.load(save_path).item()
+data_dict = np.load(save_path, encoding='latin1').item()
count = 0
for v in tqdm(tf.global_variables()):
if v.name in data_dict.keys():
@@ -270,11 +275,14 @@ def load(self, sess, model_file=None):
def load_cnn(self, session, data_path, ignore_missing=True):
""" Load a pretrained CNN model. """
print("Loading the CNN from %s..." %data_path)
-data_dict = np.load(data_path).item()
+# import pdb; pdb.set_trace()
+import os;
+data_path = data_path.strip()
+data_dict = np.load(os.getcwd() + '/' + data_path, encoding='latin1').item()
count = 0
for op_name in tqdm(data_dict):
with tf.variable_scope(op_name, reuse = True):
-for param_name, data in data_dict[op_name].iteritems():
+for param_name, data in data_dict[op_name].items():
try:
var = tf.get_variable(param_name)
session.run(var.assign(data))
2 changes: 1 addition & 1 deletion dataset.py
@@ -122,7 +122,7 @@ def prepare_train_data(config):
data = {'word_idxs': word_idxs, 'masks': masks}
np.save(config.temp_data_file, data)
else:
-data = np.load(config.temp_data_file).item()
+data = np.load(config.temp_data_file, encoding='latin1').item()
word_idxs = data['word_idxs']
masks = data['masks']
print("Captions processed.")
1 change: 1 addition & 0 deletions utils/coco/pycocoevalcap/.gitignore
@@ -0,0 +1 @@
*.pyc
44 changes: 44 additions & 0 deletions utils/coco/pycocoevalcap/README.md
@@ -0,0 +1,44 @@
Microsoft COCO Caption Evaluation
===================

Evaluation codes for MS COCO caption generation.

## Description ##
This repository provides Python 3 support for the caption evaluation metrics used for the MS COCO dataset.

The code is derived from the original repository that supports Python 2.7: https://github.com/tylin/coco-caption.
Caption evaluation depends on the COCO API that natively supports Python 3 (see Requirements).

## Requirements ##
- Java 1.8.0
- Python 3 (tested on Python 3.6)
- pycocotools (COCO Python API): https://github.com/cocodataset/cocoapi

## Files ##
./
- evals.py: includes the COCOEvalCap class that can be used to evaluate results on COCO (see the usage sketch after this list).
- tokenizer: Python wrapper of Stanford CoreNLP PTBTokenizer
- bleu: Bleu evaluation codes
- meteor: Meteor evaluation codes
- rouge: Rouge-L evaluation codes
- cider: CIDEr evaluation codes
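
A minimal usage sketch of the evaluation entry point (paths and file names below are placeholders; the interface follows the usual coco-caption pattern):

```python
from pycocotools.coco import COCO
from evals import COCOEvalCap  # evals.py from this directory

# Ground-truth captions and a results file produced by the model
# (a JSON list of {"image_id": ..., "caption": ...} entries).
coco = COCO('annotations/captions_val2014.json')
coco_res = coco.loadRes('results/captions_val2014_results.json')

coco_eval = COCOEvalCap(coco, coco_res)
coco_eval.params['image_id'] = coco_res.getImgIds()  # score only images present in the results
coco_eval.evaluate()

for metric, score in coco_eval.eval.items():
    print(f'{metric}: {score:.3f}')
```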

## References ##

- [Microsoft COCO Captions: Data Collection and Evaluation Server](http://arxiv.org/abs/1504.00325)
- PTBTokenizer: We use the [Stanford Tokenizer](http://nlp.stanford.edu/software/tokenizer.shtml) which is included in [Stanford CoreNLP 3.4.1](http://nlp.stanford.edu/software/corenlp.shtml).
- BLEU: [BLEU: a Method for Automatic Evaluation of Machine Translation](http://www.aclweb.org/anthology/P02-1040.pdf)
- Meteor: [Project page](http://www.cs.cmu.edu/~alavie/METEOR/) with related publications. We use the latest version (1.5) of the [Code](https://github.com/mjdenkowski/meteor). Changes have been made to the source code to properly aggregate the statistics for the entire corpus.
- Rouge-L: [ROUGE: A Package for Automatic Evaluation of Summaries](http://anthology.aclweb.org/W/W04/W04-1013.pdf)
- CIDEr: [CIDEr: Consensus-based Image Description Evaluation](http://arxiv.org/pdf/1411.5726.pdf)

## Developers ##
- Xinlei Chen (CMU)
- Hao Fang (University of Washington)
- Tsung-Yi Lin (Cornell)
- Ramakrishna Vedantam (Virginia Tech)

## Acknowledgement ##
- David Chiang (University of Notre Dame)
- Michael Denkowski (CMU)
- Alexander Rush (Harvard University)
4 changes: 2 additions & 2 deletions utils/coco/pycocoevalcap/bleu/bleu.py
@@ -1,5 +1,5 @@
#!/usr/bin/env python
-#
+#
# File Name : bleu.py
#
# Description : Wrapper for BLEU scorer.
@@ -8,7 +8,7 @@
# Last Modified : Thu 19 Mar 2015 09:13:28 PM PDT
# Authors : Hao Fang <hfang@uw.edu> and Tsung-Yi Lin <tl483@cornell.edu>

-from bleu_scorer import BleuScorer
+from .bleu_scorer import BleuScorer


class Bleu:
53 changes: 27 additions & 26 deletions utils/coco/pycocoevalcap/bleu/bleu_scorer.py
@@ -7,7 +7,7 @@
# reserved. Do not redistribute without permission from the
# author. Not for commercial use.

-# Modified by:
+# Modified by:
# Hao Fang <hfang@uw.edu>
# Tsung-Yi Lin <tl483@cornell.edu>

@@ -26,8 +26,8 @@ def precook(s, n=4, out=False):
can take string arguments as well."""
words = s.split()
counts = defaultdict(int)
-for k in xrange(1,n+1):
-for i in xrange(len(words)-k+1):
+for k in range(1,n+1):
+for i in range(len(words)-k+1):
ngram = tuple(words[i:i+k])
counts[ngram] += 1
return (len(words), counts)
@@ -42,7 +42,7 @@ def cook_refs(refs, eff=None, n=4): ## lhuang: oracle will call with "average"
for ref in refs:
rl, counts = precook(ref, n)
reflen.append(rl)
-for (ngram,count) in counts.iteritems():
+for (ngram,count) in counts.items():
maxcounts[ngram] = max(maxcounts.get(ngram,0), count)

# Calculate effective reference sentence length.
@@ -52,32 +52,33 @@ def cook_refs(refs, eff=None, n=4): ## lhuang: oracle will call with "average"
reflen = float(sum(reflen))/len(reflen)

## lhuang: N.B.: leave reflen computaiton to the very end!!

## lhuang: N.B.: in case of "closest", keep a list of reflens!! (bad design)

return (reflen, maxcounts)

-def cook_test(test, (reflen, refmaxcounts), eff=None, n=4):
+def cook_test(test, refs, eff=None, n=4):
'''Takes a test sentence and returns an object that
encapsulates everything that BLEU needs to know about it.'''

+reflen, refmaxcounts = refs
testlen, counts = precook(test, n, True)

result = {}

# Calculate effective reference sentence length.

if eff == "closest":
result["reflen"] = min((abs(l-testlen), l) for l in reflen)[1]
else: ## i.e., "average" or "shortest" or None
result["reflen"] = reflen

result["testlen"] = testlen

result["guess"] = [max(0,testlen-k+1) for k in xrange(1,n+1)]
result["guess"] = [max(0,testlen-k+1) for k in range(1,n+1)]

result['correct'] = [0]*n
-for (ngram, count) in counts.iteritems():
+for (ngram, count) in counts.items():
result["correct"][len(ngram)-1] += min(refmaxcounts.get(ngram,0), count)

return result
@@ -108,7 +109,7 @@ def __init__(self, test=None, refs=None, n=4, special_reflen=None):

def cook_append(self, test, refs):
'''called by constructor and __iadd__ to avoid creating new instances.'''

if refs is not None:
self.crefs.append(cook_refs(refs))
if test is not None:
@@ -136,7 +137,7 @@ def reflen(self, option=None):

def testlen(self, option=None):
self.compute_score(option=option)
-return self._testlen
+return self._testlen

def retest(self, new_test):
if type(new_test) is str:
@@ -151,7 +152,7 @@ def retest(self, new_test):

def rescore(self, new_test):
''' replace test(s) with new test(s), and returns the new score.'''

return self.retest(new_test).compute_score()

def size(self):
@@ -170,7 +171,7 @@ def __iadd__(self, other):
self.crefs.extend(other.crefs)
self._score = None ## need to recompute

-return self
+return self

def compatible(self, other):
return isinstance(other, BleuScorer) and self.n == other.n
@@ -179,7 +180,7 @@ def single_reflen(self, option="average"):
return self._single_reflen(self.crefs[0][0], option)

def _single_reflen(self, reflens, option=None, testlen=None):

if option == "shortest":
reflen = min(reflens)
elif option == "average":
@@ -194,7 +195,7 @@ def _single_reflen(self, reflens, option=None, testlen=None):
def recompute_score(self, option=None, verbose=0):
self._score = None
return self.compute_score(option, verbose)

def compute_score(self, option=None, verbose=0):
n = self.n
small = 1e-9
@@ -212,7 +213,7 @@ def compute_score(self, option=None, verbose=0):
totalcomps = {'testlen':0, 'reflen':0, 'guess':[0]*n, 'correct':[0]*n}

# for each sentence
-for comps in self.ctest:
+for comps in self.ctest:
testlen = comps['testlen']
self._testlen += testlen

@@ -222,42 +223,42 @@ def compute_score(self, option=None, verbose=0):
reflen = self.special_reflen

self._reflen += reflen

for key in ['guess','correct']:
-for k in xrange(n):
+for k in range(n):
totalcomps[key][k] += comps[key][k]

# append per image bleu score
bleu = 1.
-for k in xrange(n):
+for k in range(n):
bleu *= (float(comps['correct'][k]) + tiny) \
-/(float(comps['guess'][k]) + small)
+/(float(comps['guess'][k]) + small)
bleu_list[k].append(bleu ** (1./(k+1)))
ratio = (testlen + tiny) / (reflen + small) ## N.B.: avoid zero division
if ratio < 1:
-for k in xrange(n):
+for k in range(n):
bleu_list[k][-1] *= math.exp(1 - 1/ratio)

if verbose > 1:
-print comps, reflen
+print(comps, reflen)

totalcomps['reflen'] = self._reflen
totalcomps['testlen'] = self._testlen

bleus = []
bleu = 1.
-for k in xrange(n):
+for k in range(n):
bleu *= float(totalcomps['correct'][k] + tiny) \
/ (totalcomps['guess'][k] + small)
bleus.append(bleu ** (1./(k+1)))
ratio = (self._testlen + tiny) / (self._reflen + small) ## N.B.: avoid zero division
if ratio < 1:
-for k in xrange(n):
+for k in range(n):
bleus[k] *= math.exp(1 - 1/ratio)

if verbose > 0:
-print totalcomps
+print(totalcomps)
-print "ratio:", ratio
+print("ratio:", ratio)

self._score = bleus
return self._score, bleu_list
10 changes: 5 additions & 5 deletions utils/coco/pycocoevalcap/cider/cider.py
@@ -1,18 +1,18 @@
# Filename: cider.py
#
-# Description: Describes the class to compute the CIDEr (Consensus-Based Image Description Evaluation) Metric
+# Description: Describes the class to compute the CIDEr (Consensus-Based Image Description Evaluation) Metric
# by Vedantam, Zitnick, and Parikh (http://arxiv.org/abs/1411.5726)
#
# Creation Date: Sun Feb 8 14:16:54 2015
#
# Authors: Ramakrishna Vedantam <vrama91@vt.edu> and Tsung-Yi Lin <tl483@cornell.edu>

-from cider_scorer import CiderScorer
+from .cider_scorer import CiderScorer
import pdb

class Cider:
"""
-Main Class to compute the CIDEr metric
+Main Class to compute the CIDEr metric

"""
def __init__(self, test=None, refs=None, n=4, sigma=6.0):
@@ -26,7 +26,7 @@ def compute_score(self, gts, res):
Main function to compute CIDEr score
:param hypo_for_image (dict) : dictionary with key <image> and value <tokenized hypothesis / candidate sentence>
ref_for_image (dict) : dictionary with key <image> and value <tokenized reference sentence>
-:return: cider (float) : computed CIDEr score for the corpus
+:return: cider (float) : computed CIDEr score for the corpus
"""

assert(gts.keys() == res.keys())
@@ -51,4 +51,4 @@ def compute_score(self, gts, res):
return score, scores

def method(self):
return "CIDEr"
return "CIDEr"