# Main Code: Compute Bleu Score

In [8]:
import nltk
class Metrics:
    @staticmethod
    def getBleuScore(listOfGroundTruthStrings, predictionStrings):
        """
        listOfGroundTruthStrings - List of Ground Truth strings Ex: [[ref1a, ref1b], [ref2a, ref2b], [ref3a]]
        predictionStrings - Prediction string - Ex: [pred1, pred2, pred3]
        """
        
        listOfGroundTruthWords = [[groundTruthString.split() for groundTruthString in groundTruthStrings] for groundTruthStrings in listOfGroundTruthStrings]
        predictionWords = list(map(str.split, predictionStrings))
        return nltk.translate.bleu_score.corpus_bleu(listOfGroundTruthWords, predictionWords)


In [9]:
def extractGroundTruthsAndPredictions(prediction_groundtruths):
    GroundTruthListKey = 'GroundTruth'
    PredictionKey = 'Prediction'

    listOfGroundTruthStrings = []
    predictionStrings = []

    for val in prediction_groundtruths.values():
        listOfGroundTruthStrings.append(val[GroundTruthListKey])
        predictionStrings.append(val[PredictionKey])     

    return listOfGroundTruthStrings, predictionStrings

# Features_2D - Bleu Score

## Load GroundTruth corresponding Predictions file

In [25]:
import pickle

prediction_groundtruths = pickle.load( open( "./DATA/Predictions/VideoCaptions_2d/video_groundtruth_predictions.pkl", "rb" ) )

## Extract ground truths and prediction string

In [11]:
listOfGroundTruthStrings, predictionStrings = extractGroundTruthsAndPredictions(prediction_groundtruths)

## Bleu Score: 2D

In [13]:
Metrics.getBleuScore(listOfGroundTruthStrings, predictionStrings)

0.47454940871518997

## pickle file to support Python2 version for Meteor scoring 

In [26]:
pickle.dump(prediction_groundtruths, open( "./DATA/Predictions/VideoCaptions_2d/video_groundtruth_predictions_p2.pkl", "wb"), protocol=2)

------------------------------------

# Features_2D_MeanPooling - Bleu Score

## Load GroundTruth corresponding Predictions file

In [27]:
import pickle

prediction_groundtruths = pickle.load( open( "./DATA/Predictions/VideoCaptions_2d_mean/video_groundtruth_predictions.pkl", "rb" ) )

## Extract ground truths and prediction string

In [15]:
listOfGroundTruthStrings, predictionStrings = extractGroundTruthsAndPredictions(prediction_groundtruths)

## Bleu Score: 2D_MeanPooling

In [16]:
Metrics.getBleuScore(listOfGroundTruthStrings, predictionStrings)

0.36888953750480535

## pickle file to support Python2 version for Meteor scoring 

In [28]:
pickle.dump(prediction_groundtruths, open( "./DATA/Predictions/VideoCaptions_2d_mean/video_groundtruth_predictions_p2.pkl", "wb"), protocol=2)

-------

# Features_3D - Bleu Score

## Load GroundTruth corresponding Predictions file

In [29]:
import pickle

prediction_groundtruths = pickle.load( open( "./DATA/Predictions/VideoCaptions_3d/video_groundtruth_predictions.pkl", "rb" ) )

## Extract ground truths and prediction string

In [18]:
listOfGroundTruthStrings, predictionStrings = extractGroundTruthsAndPredictions(prediction_groundtruths)

## Bleu Score: 2D_MeanPooling

In [19]:
Metrics.getBleuScore(listOfGroundTruthStrings, predictionStrings)

0.23616249823541016

## pickle file to support Python2 version for Meteor scoring 

In [30]:
pickle.dump(prediction_groundtruths, open( "./DATA/Predictions/VideoCaptions_3d/video_groundtruth_predictions_p2.pkl", "wb"), protocol=2)

------

### Tried implementing Meteor

## Get Meteor related file 
from this location "http://www.cs.cmu.edu/~alavie/METEOR/download/meteor-1.5.tar.gz"

and then extract tar file and copy meteor-1.5.jar file to the same folder that this notebook is present.

In [2]:
from meteor import Meteor
meteorObj = Meteor()
meteorObj.compute_score({1:['it is a cat', 'it looks like a cat'], 2: ['it is a chair']},{1:['it is cat'], 2:['it is a chair']})

IOError: [Errno 32] Broken pipe

# IGNORE: This part is just for test 

In [8]:
import nltk

hypothesis = 'it may be a cat'.split()
references = ['it is a cat'.split(),\
              'it is cat'.split(),\
              'it seems a cat'.split()\
             ]
#there may be several references
BLEUscore = nltk.translate.bleu_score.sentence_bleu(references, hypothesis)
print(BLEUscore)

0.6223329772884784


Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


Test Bleu@N using NLTK

In [1]:
import nltk 

In [7]:
import nltk

hypothesis = ['It', 'is', 'a', 'cat', 'at', 'room']
references = [['It', 'is', 'a', 'cat', 'inside', 'the', 'room'], 'it looks like a cat is inside the room'.split(), \
              'it seems to be a cat inside the room'.split(),\
              'there is a cat inside the room'.split()\
             ]
#there may be several references
BLEUscore = nltk.translate.bleu_score.sentence_bleu(reference, hypothesis)
print(BLEUscore)

0.6389431042462724


Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


In [11]:
import nltk
hypothesis =  ['it is a cat'.split(),\
               'dog is running'.split()\
              ]
             
references = [
              ['it is a cat'.split(),\
               'it is cat'.split(),\
               'it seems a cat'.split()\
              ],\
              [
                'dog is running'.split(),\
                'dog runs'.split()
              ]
            ]
#there may be several references
BLEUscore = nltk.translate.bleu_score.corpus_bleu(references, hypothesis)
print(BLEUscore)

0.8408964152537145
