In [24]:
import itertools
from collections import Counter
from random import shuffle
import math

# Cut-off rank used for the "@k" measures (nDCG and ERR) further down.
k = 3

# Graded relevance labels mapped to their numeric grade.
relevanceScores = {'N': 0, 'R': 1, 'HR': 2}
# The label vocabulary is exactly the key set of the grade table.
relevances = set(relevanceScores)

# Every possible length-5 ranking over the labels ...
rankingsOf5 = [*itertools.product(relevances, repeat=5)]
# ... and every ordered (P, E) pair of such rankings, in random order.
pairsOfRankingsOf5 = [*itertools.product(rankingsOf5, repeat=2)]
shuffle(pairsOfRankingsOf5)
print(len(pairsOfRankingsOf5), "pairs of rankings")

def getContingencies (items, k, relevantDocumentCount):
    """Return the binary contingency counts (TP, FP, TN, FN) at cut-off k.

    items                 -- sliceable ranking of labels ('N', 'R' or 'HR')
    k                     -- number of leading items regarded as retrieved
    relevantDocumentCount -- total number of relevant documents, used only
                             to derive FN as (relevantDocumentCount - TP)

    'R' and 'HR' both count as relevant; 'N' counts as non-relevant. Any
    other label is ignored by all four counts.
    """
    relevantLabels = ('R', 'HR')
    retrieved, notRetrieved = items[:k], items[k:]

    truePositives = sum(1 for label in retrieved if label in relevantLabels)
    falsePositives = sum(1 for label in retrieved if label == 'N')
    trueNegatives = sum(1 for label in notRetrieved if label == 'N')
    # Relevant documents that did not make it into the top k.
    falseNegatives = relevantDocumentCount - truePositives

    return truePositives, falsePositives, trueNegatives, falseNegatives

def getPrecisionAtK (ranking, k):
    """Return the binary precision of `ranking` at cut-off `k`.

    Precision is TP / (TP + FP) over the first k items, where 'R' and 'HR'
    count as relevant (TP) and 'N' as non-relevant (FP).

    The value depends only on the retrieved items, so it is computed from
    the top-k label counts directly and does not need the total number of
    relevant documents (no reliance on a module-level variable).

    Returns 0.0 when nothing is retrieved (k == 0 or an empty ranking)
    instead of dividing by zero.
    """
    retrievedCounter = Counter(ranking[:k])
    truePositives = retrievedCounter['R'] + retrievedCounter['HR']
    falsePositives = retrievedCounter['N']

    retrievedCount = truePositives + falsePositives
    if retrievedCount == 0:
        # Nothing retrieved: precision is undefined; report it as 0.0.
        return 0.0
    return truePositives / retrievedCount

def getAveragePrecision (ranking, relevantDocumentCount):
    """Return the average precision (AP) of `ranking`.

    Precision@rank is evaluated at every rank; the values at ranks holding a
    relevant label ('R' or 'HR') are summed and divided by
    `relevantDocumentCount`, which the caller supplies and which must be
    non-zero.
    """
    relevantLabels = ('R', 'HR')
    precisionsAtRelevantRanks = []

    for rank, label in enumerate(ranking, start=1):
        precisionAtRank = getPrecisionAtK(ranking, rank)
        # Only ranks that hold a relevant document contribute to AP.
        if label in relevantLabels:
            precisionsAtRelevantRanks.append(precisionAtRank)

    return sum(precisionsAtRelevantRanks) / relevantDocumentCount

def getDiscountedCumulativeGain (ranking):
    """Return the discounted cumulative gain (DCG) of `ranking`.

    Each position r (1-based) contributes (2**grade - 1) / log2(1 + r),
    where grade is looked up in the module-level `relevanceScores` table.
    An empty ranking yields 0.0.
    """
    total = 0.0
    for rank, label in enumerate(ranking, start=1):
        grade = relevanceScores[label]
        total += ((2 ** grade) - 1) / math.log2(1 + rank)
    return total

def getProbabilityOfRelevance (relevance):
    """Map a relevance label to a probability of relevance.

    Computes (2**grade - 1) / 2**maxGrade, where grade is the label's entry
    in the module-level `relevanceScores` table and maxGrade is the largest
    grade in that table (a DCG-style gain scaled by the top grade).
    """
    numerator = (2 ** relevanceScores[relevance]) - 1
    highestGrade = max(relevanceScores.values())
    return numerator / (2 ** highestGrade)

def getExpectedReciprocalRank (ranking):
    """Return the expected reciprocal rank (ERR) of `ranking`.

    For each rank r (1-based), the contribution is
        P(reach r) * P(stop at r) / r
    where P(stop at r) is getProbabilityOfRelevance of the label at r and
    P(reach r) is the product of (1 - P(stop at j)) over all earlier ranks j.

    P(reach r) is carried forward as a running product rather than being
    rebuilt by an inner loop for every rank, so the work is linear in the
    ranking length. The factors are multiplied in the same order as before,
    so the result is unchanged. An empty ranking yields 0.0.
    """
    err = 0.0
    probabilityOfReachingRank = 1.0  # rank 1 is always reached
    for rank, relevance in enumerate(ranking, start=1):
        probabilityOfStopping = getProbabilityOfRelevance(relevance)
        err += probabilityOfReachingRank * probabilityOfStopping / rank
        # The next rank is reached only if the search did not stop here.
        probabilityOfReachingRank *= 1 - probabilityOfStopping
    return err

# Evaluate every (P, E) ranking pair with AP, nDCG@k and ERR@k and count how
# often E scores strictly higher than P on each measure.

# Per-pair AP values, kept so MAP can be computed after the loop.
averagePrecisionsForMapP = []
averagePrecisionsForMapE = []
# Number of pairs for which E beats P by more than epsilon, per measure.
pairCountForWhichEHasBetterAp = 0
pairCountForWhichEHasBetterNDcg = 0
pairCountForWhichEHasBetterErr = 0
for i, rankingPair in enumerate(pairsOfRankingsOf5):
    P = rankingPair[0]
    E = rankingPair[1]

    # implement 1 of (binary):
    #   precision at rank k
    #   recall at rank k
    #   average precision   <--
    # Relevant documents are counted over BOTH rankings combined; this count
    # is the denominator of AP for P and for E alike.
    totalCounter = Counter(P) + Counter(E)
    relevantDocumentCount = totalCounter['R'] + totalCounter['HR']
    
    if relevantDocumentCount == 0:
        # result is irrelevant
        # (the pair is skipped entirely: nothing is appended or counted for it)
        continue
    
    # AP is computed over the full rankings, not only the top k.
    averagePrecisionP = getAveragePrecision(P, relevantDocumentCount)
    averagePrecisionE = getAveragePrecision(E, relevantDocumentCount)
    
    # save for calculating MAP later
    averagePrecisionsForMapP.append(averagePrecisionP)
    averagePrecisionsForMapE.append(averagePrecisionE)

    # implement 2 of (multi-graded):
    #   nDCG at rank k
    #   ERR
    
    # Normalized Discounted Cumulative Gain
    # First we have to determine the perfect ranking. Assuming the P and E results are always
    # different, and that both algorithms run on the same corpus of documents, the perfect ranking 
    # would include the results from both rankings.
    mergedRanking = P + E
    perfectRanking = sorted(mergedRanking, key=lambda relevance: relevanceScores[relevance], reverse=True)
    # Both systems are normalised by the same ideal DCG: the top k of the
    # merged labels sorted by descending grade.
    perfectDcgScore = getDiscountedCumulativeGain(perfectRanking[:k])
    dcgAtKP = getDiscountedCumulativeGain(P[:k])
    dcgAtKE = getDiscountedCumulativeGain(E[:k])
    # perfectDcgScore is non-zero here: pairs without any relevant document
    # were skipped above, so (for k >= 1) the ideal top k starts with 'R' or 'HR'.
    nDcgAtKP = dcgAtKP / perfectDcgScore
    nDcgAtKE = dcgAtKE / perfectDcgScore
    
    
    # Expected Reciprocal Rank
    # (evaluated on the top-k truncation of each ranking)
    errP = getExpectedReciprocalRank(P[:k])
    errE = getExpectedReciprocalRank(E[:k])
    
    # calculate delta measures
    # (positive delta means E scored higher than P)
    deltaAp = averagePrecisionE - averagePrecisionP
    deltaNDcg = nDcgAtKE - nDcgAtKP
    deltaErr = errE - errP
    
    # count pairs for which E outperforms P
    epsilon = 1e-6 # avoid floating point imprecisions
    if deltaAp > epsilon:
        pairCountForWhichEHasBetterAp += 1
    if deltaNDcg > epsilon:
        pairCountForWhichEHasBetterNDcg += 1
    if deltaErr > epsilon:
        pairCountForWhichEHasBetterErr += 1
    
    # only show a few
    # (i is the index in the shuffled list, so a skipped pair among the first
    # ten reduces the number of pairs shown)
    if i < 10:
        # show the pair
        print ('\nP: ', P, '\nE: ', E)
        print('perfect ranking:\t', perfectRanking)
        print('perfect DCG score:\t', perfectDcgScore)
        print('AP: \tP:{:.3f} \tE:{:.3f}'.format(averagePrecisionP, averagePrecisionE))
        print('nDCG: \tP:{:.3f} \tE:{:.3f}'.format(nDcgAtKP, nDcgAtKE))
        print('ERR: \tP:{:.3f} \tE:{:.3f}'.format(errP, errE))
        
# print results

# Number of pairs that were actually scored. Pairs without any relevant
# document are skipped by the loop, and one AP value per scored pair is stored,
# so the length of the AP list is the evaluated-pair count. The same count is
# used for the header and for every percentage so the figures agree.
evaluatedPairCount = len(averagePrecisionsForMapP)

# print how many times E outperformed P
print('\n\nOut of {} rankings at k = {}, E outperformed P:'.format(evaluatedPairCount, k))
print('AP: \t{:.3%}'.format(pairCountForWhichEHasBetterAp/evaluatedPairCount))
print('nDCG: \t{:.3%}'.format(pairCountForWhichEHasBetterNDcg/evaluatedPairCount))
print('ERR: \t{:.3%}'.format(pairCountForWhichEHasBetterErr/evaluatedPairCount))

# we accidentally implemented MAP instead of just AP, but we'll leave it in
meanAveragePrecisionP = sum(averagePrecisionsForMapP)/len(averagePrecisionsForMapP)
meanAveragePrecisionE = sum(averagePrecisionsForMapE)/len(averagePrecisionsForMapE)
print('MAP \tP:{:.3f} \tE:{:.3f}'.format(meanAveragePrecisionP, meanAveragePrecisionE))

59049 pairs of rankings

P:  ('N', 'R', 'N', 'HR', 'HR') 
E:  ('R', 'N', 'N', 'HR', 'R')
perfect ranking:	 ['HR', 'HR', 'HR', 'R', 'R', 'R', 'N', 'N', 'N', 'N']
perfect DCG score:	 6.392789260714372
AP: 	P:0.267 	E:0.350
nDCG: 	P:0.099 	E:0.156
ERR: 	P:0.125 	E:0.250

P:  ('HR', 'R', 'R', 'R', 'N') 
E:  ('R', 'N', 'HR', 'N', 'HR')
perfect ranking:	 ['HR', 'HR', 'HR', 'R', 'R', 'R', 'R', 'N', 'N', 'N']
perfect DCG score:	 6.392789260714372
AP: 	P:0.571 	E:0.324
nDCG: 	P:0.646 	E:0.391
ERR: 	P:0.797 	E:0.438

P:  ('HR', 'HR', 'R', 'N', 'HR') 
E:  ('R', 'R', 'HR', 'N', 'N')
perfect ranking:	 ['HR', 'HR', 'HR', 'HR', 'R', 'R', 'R', 'N', 'N', 'N']
perfect DCG score:	 6.392789260714372
AP: 	P:0.543 	E:0.429
nDCG: 	P:0.844 	E:0.490
ERR: 	P:0.849 	E:0.484

P:  ('HR', 'HR', 'HR', 'HR', 'R') 
E:  ('N', 'N', 'N', 'R', 'R')
perfect ranking:	 ['HR', 'HR', 'HR', 'HR', 'R', 'R', 'R', 'N', 'N', 'N']
perfect DCG score:	 6.392789260714372
AP: 	P:0.714 	E:0.093
nDCG: 	P:1.000 	E:0.000
ERR: 	P:0.859 	E:0.