# Step 1: Simulate Rankings of Relevance for E and P (5 points)

In [1]:
import itertools as iter
import numpy as np
import random
import math

# Relevance grade labels used throughout the notebook
# (presumably N = not relevant, R = relevant, HR = highly relevant).
relevance_gradings = ['N', 'R', 'HR']

# Enumerate every possible ranking of length five: 3**5 = 243 grade tuples.
# The production (P) and experimental (E) systems draw from the same space.
list_p = list(iter.product(relevance_gradings, repeat=5))
list_e = list(iter.product(relevance_gradings, repeat=5))

print(len(list_e))

# The full set of (P, E) combinations has 243 * 243 = 59,049 pairs; this
# notebook works with a random sample of them instead.

# Seed the generator before sampling so that Restart & Run All reproduces the
# same sample (and therefore the same scores) every time.
SEED = 42
random.seed(SEED)

# Sampling - draw one ranking at random from each list, sample_size times.
# All P draws happen before all E draws; keep this order, the sample depends on it.
sample_size = 1000
sample_list_p = [random.choice(list_p) for _ in range(sample_size)]
sample_list_e = [random.choice(list_e) for _ in range(sample_size)]

# Pair the i-th production ranking with the i-th experimental ranking.
sample_pairs = list(zip(sample_list_p, sample_list_e))

# Show the first five sampled pairs as a sanity check.
for pair in sample_pairs[:5]:
    print(pair)

243
(('N', 'R', 'HR', 'R', 'R'), ('R', 'HR', 'R', 'HR', 'HR'))
(('N', 'HR', 'HR', 'R', 'R'), ('N', 'R', 'N', 'N', 'R'))
(('HR', 'N', 'HR', 'R', 'HR'), ('N', 'HR', 'HR', 'R', 'HR'))
(('R', 'R', 'R', 'N', 'HR'), ('R', 'HR', 'HR', 'HR', 'N'))
(('R', 'HR', 'HR', 'R', 'R'), ('N', 'R', 'HR', 'HR', 'HR'))


# Step 2: Implement Evaluation Measures

### Binary Measure - Average Precision

In [12]:
def calc_ave_precision(list_of_relevance):
    """Compute the binary average precision of a single ranking.

    Grades "R" and "HR" both count as relevant; every other label counts as
    non-relevant. Precision is evaluated at each rank that holds a relevant
    document, and the sum of those precisions is divided by the length of
    the ranking.

    NOTE(review): the normaliser is the ranking length, not the number of
    relevant documents. This is kept as-is because it may be a deliberate
    assumption for this exercise; confirm it is intended.

    Args:
        list_of_relevance: sequence of grade labels, ordered from rank 1
            downwards. Must support ``len()``.

    Returns:
        The average precision as a float. An empty ranking yields 0.0.
    """
    rank_length = len(list_of_relevance)
    if rank_length == 0:
        # Nothing was ranked, so there is no precision to accumulate;
        # return 0.0 instead of dividing by zero below.
        return 0.0

    running_score = 0
    relevant_count = 0
    for rank, grade in enumerate(list_of_relevance, start=1):
        if grade in ("R", "HR"):
            relevant_count += 1
            # Precision at this rank = relevant documents seen so far / rank.
            running_score += relevant_count / rank
    return running_score / rank_length

# Average precision per query. Each sampled pair holds the production ranking
# first and the experimental ranking second.
p_scores = [calc_ave_precision(p_ranking) for p_ranking, _ in sample_pairs]
e_scores = [calc_ave_precision(e_ranking) for _, e_ranking in sample_pairs]

# Mean of the per-query average precisions for each algorithm.
p_ave_precision_over_queries = sum(p_scores) / len(p_scores)
e_ave_precision_over_queries = sum(e_scores) / len(e_scores)

print("The average precision across the queries for the production algorithm is: %s" % p_ave_precision_over_queries)
print("The average precision across the queries for the experimental algorithm is: %s" % e_ave_precision_over_queries)

The average precision across the queries for the production algorithm is: 0.5469233333333325
The average precision across the queries for the experimental algorithm is: 0.5507399999999996


### Multi-graded Evaluation Measure 1 - Discounted Cumulative Gain at rank k.

In [11]:
# Gain assigned to each relevance label.
relevance_gain_dict = {'HR': 5, 'R': 1, 'N': 0}


def DCG_Rank_K(ranked_list, k=None):
    """Compute the Discounted Cumulative Gain of a ranking, optionally cut at rank k.

    Each label is mapped to its gain ``rel`` through ``relevance_gain_dict``.
    The item at 1-based rank ``r`` contributes ``(2**rel - 1) / log2(r + 1)``.

    Args:
        ranked_list: iterable of relevance labels, ordered from rank 1
            downwards. Every label must be a key of ``relevance_gain_dict``.
        k: optional cutoff; only the first ``k`` items are scored. The
            default ``None`` scores the whole ranking, so a five-item
            ranking gives DCG@5.

    Returns:
        The discounted cumulative gain. It is 0 when no item is scored.

    Raises:
        ValueError: if ``k`` is negative.
        KeyError: if a label is not present in ``relevance_gain_dict``.
    """
    if k is not None and k < 0:
        raise ValueError("k must be non-negative")

    discounted_gain = 0
    for i, item in enumerate(ranked_list):
        if k is not None and i >= k:
            break
        rel = relevance_gain_dict[item]
        # i is 0-based, so the 1-based rank is i + 1 and the discount is
        # log2(rank + 1) = log2(i + 2); rank 1 is therefore undiscounted.
        discounted_gain += (2 ** rel - 1) / math.log2(i + 2)
    return discounted_gain


# DCG@5 per query. Each sampled pair holds the production ranking first and
# the experimental ranking second.
p_DCG_scores = [DCG_Rank_K(p_ranking) for p_ranking, _ in sample_pairs]
e_DCG_scores = [DCG_Rank_K(e_ranking) for _, e_ranking in sample_pairs]

# Mean of the per-query DCG@5 scores for each algorithm.
p_ave_DCG_over_queries = sum(p_DCG_scores) / len(p_DCG_scores)
e_ave_DCG_over_queries = sum(e_DCG_scores) / len(e_DCG_scores)

print("The average DCG@5 across the queries for the production algorithm is: %s" % p_ave_DCG_over_queries)
print("The average DCG@5 across the queries for the experimental algorithm is: %s" % e_ave_DCG_over_queries)

The average DCG@5 across the queries for the production algorithm is: 30.78860108185417
The average DCG@5 across the queries for the experimental algorithm is: 32.21138537731719
