# Step 1: Stimulate Rankings of Relevance for E and P (5 points)

In [5]:
import itertools as iter
import numpy as np
import random
import math

#The gradings
relevance_gradings = ['N', 'R', 'HR']

# Create all possible versions of length five rankings
list_p = [p for p in iter.product(relevance_gradings, repeat=5)]
list_e = [p for p in iter.product(relevance_gradings, repeat=5)]

print(len(list_e))

# Create all possible pairs of the two lists - if computing power permits
#list_pairs = [list(zip(list_p, p)) for p in itertools.permutations(list_e)]

# Sampling - take one at random from list 1 and one 1 from list 2 until 1000.
sample_size = 1000
sample_list_p = [random.choice(list_p) for _ in range(sample_size)]
sample_list_e = [random.choice(list_e) for _ in range(sample_size)]

sample_pairs = list(zip(sample_list_p, sample_list_e))

for i in range(0,5):
    print(sample_pairs[i])

243
(('HR', 'R', 'N', 'R', 'R'), ('HR', 'N', 'HR', 'HR', 'N'))
(('HR', 'HR', 'HR', 'N', 'HR'), ('HR', 'HR', 'HR', 'N', 'R'))
(('HR', 'R', 'N', 'N', 'HR'), ('N', 'N', 'N', 'N', 'R'))
(('HR', 'R', 'N', 'HR', 'R'), ('N', 'N', 'R', 'HR', 'HR'))
(('R', 'R', 'HR', 'R', 'N'), ('HR', 'HR', 'R', 'HR', 'N'))


# Step 2: Implement Evaluation Measures (15 points)

### Binary Measure - Average Precision

In [6]:
def calc_ave_precision(list_of_relevance):
    rank_length = len(list_of_relevance)
    running_score = 0
    relevant_count = 0
    for i in range(0, rank_length):
        if(list_of_relevance[i] == "R" or list_of_relevance[i] == "HR"):
            relevant_count += 1
            running_score += relevant_count / (i+1)
    ave_precision = running_score / rank_length
    return ave_precision

p_AP_scores = []
e_AP_scores = []

# Calculate average precision for all queries
for pairs in sample_pairs:
    p_AP_scores.append(calc_ave_precision(pairs[0]))
    e_AP_scores.append(calc_ave_precision(pairs[1]))

# Calculate the average of the average precisions across queries
p_ave_precision_over_queries = sum(p_AP_scores) / len(p_AP_scores)
e_ave_precision_over_queries = sum(e_AP_scores) / len(e_AP_scores)

print("The average precision across the queries for the production algorithm is: %s" % p_ave_precision_over_queries)
print("The average precision across the queries for the experimental algorithm is: %s" % e_ave_precision_over_queries)

The average precision across the queries for the production algorithm is: 0.5387166666666665
The average precision across the queries for the experimental algorithm is: 0.5604966666666659


### Multi-graded Evaluation Measure 1 - Discounted Cummulative Gain at rank k. 

In [7]:
# Gain for each relevance label
relevance_gain_dict = {'HR': 5, 'R': 1, 'N': 0}

# Function that calculates DCG@5
def DCG_Rank_K(ranked_list):
    gain = 0
    discounted_gain = 0
    for i, item in enumerate(ranked_list):
        rel = relevance_gain_dict[item]
        discounted_gain += (2**rel - 1) /  (math.log2(i + 1 + 1))
    return discounted_gain


p_DCG_scores = []
e_DCG_scores = []

# Calculate DCG@5 for each algorithm on all queries
for pairs in sample_pairs:
    p_DCG_scores.append(DCG_Rank_K(pairs[0]))
    e_DCG_scores.append(DCG_Rank_K(pairs[1]))

# Calculate the average of the average precisions across queries
p_ave_DCG_over_queries = sum(p_DCG_scores) / len(p_DCG_scores)
e_ave_DCG_over_queries = sum(e_DCG_scores) / len(e_DCG_scores)

print("The average DCG@5 across the queries for the production algorithm is: %s" % p_ave_DCG_over_queries)
print("The average DCG@5 across the queries for the experimental algorithm is: %s" % e_ave_DCG_over_queries)   

The average DCG@5 across the queries for the production algorithm is: 32.31213721006813
The average DCG@5 across the queries for the experimental algorithm is: 32.41057379341242


### Multi-graded Evaluation Measure 2 - Rank Biased Precision with persistence parameter $\theta = 0.8$ 

In [22]:
# Gain for each relevance label
relevance_gain_dict = {'HR': 5, 'R': 1, 'N': 0}

# Function that calculates RBP
def calc_RBP(ranked_list):
    theta = 0.8
    expected_utility = 0
    for i, item in enumerate(ranked_list):
        k_rel = relevance_gain_dict[item]
        expected_utility += k_rel * pow(theta, (i+1)*(1-theta))
    return expected_utility

p_RBP_scores = []
e_RBP_scores = []

# Calculate RBP for each algorithm on all queries
for pairs in sample_pairs:
    p_RBP_scores.append(calc_RBP(pairs[0]))
    e_RBP_scores.append(calc_RBP(pairs[1]))

# Calculate the average of the average precisions across queries
p_ave_RBP_over_queries = sum(p_RBP_scores) / len(p_RBP_scores)
e_ave_RBP_over_queries = sum(e_RBP_scores) / len(e_RBP_scores)

print("The average RBP across the queries for the production algorithm is: %s" %p_ave_RBP_over_queries)
print("The average RBP across the queries for the experimental algorithm is: %s" %e_ave_RBP_over_queries)

The average RBP across the queries for the production algorithm is: 8.933936315806799
The average RBP across the queries for the experimental algorithm is: 8.991956475111728


# Step 3: Calculate the $\Delta measure$ (5 poins)

In [23]:
# Function to compare two lists of scores
def compare_lists(list_a, list_b):
    b_better_pairs = []
    for i, item in enumerate(list_a):
        if item < list_b[i]:
            #b_better_pairs.append((list_b[i], item))
            b_better_pairs.append(i)
    return b_better_pairs


# Average Precision Difference Measure
E_better_AP = compare_lists(p_AP_scores, e_AP_scores)
E_better_AP_Pairs = list(map((lambda x: sample_pairs[x]), E_better_AP))
#print(E_better_AP_Pairs[0:5])

# DCG@5 Difference Measure
E_better_DCG = compare_lists(p_DCG_scores, e_DCG_scores)
E_better_DCG_Pairs = list(map((lambda x: sample_pairs[x]), E_better_DCG))
#print(E_better_DCG_Pairs[0:5])

# RBP Difference Measure
E_better_RBP = compare_lists(p_RBP_scores, e_RBP_scores)
E_better_RBP_Pairs = list(map((lambda x: sample_pairs[x]), E_better_RBP))
#print(E_better_RBP_Pairs[0:5])

# Step 4: Implement Interleaving

# Step 5: Implement User Clicks Simulation

# Step 6: Simulate Interleaving Experiment

# Step 7: Results and Analyis