# Jasper Driessens 11349026, Jasper Linmans 10249060 

In [43]:
%matplotlib inline

import matplotlib
import numpy as np
import matplotlib.pyplot as plt
from types import MethodType
from itertools import product
from random import getrandbits
from random import random

In [44]:
class Relevance:
    """Namespace holding the three relevance grade labels.

    The class is never instantiated; its attributes are used directly.
    `all` lists the grades from least to most relevant.
    """
    N = "N "   # not relevant
    R = "R "   # relevant
    HR = "HR"  # highly relevant

    all = [N, R, HR]


def quantify(grade):
    """Map a relevance grade to its numerical value.

    Returns 0 for `Relevance.N`, 1 for `Relevance.R` and 2 for `Relevance.HR`.

    Grades are compared by value (`==`) rather than by identity, so a label
    that is equal to a grade but is a different string object (for example
    one read from a file) is still recognised.

    Raises:
        ValueError: if `grade` is not one of the three known grades.
    """
    if grade == Relevance.N:
        return 0
    if grade == Relevance.R:
        return 1
    if grade == Relevance.HR:
        return 2
    # Fail loudly instead of silently returning None for an unknown label.
    raise ValueError("Unknown relevance grade: %r" % (grade,))


def relevant(grade):
    """Return True if `grade` counts as relevant (R or HR), False otherwise.

    Grades are compared by value rather than by identity, so an equal label
    held in a different string object is still recognised.
    """
    return grade == Relevance.R or grade == Relevance.HR

In [45]:
def discounted_gain_at(k, ranking):
    """Discounted gain contributed by the document at 1-based rank `k`.

    The gain is `2 ** quantify(grade) - 1` for the grade found at rank `k`,
    divided by the discount `log2(1 + k)`.
    """
    grade = ranking[k - 1]  # rank k lives at list index k - 1
    numerator = np.power(2, quantify(grade)) - 1
    denominator = float(np.log2(1 + k))
    return numerator / denominator


def dcg_at(k, ranking):
    """Discounted cumulative gain of `ranking` at rank `k`.

    Adds up `discounted_gain_at` for every rank from 1 up to and including `k`.
    """
    total = 0
    for rank in range(1, k + 1):
        total = total + discounted_gain_at(rank, ranking)
    return total


class Ranking:
    """An ordered list of Relevance labels with evaluation measures.

    A Ranking represents either the result of a query to the E or P algorithm,
    or an interleaving of those. Unless stated otherwise, every `k` argument
    is a 1-based rank.
    """

    total_relevant = 20  # Total amount of relevant (R / HR) documents in collection
    persistence = 0.8  # The RBP persistence parameter

    def __init__(self, ranking):
        """Create a Ranking from the ordered Relevance labels `ranking`."""
        self.ranking = ranking  # Internal ordering of Relevance labels, representing search results
        self.n = len(ranking)
        self.perfect = [Relevance.HR] * self.n  # Optimum ranking of the same length, for DCG normalization

    def __eq__(self, other):
        """Rankings are equal when they hold the same labels in the same order."""
        if not isinstance(other, self.__class__):
            return NotImplemented
        return self.ranking == other.ranking

    def __ne__(self, other):
        """Inverse of `__eq__` (needed explicitly on Python 2)."""
        if not isinstance(other, self.__class__):
            return NotImplemented
        return not self.__eq__(other)

    def __hash__(self):
        """Hash on the ordered labels, matching the order-sensitive `__eq__`.

        Hashing the labels in their actual order (instead of sorted) keeps
        equal rankings hashing equal while no longer forcing every permutation
        of the same labels into one bucket.
        """
        return hash(tuple(self.ranking))

    def __str__(self):
        return "Ranking" + str(self.ranking)

    def relevant_at(self, k):
        """Return amount of relevant (R / HR) documents from ranks 1 to and including `k`."""
        return sum(relevant(grade) for grade in self.ranking[:k])

    def precision_at(self, k):
        """Return precision at rank `k`: amount of relevant (R / HR) documents from ranks 1 to
        and including `k`, divided by `k`. Raises ZeroDivisionError for `k == 0`."""
        return self.relevant_at(k) / float(k)

    def recall_at(self, k):
        """Return recall at rank `k`: amount of relevant (R / HR) documents from ranks 1 to
        and including `k`, divided by total amount of relevant (R / HR) documents in collection
        (given by `total_relevant`)."""
        return self.relevant_at(k) / float(self.total_relevant)

    def average_precision(self):
        """Return average precision of this Ranking: average of `precision_at` evaluated at
        each rank where this Ranking has a relevant (R / HR) document, or 0 if it has none."""
        precisions = [
            self.precision_at(rank)
            for rank, grade in enumerate(self.ranking, start=1)
            if relevant(grade)
        ]
        return sum(precisions) / len(precisions) if precisions else 0

    def dcg_at(self, k):
        """Return DCG at rank `k` of this Ranking (wraps the module-level `dcg_at`)."""
        return dcg_at(k, self.ranking)

    def ndcg_at(self, k):
        """Return normalized DCG at rank `k` of this Ranking: the DCG at rank `k`
        divided by the DCG at rank `k` of the all-HR ranking stored in `perfect`."""
        return dcg_at(k, self.ranking) / dcg_at(k, self.perfect)

    def observation_probability_at(self, k):
        """Return the RBP weight of 1-based rank `k`:
        `(1 - persistence) * persistence ** (k - 1)`."""
        return (1 - self.persistence) * np.power(self.persistence, k - 1)

    def rank_biased_precision(self):
        """Return rank-biased precision of this Ranking.

        Sums `observation_probability_at(rank)` over every rank holding a
        relevant (R / HR) document, i.e. relevance is treated as binary.
        """
        return sum(
            relevant(grade) * self.observation_probability_at(rank)
            for rank, grade in enumerate(self.ranking, start=1)
        )

In [46]:
class Origin:
    """Labels marking which algorithm contributed a document to an interleaving."""
    P = 'P '  # production algorithm
    E = 'E '  # experimental algorithm

class RankingPair:
    """A pair of rankings: `p` from the P algorithm and `e` from the E algorithm,
    both answering the same query.

    Every `delta_*` method returns the measure of `e` minus the same measure
    of `p`, so a positive value means E scored higher.
    """

    def __init__(self, p, e):
        self.p, self.e = p, e  # results of the P and of the E algorithm

    def delta_precision_at(self, k):
        """Precision at rank `k` of E minus that of P."""
        experiment = self.e.precision_at(k)
        production = self.p.precision_at(k)
        return experiment - production

    def delta_recall_at(self, k):
        """Recall at rank `k` of E minus that of P."""
        experiment = self.e.recall_at(k)
        production = self.p.recall_at(k)
        return experiment - production

    def delta_average_precision(self):
        """Average precision of E minus that of P."""
        experiment = self.e.average_precision()
        production = self.p.average_precision()
        return experiment - production

    def delta_dcg_at(self, k):
        """DCG at rank `k` of E minus that of P."""
        experiment = self.e.dcg_at(k)
        production = self.p.dcg_at(k)
        return experiment - production

    def delta_ndcg_at(self, k):
        """Normalized DCG at rank `k` of E minus that of P."""
        experiment = self.e.ndcg_at(k)
        production = self.p.ndcg_at(k)
        return experiment - production

    def delta_rbp(self):
        """Rank-biased precision of E minus that of P."""
        experiment = self.e.rank_biased_precision()
        production = self.p.rank_biased_precision()
        return experiment - production

    def __str__(self):
        return "RankingPair[P={0}, E={1}]".format(self.p.ranking, self.e.ranking)

Step 1: Simulate Rankings of Relevance for E and P (5 points)

Step 2: Implement Evaluation Measures (15 points)

    Implemented in the classes inserted above

In [47]:
def generate_rankings(length, grades):
    """Build one Ranking for every possible sequence of `length` labels drawn
    (with repetition) from `grades`, in the order produced by `itertools.product`."""
    return [Ranking(list(labels)) for labels in product(grades, repeat=length)]


def generate_pairs(rankings):
    """Build a RankingPair for every ordered (P, E) combination of `rankings`,
    including each ranking paired with itself."""
    return [
        RankingPair(production, experiment)
        for production, experiment in product(rankings, repeat=2)
    ]


def generate_all_pairs():
    """Return every RankingPair over all length-5 rankings of the grades in `Relevance.all`."""
    rankings = generate_rankings(5, Relevance.all)
    return generate_pairs(rankings)


def generate_all_winners(delta_method, parameter=None):
    """Return all pairs for which `delta_method` is strictly positive.

    `delta_method` is called with each pair as its first argument (e.g. an
    unbound `RankingPair.delta_*` method). When `parameter` is not None it is
    passed along as the single extra argument (e.g. the rank `k`).
    """
    extra_args = () if parameter is None else (parameter,)
    winners = []
    for pair in generate_all_pairs():
        if delta_method(pair, *extra_args) > 0:
            winners.append(pair)
    return winners

In [48]:
def test_generator():
    """Print a few sample pairs and the total pair count as a manual sanity check.

    Uses the parenthesised single-argument form of print, which produces the
    same output on Python 2 and is also valid on Python 3.
    """
    all_pairs = generate_all_pairs()
    print(all_pairs[0])
    print(all_pairs[-1])
    print(all_pairs[9235])  # arbitrary pair from the middle of the list
    print(len(all_pairs))

def test_winner_generator():
    """Print counts for the pairs where E beats P on recall at rank 5, as a manual sanity check.

    Uses the parenthesised single-argument form of print, which produces the
    same output on Python 2 and is also valid on Python 3.
    """
    all_pairs = generate_all_pairs()  # named so the builtin `all` is not shadowed
    winners = generate_all_winners(RankingPair.delta_recall_at, 5)
    # NOTE(review): the winners are selected at rank 5 but the checks below
    # re-evaluate at rank 1 and only over `winners`, so "winners check" need
    # not equal "#winners". Confirm whether rank 5 over all pairs was intended.
    amount_of_winners = sum(winner.delta_recall_at(1) > 0 for winner in winners)
    amount_of_losers = sum(winner.delta_recall_at(1) < 0 for winner in winners)
    amount_of_ties = len(all_pairs) - amount_of_losers - amount_of_winners

    print("#all: " + str(len(all_pairs)))
    print("#winners: " + str(len(winners)))
    print("winners %: " + str(len(winners) / float(len(all_pairs)) * 100))
    print("winners check: " + str(amount_of_winners))
    print("losers check: " + str(amount_of_losers))
    print("ties check: " + str(amount_of_ties))

Step 3: Calculate the 𝛥measure (5 points)

In [49]:
def test_deltas():
    """Print every delta measure for one hand-made pair as a manual sanity check.

    P has its only HR document at rank 1 and E has its only HR document at
    rank 5. Output labels use the correct measure names (DCG / NDCG), and the
    parenthesised single-argument print form is valid on Python 2 and 3.
    """
    ranking1 = Ranking([Relevance.HR, Relevance.N, Relevance.N, Relevance.N, Relevance.N])
    ranking2 = Ranking([Relevance.N, Relevance.N, Relevance.N, Relevance.N, Relevance.HR])
    pair = RankingPair(ranking1, ranking2)
    print("average precision:       " + str(pair.delta_average_precision()))
    print("dcg at 1:                " + str(pair.delta_dcg_at(1)))
    print("dcg at 3:                " + str(pair.delta_dcg_at(3)))
    print("dcg at 5:                " + str(pair.delta_dcg_at(5)))
    print("ndcg at 1:               " + str(pair.delta_ndcg_at(1)))
    print("ndcg at 3:               " + str(pair.delta_ndcg_at(3)))
    print("ndcg at 5:               " + str(pair.delta_ndcg_at(5)))
    print("precision at 1:          " + str(pair.delta_precision_at(1)))
    print("precision at 3:          " + str(pair.delta_precision_at(3)))
    print("precision at 5:          " + str(pair.delta_precision_at(5)))
    print("recall at 1:             " + str(pair.delta_recall_at(1)))
    print("recall at 3:             " + str(pair.delta_recall_at(3)))
    print("recall at 5:             " + str(pair.delta_recall_at(5)))

Step 4: Implement Interleaving (15 points)

In [50]:
class Interleaving:
    """Plain holder for an interleaved ranking and the origin of each of its entries."""

    def __init__(self, ranking, origins):
        self.ranking, self.origins = ranking, origins

        
class Winner:
    """Possible outcomes of one interleaving comparison."""
    P = 'P'  # production algorithm wins
    E = 'E'  # experimental algorithm wins
    T = 'T'  # tie

    
class Interleaver:
    """Base class for interleaving strategies.

    Subclasses implement `interleave(pair)`. `click_model` must be set to an
    object offering `simulate_clicks_on(ranking)` before `run_simulation` or
    `evaluate` is called.
    """

    def __init__(self):
        self.click_model = None  # assigned by the caller before simulating

    def interleave(self, pair):
        """Return `(ranking, origins)` for `pair`; must be provided by subclasses.

        `origins[i]` tells which algorithm (`Origin.P` or `Origin.E`)
        contributed `ranking[i]`.
        """
        raise NotImplementedError("Subclasses must implement interleave(pair).")

    def run_simulation(self, pair):
        """Interleave `pair`, simulate clicks on the result and count clicks per origin.

        Returns:
            Tuple `(production_clicks, experiment_clicks)`. Every clicked entry
            whose origin is not `Origin.P` is credited to the experiment.
        """
        ranking, origins = self.interleave(pair)
        clicks = self.click_model.simulate_clicks_on(ranking)
        production_clicks = 0
        experiment_clicks = 0

        for clicked, origin in zip(clicks, origins):
            if not clicked:
                continue
            # Compare labels by value; identity of string objects is not guaranteed.
            if origin == Origin.P:
                production_clicks += 1
            else:
                experiment_clicks += 1

        return production_clicks, experiment_clicks

    def evaluate(self, pair):
        """Run one simulation and return `Winner.P`, `Winner.E`, or `Winner.T` on equal clicks."""
        production_clicks, experiment_clicks = self.run_simulation(pair)
        if production_clicks > experiment_clicks:
            return Winner.P
        if experiment_clicks > production_clicks:
            return Winner.E
        return Winner.T

        
class BalancedInterleaver(Interleaver):
    """Interleaver that alternates strictly between the two rankings.

    One coin flip decides which algorithm supplies the first entry; after that
    the P and E rankings alternate position by position.
    """

    def __init__(self):
        Interleaver.__init__(self)

    def interleave(self, pair):
        """Return `(ranking, origins)` alternating between `pair.p` and `pair.e`.

        Only the first `min(len(p), len(e))` entries of each ranking are used,
        so rankings of unequal length no longer make the slice assignment fail.
        """
        length = min(len(pair.p.ranking), len(pair.e.ranking))
        production_first = bool(getrandbits(1))

        # Offset of each algorithm's entries within the alternating result.
        p_offset, e_offset = (0, 1) if production_first else (1, 0)

        ranking = [None] * (2 * length)
        ranking[p_offset::2] = pair.p.ranking[:length]
        ranking[e_offset::2] = pair.e.ranking[:length]

        first_two = [Origin.P, Origin.E] if production_first else [Origin.E, Origin.P]
        origins = first_two * length

        return ranking, origins


class TeamDraftInterleaver(Interleaver):
    """Interleaver that flips a fresh coin at every rank to decide which
    algorithm's entry at that rank comes first."""

    def __init__(self):
        Interleaver.__init__(self)

    def interleave(self, pair):
        """Return `(ranking, origins)`.

        For each rank shared by both rankings, the P entry and the E entry at
        that rank are appended as a pair, in an order chosen by one random bit.
        """
        ranking, origins = [], []

        # zip stops at the shorter of the two rankings
        for from_p, from_e in zip(pair.p.ranking, pair.e.ranking):
            if getrandbits(1):
                ranking += [from_p, from_e]
                origins += [Origin.P, Origin.E]
            else:
                ranking += [from_e, from_p]
                origins += [Origin.E, Origin.P]

        return ranking, origins


class ProbabilisticInterleaver(Interleaver):
    """Interleaver that draws entries at random from each ranking, favouring
    entries near the top of what is still left."""

    tau = 3  # exponent of the rank-based weighting; larger favours the top more strongly

    def __init__(self):
        Interleaver.__init__(self)

    def interleave(self, pair):
        """Return `(ranking, origins)`.

        Runs one round per rank shared by both rankings. Each round flips a
        coin for which algorithm goes first, then removes one sampled entry
        from each algorithm's remaining list, in that order.
        """
        rounds = min(len(pair.p.ranking), len(pair.e.ranking))

        # Work on copies so the pair's own rankings are left untouched.
        left_p = pair.p.ranking[:]
        left_e = pair.e.ranking[:]

        ranking, origins = [], []

        for _ in range(rounds):
            if getrandbits(1):
                first_pool, second_pool = left_p, left_e
                round_origins = [Origin.P, Origin.E]
            else:
                first_pool, second_pool = left_e, left_p
                round_origins = [Origin.E, Origin.P]

            ranking.append(self.sample_element_from(first_pool))
            ranking.append(self.sample_element_from(second_pool))
            origins.extend(round_origins)

        return ranking, origins

    def sample_element_from(self, remaining_list):
        """Remove and return one element of `remaining_list`, with its index
        drawn according to `softmax_probabilities`."""
        count = len(remaining_list)
        weights = self.softmax_probabilities(count)
        chosen = np.random.choice(range(count), p=weights)
        return remaining_list.pop(chosen)

    def softmax_probabilities(self, length):
        """Return `length` probabilities proportional to `1 / rank ** tau`
        for ranks 1..length, normalised to sum to 1."""
        raw = []
        for rank in range(1, length + 1):
            raw.append(1 / float(np.power(rank, self.tau)))
        total = float(sum(raw))
        return [weight / total for weight in raw]

Step 5: Implement User Clicks Simulation (25 points)

In [51]:
def parse_yandex_file(file_name):
    """Parse a click log into a list of Query objects, in first-seen order.

    A line containing 'Q' is treated as a query line and parsed with
    `parse_query`; every other non-blank line is treated as a click line and
    parsed with `parse_click`. A query line repeating an already seen
    (session id, query id) extends that query's ranking. A click is recorded,
    as an index into the ranking, on the most recent query of the same session
    whose ranking contains the clicked URL.

    Lookups use dictionaries instead of scanning every query per line, which
    keeps parsing roughly linear in the number of lines.

    Raises:
        ValueError: if a click cannot be matched to any query of its session.
    """
    queries = []
    by_key = {}      # (session_id, query_id) -> Query
    by_session = {}  # session_id -> Query objects in first-seen order

    with open(file_name) as f:
        for line in f.read().splitlines():
            if not line.strip():
                continue  # tolerate blank lines instead of failing on them

            if 'Q' in line:
                session_id, query_id, ranking = parse_query(line)
                key = (session_id, query_id)
                query = by_key.get(key)

                if query is None:
                    query = Query(session_id, query_id, ranking)
                    queries.append(query)
                    by_key[key] = query
                    by_session.setdefault(session_id, []).append(query)
                else:
                    query.ranking += ranking

            else:
                session_id, url_id = parse_click(line)
                session_queries = by_session.get(session_id, [])
                # Latest query first, as the click belongs to the most recent match.
                query = next((q for q in reversed(session_queries) if url_id in q.ranking), None)
                if query is None:
                    raise ValueError(
                        "Click on URL %r in session %r matches no earlier query"
                        % (url_id, session_id))
                query.clicks.append(query.ranking.index(url_id))

    return queries


def parse_query(string):
    """Split a whitespace-separated query line into `(session_id, query_id, ranking)`.

    Field 0 is the session id, field 3 the query id, and every field from
    index 5 onwards makes up the ranking (a list of strings).
    """
    fields = string.split()
    return fields[0], fields[3], fields[5:]


def parse_click(string):
    """Split a whitespace-separated click line into `(session_id, url_id)`.

    Field 0 is the session id and field 3 the id of the clicked URL.
    """
    fields = string.split()
    return fields[0], fields[3]


class Query:
    """One query within a session: the ranking shown and the clicks recorded on it."""

    def __init__(self, session_id, query_id, ranking):
        self.clicks = []  # filled in later, one entry per recorded click
        self.ranking = ranking
        self.id = query_id
        self.session_id = session_id

In [52]:
class ClickModel(object):
    """Base class for click models.

    Subclasses supply `probabilities(ranking)`, returning one click
    probability per entry of the ranking.
    """

    def __init__(self):
        self.params_initialized = False  # flipped by subclasses once parameters are set

    def simulate_clicks_on(self, ranking):
        """Return a list of booleans, one per probability from `probabilities(ranking)`;
        an entry is True when a fresh uniform random draw falls below its probability."""
        clicks = []
        for probability in self.probabilities(ranking):
            clicks.append(random() < probability)
        return clicks


class RandomClickModel(ClickModel):
    """Click model with a single parameter `rho`: every entry of every ranking
    gets the same click probability."""

    yandex_file_name = 'YandexRelPredChallenge.txt'
    params_file_name = 'RandomClickModelParams.txt'

    def __init__(self):
        ClickModel.__init__(self)
        self.rho = None  # set by learn() or load()

    def learn(self):
        """Estimate `rho` from the click log and store it in the parameter file.

        `rho` is the total number of recorded clicks divided by the total
        number of ranked results over all parsed queries.
        """
        queries = parse_yandex_file(self.yandex_file_name)

        total_results = sum(len(query.ranking) for query in queries)
        total_clicks = sum(len(query.clicks) for query in queries)

        self.rho = total_clicks / float(total_results)

        with open(self.params_file_name, 'w+') as handle:
            handle.write(str(self.rho))

        self.params_initialized = True

    def load(self):
        """Read `rho` from the first line of the parameter file."""
        with open(self.params_file_name) as handle:
            first_line = handle.readline()
            self.rho = float(first_line)
        self.params_initialized = True

    def probabilities(self, ranking):
        """Return `rho` once for every entry of `ranking`.

        Raises AssertionError when neither `learn` nor `load` has run yet.
        """
        if self.params_initialized:
            return [self.rho for _ in range(len(ranking))]
        raise AssertionError('Parameters have not been initialized.')

Step 6: Simulate Interleaving Experiment (10 points)

Step 7: Results and Analysis (25 points)

In [53]:
def compare(pairs, interleaver, click_model):
    """Run one interleaving comparison per pair and return E's win proportion.

    Loads the click model's parameters, attaches the model to `interleaver`,
    and evaluates every pair once.

    Returns:
        `(experiment_wins + 0.5 * ties) / len(pairs)`, i.e. ties count as
        half a win for E.

    Raises:
        ZeroDivisionError: if `pairs` is empty.
    """
    click_model.load()
    interleaver.click_model = click_model
    experiment_wins, ties = 0, 0

    for pair in pairs:
        winner = interleaver.evaluate(pair)
        # Compare outcome labels by value; production wins only matter
        # through the denominator, so they are not counted separately.
        if winner == Winner.E:
            experiment_wins += 1
        elif winner != Winner.P:
            ties += 1

    return (experiment_wins + 0.5 * ties) / float(len(pairs))

def analyse():
    """Run the interleaving experiments and print E's win proportion for each.

    Each winner list (pairs where E beats P on the offline measure) is
    generated once and shared between the experiments that use it; `compare`
    only reads the pairs. Printing uses the parenthesised single-argument
    form, which is valid on both Python 2 and Python 3.
    """
    ap_winners = generate_all_winners(RankingPair.delta_average_precision)
    ndcg_winners = generate_all_winners(RankingPair.delta_ndcg_at, 5)

    # (output label, pairs to evaluate, interleaver class)
    experiments = [
        ("Average Precision + Balanced Interleaver + Random Click Model: ",
         ap_winners, BalancedInterleaver),
        ("Average Precision + Probabil Interleaver + Random Click Model: ",
         ap_winners, ProbabilisticInterleaver),
        ("Normalized DCG @5 + Balanced Interleaver + Random Click Model: ",
         ndcg_winners, BalancedInterleaver),
        # NOTE(review): same measure, rank and interleaver as the previous
        # entry; looks like a copy-paste leftover. Confirm the intended setup.
        ("Normalized DCG    + Balanced Interleaver + Random Click Model: ",
         ndcg_winners, BalancedInterleaver),
    ]

    for label, pairs, interleaver_class in experiments:
        # Fresh interleaver and click model per experiment.
        score = compare(pairs, interleaver_class(), RandomClickModel())
        print(label + str(score))
    
# Run all experiments; the lines below this call are the output of one run.
analyse()

Average Precision + Balanced Interleaver + Random Click Model: 0.499419779891
Average Precision + Probabil Interleaver + Random Click Model: 0.499681814779
Normalized DCG @5 + Balanced Interleaver + Random Click Model: 0.498706427015
Normalized DCG    + Balanced Interleaver + Random Click Model: 0.504578567538
