In [143]:
import collections as cl
import random
import numpy as np

# 1. Random Click Model (RCM)

## $\rho = \frac{\sum_{s\in S}\sum_{u\in s}c_{u}^{(s)}}{\sum_{s\in S}\vert s \vert}$

In other words, $\rho = \frac{ \text{number of clicks}}{\text{number of documents shown}}$

In [144]:
class RCM:
    """Random Click Model: every shown document is clicked with the same probability.

    The single parameter is estimated from a click log as
    ``number of clicks / number of documents shown``.

    Log lines are whitespace-separated records. The code reads field 2 as the
    action type (``q`` marks a query; any other value is counted as a click)
    and, on query lines, treats every field from index 5 onward as a shown URL.
    NOTE(review): presumably the Yandex Relevance Prediction Challenge layout,
    judging by the log file used in the demo cell -- confirm.
    """

    def __init__(self, log_filename):
        """Learn the click probability from the log at ``log_filename``."""
        # The context manager guarantees the handle is closed, even on a parse error.
        with open(log_filename) as log_file:
            self.probability = self.get_parameter(log_file)

    def _is_querry(self, line):
        """Return True when the raw log ``line`` is a query record."""
        fields = line.split()
        # Lines with fewer than three fields carry no action type.
        return len(fields) > 2 and fields[2].lower() == 'q'

    def _get_url_list(self, line):
        """Return the list of URL ids shown for the query record ``line``."""
        fields = line.split()
        assert fields[2].lower() == 'q'
        # Slice the *fields*, not the raw string: slicing the string would make
        # len() count characters instead of documents.
        return fields[5:]

    def get_parameter(self, training_data):
        '''
        (a) A method that learns the parameters of the model given a set of training data.

        ``training_data`` is any iterable of raw log lines (an open file works).
        Returns clicks / documents shown, or 0.0 when no documents were shown.
        '''
        documents_shown = 0
        clicks = 0
        for line in training_data:
            if len(line.split()) < 3:
                # Blank or truncated line: neither a query nor a click.
                continue
            if self._is_querry(line):
                documents_shown += len(self._get_url_list(line))
            else:
                clicks += 1
        if documents_shown == 0:
            # Nothing was shown, so there is no evidence for any click.
            return 0.0
        return clicks / documents_shown

    def click_probabilities(self, urls):
        '''
        (b) A method that predicts the click probability given a ranked list of relevance labels.
            For RCM, all links have the same probability.
        '''
        return [self.probability] * len(urls)

    def clicks(self, click_probabilities):
        '''
        (c) A method that decides - stochastically - whether a document is clicked based on their probabilities.

        Returns a list of 0/1 ints, one independent Bernoulli draw per probability.
        '''
        return [int(np.random.binomial(1, prob)) for prob in click_probabilities]
 
        

In [145]:
# Fit the RCM on the challenge log, then sample clicks for a ten-document ranking.
model = RCM("./../resources/YandexRelPredChallenge.txt")
model.clicks(model.click_probabilities(list(range(10))))

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

## 2. Simple Dependent Click Model (SDCM)

In [148]:
class SDCM:
    """Simple Dependent Click Model (SDCM).

    Documents are examined top to bottom. A document is clicked with a
    probability given by its attractiveness (derived from its relevance
    label); after a click at rank r, examination continues with probability
    ``rank_probabilities[r]``, which is learned from a click log.

    Log lines are whitespace-separated records. The code reads field 2 as the
    action type (``q`` marks a query, anything else is a click), fields from
    index 5 onward of a query as the shown URLs, and field 3 of a click as the
    clicked URL.
    NOTE(review): presumably the Yandex Relevance Prediction Challenge layout,
    judging by the log file used in the demo cell -- confirm.
    """

    # Highest relevance grade; used to normalise attractiveness.
    MAX_REL = 2

    def __init__(self, log_filename):
        """Learn the per-rank continuation probabilities from ``log_filename``."""
        # The context manager guarantees the handle is closed, even on a parse error.
        with open(log_filename) as log_file:
            self.rank_probabilities = self.get_parameters(log_file)

    def attractiveness(self, x):
        """Map a relevance label ``x`` to a click probability: (2^x - 1) / 2^MAX_REL."""
        return (2**x - 1) / 2**self.MAX_REL

    def _is_querry(self, line):
        """Return True when the already-split record ``line`` is a query."""
        return len(line) > 2 and line[2].lower() == 'q'

    def _get_url_list(self, line):
        """Return the URL ids of the already-split query record ``line``."""
        assert line[2].lower() == 'q'
        return line[5:]

    def get_rank(self, querry, click):
        """Return the 0-based rank of the clicked URL within ``querry``.

        Both arguments are already-split records. Returns -1 when the clicked
        URL is not among the URLs of that query.
        """
        shown_urls = querry[5:]
        clicked_url = click[3]
        if clicked_url in shown_urls:
            return shown_urls.index(clicked_url)
        return -1

    def get_parameters(self, training_data, n_ranks=10):
        '''
        (a) A method that learns the parameters of the model given a set of training data.

        ``training_data`` is any iterable of raw log lines (an open file works).
        Returns a float array of length ``n_ranks`` whose entry r is
        1 - (#times rank r was the last click of a query) / (#clicks at rank r).
        Ranks that were never clicked get 1.0: with no observed click there is
        no evidence of stopping, and the ratio would otherwise be 0/0.
        '''
        last_clicked_rank = -1
        last_querry = None

        last_click_rank_counter = cl.Counter()
        click_rank_counter = cl.Counter()

        for raw_line in training_data:
            line = raw_line.split()
            if len(line) < 3:
                # Blank or truncated line: carries no action type.
                continue
            if self._is_querry(line):
                last_querry = line
                if last_clicked_rank != -1:
                    # A new query starts, so the pending click was the last one.
                    last_click_rank_counter[last_clicked_rank] += 1
                    last_clicked_rank = -1  # counted, so clear it
            else:
                if last_querry is None or len(line) < 4:
                    # A click with no preceding query (or no URL) has no rank.
                    continue
                # NOTE(review): a click on a URL outside the current result
                # list yields -1 and therefore discards the pending last click.
                last_clicked_rank = self.get_rank(last_querry, line)
                click_rank_counter[last_clicked_rank] += 1

        # Account for the final click in the log, which no later query flushes.
        if last_clicked_rank != -1:
            last_click_rank_counter[last_clicked_rank] += 1

        stop_ratio = [
            last_click_rank_counter[r] / click_rank_counter[r] if click_rank_counter[r] else 0.0
            for r in range(n_ranks)
        ]
        return 1 - np.array(stop_ratio)

    def get_atractiveness(self, urls):
        '''
        (b) A method that predicts the link atractiveness given a list of relevance labels.
        '''
        return [self.attractiveness(label) for label in urls]

    def clicks(self, atractiveness):
        '''
        (c) A method that decides - stochastically - whether a document is clicked based
            on their atractiveness and probabilities.

        Returns a list of 0/1 ints. The list must not be longer than
        ``rank_probabilities``, which is indexed by rank.
        '''
        clicks = [0] * len(atractiveness)
        for rank, attr in enumerate(atractiveness):
            if np.random.binomial(1, attr) == 1:
                clicks[rank] = 1
                # After a click, continue only with the learned probability.
                if np.random.binomial(1, self.rank_probabilities[rank]) == 0:
                    break
        return clicks
 
        

In [166]:
# Fit the SDCM on the challenge log, then simulate clicks for a ranking given by relevance labels.
model = SDCM("./../resources/YandexRelPredChallenge.txt")
model.clicks(model.get_atractiveness([2, 2, 1, 0, 1, 2, 0, 0, 1, 1]))

[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]