This Jupyter notebook contains the first project for Information Retrieval 1, taught at the UvA. The code was written by Oscar Ligthart, Nicole Ferreira Silverio and Arend van Dormalen.

ANSWER TO THEORETICAL QUESTION 1A

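If no correction is applied, the probability of making at least one type I error (the family-wise error rate) grows each time an experiment is repeated. For _m_ independent experiments, each tested at significance level $\alpha$, this probability becomes $1 - (1 - \alpha)^m \approx m\alpha$ (the approximation holds for small $\alpha$).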

ANSWER TO THEORETICAL QUESTION 2

Assume two ranked lists created by two different rankers. List $l1$ contains documents $d1$, $d2$ and $d3$, in that order; list $l2$ contains documents $d2$, $d3$ and $d4$, in that order. Now assume that $d3$ is the only relevant document, so it is the document that users click on most often. Clearly $l2$ is the better ranking, because it places $d3$ at a higher position. Team Draft Interleaving, however, will evaluate the two rankers as performing equally well.

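In this situation, $d3$ is always the third item of the interleaved list. Whichever ranker wins the first coin flip, the first two positions are filled with $d1$ (contributed by $l1$) and $d2$ (contributed by $l2$), because each ranker contributes its highest-ranked document that is not yet in the interleaved list; $d2$ is thereby removed from $l1$, as it has already been supplied by $l2$. At the second coin flip, $d3$ is therefore the next document for both lists, and it is credited to whichever ranker wins that flip. Both rankers thus have the same chance of supplying the only relevant document to the interleaved list, so in expectation they receive the same number of clicks and the comparison ends in a tie.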

In [None]:
import itertools
import numpy as np
import random
import re

# every possible relevance labelling of a ranking of 5 documents
relevance = ['N', 'R', 'HR']
options = list(itertools.product(relevance, repeat=5))

# ordered index pairs (a, b) with a != b, so both (a, b) and (b, a) occur
pair_index = list(itertools.permutations(range(len(options)), 2))

# turn every index pair into a [P ranking, E ranking] list
pairs = [[options[first], options[second]] for first, second in pair_index]


In [None]:
def get_average_precision(ranking):
    """Count the relevant documents of a ranking and sum the precision at their ranks.

    ranking: iterable of relevance labels; 'R' and 'HR' both count as
        relevant, every other label as non-relevant.

    Returns a tuple (rel, AP_numerator): the number of relevant documents in
    the ranking and the sum of precision@k over every rank k that holds a
    relevant document. No division by a number of relevant documents is done
    here, so the second value is only the numerator of the average precision.
    """
    hits = 0
    precision_sum = 0
    for position, label in enumerate(ranking, start=1):
        if label not in ('R', 'HR'):
            continue
        hits += 1
        # precision at this cut-off: relevant documents so far / rank
        precision_sum += hits / position

    return hits, precision_sum

# average precision of P and E for every pair (key is the pair, value the two precisions)
AP_results = {}

for P, E in pairs:
    # relevant-document count and summed precision of each ranking
    P_rel, P_AP_numerator = get_average_precision(P)
    E_rel, E_AP_numerator = get_average_precision(E)

    # total number of relevant documents returned for the query by both rankings
    total_rel = P_rel + E_rel

    # both rankings are normalised by that same total
    AP_results[(P, E)] = (P_AP_numerator / total_rel, E_AP_numerator / total_rel)

print(len(pairs))

####### NOW GET DELTA MEASURES #########
# keep the difference only for the pairs in which E outperforms P
AP_delta_values = [
    E_AP - P_AP
    for P_AP, E_AP in AP_results.values()
    if E_AP > P_AP
]

print(len(AP_delta_values))


In [None]:
##### nDCG cell #####
def get_nDCG(ranking):
    """Return the discounted cumulative gain (DCG) of a ranking.

    Despite the name, the value is not normalised by an ideal DCG: the raw
    DCG is returned.

    ranking: iterable of relevance labels, each one of 'N' (grade 0),
        'R' (grade 1) or 'HR' (grade 2).

    Returns the sum over all 1-based ranks r of (2**grade - 1) / log2(1 + r),
    or 0 for an empty ranking.

    Raises ValueError if a label is not one of 'N', 'R' or 'HR'. Previously
    such a label silently reused the grade of the preceding document.
    """
    grades = {'N': 0, 'R': 1, 'HR': 2}
    DCG = 0

    # loop through ranking
    for i, rank in enumerate(ranking):
        if rank not in grades:
            raise ValueError("unknown relevance label: %r" % (rank,))
        rel_r = grades[rank]

        # i is 0-based, so the rank of the document is i + 1
        DCG += (2**rel_r - 1)/(np.log2(1+(i+1)))

    return DCG

# DCG of P and E for every pair (key is the pair, value the two DCG scores)
nDCG_results = {(P, E): (get_nDCG(P), get_nDCG(E)) for P, E in pairs}

##### NOW GET THE DELTA MEASURES #####
# keep the difference only for the pairs in which E outperforms P
nDCG_delta_values = [
    E_DCG - P_DCG
    for P_DCG, E_DCG in nDCG_results.values()
    if E_DCG > P_DCG
]


print(len(nDCG_delta_values))


In [None]:
##### ERR cell #####

def get_ERR(ranking):
    """Return the Expected Reciprocal Rank (ERR) of a ranking.

    ranking: iterable of relevance labels, each one of 'N' (grade 0),
        'R' (grade 1) or 'HR' (grade 2).

    Returns the sum over all 1-based ranks r of p_r * R_r / r, where
    R_r = (2**grade - 1) / 2**max_rel is the probability that the document
    at rank r satisfies the user and p_r is the probability that none of the
    documents above rank r did. Returns 0 for an empty ranking.

    Raises ValueError if a label is not one of 'N', 'R' or 'HR'.

    The earlier version skipped the first document altogether and discounted
    the document at 0-based index i by 1/i, so the whole ranking was shifted
    by one position; every document now contributes with its true rank.
    """
    grades = {'N': 0, 'R': 1, 'HR': 2}
    max_rel = 2

    ERR = 0
    # probability that the user reaches the current rank still unsatisfied
    p = 1

    # ranks are 1-based, which also keeps the 1/r discount free of a division by zero
    for r, rank in enumerate(ranking, start=1):
        if rank not in grades:
            raise ValueError("unknown relevance label: %r" % (rank,))

        # Calculate R with the mapping function
        R = (2**grades[rank] - 1)/(2**max_rel)

        # the user stops at rank r with probability p * R
        ERR += p * (R/r)

        # otherwise the user moves on to the next rank
        p = p*(1-R)

    return ERR

# ERR of P and E for every pair (key is the pair, value the two ERR scores)
ERR_results = {(P, E): (get_ERR(P), get_ERR(E)) for P, E in pairs}

##### NOW GET THE DELTA MEASURES #####
# keep the difference only for the pairs in which E outperforms P
ERR_delta_values = [
    E_ERR - P_ERR
    for P_ERR, E_ERR in ERR_results.values()
    if E_ERR > P_ERR
]

print(len(ERR_delta_values))


In [None]:
##### Balanced Interleaving #####

# interleaved rankings and, for every position, the index of the ranking the
# document came from (0 = P = pair[0], 1 = E = pair[1])
all_results = []
all_origins = []

for pair in pairs:

    # the coin decides which ranking contributes first in every round
    coin_winner = random.randint(0, 1)
    winner, loser = pair[coin_winner], pair[1 - coin_winner]

    # alternate the documents of winner and loser, rank by rank
    resulting_list = [doc for round_docs in zip(winner, loser) for doc in round_docs]
    origin_list = [coin_winner, 1 - coin_winner] * len(winner)

    all_results.append(resulting_list)
    all_origins.append(origin_list)

In [None]:
##### Random Click Model #####

# Learns parameter
def learn_param_RCM(data, docs_per_query=10):
    """Estimate rho, the click probability of the Random Click Model, from a click log.

    data: open text file (or any iterable of lines) holding tab-separated
        records whose third field is the action type: "Q" for a query,
        "C" for a click. Records with any other type are ignored.
    docs_per_query: number of documents assumed to be shown for every query
        record (defaults to 10).

    Returns rho = number of clicks / number of shown documents.

    Raises ValueError if the log contains no query record, since rho is
    undefined in that case.
    """
    clicks = 0
    documents = 0

    # stream the log instead of loading it completely into memory
    for line in data:
        items = re.split(r'\t+', line.rstrip('\r\n'))
        # blank or truncated lines carry no action type
        if len(items) < 3:
            continue
        if items[2] == "Q":
            documents += docs_per_query
        elif items[2] == "C":
            clicks += 1

    if documents == 0:
        raise ValueError("no query records found: cannot estimate rho")

    # Calculate rho
    return clicks/documents

# Predicts a click probability
def predict_prob_RCM(ranking, param):
    """Return the click probability of every document in a ranking.

    ranking: iterable of documents; only the number of documents matters.
    param: the click probability that is assigned to every document.

    Returns a list holding `param` once per document in `ranking`.
    """
    return [param for _ in ranking]

# Decide whether document is clicked on
def click_doc_RCM(click_prob):
    """Sample which documents are clicked, given their click probabilities.

    click_prob: iterable of click probabilities, one per document.

    Returns a list with a 1 for every clicked document and a 0 otherwise.
    One random number is drawn per document, and the document is clicked
    when that number does not exceed its probability.
    """
    return [1 if random.random() <= prob else 0 for prob in click_prob]




In [None]:
##### Simple Dynamic Bayesian Model #####

# Learns parameter
def learn_param_DBM(file):
    """Estimate sigma, the satisfaction probability of the simple DBM, from a click log.

    file: open text file (or any iterable of lines) holding tab-separated
        records whose third field is the action type: "Q" for a query,
        "C" for a click.

    Returns sigma = number of last clicks / number of clicks, where a last
    click is a click that is directly followed by a query record or that is
    the final record of the log.

    Raises ValueError if the log contains no click, since sigma is undefined
    in that case.

    The earlier version never counted a click at the very end of the log as
    a last click, although no further click can follow it.
    """
    clicks = 0
    last_clicks_query = 0
    previous_type = ""

    for line in file:
        items = re.split(r'\t+', line.rstrip('\r\n'))
        # blank or truncated lines carry no action type
        if len(items) < 3:
            continue
        current_type = items[2]
        if current_type == "C":
            clicks += 1
        elif current_type == "Q" and previous_type == "C":
            # a new query starts, so the preceding click closed the previous one
            last_clicks_query += 1
        previous_type = current_type

    # a click that ends the log is the last click of its query as well
    if previous_type == "C":
        last_clicks_query += 1

    if clicks == 0:
        raise ValueError("no click records found: cannot estimate sigma")

    return last_clicks_query/clicks
        

# Predicts a click probability
def click_prob_DBM(rank, sigma):
    """Sample whether a document attracts the user and return the continuation parameter.

    rank: relevance label of the document: 'N', 'R' or 'HR'.
    sigma: accepted for symmetry with the other DBM functions; it is not
        used by this function.

    Returns a tuple (P_A, gamma). P_A is 1 when a freshly drawn random number
    does not exceed the attractiveness alpha of the label (0 for 'N', 0.4 for
    'R', 0.8 for 'HR') and 0 otherwise. gamma is always 1, because the simple
    DBM is used.
    """
    # attractiveness depends only on the relevance level of the document
    if rank == 'N':
        alpha = 0
    elif rank == 'R':
        alpha = 0.4
    elif rank == 'HR':
        alpha = 0.8

    # one draw decides whether the user is attracted by the document
    P_A = 1 if random.random() <= alpha else 0

    return P_A, 1
       
        
# Decide which documents are clicked
def click_doc_DBM(ranking, sigma):
    """Simulate which documents of a ranking are clicked under the simple DBM.

    ranking: iterable of relevance labels ('N', 'R' or 'HR'), top rank first.
    sigma: probability that the user is satisfied after a click and stops
        reading further snippets.

    Returns a list with a 1 for every clicked document and a 0 otherwise.

    The earlier version called the undefined name `click_prob` with a
    hard-coded 0.51; the attractiveness is now sampled with click_prob_DBM
    and the caller's sigma is passed on.
    """
    # the first snippet is always read
    P_E = 1

    clicked = []

    for rank in ranking:
        # attractiveness is sampled for every document, examined or not
        P_A, gamma = click_prob_DBM(rank, sigma)

        # a document is clicked only if it is both examined and attractive
        P_C = 1 if P_A == 1 and P_E == 1 else 0
        clicked.append(P_C)

        # after a click a satisfied user stops reading; P_E then stays 0 for
        # the rest of the ranking, because it only changes after a click
        if P_C == 1 and random.random() <= sigma:
            P_E = 0

    return clicked

            

In [None]:
# Simulate random click model

# learn the click probability from the click log; the file is closed even on error
with open("YandexRelPredChallenge.txt", "r") as f:
    rho = learn_param_RCM(f)

# number of simulated impressions won by each algorithm
E_win = 0
P_win = 0

# every interleaved ranking is paired with its own origin list
# (0 = document supplied by P, 1 = document supplied by E)
for ranking, origins in zip(all_results, all_origins):

    # sample the clicks on this interleaved ranking
    click_probs = predict_prob_RCM(ranking, rho)
    clicked = click_doc_RCM(click_probs)

    # attribute every click to the algorithm that supplied the document
    E_click = 0
    P_click = 0
    for click, origin in zip(clicked, origins):
        if click == 1 and origin == 1:
            E_click += 1
        elif click == 1 and origin == 0:
            P_click += 1

    # the algorithm with more clicks wins the impression; ties count for neither
    if E_click > P_click:
        E_win += 1
    elif P_click > E_click:
        P_win += 1

# proportion of times E won among the impressions that had a winner
decided = E_win + P_win
E_win_proportion = E_win / decided if decided else float("nan")

print(E_win, P_win, E_win_proportion)


In [None]:
# Simulate dynamic bayesian model

# learn the satisfaction probability from the click log; the file is closed even on error
with open("YandexRelPredChallenge.txt", "r") as f:
    sigma = learn_param_DBM(f)

# simulate the clicks on `ranking`, the loop variable left over from the
# random click model simulation
click_doc_DBM(ranking, sigma)