# Dense and Sparse Simulations of Position Bias

In [31]:
pip install faker --quiet

Note: you may need to restart the kernel to use updated packages.


In [32]:
import numpy as np
import pandas as pd
import faker
from sklearn.datasets import make_regression
from tabulate import tabulate

In [33]:
# Seed NumPy's global random state and Faker (class-level seed) with a fixed value
# so that repeated runs of the notebook draw the same random numbers and ids.
np.random.seed(42)
faker.Faker.seed(42)

In [34]:
# Faker instance used by generate_jobs / generate_queries to draw UUID4 strings
fake = faker.Faker()

In [35]:
# True position bias: one multiplier per displayed rank (10 ranks, first to last).
# simulate_search_results multiplies it element-wise with the relevance of the shown
# results to obtain the click probability at each rank.
true_bias = np.array([0.7, 0.6, 0.5, 0.4, 0.3, 0.25, 0.2, 0.15, 0.1, 0.05])

In [36]:
def generate_jobs(njobs):
    """
    Build `njobs` random job ids.

    Every id is a UUID4 string drawn from the module-level `fake` instance
    whose first character is replaced by "J", so job ids are easy to
    recognise.

    Returns a NumPy array holding the ids in generation order.
    """
    return np.array(["J" + str(fake.uuid4())[1:] for _ in range(njobs)])

In [37]:
def generate_queries(nqueries):
    """
    Build `nqueries` random query ids.

    Every id is a UUID4 string drawn from the module-level `fake` instance
    whose first character is replaced by "Q", so query ids are easy to
    recognise.

    Returns a NumPy array holding the ids in generation order.
    """
    return np.array(["Q" + str(fake.uuid4())[1:] for _ in range(nqueries)])

In [38]:
def generate_relevance(uniform = True, nfeats = 0, njobs = 105, nqueries = 50):
    """
    Simulate relevance that is uniformly distributed or non-uniformly distributed
    Additionally generates features if relevance is non-uniformly distributed

    Parameters:
        uniform:  if True, relevance is drawn with np.random.random
        nfeats:   number of regression features for the non-uniform case;
                  values <= 0 (including the default) fall back to 20
        njobs:    number of jobs (first axis of the result)
        nqueries: number of queries (second axis of the result)

    Returns:
        uniform = True:  relevance array of shape (njobs, nqueries)
        uniform = False: tuple (relevance, features) with shapes
                         (njobs, nqueries) and (njobs, nqueries, nfeats)
    """

    if uniform:
        return np.random.random((njobs, nqueries))

    # Honour the caller's feature count; previously it was always overwritten with 20
    if nfeats <= 0:
        nfeats = 20

    # Simulate non-uniformly distributed relevance and features
    features, relevance = make_regression(
        n_samples = nqueries * njobs,
        n_features = nfeats,
        n_informative = nfeats,
        noise = 100,
        random_state = 99
    )

    # Recode relevance so that it is between 0 and 1 (min-max scaling).
    # After subtracting the minimum, the remaining range equals the maximum.
    relevance = relevance - np.min(relevance)
    value_range = np.max(relevance)
    if value_range > 0:
        # Skip the division for a constant target to avoid dividing by zero
        relevance = relevance / value_range

    # Formatting: one (job, query) cell per generated sample
    features = features.reshape(njobs, nqueries, nfeats)
    relevance = relevance.reshape(njobs, nqueries)

    return relevance, features

In [39]:
def generate_data(sparse = False, uniform = True):
    """
    Generates jobs, queries and relevance for a dense or a sparse simulation

    Dense:  105 jobs x 50 queries, always with uniform relevance
            (`uniform` is not consulted in this case)
    Sparse: 200 jobs x 1000 queries, with uniform relevance or, when
            uniform = False, non-uniform relevance plus 20 features

    Returns (jobs, queries, relevance), or (jobs, queries, relevance, features)
    for the sparse non-uniform case
    """

    # Problem size depends only on the sparsity setting
    njobs, nqueries = (200, 1000) if sparse else (105, 50)

    jobs = generate_jobs(njobs = njobs)
    queries = generate_queries(nqueries = nqueries)

    # Only the sparse simulation supports non-uniform relevance with features
    if sparse and not uniform:
        relevance, features = generate_relevance(uniform = False, nfeats = 20, njobs = njobs, nqueries = nqueries)
        return jobs, queries, relevance, features

    relevance = generate_relevance(njobs = njobs, nqueries = nqueries)
    return jobs, queries, relevance

In [40]:
def format_data(data, features = False):
    """
    Format the simulated data so that it has multiple rows per session
    Example row: (query, job, click, job rank, true relevance)

    Parameters:
        data:     DataFrame with one row per session; 'query_id' holds a string and
                  'job_ids', 'click', 'true_rel' (and 'features' when requested)
                  hold equally long list-likes, one entry per shown result
        features: if True, the 'features' column is exploded as well

    Returns:
        DataFrame with one row per shown result and the extra columns
        'rank' (0-based position in the result list) and
        'qd_id' (categorical "<query_id>_<job_id>")
    """

    # Assign ranks from the actual length of each result list
    # (instead of assuming exactly 10 results per session)
    data = data.assign(
        rank = data['job_ids'].apply(lambda ids : list(range(len(ids))))
    )

    # One row per (session, shown result)
    list_columns = ['job_ids', 'click', 'rank', 'true_rel']
    if features:
        list_columns.append('features')
    data = data.explode(list_columns)

    # Assign query_doc ids; a plain zip avoids the slow row-wise DataFrame.apply.
    # The values are assigned by position because explode leaves a repeated index.
    qd_labels = [
        query_id + "_" + job_id
        for query_id, job_id in zip(data['query_id'], data['job_ids'])
    ]
    data = data.assign(qd_id = pd.Categorical(qd_labels))

    return data

In [41]:
def simulate_search_results(sparse = False, uniform = True, corr = True):
    """
    Simulate search sessions for a dense or sparse simulation
    Additionally has options for introducing correlation between rank and relevance
    Can simulate search sessions based on non-uniform or uniformly distributed relevance

    Parameters:
        sparse:  if True, most queries get 1-3 sessions and every 100th query gets
                 9-499 sessions; otherwise every query gets 2000-9999 sessions
        uniform: if False, non-uniform relevance with features is used
                 (only available together with sparse = True)
        corr:    if True, shown results are picked from the relevance ordering with a
                 fixed step, so rank correlates with relevance; if False, they are
                 drawn at random from the top 50

    Returns:
        The DataFrame produced by format_data: one row per shown result with the
        query, job, click, rank, true relevance (and features when uniform is False)

    Raises:
        ValueError: for the dense, non-uniform combination, for which no data
                    generation scenario exists
    """

    # Generate jobs, queries and relevance for the corresponding scenario
    if uniform:
        jobs, queries, relevance = generate_data(sparse = sparse)
    elif sparse:
        jobs, queries, relevance, features = generate_data(sparse = True, uniform = False)
    else:
        # Previously this combination fell through and failed on an unbound variable
        raise ValueError("Non-uniform relevance is only supported for the sparse simulation")

    nqueries = len(queries)
    njobs = len(jobs)
    nresults = len(true_bias)   # number of results shown per session (one per rank)
    data = []
    count = 1

    # Generate fake search data
    for qidx in range(nqueries):

        if sparse:
            if count != 100:
                # Only have a small amount of observations per query for sparse simulation
                nobs = int(np.random.uniform(1, 4))
                count += 1
            else:
                # Make sure some queries are more popular: every 100th query.
                # The missing `else` used to overwrite the small count for every query.
                nobs = int(np.random.uniform(9, 500))
                count = 1
        else:
            # Dense simulation has a larger amount of observations per query
            nobs = int(np.random.uniform(2000, 10000))

        for oidx in range(nobs):
            # Random user preferences, i.e. noise
            user_prefs = np.random.uniform(0.7, 1.0, (njobs,))

            # Combine with relevance for user specific sorting
            qd_relevance = np.multiply(relevance[:, qidx], user_prefs)
            sort_idx = np.argsort(-qd_relevance)

            if not corr:
                # No rank-relevance correlation:
                # randomly select the shown results from the top 50
                search_results = sort_idx[0:50]
                np.random.shuffle(search_results)
                search_results = search_results[0:nresults]
            else:
                # Rank-relevance correlation:
                # select results based on higher relevance with equal step size
                # (at least 1, so small job pools do not produce a zero slice step)
                step_size = max(1, int(np.round((njobs / nresults) - 1)))
                search_results = sort_idx[0:step_size * nresults:step_size]

            # Calculate probability of click including position bias
            search_results_relevance = qd_relevance[search_results]
            click_prob = np.multiply(search_results_relevance, true_bias)

            # Simulate clicks: 1 where the uniform draw falls below the click probability
            clicks = (np.random.uniform(0, 1, (nresults,)) < click_prob) * 1

            if not uniform:
                data.append((queries[qidx], jobs[search_results], clicks, relevance[search_results, qidx], features[search_results, qidx]))
            else:
                data.append((queries[qidx], jobs[search_results], clicks, relevance[search_results, qidx]))

    # Format data
    if not uniform:
        data = pd.DataFrame(data, columns = ['query_id', 'job_ids', 'click', 'true_rel', 'features'])
        return format_data(data, features = True)

    data = pd.DataFrame(data, columns = ['query_id', 'job_ids', 'click', 'true_rel'])
    return format_data(data)

In [42]:
def analyze_qd_counts(data):
    """
    Summarises how often each query-document pair occurs, to assess sparsity

    Reads the categorical 'qd_id' column of `data`, counts the rows per category
    code and reports the percentage of codes seen once, 2-10 times and more than
    10 times, the highest count, the number of rows and the number of distinct pairs

    Returns the summary as a string rendered by tabulate
    """

    codes = data['qd_id'].cat.codes.astype(int).to_numpy()
    occurrences = np.bincount(codes)
    n_codes = len(occurrences)

    def percentage(mask):
        # Share of category codes selected by `mask`, in percent
        return mask.sum() / n_codes * 100

    seen_once = percentage(occurrences == 1)
    seen_2_to_10 = percentage((occurrences >= 2) & (occurrences <= 10))
    seen_over_10 = percentage(occurrences > 10)

    rows = []
    rows.append(["Amount of query-document pairs that occurred only once", seen_once])
    rows.append(["Amount of query-document pairs that occurred 2-10 times:", seen_2_to_10])
    rows.append(["Amount of query-document pairs that occurred more than 10 times", seen_over_10])
    rows.append(["Highest occurence count: ", occurrences.max()])
    rows.append(["Amount of rows:", len(data)])
    rows.append(["Amount of query-document pairs: ", len(np.unique(codes))])

    return tabulate(rows)

In [43]:
# Densely simulated data, with and without rank-relevance correlation
dense_data_corr = simulate_search_results()
dense_data_random = simulate_search_results(corr = False)

# Sparsely simulated data, with uniformly and non-uniformly distributed relevance
# (both keep the default rank-relevance correlation, corr = True)
sparse_data_uniform = simulate_search_results(sparse = True)
sparse_data_peaked = simulate_search_results(sparse = True, uniform = False)

In [44]:
# Print the query-document pair occurrence summary for each simulated data set
print("Dense Data, Rank-Relevance correlation: ", analyze_qd_counts(dense_data_corr))
print("Dense Data, No Rank-Relevance correlation: ", analyze_qd_counts(dense_data_random))
print("Sparse Data, Uniform Relevance: ", analyze_qd_counts(sparse_data_uniform))
print("Sparse Data, Non-Uniform Relevance: ", analyze_qd_counts(sparse_data_peaked))

Dense Data, Rank-Relevance correlation:  ---------------------------------------------------------------  --------------
Amount of query-document pairs that occurred only once              0.0849979
Amount of query-document pairs that occurred 2-10 times:            0.488738
Amount of query-document pairs that occurred more than 10 times    99.4263
Highest occurence count:                                         5369
Amount of rows:                                                     3.08152e+06
Amount of query-document pairs:                                  4706
---------------------------------------------------------------  --------------
Dense Data, No Rank-Relevance correlation:  ---------------------------------------------------------------  --------------
Amount of query-document pairs that occurred only once              1.00032
Amount of query-document pairs that occurred 2-10 times:            1.80703
Amount of query-document pairs that occurred more than 10 times    97.192

In [45]:
# NOTE(review): as the warning printed below this cell shows, %store cannot persist a
# function that was defined interactively in the notebook. To reuse
# simulate_search_results elsewhere, move it into a .py module and import it.
%store simulate_search_results

Proper storage of interactively declared classes (or instances
of those classes) is not possible! Only instances
of classes in real modules on file system can be %store'd.

