In [1]:
import altair as alt
import numpy as np
import pandas as pd

from math import sqrt
from scipy.special import logit, expit

from raking.run_raking import run_raking

Parameters of the simulation.

In [2]:
# Simulation settings. These are read by the cells below as module-level names.
# Note: `sigma` is not a bias term; it is the standard deviation of the normal
# noise added on the logit scale (see generate_data).
mu = 0.1  # true prevalence; also used for the raking margin (2 * mu) and the reference line in the figure
sigma = 0.5  # std. dev. of the logit-scale noise; run_one_sim applies it to cause 2 only (cause 1 uses 0)
n_samples = 10  # number of samples drawn per cause in each simulation
bounds1 = [100, 200]  # [low, high] bounds of the uniform sample-size draw for cause 1
bounds2 = [1000, 2000]  # [low, high] bounds of the uniform sample-size draw for cause 2
n_sims = 500  # number of independent simulations

Functions to run the simulation.

In [3]:
# Function to generate the data
def generate_data(n_samples, bounds, mu, sigma, generator=None):
    """Simulate sample sizes and binomial event counts for one cause.

    Parameters
    ----------
    n_samples : int
        Number of samples to simulate.
    bounds : sequence of two numbers
        Lower and upper bound of the uniform draw for the sample sizes.
    mu : float
        Prevalence shared by all samples (before any noise is added).
    sigma : float
        Standard deviation of the normal noise added to logit(mu) for each
        sample. When 0, every sample uses exactly `mu`.
    generator : numpy.random.Generator, optional
        Random generator to draw from. When omitted, the notebook-level
        `rng` is used, which must have been created beforehand.

    Returns
    -------
    (n_i, y_i) : tuple of numpy.ndarray
        Sample sizes (whole numbers stored as floats) and event counts.
    """
    # Fall back to the notebook-level generator so existing calls keep working.
    gen = rng if generator is None else generator
    # A binomial draw needs a whole number of trials. Flooring here makes the
    # returned sample size equal to the number of trials actually simulated,
    # so that sum(y_i) / sum(n_i) is not skewed by a fractional remainder.
    n_i = np.floor(gen.uniform(bounds[0], bounds[1], n_samples))
    if sigma == 0:
        mu_i = np.repeat(mu, n_samples)
    else:
        # Perturb the prevalence on the logit scale so it stays inside (0, 1).
        mu_i = expit(logit(mu) + gen.normal(0, sigma, n_samples))
    y_i = np.zeros(n_samples)
    # One draw per sample, in order, so the random stream is consumed in the
    # same sequence as before (uniform, optional normal, then binomials).
    for i in range(n_samples):
        y_i[i] = gen.binomial(int(n_i[i]), mu_i[i], 1)[0]
    return (n_i, y_i)

In [4]:
# Function to estimate mu, its variance and the correction factor
def estimate_prevalence(n_i, y_i):
    """Estimate the pooled prevalence, its variance and a dispersion factor.

    Parameters
    ----------
    n_i : numpy.ndarray
        Sample sizes.
    y_i : numpy.ndarray
        Event counts, one per sample.

    Returns
    -------
    (mu_hat, var_mu, alpha) : tuple
        mu_hat is the total count divided by the total sample size,
        var_mu is mu_hat * (1 - mu_hat) divided by the total sample size,
        alpha is the standard deviation (numpy default, ddof=0) of the
        standardized residuals of the counts around n_i * mu_hat.
    """
    total_size = np.sum(n_i)
    mu_hat = np.sum(y_i) / total_size
    var_mu = mu_hat * (1 - mu_hat) / total_size
    # Expected count of each sample under the pooled prevalence.
    expected = n_i * mu_hat
    residuals = (y_i - expected) / np.sqrt(expected * (1 - mu_hat))
    alpha = np.std(residuals)
    return (mu_hat, var_mu, alpha)

In [5]:
# Function to create observations data frame
def create_obs(cause, mu_hat, var_mu, alpha, type_weight):
    """Build the two-row observations data frame with both weight columns.

    Parameters
    ----------
    cause : sequence
        Labels of the two causes.
    mu_hat : sequence of two floats
        Estimated prevalence for each cause.
    var_mu : sequence of two floats
        Variance of each estimated prevalence.
    alpha : sequence of two floats
        Correction factor for each cause.
    type_weight : {'sq_root_std', 'std', 'var'}
        Which power of the variance is used as weight:
        'sq_root_std' -> var ** 0.25, 'std' -> var ** 0.5, 'var' -> var.
        The corrected weight scales these by alpha ** 0.5, alpha and
        alpha ** 2 respectively.

    Returns
    -------
    pandas.DataFrame
        Columns 'cause', 'value', 'weights' and 'weights_corrected'.

    Raises
    ------
    ValueError
        If `type_weight` is not one of the supported options.
    """
    if type_weight == 'sq_root_std':
        weights = [var_mu[0]**0.25, var_mu[1]**0.25]
        weights_corrected = [alpha[0]**0.5 * var_mu[0]**0.25, alpha[1]**0.5 * var_mu[1]**0.25]
    elif type_weight == 'std':
        weights = [var_mu[0]**0.5, var_mu[1]**0.5]
        weights_corrected = [alpha[0] * var_mu[0]**0.5, alpha[1] * var_mu[1]**0.5]
    elif type_weight == 'var':
        weights = [var_mu[0], var_mu[1]]
        weights_corrected = [alpha[0]**2 * var_mu[0], alpha[1]**2 * var_mu[1]]
    else:
        # Fail with a clear message instead of an unbound-variable error below.
        raise ValueError(
            "type_weight must be one of 'sq_root_std', 'std' or 'var', got "
            + repr(type_weight)
        )
    df_obs = pd.DataFrame({'cause': cause,
                           'value': [mu_hat[0], mu_hat[1]],
                           'weights': weights,
                           'weights_corrected': weights_corrected})
    return df_obs

In [6]:
# Function to run one simulation
def run_one_sim(n_samples, bounds1, bounds2, mu, sigma, type_weight):
    """Simulate two causes once and rake them under three weighting schemes.

    Cause 1 is generated with no logit-scale noise (sigma fixed to 0) and
    sample sizes within `bounds1`; cause 2 uses `sigma` and `bounds2`. The
    margin passed to the raking is 2 * mu.

    Returns the observations data frame with three extra columns holding the
    raked values: 'no_weights', 'without_correction' and 'with_correction'.
    """
    sizes_1, counts_1 = generate_data(n_samples, bounds1, mu, 0)
    sizes_2, counts_2 = generate_data(n_samples, bounds2, mu, sigma)
    estimates = [
        estimate_prevalence(sizes_1, counts_1),
        estimate_prevalence(sizes_2, counts_2),
    ]
    # Regroup [(mu, var, alpha), (mu, var, alpha)] into per-quantity lists.
    rates, variances, corrections = (list(group) for group in zip(*estimates))
    df_obs = create_obs([1, 2], rates, variances, corrections, type_weight)
    df_margin = pd.DataFrame({'value_agg_over_cause': [2 * mu]})

    # (weights column given to the raking, name under which the result is kept)
    raking_runs = [
        (None, 'no_weights'),                      # raking with no weights
        ('weights', 'without_correction'),         # raking with non corrected weights
        ('weights_corrected', 'with_correction'),  # raking with corrected weights
    ]
    for weight_column, result_column in raking_runs:
        # Only the first returned item is used; the other three are discarded.
        df_obs, _, _, _ = run_raking(
            1, df_obs, [df_margin], ['cause'], cov_mat=False, weights=weight_column
        )
        # Rename right away so the next run can produce 'raked_value' again.
        df_obs = df_obs.rename(columns={'raked_value': result_column})
    return df_obs

In [7]:
# Function to run multiple simulations
def run_all_sims(n_samples, bounds1, bounds2, mu, sigma, type_weight, n_sims):
    """Run `n_sims` simulations and stack their results.

    Each simulation's data frame gets a 'sim' column holding its 1-based
    index; the frames are then concatenated in simulation order, keeping the
    row index of each individual frame.
    """
    results = [
        run_one_sim(n_samples, bounds1, bounds2, mu, sigma, type_weight).assign(sim=sim_id)
        for sim_id in range(1, n_sims + 1)
    ]
    return pd.concat(results)

Run the simulation with weights equal to the variance.

In [8]:
# Seed the generator so the simulation is reproducible. generate_data reads
# this module-level `rng`, so it must exist before any simulation is run.
rng = np.random.default_rng(0)
# 'var' selects the variance-based weights in create_obs.
df = run_all_sims(n_samples, bounds1, bounds2, mu, sigma, 'var', n_sims)

Figure for the paper.

In [9]:
# Long-format table for the figure: the rows of `df` are stacked once per
# plotted series (initial values, raking without weights, raking with the
# corrected weights). The label column is sized from `df` itself rather than
# from `2 * n_sims`, so the cell stays valid if `df` was produced with a
# different number of simulations or causes.
df_plot = pd.concat(
    [
        pd.DataFrame({'Cause': df['cause'].to_numpy(),
                      'Rate': df[column].to_numpy(),
                      'Value': label})
        for label, column in [('Initial', 'value'),
                              ('No wgt.', 'no_weights'),
                              ('Wgt.', 'with_correction')]
    ],
    ignore_index=True,
)

In [10]:
# Both layers share the same data source.
base = alt.Chart(df_plot)
# Box plot of the rates for each series; whiskers span the full min-max range.
points = base.mark_boxplot(extent='min-max').encode(
    x=alt.X('Value:N', title=''),
    y=alt.Y('Rate:Q', scale=alt.Scale(zero=False)),
    color='Value:N',
)
# Dashed horizontal reference line at the true prevalence mu.
rule = base.mark_rule(strokeDash=[2, 2]).encode(y=alt.datum(mu))
# Layer the rule under the box plots, then draw one panel per cause.
chart = (
    (rule + points)
    .properties(width=200, height=200)
    .facet(
        column=alt.Column(
            'Cause:N',
            header=alt.Header(labelFontSize=16, titleFontSize=20),
        )
    )
    .configure_axis(titleFontSize=20, labelFontSize=16)
    .configure_legend(titleFontSize=20, labelFontSize=16, labelLimit=0)
)

In [11]:
# Write the figure to an SVG file (path is relative to the working directory),
# then leave `chart` as the last expression so the notebook displays it.
chart.save('simulation_weights_var.svg')
chart