# Raking in 2D

This notebook explains how to use the raking method on a small 2D example.

In [1]:
# Python modules
import altair as alt
import numpy as np
import pandas as pd
from scipy.stats import norm

Modify this to see how many samples you need to get a good estimate of the mean and the covariance of the raked values with the Monte Carlo method.

In [2]:
# Tunable parameters
N = 100_000  # Monte Carlo sample size used to estimate the raked-value statistics

## Generating the data

We first begin by generating a balanced $3 \times 5$ matrix. We compute the corresponding margins and then we add noise to the data. This way we have margins that sum correctly (thus the raking problem indeed has a solution), but we avoid having observations that already match the margins before the raking. 

In [3]:
# Cell labels of the 3 x 5 table listed column by column (X1 varies fastest),
# and the names used to label the seven margins
x1 = [1, 2, 3] * 5
x2 = [level for level in range(1, 6) for _ in range(3)]
margins = ['s1_{}'.format(k) for k in range(1, 6)] + ['s2_1', 's2_2']

# Draw the balanced table from a seeded generator so that the run is reproducible
rng = np.random.default_rng(0)
beta_ij = rng.uniform(low=2.0, high=3.0, size=(3, 5))

# Margins of the balanced table: sums along axis 0, then sums along axis 1
s1 = beta_ij.sum(axis=0)
s2 = beta_ij.sum(axis=1)

# Perturb the column-major flattened table with Gaussian noise (sd 0.1) so the
# observations no longer match the margins
noise = rng.normal(0.0, 0.1, size=15)
mean = beta_ij.flatten(order='F') + noise

We then generate samples for the observations in order to be able to compare the two methods of uncertainty propagation (Monte Carlo simulation or combination of Delta method and Implicit Function Theorem).

In [4]:
# Covariance of the observations: every pair shares a covariance of 0.01 and
# the variances grow linearly from 0.01 to 0.15 along the diagonal.
# np.linspace pins the number of diagonal entries to exactly 15; a float-step
# np.arange can gain or lose an element through rounding, and np.fill_diagonal
# would silently accept an array of the wrong length.
cov = 0.01 * np.ones((15, 15))
np.fill_diagonal(cov, np.linspace(0.01, 0.15, 15))

# Draw N observation vectors from the multivariate normal N(mean, cov)
y = rng.multivariate_normal(mean, cov, N)

## Applying the raking procedure to the true expectancy of the observations

We start by computing the constraint matrix and the margin vector.

In [5]:
from raking.compute_constraints import constraints_2D

In [6]:
# Constraint matrix A and margin vector s for the 3 x 5 table
A, s = constraints_2D(s1, s2, 3, 5)

We then rake the true expectancy of the observations using the $\chi^2$ distance.

In [7]:
from raking.raking_methods import raking_chi2

In [8]:
# Rake the noisy table; lambda_k is kept because compute_gradient takes it as input
beta_0, lambda_k = raking_chi2(mean, A, s)

We can verify that the solution of the raking problem satisfies the constraints:

$A \beta_0 = s$

In [9]:
# The raked values must reproduce the margins up to floating-point tolerance
np.allclose(A @ beta_0, s)

True

We store the results into a dataframe for plotting.

In [10]:
# One row per cell of the table: its labels, the noisy observation and the raked value
df_raked = pd.DataFrame({
    'X1': x1,
    'X2': x2,
    'observations': mean,
    'raked_values': beta_0,
})

## Propagate the uncertainties using Delta method and IFT

We start by computing the gradient of the raked values with respect to the observations $y$: $\frac{\partial \beta}{\partial y}$ and the gradient of the raked values with respect to the margins $s$: $\frac{\partial \beta}{\partial s}$ using the Implicit Function Theorem.

In [11]:
from raking.uncertainty_methods import compute_gradient

In [12]:
# Gradients of the raked values with respect to the observations (Dphi_y)
# and with respect to the margins (Dphi_s)
Dphi_y, Dphi_s = compute_gradient(beta_0, lambda_k, mean, A, 'chi2')

We store the results into a dataframe for plotting.

In [13]:
# For every raked value i, build one long-format frame holding row i of each
# gradient, then stack the per-value frames (their indices are left as they are)
n_cells = len(x1)
n_margins = len(margins)
frames_y = []
frames_s = []
for i, (raked_1, raked_2) in enumerate(zip(x1, x2)):
    frames_y.append(pd.DataFrame({
        'raked_1': np.repeat(raked_1, n_cells),
        'raked_2': np.repeat(raked_2, n_cells),
        'X1': x1,
        'X2': x2,
        'grad_y': Dphi_y[i, :],
    }))
    frames_s.append(pd.DataFrame({
        'raked_1': np.repeat(raked_1, n_margins),
        'raked_2': np.repeat(raked_2, n_margins),
        'margins': margins,
        'grad_s': Dphi_s[i, :],
    }))
df_y = pd.concat(frames_y)
df_s = pd.concat(frames_s)

Then we compute the covariance matrix of the raked values using the Delta method.

In [14]:
from raking.uncertainty_methods import compute_covariance

In [15]:
# Delta-method covariance of the raked values. The last two arguments are
# all-zero matrices of shape (7, 7) and (15, 7) -- presumably the covariance of
# the margins and the observation/margin cross-covariance; confirm against
# compute_covariance.
zeros_margins = np.zeros((7, 7))
zeros_cross = np.zeros((15, 7))
covariance_0 = compute_covariance(Dphi_y, Dphi_s, cov, zeros_margins, zeros_cross)

## Monte Carlo simulation

We first apply the raking procedure to each sample.

In [16]:
# Rake every Monte Carlo draw. The dual variable returned for each draw is
# bound to its own name so that lambda_k -- which belongs to the raked mean and
# is an input of compute_gradient -- is not overwritten by the last draw.
beta = np.zeros((N, 15))
for n in range(N):
    beta_n, lambda_n = raking_chi2(y[n, :], A, s)
    beta[n, :] = beta_n

We compute the mean of the raked values.

In [17]:
# Average over the draws (axis 0), one value per cell of the table
mean_samples = beta.mean(axis=0)

We compute the covariance of the raked values.

In [18]:
# Unbiased sample covariance of the raked draws. Rows of beta are draws and
# columns are variables, hence rowvar=False; np.cov normalises by N - 1 and
# centres the data once instead of building the centred matrix twice.
covariance_samples = np.cov(beta, rowvar=False)

We also want to verify how well the delta method and the Monte Carlo simulation approximate the expectancy and the covariance of the raked values. We assume that if we do 1000000 simulations, we approximate well the true solution. We first generate 1000000 samples.

In [19]:
# Reference draws: one million observation vectors from the same distribution
y_ref = rng.multivariate_normal(mean, cov, size=1_000_000)

We then apply the raking procedure to each sample.

In [20]:
# Rake every reference draw. The number of draws is read from y_ref rather than
# repeated as a literal, and the per-draw dual variable gets its own name so
# that lambda_k (the dual of the raked mean, used by compute_gradient) is
# left untouched.
n_ref = y_ref.shape[0]
beta_ref = np.zeros((n_ref, 15))
for n in range(n_ref):
    beta_ref_n, lambda_ref_n = raking_chi2(y_ref[n, :], A, s)
    beta_ref[n, :] = beta_ref_n

We compute the mean of the raked values.

In [21]:
# Average over the reference draws (axis 0), one value per cell of the table
mean_ref = beta_ref.mean(axis=0)

We compute the covariance of the raked values.

In [22]:
# Unbiased sample covariance of the reference draws. Rows of beta_ref are draws
# and columns are variables, hence rowvar=False; np.cov takes the number of
# draws from the array itself and avoids materialising the centred
# (draws x 15) matrix twice.
covariance_ref = np.cov(beta_ref, rowvar=False)

## Plots

We gather initial values and corresponding raked values, with their associated variances.

In [23]:
# Variances attached to the initial observations (diagonal of their covariance
# matrix) and to the raked values (diagonal of the Delta-method covariance)
initial = pd.DataFrame({'X1': x1, 'X2': x2, 'variance': np.diag(cov)})
variance = pd.DataFrame({'X1': x1, 'X2': x2, 'variance': np.diag(covariance_0)})

# Both halves are rebuilt from mean and beta_0 rather than from the previous
# content of df_raked, so this cell can be re-run without failing on columns
# that an earlier run already dropped or renamed.
df_obs = pd.DataFrame({'X1': x1, 'X2': x2, 'Value': mean, 'Type': 'Initial', 'width': 1})
df_obs = df_obs.merge(initial, how='inner', on=['X1', 'X2'])
df_raked_only = pd.DataFrame({'X1': x1, 'X2': x2, 'Value': beta_0, 'Type': 'Raked', 'width': 2})
df_raked_only = df_raked_only.merge(variance, how='inner', on=['X1', 'X2'])

# Stack initial and raked rows and add the bounds at plus or minus one
# standard deviation used by the error bars
df_raked = pd.concat([df_obs, df_raked_only])
df_raked['Upper'] = df_raked['Value'] + np.sqrt(df_raked['variance'])
df_raked['Lower'] = df_raked['Value'] - np.sqrt(df_raked['variance'])

We plot initial values and raked values with their uncertainties (plus or minus the standard deviation).

In [24]:
# Error bars from Lower to Upper, one row per X1 level
bar = alt.Chart(df_raked).mark_errorbar(clip=True, opacity=0.5).encode(
    x=alt.X('Upper:Q', scale=alt.Scale(zero=False), axis=alt.Axis(title='Raked value')),
    x2=alt.X2('Lower:Q'),
    y=alt.Y('X1:N', axis=alt.Axis(title='X1')),
    color=alt.Color('Type:N', legend=None),
    strokeWidth=alt.StrokeWidth('width:Q', legend=None),
)

# Markers for the initial and raked values themselves
point = alt.Chart(df_raked).mark_point(filled=True).encode(
    x=alt.X('Value:Q'),
    y=alt.Y('X1:N'),
    color=alt.Color('Type:N'),
    shape=alt.Shape('Type:N'),
)

# Overlay both layers and draw one panel per X2 level
layered = alt.layer(point, bar).resolve_scale(shape='independent', color='independent')
x2_header = alt.Header(title='X2', titleFontSize=24, labelFontSize=24)
chart = (
    layered
    .facet(column=alt.Column('X2:N', header=x2_header))
    .configure_axis(labelFontSize=24, titleFontSize=24)
    .configure_legend(labelFontSize=24, titleFontSize=24)
)
chart

In [25]:
#chart.save('synthetics_raked_values_1.svg')

In [26]:
# Error bars from Lower to Upper, one row per X2 level
bar = alt.Chart(df_raked).mark_errorbar(clip=True, opacity=0.5).encode(
    x=alt.X('Upper:Q', scale=alt.Scale(zero=False), axis=alt.Axis(title='Raked value')),
    x2=alt.X2('Lower:Q'),
    y=alt.Y('X2:N', axis=alt.Axis(title='X2')),
    color=alt.Color('Type:N', legend=None),
    strokeWidth=alt.StrokeWidth('width:Q', legend=None),
)

# Markers for the initial and raked values themselves
point = alt.Chart(df_raked).mark_point(filled=True).encode(
    x=alt.X('Value:Q'),
    y=alt.Y('X2:N'),
    color=alt.Color('Type:N'),
    shape=alt.Shape('Type:N'),
)

# Overlay both layers and draw one panel per X1 level
layered = alt.layer(point, bar).resolve_scale(shape='independent', color='independent')
x1_header = alt.Header(title='X1', titleFontSize=24, labelFontSize=24)
chart = (
    layered
    .facet(column=alt.Column('X1:N', header=x1_header))
    .configure_axis(labelFontSize=24, titleFontSize=24)
    .configure_legend(labelFontSize=24, titleFontSize=24)
)
chart

In [27]:
#chart.save('synthetics_raked_values_2.svg')

Now we take one observation ($X_1 = 3$ and $X_2 = 4$) and we look at how it will influence the raked values.

In [28]:
# Locate the largest absolute gradient by position. The index of df_y repeats
# 0..14 for every raked value, so the lookup goes through a plain NumPy array
# and .iloc rather than depending on how np.argmax dispatches on a Series.
index = np.argmax(np.abs(df_y['grad_y'].to_numpy()))
selected = df_y.iloc[index]
index_var1 = selected['X1']
index_var2 = selected['X2']

# Keep the rows of that single observation: its effect on every raked value
df_y_loc = df_y.loc[(df_y.X1 == index_var1) & (df_y.X2 == index_var2)]
# Symmetric colour range so that a zero gradient sits at the centre of the scheme
max_scale = df_y_loc['grad_y'].abs().max()

base = alt.Chart(df_y_loc).encode(
    x=alt.X('raked_1:N', axis=alt.Axis(title='X1')),
    y=alt.Y('raked_2:N', axis=alt.Axis(title='X2')),
)

heatmap = base.mark_rect().encode(
    color=alt.Color(
        'grad_y:Q',
        scale=alt.Scale(scheme='redblue', domain=[-max_scale, max_scale], reverse=True),
        legend=alt.Legend(title=['Effect of', 'one obs.']),
    ),
)

# Print the gradient value inside each tile
text = base.mark_text(baseline='middle').encode(
    text=alt.Text('grad_y:Q', format='.2f'),
)

chart = (
    alt.layer(heatmap, text)
    .properties(width=120, height=180)
    .configure_title(fontSize=12)
    .configure_axis(labelFontSize=12, titleFontSize=12)
    .configure_legend(labelFontSize=10, titleFontSize=10)
)
chart

In [29]:
#chart.save('synthetics_most_important_initial_value.svg')

Now we take one raked value ($X_1 = 3$ and $X_2 = 4$) and we look at how it is influenced by the observations.

In [30]:
# Locate the largest absolute gradient by position. The index of df_y repeats
# 0..14 for every raked value, so the lookup goes through a plain NumPy array
# and .iloc rather than depending on how np.argmax dispatches on a Series.
index = np.argmax(np.abs(df_y['grad_y'].to_numpy()))
selected = df_y.iloc[index]
index_raked_1 = selected['raked_1']
index_raked_2 = selected['raked_2']

# Keep the rows of that single raked value: the effect of every observation on it
df_y_loc = df_y.loc[(df_y.raked_1 == index_raked_1) & (df_y.raked_2 == index_raked_2)]
# Symmetric colour range so that a zero gradient sits at the centre of the scheme
max_scale = df_y_loc['grad_y'].abs().max()

base = alt.Chart(df_y_loc).encode(
    x=alt.X('X1:N', axis=alt.Axis(title='X1')),
    y=alt.Y('X2:N', axis=alt.Axis(title='X2')),
)

heatmap = base.mark_rect().encode(
    color=alt.Color(
        'grad_y:Q',
        scale=alt.Scale(scheme='redblue', domain=[-max_scale, max_scale], reverse=True),
        legend=alt.Legend(title=['Effect of', 'all obs.']),
    ),
)

# Print the gradient value inside each tile
text = base.mark_text(baseline='middle').encode(
    text=alt.Text('grad_y:Q', format='.2f'),
)

chart = (
    alt.layer(heatmap, text)
    .properties(width=120, height=180)
    .configure_title(fontSize=12)
    .configure_axis(labelFontSize=12, titleFontSize=12)
    .configure_legend(labelFontSize=10, titleFontSize=10)
)
chart

In [31]:
#chart.save('synthetics_most_important_raked_value.svg')

We compare the raked value of the mean of the observations with the mean of the raked values for each sample of observations.

In [32]:
# Point estimates from the three approaches, one column each, keyed by cell
raked_mean = df_raked.loc[df_raked['Type'] == 'Raked', 'Value']
delta_method = pd.DataFrame({'X1': x1, 'X2': x2, 'delta_method': raked_mean})
monte_carlo = pd.DataFrame({'X1': x1, 'X2': x2, 'monte_carlo': mean_samples})
reference = pd.DataFrame({'X1': x1, 'X2': x2, 'reference': mean_ref})
df = (
    delta_method
    .merge(monte_carlo, how='inner', on=['X1', 'X2'])
    .merge(reference, how='inner', on=['X1', 'X2'])
)

# Fixed axis range shared by both axes of both panels; to fit the data instead,
# take the min and max over the delta_method, monte_carlo and reference columns
min_x = 1.8
max_x = 3.0
axis_scale = alt.Scale(domain=[min_x, max_x], zero=False)

# Encodings common to both scatter plots; each panel only adds its own y channel
scatter_base = alt.Chart(df).mark_point(size=60).encode(
    x=alt.X('reference:Q', scale=axis_scale, axis=alt.Axis(title='Reference')),
    color=alt.Color('X1:N', legend=alt.Legend(title='X1')),
    shape=alt.Shape('X2:N', legend=alt.Legend(title='X2')),
)
points_delta = scatter_base.encode(
    y=alt.Y('delta_method:Q', scale=axis_scale, axis=alt.Axis(title='Delta method and IFT')),
)
points_samples = scatter_base.encode(
    y=alt.Y('monte_carlo:Q', scale=axis_scale, axis=alt.Axis(title='Monte Carlo simulation')),
)

# Dashed rule drawn corner to corner in pixel coordinates; with identical x and
# y domains it marks where both estimates agree
diagonal = alt.Chart().mark_rule(strokeDash=[8, 8]).encode(
    x=alt.value(0),
    x2=alt.value('width'),
    y=alt.value('height'),
    y2=alt.value(0),
)

chart_delta = alt.layer(diagonal, points_delta)
chart_samples = alt.layer(diagonal, points_samples)

chart = (
    alt.hconcat(chart_delta, chart_samples)
    .properties(title=str(N) + ' samples')
    .configure_title(anchor='middle', fontSize=24)
    .configure_axis(labelFontSize=18, titleFontSize=18)
)
chart

In [33]:
#chart.save('mean_comparison_' + str(N) + 'samples.svg')

We compare the variances of the raked values obtained with the Delta method (the diagonal of the covariance matrix) with the sample variances of the raked values computed from the Monte Carlo draws.

In [34]:
# Variances (covariance diagonals) from the three approaches, keyed by cell
delta_method = pd.DataFrame({'X1': x1, 'X2': x2, 'delta_method': np.diag(covariance_0)})
monte_carlo = pd.DataFrame({'X1': x1, 'X2': x2, 'monte_carlo': np.diag(covariance_samples)})
reference = pd.DataFrame({'X1': x1, 'X2': x2, 'reference': np.diag(covariance_ref)})
df = (
    delta_method
    .merge(monte_carlo, how='inner', on=['X1', 'X2'])
    .merge(reference, how='inner', on=['X1', 'X2'])
)

# Fixed axis range shared by both axes of both panels; to fit the data instead,
# take the min and max over the delta_method, monte_carlo and reference columns
min_x = 0.01
max_x = 0.07
axis_scale = alt.Scale(domain=[min_x, max_x], zero=False)

# Encodings common to both scatter plots; each panel only adds its own y channel
scatter_base = alt.Chart(df).mark_point(size=60).encode(
    x=alt.X('reference:Q', scale=axis_scale, axis=alt.Axis(title='Sample variance (1000000 draws)')),
    color=alt.Color('X1:N', legend=alt.Legend(title='X1')),
    shape=alt.Shape('X2:N', legend=alt.Legend(title='X2')),
)
points_delta = scatter_base.encode(
    y=alt.Y('delta_method:Q', scale=axis_scale, axis=alt.Axis(title='Variance (Delta method + IFT)')),
)
points_samples = scatter_base.encode(
    y=alt.Y('monte_carlo:Q', scale=axis_scale, axis=alt.Axis(title='Sample variance (' + str(N) + ' draws)')),
)

# Dashed rule drawn corner to corner in pixel coordinates; with identical x and
# y domains it marks where both estimates agree
diagonal = alt.Chart().mark_rule(strokeDash=[8, 8]).encode(
    x=alt.value(0),
    x2=alt.value('width'),
    y=alt.value('height'),
    y2=alt.value(0),
)

chart_delta = alt.layer(diagonal, points_delta)
chart_samples = alt.layer(diagonal, points_samples)

chart = (
    alt.hconcat(chart_delta, chart_samples)
    .properties(title=str(N) + ' samples')
    .configure_title(anchor='middle', fontSize=24)
    .configure_axis(labelFontSize=18, titleFontSize=18)
)
chart

In [35]:
# Export the variance comparison figure as an SVG whose name carries the number of draws
chart.save(f'variance_comparison_{N}draws.svg')

We compare the 2.5% percentiles for both methods.

In [36]:
# 2.5% bound from the three approaches: Gaussian quantile around the raked mean
# for the Delta method, empirical quantile of the draws for both Monte Carlo runs
raked_mean = df_raked.loc[df_raked['Type'] == 'Raked', 'Value']
raked_sd = np.sqrt(np.diag(covariance_0))
delta_method = pd.DataFrame({
    'X1': x1,
    'X2': x2,
    'delta_method': raked_mean + raked_sd * norm.ppf(0.025, 0, 1),
})
monte_carlo = pd.DataFrame({'X1': x1, 'X2': x2, 'monte_carlo': np.quantile(beta, 0.025, axis=0)})
reference = pd.DataFrame({'X1': x1, 'X2': x2, 'reference': np.quantile(beta_ref, 0.025, axis=0)})
df = (
    delta_method
    .merge(monte_carlo, how='inner', on=['X1', 'X2'])
    .merge(reference, how='inner', on=['X1', 'X2'])
)

# Fixed axis range shared by both axes of both panels; to fit the data instead,
# take the min and max over the delta_method, monte_carlo and reference columns
min_x = 1.2
max_x = 2.8
axis_scale = alt.Scale(domain=[min_x, max_x], zero=False)

# Encodings common to both scatter plots; each panel only adds its own y channel
scatter_base = alt.Chart(df).mark_point(size=60).encode(
    x=alt.X('reference:Q', scale=axis_scale, axis=alt.Axis(title='Reference')),
    color=alt.Color('X1:N', legend=alt.Legend(title='X1')),
    shape=alt.Shape('X2:N', legend=alt.Legend(title='X2')),
)
points_delta = scatter_base.encode(
    y=alt.Y('delta_method:Q', scale=axis_scale, axis=alt.Axis(title='Delta method and IFT')),
)
points_samples = scatter_base.encode(
    y=alt.Y('monte_carlo:Q', scale=axis_scale, axis=alt.Axis(title='Monte Carlo simulation')),
)

# Dashed rule drawn corner to corner in pixel coordinates; with identical x and
# y domains it marks where both estimates agree
diagonal = alt.Chart().mark_rule(strokeDash=[8, 8]).encode(
    x=alt.value(0),
    x2=alt.value('width'),
    y=alt.value('height'),
    y2=alt.value(0),
)

chart_delta = alt.layer(diagonal, points_delta)
chart_samples = alt.layer(diagonal, points_samples)

chart = (
    alt.hconcat(chart_delta, chart_samples)
    .properties(title=str(N) + ' samples')
    .configure_title(anchor='middle', fontSize=24)
    .configure_axis(labelFontSize=18, titleFontSize=18)
)
chart

In [37]:
#chart.save('lower_interval_comparison_' + str(N) + 'samples.svg')

We compare the 97.5% percentiles for both methods.

In [38]:
# 97.5% bound from the three approaches: Gaussian quantile around the raked mean
# for the Delta method, empirical quantile of the draws for both Monte Carlo runs
raked_mean = df_raked.loc[df_raked['Type'] == 'Raked', 'Value']
raked_sd = np.sqrt(np.diag(covariance_0))
delta_method = pd.DataFrame({
    'X1': x1,
    'X2': x2,
    'delta_method': raked_mean + raked_sd * norm.ppf(0.975, 0, 1),
})
monte_carlo = pd.DataFrame({'X1': x1, 'X2': x2, 'monte_carlo': np.quantile(beta, 0.975, axis=0)})
reference = pd.DataFrame({'X1': x1, 'X2': x2, 'reference': np.quantile(beta_ref, 0.975, axis=0)})
df = (
    delta_method
    .merge(monte_carlo, how='inner', on=['X1', 'X2'])
    .merge(reference, how='inner', on=['X1', 'X2'])
)

# Fixed axis range shared by both axes of both panels; to fit the data instead,
# take the min and max over the delta_method, monte_carlo and reference columns
min_x = 2.2
max_x = 3.5
axis_scale = alt.Scale(domain=[min_x, max_x], zero=False)

# Encodings common to both scatter plots; each panel only adds its own y channel
scatter_base = alt.Chart(df).mark_point(size=60).encode(
    x=alt.X('reference:Q', scale=axis_scale, axis=alt.Axis(title='Reference')),
    color=alt.Color('X1:N', legend=alt.Legend(title='X1')),
    shape=alt.Shape('X2:N', legend=alt.Legend(title='X2')),
)
points_delta = scatter_base.encode(
    y=alt.Y('delta_method:Q', scale=axis_scale, axis=alt.Axis(title='Delta method and IFT')),
)
points_samples = scatter_base.encode(
    y=alt.Y('monte_carlo:Q', scale=axis_scale, axis=alt.Axis(title='Monte Carlo simulation')),
)

# Dashed rule drawn corner to corner in pixel coordinates; with identical x and
# y domains it marks where both estimates agree
diagonal = alt.Chart().mark_rule(strokeDash=[8, 8]).encode(
    x=alt.value(0),
    x2=alt.value('width'),
    y=alt.value('height'),
    y2=alt.value(0),
)

chart_delta = alt.layer(diagonal, points_delta)
chart_samples = alt.layer(diagonal, points_samples)

chart = (
    alt.hconcat(chart_delta, chart_samples)
    .properties(title=str(N) + ' samples')
    .configure_title(anchor='middle', fontSize=24)
    .configure_axis(labelFontSize=18, titleFontSize=18)
)
chart

In [39]:
#chart.save('upper_interval_comparison_' + str(N) + 'samples.svg')