# Tutorial for the core API

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from copy import deepcopy

## Setup parameters

In [2]:
from rashomon import hasse

There are 3 features
- Feature 1 takes on four values, {0, 1, 2, 3}
- Feature 2 takes on three values, {0, 1, 2}
- Feature 3 takes on three values, {0, 1, 2}

In [3]:
M = 3
R = np.array([4, 3, 3])

First, we find all the profiles corresponding to this setup. For the profiles, only the number of features matters.

In [4]:
num_profiles = 2**M
profiles, profile_map = hasse.enumerate_profiles(M)

print("Profiles")
print(profiles)

print("\nMap from each profile tuple to its index in `profiles` list")
print(profile_map)

Profiles
[(0, 0, 0), (0, 0, 1), (0, 1, 0), (0, 1, 1), (1, 0, 0), (1, 0, 1), (1, 1, 0), (1, 1, 1)]

Map from each profile tuple to its index in `profiles` list
{(0, 0, 0): 0, (0, 0, 1): 1, (0, 1, 0): 2, (0, 1, 1): 3, (1, 0, 0): 4, (1, 0, 1): 5, (1, 1, 0): 6, (1, 1, 1): 7}


Next, we find all the possible feature combinations (i.e., policies) in our example.

In [5]:
all_policies = hasse.enumerate_policies(M, R)
num_policies = len(all_policies)

print(f"All {num_policies} policies")
print(all_policies)

All 36 policies
[(0, 0, 0), (0, 0, 1), (0, 0, 2), (0, 1, 0), (0, 1, 1), (0, 1, 2), (0, 2, 0), (0, 2, 1), (0, 2, 2), (1, 0, 0), (1, 0, 1), (1, 0, 2), (1, 1, 0), (1, 1, 1), (1, 1, 2), (1, 2, 0), (1, 2, 1), (1, 2, 2), (2, 0, 0), (2, 0, 1), (2, 0, 2), (2, 1, 0), (2, 1, 1), (2, 1, 2), (2, 2, 0), (2, 2, 1), (2, 2, 2), (3, 0, 0), (3, 0, 1), (3, 0, 2), (3, 1, 0), (3, 1, 1), (3, 1, 2), (3, 2, 0), (3, 2, 1), (3, 2, 2)]


## Partition matrix

In [6]:
from rashomon import extract_pools

We will look only at the (1, 1, 1) profile for the purpose of illustration.

In [7]:
# Keep only the policies in which every feature is at a non-zero level
policies_111 = [policy for policy in all_policies if all(level > 0 for level in policy)]
policies_111

[(1, 1, 1),
 (1, 1, 2),
 (1, 2, 1),
 (1, 2, 2),
 (2, 1, 1),
 (2, 1, 2),
 (2, 2, 1),
 (2, 2, 2),
 (3, 1, 1),
 (3, 1, 2),
 (3, 2, 1),
 (3, 2, 2)]

Let us say that the partition is as follows:
- $\pi_1$ = {(1, 1, 1), (1, 2, 1)}
- $\pi_2$ = {(1, 1, 2), (1, 2, 2)}
- $\pi_3$ = {(2, 1, 1), (2, 2, 1), (3, 1, 1), (3, 2, 1)}
- $\pi_4$ = {(2, 1, 2), (2, 2, 2), (3, 1, 2), (3, 2, 2)}

This corresponds to the following $\Sigma$ matrix. An `np.inf` entry is padding: it marks a position for which that feature has no corresponding factor levels.

In [8]:
# Partition matrix for profile (1, 1, 1); np.inf pads the shorter rows
sigma_111 = np.array([
    [0, 1],
    [1, np.inf],
    [0, np.inf],
])
sigma_111

array([[ 0.,  1.],
       [ 1., inf],
       [ 0., inf]])

This is how we extract the pools from the matrix.

In [9]:
pi_pools, pi_policies = extract_pools.extract_pools(policies_111, sigma_111)

`pi_pools` is a dictionary that maps each pool index to a list of _indices_ of feature combinations in that pool

In [10]:
pi_pools

{0: [0, 2], 1: [1, 3], 2: [4, 6, 8, 10], 3: [5, 7, 9, 11]}

`pi_policies` is a dictionary that maps each feature combination (through its index) to the index of the pool it belongs to

In [11]:
pi_policies

{0: 0, 2: 0, 1: 1, 3: 1, 4: 2, 6: 2, 8: 2, 10: 2, 5: 3, 7: 3, 9: 3, 11: 3}

`extract_pools` also has an optional argument `lattice_edges` through which you can provide the edges of the Hasse diagram. If you call `extract_pools` on the same Hasse diagram many times, it is more efficient to pre-compute the lattice edges once and pass them in through this argument.

In [12]:
hasse_edges = extract_pools.lattice_edges(policies_111)

pi_pools, pi_policies = extract_pools.extract_pools(policies_111, sigma_111, lattice_edges=hasse_edges)

## Generate data

Since there are 4 pools, we only need to select 4 distributions for the outcome. For simplicity, say the outcomes come from $N(\mu_{\pi}, \sigma_{\pi}^2)$ with the following parameters for each pool

In [13]:
mu_111 = np.array([0, 2, 4, -2])
var_111 = np.array([1, 1, 1, 1])

Fix 50 samples per feature combination and generate the data.

In [14]:
np.random.seed(3)

num_samples_per_feature = 50
num_data = len(policies_111) * num_samples_per_feature

X = np.zeros(shape=(num_data, M))
D = np.zeros(shape=(num_data, 1), dtype='int_')
y = np.zeros(shape=(num_data, 1))

# Each feature combination owns one contiguous block of
# `num_samples_per_feature` rows, in the order of `policies_111`.
for k, feature in enumerate(policies_111):
    pool_id = pi_policies[k]
    mu_i = mu_111[pool_id]
    # `np.random.normal` expects the standard deviation, not the variance
    sd_i = np.sqrt(var_111[pool_id])
    y_i = np.random.normal(mu_i, sd_i, size=(num_samples_per_feature, 1))

    start_idx = k * num_samples_per_feature
    end_idx = start_idx + num_samples_per_feature

    X[start_idx:end_idx, ] = feature
    D[start_idx:end_idx, ] = k
    y[start_idx:end_idx, ] = y_i

`X` is the feature matrix.

`D` tells us the feature-combination indices, i.e., `D[i, 0]` is the index (into `policies_111`) of the feature combination in row `X[i, ]`.

`y` is the outcome vector

In [15]:
print(X[:10,])

print(D[:10])

print(y[:10])

[[1. 1. 1.]
 [1. 1. 1.]
 [1. 1. 1.]
 [1. 1. 1.]
 [1. 1. 1.]
 [1. 1. 1.]
 [1. 1. 1.]
 [1. 1. 1.]
 [1. 1. 1.]
 [1. 1. 1.]]
[[0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]]
[[ 1.78862847]
 [ 0.43650985]
 [ 0.09649747]
 [-1.8634927 ]
 [-0.2773882 ]
 [-0.35475898]
 [-0.08274148]
 [-0.62700068]
 [-0.04381817]
 [-0.47721803]]


We can calculate the mean outcome of each feature combination through the following object. The first column contains the sum of outcomes for each feature combination and the second column contains the count, so dividing the first column by the second gives the average. We keep the sums and counts separately for internal computation purposes.

In [16]:
from rashomon import loss

policy_means_111 = loss.compute_policy_means(D, y, len(policies_111))
print(policy_means_111)

[[ -14.696341     50.        ]
 [ 103.83263356   50.        ]
 [   1.45837656   50.        ]
 [  95.24720387   50.        ]
 [ 221.50472005   50.        ]
 [ -92.80830557   50.        ]
 [ 189.57330566   50.        ]
 [-107.31751574   50.        ]
 [ 209.98734846   50.        ]
 [-103.75982477   50.        ]
 [ 200.56828936   50.        ]
 [ -99.03931665   50.        ]]


## Finding the Rashomon set for profile (1, 1, 1)

In [17]:
from rashomon import aggregate

%load_ext autoreload
%autoreload 2

Let us set the maximum number of pools to be $H = \infty$ and the Rashomon threshold to be $\theta = 8$ and regularization $\lambda = 1$

In [18]:
H = np.inf
theta = 8
lamb = 1

RPS_111 = aggregate.RAggregate_profile(M, R, H, D, y, theta, profile=(1, 1, 1), reg=lamb)

The output of `RAggregate_profile` is an object of type `RashomonSet`. Here are some useful things we can do with this. Observe that the true partition `sigma_111` is the second partition in the RPS and has the least loss.

In [19]:
# Show the partition matrix for each member of the RPS
for sig in RPS_111.sigma:
    print(sig)

# Count the number of pools in each Rashomon partition
print(RPS_111.pools)

[[ 1.  1.]
 [ 1. inf]
 [ 0. inf]]
[[ 0.  1.]
 [ 1. inf]
 [ 0. inf]]
[[ 1.  0.]
 [ 1. inf]
 [ 0. inf]]
[[ 0.  0.]
 [ 1. inf]
 [ 0. inf]]
[2. 4. 4. 6.]


In [20]:
# Calculate the loss
# This needs to be done only when calling `RAggregate_profile`
# When calling the main function `RAggregate`, loss is automatically calculated
RPS_111.calculate_loss(D, y, policies_111, policy_means_111, reg=lamb)

# Print the loss for each member in the RPS
print(RPS_111.loss)

[6.8207948  5.04016569 7.86851401 7.04010387]


Additionally, there is an internal function that manually checks every single partition to see if it belongs to the RPS.

In [21]:
RPS_111_brute_force = aggregate._brute_RAggregate_profile(M, R, H, D, y, theta, profile=(1, 1, 1), reg=lamb)

# Verify that the brute force computation matches the branch-and-bound algorithm
RPS_111_brute_force.P_hash == RPS_111.P_hash

True

## For all profiles

Fix the partition matrices and outcome parameters for all other profiles

In [22]:
# Ground-truth partition matrix (`sigma_*`), per-pool outcome means (`mu_*`)
# and per-pool outcome variances (`var_*`) for every profile other than
# (1, 1, 1), whose parameters were fixed earlier in the notebook.
# NOTE(review): judging by the (1, 1, 1) example, a 0 entry splits two
# adjacent levels, a 1 pools them, and np.inf is padding -- confirm.

# Profile (0, 0, 0)
sigma_000 = None
mu_000 = np.array([0])
var_000 = np.array([1])

# Profile (0, 0, 1)
sigma_001 = np.array([[1]])
mu_001 = np.array([-2])
var_001 = np.array([1])

# Profile (0, 1, 0)
sigma_010 = np.array([[1]])
mu_010 = np.array([1])
var_010 = np.array([1])

# Profile (0, 1, 1)
sigma_011 = np.array([[1], [0]])
mu_011 = np.array([1, -2])
var_011 = np.array([1, 1])

# Profile (1, 0, 0)
sigma_100 = np.array([[0, 1]])
mu_100 = np.array([0, 2])
var_100 = np.array([1, 1])

# Profile (1, 0, 1)
sigma_101 = np.array([[0, 1], [0, np.inf]])
mu_101 = np.array([0, 2, 1, -2])
var_101 = np.array([1, 1, 1, 1])

# Profile (1, 1, 0)
sigma_110 = np.array([[0, 1], [1, np.inf]])
mu_110 = np.array([0, -2])
var_110 = np.array([1, 1])

# Collect the parameters in lists ordered like `profiles`, so that entry k
# of each list belongs to profiles[k].
sigma = [sigma_000, sigma_001, sigma_010, sigma_011, sigma_100, sigma_101, sigma_110, sigma_111]
mu = [mu_000, mu_001, mu_010, mu_011, mu_100, mu_101, mu_110, mu_111]
var = [var_000, var_001, var_010, var_011, var_100, var_101, var_110, var_111]

### Find all pools for each profile

This code block does what we did previously for a single profile. Since `extract_pools` only works with indices local to a single Hasse diagram, we need to carefully map the universal indexing of feature combinations across all profiles to the corresponding index within the profile that each one belongs to. This is why this code chunk appears more complicated than it actually is.

In [23]:
policies_profiles = {}
policies_profiles_masked = {}
policies_ids_profiles = {}
pi_policies = {}
pi_pools = {}
for k, profile in enumerate(profiles):

    # Policies of this profile, paired with their index in `all_policies`
    policies_temp = [(i, x) for i, x in enumerate(all_policies) if hasse.policy_to_profile(x) == profile]
    unzipped_temp = list(zip(*policies_temp))
    policies_ids_k = list(unzipped_temp[0])
    policies_k = list(unzipped_temp[1])
    policies_profiles[k] = policies_k
    policies_ids_profiles[k] = policies_ids_k

    # Mask the empty arms: keep only the features that are active in this profile.
    # A new list is built so the unmasked policies above are left untouched.
    profile_mask = list(map(bool, profile))
    policies_masked_k = [
        tuple(pol[i] for i in range(M) if profile_mask[i]) for pol in policies_k
    ]
    policies_profiles_masked[k] = policies_masked_k

    if np.sum(profile) > 0:
        pi_pools_k, pi_policies_k = extract_pools.extract_pools(policies_masked_k, sigma[k])
        if len(pi_pools_k.keys()) != mu[k].shape[0]:
            print(f"Profile {k}. Expected {len(pi_pools_k.keys())} pools. Received {mu[k].shape[0]} means.")
        pi_policies[k] = pi_policies_k
        # pi_pools_k has indices that match with policies_profiles[k].
        # Map those indices back to all_policies via the ids recorded above.
        # The loop names deliberately avoid `x`/`y` so that the notebook-level
        # outcome vector `y` is not overwritten.
        pi_pools[k] = {
            pool_id: [policies_ids_k[i] for i in members]
            for pool_id, members in pi_pools_k.items()
        }
    else:
        # The all-zero profile has no features to partition: a single pool
        pi_policies[k] = {0: 0}
        pi_pools[k] = {0: list(policies_ids_k)}

### Generate data

Again, this repeats what we did for a single profile for all profiles.

In [24]:
def generate_data(mu, var, n_per_pol, all_policies, pi_policies, M, policies_by_profile=None):
    """Simulate normally distributed outcomes for every policy of every profile.

    Parameters
    ----------
    mu, var : sequences indexed by profile id
        ``mu[k][p]`` and ``var[k][p]`` are the mean and the variance of the
        outcomes of pool ``p`` in profile ``k``.
    n_per_pol : int
        Number of samples drawn for each policy.
    all_policies : list of tuples
        Every policy; a policy's position in this list is the id stored in ``D``.
    pi_policies : dict
        ``pi_policies[k][idx]`` is the pool id of the ``idx``-th policy of profile ``k``.
    M : int
        Number of features (columns of ``X``).
    policies_by_profile : dict, optional
        Maps profile id ``k`` to the list of (unmasked) policies in that profile.
        Defaults to the notebook-level ``policies_profiles`` dictionary.

    Returns
    -------
    X : ndarray of shape (len(all_policies) * n_per_pol, M)
        Feature matrix.
    D : ndarray of shape (len(all_policies) * n_per_pol, 1)
        Index into ``all_policies`` of the policy in each row.
    y : ndarray of shape (len(all_policies) * n_per_pol, 1)
        Simulated outcomes.
    """
    if policies_by_profile is None:
        # Backward-compatible default: use the dictionary built earlier in the notebook
        policies_by_profile = policies_profiles

    num_data = len(all_policies) * n_per_pol
    X = np.zeros(shape=(num_data, M))
    D = np.zeros(shape=(num_data, 1), dtype='int_')
    y = np.zeros(shape=(num_data, 1))

    # Build the policy -> index lookup once; the first occurrence wins, which
    # matches a left-to-right search through all_policies.
    policy_index = {}
    for i, policy in enumerate(all_policies):
        policy_index.setdefault(tuple(policy), i)

    idx_ctr = 0
    for k, policies_k in policies_by_profile.items():
        for idx, policy in enumerate(policies_k):
            pool_id = pi_policies[k][idx]
            mu_i = mu[k][pool_id]
            # `np.random.normal` expects the standard deviation, not the variance
            sd_i = np.sqrt(var[k][pool_id])
            y_i = np.random.normal(mu_i, sd_i, size=(n_per_pol, 1))

            start_idx = idx_ctr * n_per_pol
            end_idx = start_idx + n_per_pol

            X[start_idx:end_idx, ] = policy
            D[start_idx:end_idx, ] = policy_index[tuple(policy)]
            y[start_idx:end_idx, ] = y_i

            idx_ctr += 1

    return X, D, y

In [25]:
num_samples_per_feature = 50000

X, D, y = generate_data(mu, var, num_samples_per_feature, all_policies, pi_policies, M)
policy_means = loss.compute_policy_means(D, y, num_policies)

### Finding the Rashomon Set

In [26]:
H = np.inf
theta = 13
lamb = 1

R_set, R_profiles = aggregate.RAggregate(M, R, H, D, y, theta, reg=lamb)

The output of `RAggregate` is different from that of `RAggregate_profile`. For starters, the output is a tuple.

The first item, `R_set`, is a list with one entry per Rashomon partition. Each entry `R_set[i]` is itself a list whose length is the number of profiles, and each element of `R_set[i]` is the index of a partition for the corresponding profile. So `R_set[i][k]` is the index of the partition of the k-th profile in the i-th Rashomon partition in the RPS.

The second item `R_profiles` is a list whose length is the number of profiles. Each item is the `RashomonSet` object that we saw earlier. The indices in `R_set` correspond to the partitions in `R_profiles`. So the actual partition of `R_set[i][k]` is retrieved by accessing `R_profiles[k].sigma[R_set[i][k]]`.

In [27]:
R_set

[[0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 1],
 [0, 0, 0, 0, 0, 0, 0, 2],
 [0, 0, 0, 0, 0, 0, 0, 3],
 [0, 0, 0, 0, 0, 0, 0, 4],
 [0, 0, 0, 0, 0, 0, 0, 5],
 [0, 0, 0, 0, 0, 0, 1, 0],
 [0, 0, 0, 0, 0, 0, 1, 1],
 [0, 0, 0, 0, 0, 0, 2, 0],
 [0, 0, 0, 0, 0, 0, 3, 0],
 [0, 0, 0, 0, 0, 1, 0, 0],
 [0, 0, 0, 0, 0, 1, 0, 1],
 [0, 0, 0, 0, 0, 2, 0, 0],
 [0, 0, 0, 0, 0, 2, 0, 1],
 [0, 0, 0, 0, 0, 3, 0, 0],
 [0, 0, 0, 0, 1, 0, 0, 0],
 [0, 0, 0, 0, 1, 0, 0, 1],
 [0, 0, 0, 0, 2, 0, 0, 0],
 [0, 0, 0, 1, 0, 0, 0, 0],
 [0, 0, 0, 1, 0, 0, 0, 1],
 [0, 0, 0, 2, 0, 0, 0, 0],
 [0, 0, 1, 0, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 0, 0, 0]]

In [28]:
R_profiles

[[None],
 [array([[1.]]), array([[0.]])],
 [array([[1.]]), array([[0.]])],
 [array([[1.],
        [1.]]), array([[1.],
        [0.]]), array([[0.],
        [1.]])],
 [array([[1., 1.]]), array([[0., 1.]]), array([[1., 0.]])],
 [array([[ 1.,  1.],
        [ 1., inf]]), array([[ 0.,  1.],
        [ 1., inf]]), array([[ 1.,  1.],
        [ 0., inf]]), array([[ 1.,  0.],
        [ 1., inf]])],
 [array([[ 1.,  1.],
        [ 1., inf]]), array([[ 0.,  1.],
        [ 1., inf]]), array([[ 1.,  0.],
        [ 1., inf]]), array([[ 1.,  1.],
        [ 0., inf]])],
 [array([[ 1.,  1.],
        [ 1., inf],
        [ 1., inf]]), array([[ 1.,  1.],
        [ 1., inf],
        [ 0., inf]]), array([[ 0.,  1.],
        [ 1., inf],
        [ 0., inf]]), array([[ 0.,  1.],
        [ 1., inf],
        [ 1., inf]]), array([[ 1.,  0.],
        [ 1., inf],
        [ 1., inf]]), array([[ 1.,  1.],
        [ 0., inf],
        [ 1., inf]])]]

Now, let us see how to access these.

In [29]:
# Pick one member of the Rashomon set and walk through its per-profile partitions
i = 3
RPS_partitions_i = R_set[i]

total_loss, total_pools = 0, 0
for k, profile in enumerate(profiles):
    rashomon_set_k = R_profiles[k]
    partition_idx = RPS_partitions_i[k]

    print("Profile", profile)

    R_partition_i_k = rashomon_set_k.sigma[partition_idx]
    print("Partition")
    print(R_partition_i_k)

    # Notice that unlike the per-profile case, the loss of this partition is already pre-computed
    loss_i_k = rashomon_set_k.loss[partition_idx]
    print(f"Loss = {loss_i_k}")

    pools_i_k = rashomon_set_k.pools[partition_idx]
    print(f"Number of pools = {pools_i_k}")

    # Accumulate the totals over all profiles
    total_loss += loss_i_k
    total_pools += pools_i_k

    print("---")

print(f"Total loss = {total_loss}, Total number of pools = {total_pools}")

Profile (0, 0, 0)
Partition
None
Loss = 1.0276327713315392
Number of pools = 1
---
Profile (0, 0, 1)
Partition
[[1.]]
Loss = 1.0549152190530173
Number of pools = 1.0
---
Profile (0, 1, 0)
Partition
[[1.]]
Loss = 1.0554461937366306
Number of pools = 1.0
---
Profile (0, 1, 1)
Partition
[[1.]
 [1.]]
Loss = 1.3612916033997204
Number of pools = 1.0
---
Profile (1, 0, 0)
Partition
[[1. 1.]]
Loss = 1.157395257884124
Number of pools = 1.0
---
Profile (1, 0, 1)
Partition
[[ 1.  1.]
 [ 1. inf]]
Loss = 1.5540328652364819
Number of pools = 1.0
---
Profile (1, 1, 0)
Partition
[[ 1.  1.]
 [ 1. inf]]
Loss = 1.3142552442601412
Number of pools = 1.0
---
Profile (1, 1, 1)
Partition
[[ 0.  1.]
 [ 1. inf]
 [ 1. inf]]
Loss = 4.444784097194952
Number of pools = 2.0
---
Total loss = 12.969753252096606, Total number of pools = 9.0


By default `RAggregate` uses only one process. But we can parallelize finding Rashomon sets for each profile by changing the `num_workers` argument

In [30]:
import time

In [31]:
# num_workers = 1

# perf_counter is the clock intended for measuring elapsed time; unlike
# time.time() it is unaffected by system clock adjustments.
start = time.perf_counter()
R_set1, R_profiles1 = aggregate.RAggregate(M, R, H, D, y, theta, reg=lamb, num_workers=1)
elapsed = time.perf_counter() - start

print(f"With 1 worker, RAggregate took {elapsed} s.")

With 1 worker, RAggregate took 9.232525825500488 s.


In [32]:
# num_workers = 2

# perf_counter is the clock intended for measuring elapsed time; unlike
# time.time() it is unaffected by system clock adjustments.
start = time.perf_counter()
R_set2, R_profiles2 = aggregate.RAggregate(M, R, H, D, y, theta, reg=lamb, num_workers=2)
elapsed = time.perf_counter() - start

print(f"With 2 workers, RAggregate took {elapsed} s.")

With 2 workers, RAggregate took 8.007990837097168 s.


In [33]:
# Check whether the results are the same
print(R_set1 == R_set2)

True


The difference of 1 second seems negligible but the gains will be more substantial when there are more features.