In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from copy import deepcopy
from sklearn.metrics import mean_squared_error

%matplotlib inline

# Data processing

### Read data

In [80]:
dta_fname = "../Data/Does_Price_Matter_AER_merged.dta"

raw_df = pd.read_stata(dta_fname)
colnames = raw_df.columns
print(colnames)

cols_to_keep = [
                'treatment', 'control',
                'ratio',
                # 'ratio2', 'ratio3',
                'size',
                # 'size25', 'size50', 'size100', 'sizeno',
                'ask',
                # 'ask1', 'ask2', 'ask3',
                # 'askd1', 'askd2', 'askd3',
                'amount', 'gave', 'amountchange',
                'red0'#, 'blue0'
               ]
df = raw_df[cols_to_keep]

df.head()

Index(['treatment', 'control', 'ratio', 'ratio2', 'ratio3', 'size', 'size25',
       'size50', 'size100', 'sizeno', 'ask', 'askd1', 'askd2', 'askd3', 'ask1',
       'ask2', 'ask3', 'amount', 'gave', 'amountchange', 'hpa', 'ltmedmra',
       'freq', 'years', 'year5', 'mrm2', 'dormant', 'female', 'couple',
       'state50one', 'nonlit', 'cases', 'statecnt', 'stateresponse',
       'stateresponset', 'stateresponsec', 'stateresponsetminc', 'perbush',
       'close25', 'red0', 'blue0', 'redcty', 'bluecty', 'pwhite', 'pblack',
       'page18_39', 'ave_hh_sz', 'median_hhincome', 'powner', 'psch_atlstba',
       'pop_propurban'],
      dtype='object')


Unnamed: 0,treatment,control,ratio,size,ask,amount,gave,amountchange,red0
0,0,1,Control,Control,Control,0.0,0,-45.0,0.0
1,0,1,Control,Control,Control,0.0,0,-25.0,0.0
2,1,0,1,"$100,000",1x,0.0,0,-50.0,0.0
3,1,0,1,Unstated,1x,0.0,0,-25.0,0.0
4,1,0,1,"$50,000",1x,0.0,0,-15.0,1.0


### Remove control

In [81]:
# avg_control = np.mean(df[df["control"] == 1]["amountchange"])
# df_treatment = df[df["treatment"] == 1]
# df_treatment.loc[:, "amountchange"] = df_treatment.loc[:, "amountchange"] - avg_control
df_treatment = deepcopy(df)
df_treatment = df_treatment.dropna()

### Process data

In [82]:
ratio_map = {"Control": 0, 1: 1, 2: 2, 3: 3}
size_map = {"Control": 0, "$25,000": 1, "$50,000": 2, "$100,000": 3, "Unstated": 4}
ask_map = {"Control": 0, "1x": 1, "1.25x": 2, "1.50x": 3}
redblue_map = {0: 1, 1:2, np.nan: 0}

df_treatment.loc[:, "ratio"] = df_treatment.loc[:, "ratio"].map(ratio_map)
df_treatment.loc[:, "size"] = df_treatment.loc[:, "size"].map(size_map)
df_treatment.loc[:, "ask"] = df_treatment.loc[:, "ask"].map(ask_map)
df_treatment.loc[:, "red0"] = df_treatment.loc[:, "red0"].map(redblue_map)
df_treatment = df_treatment.astype({
    "ratio": np.int64, "size": np.int64, "ask": np.int64, "red0": np.int64})

df_treatment = df_treatment.drop(["treatment", "control"], axis=1)

df_treatment.head()

Unnamed: 0,ratio,size,ask,amount,gave,amountchange,red0
0,0,0,0,0.0,0,-45.0,1
1,0,0,0,0.0,0,-25.0,1
2,1,3,1,0.0,0,-50.0,1
3,1,4,1,0.0,0,-25.0,1
4,1,2,1,0.0,0,-15.0,2


In [84]:
Z = df_treatment.to_numpy()
X = Z[:, [0, 1, 2, 6]] #- 1
y = Z[:, 3] / 100 # (np.max(y) - np.min(y))
y = y.reshape((-1, 1))

num_data = X.shape[0]

In [83]:
df_treatment.groupby('red0').count()

Unnamed: 0_level_0,ratio,size,ask,amount,gave,amountchange
red0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,29806,29806,29806,29806,29806,29806
2,20242,20242,20242,20242,20242,20242


# Analysis

### Rashomon

In [6]:
from rashomon import tva
from rashomon import loss
from rashomon import counter
from rashomon import metrics
from rashomon import extract_pools
from rashomon.aggregate import RAggregate_profile, RAggregate
from rashomon.sets import RashomonSet, RashomonProblemCache, RashomonSubproblemCache


%load_ext autoreload
%autoreload 2

Setup the policy and profile list

In [85]:
M = 4
# R = np.array([3, 4, 3])
R = np.array([4, 5, 4, 3])

num_profiles = 2**M
profiles, profile_map = tva.enumerate_profiles(M)

all_policies = tva.enumerate_policies(M, R)
num_policies = len(all_policies)

In [86]:
policies_profiles = {}
policies_profiles_masked = {}
policies_ids_profiles = {}
# pi_policies = {}
# pi_pools = {}
for k, profile in enumerate(profiles):

    policies_temp = [(i, x) for i, x in enumerate(all_policies) if tva.policy_to_profile(x) == profile]
    unzipped_temp = list(zip(*policies_temp))
    policies_ids_k = list(unzipped_temp[0])
    policies_k = list(unzipped_temp[1])
    policies_profiles[k] = deepcopy(policies_k)
    policies_ids_profiles[k] = policies_ids_k

    
    profile_mask = list(map(bool, profile))

    # Mask the empty arms
    for idx, pol in enumerate(policies_k):
        policies_k[idx] = tuple([pol[i] for i in range(M) if profile_mask[i]])
    policies_profiles_masked[k] = policies_k

    # if np.sum(profile) > 0:
    #     pi_pools_k, pi_policies_k = extract_pools.extract_pools(policies_k, sigma[k])
    #     if len(pi_pools_k.keys()) != mu[k].shape[0]:
    #         print(f"Profile {k}. Expected {len(pi_pools_k.keys())} pools. Received {mu[k].shape[0]} means.")
    #     pi_policies[k] = pi_policies_k
    #     pi_pools[k] = {}
    #     for x, y in pi_pools_k.items():
    #         y_full = [policies_profiles[k][i] for i in y]
    #         y_agg = [all_policies.index(i) for i in y_full]
    #         pi_pools[k][x] = y_agg
    # else:
    #     pi_policies[k] = {0: 0}
    #     pi_pools[k] = {0: [0]}

In [87]:
D = np.zeros(shape=y.shape, dtype=np.int64)
profiles_in_data = []
for i in range(num_data):
    policy_i = tuple([int(x) for x in X[i, :]])
    policy_idx = [idx for idx in range(num_policies) if all_policies[idx] == policy_i]
    profiles_in_data.append(tva.policy_to_profile(policy_i))
    D[i, 0] = int(policy_idx[0])

policy_means = loss.compute_policy_means(D, y, num_policies)

In [380]:
mu_policies = policy_means[:, 0] / policy_means[:, 1]
mu_policies[np.isnan(mu_policies)] = -np.inf

print(mu_policies)
print(np.max(mu_policies))
all_policies[np.argmax(mu_policies)]

[0.00081327       -inf       -inf       -inf       -inf       -inf
       -inf       -inf       -inf       -inf       -inf       -inf
       -inf       -inf       -inf       -inf       -inf       -inf
       -inf       -inf       -inf       -inf       -inf       -inf
       -inf 0.00061207 0.00071367 0.00092233       -inf 0.0008127
 0.0009806  0.00089189       -inf 0.00078041 0.0015162  0.00078664
       -inf 0.00092573 0.00127077 0.00102909       -inf       -inf
       -inf       -inf       -inf 0.00142179 0.0008127  0.001507
       -inf 0.00057514 0.00133621 0.00102047       -inf 0.00052859
 0.00086207 0.0011153        -inf 0.00110118 0.00122306 0.00080819
       -inf       -inf       -inf       -inf       -inf 0.0012438
 0.00090517 0.00140281       -inf 0.00081122 0.00088134 0.00069073
       -inf 0.0012605  0.00057929 0.00070259       -inf 0.0008903
 0.00096444 0.00092171]
0.001516198704103672


  mu_policies = policy_means[:, 0] / policy_means[:, 1]


(1, 3, 2)

In [89]:
R_set, rashomon_profiles = RAggregate(M, R, 12, D, y, 0.07, reg=1e-7)

print(len(R_set))

210


In [90]:
best_profile_counter = np.zeros(shape=(num_profiles,))

all_losses = []

reg = 1e-7
best_loss = np.inf
best_policy_o = None

for r_set in R_set:
    pi_policies_profiles_r = {}

    for k, profile in enumerate(profiles):
        if rashomon_profiles[k].sigma[r_set[k]] is None and rashomon_profiles[k].Q[r_set[k]] == 0:
            pi_policies_r_k = {i: None for i in range(len(policies_profiles_masked[k]))}
            pass
        else:
            _, pi_policies_r_k = extract_pools.extract_pools(
                policies_profiles_masked[k],
                rashomon_profiles[k].sigma[r_set[k]]
            )
        pi_policies_profiles_r[k] = pi_policies_r_k
    
    pi_pools_r, pi_policies_r = extract_pools.aggregate_pools(pi_policies_profiles_r, policies_ids_profiles)
    pool_means_r = loss.compute_pool_means(policy_means, pi_pools_r)
    
    y_pred = metrics.make_predictions(D, pi_policies_r, pool_means_r)

    this_loss = mean_squared_error(y, y_pred) + reg * len(pi_pools_r)
    all_losses.append((this_loss, mean_squared_error(y, y_pred), len(pi_pools_r)))
    
    # best_pools = np.argwhere(pool_means_r == np.max(pool_means_r))[0]
    best_pools = np.argwhere(pool_means_r == np.nanmax(pool_means_r))[0]

    for best_pool in best_pools:
        best_policy_id = pi_pools_r[best_pool][0]
        best_policy = all_policies[best_policy_id]
        best_profile = tva.policy_to_profile(best_policy)
        profile_id = profile_map[best_profile]

        best_profile_counter[profile_id] += 1
        
        if this_loss < best_loss:
            best_loss = this_loss
            best_policy_o = best_policy
            print(best_loss, best_policy, len(pi_pools_r))
            for k, v in pi_pools_r.items():
                print(k, ":", [all_policies[x] for x in v], v)
                print(pool_means_r[k])
            print("---")

best_profile_freq = best_profile_counter / np.sum(best_profile_counter)

print(f"Best model loss {best_loss} and best policy {best_policy_o}")

0.007557480448911568 (2, 1, 1, 2) 10
0 : [(0, 0, 0, 1)] [1]
0.008974972579519394
1 : [(0, 0, 0, 2)] [2]
0.00687424789410349
2 : [(1, 1, 1, 1), (1, 1, 2, 1), (1, 1, 3, 1)] [76, 79, 82]
0.005685459940652819
3 : [(1, 1, 1, 2), (1, 1, 2, 2), (1, 1, 3, 2)] [77, 80, 83]
0.010282588878760256
4 : [(1, 2, 1, 1), (1, 2, 2, 1), (1, 2, 3, 1), (1, 3, 1, 1), (1, 3, 2, 1), (1, 3, 3, 1), (1, 4, 1, 1), (1, 4, 2, 1), (1, 4, 3, 1)] [88, 91, 94, 100, 103, 106, 112, 115, 118]
0.009925237421701354
5 : [(1, 2, 1, 2), (1, 2, 2, 2), (1, 2, 3, 2), (1, 3, 1, 2), (1, 3, 2, 2), (1, 3, 3, 2), (1, 4, 1, 2), (1, 4, 2, 2), (1, 4, 3, 2)] [89, 92, 95, 101, 104, 107, 113, 116, 119]
0.009740642499263189
6 : [(2, 1, 1, 1), (2, 1, 2, 1), (2, 1, 3, 1), (3, 1, 1, 1), (3, 1, 2, 1), (3, 1, 3, 1)] [136, 139, 142, 196, 199, 202]
0.010423880597014925
7 : [(2, 1, 1, 2), (2, 1, 2, 2), (2, 1, 3, 2), (3, 1, 1, 2), (3, 1, 2, 2), (3, 1, 3, 2)] [137, 140, 143, 197, 200, 203]
0.014796564195298375
8 : [(2, 2, 1, 1), (2, 2, 2, 1), (2, 2, 3,

In [91]:
all_losses

[(0.007557480448911568, 0.007556480448911568, 10),
 (0.0075576796741105585, 0.007556879674110559, 8),
 (0.007557703460915919, 0.007556703460915919, 10),
 (0.007557733066041814, 0.007556733066041814, 10),
 (0.007557861484243355, 0.007556861484243355, 10),
 (0.007557877863800896, 0.0075568778638008954, 10),
 (0.007557998830353427, 0.007557198830353428, 8),
 (0.0075580068824672155, 0.007557406882467216, 6),
 (0.007558013131701712, 0.007557413131701712, 6),
 (0.007558029324507407, 0.007557229324507407, 8),
 (0.007558059910681734, 0.007557259910681734, 8),
 (0.007558070483756426, 0.0075570704837564255, 10),
 (0.007558071553495179, 0.007557071553495179, 10),
 (0.007558101201035197, 0.007557101201035197, 10),
 (0.0075581028217123454, 0.007557002821712345, 11),
 (0.007558117904508547, 0.007557317904508547, 8),
 (0.007558134501179389, 0.007557334501179389, 8),
 (0.007558178011438929, 0.007557078011438929, 11),
 (0.007558196443147629, 0.007557196443147629, 10),
 (0.007558238310966175, 0.00755713