# Solutions for chapter 9 exercises

## Set up

In [1]:
# Common libraries
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
from statsmodels.formula.api import ols
import seaborn as sns

# Chapter-specific libraries
import random # For functions sample() and shuffle()
from sklearn.preprocessing import MinMaxScaler # To rescale numeric variables
from sklearn.preprocessing import OneHotEncoder # To one-hot encode cat. variables

In [2]:
#Loading the data
# Read the exercise data set from the working directory and preview it
dat_df = pd.read_csv(filepath_or_buffer="Karlan_List_exercises_data.csv")
dat_df.head()

Unnamed: 0,gave,amount,group,freq,gender,state_pol,county_pol,dormant
0,0,0.0,ctrl,2,male,blue,blue,1
1,0,0.0,ctrl,2,male,blue,red,0
2,0,0.0,treat1,3,male,blue,blue,0
3,0,0.0,treat1,15,male,blue,red,0
4,0,0.0,treat1,42,female,red,blue,1


In [None]:
# Reformatting categorical variables
#dat_df['group'] = pd.Categorical(dat_df['group'], ordered = True, categories = ['ctrl', 'treat1', 'treat2', 'treat3'])
#dat_df['gender'] = pd.Categorical(dat_df['gender'], ordered = True, categories = ['male', 'female'])
#dat_df['state_pol'] = pd.Categorical(dat_df['state_pol'], ordered = True, categories = ['blue', 'red'])
#dat_df['county_pol'] = pd.Categorical(dat_df['county_pol'], ordered = True, categories = ['blue', 'red'])

# Exercise 1 - stratified randomization

## 1) Traditional randomization. 
Let’s determine the CI for the difference in donations between two groups with no true differences.

a) Select only the control group (i.e., subjects with no effect) and delete the Group variable from it. Create an ID variable that indexes the rows in your dataset. Create a Taste variable that takes the values “vanilla” and “chocolate” with equal probability. 

In [3]:
# Keep only the control group (no treatment effect) and drop the now-constant
# 'group' column. reset_index() keeps the previous row labels in an 'index'
# column, which is used as the subject ID in the rest of the exercise.
dat_df_ctrl = (
    dat_df.loc[dat_df['group'] == 'ctrl']
    .drop('group', axis=1)
    .reset_index()
)

# Assign each subject to 'chocolate' or 'vanilla' with equal probability.
# The uniform draw is compared directly instead of going through
# Series.between(0, 0.5, inclusive=True): pandas >= 2.0 rejects a boolean
# `inclusive`. Draws lie in [0, 1), so `<= 0.5` selects the same rows.
dat_df1 = dat_df_ctrl.copy()
assgnt = np.random.uniform(0, 1, len(dat_df1))
dat_df1['taste'] = np.where(assgnt <= 0.5, 'chocolate', 'vanilla')
dat_df1.head(5)

Unnamed: 0,index,gave,amount,freq,gender,state_pol,county_pol,dormant,taste
0,0,0,0.0,2,male,blue,blue,1,chocolate
1,1,0,0.0,2,male,blue,red,0,chocolate
2,5,0,0.0,20,male,red,red,0,chocolate
3,12,0,0.0,2,male,red,red,1,chocolate
4,14,0,0.0,3,male,red,red,0,chocolate


b) Calculate the 90%-CI for the difference in donation amount between the two taste groups. 

In [10]:
# Metric function
def metric_fun(dat_df):
    """Return the fitted OLS coefficient for the vanilla taste level.

    Regresses `amount` on `taste`, `gender`, `state_pol` and `county_pol`
    using the rows of `dat_df`, and returns the parameter named
    'taste[T.vanilla]', i.e. the estimated difference in amount for vanilla
    relative to the reference taste level.
    """
    fitted = ols("amount~taste+gender+state_pol+county_pol", data=dat_df).fit(disp=0)
    return fitted.params['taste[T.vanilla]']

metric_fun(dat_df1)

-0.00019425533341373347

In [8]:
def boot_CI_fun(dat_df, metric_fun, B = 100, conf_level = 0.9):
    """Percentile-bootstrap confidence interval for a metric.

    Parameters
    ----------
    dat_df : pandas.DataFrame
        Data to resample. Each bootstrap draw samples len(dat_df) rows with
        replacement.
    metric_fun : callable
        Called once per resampled DataFrame; must return a sortable scalar.
    B : int
        Number of bootstrap resamples (at least 1).
    conf_level : float
        Confidence level, strictly between 0 and 1.

    Returns
    -------
    list
        [lower, upper] bounds taken from the sorted bootstrap values.

    Raises
    ------
    ValueError
        If B < 1 or conf_level is not strictly between 0 and 1.
    """
    if B < 1:
        raise ValueError("B must be at least 1")
    if not 0 < conf_level < 1:
        raise ValueError("conf_level must be strictly between 0 and 1")

    # Setting sample size
    N = len(dat_df)
    coeffs = sorted(
        metric_fun(dat_df.sample(n=N, replace = True)) for _ in range(B)
    )

    # Number of simulated values trimmed from EACH tail. The upper bound is
    # indexed from the end explicitly: the former `coeffs[-n_tail]` returned
    # coeffs[0] when n_tail == 0 (since -0 == 0) and otherwise trimmed one
    # value fewer from the upper tail than from the lower tail.
    n_tail = round(B * (1 - conf_level) / 2)
    n_tail = min(n_tail, (B - 1) // 2)  # keep lower index <= upper index

    return [coeffs[n_tail], coeffs[B - 1 - n_tail]]

boot_CI_fun(dat_df1, metric_fun, B = 200)

## 2) Stratified randomization. 
We’ll repeat the process from question 1, but this time stratify the allocation of subjects between vanilla and chocolate taste. Before doing any math: do you expect the CI to be larger or smaller than in the previous question?

*Because stratification reduces the noise around the true value, we should expect a smaller CI.*

a) Copy and paste the necessary functions from the production code folder (stratification.data.prep and stratified.allocation in R, strat_prep_fun and stratified_assgnt_fun in Python). Assign Taste through stratified randomization on the background variables (Gender, Freq, StatePol, CountyPol, Dormant).

In [7]:
def strat_prep_fun(dat_df, id_var):
    """Convert a DataFrame into a single array usable for distance-based matching.

    The output has one row per input row. Its first column holds the values of
    `id_var`; it is followed by the one-hot encoding of every categorical
    column (dtype `object` or `category`), then by every numeric column
    (`int64` / `float64`) rescaled to [0, 1].

    Parameters
    ----------
    dat_df : pandas.DataFrame
        Data containing the identifier column and the stratification variables.
    id_var : str
        Name of the identifier column.

    Returns
    -------
    numpy.ndarray
        2-D array laid out as described above. Because all columns live in one
        array, numpy coerces the identifier values to a common dtype with the
        encoded data (NOTE(review): e.g. integer ids presumably come back as
        floats; confirm against the caller's use of column 0).

    Raises
    ------
    AssertionError
        If `id_var` is not a column of `dat_df`, or if a remaining column has
        a dtype other than int64, float64, object or category.
    """
    #Isolating the identification variable
    assert id_var in dat_df.columns,\
        "the id_var string doesn't match any column name"
    dat_out_np = np.array(dat_df.loc[:,id_var].values.tolist())
    dat_out_np = np.reshape(dat_out_np, (len(dat_out_np), 1))
    dat_df = dat_df.drop([id_var], axis=1)

    #Input validation
    assert dat_df.select_dtypes(exclude = ['int64', 'float64', 'object', 'category']).empty,\
        "please format all data columns to numeric, integer, category or character (for categorical variables)"

    ## Handling categorical variables
    # 'category' columns pass the validation above, so they must be encoded
    # here as well; selecting only 'object' silently dropped them.
    cat_df = dat_df.select_dtypes(include = ['object', 'category']) #Categorical vars
    if not cat_df.empty:
        # One-hot encoding all categorical variables
        enc = OneHotEncoder(handle_unknown='ignore')
        cat_np = enc.fit_transform(cat_df).toarray()
        dat_out_np = np.concatenate((dat_out_np, cat_np), axis=1)

    ## Handling numerical variables
    num_df = dat_df.select_dtypes(include = ['int64', 'float64']) #Numeric vars
    if not num_df.empty:
        # Normalizing all numeric variables to [0,1]
        scaler = MinMaxScaler()
        num_np = scaler.fit_transform(num_df)
        dat_out_np = np.concatenate((dat_out_np, num_np), axis=1)

    return dat_out_np


def stratified_assgnt_fun(dat_df, id_var, n_groups = 2, group_var_name = "group"):
    """Assign rows to experimental groups through stratified (matched) randomization.

    Rows are first gathered into sets of `n_groups` similar rows, based on the
    pairwise distances between their prepared (encoded / rescaled) variables.
    Within each set, the labels 0 .. n_groups-1 are then shuffled at random,
    so that every set contributes exactly one row to each experimental group.

    Parameters
    ----------
    dat_df : pandas.DataFrame
        Identifier column plus the variables to stratify on.
    id_var : str
        Name of the identifier column; used to join the assignment back.
    n_groups : int
        Number of experimental groups.
    group_var_name : str
        Name of the column that receives the assigned group.

    Returns
    -------
    pandas.DataFrame
        `dat_df` (minus any trailing rows dropped to make the row count a
        multiple of `n_groups`) inner-joined on `id_var` with the assignment;
        group labels are stored as strings ('0', '1', ...).

    Notes
    -----
    The shuffling uses numpy's global random state (np.random.shuffle) and no
    seed is set in this function.
    """
    
    #Handling situations where the number of rows is not divisible by the number
    #of groups. NOTE: I'll try to implement a better solution when I can
    # For now the last `remainder` rows are simply left out of the assignment.
    remainder = len(dat_df) % n_groups
    if remainder != 0:
        dat_df = dat_df.head(len(dat_df)-remainder)
    
    #Prepping the data
    data_np = strat_prep_fun(dat_df, id_var)
    
    #Isolating the identification variable
    # Column 0 of the prepared array holds the ids, the rest the features
    dat_ID = data_np[:,0].tolist() # Extract ID for later join
    data_np = data_np[:,1:].astype(float)
    
    ## Matching algorithm
    
    #Setup
    N = len(data_np)
    match_len = n_groups - 1 # Number of matches we want to find
    
    #Calculate distance matrix
    from scipy.spatial import distance_matrix
    d_mat = distance_matrix(data_np, data_np)
    # Overwrite the zero self-distances with a large value -- presumably so
    # that a row is never picked as its own closest match
    np.fill_diagonal(d_mat,N+1)
    # Set up variables
    rows = [i for i in range(N)]
    available = rows.copy() # Rows not yet placed in a matched set
    matches_lst = [] # One list of row positions per matched set
    matches_lst_lim = int(N/n_groups) # Number of sets needed to cover all rows
    
    # After partitioning, the first match_len entries of each row of `closest`
    # are the positions of its match_len smallest distances (in no set order)
    closest = np.argpartition(d_mat, kth=match_len-1,axis=1)
    
    for n in rows:
        # Stop as soon as enough sets have been built
        if len(matches_lst) == matches_lst_lim: break
        if n in available:
            # Widen the neighbourhood of row n one candidate at a time until
            # it contains match_len rows that are still available
            for search_lim in range(match_len, N):
                closest_matches = closest[n,:search_lim].tolist()
                matches = list(set(available) & set(closest_matches))
                if len(matches) == match_len:
                    # Row n joins its own set; all members leave the pool
                    matches.append(n)
                    matches_lst.append(matches)
                    available = [m for m in available if m not in matches]
                    break
                #Handling ties from argpartition
                elif len(matches) > match_len:
                    # Too many candidates: keep the match_len closest to row n
                    matches = [x for _, x in sorted(zip(d_mat[n,matches].tolist(), matches))]
                    matches = matches[0:match_len]
                    matches.append(n)
                    matches_lst.append(matches)
                    available = [m for m in available if m not in matches]
                    break
                else:
                    # Not enough available candidates yet: re-partition row n
                    # so that its first search_lim+1 entries are its nearest
                    # rows, ready for the next (wider) iteration
                    closest[n,:] = np.argpartition(d_mat[n,:], kth=search_lim)
                    
    #Assigning experimental groups to the matched sets
    # One row [0, 1, ..., n_groups-1] per matched set, each shuffled in place
    exp_grps = np.array(list(range(n_groups))*(int(N/n_groups))).reshape((int(N/n_groups),n_groups))
    exp_grps = exp_grps.tolist()
    for j in exp_grps: 
        np.random.shuffle(j)
    #flattening the two lists
    import itertools
    exp_grps = list(itertools.chain(*exp_grps))
    matches_lst2 = list(itertools.chain(*matches_lst))
    # Sorting the (row position, group) pairs by row position puts the group
    # labels back in the row order of the data, so they line up with dat_ID.
    # NOTE(review): this assumes every row ended up in exactly one set -- confirm
    exp_grps2 = [x for _,x in sorted(zip(matches_lst2,exp_grps))]
    
    assgnt_df = pd.DataFrame(exp_grps2, columns=[group_var_name])
    assgnt_df[group_var_name] = assgnt_df[group_var_name].astype(str)
    assgnt_df[id_var] = dat_ID
    dat_df = dat_df.merge(assgnt_df, on=id_var, how='inner')
    return dat_df

In [14]:
#Isolating the target and grouping variables
# Work on a copy of the control data, and set the subject ID and the two
# outcome columns aside so they can be re-attached after the assignment
dat_df2 = dat_df_ctrl.copy()
dat_df2_outcomes = dat_df2[['index', 'gave', 'amount']]
dat_df2_outcomes.head()

Unnamed: 0,index,gave,amount
0,0,0,0.0
1,1,0,0.0
2,5,0,0.0
3,12,0,0.0
4,14,0,0.0


In [15]:
# Keeping only the variables used for stratification.
# errors='ignore' makes the cell safe to re-run: once the outcome columns are
# gone, a second drop would otherwise raise a KeyError.
dat_df2 = dat_df2.drop(columns=['gave', 'amount'], errors='ignore')

In [9]:
stratified_data_df = stratified_assgnt_fun(dat_df2, id_var = 'index', n_groups = 2, group_var_name = "taste")

n =  0 

n =  100 

n =  200 

n =  300 

n =  400 

n =  500 

n =  600 

n =  700 

n =  800 

n =  900 

n =  1000 

n =  1100 

n =  1200 

n =  1300 

n =  1400 

n =  1500 

n =  1600 

n =  1700 

n =  1800 

n =  1900 

n =  2000 

n =  2100 

n =  2200 

n =  2300 

n =  2400 

n =  2500 

n =  2600 

n =  2700 

n =  2800 

n =  2900 

n =  3000 

n =  3100 

n =  3200 

n =  3300 

n =  3400 

n =  3500 

n =  3600 

n =  3700 

n =  3800 

n =  3900 

n =  4000 

n =  4100 

n =  4200 

n =  4300 

n =  4400 

n =  4500 

n =  4600 

n =  4700 

n =  4800 

n =  4900 

n =  5000 

n =  5100 

n =  5200 

n =  5300 

n =  5400 

n =  5500 

n =  5600 

n =  5700 

n =  5800 

n =  5900 

n =  6000 

n =  6100 

n =  6200 

n =  6300 

n =  6400 

n =  6500 

n =  6600 

n =  6700 

n =  6800 

n =  6900 

n =  7000 

n =  7100 

n =  7200 

n =  7300 

n =  7400 

n =  7500 

n =  7600 

n =  7700 

n =  7800 

n =  7900 

n =  8000 

n =  8100 

n =  8200 

n =  8300 

n = 

In [19]:
#Renaming the values of the Taste variable
# Group labels come back as the strings '0' and '1'; give them taste names
taste_labels = {'0': 'vanilla', '1': 'chocolate'}
stratified_data_df['taste'] = stratified_data_df['taste'].replace(taste_labels)
stratified_data_df.head()

Unnamed: 0,index,freq,gender,state_pol,county_pol,dormant,taste,gave,amount
0,0,2,male,blue,blue,1,vanilla,0,0.0
1,1,2,male,blue,red,0,chocolate,0,0.0
2,5,20,male,red,red,0,chocolate,0,0.0
3,12,2,male,red,red,1,vanilla,0,0.0
4,14,3,male,red,red,0,chocolate,0,0.0


In [20]:
#Bringing back together the stratified data and the outcome variables
# The outcome columns are aligned on the row labels of the two frames
stratified_data_df = stratified_data_df.assign(
    gave=dat_df2_outcomes['gave'],
    amount=dat_df2_outcomes['amount'],
)
stratified_data_df.head()

Unnamed: 0,index,freq,gender,state_pol,county_pol,dormant,taste,gave,amount
0,0,2,male,blue,blue,1,vanilla,0,0.0
1,1,2,male,blue,red,0,chocolate,0,0.0
2,5,20,male,red,red,0,chocolate,0,0.0
3,12,2,male,red,red,1,vanilla,0,0.0
4,14,3,male,red,red,0,chocolate,0,0.0


In [21]:
boot_CI_fun(stratified_data_df, metric_fun, B = 100)

[-0.0053213563302114755, 0.0011613166753378893]

# Exercise 2 - advanced

a) Calculate the 90%-CI for the effect of each of the matching ratios on the amount given (compared to the control group). 

In [11]:
# Metric functions
def _group_effect(dat_df, treatment):
    """Return the fitted OLS coefficient of one treatment level of `group`.

    Regresses `amount` on `group`, `gender`, `state_pol` and `county_pol`
    using the rows of `dat_df`, and returns the parameter named
    'group[T.<treatment>]', i.e. the estimated effect of that treatment
    relative to the reference group level.
    """
    res = ols("amount~group+gender+state_pol+county_pol", data=dat_df).fit()
    return res.params['group[T.' + treatment + ']']

def metric_fun1(dat_df):
    """Estimated effect of treatment 1 on the amount given."""
    return _group_effect(dat_df, 'treat1')

def metric_fun2(dat_df):
    """Estimated effect of treatment 2 on the amount given."""
    return _group_effect(dat_df, 'treat2')

def metric_fun3(dat_df):
    """Estimated effect of treatment 3 on the amount given."""
    return _group_effect(dat_df, 'treat3')

# One bootstrap CI per treatment, computed and printed in order
treatment_metrics = [
    ("90%-CI for the effect of treatment 1: ", metric_fun1),
    ("90%-CI for the effect of treatment 2: ", metric_fun2),
    ("90%-CI for the effect of treatment 3: ", metric_fun3),
]
for label, treatment_metric in treatment_metrics:
    print(label, boot_CI_fun(dat_df, treatment_metric, B = 200), "\n")

90%-CI for the effect of treatment 1:  [-0.053802058004396025, 0.2971814775590944] 

90%-CI for the effect of treatment 2:  [0.002699141184231823, 0.37770335556133033] 

90%-CI for the effect of treatment 3:  [-0.04078350054006438, 0.3011237825874853] 



b) Is the effect of the 3:1 matching ratio different at the 90% level from the 2:1 ratio? (Trick question!)

In [9]:
# Metric function for difference
def metric_fun_diff(dat_df):
    """Return the treatment 3 coefficient minus the treatment 2 coefficient.

    Both coefficients come from a single OLS fit of `amount` on `group`,
    `gender`, `state_pol` and `county_pol` using the rows of `dat_df`.
    """
    params = ols("amount~group+gender+state_pol+county_pol", data=dat_df).fit(disp=0).params
    return params['group[T.treat3]'] - params['group[T.treat2]']

# Point estimate on the full data, then its bootstrap CI
point_diff = metric_fun_diff(dat_df)
print("difference between effect of treatments 3 and 2: ", point_diff, "\n")
ci_diff = boot_CI_fun(dat_df, metric_fun_diff, B = 200)
print("90%-CI for difference between effect of treatments 3 and 2: ", ci_diff, "\n")

difference between effect of treatments 3 and 2:  -0.08549685436111919 

90%-CI for difference between effect of treatments 3 and 2:  [-0.23790661522934484, 0.148799574298987] 



The "trick" here is that to answer this question correctly, you need to calculate the CI of the difference; you cannot directly compare the two CIs from question a) to each other. As you can see, the difference can be as low as -0.24 and as high as 0.15, which are not the values we would naively get by comparing the bounds of the CIs for the two estimates. 