# Solutions for chapter 12 exercises

## Set up

In [14]:
# Common libraries
import pandas as pd
import numpy as np
from statsmodels.formula.api import ols
import seaborn as sns

In [15]:
# Loading the exercise data set from the working directory,
# then displaying summary statistics for it
data_path = "Karlan_List_exercises_data.csv"
dat_df = pd.read_csv(data_path)
dat_df.describe()

Unnamed: 0,gave,amount,freq,dormant
count,50083.0,50083.0,50083.0,50083.0
mean,0.020646,0.915694,8.039355,0.523471
std,0.142197,8.709199,11.394454,0.499454
min,0.0,0.0,0.0,0.0
25%,0.0,0.0,2.0,0.0
50%,0.0,0.0,4.0,1.0
75%,0.0,0.0,10.0,1.0
max,1.0,400.0,218.0,1.0


In [16]:
# Reformatting variables as factors: each listed column is recast as an
# ordered categorical whose levels follow the order given here (the first
# level of each list is the one the regression outputs use as reference,
# e.g. 'gender[T.female]' is measured against 'male')
factor_levels = {
    'group': ['ctrl', 'treat1', 'treat2', 'treat3'],
    'gender': ['male', 'female', 'couple'],
    'state_pol': ['red', 'blue'],
    'county_pol': ['red', 'blue'],
}
for col_name, levels in factor_levels.items():
    dat_df[col_name] = pd.Categorical(dat_df[col_name], categories = levels,
                                      ordered = True)

1. Build a 90%-CI for the total effect of gender == female on the amount given by running a regression including other covariates but excluding the tentative mediator.

In [17]:
def tot_metric_fun(dat_df):
    """Total effect of gender == female on the amount given.

    Fits an OLS regression of `amount` on `group`, `gender`, `state_pol` and
    `county_pol` (the tentative mediator `freq` is left out) and returns the
    coefficient of the `gender[T.female]` term.
    """
    # NOTE(review): disp=0 is kept from the original call; it presumably has
    # no effect on an OLS fit -- confirm against the statsmodels version in use
    total_fit = ols("amount~group+gender+state_pol+county_pol", data=dat_df).fit(disp=0)
    return total_fit.params['gender[T.female]']

def boot_CI_fun(dat_df, metric_fun, B = 100, conf_level = 0.9, seed = None):
    """Percentile Bootstrap confidence interval for a metric computed on a data frame.

    Parameters
    ----------
    dat_df : pandas.DataFrame
        Data to resample. Each Bootstrap sample draws len(dat_df) rows with
        replacement.
    metric_fun : callable
        Function taking a data frame and returning a single sortable number.
    B : int, default 100
        Number of Bootstrap samples.
    conf_level : float, default 0.9
        Confidence level of the interval, strictly between 0 and 1.
    seed : int, optional
        Seed for the resampling. When None (the default), the samples are
        drawn from NumPy's global random state, as in an unseeded call to
        `DataFrame.sample`.

    Returns
    -------
    list
        `[lower_bound, upper_bound]`, two of the B sorted metric values.

    Raises
    ------
    ValueError
        If `B` is smaller than 1 or `conf_level` is not strictly between 0 and 1.
    """
    if B < 1:
        raise ValueError("B must be at least 1")
    if not 0 < conf_level < 1:
        raise ValueError("conf_level must be strictly between 0 and 1")

    # Setting sample size
    N = len(dat_df)

    # A dedicated generator is only created when a seed is requested, so that
    # unseeded calls keep honouring a prior np.random.seed(...) by the caller
    rng = None if seed is None else np.random.RandomState(seed)

    coeffs = sorted(
        metric_fun(dat_df.sample(n=N, replace=True, random_state=rng))
        for _ in range(B)
    )

    # Number of sorted values to discard in EACH tail. It is capped so that
    # the lower index can never pass the upper one when B is very small.
    n_tail = min(round(B * (1 - conf_level) / 2), (B - 1) // 2)

    # The upper index is counted from the end explicitly: a negative index
    # `-n_tail` would discard one value fewer at the top than at the bottom,
    # and `-0` would silently point at the smallest value instead of the largest.
    return [coeffs[n_tail], coeffs[B - 1 - n_tail]]

# Seeding NumPy's global random generator so that the Bootstrap resampling
# (and therefore the printed CI) is identical every time this cell is run;
# the value 42 is arbitrary
np.random.seed(42)

# Point estimate on the full data set
tot_eff = tot_metric_fun(dat_df)
print("the central value for the total effect of female donors is ", tot_eff)

# Bootstrap CI around that estimate
tot_boot_CI = boot_CI_fun(dat_df, tot_metric_fun, B = 100)
print("the CI for the total effect of female donors is ", tot_boot_CI)

the central value for the total effect of female donors is  -0.11025975826346435
the CI for the total effect of female donors is  [-0.26433472172808126, 0.02259294608256681]


2. Build a 90%-CI for the direct effect of gender == female on the amount given by running a regression including other covariates and the tentative mediator.

In [18]:
def dir_metric_fun(dat_df):
    """Direct effect of gender == female on the amount given.

    Fits an OLS regression of `amount` on `group`, `gender`, `state_pol`,
    `county_pol` and the tentative mediator `freq`, and returns the
    coefficient of the `gender[T.female]` term.
    """
    # NOTE(review): disp=0 is kept from the original call; it presumably has
    # no effect on an OLS fit -- confirm against the statsmodels version in use
    direct_fit = ols("amount~group+gender+state_pol+county_pol+freq", data=dat_df).fit(disp=0)
    return direct_fit.params['gender[T.female]']

# Seeding NumPy's global random generator so that the Bootstrap resampling
# (and therefore the printed CI) is identical every time this cell is run;
# the value 42 is arbitrary
np.random.seed(42)

# Point estimate on the full data set
dir_eff = dir_metric_fun(dat_df)
print("The central value for the direct effect of female donors is ", dir_eff)

# Bootstrap CI around that estimate
dir_boot_CI = boot_CI_fun(dat_df, dir_metric_fun, B = 100)
print("The 90%-CI for the direct effect of female donors is ", dir_boot_CI)

the central value for the direct effect of female donors is  -0.14654594242382069
[-0.30145335059161216, -0.006358189970027166]


3.	Build a 90%-CI for the indirect (mediated) effect of gender == female on the amount given by running the regression of the mediator on gender == female and a regression of the final effect on the mediator, the original cause of interest and relevant covariates, then multiplying the relevant coefficients (warning: do this multiplication inside the Bootstrap loop, not outside of it) 

In [19]:
def ind_metric_fun(dat_df):
    """Indirect (mediated) effect of gender == female on the amount given.

    Multiplies two OLS coefficients estimated on the same data frame:
    the `gender[T.female]` coefficient from the regression of the mediator
    `freq` on `gender`, `state_pol` and `county_pol`, and the `freq`
    coefficient from the regression of `amount` on `group`, `gender`,
    `state_pol`, `county_pol` and `freq`.
    """
    # First leg: cause of interest -> mediator
    mediator_fit = ols("freq~gender+state_pol+county_pol", data=dat_df).fit(disp=0)
    female_on_freq = mediator_fit.params['gender[T.female]']

    # Second leg: mediator -> final effect, holding the cause and covariates fixed
    outcome_fit = ols("amount~group+gender+state_pol+county_pol+freq", data=dat_df).fit(disp=0)
    freq_on_amount = outcome_fit.params['freq']

    return female_on_freq * freq_on_amount

# Seeding NumPy's global random generator so that the Bootstrap resampling
# (and therefore the printed CI) is identical every time this cell is run;
# the value 42 is arbitrary
np.random.seed(42)

# Point estimate on the full data set
ind_eff = ind_metric_fun(dat_df)
print("The central value for the indirect effect of female donors is ", ind_eff)

# Bootstrap CI around that estimate; the product of coefficients is computed
# inside metric_fun, i.e. once per Bootstrap sample
ind_boot_CI = boot_CI_fun(dat_df, ind_metric_fun, B = 100)
print("The 90%-CI for the indirect effect of female donors is ", ind_boot_CI)

the central value for the indirect effect of female donors is  0.03624812062074483
the CI for the indirect effect of female donors is  [0.025771158390007716, 0.053955302891071366]


4.	Build a 90%-CI for the percentage mediated by running the same regressions as in 3 but also the regression for the total effect and determining the percentage mediated in each loop. What is your conclusion?

In [22]:
def perc_med_metric_fun(dat_df):
    """Share of the total effect of gender == female that goes through `freq`.

    Computes the indirect effect as the product of the `gender[T.female]`
    coefficient in the regression of `freq` and the `freq` coefficient in the
    regression of `amount` (mediator included), then divides it by the
    `gender[T.female]` coefficient of the regression of `amount` without the
    mediator. The result is a ratio (e.g. -0.33 for -33%), not a percentage.
    """
    # Cause of interest -> mediator
    mediator_fit = ols("freq~gender+state_pol+county_pol", data=dat_df).fit(disp=0)
    female_on_freq = mediator_fit.params['gender[T.female]']

    # Mediator -> final effect
    outcome_fit = ols("amount~group+gender+state_pol+county_pol+freq", data=dat_df).fit(disp=0)
    freq_on_amount = outcome_fit.params['freq']

    # Total effect: same regression without the mediator
    total_fit = ols("amount~group+gender+state_pol+county_pol", data=dat_df).fit(disp=0)
    total_effect = total_fit.params['gender[T.female]']

    indirect_effect = female_on_freq * freq_on_amount
    return indirect_effect / total_effect

# Seeding NumPy's global random generator so that the Bootstrap resampling
# (and therefore the printed CI) is identical every time this cell is run;
# the value 42 is arbitrary
np.random.seed(42)

# Point estimate on the full data set
perc_med = perc_med_metric_fun(dat_df)
print("The central value for the percentage mediated effect of female donors is ", perc_med)

# Bootstrap CI around that estimate; the ratio is computed once per Bootstrap sample
perc_med_boot_CI = boot_CI_fun(dat_df, perc_med_metric_fun, B = 1000)
print("The 90%-CI for the percentage mediated effect of female donors is ", perc_med_boot_CI)

The central value for the percentage mediated effect of female donors is  -0.3287520414667552
The 90%-CI for the percentage mediated effect of female donors is  [-1.7571573844588422, 0.9462565154481545]


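In the output above, the 90%-CI for the percentage mediated effect of female donors is approx. [-176%; 95%], with a central value of -33%. The direct and indirect effects go in opposite directions and partially cancel each other: the direct effect is negative while the indirect effect through `freq` is positive, which is why the central percentage mediated is negative. (Note that a complete cancellation would bring the total effect to zero, so the percentage mediated, being the indirect effect divided by the total effect, would be undefined rather than -100%; a value of -100% corresponds to the indirect effect offsetting half of the direct effect.) Because the interval is very wide and includes zero, the percentage mediated cannot be estimated reliably from these data.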