# Chapter 12: Mediation and instrumental variables

## Mediation

In [1]:
#Common libraries
import pandas as pd
from statsmodels.formula.api import ols
import numpy as np

In [3]:
# Load the historical data set first used in the chapter on moderation
# (the file name points at chapter 11). Every mediation regression in this
# section is fitted on this data frame.
hist_data_df = pd.read_csv('chap11-historical_data.csv')

In [4]:
# Regressions used in the text.
# First one: regress duration (the mediator) on play_area and display the
# full statsmodels summary as the cell output.
ols("duration~play_area", data=hist_data_df).fit().summary()

0,1,2,3
Dep. Variable:,duration,R-squared:,0.21
Model:,OLS,Adj. R-squared:,0.21
Method:,Least Squares,F-statistic:,166000.0
Date:,"Fri, 02 Apr 2021",Prob (F-statistic):,0.0
Time:,07:44:27,Log-Likelihood:,-2417800.0
No. Observations:,623610,AIC:,4836000.0
Df Residuals:,623608,BIC:,4836000.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,23.8039,0.018,1287.327,0.000,23.768,23.840
play_area,12.5570,0.031,407.397,0.000,12.497,12.617

0,1,2,3
Omnibus:,65895.576,Durbin-Watson:,1.979
Prob(Omnibus):,0.0,Jarque-Bera (JB):,95301.029
Skew:,0.817,Prob(JB):,0.0
Kurtosis:,3.999,Cond. No.,2.42


In [5]:
# Total effect: regress groceries_purchases on play_area alone and display
# the summary.
ols("groceries_purchases~play_area", data=hist_data_df).fit().summary()

0,1,2,3
Dep. Variable:,groceries_purchases,R-squared:,0.164
Model:,OLS,Adj. R-squared:,0.164
Method:,Least Squares,F-statistic:,122100.0
Date:,"Fri, 02 Apr 2021",Prob (F-statistic):,0.0
Time:,08:39:19,Log-Likelihood:,-3004900.0
No. Observations:,623610,AIC:,6010000.0
Df Residuals:,623608,BIC:,6010000.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,49.1421,0.047,1036.494,0.000,49.049,49.235
play_area,27.6200,0.079,349.485,0.000,27.465,27.775

0,1,2,3
Omnibus:,133724.275,Durbin-Watson:,1.985
Prob(Omnibus):,0.0,Jarque-Bera (JB):,331024.147
Skew:,1.187,Prob(JB):,0.0
Kurtosis:,5.666,Cond. No.,2.42


In [None]:
# Regress groceries_purchases on duration (the mediator) alone and display
# the summary.
ols("groceries_purchases~duration", data=hist_data_df).fit().summary()

In [None]:
# Regress groceries_purchases on duration and play_area together, so each
# coefficient is estimated while holding the other variable fixed.
ols("groceries_purchases~duration+play_area", data=hist_data_df).fit().summary()

In [None]:
def percentage_mediated_fun(dat_df):
    """Return the fraction of play_area's effect on groceries_purchases
    that goes through duration.

    Three single-predictor OLS models are fitted on ``dat_df``:
    duration on play_area, groceries_purchases on duration, and
    groceries_purchases on play_area. The mediated effect is the product of
    the first two slopes; it is divided by the third (the total effect).

    Parameters
    ----------
    dat_df : data frame with columns play_area, duration and
        groceries_purchases.

    Returns
    -------
    The ratio mediated effect / total effect (a proportion, not multiplied
    by 100).
    """
    def _slope(formula, term):
        # Fit one OLS model on dat_df and extract a single coefficient.
        return ols(formula, data=dat_df).fit(disp=0).params[term]

    # Indirect path: play_area -> duration -> groceries_purchases.
    indirect = (_slope("duration~play_area", 'play_area')
                * _slope("groceries_purchases~duration", 'duration'))
    # Overall effect of play_area on groceries_purchases.
    overall = _slope("groceries_purchases~play_area", 'play_area')
    return indirect / overall
percentage_mediated_fun(hist_data_df)

In [None]:
def boot_CI_fun(dat_df, metric_fun, B = 100, conf_level = 0.9):
    """Return a percentile bootstrap confidence interval for a metric.

    The data frame is resampled with replacement ``B`` times (each resample
    has as many rows as ``dat_df``), ``metric_fun`` is evaluated on every
    resample, and the interval bounds are read off the sorted results.

    Parameters
    ----------
    dat_df : data frame to resample; must support ``len()`` and
        ``.sample(n=..., replace=True)``.
    metric_fun : callable taking a resampled data frame and returning a
        sortable value.
    B : number of bootstrap resamples (at least 1).
    conf_level : confidence level of the interval, strictly between 0 and 1.

    Returns
    -------
    list ``[lower, upper]`` with the two interval bounds.

    Raises
    ------
    ValueError
        If ``B`` is smaller than 1 or ``conf_level`` is outside (0, 1).
    """
    if B < 1:
        raise ValueError("B must be at least 1")
    if not 0 < conf_level < 1:
        raise ValueError("conf_level must be strictly between 0 and 1")

    # Each resample has the same number of rows as the original data
    N = len(dat_df)
    coeffs = sorted(
        metric_fun(dat_df.sample(n=N, replace=True)) for _ in range(B)
    )

    # Number of draws to discard in each tail. It is capped so the lower
    # index can never pass the upper one for very low confidence levels.
    n_tail = min(round(B * (1 - conf_level) / 2), (B - 1) // 2)

    # Index from both ends explicitly: a negative index would be asymmetric
    # (coeffs[-k] drops only k - 1 values) and coeffs[-0] is coeffs[0],
    # i.e. the minimum instead of the maximum.
    start_idx = n_tail
    end_idx = B - 1 - n_tail

    return [coeffs[start_idx], coeffs[end_idx]]
# Bootstrap confidence interval for the percentage mediated on the
# historical data, using the function's default number of resamples.
boot_CI_fun(hist_data_df, percentage_mediated_fun)

## Instrumental Variables

### Data

In [None]:
# Read the experimental data set from chapter 9
exp_data_df = pd.read_csv('chap9-experimental_data.csv')

# Recode group as a 0/1 indicator: 1 where the value is 'treat', 0 otherwise
exp_data_df['group'] = np.where(exp_data_df['group'].eq('treat'), 1, 0)

### Libraries

In [None]:
#Common libraries
import pandas as pd
from statsmodels.formula.api import ols
import numpy as np

from linearmodels.iv import IV2SLS

### Understanding and Applying IVs

In [None]:
# Reduced-form regression: M6Spend on group, with age and reason as
# additional predictors. Original note: coeff = 1.6 (presumably the
# coefficient on group -- confirm against the summary).
red_mod = ols("M6Spend~group+age+reason", data=exp_data_df).fit(disp=0)
red_mod.summary()

In [None]:
# First-stage regression: call_CSAT on group, with age and reason as
# additional predictors. Original note: coeff = 0.5 (presumably the
# coefficient on group -- confirm against the summary).
S1_mod = ols("call_CSAT~group+age+reason", data=exp_data_df).fit(disp=0)
S1_mod.summary()

In [None]:
# Baseline regression, flagged as biased by the author: M6Spend directly on
# call_CSAT, with age and reason as additional predictors. Original note:
# coeff = 4.00 (presumably the coefficient on call_CSAT -- confirm).
lm_mod = ols("M6Spend~call_CSAT+age+reason", data=exp_data_df).fit(disp=0)
lm_mod.summary()

In [None]:
# IV regression of M6Spend with an explicit intercept (the `1` term) plus
# age and reason. The bracketed term [call_CSAT ~ group] puts call_CSAT on
# the left and group on the right; presumably group is the instrument for
# call_CSAT -- confirm against the linearmodels formula documentation.
# Original note: coeff = 2.99. Only the estimated parameters are printed.
iv_mod = IV2SLS.from_formula('M6Spend ~ 1 + age + reason + [call_CSAT ~ group]', 
                             exp_data_df).fit()
print(iv_mod.params)