# Solutions for chapter 8 exercises

## Set up

In [8]:
# Common libraries
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
from statsmodels.formula.api import ols
import matplotlib.pyplot as plt
import seaborn as sns

In [9]:
# Load the Bertrand & Mullainathan exercise data. The path is relative, so the
# CSV must sit in the notebook's working directory.
dat_df = pd.read_csv("Bertrand_Mullainathan_exercises_data.csv")
# Preview the first five rows to confirm the columns were read as expected.
dat_df.head(5)

Unnamed: 0,call,education,yearsexp,race,gender,college,eoe,wanted
0,0,4,6,w,f,1,1,supervisor
1,0,3,6,w,f,0,1,supervisor
2,0,4,6,b,f,1,1,supervisor
3,0,3,6,b,f,0,1,supervisor
4,0,3,22,w,f,0,1,secretary


In [18]:
# Recode the string columns as categoricals.
# 'race' and 'gender' get an explicit level order; 'w' is listed first for race
# because the metric functions in this notebook read the 'race[T.b]' coefficient.
dat_df['race'] = dat_df['race'].astype(
    pd.CategoricalDtype(categories = ['w', 'b'], ordered = True)
)
dat_df['gender'] = dat_df['gender'].astype(
    pd.CategoricalDtype(categories = ['m', 'f'], ordered = True)
)
# 'wanted' keeps whatever level order pandas infers from the data.
dat_df['wanted'] = dat_df['wanted'].astype('category')

# Exercise 1

Build a 90% confidence interval for the logistic regression coefficient representing the effect of race on the probability of getting a callback, first with race as the only predictor, then with all the covariates included.

In [19]:
# Visualize the basic regression at hand: a logistic regression of the
# callback indicator on race plus all the other covariates in the data set.
model = smf.logit('call ~ education + yearsexp + race + gender + college + eoe + wanted', data = dat_df)
# disp=0 silences the optimizer's convergence printout.
res = model.fit(disp=0)
# Last expression of the cell, so the summary table is rendered as its output.
res.summary()

0,1,2,3
Dep. Variable:,call,No. Observations:,4870.0
Model:,Logit,Df Residuals:,4858.0
Method:,MLE,Df Model:,11.0
Date:,"Mon, 03 May 2021",Pseudo R-squ.:,0.01798
Time:,08:51:57,Log-Likelihood:,-1339.0
converged:,True,LL-Null:,-1363.5
Covariance Type:,nonrobust,LLR p-value:,9.392e-07

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-2.6748,0.430,-6.214,0.000,-3.519,-1.831
race[T.b],-0.4434,0.108,-4.115,0.000,-0.655,-0.232
gender[T.m],-0.0131,0.153,-0.086,0.932,-0.313,0.286
wanted[T.offsupport],0.7217,0.213,3.394,0.001,0.305,1.138
wanted[T.retailsales],0.4082,0.202,2.020,0.043,0.012,0.804
wanted[T.salesrep],0.1866,0.217,0.861,0.389,-0.238,0.611
wanted[T.secretary],0.3615,0.191,1.892,0.059,-0.013,0.736
wanted[T.supervisor],0.1351,0.263,0.514,0.607,-0.380,0.650
education,-0.1067,0.151,-0.708,0.479,-0.402,0.189


In [29]:
## Metric function
def log_full_reg_fun(df):
    """Fit the logit of `call` on race plus all covariates and return the race[T.b] coefficient.

    Parameters
    ----------
    df : DataFrame holding the columns named in the formula below.

    Returns
    -------
    The fitted parameter labelled 'race[T.b]'.
    """
    formula = 'call ~ education + yearsexp + race + gender + college + eoe + wanted'
    # disp=0 keeps the optimizer quiet, which matters when this runs once per resample.
    fitted = smf.logit(formula, data = df).fit(disp=0)
    return fitted.params['race[T.b]']
def log_single_reg_fun(df):
    """Fit the logit of `call` on race alone and return the race[T.b] coefficient.

    Parameters
    ----------
    df : DataFrame holding at least the `call` and `race` columns.

    Returns
    -------
    The fitted parameter labelled 'race[T.b]'.
    """
    # disp=0 keeps the optimizer quiet, which matters when this runs once per resample.
    fitted = smf.logit('call ~ race', data = df).fit(disp=0)
    return fitted.params['race[T.b]']

# Point estimates of the race[T.b] coefficient on the full (non-resampled) data:
# first from the race-only model, then from the model with every covariate.
print("coeff for regression with single coefficient: ", log_single_reg_fun(dat_df))
print("coeff for full regression: ", log_full_reg_fun(dat_df))

coeff for full regression:  -0.44338442730802197
coeff for regression with single coefficient:  -0.4381802134565714


In [30]:
## Bootstrap CI function
def boot_CI_fun(df, metric_fun, B = 100, conf_level = 0.9, random_state = None):
    """Percentile-bootstrap confidence interval for a scalar metric.

    Draws B resamples of `df` (same number of rows, with replacement),
    evaluates `metric_fun` on each one, sorts the results and trims the same
    number of values from each tail.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to resample.
    metric_fun : callable
        Receives a resampled DataFrame and returns one sortable number.
    B : int, default 100
        Number of bootstrap resamples; must be at least 1.
    conf_level : float, default 0.9
        Confidence level, strictly between 0 and 1.
    random_state : int, numpy random generator or None, default None
        Seed (or generator object) for the resampling. An int is turned into
        one generator that is shared by all B draws, so the resamples differ
        from each other but the whole run is repeatable. None leaves the
        choice of generator to `DataFrame.sample`, as before.

    Returns
    -------
    list
        [lower bound, upper bound].

    Raises
    ------
    ValueError
        If B < 1 or conf_level is not strictly between 0 and 1.
    """
    if B < 1:
        raise ValueError("B must be a positive number of bootstrap resamples")
    if not 0 < conf_level < 1:
        raise ValueError("conf_level must be strictly between 0 and 1")

    # Setting sample size
    N = len(df)

    # Passing the same int to every .sample() call would return the identical
    # resample B times, so build a single stateful generator from it instead.
    if isinstance(random_state, (int, np.integer)):
        rng = np.random.RandomState(random_state)
    else:
        rng = random_state

    coeffs = sorted(
        metric_fun(df.sample(n=N, replace=True, random_state=rng))
        for _ in range(B)
    )

    # Number of values to drop from EACH tail. Capped so the two bounds can
    # never cross when B is very small.
    n_tail = min(round(B * (1 - conf_level) / 2), (B - 1) // 2)

    # Index the upper bound from the front: a negative index of -0 is just 0,
    # which would silently return the minimum as the upper bound, and -n_tail
    # trims one value fewer from the top than n_tail trims from the bottom.
    return [coeffs[n_tail], coeffs[B - 1 - n_tail]]

# Seed NumPy's global random generator before resampling. DataFrame.sample
# draws from it when no random_state is given, so the intervals printed below
# are the same on every Restart & Run All.
np.random.seed(42)

# 90% bootstrap intervals for the race[T.b] coefficient: race-only model first,
# then the model with every covariate.
print("CI for regression with single coefficient: ", boot_CI_fun(df=dat_df, metric_fun=log_single_reg_fun))
print("CI for full regression: ", boot_CI_fun(df=dat_df, metric_fun=log_full_reg_fun))

CI for regression with single coefficient:  [-0.6392257591264762, -0.24039310308215162]
CI for full regression:  [-0.6547851925390121, -0.24857979140513203]
