# Econometría Aplicada II
## Tarea 1
Importar librerías

In [1]:
%%capture
# Clonar repo si estamos en colab
if 'google.colab' in str(get_ipython()):
    !git clone https://github.com/ArturoSbr/EmtrAp2-hw01
    # !pip install scipy==1.7.3
    %cd EmtrAp2-hw01/cod

# Libs
import numpy as np
import pandas as pd
from scipy import stats
from statsmodels.api import OLS
from matplotlib import pyplot as plt

Importar datos

In [2]:
d1 = pd.read_csv('../dat/baseline.csv')
d2 = pd.read_csv('../dat/endline.csv')
d3 = pd.read_csv('../dat/completa.csv')

### 1. Balance
Tabla de balance por grupo de acuerdo a `T_nap`

In [3]:
# Declarar número de individuos por grupo
n0, n1 = d1.groupby('T_nap').size()

# Seleccionar 12 variables basales
X = ['time_in_office','age_','female_','education_','sleep_night','no_of_children_','act_inbed',
     'an_12_number_of_awakenings','an_13_average_awakening_length','unemployed']

# Medias de variables basales por grupo
t = d1.groupby('T_nap')[X].agg(['mean','var'])

# Ajustes estéticos a t
t = t.transpose().reset_index()
t.columns = ['var','fun','control','treatment']
t = t.pivot(index='var', columns='fun')
t.columns = ['_'.join(col) for col in t.columns]

# Calcular estadísticos
t['tau'] = t['control_mean'] - t['treatment_mean']
t['tau_var'] = t['control_var'].div(n0) + t['treatment_var'].div(n1)
t['t'] = t['tau'].div(np.sqrt(t['tau_var']))
t['p'] = (1 - stats.norm().cdf(t['t'].abs())) * 2

# Resultado presentable
t = t[['control_mean','treatment_mean','tau','t','p']].reset_index()
t

Unnamed: 0,var,control_mean,treatment_mean,tau,t,p
0,act_inbed,7.992301,8.071253,-0.078952,-0.839935,0.400945
1,age_,34.938053,34.964602,-0.026549,-0.039027,0.968869
2,an_12_number_of_awakenings,31.715624,32.188211,-0.472587,-0.621987,0.53395
3,an_13_average_awakening_length,4.478105,4.477173,0.000931,0.007186,0.994267
4,education_,10.336283,10.030973,0.30531,1.128002,0.259319
5,female_,0.650442,0.672566,-0.022124,-0.496037,0.619869
6,no_of_children_,1.300885,1.40708,-0.106195,-1.068706,0.285202
7,sleep_night,5.559958,5.595827,-0.035869,-0.434612,0.663844
8,time_in_office,7.968159,7.942971,0.025188,0.366232,0.714192
9,unemployed,0.247788,0.283186,-0.035398,-0.339135,0.734508


Evaluación conjunta de significancia

In [4]:
# T_nap en función de controles
m = OLS(endog=d1['T_nap'], exog=d1[X].assign(const = 1)).fit()

# p-value de prueba
m.f_pvalue

0.9615038681207262

### 2. Efectos de tratamiento
Declarar todas las variables dependientes

In [5]:
# Crear índice de habilidades cognitivas
d2['cog'] = d2[['corsi_measure','hf_measure','pvt_measure']] \
    .apply(lambda x: (x - x.mean()) / x.std()).mean(axis=1)

# Crear índice de preferencias
# ?

# Crear índice de bienestar
d2['well'] = d2['ds_g1_satisfaction']

# Variables dependientes
Y = ['productivity','earnings','typing_time_hr','cog','well']

#### a) Estimadores de Neyman

In [6]:
# Función para estimador de Neyman
def neyman(frame, treatment_col, values_col):
    # Sacar arreglos C y T
    a, b = frame.dropna(subset=[values_col]).groupby(treatment_col)[values_col].apply(np.array)
    # Estadístico t
    tau = np.mean(b) - np.mean(a)
    t = tau / np.sqrt(np.var(a, ddof=1) / len(a) + np.var(b, ddof=1) / len(b))
    # p-value
    p = 2 * (1 - stats.norm().cdf(np.abs(t)))
    return (values_col, tau, t, p)

# Diferencias de Neyman
t_ney = pd.DataFrame(data=[neyman(d2, 'T_nap', y) for y in Y], columns=['depvar','diff','t-stat','p'])
t_ney

Unnamed: 0,depvar,diff,t-stat,p
0,productivity,-171.240947,-0.976215,0.328958
1,earnings,-14.753231,-1.072117,0.283667
2,typing_time_hr,-0.087307,-0.634157,0.525978
3,cog,0.027056,0.492507,0.622361
4,well,0.215989,1.044944,0.296049


#### b) Estimadores OLS sin controles

In [7]:
# Correr regresiones
d = []
for y in Y:
    m = OLS(endog=d2[y], exog=d2.assign(const = 1)[['const','T_nap']], missing='drop')
    m = m.fit(cov_type='HC2')
    d.append([y, m.params['T_nap'], m.tvalues['T_nap'], m.pvalues['T_nap']])

# Regresiones a tabla
t_ols = pd.DataFrame(data=d, columns=['depvar','diff','t-stat','p'])
t_ols

Unnamed: 0,depvar,diff,t-stat,p
0,productivity,-171.240947,-0.976215,0.328958
1,earnings,-14.753231,-1.072117,0.283667
2,typing_time_hr,-0.087307,-0.634157,0.525978
3,cog,0.027056,0.492507,0.622361
4,well,0.215989,1.044944,0.296049


#### c) Estimadores con controles
De acuerdo al paper, $X_i$ contiene `age_` en cuartiles, `female_` y la variable que indica si $i$ fue asignado a trabajar o a tomarse un break en vez de tomar una siesta.

Como esta pregunta usa la base con promedios durante los 20 días de estudio, la variable que indica la actividad asignada cada día a los individuos del grupo de control no está disponible.

In [8]:
# Edad a cuartiles y luego a dummies
d2['age_q'] = pd.qcut(x=d2['age_'], q=4, labels=[f'q{i}' for i in range(1,5)])
d2 = pd.get_dummies(data=d2, prefix='age_', prefix_sep='', columns=['age_q'], )

# Tratamiento y controles
X = ['T_nap','const','age_q2','age_q3','age_q4','female_']

# Correr regresiones
d = []
for y in Y:
    m = OLS(endog=d2[y], exog=d2.assign(const = 1)[X], missing='drop')
    m = m.fit(cov_type='HC2')
    d.append([y, m.params['T_nap'], m.tvalues['T_nap'], m.pvalues['T_nap']])

# Regresiones a tabla
t_ctr = pd.DataFrame(data=d, columns=['depvar','diff','t-stat','p'])
t_ctr

Unnamed: 0,depvar,diff,t-stat,p
0,productivity,-196.63582,-1.178943,0.238421
1,earnings,-14.563905,-1.060604,0.28887
2,typing_time_hr,-0.083263,-0.608418,0.54291
3,cog,0.029186,0.534748,0.592824
4,well,0.232225,1.121202,0.262202


#### d) Resultados a tabla

In [9]:
# Concatenar resultados
t = pd.concat([t_ney.assign(method='Neyman'),
               t_ols.assign(method='OLS'),
               t_ctr.assign(method='MLR')]).melt(id_vars=['method','depvar'])

# Tabla presentable
t = t.pivot(index=['depvar','variable'], columns='method', values='value')
t

Unnamed: 0_level_0,method,MLR,Neyman,OLS
depvar,variable,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
cog,diff,0.029186,0.027056,0.027056
cog,p,0.592824,0.622361,0.622361
cog,t-stat,0.534748,0.492507,0.492507
earnings,diff,-14.563905,-14.753231,-14.753231
earnings,p,0.28887,0.283667,0.283667
earnings,t-stat,-1.060604,-1.072117,-1.072117
productivity,diff,-196.63582,-171.240947,-171.240947
productivity,p,0.238421,0.328958,0.328958
productivity,t-stat,-1.178943,-0.976215,-0.976215
typing_time_hr,diff,-0.083263,-0.087307,-0.087307


#### e) Nuevas variables dependientes

In [10]:
# Nuevas variables dependientes
Y = ['nap_time_mins','sleep_report','happy','cog','typing_time_hr']

# Correr regresiones
d = []
for y in Y:
    m = OLS(endog=d2[y], exog=d2.assign(const = 1)[X], missing='drop')
    m = m.fit(cov_type='HC2')
    d.append([y, m.params['T_nap'], m.tvalues['T_nap'], m.pvalues['T_nap']])

# Resultados a tabla
t = pd.DataFrame(data=d, columns=['variable','diff','t-stat','p-value'])
t.round(3)

Unnamed: 0,variable,diff,t-stat,p-value
0,nap_time_mins,11.556,35.829,0.0
1,sleep_report,0.049,1.135,0.256
2,happy,0.034,1.002,0.316
3,cog,0.029,0.535,0.593
4,typing_time_hr,-0.083,-0.608,0.543


De acuerdo al modelo $\bar{y}_i = \beta T_i + X_i^T\gamma$, donde $X_i$ controla por la edad (en cuartiles) y por sexo, el efecto de estimado de tratamiento de tomar una siesta es:
1. Aumentar el promedio de minutos dormidos durante la siesta en de 0 a 11.6 minutos.
1. Aumentar el promedio de número de horas de sueño reportadas en 0.05 horas por día (pero no tiene significancia estadística).
1. Aumentar el promedio de la calificación de felicidad reportada en 0.03 puntos (pero no tiene significancia estadística)
1. Aumentar el índice promedio de desempeño cognitivo en 0.03 desviaciones estándar (pero no tiene significancia estadística)
1. Reducir el promedio de horas trabajadas en 0.083 unidades diarias (pero no tiene significancia estadística)

### 3. Fischer's Exact Test

In [None]:
stats.permutation_test(data=(d2.loc[d2['T_nap'].eq(1) & d2['tot_earnings'].notna(), 'tot_earnings'],
                             d2.loc[d2['T_nap'].eq(0) & d2['tot_earnings'].notna(), 'tot_earnings']),
                       statistic=lambda x, y: np.mean(x) - np.mean(y),
                       n_resamples=1000,
                       random_state=42)

### 4. Estratificación

In [12]:
# Create cases
t = d1[['earnings','sleep_night']].apply(lambda x: (x > x.median()).astype(int), axis=0)
t = pd.concat([d1['pid'], t], axis=1)
t.columns = ['pid','aboveEarn','aboveSleep']

# Merge cases to `d2`
d2 = d2.merge(t, on='pid')

#### a) Número de observaciones asignadas a tratamiento en cada estrato

In [14]:
# asdf
t = d2.groupby(['aboveEarn','aboveSleep'])['T_nap'].sum()
t

aboveEarn  aboveSleep
0          0             59
           1             54
1          0             53
           1             60
Name: T_nap, dtype: int64

El número de observaciones asignadas a tratamiento en cada estrato es prácticamente el mismo. La razón por la cual los números no son idénticos es porque los autores asignaron el tratamiento dentro de cada estrato con una función de probabilidad binomial con media igual a $\frac{1}{2}$ en vez de fijar el número de observaciones asignadas a tratamiento.

#### b) Efectos por estrato y agregados
Efectos por estrato

In [16]:
# Inicializar lista para agregar datos
d = []

# 
for e, s in t.index:
    print(e,s)

0 0
0 1
1 0
1 1


In [21]:
# Inicializar lista
d = []

# Efecto por estrato
for e, s in t.index:
    m = d2['aboveEarn'].eq(e) & d2['aboveSleep'].eq(s)
    # Para cada variable
    for y in Y:
        d.append([e, s, m.sum(), m.sum() / len(d2)] + list(neyman(d2[m], 'T_nap', y)))

# Resultados a tabla
t = pd.DataFrame(data=d, columns=['aboveEarn','aboveSleep','ng','w','depvar','tau','t','p'])
t = t.sort_values(['depvar','aboveEarn','aboveSleep'])
t

Unnamed: 0,aboveEarn,aboveSleep,ng,w,depvar,tau,t,p
3,0,0,122,0.269912,cog,0.129588,1.162648,0.244972
8,0,1,104,0.230088,cog,0.108214,1.111781,0.266232
13,1,0,104,0.230088,cog,-0.087779,-0.765489,0.44398
18,1,1,122,0.269912,cog,-0.045687,-0.444522,0.656665
2,0,0,122,0.269912,happy,-0.044045,-0.682321,0.495036
7,0,1,104,0.230088,happy,0.000106,0.001627,0.998702
12,1,0,104,0.230088,happy,0.130437,1.825007,0.068
17,1,1,122,0.269912,happy,0.063518,0.981307,0.326441
0,0,0,122,0.269912,nap_time_mins,10.941749,16.943943,0.0
5,0,1,104,0.230088,nap_time_mins,12.636905,20.838268,0.0


Efectos agregados

In [27]:
t['tau'].multiply(t['w']).groupby(t['depvar']).sum()

depvar
cog                0.027348
happy              0.035292
nap_time_mins     11.548798
sleep_report       0.048494
typing_time_hr    -0.083247
dtype: float64

#### c) Efectos estratificados con OLS

In [51]:
# Declarar columnas necesarias
d2['const'] = 1
d2[['aboveEarnT','aboveSleepT']] = d2[['aboveEarn','aboveSleep']].multiply(d2['T_nap'], axis=0)

# Variables independientes
X = ['const','T_nap','aboveEarn','aboveEarnT','aboveSleep','aboveSleepT']

# Regresiones
d = []
for y in Y:
    m = OLS(endog=d2[y], exog=d2[X], missing='drop').fit(cov_type='HC2')
    d.append(pd.concat([m.params, m.bse, m.tvalues, m.pvalues], axis=1).assign(depvar=y))

# A tabla
pd.concat(d, axis=0)

Unnamed: 0,0,1,2,3,depvar
const,5.354494e-15,1.661809e-15,3.222088,0.001272602,nap_time_mins
T_nap,11.35784,0.5604503,20.26556,2.5899679999999995e-91,nap_time_mins
aboveEarn,-1.876082e-15,1.357267e-15,-1.38225,0.1668949,nap_time_mins
aboveEarnT,-0.3861348,0.6482858,-0.595624,0.5514262,nap_time_mins
aboveSleep,-2.744478e-15,1.378699e-15,-1.990629,0.04652171,nap_time_mins
aboveSleepT,0.776289,0.646377,1.200985,0.229757,nap_time_mins
const,0.6594689,0.009250301,71.291612,0.0,sleep_report
T_nap,0.1347094,0.1167165,1.154159,0.2484352,sleep_report
aboveEarn,-0.008749581,0.01069882,-0.817808,0.413467,sleep_report
aboveEarnT,-0.08732765,0.07822818,-1.11632,0.2642853,sleep_report


### 5. Atrición
#### a) Reportar atrición

In [57]:
t = d2.groupby('T_nap')['drop_indicator'].agg(['size','sum'])
t['pct'] = t['sum'].div(t['size']) * 100
t.round(2)

Unnamed: 0_level_0,size,sum,pct
T_nap,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,226,89,39.38
1,226,25,11.06


#### b) Nuevo balance
Validez interna

In [94]:
d = []
for x in X:
    b, a = d2[d2['drop_indicator'].eq(0)].groupby('T_nap')[x].apply(np.array)
    test = stats.ttest_ind(a=a, b=b, equal_var=False, nan_policy='omit')
    d.append([x] + list(test))

t = pd.DataFrame(data=d, columns=['variable','t-stat','p-value'])
t.sort_values('variable').round(3)

Unnamed: 0,variable,t-stat,p-value
6,act_inbed,-1.504,0.134
1,age_,0.576,0.565
7,an_12_number_of_awakenings,-0.762,0.447
8,an_13_average_awakening_length,-0.673,0.502
3,education_,-3.132,0.002
2,female_,0.882,0.379
5,no_of_children_,2.387,0.018
4,sleep_night,-0.549,0.583
0,time_in_office,0.365,0.716
9,unemployed,1.029,0.304


Antes de la atrición, ninguna de las 10 variables tenía diferencias entre tratamiento y control que fueran estadísticamente significativas. Después de la atrición, los p-values son más chicos y algunas diferencias tienen significancia a nivel individual. Por ejemplo, las diferencias en `education_` y `no_of_children_` ahora son significativas al 1 y 2 porciento de confianza.

Representatividad externa

In [97]:
# Probar si diferencia es significativa Baseline VS Endline
d = []
for x in X:
    a, b = d2[x], d2.loc[d2['drop_indicator'].eq(0), x]
    test = stats.ttest_ind(a=a, b=b, equal_var=False, nan_policy='omit')
    d.append([x] + list(test))

# Resultados a tabla
t = pd.DataFrame(data=d, columns=['variable','t','p'])
t.sort_values('variable').round(3)

Unnamed: 0,variable,t,p
6,act_inbed,-0.166,0.868
1,age_,-0.878,0.38
7,an_12_number_of_awakenings,0.021,0.983
8,an_13_average_awakening_length,-0.118,0.906
3,education_,-0.748,0.455
2,female_,1.416,0.157
5,no_of_children_,1.247,0.213
4,sleep_night,0.005,0.996
0,time_in_office,-0.559,0.576
9,unemployed,0.866,0.387


La atrición parece no haber afectado la representatividad externa de la muestra. Ninguna de las diferencias son significativas individualmente.

#### c) Conclusión
La atrición fue sistemática entre el grupo de tratamiento y de control. Es decir, parece que el nivel de educación y el número de hijos determinan si alguien abandona o no el experimento. Esto nos lleva a un problema de validez interna porque los grupos de tratamiento y control después de la atrición no están balanceados.

Sin embargo, parece que la atrición no afectó la validez externa de la muestra, pues parece que las personas que abandonaron el estudio no afectaron las distribuciones de las variables de control. Ninguna de las 10 variables muestra una diferencia significativa antes y después de la atrición.

### 5. Lee Bounds
#### a) Perfiles