# Econometría Aplicada II
## Tarea 1
Importar librerías

In [None]:
# Clonar repo si estamos en colab
if 'google.colab' in str(get_ipython()):
    !git clone https://github.com/ArturoSbr/EmtrAp2-hw01
    cd EmtrAp2-hw01/cod

# Libs
import numpy as np
import pandas as pd
from scipy import stats
from statsmodels.api import OLS
from matplotlib import pyplot as plt

Importar datos

In [None]:
d1 = pd.read_csv('../dat/baseline.csv')
d2 = pd.read_csv('../dat/endline.csv')
d3 = pd.read_csv('../dat/completa.csv')

### 1. Balance
Tabla de balance por grupo de acuerdo a `T_nap`

In [None]:
# Declarar número de individuos por grupo
n0, n1 = d1.groupby('T_nap').size()

# Seleccionar 12 variables basales
X = ['age_','female','education_','no_of_children_','']
X = ['time_in_office','age_','female_','education_','sleep_night','no_of_children_','act_inbed',
     'an_12_number_of_awakenings','an_13_average_awakening_length','unemployed']

# Medias de variables basales por grupo
t = d1.groupby('T_nap')[X].agg(['mean','var'])

# Ajustes estéticos a t
t = t.transpose().reset_index()
t.columns = ['var','fun','control','treatment']
t = t.pivot(index='var', columns='fun')
t.columns = ['_'.join(col) for col in t.columns]

# Calcular estadísticos
t['tau'] = t['control_mean'] - t['treatment_mean']
t['tau_var'] = t['control_var'].div(n0) + t['treatment_var'].div(n1)
t['t'] = t['tau'].div(np.sqrt(t['tau_var']))
t['p'] = (1 - stats.norm().cdf(t['t'].abs())) * 2

# Resultado presentable
t = t[['control_mean','treatment_mean','tau','t','p']].reset_index()
t

Unnamed: 0,var,control_mean,treatment_mean,tau,t,p
0,act_inbed,7.992301,8.071253,-0.078952,-0.839935,0.400945
1,age_,34.938053,34.964602,-0.026549,-0.039027,0.968869
2,an_12_number_of_awakenings,31.715624,32.188211,-0.472587,-0.621987,0.53395
3,an_13_average_awakening_length,4.478105,4.477173,0.000931,0.007186,0.994267
4,education_,10.336283,10.030973,0.30531,1.128002,0.259319
5,female_,0.650442,0.672566,-0.022124,-0.496037,0.619869
6,no_of_children_,1.300885,1.40708,-0.106195,-1.068706,0.285202
7,sleep_night,5.559958,5.595827,-0.035869,-0.434612,0.663844
8,time_in_office,7.968159,7.942971,0.025188,0.366232,0.714192
9,unemployed,0.247788,0.283186,-0.035398,-0.339135,0.734508


Evaluación conjunta de significancia

In [None]:
# T_nap en función de controles
m = OLS(endog=d1['T_nap'], exog=d1[X].assign(const = 1)).fit()

# p-value de prueba
m.f_pvalue

0.9615038681207262

### 2. Efectos de tratamiento
Declarar todas las variables dependientes

In [None]:
# Crear medida estandarizada de habilidades cognitivas
d2['cogni'] = d2[['corsi_measure','hf_measure','pvt_measure']] \
    .apply(lambda x: (x - x.mean()) / x.std()).mean(axis=1)

# Variables dependientes
Y = ['productivity','tot_earnings','cogni','daily_savings','happy','ds_g1_satisfaction','typing_time_hr']

#### a) Estimadores de Neyman

In [None]:
# Función para estimador de Neyman
def neyman(frame, treatment_col, values_col):
    # Sacar arreglos C y T
    a, b = frame.dropna(subset=[values_col]).groupby(treatment_col)[values_col].apply(np.array)
    # Estadístico t
    tau = np.mean(b) - np.mean(a)
    t = tau / np.sqrt(np.var(a, ddof=1) / len(a) + np.var(b, ddof=1) / len(b))
    # p-value
    p = 2 * (1 - stats.norm().cdf(np.abs(t)))
    return (values_col, tau, t, p)

# Diferencias de Neyman
t_ney = pd.DataFrame(data=[neyman(d2, 'T_nap', y) for y in Y], columns=['depvar','diff','t-stat','p'])
t_ney

Unnamed: 0,depvar,diff,t-stat,p
0,productivity,-171.240947,-0.976215,0.328958
1,tot_earnings,-21.102813,-1.510394,0.130943
2,cogni,0.027056,0.492507,0.622361
3,daily_savings,0.757374,0.966742,0.333673
4,happy,0.033502,0.990464,0.321947
5,ds_g1_satisfaction,0.215989,1.044944,0.296049
6,typing_time_hr,-0.087307,-0.634157,0.525978


#### b) Estimadores OLS sin controles

In [None]:
# Correr regresiones
d = []
for y in Y:
    m = OLS(endog=d2[y], exog=d2.assign(const = 1)[['const','T_nap']], missing='drop')
    m = m.fit(cov_type='HC2')
    d.append([y, m.params['T_nap'], m.tvalues['T_nap'], m.pvalues['T_nap']])

# Regresiones a tabla
t_ols = pd.DataFrame(data=d, columns=['depvar','diff','t-stat','p'])
t_ols

Unnamed: 0,depvar,diff,t-stat,p
0,productivity,-171.240947,-0.976215,0.328958
1,tot_earnings,-21.102813,-1.510394,0.130943
2,cogni,0.027056,0.492507,0.622361
3,daily_savings,0.757374,0.966742,0.333673
4,happy,0.033502,0.990464,0.321947
5,ds_g1_satisfaction,0.215989,1.044944,0.296049
6,typing_time_hr,-0.087307,-0.634157,0.525978


#### c) Estimadores con controles
De acuerdo al paper, $X_i$ contiene `age_` en cuartiles, `female_` y la variable que indica si $i$ fue asignado a trabajar o a tomarse un break en vez de tomar una siesta.

Como esta pregunta usa la base con promedios durante los 20 días de estudio, la variable que indica la actividad asignada cada día a los individuos del grupo de control no está disponible.

In [35]:
# Edad a cuartiles y luego a dummies
d2['age_q'] = pd.qcut(x=d2['age_'], q=4, labels=[f'q{i}' for i in range(1,5)])
d2 = pd.get_dummies(data=d2, prefix='age_', prefix_sep='', columns=['age_q'], )

# Tratamiento y controles
X = ['T_nap','const','age_q2','age_q3','age_q4','female_']

# Correr regresiones
d = []
for y in Y:
    m = OLS(endog=d2[y], exog=d2.assign(const = 1)[X], missing='drop')
    m = m.fit(cov_type='HC2')
    d.append([y, m.params['T_nap'], m.tvalues['T_nap'], m.pvalues['T_nap']])

# Regresiones a tabla
t_ctr = pd.DataFrame(data=d, columns=['depvar','diff','t-stat','p'])
t_ctr

Unnamed: 0,depvar,diff,t-stat,p
0,productivity,-196.63582,-1.178943,0.238421
1,earnings,-14.563905,-1.060604,0.28887
2,typing_time_hr,-0.083263,-0.608418,0.54291
3,cog,0.029186,0.534748,0.592824
4,well,0.232225,1.121202,0.262202


#### d) Resultados a tabla

In [47]:
t.head()

Unnamed: 0,method,depvar,diff,p
3,Neyman,cog,0.027056,0.622361
3,OLS,cog,0.027056,0.622361
3,MLR,cog,0.029186,0.592824
1,Neyman,earnings,-14.753231,0.283667
1,OLS,earnings,-14.753231,0.283667


In [49]:
pd.wide_to_long(df=t, stubnames=['diff','p'], i='depvar', j='method')

ValueError: ignored

In [40]:
# Concatenar resultados
t = pd.concat([t_ney.assign(method='Neyman'),
               t_ols.assign(method='OLS'),
               t_ctr.assign(method='MLR')])

# 

# Limpiar tabla
t = t[['method','depvar','diff','p']].sort_values('depvar')
t.pivot(index='depvar', columns='method', values='diff')

method,MLR,Neyman,OLS
depvar,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cog,0.029186,0.027056,0.027056
earnings,-14.563905,-14.753231,-14.753231
productivity,-196.63582,-171.240947,-171.240947
typing_time_hr,-0.083263,-0.087307,-0.087307
well,0.232225,0.215989,0.215989


### 3. Fischer's Exact Test

In [None]:
stats.permutation_test(data=(d2.loc[d2['T_nap'].eq(1) & d2['tot_earnings'].notna(), 'tot_earnings'],
                             d2.loc[d2['T_nap'].eq(0) & d2['tot_earnings'].notna(), 'tot_earnings']),
                       statistic=lambda x, y: np.mean(x) - np.mean(y),
                       n_resamples=1000,
                       random_state=42)

TypeError: cannot unpack non-iterable PermutationTestResult object