In [2]:
import pandas as pd 
import numpy as np 
import matplotlib as mpl
import  matplotlib.pyplot as plt
from pylift import TransformedOutcome
# Show full cell contents when rendering DataFrames instead of truncating long values.
pd.options.display.max_colwidth = None

<h2> Data </h2>  To explore the features of the pylift package, we use data from 
the Lalonde dataset, which was used to evaluate propensity score methods in the paper:

Dehejia, R., & Wahba, S. (1999). Causal Effects in Nonexperimental Studies: Reevaluating the Evaluation of Training Programs. Journal of the American Statistical Association, 94(448), 1053-1062. doi:10.2307/2669919

http://sekhon.berkeley.edu/matching/lalonde.html



<h2>Data Preparation </h2>
We read in the data in the same way as suggested by the authors of the pylift package. The dataset was suggested for the purpose of the package exploration.  

The treatment and control group data are available separately; therefore the two datasets need to be concatenated. 

In [4]:
# Column names applied to both files, which are read without a header row.
cols = ['treat', 'age', 'educ', 'black', 'hisp', 'married', 'nodegr','re74','re75','re78']

# The separator is a regular expression (one or more whitespace characters).
# It is written as a raw string: a plain '\s+' contains an invalid escape
# sequence, which Python reports as a DeprecationWarning / SyntaxWarning.
control_df = pd.read_csv('http://www.nber.org/~rdehejia/data/nswre74_control.txt', sep=r'\s+', header=None, names=cols)

treated_df = pd.read_csv('http://www.nber.org/~rdehejia/data/nswre74_treated.txt', sep=r'\s+', header=None, names=cols)

# Stack control rows first, then treated rows, and rebuild a 0..n-1 index.
lalonde_df = pd.concat([control_df, treated_df], ignore_index=True)


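The earnings from 1974 ($re74$) and 1975 ($re75$) are reported in real terms. For the analysis we are interested in whether a person had any earnings in those years or not. Therefore we create two dummy variables, $u74$ and $u75$, which equal 1 when the person had no earnings in the corresponding year and 0 otherwise. A similar approach is used for the outcome of interest, real earnings in 1978 ($re78$), which is converted to an indicator equal to 1 when earnings are positive. 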

In [6]:
# Zero-earnings indicators: 1.0 where the year's real earnings are exactly 0, else 0.0.
lalonde_df['u74'] = lalonde_df['re74'].eq(0).astype(float)
lalonde_df['u75'] = lalonde_df['re75'].eq(0).astype(float)

# Select the modelling columns, give treatment/outcome their new names, and
# binarise the outcome (1.0 where 1978 earnings are positive, else 0.0).
# rename/assign return new frames, so `df` is independent of `lalonde_df`.
df = (
    lalonde_df[['nodegr', 'black', 'hisp', 'age', 'educ', 'married', 'u74', 'u75', 'treat', 're78']]
    .rename(columns={'treat':'Treatment', 're78':'Outcome'})
    .assign(Outcome=lambda frame: frame['Outcome'].gt(0).astype(float))
)

In [8]:
# Only the last expression of a cell is rendered automatically, so the preview
# rows are shown explicitly; otherwise the result of head() is silently dropped.
display(df.head())
# Summary statistics for every column (rendered as the cell's output).
df.describe()

Unnamed: 0,nodegr,black,hisp,age,educ,married,u74,u75,Treatment,Outcome
count,445.0,445.0,445.0,445.0,445.0,445.0,445.0,445.0,445.0,445.0
mean,0.782022,0.833708,0.08764,25.370787,10.195506,0.168539,0.732584,0.649438,0.41573,0.692135
std,0.413337,0.372762,0.28309,7.100282,1.792119,0.374766,0.443109,0.477683,0.493402,0.46213
min,0.0,0.0,0.0,17.0,3.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,1.0,0.0,20.0,9.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,1.0,0.0,24.0,10.0,0.0,1.0,1.0,0.0,1.0
75%,1.0,1.0,0.0,28.0,11.0,0.0,1.0,1.0,1.0,1.0
max,1.0,1.0,1.0,55.0,16.0,1.0,1.0,1.0,1.0,1.0
