In [None]:
from datascience import *
from datascience.predicates import are
import numpy as np
import matplotlib
# Select the non-interactive Agg backend before pyplot is imported.
# The former `warn=False` keyword is omitted: Matplotlib deprecated it in 3.1
# and removed it in 3.3, where passing it raises TypeError. On a fresh kernel
# nothing has loaded a backend yet, so there is no warning to suppress.
matplotlib.use('Agg')
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

In [None]:
def import_data(data_path="AEJApp-2010-0343_Data_and_Code/investments_data.dta",
                csv_path="investments.csv"):
    """Load the investments Stata file, derive an income column, filter it, and export it.

    Steps, in order:
      1. Read ``data_path`` with ``pandas.read_stata`` and wrap it in a Table.
      2. Build ``home_prod_tot_w3`` as the sum of ``crop_sales / 12``,
         ``ani_sales_a / 6``, ``homeprod`` and ``ani_prod_sales_a``, treating
         missing values as 0.
      3. Drop rows whose ``hhsize_ae2`` is NaN and divide the total by it.
      4. Keep rows whose per-household-size value is strictly between 0 and
         200, whose ``wave`` is 3 or 7, and whose ``up_cwagepm`` is strictly
         between 0 and 10000.
      5. Add the boolean ``Group`` column and the text ``Group name`` column,
         relabel the community and distance columns, and keep five columns.
      6. Write the result to ``csv_path`` and return it.

    Args:
        data_path: Path of the Stata (.dta) file to read. Defaults to the
            location the notebook originally hard-coded.
        csv_path: Path of the CSV file written as a side effect. Defaults to
            the file name the notebook originally hard-coded.

    Returns:
        A Table with the columns "Average investment income (pesos/year)",
        "Group", "Group name", "Community ID" and "Distance to nearest city".
    """
    import pandas as pd  # only needed here, to read the Stata file

    investments = Table.from_df(pd.read_stata(data_path))

    # Missing sales/production figures are treated as zero before summing.
    # NOTE(review): the /6 and /12 divisors look like a per-month rescaling,
    # yet the final column label says "pesos/year" -- confirm the intended unit.
    investments['ani_sales_a_m'] = np.nan_to_num(investments['ani_sales_a']) / 6
    investments['crop_sales_m'] = np.nan_to_num(investments['crop_sales']) / 12
    investments['home_prod_tot_w3'] = (
        investments['crop_sales_m']
        + investments['ani_sales_a_m']
        + np.nan_to_num(investments['homeprod'])
        + np.nan_to_num(investments['ani_prod_sales_a'])
    )

    # Rows without a household size cannot be normalised, so drop them first.
    investments = investments.where(~np.isnan(investments['hhsize_ae2']))
    investments['home_prod_tot_w3_pp_ae2'] = investments['home_prod_tot_w3'] / investments['hhsize_ae2']

    # Trim zeros and large numbers for visualization purposes.
    investments = investments.where('home_prod_tot_w3_pp_ae2', are.strictly_between(0, 200))
    investments = investments.where('wave', are.equal_to(3) | are.equal_to(7))
    investments = investments.where('up_cwagepm', are.strictly_between(0, 10000))

    # 'Group' is True for the rows that 'Group name' calls "Treatment" (wave 7).
    # It was previously `wave == 3`, which contradicted 'Group name' below and
    # the notebook's later cells, where a truthy/1 Group is labelled "Treatment".
    investments['Group'] = investments['wave'] == 7
    investments['Average investment income (pesos/year)'] = investments['home_prod_tot_w3_pp_ae2']
    investments.relabel('comuid', "Community ID")
    investments.relabel('min_dist', 'Distance to nearest city')
    investments['Group name'] = investments.apply(lambda w: "Treatment" if w == 7 else "Control", "wave")

    investments = investments.select(
        "Average investment income (pesos/year)",
        "Group",
        "Group name",
        "Community ID",
        "Distance to nearest city",
    )
    investments.to_csv(csv_path)

    return investments

In [None]:
# Build the cleaned investments table; import_data() also writes it to CSV as a side effect.
investments = import_data()

In [None]:
# Collapse to one row per (Community ID, Group) pair, aggregating every other column with np.mean.
by_comuid = investments.group(['Community ID', 'Group'], np.mean)
# The aggregated columns are expected to be labelled "<label> mean"; rename them back to the
# original labels so downstream code can keep using the same column names.
for c in investments.labels:
    if c != 'Community ID' and c != 'Group':
        by_comuid.relabel(c + ' mean', c)
# Rebuild the text label from the boolean 'Group' column (the aggregated 'Group name' is overwritten here).
# NOTE(review): this maps a truthy 'Group' to "Treatment"; confirm it matches how import_data
# defines 'Group' and 'Group name'.
by_comuid['Group name'] = by_comuid.apply(lambda w: "Treatment" if w else "Control", "Group")

In [None]:
# Persist the per-community averages to progresa.csv.
by_comuid.to_csv("progresa.csv")

In [None]:
def stratified_sample(tbl, column, size):
    """Draw an equal-sized sample, without replacement, from every distinct value of ``column``.

    Args:
        tbl: Table-like object supporting ``tbl[column]``, ``where``,
            ``sample`` and an in-place ``append``.
        column: Label of the column whose distinct values define the strata.
        size: Target total number of rows; each stratum contributes
            ``int(np.round(size / number_of_strata))`` rows, so the actual
            total can differ from ``size`` after rounding.

    Returns:
        The sample drawn from the first stratum (in ``np.unique`` order),
        extended in place with the samples from the remaining strata.
    """
    strata = np.unique(tbl[column])
    per_stratum = int(np.round(size / len(strata)))

    def draw(value):
        # Rows belonging to one stratum, sampled without replacement.
        return tbl.where(column, value).sample(per_stratum, with_replacement=False)

    first, *remaining = strata
    combined = draw(first)
    for value in remaining:
        combined.append(draw(value))
    return combined

In [None]:
# Draw a sample of about 200 community rows, split evenly across the values of 'Group'.
# NOTE(review): no random seed is set in the visible cells, so this sample (and the
# exported CSV below) presumably differs between runs -- confirm whether that is intended.
s = stratified_sample(by_comuid, 'Group', 200)
# One scatter plot per outcome, with the boolean 'Group' on the x-axis; the ticks at
# 0 and 1 are relabelled "Control" and "Treatment" respectively.
s.scatter('Group', 'Distance to nearest city', alpha=.2)
plots.xticks([0, 1], ['Control', 'Treatment'], rotation='vertical');
s.scatter('Group', "Average investment income (pesos/year)", alpha=.2)
plots.xticks([0, 1], ['Control', 'Treatment'], rotation='vertical');
# Save the sampled rows so the plotted subset can be reused.
s.to_csv("progresa_sample.csv")