# ECON 148 Project

### Preliminary Data Cleaning for Replication

In [3]:
# Load in needed packages

import zipfile
import os
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf

In [4]:
# Extract the replication archive into a local folder and list what it contains

zip_path = '113599-V1.zip'
extract_to = 'extracted_data'

# ZipFile opens in read mode by default; the context manager closes the archive
with zipfile.ZipFile(zip_path) as archive:
    archive.extractall(path=extract_to)

extracted_files = os.listdir(extract_to)
print(extracted_files)

['2013-0533_do_figures.do', '2013-0533_data_endlines1and2.dta', '2013-0533_data_census.dta', '2013-0533_data_baseline.dta', '2013-0533_do_tables.do', '2013-0533_data_endline1businesstype.dta', 'Readme.pdf']


In [5]:
# Read the Stata do-file for the figures so its steps can be replicated in Python

figures_do_path = os.path.join(extract_to, '2013-0533_do_figures.do')
with open(figures_do_path, 'r') as do_file:
    content_figures = do_file.read()

# Uncomment to inspect the do-file text
# print(content_figures)

In [62]:
# Read the Stata do-file for the tables so its steps can be replicated in Python

tables_do_path = os.path.join(extract_to, '2013-0533_do_tables.do')
with open(tables_do_path, 'r') as do_file:
    content_tables = do_file.read()

# Uncomment to inspect the first 5000 characters of the do-file text
# print(content_tables[:5000])

In [7]:
# Build the path to each Stata (.dta) data file inside the extracted archive
file_path = os.path.join(extract_to, '2013-0533_data_endlines1and2.dta')

file_path_2 = os.path.join(extract_to, '2013-0533_data_baseline.dta')

file_path_3 = os.path.join(extract_to, '2013-0533_data_census.dta')

file_path_4 = os.path.join(extract_to, '2013-0533_data_endline1businesstype.dta')

# Load each .dta file into its own DataFrame
endlines = pd.read_stata(file_path)

baseline = pd.read_stata(file_path_2)

census = pd.read_stata(file_path_3)

# The business-type file gets its own name. It was previously assigned to
# `endlines`, which replaced the endlines 1 & 2 data loaded above, so that
# dataset was never actually available to later cells.
# NOTE(review): any later cell that used `endlines` expecting the business-type
# data should switch to `endline1_businesstype`.
endline1_businesstype = pd.read_stata(file_path_4)

In [8]:
# First 5 rows of DataFrames

# endlines.head()

In [9]:
# baseline.head()

In [10]:
# census.head()

In [11]:
# endlines.head()

### Table 1A Replication

In [13]:
# Column groups from the baseline DataFrame used for Table 1A

hh_composition = ["hh_size", "adults", "children", "male_head", "head_age", "head_noeduc"]
credit_access = ["spandana", "othermfi", "bank", "informal", "anyloan"]
loan_amt = ["spandana_amt", "othermfi_amt", "bank_amt", "informal_amt", "anyloan_amt"]
self_emp_activ = ["total_biz", "female_biz", "female_biz_pct"]
businesses = ["bizrev", "bizexpense", "bizinvestment", "bizemployees", "hours_weekbiz"]

# Names of the "_allHH" counterparts of the business variables
businesses_allHH = [f"{var}_allHH" for var in businesses]

# Each "_allHH" column is a copy of the original business variable in which
# rows with total_biz == 0 are set to 0 (all other values are left unchanged).
no_business = baseline["total_biz"] == 0
for var, new_var in zip(businesses, businesses_allHH):
    baseline[new_var] = baseline[var]
    baseline.loc[no_business, new_var] = 0

# Consumption variables
consumption = ["total_exp_mo", "nondurable_exp_mo", "durables_exp_mo", "home_durable_index"]

# Single list holding every variable group, in table order
allvars = (
    hh_composition
    + credit_access
    + loan_amt
    + self_emp_activ
    + businesses
    + businesses_allHH
    + consumption
)

In [14]:
# Keep only the baseline rows whose treatment assignment is not missing

baseline_treat_notna = baseline.dropna(subset=['treatment'])

In [42]:
# Summary statistics for the control group

is_control = baseline_treat_notna['treatment'] == 'Control'

# The first three rows of describe() here are count, mean and std; drop the rest
baseline_control_stats = baseline_treat_notna.loc[is_control].describe().iloc[:3]
baseline_control_stats

Unnamed: 0,hhid_baseline,areaid,hh_size,adults,children,male_head,head_age,head_noeduc,spandana_amt,othermfi_amt,...,hours_weekbiz,total_exp_mo,nondurable_exp_mo,durables_exp_mo,home_durable_index,bizrev_allHH,bizexpense_allHH,bizinvestment_allHH,bizemployees_allHH,hours_weekbiz_allHH
count,1220.0,1220.0,1220.0,1220.0,1220.0,1216.0,1216.0,1216.0,1213.0,1213.0,...,295.0,1220.0,1220.0,1220.0,1220.0,1220.0,1220.0,1220.0,1220.0,1220.0
mean,1461.319672,55.196721,5.037705,3.438525,1.59918,0.907072,41.149671,0.370066,0.0,201.154163,...,76.315254,4888.431152,4734.685367,153.745696,1.941369,3866.590164,874.703064,93.02459,0.040984,18.453279
std,802.220946,29.577734,1.666212,1.465599,1.228256,0.29045,10.839448,0.483021,0.0,2742.363893,...,66.054443,4074.37207,3839.802498,584.594145,0.829068,27146.786829,12932.665039,1559.052264,0.413027,46.053651


In [44]:
# Summary statistics for the treatment group

is_treated = baseline_treat_notna['treatment'] == 'Treatment'

# The first three rows of describe() here are count, mean and std; drop the rest
baseline_treatment_stats = baseline_treat_notna.loc[is_treated].describe().iloc[:3]
baseline_treatment_stats

Unnamed: 0,hhid_baseline,areaid,hh_size,adults,children,male_head,head_age,head_noeduc,spandana_amt,othermfi_amt,...,hours_weekbiz,total_exp_mo,nondurable_exp_mo,durables_exp_mo,home_durable_index,bizrev_allHH,bizexpense_allHH,bizinvestment_allHH,bizemployees_allHH,hours_weekbiz_allHH
count,1220.0,1220.0,1220.0,1220.0,1220.0,1217.0,1215.0,1217.0,1218.0,1218.0,...,283.0,1220.0,1220.0,1220.0,1220.0,1220.0,1220.0,1220.0,1219.0,1219.0
mean,1388.204918,48.459016,5.132787,3.427049,1.703279,0.894823,40.906173,0.362366,68.965517,371.510673,...,71.727915,5158.11377,4986.504658,171.609084,1.968313,4770.239344,991.19928,92.92623,0.098441,16.652174
std,805.1649,28.317069,1.784651,1.520035,1.242067,0.306907,10.889716,0.480882,1032.824822,6146.870019,...,51.60722,4605.975586,4381.833958,556.233981,0.848844,30046.876348,9025.120117,1333.801076,1.196666,39.172836


In [64]:
# Treatment-minus-control difference of the summary tables.
# Only the "mean" row is the difference in group means; the "count" and "std"
# rows are plain element-wise differences of those statistics.

difference = baseline_treatment_stats.sub(baseline_control_stats)
difference

Unnamed: 0,hhid_baseline,areaid,hh_size,adults,children,male_head,head_age,head_noeduc,spandana_amt,othermfi_amt,...,hours_weekbiz,total_exp_mo,nondurable_exp_mo,durables_exp_mo,home_durable_index,bizrev_allHH,bizexpense_allHH,bizinvestment_allHH,bizemployees_allHH,hours_weekbiz_allHH
count,0.0,0.0,0.0,0.0,0.0,1.0,-1.0,1.0,5.0,5.0,...,-12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0
mean,-73.114754,-6.737705,0.095082,-0.011475,0.104098,-0.012249,-0.243498,-0.007699,68.965517,170.35651,...,-4.587339,269.682617,251.819291,17.863388,0.026943,903.64918,116.496216,-0.098361,0.057458,-1.801105
std,2.943954,-1.260664,0.118438,0.054436,0.01381,0.016457,0.050268,-0.002139,1032.824822,3404.506126,...,-14.447223,531.603516,542.03146,-28.360164,0.019776,2900.08952,-3907.544922,-225.251188,0.783639,-6.880815
