# ECON 148 Project

### Preliminary Data Cleaning for Replication

In [1]:
# Load in needed packages

import zipfile
import os
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
from statsmodels.api import OLS, add_constant
from linearmodels.panel import PanelOLS
from statsmodels.stats.api import anova_lm

In [2]:
# Unzip zip file

zip_path = '../Data/113599-V1.zip'

extract_to = './extracted_data'

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_to)

print(os.listdir(extract_to))

['2013-0533_do_figures.do', '2013-0533_data_endlines1and2.dta', '2013-0533_data_census.dta', '2013-0533_data--TO-SUBMIT-', '2013-0533_data_baseline.dta', '2013-0533_do_tables.do', 'LICENSE.txt', '2013-0533_data_endline1businesstype.dta', 'Readme.pdf']


In [34]:
# # Open .do files to understand how to replicate figures
# extract_to = './extracted_data/2013-0533_data--TO-SUBMIT-'
# with open(os.path.join(extract_to, '2013-0533_do_figures.do'), 'r') as f:
#     content_figures = f.read()

# print(content_figures)

In [35]:
# Open .do files to understand how to replicate tables

# with open(os.path.join(extract_to, '2013-0533_do_tables.do'), 'r') as f:
#     content_tables = f.read()

# print(content_tables[:5000])

In [5]:
# Create paths for .dta files
file_path = os.path.join(extract_to, '2013-0533_data_endlines1and2.dta')

file_path_2 = os.path.join(extract_to, '2013-0533_data_baseline.dta')

file_path_3 = os.path.join(extract_to, '2013-0533_data_census.dta')

file_path_4 = os.path.join(extract_to, '2013-0533_data_endline1businesstype.dta')

# Create DataFrames for the different .dta files
endlines = pd.read_stata(file_path)

baseline = pd.read_stata(file_path_2)

census = pd.read_stata(file_path_3)

endlines = pd.read_stata(file_path_4)

In [6]:
# First 5 rows of DataFrames

# endlines.head()

In [7]:
# baseline.head()

In [8]:
# census.head()

In [9]:
# endlines.head()

### Table 1A Replication

In [10]:
# Create groups for variables in baseline DataFrame

hh_composition = ["hh_size", "adults", "children", "male_head", "head_age", "head_noeduc"]
credit_access = ["spandana", "othermfi", "bank", "informal", "anyloan"]
loan_amt = ["spandana_amt", "othermfi_amt", "bank_amt", "informal_amt", "anyloan_amt"]
self_emp_activ = ["total_biz", "female_biz", "female_biz_pct"]
businesses = ["bizrev", "bizexpense", "bizinvestment", "bizemployees", "hours_weekbiz"]

# Create group for businesses variables
businesses_allHH = []

# Create variables for businesses variables representing 1 if total_biz is 1 and 0 if total_biz is 0
for var in businesses:
    new_var = f"{var}_allHH"
    businesses_allHH.append(new_var)
    baseline[new_var] = baseline[var]
    baseline.loc[baseline["total_biz"] == 0, new_var] = 0

# Create group for consumption variables

consumption = ["total_exp_mo", "nondurable_exp_mo", "durables_exp_mo", "home_durable_index"]

# Make list with all variables
allvars = hh_composition + credit_access + loan_amt + self_emp_activ + businesses + businesses_allHH + consumption

In [11]:
# Drop NAs

baseline = baseline[baseline['treatment'].notna()]
baseline = baseline.reset_index()

# Convert treatment to binary: 1 = Treatment, 0 = Control
baseline["treatment_binary"] = baseline["treatment"].astype(str).map({"Control": 0, "Treatment": 1})

In [12]:
# Convert categorical columns to object type first
baseline['spandana'] = baseline['spandana'].astype('object')
baseline['othermfi'] = baseline['othermfi'].astype('object')
baseline['bank'] = baseline['bank'].astype('object')
baseline['informal'] = baseline['informal'].astype('object')
baseline['anyloan'] = baseline['anyloan'].astype('object')

# Replace 'Yes' with 1, 'No' with 0
baseline['spandana'].replace({'Yes': 1, 'No': 0}, inplace=True)
baseline['othermfi'].replace({'Yes': 1, 'No': 0}, inplace=True)
baseline['bank'].replace({'Yes': 1, 'No': 0}, inplace=True)
baseline['informal'].replace({'Yes': 1, 'No': 0}, inplace=True)
baseline['anyloan'].replace({'Yes': 1, 'No': 0}, inplace=True)

# Fill NaN with -1
baseline['spandana'].fillna(-1, inplace=True)
baseline['othermfi'].fillna(-1, inplace=True)
baseline['bank'].fillna(-1, inplace=True)
baseline['informal'].fillna(-1, inplace=True)
baseline['anyloan'].fillna(-1, inplace=True)

In [13]:
import statsmodels.api as sm

# List of variables to include
vars_to_analyze = ['spandana', 'othermfi', 'bank', 'informal', 'anyloan']

# Prepare output list
table1a_binary_results = []

# Ensure binary treatment and areaid category
baseline['treatment_binary'] = baseline['treatment'].map({'Control': 0, 'Treatment': 1}).astype(int)
baseline['areaid'] = baseline['areaid'].astype('category')

for var in vars_to_analyze:
    temp = baseline[[var, 'treatment_binary', 'areaid']].copy()
    temp = temp[temp[var].isin([0, 1])]
    temp = temp.dropna(subset=[var, 'treatment_binary', 'areaid'])

    if temp.empty:
        table1a_results.append([None, None, None, None, None])
        continue

    control = temp[temp['treatment_binary'] == 0]
    N = control.shape[0]
    mean_c = control[var].mean()
    std_c = control[var].std()

    temp['const'] = 1.0
    model = sm.OLS(temp[var], temp[['const', 'treatment_binary']])
    results = model.fit(cov_type='cluster', cov_kwds={'groups': temp['areaid']})

    diff = results.params['treatment_binary']
    p_val = results.pvalues['treatment_binary']

    table1a_binary_results.append([N, mean_c, std_c, diff, p_val])

# Create results DataFrame
table1a_binary_df = pd.DataFrame(table1a_binary_results,
                          columns=["Obs", "Control_mean", "Control_sd", "Difference_in_mean", "P_val"],
                          index=vars_to_analyze)

table1a_binary_df

Unnamed: 0,Obs,Control_mean,Control_sd,Difference_in_mean,P_val
spandana,1213,0.0,0.0,0.006568,0.192208
othermfi,1213,0.010717,0.10301,0.006524,0.450831
bank,1213,0.036274,0.187047,0.001493,0.858449
informal,1213,0.632317,0.482373,0.001509,0.958114
anyloan,1213,0.680132,0.466618,0.002134,0.941766


In [14]:
import statsmodels.api as sm

# Define full list of variables you want to analyze
# (This must be defined earlier in your script)
# Example:
# allvars = ['income', 'savings', 'education', 'spandana', 'othermfi', ...]

# Binary vars to exclude
vars_to_analyze = ['spandana', 'othermfi', 'bank', 'informal', 'anyloan']

# Filter to continuous variables only
non_binary_vars = [var for var in allvars if var not in vars_to_analyze]

# Prepare output list
table1a_nonbinary_results = []


for var in non_binary_vars:
    temp = baseline[[var, 'treatment_binary', 'areaid']].copy()

    # Control group stats
    control = temp[temp['treatment_binary'] == 0]
    N = control.shape[0]
    mean_c = control[var].mean()
    std_c = control[var].std()

    # Regression
    temp['const'] = 1.0
    model = sm.OLS(temp[var], temp[['const', 'treatment_binary']])
    results = model.fit(cov_type='cluster', cov_kwds={'groups': temp['areaid']})

    # Extract results
    diff = results.params['treatment_binary']
    p_val = results.pvalues['treatment_binary']

    table1a_nonbinary_results.append([N, mean_c, std_c, diff, p_val])

# Create final results table
table1a_nonbinary_df = pd.DataFrame(table1a_nonbinary_results,
                          columns=["Obs", "Control_mean", "Control_sd", "Difference_in_mean", "P_val"],
                          index=non_binary_vars)
table1a_nonbinary_df

Unnamed: 0,Obs,Control_mean,Control_sd,Difference_in_mean,P_val
hh_size,1220,5.037705,1.666212,0.095082,0.300973
adults,1220,3.438525,1.465599,-0.011475,0.872767
children,1220,1.59918,1.228256,0.104098,0.095204
male_head,1220,0.907072,0.29045,,
head_age,1220,41.149671,10.839448,,
head_noeduc,1220,0.370066,0.483021,,
spandana_amt,1220,0.0,0.0,,
othermfi_amt,1220,201.154163,2742.363893,,
bank_amt,1220,7438.169827,173268.343989,,
informal_amt,1220,28460.016488,65312.1557,,


In [15]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

# Assume allvars is already defined
# Example:
# allvars = list(baseline.columns)

# Binary vars to exclude from this loop
binary_vars = ['spandana', 'othermfi', 'bank', 'informal', 'anyloan']
non_binary_vars = [var for var in allvars if var not in binary_vars]

# Store regression results here
table1a_nonbinary_results = []

for var in non_binary_vars:
    temp = baseline[[var, 'treatment_binary', 'areaid']].copy()

    # Drop rows with missing values to avoid crash
    temp = temp.dropna()

    # Control group stats
    control = temp[temp['treatment_binary'] == 0]
    N = control.shape[0]
    mean_c = control[var].mean()
    std_c = control[var].std()

    # Default values if regression fails
    diff = np.nan
    p_val = np.nan

    # Check if both treatment and variable have variation
    if temp[var].std() > 0 and temp['treatment_binary'].std() > 0:
        try:
            temp['const'] = 1.0
            model = sm.OLS(temp[var], temp[['const', 'treatment_binary']])
            results = model.fit(cov_type='cluster', cov_kwds={'groups': temp['areaid']})
            diff = results.params['treatment_binary']
            p_val = results.pvalues['treatment_binary']
        except:
            # You can print/log here if needed
            diff = np.nan
            p_val = np.nan
    else:
        # No variation = no regression → set safe defaults
        diff = 0.0
        p_val = 1.0

    table1a_nonbinary_results.append([N, mean_c, std_c, diff, p_val])

# Build final DataFrame
table1a_nonbinary_df = pd.DataFrame(
    table1a_nonbinary_results,
    columns=["Obs", "Control_mean", "Control_sd", "Difference_in_mean", "P_val"],
    index=non_binary_vars
)

# Optional: sort or save
# table1a_nonbinary_df.to_csv("table1a.csv")

# View the table
table1a_nonbinary_df

Unnamed: 0,Obs,Control_mean,Control_sd,Difference_in_mean,P_val
hh_size,1220,5.037705,1.666212,0.095082,0.300973
adults,1220,3.438525,1.465599,-0.011475,0.872767
children,1220,1.59918,1.228256,0.104098,0.095204
male_head,1216,0.907072,0.29045,-0.012249,0.379004
head_age,1216,41.149671,10.839448,-0.243498,0.675317
head_noeduc,1216,0.370066,0.483021,-0.007699,0.78638
spandana_amt,1213,0.0,0.0,68.965517,0.188973
othermfi_amt,1213,201.154163,2742.363893,170.35651,0.567014
bank_amt,1213,7438.169827,173268.343989,-5419.69692,0.276039
informal_amt,1213,28460.016488,65312.1557,-570.459838,0.855498


In [16]:
# View the table
table1a_nonbinary_df

Unnamed: 0,Obs,Control_mean,Control_sd,Difference_in_mean,P_val
hh_size,1220,5.037705,1.666212,0.095082,0.300973
adults,1220,3.438525,1.465599,-0.011475,0.872767
children,1220,1.59918,1.228256,0.104098,0.095204
male_head,1216,0.907072,0.29045,-0.012249,0.379004
head_age,1216,41.149671,10.839448,-0.243498,0.675317
head_noeduc,1216,0.370066,0.483021,-0.007699,0.78638
spandana_amt,1213,0.0,0.0,68.965517,0.188973
othermfi_amt,1213,201.154163,2742.363893,170.35651,0.567014
bank_amt,1213,7438.169827,173268.343989,-5419.69692,0.276039
informal_amt,1213,28460.016488,65312.1557,-570.459838,0.855498


In [17]:
# Find index position of 'head_age'
insert_loc = table1a_nonbinary_df.index.get_loc('head_noeduc') + 1

# Split the nonbinary table into two parts: before and after insertion point
top = table1a_nonbinary_df.iloc[:insert_loc]
bottom = table1a_nonbinary_df.iloc[insert_loc:]

# Combine with the binary df inserted in the middle
table1a_combined_df = pd.concat([top, table1a_binary_df, bottom])

# Optional: reset the index name if you want to make it cleaner for output
table1a_combined_df.index.name = 'Variable'

In [18]:
# Round all float values to 3 decimal places
table1a_combined_df = table1a_combined_df.round(3)

# View the result
table1a_combined_df

Unnamed: 0_level_0,Obs,Control_mean,Control_sd,Difference_in_mean,P_val
Variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
hh_size,1220,5.038,1.666,0.095,0.301
adults,1220,3.439,1.466,-0.011,0.873
children,1220,1.599,1.228,0.104,0.095
male_head,1216,0.907,0.29,-0.012,0.379
head_age,1216,41.15,10.839,-0.243,0.675
head_noeduc,1216,0.37,0.483,-0.008,0.786
spandana,1213,0.0,0.0,0.007,0.192
othermfi,1213,0.011,0.103,0.007,0.451
bank,1213,0.036,0.187,0.001,0.858
informal,1213,0.632,0.482,0.002,0.958


## Table 1B

In [19]:
# Open .do files to understand how to replicate tables

with open(os.path.join(extract_to, '2013-0533_do_tables.do'), 'r') as f:
    content_tables = f.read()

print(content_tables[:7000])

/*******************************************************************************      
Program Name: 	2013-0533_do_tables  
Contact:  		Cynthia Kinnan (c-kinnan@northwestern.edu)
Last Modified: 	5 May 2014
Purpose: 		Replicates all tables from "The miracle of microfinance? Evidence
				from a randomized evaluation" (Banerjee et al.), AEJ, 2014
Files Used: 	2013-0533_data_baseline.dta
				2013-0533_data_endlines1and2.dta
				2013-0533_data_census.dta
				2013-0533_data_endline1businesstype.dta
Files Created:	table1a.txt
				table1b.txt
				table2.txt
				table3.txt
				table3b.txt
				table3c.txt
				table4.txt
				table5.txt
				table6.txt
				table7.txt
				table_index_pvals.txt
				tableA1.txt
				tableA2.txt
				tableA3.txt
				tableA4.txt
				tableA5.txt
*******************************************************************************/
cap log close
clear all
set more off
set mem 100m
pause on

*DATA DIRECTORY
global datadir "C:/Users/hreppst/Dropbox/Spandana/Paper/AEJ Final/Data/"



In [20]:
endlines = pd.read_stata(file_path)
endlines

Unnamed: 0,hhid,areaid,treatment,w,w1,w2,sample1,sample2,old_biz,any_old_biz,...,labor_index_1,consumption_index_1,social_index_1,credit_index_2,biz_index_all_2,biz_index_old_2,income_index_2,labor_index_2,consumption_index_2,social_index_2
0,1,1,Treatment,0.819672,0.777092,0.819672,Yes,Yes,0.0,No,...,-0.104168,-0.166209,-0.078313,-0.420189,-0.319804,,0.089781,0.220555,-0.156785,-0.058573
1,2,1,Treatment,1.000000,1.000000,1.000000,Yes,Yes,0.0,No,...,-0.632081,-0.029637,0.201462,-0.322687,-0.319804,,-0.176772,0.665412,0.199052,-0.009926
2,3,1,Treatment,1.000000,1.000000,1.000000,Yes,Yes,1.0,Yes,...,-0.327149,-0.087983,-0.096513,-0.478224,-0.015660,-0.268047,-0.441344,-0.297447,0.159275,-0.144019
3,4,1,Treatment,1.000000,1.000000,1.000000,Yes,Yes,1.0,Yes,...,0.050992,-0.239553,-0.118175,0.374835,0.044704,-0.181571,-0.439867,0.198170,-0.290614,0.147882
4,5,1,Treatment,1.000000,1.000000,1.000000,Yes,Yes,1.0,Yes,...,-0.019456,-0.211210,-0.170317,-0.264876,0.557702,0.591963,-0.462810,-0.018662,-0.169452,-0.138688
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6858,7339,50,Treatment,1.000000,1.000000,0.914433,Yes,Yes,0.0,No,...,0.336387,-0.051324,-0.015783,0.381104,0.747859,,1.574043,-0.011070,0.331390,-0.060904
6859,7340,41,Treatment,1.000000,1.000000,0.914433,Yes,Yes,0.0,No,...,0.399695,0.266791,-0.082640,0.375154,-0.319804,,0.311908,0.297568,-0.288137,-0.076314
6860,7341,73,Control,1.000000,1.000000,0.914433,Yes,Yes,0.0,No,...,-0.862942,0.147435,-0.115124,0.598928,-0.319804,,0.676197,1.216774,0.506503,-0.035357
6861,7342,81,Control,1.000000,1.000000,1.000000,Yes,No,0.0,No,...,0.121423,-0.283346,-0.016105,,,,,,,


In [21]:
hh_composition = ["hhsize", "adults", "children", "male_head", "head_age", "head_noeduc"]
credit_access = ["spandana", "othermfi", "anybank", "anyinformal", "anyloan"]
loan_amt = ["spandana_amt", "othermfi_amt", "bank_amt", "informal_amt", "anyloan_amt"]
self_emp_activ = ["total_biz", "female_biz_allHH", "female_biz_pct"]
businesses = ["bizrev", "bizexpense", "bizinvestment", "bizemployees", "hours_week_biz"]
businesses_allHH = ["bizrev_allHH", "bizexpense_allHH", "bizinvestment_allHH", "bizemployees_allHH", "hours_week_biz_allHH"]
consumption = ["total_exp_mo", "nondurable_exp_mo", "durables_exp_mo", "home_durable_index"]

# Combine all lists into a single list
allvars = (
    hh_composition
    + credit_access
    + loan_amt
    + self_emp_activ
    + businesses
    + businesses_allHH
    + consumption
)


In [22]:
# Define the variable lists
endlines_copy = endlines.copy()

businesses = ["bizrev", "bizexpense", "bizinvestment", "bizemployees", "hours_week_biz"]
additional_vars = ["female_biz"]
vars_to_process = businesses + additional_vars

# Loop through rounds (1 and 2)
for i in [1, 2]:
    for var in vars_to_process:
        var_col = f"{var}_{i}"
        allhh_col = f"{var}_allHH_{i}"
        total_biz_col = f"total_biz_{i}"
        
        # Create the _allHH_ version
        endlines_copy[allhh_col] = endlines_copy[var_col]
        
        # Set value to NaN if household did not run a business
        endlines_copy.loc[endlines_copy[total_biz_col] == 0, var_col] = np.nan


In [23]:
# Rename columns: varA_1 → varA1, varA_2 → varA2
for var in allvars:
    for suffix in ['1', '2']:
        old_name = f"{var}_{suffix}"
        new_name = f"{var}{suffix}"
        if old_name in endlines_copy.columns:
            endlines_copy.rename(columns={old_name: new_name}, inplace=True)

# Reshape to long format
# Assumes there's a unique household ID column like 'hhid'
df_long = pd.wide_to_long(endlines_copy, 
                          stubnames=allvars, 
                          i='hhid', 
                          j='endline', 
                          sep='', 
                          suffix='[12]').reset_index()

In [24]:
df_long

Unnamed: 0,hhid,endline,social_index_2,old_biz,wages_nonbiz_2,sample2,anymfi_amt_2,labor_index_2,mfi_loan_cycles_2,hours_headspouse_biz_1,...,hours_week_biz,bizrev_allHH,bizexpense_allHH,bizinvestment_allHH,bizemployees_allHH,hours_week_biz_allHH,total_exp_mo,nondurable_exp_mo,durables_exp_mo,home_durable_index
0,1,1,-0.058573,0.0,7000.0,Yes,0.000000,0.220555,,0.0,...,,0.000000,0.000000,0.0,0.0,0.0,2154.000000,,,2.693964
1,2,1,-0.009926,0.0,4000.0,Yes,0.000000,0.665412,0.0,4.0,...,,0.000000,0.000000,0.0,0.0,0.0,4442.500000,4413.333496,29.166666,2.199033
2,3,1,-0.144019,1.0,0.0,Yes,0.000000,-0.297447,0.0,49.0,...,21.0,1800.000000,205.000000,0.0,0.0,21.0,5207.833496,4995.333496,212.500000,2.455154
3,4,1,0.147882,1.0,0.0,Yes,0.000000,0.198170,0.0,14.0,...,77.0,5000.000000,205.000000,0.0,0.0,77.0,4566.500000,4412.333008,154.166672,1.304456
4,5,1,-0.138688,1.0,0.0,Yes,0.000000,-0.018662,0.0,70.0,...,70.0,12400.000000,8750.000000,0.0,0.0,70.0,5313.333496,,,2.646117
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13721,7339,2,-0.060904,0.0,400.0,Yes,13327.663086,-0.011070,1.0,0.0,...,70.0,55147.371094,8433.912109,0.0,3.0,70.0,10279.515625,9543.716797,735.798035,3.167415
13722,7340,2,-0.076314,0.0,9500.0,Yes,8329.789062,0.297568,1.0,0.0,...,,0.000000,0.000000,0.0,0.0,0.0,4781.437988,4757.142578,24.295219,2.161747
13723,7341,2,-0.035357,0.0,13600.0,Yes,16659.578125,1.216774,1.0,3.0,...,,0.000000,0.000000,0.0,0.0,0.0,8617.861328,7875.121582,742.739563,2.897262
13724,7342,2,,0.0,,No,,,,48.0,...,,,,,,,,,,


In [25]:
df_long = df_long[["hhid", "areaid", "endline", "treatment"] + allvars]
df_long_w_dummies = pd.get_dummies(df_long, columns=['endline'], prefix='endline')


endline_1 = df_long_w_dummies[df_long_w_dummies["endline_1"] == 1].copy()
endline_2 = df_long_w_dummies[df_long_w_dummies["endline_2"] == 1].copy()

endline_1["spandana"] = pd.to_numeric(endline_1["spandana"].map({"No":0, "Yes":1}))
endline_1["othermfi"] = pd.to_numeric(endline_1["othermfi"].map({"No":0, "Yes":1}))
endline_1["anybank"] = pd.to_numeric(endline_1["anybank"].map({"No":0, "Yes":1}))
endline_1["anyinformal"] = pd.to_numeric(endline_1["anyinformal"].map({"No":0, "Yes":1}))
endline_1["anyloan"] = pd.to_numeric(endline_1["anyloan"].map({"No":0, "Yes":1}))

endline_1_control =  endline_1[endline_1["treatment"] == "Control"]

endline_1_control.describe()

transpose_1 = endline_1_control.describe().T[["count", "mean", "std"]].reset_index()
transpose_1

Unnamed: 0,index,count,mean,std
0,hhid,3264.0,4083.6731,2101.773755
1,areaid,3264.0,56.95527,29.556583
2,hhsize,3264.0,5.645221,2.151706
3,adults,3264.0,3.886642,1.754296
4,children,3264.0,1.737745,1.309517
5,male_head,3261.0,0.894511,0.30723
6,head_age,3257.0,41.14891,10.221965
7,head_noeduc,3256.0,0.311425,0.463147
8,spandana,3247.0,0.050508,0.219025
9,othermfi,3183.0,0.148602,0.355752


In [26]:
endline_2 = df_long_w_dummies[df_long_w_dummies["endline_2"] == 1].copy()

endline_2["spandana"] = pd.to_numeric(endline_2["spandana"].map({"No":0, "Yes":1}))
endline_2["othermfi"] = pd.to_numeric(endline_2["othermfi"].map({"No":0, "Yes":1}))
endline_2["anybank"] = pd.to_numeric(endline_2["anybank"].map({"No":0, "Yes":1}))
endline_2["anyinformal"] = pd.to_numeric(endline_2["anyinformal"].map({"No":0, "Yes":1}))
endline_2["anyloan"] = pd.to_numeric(endline_2["anyloan"].map({"No":0, "Yes":1}))

endline_2_control =  endline_2[endline_2["treatment"] == "Control"]

endline_2_control.describe()

transpose_2 = endline_2_control.describe().T[["count", "mean", "std"]].reset_index()
transpose_2

Unnamed: 0,index,count,mean,std
0,hhid,3264.0,4083.6731,2101.773755
1,areaid,3264.0,56.95527,29.556583
2,hhsize,2943.0,6.269113,2.5479
3,adults,2943.0,4.038736,1.848216
4,children,2943.0,1.763846,1.321124
5,male_head,2938.0,0.811096,0.391499
6,head_age,2940.0,42.257823,10.153644
7,head_noeduc,2940.0,0.291837,0.454685
8,spandana,2943.0,0.111451,0.314743
9,othermfi,2943.0,0.268434,0.44322


In [27]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf

# Combine endline 1 and endline 2 control groups
endline_1_control = endline_1_control.copy()
endline_2_control = endline_2_control.copy()

endline_1_control["endline2"] = 0
endline_2_control["endline2"] = 1

df_ctrl_combined = pd.concat([endline_1_control, endline_2_control])

# List of binary vars you processed earlier
# Container for results
results = []

for var in allvars:
    row = {'index': var}

    # Drop missing values
    df_sub = df_ctrl_combined[["areaid", "endline2", var]].dropna()

    # Summary stats for each endline
    el1_vals = df_sub[df_sub["endline2"] == 0][var]
    el2_vals = df_sub[df_sub["endline2"] == 1][var]

   

    # Run regression with clustered SEs
    try:
        model = smf.ols(f"{var} ~ endline2", data=df_sub).fit(
            cov_type='cluster',
            cov_kwds={'groups': df_sub['areaid']}
        )
        row['Diff'] = model.params['endline2']
        row['P_Value'] = model.pvalues[-1]
    except Exception as e:
        row['Diff'] = np.nan
        row['P_Value'] = np.nan

    results.append(row)

# Create results table
summary_df = pd.DataFrame(results)
summary_df = summary_df[[
    'index', 'Diff', 'P_Value'
]]


In [28]:
summary_df[['Diff', 'P_Value']] = summary_df[['Diff', 'P_Value']].round(3)


In [29]:
summarys = transpose_1.merge(transpose_2, on="index", how = "inner")
full = summarys.merge(summary_df, on = "index", how="inner")



In [30]:
full = full.round(3).rename({"count_x":"Obs. (1)", "count_y":"Obs. (4)", "mean_x":"Mean (2)", "mean_y":"Mean (5)", "std_x":"SD (3)", "std_y":"SD (6)", "Diff":"Coeff. (7)"}, axis = 1)

In [31]:
full.set_index("index")

Unnamed: 0_level_0,Obs. (1),Mean (2),SD (3),Obs. (4),Mean (5),SD (6),Coeff. (7),P_Value
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
hhsize,3264.0,5.645,2.152,2943.0,6.269,2.548,0.624,0.0
adults,3264.0,3.887,1.754,2943.0,4.039,1.848,0.152,0.0
children,3264.0,1.738,1.31,2943.0,1.764,1.321,0.026,0.242
male_head,3261.0,0.895,0.307,2938.0,0.811,0.391,-0.083,0.0
head_age,3257.0,41.149,10.222,2940.0,42.258,10.154,1.109,0.0
head_noeduc,3256.0,0.311,0.463,2940.0,0.292,0.455,-0.02,0.017
spandana,3247.0,0.051,0.219,2943.0,0.111,0.315,0.061,0.0
othermfi,3183.0,0.149,0.356,2943.0,0.268,0.443,0.12,0.0
anybank,3247.0,0.079,0.27,2943.0,0.073,0.26,-0.006,0.476
anyinformal,3247.0,0.761,0.427,2943.0,0.603,0.489,-0.158,0.0


In [None]:
endlines

In [65]:
import pandas as pd
import statsmodels.formula.api as smf
import numpy as np

area_controls = [
    'area_pop_base',
    'area_debt_total_base',
    'area_business_total_base',
    'area_exp_pc_mean_base',
    'area_literate_head_base',
    'area_literate_base'
]

variables = [
    'spandana_1', 'othermfi_1', 'anymfi_1', 'anybank_1', 'anyinformal_1', 'anyloan_1',
    'everlate_1', 'mfi_loan_cycles_1', 'spandana_amt_1', 'othermfi_amt_1', 'anymfi_amt_1',
    'bank_amt_1', 'informal_amt_1', 'anyloan_amt_1', 'credit_index_1'
]

weights_var = "w1"
results_table = []

for var in variables:
    df_sub = endlines[["treatment", "areaid", weights_var, var] + area_controls].dropna()

    if df_sub.empty:
        print(f"Skipping {var} — no valid data")
        continue

    # Convert categorical Yes/No to numeric if needed
    if df_sub[var].dtype == "category":
        df_sub[var] = pd.to_numeric(df_sub[var].map({"No": 0, "Yes": 1}))

    formula = f"{var} ~ treatment + " + " + ".join(area_controls)
    model = smf.wls(formula, data=df_sub, weights=df_sub[weights_var]).fit(
        cov_type='cluster', cov_kwds={'groups': df_sub['areaid']}
    )

    coef = model.params.get(1, np.nan)
    se = model.bse.get(1, np.nan)
    r2 = model.rsquared
    pval = model.pvalues.get(1, np.nan)



    # Control group stats
    control_group = df_sub[df_sub["treatment"] == "Control"][var]
    mn1 = control_group.mean()
    sd1 = control_group.std()
    N = df_sub.shape[0]

    # Append results
    results_table.append({
        "variable": var,
        "coef": coef,
        "se": se,
        "mean": mn1,
        "Observations": N,
        "pval": pval
    })

# Convert to DataFrame and display
results_df = pd.DataFrame(results_table)

results_df  


Unnamed: 0,variable,coef,se,mean,Observations,pval
0,spandana_1,0.127384,0.019674,0.05050816,6811,9.501678e-11
1,othermfi_1,-0.011603,0.023774,0.1486019,6657,0.625523
2,anymfi_1,0.083464,0.027049,0.1832461,6811,0.002031282
3,anybank_1,0.002774,0.011618,0.07914998,6811,0.8112854
4,anyinformal_1,-0.05167,0.02124,0.7610102,6811,0.01498647
5,anyloan_1,-0.022498,0.013772,0.8673407,6862,0.1023492
6,everlate_1,-0.059583,0.026387,0.6158301,6475,0.02394465
7,mfi_loan_cycles_1,0.08393,0.041089,0.3298461,6816,0.0410886
8,spandana_amt_1,1333.77209,229.68134,597.4407,6811,6.35814e-09
9,othermfi_amt_1,-93.667608,336.297764,1806.026,6708,0.7806088


In [38]:
endlines[endlines["treatment"]=="Control"]["mfi_loan_cycles_1"]
endlines[["treatment", "areaid", "w1", "mfi_loan_cycles_1"] + area_controls].dropna()

Unnamed: 0,treatment,areaid,w1,mfi_loan_cycles_1,area_pop_base,area_debt_total_base,area_business_total_base,area_exp_pc_mean_base,area_literate_head_base,area_literate_base
0,Treatment,1,0.777092,1.000000,272,81050.0,11,1334.766113,0.500,0.533898
1,Treatment,1,1.000000,0.000000,272,81050.0,11,1334.766113,0.500,0.533898
2,Treatment,1,1.000000,0.000000,272,81050.0,11,1334.766113,0.500,0.533898
3,Treatment,1,1.000000,0.000000,272,81050.0,11,1334.766113,0.500,0.533898
4,Treatment,1,1.000000,0.000000,272,81050.0,11,1334.766113,0.500,0.533898
...,...,...,...,...,...,...,...,...,...,...
6858,Treatment,50,1.000000,2.083333,99,23975.0,12,1341.398560,0.725,0.701571
6859,Treatment,41,1.000000,0.000000,398,35150.0,13,719.778381,0.450,0.531532
6860,Control,73,1.000000,1.000000,273,31635.0,10,1233.135132,0.700,0.727273
6861,Control,81,1.000000,0.000000,88,8950.0,2,687.486694,0.300,0.628319


In [161]:
import pandas as pd
import statsmodels.formula.api as smf
import numpy as np

area_controls = [
    'area_pop_base',
    'area_debt_total_base',
    'area_business_total_base',
    'area_exp_pc_mean_base',
    'area_literate_head_base',
    'area_literate_base'
]

variables = [
    'spandana_2', 'othermfi_2', 'anymfi_2', 'anybank_2', 'anyinformal_2', 'anyloan_2',
    'everlate_2', 'mfi_loan_cycles_2', 'spandana_amt_2', 'othermfi_amt_2', 'anymfi_amt_2',
    'bank_amt_2', 'informal_amt_2', 'anyloan_amt_2', 'credit_index_2'
]

weights_var = "w1"
results_table = []

for var in variables:
    df_sub = endlines[["treatment", "areaid", weights_var, var] + area_controls].dropna()

    if df_sub.empty:
        print(f"Skipping {var} — no valid data")
        continue

    # Convert categorical Yes/No to numeric if needed
    if df_sub[var].dtype == "category":
        df_sub[var] = pd.to_numeric(df_sub[var].map({"No": 0, "Yes": 1}))

    formula = f"{var} ~ treatment + " + " + ".join(area_controls)
    model = smf.wls(formula, data=df_sub, weights=df_sub[weights_var]).fit(
        cov_type='cluster', cov_kwds={'groups': df_sub['areaid']}
    )

    coef = model.params.get(1, np.nan)
    se = model.bse.get(1, np.nan)
    r2 = model.rsquared
    pval = model.pvalues.get(1, np.nan)



    # Control group stats
    control_group = df_sub[df_sub["treatment"] == "Control"][var]
    mn1 = control_group.mean()
    sd1 = control_group.std()
    N = df_sub.shape[0]

    # Append results
    results_table.append({
        "variable": var,
        "coef": coef,
        "se": se,
        "mean": mn1,
        "Observations": N,
        "pval": pval
    })

# Convert to DataFrame and display
results_df = pd.DataFrame(results_table)

results_df

Unnamed: 0,variable,coef,se,mean,Observations,pval
0,spandana_2,0.064744,0.019364,0.1114509,6142,0.000827
1,othermfi_2,-0.039425,0.02625,0.2684336,6142,0.133127
2,anymfi_2,0.003284,0.029373,0.3309548,6142,0.91098
3,anybank_2,0.000594,0.008564,0.07305471,6142,0.944701
4,anyinformal_2,0.002026,0.017613,0.6034659,6142,0.908429
5,anyloan_2,0.000293,0.010344,0.9041794,6142,0.977414
6,everlate_2,0.006881,0.021224,0.5980292,6142,0.745798
7,mfi_loan_cycles_2,0.081811,0.066878,0.724405,5926,0.22122
8,spandana_amt_2,1015.135029,300.724284,1566.64,6142,0.000736
9,othermfi_amt_2,-247.882038,631.572666,4775.06,6142,0.694701


# Table 3B

In [141]:
endlines_copy = endlines.copy()



business_variables_1 = ["bizassets_1", "bizinvestment_1", "bizrev_1", "bizexpense_1", 
                      "bizprofit_1", "bizemployees_1", "biz_index_old_1"]


endlines_copy["any_old_biz"] = endlines_copy["any_old_biz"].map({"No": 0, "Yes": 1})
endlines_copy_old_biz_1 = endlines_copy[endlines_copy["any_old_biz"] == 1]



weights_var = "w1"
results_table = []

for var in business_variables_1:
    df_sub = endlines_copy_old_biz_1[["treatment", "areaid", weights_var, var] + area_controls].dropna()

    if df_sub.empty:
        print(f"Skipping {var} — no valid data")
        continue

    # Convert categorical Yes/No to numeric if needed
    if df_sub[var].dtype == "category":
        df_sub[var] = pd.to_numeric(df_sub[var].map({"No": 0, "Yes": 1}))

    formula = f"{var} ~ treatment + " + " + ".join(area_controls)
    model = smf.wls(formula, data=df_sub, weights=df_sub[weights_var]).fit(
        cov_type='cluster', cov_kwds={'groups': df_sub['areaid']}
    )

    coef = model.params.get(1, np.nan)
    se = model.bse.get(1, np.nan)
    r2 = model.rsquared
    pval = model.pvalues.get(1, np.nan)



    # Control group stats
    control_group = df_sub[df_sub["treatment"] == "Control"][var]
    mn1 = control_group.mean()
    sd1 = control_group.std()
    N = df_sub.shape[0]

    # Append results
    results_table.append({
        "variable": var,
        "coef": coef,
        "se": se,
        "Control Mean": mn1,
        "Observations": N,
        "pval": pval
    })

# Convert to DataFrame and display
results_df_3b1 = pd.DataFrame(results_table)

results_df_3b1  


Unnamed: 0,variable,coef,se,Control Mean,Observations,pval
0,bizassets_1,897.631793,1062.771639,6757.323,2083,0.398326
1,bizinvestment_1,1119.416069,698.071377,677.8936,2083,0.108806
2,bizrev_1,5266.227442,3720.660502,14504.64,1955,0.156951
3,bizexpense_1,1640.230458,3256.508225,12325.42,2020,0.614488
4,bizprofit_1,2105.438656,1099.959613,2037.855,1624,0.055607
5,bizemployees_1,-0.052984,0.082389,0.412666,2088,0.520161
6,biz_index_old_1,0.090042,0.040552,-1.704729e-09,2088,0.02639


In [166]:
business_variables_2 = ["bizassets_2", "bizinvestment_2", "bizrev_2", "bizexpense_2", 
                      "bizprofit_2", "bizemployees_2", "biz_index_old_2"]


endlines_copy_old_biz_1 = endlines_copy[endlines_copy["any_old_biz"] == 1]



weights_var = "w2"
results_table = []

for var in business_variables_2:
    df_sub = endlines_copy_old_biz_1[["treatment", "areaid", weights_var, var] + area_controls].dropna()

    if df_sub.empty:
        print(f"Skipping {var} — no valid data")
        continue

    # Convert categorical Yes/No to numeric if needed
    if df_sub[var].dtype == "category":
        df_sub[var] = pd.to_numeric(df_sub[var].map({"No": 0, "Yes": 1}))

    formula = f"{var} ~ treatment + " + " + ".join(area_controls)
    model = smf.wls(formula, data=df_sub, weights=df_sub[weights_var]).fit(
        cov_type='cluster', cov_kwds={'groups': df_sub['areaid']}
    )

    coef = model.params.get(1, np.nan)
    se = model.bse.get(1, np.nan)
    r2 = model.rsquared
    pval = model.pvalues.get(1, np.nan)



    # Control group stats
    control_group = df_sub[df_sub["treatment"] == "Control"][var]
    mn1 = control_group.mean()
    sd1 = control_group.std()
    N = df_sub.shape[0]

    # Append results
    results_table.append({
        "variable": var,
        "coef": coef,
        "se": se,
        "Control Mean": mn1,
        "Observations": N,
        "pval": pval
    })

# Convert to DataFrame and display
results_df_3b2 = pd.DataFrame(results_table)

results_df_3b2  


Unnamed: 0,variable,coef,se,Control Mean,Observations,pval
0,bizassets_2,1682.025554,1412.432044,10301.05,1878,0.233704
1,bizinvestment_2,-948.62397,587.963752,2292.123,1878,0.106656
2,bizrev_2,343.308893,1262.96169,12563.96,1859,0.785754
3,bizexpense_2,-2644.30649,1491.003766,12418.35,1862,0.076145
4,bizprofit_2,839.203018,944.993535,1948.239,1844,0.374513
5,bizemployees_2,-0.123744,0.099865,0.4623288,1878,0.215303
6,biz_index_old_2,-0.007005,0.026343,1.633004e-09,1878,0.790293


In [174]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.regression.quantile_regression import QuantReg
from scipy.stats import t
from tqdm import tqdm

def clustered_bootstrap_quantreg(df, formula, cluster_col, quantile, n_reps=500, seed=65209844):
    np.random.seed(seed)
    cluster_ids = df[cluster_col].unique()
    coefs = []

    for _ in range(n_reps):
        sampled_clusters = np.random.choice(cluster_ids, size=len(cluster_ids), replace=True)
        sampled_df = pd.concat([df[df[cluster_col] == cid] for cid in sampled_clusters])

        try:
            model = QuantReg.from_formula(formula, sampled_df)
            res = model.fit(q=quantile)
            coefs.append(res.params['treatment'])
        except:
            continue  # skip failed fits (rare)

    return np.array(coefs)

def estimate_qte(df, outcome, cluster_col="areaid", quantiles=np.arange(0.05, 0.98, 0.02), n_reps=500):
    results = []

    for q in tqdm(quantiles):
        # OLS
        ols_model = smf.ols(f"{outcome} ~ treatment", data=df).fit(cov_type='cluster', cov_kwds={'groups': df[cluster_col]})
        ols_beta = ols_model.params['treatment']

        # Quantile Regression (initial fit)
        q_model = QuantReg.from_formula(f"{outcome} ~ treatment", df)
        q_fit = q_model.fit(q=q)
        beta = q_fit.params['treatment']

        # Clustered Bootstrap
        coefs = clustered_bootstrap_quantreg(df, f"{outcome} ~ treatment", cluster_col, q, n_reps=n_reps)
        se = coefs.std(ddof=1)

        # CI bounds (90%)
        df_reps = len(coefs)
        t_crit = t.ppf(0.95, df_reps - 1)
        cil = beta - t_crit * se
        cih = beta + t_crit * se

        results.append([q, ols_beta, beta, cil, cih])

    return pd.DataFrame(results, columns=["qtile", "ols_treatment", "treatment", "treatment_cilo", "treatment_cihi"])


In [189]:
endlines_copy_2 = endlines.copy()

endlines_copy_2['any_old_biz'] = endlines_copy_2['any_old_biz'].map({'Yes': 1, 'No': 0})
endlines_copy_2['any_biz_1'] = endlines_copy_2['any_biz_1'].map({'Yes': 1, 'No': 0})
endlines_copy_2['any_biz_2'] = endlines_copy_2['any_biz_2'].map({'Yes': 1, 'No': 0})




# (a) Households with old businesses at EL1
endlines_copy_2['bizprofit_1_old'] = endlines_copy_2['bizprofit_1'].where(endlines_copy_2['any_old_biz'] == 1)

# (b) Households with new businesses at EL1
endlines_copy_2['bizprofit_1_new'] = endlines_copy_2['bizprofit_1'].where((endlines_copy_2['any_old_biz'] == 0) & (endlines_copy_2['any_biz_1'] == 1))

# (c) Households with any business at EL2
endlines_copy_2['bizprofit_2_biz'] = endlines_copy_2['bizprofit_2'].where(endlines_copy_2['any_biz_2'] == 1)


Unnamed: 0,hhid,areaid,treatment,w,w1,w2,sample1,sample2,old_biz,any_old_biz,...,credit_index_2,biz_index_all_2,biz_index_old_2,income_index_2,labor_index_2,consumption_index_2,social_index_2,bizprofit_1_old,bizprofit_1_new,bizprofit_2_biz
0,1,1,Treatment,0.819672,0.777092,0.819672,Yes,Yes,0.0,0,...,-0.420189,-0.319804,,0.089781,0.220555,-0.156785,-0.058573,,,
1,2,1,Treatment,1.000000,1.000000,1.000000,Yes,Yes,0.0,0,...,-0.322687,-0.319804,,-0.176772,0.665412,0.199052,-0.009926,,,
2,3,1,Treatment,1.000000,1.000000,1.000000,Yes,Yes,1.0,1,...,-0.478224,-0.015660,-0.268047,-0.441344,-0.297447,0.159275,-0.144019,1595.0,,
3,4,1,Treatment,1.000000,1.000000,1.000000,Yes,Yes,1.0,1,...,0.374835,0.044704,-0.181571,-0.439867,0.198170,-0.290614,0.147882,4795.0,,
4,5,1,Treatment,1.000000,1.000000,1.000000,Yes,Yes,1.0,1,...,-0.264876,0.557702,0.591963,-0.462810,-0.018662,-0.169452,-0.138688,3650.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6858,7339,50,Treatment,1.000000,1.000000,0.914433,Yes,Yes,0.0,0,...,0.381104,0.747859,,1.574043,-0.011070,0.331390,-0.060904,,,
6859,7340,41,Treatment,1.000000,1.000000,0.914433,Yes,Yes,0.0,0,...,0.375154,-0.319804,,0.311908,0.297568,-0.288137,-0.076314,,,
6860,7341,73,Control,1.000000,1.000000,0.914433,Yes,Yes,0.0,0,...,0.598928,-0.319804,,0.676197,1.216774,0.506503,-0.035357,,,
6861,7342,81,Control,1.000000,1.000000,1.000000,Yes,No,0.0,0,...,,,,,,,,,,


In [187]:
endlines_copy_2["bizprofit_1"]

0          0.0
1          0.0
2       1595.0
3       4795.0
4       3650.0
         ...  
6858       0.0
6859       0.0
6860       0.0
6861       0.0
6862       0.0
Name: bizprofit_1, Length: 6863, dtype: float32