In [1]:
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import numpy as np

# Load the survey extract and give the variables used below readable names.
df = pd.read_stata('data/indonesia_schooling.dta')

df = df.rename(columns={
    'p504thn': 'birth_yr',
    'p509pro': 'province',
    'recp': 'school_construction',
    'lhwage': 'log_wage',
    'yeduc': 'years_of_education',
})

# Age in 1974. Presumably birth_yr is stored as a two-digit year (e.g. 62) -- TODO confirm.
df['age74'] = 74 - df['birth_yr']

# Cohort indicators: "old" = aged 12-17 in 1974, "young" = aged 2-6 in 1974.
df['old'] = ((df['age74'] <= 17) & (df['age74'] >= 12)).astype(int)
df['young'] = ((df['age74'] >= 2) & (df['age74'] <= 6)).astype(int)

# Interaction of school_construction with the young-cohort indicator.
df['school_construction_x_young'] = df['school_construction'] * df['young']

# Keep only the two cohorts, and only rows with an observed log wage.
# The explicit .copy() makes the filtered frame independent of the unfiltered
# one, so columns can be added to `df` afterwards without pandas emitting
# SettingWithCopyWarning.
df = df[((df['young'] == 1) | (df['old'] == 1)) & (df['log_wage'].notna())].copy()

# Weighted least squares of each outcome on school_construction, its interaction
# with `young`, province indicators, and ch71 crossed with birth-year indicators
# (`a*b` in a formula expands to a + b + a:b). Observations are weighted by the
# `weight` column.
education_formula = "years_of_education ~ school_construction + school_construction_x_young + C(province) + ch71*C(birth_yr)"
log_wage_formula = "log_wage ~ school_construction + school_construction_x_young + C(province) + ch71*C(birth_yr)"
education_results = smf.wls(formula=education_formula, data=df, weights=df['weight'], hasconst=True).fit()
log_wage_results = smf.wls(formula=log_wage_formula, data=df, weights=df['weight'], hasconst=True).fit()

# Name of the regressor whose estimates are reported in the results table below.
term = 'school_construction_x_young'
def get_estimates(result, term, decimals=3):
    """Return the rounded coefficient, standard error and p-value of one term.

    Parameters
    ----------
    result : object
        Fitted-model results exposing ``params``, ``bse`` and ``pvalues``,
        each indexable by term name.
    term : str
        Name of the regressor to look up.
    decimals : int, default 3
        Number of decimal places to round each value to.

    Returns
    -------
    tuple
        ``(coef, std_err, p_value)``, each rounded with ``np.round``.
    """
    # The three lookups differ only in the attribute they read, so iterate
    # over the attribute names in the order the values are returned.
    return tuple(
        np.round(getattr(result, attribute)[term], decimals)
        for attribute in ('params', 'bse', 'pvalues')
    )

# Pull the rounded estimates for `term` out of each fitted model.
education_coef, education_std_err, education_p_value = get_estimates(education_results, term)
log_wage_coef, log_wage_std_err, log_wage_p_value = get_estimates(log_wage_results, term)

# Build the results DataFrame: one row per outcome, one column per statistic.
summary_columns = ['Outcome', 'Coefficient', 'Standard Error', 'P-value']
summary_rows = [
    ('Years of Education', education_coef, education_std_err, education_p_value),
    ('Log Wage', log_wage_coef, log_wage_std_err, log_wage_p_value),
]
df_results = pd.DataFrame(summary_rows, columns=summary_columns)
df_results



Unnamed: 0,Outcome,Coefficient,Standard Error,P-value
0,Years of Education,0.18,0.086,0.037
1,Log Wage,0.042,0.014,0.004


In [14]:
# Interaction of `nin` with the young-cohort indicator. `assign` returns a new
# frame instead of writing a column into the row-filtered `df` in place, which
# avoids SettingWithCopyWarning and keeps the cell safe to re-run.
df = df.assign(nin_x_young=df['nin'] * df['young'])

# Formulas with `nin` as the regressor of interest: each outcome on nin, its
# interaction with `young`, province indicators, birth-year indicators, and
# ch71 interacted with birth year.
education_formula = "years_of_education ~ nin + nin_x_young + C(province) + ch71:C(birth_yr) + C(birth_yr)"
log_wage_formula = "log_wage ~ nin + nin_x_young + C(province) + ch71:C(birth_yr) + C(birth_yr)"

Index(['years_of_education', 'p105', 'birth_yr', 'province', 'p509kab',
       'weight', 'p607', 'p608', 'wage', 'birthpl', 'log_wage', 'lwage',
       'p504th', 'resid', 'nin', 'school_construction', 'ch71', 'en71',
       'wsppc', 'dens71', 'treat1b', 'treat2b', 'dum', 'moldyed', 'java',
       'urban', 'age74', 'old', 'young', 'school_construction_x_young'],
      dtype='object')