In [None]:
from autil.ipy_addpath import addpath
# Register the project's code directories before the `MarchCPS.*` imports in a
# later cell. NOTE(review): presumably `addpath` extends the import path and
# resolves these relative paths itself — confirm against autil.ipy_addpath.
addpath("code/")
addpath("code/MarchCPS/")

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import statsmodels.formula.api as smf
from statsmodels.stats.weightstats import DescrStatsW

# Use Original Data in AA2011

Follows `fig1_2_3_19_20_21.do` in fig-01 and `plot-wage-changes.do` in fig-04.

## Setup

In [None]:
from MarchCPS.prep_wage import assemb_marchwg_regs_exp, calc_marchwg_byexp, tabulate_march_inequality_loop
from MarchCPS.prep_supply import effunit_supplies_exp_byexp

In [None]:
# Inequality tables from the project helper. `ineq_stat` is the frame queried
# by `sample` with columns 10, 50, 90 and "year" in the Fig 7 cells;
# `ineq_pct` is not used in the visible cells.
ineq_stat, ineq_pct = tabulate_march_inequality_loop()

In [None]:
# pm: frame later grouped by (year, female, edcat) with columns `rplnwkw` and
# `avlswt` (see the Fig 4 cell).
# cmre: derived from pm and merged with the supply series on (year, expcat).
# NOTE(review): presumably cmre carries the clphsg_*/clghsg_* wage-gap columns —
# confirm against MarchCPS.prep_wage.
pm = assemb_marchwg_regs_exp()
cmre = calc_marchwg_byexp(pm)

In [None]:
# Supply-side frame, merged with `cmre` on (year, expcat) in a later cell.
# NOTE(review): presumably holds the efficiency-unit columns (eu_lnclg,
# euexp_lnclg*) — confirm against MarchCPS.prep_supply.
esee = effunit_supplies_exp_byexp()

In [None]:
# Show the columns of both inputs to the merge. A bare `cmre.columns` on a
# non-final line is evaluated and discarded, so only `esee.columns` used to be
# displayed; `display` renders both.
display(cmre.columns, esee.columns)

In [None]:
# "March 1963-2008 wage gaps and efficiency units: Experienced-based measures"
# (march-price-quantity-exp-all): join the wage series and the supply series
# on their shared (year, expcat) keys, using merge's default join type.
merge_keys = ["year", "expcat"]
mpqea = pd.merge(cmre, esee, on=merge_keys)

Note: because our data-prep code does not perfectly replicate the prep code in AA2011, we also load the data generated by the original Stata code and check whether the two diverge substantially.

In [None]:
# Reference outputs of the original Stata code, read in the order:
# wage regression series, efficiency-unit supplies, predicted wages.
cmre_, esee_, pm_ = (
    pd.read_stata(dta_path)
    for dta_path in (
        "../ref/origin/Figures/fig-01/clghsgwg-march-regseries-exp.dta",
        "../ref/origin/Figures/fig-01/effunits-exp-byexp-6308.dta",
        "../ref/origin/Figures/fig-04/pred-marwg-6308.dta",
    )
)

In [None]:
# Recode the reference data's experience labels to the string codes "1".."5"
# so both reference frames can be merged on (year, expcat).
# - raw-string regex: "\d" in a normal string literal is an invalid escape;
# - expand=False: return a Series rather than a one-column DataFrame.
# NOTE: this cell is not idempotent — a second run would map the recoded "5"
# to "1". Re-read the .dta files before re-running it.
cmre_["expcat"] = (
    cmre_["expcat"]
    .str.extract(r"(\d+)", expand=False)
    .replace({"5":"1","15":"2","25":"3","35":"4","45":"5"})
)
esee_["expcat"] = esee_["expcat"].replace(
    {"0-9":"1","10-19":"2","20-29":"3","30-39":"4","40-48":"5"}
)

In [None]:
# Reference counterpart of `mpqea`: wage series joined to supply series on
# the recoded (year, expcat) keys.
reference_keys = ["year", "expcat"]
mpqea_ = pd.merge(cmre_, esee_, on=reference_keys)

In [None]:
# Inspect the columns available in the merged reference frame.
mpqea_.columns

## Fig 1 College/High-School Wage Premium

composition-adjusted log college/high school weekly wage premium for FTFY 
- This *composition adjustment holds constant the relative employment shares of demographic group, as defined by gender, education, and potential experience*, across all years of the sample. 
  - In particular, we first compute mean (predicted) log real weekly wages in each year for 40 sex-education-experience groups. 
  - Mean wages for broader groups shown in the figures are then calculated as fixed-weighted averages of the relevant sub-group means (using the average share of total hours worked for each group over 1963 to 2008 as weights). 
  - This adjustment ensures that the estimated college premium is not mechanically affected by shifts in the experience, gender composition, or average level of completed schooling within the broader categories of college and high school graduates. 
- Group details: 
  - These 40 groups consist of five education categories (less than high school, high school graduate, some college, four-year college degree, post-college schooling), four potential experience levels (0 to 9 years, 10 to 19 years, 20 to 29 years, and 30 to 39 years), and two genders. 
  - Full-time, full-year workers are those who work at least 40 weeks per year and at least 35 hours per week. 
  - The construction of the relative wage series follows Katz and Murphy (1992), Katz and Autor (1999), and Autor et al. (2008). We follow closely the conventions set by these prior studies to facilitate comparisons. The Data Appendix provides further details.
- Fignote:
  - Source: March CPS data for earnings years 1963-2008. Log weekly wages for full-time, full-year workers are regressed separately by sex in each year on four education dummies (high school dropout, some college, college graduate, greater than college), a quartic in experience, interactions of the education dummies and experience quartic, two race categories (black, non-white other), and a full set of interactions between education, experience, and sex. The composition-adjusted mean log wage is the predicted log wage evaluated for whites at the relevant experience level (5, 15, 25, 35, 45 years) and relevant education level (high school dropout, high school graduate, some college, college graduate, greater than college). The mean log wage for college and high school is the weighted average of the relevant composition adjusted cells using a fixed set of weights equal to the average employment share of each sex by potential experience group. The ratio of mean log wages for college and high school graduates for each year is plotted. See the Data Appendix for more details on the treatment of March CPS data.

In [None]:
# Fig 1: college/high-school log weekly wage premium.
# mpqea is keyed by (year, expcat), so the all-experience series repeat across
# expcat rows; keep one row per year so each point is drawn once and in order.
fig, ax = plt.subplots()
premium_by_year = (
    mpqea[["year", "clphsg_all", "clghsg_all"]]
    .drop_duplicates()
    .sort_values("year")
)
premium_by_year.plot("year", "clphsg_all", ax=ax)
premium_by_year.plot("year", "clghsg_all", ax=ax)  # just use college rather than using college+
# The reference data (mpqea_, clphsg_all) is slightly higher, but basically the same.
ax.axvline(1982, color="grey")
ax.set(
    ylim=(.34,.71),
    ylabel="log wage gap",
    title="Fig 1: College/high-school log weekly wage premium",
);

## Fig 2 College/High-School Relative Supply of Skills

- use a standard measure of college/non-college relative supply calculated in “efficiency units” to adjust for changes in labor force composition
  - This series is also composition adjusted to correctly weight the changing gender and experience composition of college and non-college labor supply. Our construction of this figure follows Autor et al. (2008) Figure 4b, and adds three subsequent years of data. See the Data Appendix for details.
- fignote:
  - Source: March CPS data for earnings years 1963-2008. Labor supply is calculated using all persons aged 16-64 who reported having worked at least one week in the earnings years, excluding those in the military. The data are sorted into sex-education-experience groups of two sexes (male/female), five education groups (high school dropout, high school graduate, some college, college graduate, and greater than college) and 49 experience groups (0-48 years of potential experience). The number of years of potential experience is calculated by subtracting the number six (the age at which one begins school) and the number of years of schooling from the age of the individual. This number is further adjusted using the assumption that an individual cannot begin work before age 16 and that experience is always non-negative. The labor supply for college/high school groups by experience level is calculated using efficiency units, equal to mean labor supply for broad college (including college graduates and greater than college) and high school (including high school dropouts and high school graduate) categories, weighted by fixed relative average wage weights for each cell. The labor supply of the ’’some college’’ category is allocated equally between the broad college and high school categories. The fixed set of wage weights for 1963-2008 are constructed using the average wage in each of the 490 cells (2 sexes, 5 education groups, 49 experience groups) over this time period.

In [None]:
# Fig 2: college/high-school log relative supply.
# mpqea is keyed by (year, expcat); keep one row per year so the
# all-experience series is drawn once per year and in order.
fig, ax = plt.subplots()
supply_by_year = (
    mpqea[["year", "eu_lnclg"]]
    .drop_duplicates()
    .sort_values("year")
)
supply_by_year.plot("year", "eu_lnclg", ax=ax)  # this is over college + half some college
# No college-only supply series is constructed in the prep code, so it cannot be plotted as well.
# The reference data (mpqea_, eu_lnclg) is again slightly higher, but basically the same.
ax.axvline(1982, color="grey")
ax.set(
    ylabel="log relative supply",
    title="Fig 2: College/high-school log relative supply",
);

## Fig 3 College/High-School Relative Supply by Experience

In [None]:
# Fig 3, experience group expcat == 1: relative supply series for the two
# sexes (columns euexp_lnclg_m and euexp_lnclg_f) on one axis. The title names
# the experience group so this figure can be told apart from the expcat == 3 one.
fig, ax = plt.subplots()
expcat1 = mpqea.query("expcat==1")
expcat1.plot("year", "euexp_lnclg_m", ax=ax)
expcat1.plot("year", "euexp_lnclg_f", ax=ax)
# The reference data (mpqea_, expcat=='1') again changes very little.
ax.axvline(1982, color="grey")
ax.set(
    ylim=(-1.3,.9),
    ylabel="log relative supply",
    title="Fig 3: College/high-school relative supply, expcat == 1",
);

In [None]:
# Fig 3, experience group expcat == 3: same layout and y-limits as the
# expcat == 1 panel; the title names the experience group being shown.
fig, ax = plt.subplots()
expcat3 = mpqea.query("expcat==3")
expcat3.plot("year", "euexp_lnclg_m", ax=ax)
expcat3.plot("year", "euexp_lnclg_f", ax=ax)
ax.axvline(1982, color="grey")
ax.set(
    ylim=(-1.3,.9),
    ylabel="log relative supply",
    title="Fig 3: College/high-school relative supply, expcat == 3",
);

## Fig 4 Log Weekly Real Wages

- Fignote:
  - Source: March CPS data for earnings years 1963-2008. See note to Fig. 1. The real log weekly wage for each education group is the weighted average of the relevant composition adjusted cells using a fixed set of weights equal to the average employment share of each group. Nominal wage values are deflated using the Personal Consumption Expenditure (PCE) deflator. 

In [None]:
def _weighted_mean_wage(cell):
    """Mean of `rplnwkw` weighted by `avlswt` for one (year, female, edcat) cell."""
    return DescrStatsW(data=cell.rplnwkw, weights=cell.avlswt).mean


# One weighted mean per (year, female, edcat), stored in a column named "rw".
rw_allexp = (
    pm.groupby(["year","female","edcat"])
    .apply(_weighted_mean_wage)
    .rename("rw")
    .reset_index()
)

In [None]:
# Education codes -> short labels used as the plot legend.
edu_map = {1:"HSD", 2:"HSG", 3:"SMC", 4:"CLG", 5:"GTC"}
# Values that are not keys of edu_map (for example labels already recoded by
# an earlier run of this cell) are passed through unchanged, so re-running the
# cell no longer turns the whole column into NaN as a plain `.map(edu_map)` did.
rw_allexp["edcat"] = rw_allexp["edcat"].map(lambda code: edu_map.get(code, code))

In [None]:
# Express each (female, edcat) wage series relative to its own first year.
# Only the numeric `rw` column takes part in the subtraction, so the string
# `edcat` labels never enter the arithmetic, and the result keeps the plain
# columns year / female / edcat / rw (no leftover index column).
group_keys = ["female", "edcat"]
rw_sorted = rw_allexp.sort_values(group_keys + ["year"])
# Base value = rw in the earliest year of each group (first row after sorting).
base_rw = rw_sorted.groupby(group_keys)["rw"].transform(lambda s: s.iloc[0])
rw_allexp_normalize = (
    rw_sorted.assign(rw=rw_sorted["rw"] - base_rw).reset_index(drop=True)
)

In [None]:
# Fig 4, rows with female == 0: normalised wage by education group. The title
# states the subsample so this figure can be told apart from the female == 1 one.
ax = sns.lineplot(
    data=rw_allexp_normalize.query("female==0"),
    x="year", y="rw", hue="edcat",
)
ax.set(
    ylabel="rw (relative to first year)",
    title="Fig 4: Log weekly real wages, female == 0",
);

In [None]:
# Fig 4, rows with female == 1: normalised wage by education group. The title
# states the subsample so this figure can be told apart from the female == 0 one.
ax = sns.lineplot(
    data=rw_allexp_normalize.query("female==1"),
    x="year", y="rw", hue="edcat",
)
ax.set(
    ylabel="rw (relative to first year)",
    title="Fig 4: Log weekly real wages, female == 1",
);

## Fig 7 Changes throughout Earnings Distribution

In [None]:
# List the sample identifiers available in `ineq_stat`; the cells below pick
# 'tot_ft_mf', 'tot_ft_m' and 'tot_ft_f' from this column.
ineq_stat["sample"].unique()

In [None]:
# Fig 7, sample 'tot_ft_mf': columns 10, 50 and 90 by year, each shown as the
# change from the first row. The title names the sample so the three Fig 7
# figures can be told apart.
pctl_mf = ineq_stat.query("sample=='tot_ft_mf'")[[10,50,90,"year"]].set_index("year")
(pctl_mf - pctl_mf.iloc[0,:]).plot(title="Fig 7: tot_ft_mf, change from first row");

In [None]:
# Fig 7, sample 'tot_ft_m': columns 10, 50 and 90 by year, each shown as the
# change from the first row. The title names the sample so the three Fig 7
# figures can be told apart.
pctl_m = ineq_stat.query("sample=='tot_ft_m'")[[10,50,90,"year"]].set_index("year")
(pctl_m - pctl_m.iloc[0,:]).plot(title="Fig 7: tot_ft_m, change from first row");

In [None]:
# Fig 7, sample 'tot_ft_f': columns 10, 50 and 90 by year, each shown as the
# change from the first row. The title names the sample so the three Fig 7
# figures can be told apart.
pctl_f = ineq_stat.query("sample=='tot_ft_f'")[[10,50,90,"year"]].set_index("year")
(pctl_f - pctl_f.iloc[0,:]).plot(title="Fig 7: tot_ft_f, change from first row");

## Fig 19 Katz-Murphy Predictions

- fignote:
  - Source: March CPS data for earnings years 1963-2008. Log weekly wages for full-time, full-year workers are regressed separately by sex in each year on four education dummies (high school dropout, some college, college graduate, greater than college), a quartic in experience, interactions of the education dummies and experience quartic, and two race categories (black, non-white other). The composition-adjusted mean log wage is the predicted log wage evaluated for whites at the relevant experience level (5, 15, 25, 35, 45 years) and relevant education level (high school dropout, high school graduate, some college, college graduate, greater than college). The mean log wage for college and high school is the weighted average of the relevant composition adjusted cells using a fixed set of weights equal to the average employment share of each sex by experience group. The ratio of mean log wages for college and high school graduates for each year is plotted. See the Data Appendix for more details on the treatment of March CPS data. The Katz-Murphy predicted wage gap series contains the predicted values from a regression of the college/high school wage gap on time trend term and log labor supply, as measured in efficiency units described in the note to Fig. 2, for years 1963-1987.
- @Note: The predicted trend differs slightly from the one in the paper, perhaps because of the data-cleaning differences mentioned earlier. The general trend is nevertheless the same.

In [None]:
# Linear time trend (years since 1962), stored on mpqea for later cells, and
# the pre-1988 estimation sample with duplicate rows removed.
mpqea["t"] = mpqea["year"] - 1962
km_columns = ["year","clphsg_all", "eu_lnclg","t"]
dt = mpqea.loc[mpqea["year"] < 1988, km_columns].drop_duplicates()

In [None]:
# OLS of the wage gap on relative supply and the time trend, estimated on `dt`.
km_formula = "clphsg_all ~ eu_lnclg + t"
model = smf.ols(formula=km_formula, data=dt).fit()

In [None]:
# Apply the fitted model to every row of mpqea (t is recomputed from year),
# then collapse to one prediction per distinct (year, prediction) pair,
# indexed by year.
km_inputs = mpqea.eval("t=year-1962")[["eu_lnclg", "t"]]
km_fitted = model.predict(km_inputs).rename("km_predict")
km_predict = (
    pd.concat([mpqea.year, km_fitted], axis=1)
    .drop_duplicates()
    .set_index("year")
)

In [None]:
# Fig 19: observed wage gap against the Katz-Murphy prediction.
# mpqea is keyed by (year, expcat); keep one row per year so the observed
# series is drawn once per year and in order.
fig, ax = plt.subplots()
observed_gap = mpqea[["year", "clphsg_all"]].drop_duplicates().sort_values("year")
observed_gap.plot("year", "clphsg_all", ax=ax)
ax.plot(km_predict, label="km_predict")
ax.legend()
ax.set(
    ylim=(.34,.86),
    ylabel="log wage gap",
    title="Fig 19: Katz-Murphy prediction vs. observed wage gap",
);

## Fig 20 Detrended Relative Wage and Relative Supply

In [None]:
# Detrend on a one-row-per-year frame. Fitting on the (year, expcat) rows and
# then calling `.resid.drop_duplicates()` de-duplicated by residual *value*,
# which only matches the year axis when no two years share a residual.
# `t` is rebuilt here so the cell does not depend on an earlier cell having
# added it to mpqea.
yearly = (
    mpqea[["year", "clphsg_all", "eu_lnclg"]]
    .drop_duplicates()
    .assign(t=lambda d: d.year - 1962)
)
# "Detrended Wage Differential" "Log Change Relative Wage";
gapdt = smf.ols("clphsg_all ~ t", data=yearly).fit().resid
# "Detrended Relative Supply" "Log Change Relative Supply";
supdt = smf.ols("eu_lnclg ~ t", data=yearly).fit().resid

In [None]:
# Distinct years in first-occurrence order; used as the x axis in the next cell.
year = mpqea["year"].drop_duplicates()

In [None]:
# Fig 20: detrended relative wage (left axis) and detrended relative supply
# (right axis, colour C1) against year, with a single combined legend.
fig, ax1 = plt.subplots()
wage_lines = ax1.plot(year, gapdt, label="Log change relative wage")
ax2 = ax1.twinx()
supply_lines = ax2.plot(year, supdt, color="C1", label="Log change relative supply")
ax1.legend(handles=[*wage_lines, *supply_lines]);

## Fig 21: Log College/High-School Weekly Wage Ratio by Experience and Sex

In [None]:
# Inspect the merged frame's columns to find the by-experience wage-ratio
# series plotted below.
mpqea.columns

In [None]:
# Fig 21, experience group expcat == 1: wage-ratio series for the two sexes
# (columns clphsg_exp_m and clphsg_exp_f). The title names the experience
# group so this figure can be told apart from the expcat == 3 one.
fig, ax = plt.subplots()
expcat1 = mpqea.query("expcat==1")
expcat1.plot("year", "clphsg_exp_m", ax=ax)
expcat1.plot("year", "clphsg_exp_f", ax=ax)
ax.set(
    ylabel="log wage ratio",
    title="Fig 21: College/high-school weekly wage ratio, expcat == 1",
);

In [None]:
# Fig 21, experience group expcat == 3: wage-ratio series for the two sexes
# (columns clphsg_exp_m and clphsg_exp_f). The title names the experience
# group so this figure can be told apart from the expcat == 1 one.
fig, ax = plt.subplots()
expcat3 = mpqea.query("expcat==3")
expcat3.plot("year", "clphsg_exp_m", ax=ax)
expcat3.plot("year", "clphsg_exp_f", ax=ax)
ax.set(
    ylabel="log wage ratio",
    title="Fig 21: College/high-school weekly wage ratio, expcat == 3",
);

## Table 8

In [None]:
# Trend terms stored on mpqea: t = years since 1962, t2 = t squared / 100.
years_since_1962 = mpqea["year"] - 1962
mpqea["t"] = years_since_1962
mpqea["t2"] = years_since_1962**2 / 100

# Full-sample regression frame with duplicate rows removed.
table8_columns = ["year","clphsg_all", "eu_lnclg","t", "t2"]
dt = mpqea.loc[:, table8_columns].drop_duplicates()

In [None]:
# Table 8, linear-trend specification estimated on the full `dt` sample.
linear_spec = smf.ols(formula="clphsg_all ~ eu_lnclg + t", data=dt)
model = linear_spec.fit()
model.summary()

In [None]:
# Observed wage gap against the fitted values of the regression currently held
# in `model` (the linear-trend fit when cells are run in order).
# `dt` already has duplicate rows removed, so the observed series is plotted
# from it rather than from the (year, expcat)-level mpqea.
fig, ax = plt.subplots()
dt.plot("year", "clphsg_all", ax=ax)
# fittedvalues shares dt's row index, so it lines up with dt.year in concat.
km_predict = model.fittedvalues.rename("km_predict")
km_predict = pd.concat([dt.year, km_predict],axis=1).set_index("year")
ax.plot(km_predict, label="km_predict")
ax.legend()
ax.set(
    ylim=(.34,.86),
    ylabel="log wage gap",
    title="Table 8: fitted vs. observed wage gap",
);

In [None]:
# Table 8, specification with the additional t2 trend term, on the same `dt`.
quadratic_spec = smf.ols(formula="clphsg_all ~ eu_lnclg + t + t2", data=dt)
model = quadratic_spec.fit()
model.summary()

## Table 9

In [None]:
# Trend terms stored on mpqea: t = years since 1962, t2 = t squared / 100.
mpqea["t"] = mpqea.year-1962
mpqea["t2"] = (mpqea.year-1962)**2 / 100

# Regression frame for Table 9: every experience group except expcat 5, plus
# the gap between own-group supply and aggregate supply. The extra column is
# added with `.assign`, which returns a new frame, instead of assigning onto a
# filtered selection of mpqea (the SettingWithCopy pattern).
table9_columns = ["year", "expcat", "clphsg_exp", "eu_lnclg", "euexp_lnclg","t", "t2"]
dt = (
    mpqea.query("expcat != 5")[table9_columns]
    .assign(euexp_lnclg_diff=lambda d: d.euexp_lnclg - d.eu_lnclg)
)

In [None]:
# Table 9: by-experience wage gap on own-minus-aggregate supply, aggregate
# supply, both trend terms and categorical expcat effects.
table9_formula = "clphsg_exp ~ euexp_lnclg_diff + eu_lnclg + t + t2 + C(expcat)"
model = smf.ols(formula=table9_formula, data=dt).fit()
model.summary()

# Use Data Downloaded from IPUMS