write up IRS application
CoryMcCartan committed Feb 6, 2024
1 parent 5c2fa43 commit 94b0af5
Showing 11 changed files with 379 additions and 40 deletions.
16 changes: 16 additions & 0 deletions explore/irs_prior_ex.R
@@ -0,0 +1,16 @@
library(birdie)
# `morg_int` is the outcome variable
# `r_probs` contains the BISG estimates
# `itm_data` is the data frame

# empirical Bayes prior
# 0.1 is the effective number of data points per geo unit (PUMA/ZCTA) in the prior:
# make it bigger for the prior to have a stronger regularizing effect
prior_eb = list(
alpha = 1 + 0.1 * coef(est_weighted(r_probs, morg_int ~ 1, data=itm_data))
)

# then use e.g. as follows:
fit1 = birdie(r_probs, morg_int ~ puma, data=itm_data, prior=prior_eb)

# passing boot = 10 to the above will do the bootstrapping we talked about as well
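# e.g. (a sketch; `fit1_boot` is just an illustrative name):
# fit1_boot = birdie(r_probs, morg_int ~ puma, data=itm_data,
#                    prior=prior_eb, boot=10)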
17 changes: 17 additions & 0 deletions explore/ols_sens.R
@@ -16,3 +16,20 @@ solve(t(r_pr2) %*% r_pr2) %*% t(r_pr2) %*% (r_pr1 - r_pr2) %*% p_xr[2, ]

coef(lm(pseudo_vf$turnout == "yes" ~ r_pr2 + 0)) - coef(lm(pseudo_vf$turnout == "yes" ~ r_pr1 + 0))
coef(lm(pseudo_vf$turnout == "yes" ~ r_pr2 + 0)) - p_xr[2,]


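# Sensitivity of the BISG-weighted OLS coefficients: for each race category i,
# shift half of column i's probability mass onto the other columns (in
# proportion to smoothed probabilities) and compute the induced first-order
# change (X'X)^{-1} X' delta mu in coefficient i.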
r_probs = bisg(~ nm(last_name) + zip(zip), pseudo_vf)
p_r = prop.table(table(pseudo_vf$race))
p_r_est = colMeans(r_probs)
X = as.matrix(r_probs)
mu = lm.fit(X, pseudo_vf$turnout == "yes")$coefficients

invM = solve(t(X) %*% X)
sens_r = map_dbl(1:6, function(i) {
delta = matrix(0, nrow=nrow(X), ncol=ncol(X))
delta[, i] = 0.5 * X[, i]
delta_wt = 0.9*X[, -i] + 0.1 * 1/5
delta[, -i] = -delta[, i] * delta_wt / rowSums(delta_wt)
(invM %*% t(X) %*% delta %*% mu)[i, 1]
})
abs(sens_r / sens_r[1]) # sensitivity of each coefficient, relative to the first
9 changes: 3 additions & 6 deletions paper/_extensions/CoryMcCartan/cmc-article/partials/title.tex
@@ -32,12 +32,9 @@
$endfor$
$if(thanks)$$thanks$$endif$}$endif$
$for(it.affiliations)$
-$if(it.department)$
-\\$it.department$\\$it.name$
-$else$
-\\$it.name$
-$endif$
-\vspace{0.05in}
+$if(it.department)$\\$it.department$$endif$%
+$if(it.name)$\\$it.name$$endif$%
+\vspace{2pt}
$endfor$ $sep$
\and $endfor$}
$endif$
4 changes: 4 additions & 0 deletions paper/_extensions/CoryMcCartan/cmc-article/refs.lua
@@ -13,14 +13,18 @@ function proc_app(el)
pandoc.RawInline("latex", "\\begin{CSLReferences}{0}{0}\\end{CSLReferences}\n\n"),
pandoc.RawBlock("latex", "\\appendix\n"),
pandoc.RawInline("latex", "\\renewcommand\\thefigure{\\thesection\\arabic{figure}}\n"),
pandoc.RawInline("latex", "\\renewcommand\\thetable{\\thesection\\arabic{table}}\n"),
pandoc.RawInline("latex", "\\setcounter{figure}{0}\n\n"),
pandoc.RawInline("latex", "\\setcounter{table}{0}\n\n"),
el
})
else
return pandoc.List({
pandoc.RawBlock("latex", "\\appendix\n"),
pandoc.RawInline("latex", "\\renewcommand\\thefigure{\\thesection\\arabic{figure}}\n"),
pandoc.RawInline("latex", "\\renewcommand\\thetable{\\thesection\\arabic{table}}\n"),
pandoc.RawInline("latex", "\\setcounter{figure}{0}\n\n"),
pandoc.RawInline("latex", "\\setcounter{table}{0}\n\n"),
el
})
end
Binary file added paper/data/irs_amt.rds
Binary file added paper/data/irs_pct.rds
Binary file added paper/data/race_mortg.rds
Binary file added paper/figures/irs_hmid.pdf
150 changes: 118 additions & 32 deletions paper/race-proxy.qmd

Large diffs are not rendered by default.

40 changes: 38 additions & 2 deletions paper/references.bib
@@ -160,7 +160,7 @@ @article{goodman1953ecological
}


-@article{king1997solution,
+@book{king1997solution,
title={A Solution to the Ecological Inference Problem: Reconstructing Individual Behavior from Aggregate Data},
author={King, Gary},
year={1997},
Expand Down Expand Up @@ -629,14 +629,15 @@ @book{brown2022whiteness
publisher={Crown}
}


@article{order8613985,
title={Executive Order 13985: Advancing Racial Equity and Support for Underserved Communities Through the Federal Government},
journal={Federal Register},
volume={86},
year={2021}
}

-@InBook{cho2008ecological,
+@incollection{cho2008ecological,
author = {Cho, Wendy Tam and Manski, Charles F.},
editor = {Box-Steffensmeier, Janet and Brady, Henry and Collier, David},
title = {Oxford Handbook of Political Methodology},
@@ -656,3 +657,38 @@ @InBook{cho2008ecological
OPTannote = {}
}


% IRS %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@article{brown2018homeownership,
title={Homeownership in Black and White: The Role of Tax Policy in Increasing Housing Inequity},
author={Brown, Dorothy A.},
journal={University of Memphis Law Review},
volume={49},
pages={205},
year={2018}
}

@techreport{crs2017deductions,
author = {{Congressional Research Service}},
title = {Tax Deductions for Individuals: A Summary},
number = {R42872},
year = {2017}
}

@techreport{cronin2023bisg,
author = {Cronin, Julie Ann and DeFilippes, Portia and Fisher, Robin},
title = {Tax Expenditures by Race and Hispanic Ethnicity: An Application of the {U.S. Treasury Department's} Race and {Hispanic} Ethnicity Imputation},
institution = {Office of Tax Analysis, U.S. Department of the Treasury},
number = {122},
year = {2023}
}

@techreport{sullivan2017hmid,
author = {Sullivan, Laura and Meschede, Tatjana and Shapiro, Thomas and Escobar, Maria Fernanda},
title = {Misdirected Investments: How the Mortgage Interest Deduction Drives Inequality and the Racial Wealth Gap},
institution = {Institute on Assets and Social Policy and the National Low Income Housing Coalition},
year = {2017}
}
183 changes: 183 additions & 0 deletions replication/06_irs_sum.R
@@ -0,0 +1,183 @@
library(tidyverse)
library(scales)
library(here)
library(easycensus)
library(wacolors)
library(patchwork)
# `races` (named vector of race labels), `census_race_geo_table()`, and
# `theme_paper()` are assumed to be defined by earlier replication scripts.

# IRS text output ------------------------
morg_int_brk = c(1, 3785.9, 5681, 7204, 8734, 10390, 12346, 14618, 17298, 22301, 750000) # bin upper breakpoints ($)

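# Race-by-bin distribution of filers from the IRS text output: rows are the 11
# deduction bins (row 1 = no deduction), columns are the six race categories.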
res_txt = "
white black hisp asian aian other
1 0.89394365 0.934647100 0.953928413 0.878645558 0.869804696 0.82222368
2 0.01063682 0.011650768 0.003147550 0.005412941 0.017885690 0.01714904
3 0.01056057 0.010476624 0.004052581 0.006082662 0.015768905 0.01929709
4 0.01013059 0.009146048 0.004552202 0.006604051 0.018265482 0.01978764
5 0.01047017 0.008145958 0.005276701 0.007706196 0.016134144 0.02019548
6 0.01011557 0.007111155 0.005526864 0.009067576 0.014205106 0.02039068
7 0.01035289 0.005784415 0.005576239 0.011114815 0.012223831 0.01700871
8 0.01078163 0.004272466 0.005389839 0.013982239 0.009505774 0.01853845
9 0.01069592 0.003655878 0.005197625 0.017011964 0.009813602 0.01657461
10 0.01095254 0.002905778 0.004598053 0.021007179 0.008118151 0.01665905
11 0.01135965 0.002203808 0.002753933 0.023364818 0.008274620 0.01217556
"

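# Average deduction amount by race over all filers (including those with no
# deduction); converted below to the average among filers who take it.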
res_amt = c(1428.9228, 633.5371, 597.4768, 2096.2345, 1482.6316, 2215.9076)


# process ------------------------------
fmt_brk = label_dollar(accuracy=0.1, scale=1e-3, suffix="k")
morg_int_lbl = str_c("[", fmt_brk(lag(morg_int_brk, 1, default=0)),
", ", fmt_brk(morg_int_brk), ")")
morg_int_lbl[1] = "$0"
morg_int_lbl[2] = str_c("[$1, ", fmt_brk(morg_int_brk[2]), ")")
morg_int_lbl[11] = str_c("[", fmt_brk(morg_int_brk[10]), ", $750k]")

res_m = str_split_1(res_txt, "\n") |>
str_split("\\s+") |>
keep(~ length(.) > 1) |>
map(~ .[-1]) # remove row numbers

d_est = res_m[-1] |>
map(function(row) {
tibble::new_tibble(list(
race = races[res_m[[1]]],
est = as.numeric(row)
))
}) |>
bind_rows(.id = "morg_int") |>
mutate(race = fct_inorder(race),
morg_int = fct_inorder(morg_int_lbl[as.integer(morg_int)]))

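# average amount among filers who take the deduction: unconditional mean
# divided by the share with a nonzero deduction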
d_amt = tibble(race = fct_inorder(races),
est = res_amt / (1 - d_est$est[1:6]))

res_m[-1] |>
map(as.numeric) |>
do.call(rbind, args=_) |>
`colnames<-`(res_m[[1]]) |>
`rownames<-`(morg_int_lbl) |>
write_rds(here("paper/data/irs_pct.rds"), compress="gz")
(res_amt / (1 - as.numeric(res_m[[2]]))) |>
setNames(res_m[[1]]) |>
write_rds(here("paper/data/irs_amt.rds"), compress="gz")


# Census benchmarks --------------------------

# cens_find_dec(tenure)
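# share of owner-occupied units that carry a mortgage, by ZCTA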
d_mortg = cens_get_dec(tables_sf1$H4, geo="zcta")
idx_own = which(str_detect(d_mortg$tenure[1:3], "free and clear"))
idx_mort = which(str_detect(d_mortg$tenure[1:3], "mortgage"))
d_mortg = d_mortg |>
group_by(GEOID) |>
summarize(pct_mortg = value[idx_mort] / (value[idx_own] + value[idx_mort]))

# Check: can we use racial comp. to predict fraction of owners w/ a mortgage?
d_ei = census_race_geo_table("zcta") |>
left_join(d_mortg, by="GEOID") |>
mutate(total = white + black + hisp + asian + aian + other,
across(white:other, ~ . / total))
ggplot(d_ei, aes(white, pct_mortg, size=total)) +
geom_point(alpha=0.5) +
scale_size_area(max_size=2) +
geom_smooth()
# Answer: no. So probably OK to assume this fraction is const. across races
lm(pct_mortg ~ white + black + hisp + asian + aian + other - 1, data=d_ei) |>
summary()


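# tenure by race/Hispanic origin of householder (table HCT1), by ZCTA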
tables_sf2 = cens_parse_tables("dec/sf1", 2010)
# cens_find(tables_sf2, tenure, race)
d_ten_race = cens_get_dec(tables_sf2$HCT1, geo="zcta")
d_ten_race = d_ten_race |>
transmute(GEOID = GEOID, tenure = tenure, value = value,
hisp = tidy_ethnicity(hispanic_or_latino_origin_of_householder),
race = tidy_race(str_remove(race_of_householder, "^householder who is "))) |>
filter(hisp != "total", !is.na(hisp),
race != "total", !is.na(race)) |>
mutate(race = case_when(
hisp == "hisp" ~ "hisp",
race %in% c("nhpi", "other", "two") ~ "other",
TRUE ~ race
)) |>
count(GEOID, tenure, race, wt=value)

# join, impute with average of nearby ZIPs, and tally to nation
d_mortg_race = d_ten_race |>
left_join(d_mortg, by="GEOID") |>
group_by(zip_g3 = str_sub(GEOID, 1, 3)) |>
mutate(pct_mortg = coalesce(pct_mortg, mean(pct_mortg, na.rm=TRUE))) |>
group_by(zip_g2 = str_sub(GEOID, 1, 2)) |>
mutate(pct_mortg = coalesce(pct_mortg, mean(pct_mortg, na.rm=TRUE))) |>
group_by(race) |>
summarize(total = sum(n),
pct_owners = sum(n[tenure == "owner occupied"]) / total,
pct_mortg = sum((n * pct_mortg)[tenure == "owner occupied"]) / total) |>
mutate(race = factor(race, levels=names(races), labels=races)) |>
arrange(race)


d_mortg_race |>
mutate(across(pct_owners:pct_mortg, ~ percent(., accuracy=0.1))) |>
transmute(`Race`=race, `Total households`=comma(total),
`Fraction owner-occupied`=pct_owners,
`Fraction with mortgage`=pct_mortg) |>
write_rds(here("paper/data/race_mortg.rds"), compress="gz")


# plot ---------------------------------

# expected rate if all groups had the same prop. having mortgages
d_mort_adj = d_est |>
filter(as.integer(morg_int) == 1, # only non-users
race != "Other") |>
left_join(d_mortg_race, by="race") |>
group_by(morg_int) |>
arrange(race) |>
mutate(est = 1 - est, # pct that use
est_nodisp = est[1] * pct_mortg / pct_mortg[1]) |>
# est_nodisp = weighted.mean(est, total) * pct_mortg /
# weighted.mean(pct_mortg, total)) |>
ungroup() |>
select(-morg_int, -total, -pct_owners)

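# Panel (a): stacked distribution of deduction amounts by race. The dashed
# segments (est_nodisp) mark the usage each group would have if, among
# mortgage holders, it took the deduction at the same rate as the first group.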
p1 = d_est |>
filter(as.integer(morg_int) > 1, # remove non-users
race != "Other") |>
ggplot(aes(race, est, fill=morg_int)) +
geom_col(color="white", linewidth=0.2) +
geom_text(aes(label=morg_int, group=morg_int,
color=as.integer(morg_int) > 6,
alpha=as.integer(morg_int) %% 3 == 2),
position=position_stack(vjust = 0.5), size=1.9, family="Times") +
geom_segment(aes(x = as.integer(race) - 0.5, xend = as.integer(race) + 0.5,
y = est_nodisp, yend = est_nodisp),
data=d_mort_adj, inherit.aes=FALSE,
linewidth=0.8, linetype="11") +
scale_fill_wa_d("volcano", name="Home mortgage\ninterest deduction", reverse=TRUE) +
scale_color_manual(values=c("black", "white"), guide="none") +
scale_alpha_manual(values=c(0, 1), guide="none") +
scale_y_continuous("Estimated fraction of filers", labels=percent,
minor_breaks=seq(0, 1, 0.01),
expand=expansion(mult=c(0, 0.05))) +
labs(x=NULL) +
theme_paper() +
theme(panel.grid.major.x=element_blank(),
legend.position="left",
legend.margin=margin())

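# Panel (b): average deduction amount among filers who take it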
p2 = d_amt |>
filter(race != "Other") |>
ggplot(aes(race, est)) +
geom_col(fill=wacolors$volcano[3]) +
scale_y_continuous("Estimated average HMID amount", labels=dollar,
minor_breaks=seq(0, 20000, 1000),
expand=expansion(mult=c(0, 0.05))) +
labs(x=NULL) +
theme_paper() +
theme(panel.grid.major.x=element_blank(),
plot.title.position="plot")

p = (p1 + labs(title="(a) Deduction distribution")) +
(p2 + labs(title=" (b) Average deduction when taken")) +
plot_layout(widths=c(2, 1.1)) &
theme(legend.margin=margin(),
plot.margin=margin(l=4, r=2))
ggsave(here("paper/figures/irs_hmid.pdf"), plot=p, width=8, height=4.25)
