In [24]:
import polars as pl

from utils.paths import FIG_PATH, FINAL_PATH, TBL_PATH, PPP_PATH, DIST_PATH

In [None]:
# Lazily scan the combined parquet file and materialise only the two columns
# that the race-count table below reads.
all_df = (
    pl.scan_parquet(FINAL_PATH / "all.parquet")
    .select(pl.col("race_ethnicity", "is_self_reported"))
    .collect()
)

In [None]:
def _race_value_counts(frame: pl.DataFrame, count_label: str) -> pl.DataFrame:
    """Count rows per ``race_ethnicity`` value in ``frame``.

    Args:
        frame: Frame containing a ``race_ethnicity`` column.
        count_label: Name to give the count column in the result.

    Returns:
        A two-column frame: ``race_ethnicity`` and ``count_label``.
    """
    return (
        frame.get_column("race_ethnicity")
        .value_counts()
        .rename({"counts": count_label})
    )


# Per-race counts over all rows, the self-reported rows, and the remaining
# (non-self-reported) rows.
race = _race_value_counts(all_df, "Total")
race_self_report = _race_value_counts(
    all_df.filter(pl.col("is_self_reported")), "Self-Reported"
)
race_inferred = _race_value_counts(
    all_df.filter(~pl.col("is_self_reported")), "From Ethnicity"
)

# Left-join onto the totals so every race keeps a row; a race absent from one
# subset gets a missing count, which is zero-filled before formatting.
race_counts = (
    race.join(race_self_report, on="race_ethnicity", how="left")
    .join(race_inferred, on="race_ethnicity", how="left")
    .sort("race_ethnicity")
    .rename({"race_ethnicity": "Race"})
    .fill_null(0)
    .fill_nan(0)
    .with_columns(
        # Render counts with thousands separators for the LaTeX table.
        pl.col("Total", "Self-Reported", "From Ethnicity").apply(lambda x: f"{x:,}"),
        pl.col("Race").str.to_titlecase(),
    )
)
race_counts.to_pandas().to_latex(TBL_PATH / "l2_race_counts.tex", index=False)

In [3]:
# Row count of the raw PPP file (displayed as the cell's result).
ppp_raw = pl.read_parquet(PPP_PATH / "ppp_raw.parquet")
ppp_raw.height

11460475

In [10]:
# Report the cleaned PPP row count, then the number of rows left after
# de-duplicating on name, zip and race.
ppp_clean = pl.read_parquet(PPP_PATH / "ppp_clean.parquet")
print(ppp_clean.height)
print(
    ppp_clean.unique(
        subset=["first_name", "last_name", "zip", "race_ethnicity"]
    ).height
)

1066697

In [23]:
ppp_test = pl.read_parquet(FINAL_PATH / "ppp_test.parquet")

# Distinct-count diagnostics. These are printed explicitly because a notebook
# only auto-displays a cell's final expression, and this cell ends with a
# statement.
print("unique (first_name, last_name, zcta):", ppp_test.unique(["first_name", "last_name", "zcta"]).shape[0])
print("unique state_abbrev:", ppp_test.unique("state_abbrev").shape[0])
print("unique zcta:", ppp_test.unique("zcta").shape[0])

# Per-race counts in the PPP test set next to the counts in the sampled subset.
ppp_race_counts = (
    ppp_test.get_column("race_ethnicity")
    .value_counts()
    .rename({"race_ethnicity": "Race", "counts": "Total"})
    .join(
        (
            pl.read_parquet(FINAL_PATH / "ppp_test_sample.parquet")
            .get_column("race_ethnicity")
            .value_counts()
            .rename({"race_ethnicity": "Race", "counts": "Sampled"})
        ),
        on="Race",
        how="left",
    )
    .sort("Race")
    .with_columns(
        pl.col("Race").str.to_titlecase(),
        # A race missing from the sample comes out of the left join as null;
        # show it as 0, then add thousands separators.
        pl.col("Total", "Sampled").fill_null(0).apply(lambda x: f"{x:,}"),
    )
)

# Write the table in a separate statement so ppp_race_counts keeps the frame
# instead of the None that to_latex returns when given a target path.
ppp_race_counts.to_pandas().to_latex(TBL_PATH / "ppp_race_counts.tex", index=False)

In [25]:
# Preview the 2010 race-given-surname probability table.
surname_race_2010_csv = DIST_PATH / "original/prob_race_given_surname_2010.csv"
pl.read_csv(surname_race_2010_csv)

name,white,black,api,native,multiple,hispanic
str,f64,f64,f64,f64,f64,f64
"""AAB""",0.8797,0.0175,0.0175,0.0,0.0677,0.0175
"""AABERG""",0.951,0.0,0.0053,0.0053,0.0256,0.0128
"""AABY""",0.9818,0.0061,0.0061,0.0,0.0,0.0061
"""AADLAND""",0.8797,0.0053,0.0053,0.0348,0.0508,0.0241
"""AAFEDT""",0.913,0.0121,0.0121,0.0,0.0507,0.0121
"""AAGAARD""",0.9833,0.0,0.0056,0.0,0.0056,0.0056
"""AAGARD""",0.9712,0.0,0.0028,0.0,0.0028,0.0231
"""AAGESEN""",0.9492,0.0,0.0169,0.0,0.0169,0.0169
"""AAKER""",0.9469,0.0084,0.0084,0.0,0.014,0.0223
"""AAKHUS""",0.903,0.0074,0.0074,0.0,0.0373,0.0448
