In [1]:
!python -m pip install -r requirements_ml.txt

[31mERROR: Invalid requirement: 'scikit-learn=1.3.1' (from line 2 of requirements_ml.txt)
Hint: = is not a valid operator. Did you mean == ?[0m[31m
[0m

In [19]:
from pathlib import Path

import pandas as pd

from sklearn.tree import DecisionTreeClassifier

from epxpop import SynthPop
from epxpop.pop import race_map as RACE_MAP, sex_map as SEX_MAP

In [3]:
RESULTS_DIR = Path("results")

We focus specifically on results for Carson City county (FIPS code `32510`) for this example for speed and ease of processing. However, this could be straightforwardly scaled up to include all results for the State of Nevada.

In [4]:
selected_county = "32510"

## Read simulation results data

Listing of agents who were diagnosed with diabetes by scenario, run number, and year

In [5]:
def read_results_file(file: Path, scenario_name: str) -> pd.DataFrame:
    run_id = int(file.name.split("_")[0])
    insur_id_df = (
        pd.Series(
            {
                0: "no_insurance",
                1: "medicaid",
                2: "medicare",
                3: "private",
            }
        )
        .rename_axis("insurance_status_id")
        .rename("insurance_status")
        .reset_index()
    )
    return (
        pd.read_csv(
            file,
            dtype={
                "agent_id": str,
                "sim_day": int,
                "is_participating": int,
                "insurance_status_id": int,
                "census_tract": str,
                "block_group": str,
            },
        )
        .assign(run_id=run_id)
        .assign(scenario=scenario_name)
        .merge(insur_id_df, on="insurance_status_id", how="left")
    )


def list_results_files(individual_results_dir) -> dict[str, list[Path]]:
    scenarios = ["no-program", "eq-participation"]
    return {s: list((individual_results_dir / s).glob("*.csv")) for s in scenarios}


scenarios = ["no-program", "eq-participation"]
results_files = list_results_files(RESULTS_DIR / "individual")
results_df = (
    pd.concat(
        [
            pd.concat([read_results_file(f, s) for f in results_files[s]])
            for s in scenarios
        ]
    )
    .assign(sim_year=lambda df: (df["sim_day"] / 380).astype(int) + 2023)
    .pipe(lambda df: df[df["block_group"].str[:5] == selected_county])
    .loc[:, ["scenario", "run_id", "sim_year", "agent_id", "insurance_status_id"]]
)
results_df.head()

Unnamed: 0,scenario,run_id,sim_year,agent_id,insurance_status_id
124509,no-program,1,2023,164129744,3
124510,no-program,1,2023,164129774,1
124511,no-program,1,2023,164129776,3
124512,no-program,1,2023,164129779,3
124513,no-program,1,2023,164234945,3


## Associate simulation results with demographic data from Synthetic Population

Listing of all agents in Nevada in the synthetic population

In [6]:
pop = SynthPop(country="usa", version="US_2010.v5")

In [7]:
regular_people_df = (
    pop.load_people(locations=["NV"], include_gq_people=False)
    .merge(pop.load_people_household_xref(locations=["NV"]), how="left", on="ID")
    .merge(
        pop.load_households(locations=["NV"]),
        how="left",
        left_on="PLACE",
        right_on="ID",
    )
    .rename(columns={"ID_x": "ID"})
    .loc[:, ["ID", "AGE", "sex", "race", "Block_Group"]]
)

gq_people_df = (
    pop.load_gq_people(["NV"])
    .loc[:, ["ID", "AGE", "sex", "race", "Block_Group"]]
    .pipe(lambda df: df[~df["Block_Group"].isna()])
)

people_df = (
    pd.concat([regular_people_df, gq_people_df])
    .assign(block_group=lambda df: df["Block_Group"].astype(str))
    .drop(columns="Block_Group")
)
people_df.head(5)

Unnamed: 0,ID,AGE,sex,race,block_group
0,164129744,51,0,1,325100001003
1,164129751,53,1,1,325100004001
2,164129754,51,0,1,325100008002
3,164129755,53,1,1,325100008002
4,164129766,51,0,1,325100009004


Create a person table with a row per scenario and simulated year for individuals in Carson City county.

In [8]:
all_people_df = (
    pd.concat(
        [
            people_df.pipe(lambda df: df[df["block_group"].str[:5] == selected_county])
            .assign(scenario=s)
            .assign(sim_year=y)
            .assign(run_id=r)
            for s in scenarios
            for y in list(range(2023, 2033))
            for r in range(1, 4)
        ]
    )
    .rename(columns={"ID": "agent_id", "AGE": "age"})
    .assign(agent_id=lambda df: df["agent_id"].astype(str))
    .assign(census_tract=lambda df: df["block_group"].str[:11])
    .loc[
        :,
        [
            "scenario",
            "run_id",
            "agent_id",
            "sim_year",
            "age",
            "sex",
            "race",
            "census_tract",
        ],
    ]
)

Encode census tracts with numerical IDs

In [10]:
census_tract_map = (
    all_people_df["census_tract"]
    .drop_duplicates()
    .reset_index(drop=True)
    .rename_axis("census_tract_id")
    .to_frame()
    .reset_index()
)

Construct final dataset

In [11]:
data_df = (
    all_people_df.merge(
        results_df.assign(diagnosed=True),
        on=["agent_id", "scenario", "sim_year", "run_id"],
        how="left",
    )
    .assign(diagnosed=lambda df: df["diagnosed"].fillna(False).astype(int))
    .assign(
        insurance_status_id=lambda df: df["insurance_status_id"].fillna(3).astype(int)
    )
    .merge(census_tract_map, on="census_tract", how="left")
    .drop(columns="census_tract")
)
data_df.head()

Unnamed: 0,scenario,run_id,agent_id,sim_year,age,sex,race,insurance_status_id,diagnosed,census_tract_id
0,no-program,1,164129744,2023,51,0,1,3,1,0
1,no-program,1,164129751,2023,53,1,1,3,0,1
2,no-program,1,164129754,2023,51,0,1,3,0,2
3,no-program,1,164129755,2023,53,1,1,3,0,2
4,no-program,1,164129766,2023,51,0,1,3,0,3


Note that the numerical age and sex IDs can be looked up using data from our `epxpop` package

In [20]:
SEX_MAP

{0: 'Female', 1: 'Male'}

In [21]:
RACE_MAP

{-1: 'Unspecified',
 0: 'Unknown',
 1: 'White',
 2: 'African American',
 3: 'American Indian',
 4: 'Alaska Native',
 5: 'Tribal',
 6: 'Asian',
 7: 'Hawaiian Native',
 8: 'Other Race',
 9: 'Multiple Races'}

## Query individual-level diabetes dataset

Show all people under 30 diagnosed across all simulation runs and scenarios

In [12]:
data_df.pipe(lambda df: df[(df["diagnosed"] == 1) & (df["age"] < 30)])

Unnamed: 0,scenario,run_id,agent_id,sim_year,age,sex,race,insurance_status_id,diagnosed,census_tract_id
226,no-program,1,166793909,2023,27,1,1,3,1,4
339,no-program,1,167757987,2023,23,0,1,3,1,6
413,no-program,1,168217348,2023,21,1,1,2,1,7
939,no-program,1,174316086,2023,20,1,2,0,1,11
941,no-program,1,174316091,2023,20,1,2,1,1,9
...,...,...,...,...,...,...,...,...,...,...
3316704,eq-participation,3,399793143,2032,29,1,1,3,1,2
3319367,eq-participation,3,135298548,2032,27,1,9,3,1,2
3320520,eq-participation,3,147088684,2032,21,1,8,3,1,4
3320645,eq-participation,3,148212799,2032,29,0,1,3,1,4


## Fit models

In [13]:
feature_cols = ["age", "sex", "race", "insurance_status_id", "census_tract_id"]
target_col = "diagnosed"

In [14]:
no_program_df = data_df.pipe(lambda df: df[df["scenario"] == "no-program"])
eq_participation_df = data_df.pipe(lambda df: df[df["scenario"] == "eq-participation"])

In [15]:
no_program_model = DecisionTreeClassifier()
no_program_model.fit(
    no_program_df.loc[:, feature_cols].values, no_program_df[target_col].values
)

In [16]:
no_program_model.predict(
    [
        [52, 0, 1, 0, 4],
        [20, 0, 1, 4, 4],
    ]
)

array([1, 0])

In [17]:
with_program_model = DecisionTreeClassifier()
with_program_model.fit(
    eq_participation_df.loc[:, feature_cols].values,
    eq_participation_df[target_col].values,
)

In [18]:
no_program_model.predict(
    [
        [52, 0, 1, 0, 4],
        [20, 0, 1, 4, 4],
    ]
)

array([1, 0])