In [1]:
import git
import pandas as pd

In [2]:
# Find the enclosing Git working tree by searching upward from the current
# directory, so the dataset paths below do not depend on which subdirectory
# the notebook is launched from.
repo = git.Repo(
    path=".",
    search_parent_directories=True,
).working_tree_dir

# Raw CDI export that every processing cell in this notebook reads from.
cdi_raw_path = f"{repo}/datasets/raw/U.S._Chronic_Disease_Indicators_2023.csv"

In [3]:
# Build the processed CDI extract: load a subset of columns from the raw
# export, keep only the topics / value types of interest, tighten the dtypes,
# sort, and write the result to `cdi_processed_path`.
cdi_processed_path = f"{repo}/datasets/processed/cdi.csv"

# Columns loaded from the raw file; every other column is skipped at parse time.
cdi_cols = [
    "YearStart",
    "YearEnd",
    "LocationAbbr",
    "Topic",
    "Question",
    "DataValueUnit",
    "DataValueType",
    "DataValue",
    "StratificationCategory1",
    "Stratification1",
]

# Row filters: a row is kept only if, for every column listed here, its value
# is one of the allowed values.
cdi_filters = {
    "Topic": [
        "Cancer",
        "Cardiovascular Disease",
        "Chronic Kidney Disease",
        "Chronic Obstructive Pulmonary Disease",
    ],
    "DataValueType": [
        "Number",
        "Crude Prevalence",
        "Age-adjusted Prevalence",
        "Average Annual Number",
        "Average Annual Age-adjusted Rate",
        "Average Annual Crude Rate",
    ],
}

# Target dtype for each retained column. "Float32" (capital F) is the pandas
# nullable float dtype, so missing DataValue entries stay missing.
cdi_dtype = {
    "YearStart": "category",
    "YearEnd": "category",
    "LocationAbbr": "category",
    "Topic": "category",
    "Question": "category",
    "DataValueUnit": "category",
    "DataValueType": "category",
    "DataValue": "Float32",
    "StratificationCategory1": "category",
    "Stratification1": "category",
}

# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk. Chunked inference can leave a column with mixed
# Python types and emits a DtypeWarning when it does.
cdi_df = pd.read_csv(
    cdi_raw_path,
    usecols=cdi_cols,
    low_memory=False,
)

# Combine all filters into one boolean mask instead of re-slicing the frame
# once per filter column.
cdi_keep_rows = pd.Series(True, index=cdi_df.index)
for column, allowed_values in cdi_filters.items():
    cdi_keep_rows &= cdi_df[column].isin(allowed_values)

# astype() with a mapping casts every listed column in one call and returns a
# new frame, so nothing is assigned into a filtered slice.
cdi_df = (
    cdi_df.loc[cdi_keep_rows]
    .astype(cdi_dtype)
    .sort_values(["YearEnd", "Topic", "Question", "LocationAbbr"])
)
cdi_df.to_csv(cdi_processed_path, index=False)

  cdi_df = pd.read_csv(


In [None]:
# Build the US-wide "Overall" CDI extract: load a subset of columns from the
# raw export, keep only national, non-stratified rows for the selected topics
# and value types, drop the columns that the filter makes constant, tighten
# the dtypes, sort, and write the result to `cdi_us_overall_processed_path`.
cdi_us_overall_processed_path = f"{repo}/datasets/processed/cdi_us_overall.csv"

# Columns loaded from the raw file; every other column is skipped at parse time.
cdi_us_overall_cols = [
    "YearEnd",
    "LocationAbbr",
    "Topic",
    "Question",
    "DataValueUnit",
    "DataValueType",
    "DataValue",
    "StratificationCategory1",
]

# Row filters: a row is kept only if, for every column listed here, its value
# is one of the allowed values.
cdi_us_overall_filters = {
    "Topic": [
        "Chronic Kidney Disease",
        "Chronic Obstructive Pulmonary Disease",
    ],
    "DataValueType": [
        "Number",
        "Crude Prevalence",
        "Age-adjusted Prevalence",
    ],
    "LocationAbbr": ["US"],
    "StratificationCategory1": ["Overall"],
}

# Both of these are filtered down to a single allowed value above, so they
# carry no information in the output.
cdi_us_overall_dropped_cols = ["LocationAbbr", "StratificationCategory1"]

# Target dtype for each remaining column. "Float32" (capital F) is the pandas
# nullable float dtype, so missing DataValue entries stay missing.
cdi_us_overall_dtype = {
    "YearEnd": "category",
    "Topic": "category",
    "Question": "category",
    "DataValueUnit": "category",
    "DataValueType": "category",
    "DataValue": "Float32",
}

# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk. Chunked inference can leave a column with mixed
# Python types and emits a DtypeWarning when it does.
cdi_us_overall_df = pd.read_csv(
    cdi_raw_path,
    usecols=cdi_us_overall_cols,
    low_memory=False,
)

# Combine all filters into one boolean mask instead of re-slicing the frame
# once per filter column.
cdi_us_overall_keep_rows = pd.Series(True, index=cdi_us_overall_df.index)
for column, allowed_values in cdi_us_overall_filters.items():
    cdi_us_overall_keep_rows &= cdi_us_overall_df[column].isin(allowed_values)

# Filter -> drop constant columns -> cast -> sort, each step returning a new
# frame so nothing is assigned into a filtered slice.
cdi_us_overall_df = (
    cdi_us_overall_df.loc[cdi_us_overall_keep_rows]
    .drop(columns=cdi_us_overall_dropped_cols)
    .astype(cdi_us_overall_dtype)
    .sort_values(["YearEnd", "Topic", "Question"])
)
cdi_us_overall_df.to_csv(cdi_us_overall_processed_path, index=False)

  cdi_us_overall_df = pd.read_csv(
