# Now that all necessary data have now been extracted, transformed, and dumped to parquet files we can start doing our analyses by reading these parquet files as tables with duckdb (an in process OLAP) and then use these tables to make our transformations and draw insights

In [183]:
import duckdb

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Create database in order to persist data if one does not already exist. (DO NOT create a `.duckdb` file manually as this will not contain the instructions for duckdb to read and write data from and to)

In [184]:
# conn = duckdb.connect("chronic_disease_analyses.duckdb")

# loading transformed population tables

In [185]:
us_populations_per_state_file_names = ['./data/population-data-transformed/us_populations_per_state.parquet/*.parquet']

In [186]:
duckdb.sql(f"""
    CREATE OR REPLACE TABLE us_populations_per_state AS
    SELECT *
    FROM read_parquet({us_populations_per_state_file_names}, union_by_name=True, filename=False)
""")

In [187]:
us_populations_per_state_by_sex_age_file_names = ['./data/population-data-transformed/us_populations_per_state_by_sex_age_2000_2009.parquet/*.parquet',
    './data/population-data-transformed/us_populations_per_state_by_sex_age_2010_2019.parquet/*.parquet',
    './data/population-data-transformed/us_populations_per_state_by_sex_age_2020_2023.parquet/*.parquet']

In [188]:
duckdb.sql(f"""
    CREATE OR REPLACE TABLE us_populations_per_state_by_sex_age AS
    SELECT *
    FROM read_parquet({us_populations_per_state_by_sex_age_file_names}, union_by_name=True, filename=False)
""")

In [189]:
us_populations_per_state_by_sex_race_ho_file_names = ['./data/population-data-transformed/us_populations_per_state_by_sex_race_ho_2000_2009.parquet/*.parquet',
    './data/population-data-transformed/us_populations_per_state_by_sex_race_ho_2010_2019.parquet/*.parquet',
    './data/population-data-transformed/us_populations_per_state_by_sex_race_ho_2020_2023.parquet/*.parquet']

In [190]:
duckdb.sql(f"""
    CREATE OR REPLACE TABLE us_populations_per_state_by_sex_race_ho AS
    SELECT *
    FROM read_parquet({us_populations_per_state_by_sex_race_ho_file_names}, union_by_name=True, filename=False)
""")

In [191]:
us_populations_per_state_by_sex_age_race_ho_file_names = ['./data/population-data-transformed/us_populations_per_state_by_sex_age_race_ho_2000_2009.parquet/*.parquet',
    './data/population-data-transformed/us_populations_per_state_by_sex_age_race_ho_2010_2019.parquet/*.parquet',
    './data/population-data-transformed/us_populations_per_state_by_sex_age_race_ho_2020_2023.parquet/*.parquet']

In [192]:
duckdb.sql(f"""
    CREATE OR REPLACE TABLE us_populations_per_state_by_sex_age_race_ho AS
    SELECT *
    FROM read_parquet({us_populations_per_state_by_sex_age_race_ho_file_names}, union_by_name=True, filename=False)
""")

In [193]:
duckdb.sql("""
    SELECT COUNT(*)
    FROM us_populations_per_state
""")

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│         1071 │
└──────────────┘

In [194]:
duckdb.sql("""
    SELECT *
    FROM us_populations_per_state_by_sex_age
""")

┌────────────────┬───────┬────────────┬─────────┬──────────┬──────────┬────────┐
│    Bracket     │ Year  │ Population │   Sex   │  State   │ AgeStart │ AgeEnd │
│    varchar     │ int32 │   int64    │ varchar │ varchar  │  double  │ double │
├────────────────┼───────┼────────────┼─────────┼──────────┼──────────┼────────┤
│ under 5 years  │  2000 │     150609 │ Male    │ Alabama  │      0.0 │    5.0 │
│ under 5 years  │  2001 │     151410 │ Male    │ Alabama  │      0.0 │    5.0 │
│ under 5 years  │  2002 │     150856 │ Male    │ Alabama  │      0.0 │    5.0 │
│ under 5 years  │  2003 │     150594 │ Male    │ Alabama  │      0.0 │    5.0 │
│ under 5 years  │  2004 │     150699 │ Male    │ Alabama  │      0.0 │    5.0 │
│ under 5 years  │  2005 │     150960 │ Male    │ Alabama  │      0.0 │    5.0 │
│ under 5 years  │  2006 │     151442 │ Male    │ Alabama  │      0.0 │    5.0 │
│ under 5 years  │  2007 │     153128 │ Male    │ Alabama  │      0.0 │    5.0 │
│ under 5 years  │  2008 │  

In [195]:
duckdb.sql("""
    SELECT COUNT(*)
    FROM us_populations_per_state_by_sex_race_ho
""")

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│        29376 │
└──────────────┘

In [196]:
duckdb.sql("""
    SELECT DISTINCT(Ethnicity)
    FROM us_populations_per_state_by_sex_race_ho
""")

┌─────────────┐
│  Ethnicity  │
│   varchar   │
├─────────────┤
│ Multiracial │
│ Asian       │
│ Nhpi        │
│ White       │
│ Black       │
│ Aian        │
│ NHPI        │
│ AIAN        │
└─────────────┘

In [197]:
duckdb.sql("""
    SELECT *
    FROM us_populations_per_state_by_sex_age_race_ho
""")

┌─────────┬─────────┬───────┬─────────────┬──────────────┬─────────┬──────────────────┬───────┬────────────┐
│ StateID │  State  │  Age  │  Ethnicity  │    Origin    │   Sex   │ StratificationID │ Year  │ Population │
│ varchar │ varchar │ float │   varchar   │   varchar    │ varchar │     varchar      │ int32 │   int64    │
├─────────┼─────────┼───────┼─────────────┼──────────────┼─────────┼──────────────────┼───────┼────────────┤
│ AL      │ Alabama │   0.0 │ White       │ Not Hispanic │ Male    │ NH_M_WHITE       │  2000 │      19270 │
│ AL      │ Alabama │   0.0 │ White       │ Not Hispanic │ Male    │ NH_M_WHITE       │  2001 │      19612 │
│ AL      │ Alabama │   0.0 │ White       │ Not Hispanic │ Male    │ NH_M_WHITE       │  2002 │      18731 │
│ AL      │ Alabama │   0.0 │ White       │ Not Hispanic │ Male    │ NH_M_WHITE       │  2003 │      18623 │
│ AL      │ Alabama │   0.0 │ White       │ Not Hispanic │ Male    │ NH_M_WHITE       │  2004 │      18659 │
│ AL      │ Alabama

In [217]:
duckdb.sql("""
    SELECT COUNT(*)
    FROM us_populations_per_state_by_sex_age_race_ho
""")

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│      2526336 │
└──────────────┘

In [198]:
duckdb.sql("""
    SELECT Age, COUNT(Age) AS AgeOcc
    FROM us_populations_per_state_by_sex_age_race_ho
    GROUP BY Age
""")

┌───────┬────────┐
│  Age  │ AgeOcc │
│ float │ int64  │
├───────┼────────┤
│  76.0 │  29376 │
│  79.0 │  29376 │
│  84.0 │  29376 │
│   0.0 │  29376 │
│  11.0 │  29376 │
│  21.0 │  29376 │
│  40.0 │  29376 │
│  70.0 │  29376 │
│   3.0 │  29376 │
│  23.0 │  29376 │
│    ·  │    ·   │
│    ·  │    ·   │
│    ·  │    ·   │
│   8.0 │  29376 │
│  24.0 │  29376 │
│  49.0 │  29376 │
│  18.0 │  29376 │
│  34.0 │  29376 │
│  42.0 │  29376 │
│  66.0 │  29376 │
│  82.0 │  29376 │
│  36.0 │  29376 │
│  59.0 │  29376 │
├───────┴────────┤
│    86 rows     │
│   (20 shown)   │
└────────────────┘

In [199]:
duckdb.sql("""
    SELECT DISTINCT(StratificationID)
    FROM us_populations_per_state_by_sex_age_race_ho
""").fetchall()

[('H_M_BLACK',),
 ('H_F_NHPI',),
 ('H_M_AIAN',),
 ('H_F_MULTI',),
 ('NH_M_NHPI',),
 ('H_M_NHPI',),
 ('NH_F_BLACK',),
 ('NH_F_AIAN',),
 ('H_F_WHITE',),
 ('NH_M_WHITE',),
 ('NH_M_AIAN',),
 ('NH_M_ASIAN',),
 ('H_M_ASIAN',),
 ('H_M_WHITE',),
 ('NH_F_ASIAN',),
 ('NH_F_NHPI',),
 ('H_F_BLACK',),
 ('NH_F_MULTI',),
 ('H_F_AIAN',),
 ('H_F_ASIAN',),
 ('NH_M_BLACK',),
 ('NH_M_MULTI',),
 ('H_M_MULTI',),
 ('NH_F_WHITE',)]

In [200]:
duckdb.sql("""
    SELECT Sex, Origin, Ethnicity, StratificationID
    FROM us_populations_per_state_by_sex_age_race_ho
""")

┌─────────┬──────────────┬─────────────┬──────────────────┐
│   Sex   │    Origin    │  Ethnicity  │ StratificationID │
│ varchar │   varchar    │   varchar   │     varchar      │
├─────────┼──────────────┼─────────────┼──────────────────┤
│ Male    │ Not Hispanic │ White       │ NH_M_WHITE       │
│ Male    │ Not Hispanic │ White       │ NH_M_WHITE       │
│ Male    │ Not Hispanic │ White       │ NH_M_WHITE       │
│ Male    │ Not Hispanic │ White       │ NH_M_WHITE       │
│ Male    │ Not Hispanic │ White       │ NH_M_WHITE       │
│ Male    │ Not Hispanic │ White       │ NH_M_WHITE       │
│ Male    │ Not Hispanic │ White       │ NH_M_WHITE       │
│ Male    │ Not Hispanic │ White       │ NH_M_WHITE       │
│ Male    │ Not Hispanic │ White       │ NH_M_WHITE       │
│ Male    │ Not Hispanic │ White       │ NH_M_WHITE       │
│  ·      │    ·         │   ·         │     ·            │
│  ·      │    ·         │   ·         │     ·            │
│  ·      │    ·         │   ·         │

In [201]:
population_unique_states = duckdb.sql("""
    SELECT DISTINCT(State)
    FROM us_populations_per_state
""").fetchall()

In [202]:
population_state_names = list(zip(*population_unique_states))[0]
population_state_names

('New Mexico',
 'Washington',
 'New Jersey',
 'Tennessee',
 'Wisconsin',
 'Alabama',
 'California',
 'Colorado',
 'Illinois',
 'Mississippi',
 'Arizona',
 'Delaware',
 'Hawaii',
 'North Dakota',
 'Kentucky',
 'New York',
 'Indiana',
 'Maryland',
 'Iowa',
 'Kansas',
 'Louisiana',
 'Ohio',
 'Rhode Island',
 'Texas',
 'Alaska',
 'District of Columbia',
 'Florida',
 'Minnesota',
 'Nevada',
 'New Hampshire',
 'Arkansas',
 'Idaho',
 'South Carolina',
 'West Virginia',
 'Georgia',
 'South Dakota',
 'Vermont',
 'Oklahoma',
 'Utah',
 'Virginia',
 'Wyoming',
 'Montana',
 'Nebraska',
 'North Carolina',
 'Connecticut',
 'Maine',
 'Massachusetts',
 'Oregon',
 'Michigan',
 'Missouri',
 'Pennsylvania')

# Loading transformed chronic disease indicators table

In [203]:
cdi_file_name = './data/cdi-data-transformed/cdi.parquet/*.parquet'

In [204]:
q = f"""
    CREATE OR REPLACE TABLE cdi AS
    SELECT *
    FROM read_parquet('{cdi_file_name}')
"""

In [205]:
duckdb.sql(q)

In [206]:
duckdb.sql("""
    SELECT COUNT(*)
    FROM cdi
""")

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│       678471 │
└──────────────┘

In [207]:
duckdb.sql("""
    SELECT *
    FROM cdi
""")

┌───────────┬─────────┬──────────────┬────────────────┬─────────┬─────────────────────────────────────────────────────────────────────────────────────────┬───────────────┬─────────────────────────┬───────────┬────────────────────┬─────────────────────┬────────────┬─────────┬────────────┬─────────────────┬─────────────────────┬────────────────────┬──────────┬────────┬─────────┬───────────┬──────────────┬───────────────────┐
│ YearStart │ YearEnd │ LocationAbbr │  LocationDesc  │  Topic  │                                        Question                                         │ DataValueUnit │      DataValueType      │ DataValue │ LowConfidenceLimit │ HighConfidenceLimit │ LocationID │ TopicID │ QuestionID │ DataValueTypeID │      Latitude       │     Longitude      │ AgeStart │ AgeEnd │   Sex   │ Ethnicity │    Origin    │ Stratification1ID │
│   int32   │  int32  │   varchar    │    varchar     │ varchar │                                         varchar                                 

#### unique pairs of question id and topic id

In [208]:
duckdb.sql("""
    SELECT TopicID, QuestionID
    FROM cdi
    GROUP BY TopicID, QuestionID
""")

┌─────────┬────────────┐
│ TopicID │ QuestionID │
│ varchar │  varchar   │
├─────────┼────────────┤
│ CKD     │ CKD3_0     │
│ CVD     │ CVD3_1     │
│ NPAW    │ NPAW10_0   │
│ NPAW    │ NPAW11_3   │
│ NPAW    │ NPAW18_0   │
│ OLD     │ OLD3_1     │
│ OVC     │ OVC2_2     │
│ TOB     │ TOB11_1    │
│ TOB     │ TOB2_1     │
│ ALC     │ ALC5_1     │
│  ·      │   ·        │
│  ·      │   ·        │
│  ·      │   ·        │
│ NPAW    │ NPAW12_2   │
│ OLD     │ OLD3_2     │
│ OVC     │ OVC2_1     │
│ OVC     │ OVC7_2     │
│ TOB     │ TOB9_0     │
│ ALC     │ ALC4_0     │
│ ART     │ ART1_2     │
│ AST     │ AST6_1     │
│ CAN     │ CAN4_2     │
│ CAN     │ CAN8_2     │
├─────────┴────────────┤
│ 192 rows (20 shown)  │
└──────────────────────┘

In [209]:
cdi_unique_states = duckdb.sql("""
    SELECT LocationAbbr, LocationDesc
    FROM cdi
    GROUP BY LocationAbbr, LocationDesc
""").fetchall()
cdi_unique_states

[('DE', 'Delaware'),
 ('GA', 'Georgia'),
 ('AR', 'Arkansas'),
 ('VT', 'Vermont'),
 ('MT', 'Montana'),
 ('MN', 'Minnesota'),
 ('OH', 'Ohio'),
 ('NV', 'Nevada'),
 ('MS', 'Mississippi'),
 ('HI', 'Hawaii'),
 ('WY', 'Wyoming'),
 ('KS', 'Kansas'),
 ('AK', 'Alaska'),
 ('TN', 'Tennessee'),
 ('MO', 'Missouri'),
 ('CA', 'California'),
 ('LA', 'Louisiana'),
 ('KY', 'Kentucky'),
 ('AL', 'Alabama'),
 ('SC', 'South Carolina'),
 ('SD', 'South Dakota'),
 ('IA', 'Iowa'),
 ('OK', 'Oklahoma'),
 ('RI', 'Rhode Island'),
 ('UT', 'Utah'),
 ('NE', 'Nebraska'),
 ('CO', 'Colorado'),
 ('ME', 'Maine'),
 ('IN', 'Indiana'),
 ('DC', 'District of Columbia'),
 ('WV', 'West Virginia'),
 ('ID', 'Idaho'),
 ('WA', 'Washington'),
 ('MI', 'Michigan'),
 ('NM', 'New Mexico'),
 ('IL', 'Illinois'),
 ('NH', 'New Hampshire'),
 ('MD', 'Maryland'),
 ('WI', 'Wisconsin'),
 ('AZ', 'Arizona'),
 ('ND', 'North Dakota'),
 ('FL', 'Florida'),
 ('PA', 'Pennsylvania'),
 ('VA', 'Virginia'),
 ('CT', 'Connecticut'),
 ('TX', 'Texas'),
 ('NY', 'Ne

In [210]:
cdi_state_codes, cdi_state_names = zip(*cdi_unique_states)
cdi_state_codes

('DE',
 'GA',
 'AR',
 'VT',
 'MT',
 'MN',
 'OH',
 'NV',
 'MS',
 'HI',
 'WY',
 'KS',
 'AK',
 'TN',
 'MO',
 'CA',
 'LA',
 'KY',
 'AL',
 'SC',
 'SD',
 'IA',
 'OK',
 'RI',
 'UT',
 'NE',
 'CO',
 'ME',
 'IN',
 'DC',
 'WV',
 'ID',
 'WA',
 'MI',
 'NM',
 'IL',
 'NH',
 'MD',
 'WI',
 'AZ',
 'ND',
 'FL',
 'PA',
 'VA',
 'CT',
 'TX',
 'NY',
 'OR',
 'NC',
 'NJ',
 'MA')

In [211]:
set(cdi_state_names) - set(population_state_names)

set()

In [212]:
duckdb.sql("""
    SELECT Sex, Ethnicity, Origin
    FROM cdi
    GROUP BY Sex, Ethnicity, Origin
""")

┌─────────┬─────────────┬──────────────┐
│   Sex   │  Ethnicity  │    Origin    │
│ varchar │   varchar   │   varchar    │
├─────────┼─────────────┼──────────────┤
│ Both    │ Black       │ Not Hispanic │
│ Both    │ Multiracial │ Not Hispanic │
│ Both    │ Asian       │ Not Hispanic │
│ Both    │ White       │ Not Hispanic │
│ Both    │ All         │ Both         │
│ Female  │ All         │ Both         │
│ Both    │ AIAN        │ Not Hispanic │
│ Male    │ All         │ Both         │
│ Both    │ All         │ Hispanic     │
│ Both    │ NHPI        │ Not Hispanic │
│ Both    │ Other       │ Not Hispanic │
├─────────┴─────────────┴──────────────┤
│ 11 rows                    3 columns │
└──────────────────────────────────────┘

In [213]:
duckdb.sql("""
    SELECT DISTINCT(Sex)
    FROM cdi
""")

┌─────────┐
│   Sex   │
│ varchar │
├─────────┤
│ Female  │
│ Male    │
│ Both    │
└─────────┘

In [214]:
duckdb.sql("""
    SELECT DISTINCT(Ethnicity)
    FROM cdi
""")

┌─────────────┐
│  Ethnicity  │
│   varchar   │
├─────────────┤
│ NHPI        │
│ Multiracial │
│ All         │
│ White       │
│ Black       │
│ AIAN        │
│ Other       │
│ Asian       │
└─────────────┘

In [215]:
duckdb.sql("""
    SELECT DISTINCT(Origin)
    FROM cdi
""")

┌──────────────┐
│    Origin    │
│   varchar    │
├──────────────┤
│ Both         │
│ Hispanic     │
│ Not Hispanic │
└──────────────┘

In [216]:
duckdb.sql("""
    WITH Stratifications AS (
        SELECT Sex, Ethnicity, Origin, DENSE_RANK() OVER(ORDER BY Sex, Ethnicity, Origin) AS StratificationID
        FROM cdi
    )
           
    SELECT *
    FROM Stratifications
    GROUP BY Sex, Ethnicity, Origin, StratificationID
""")

┌─────────┬─────────────┬──────────────┬──────────────────┐
│   Sex   │  Ethnicity  │    Origin    │ StratificationID │
│ varchar │   varchar   │   varchar    │      int64       │
├─────────┼─────────────┼──────────────┼──────────────────┤
│ Male    │ All         │ Both         │               11 │
│ Both    │ AIAN        │ Not Hispanic │                1 │
│ Female  │ All         │ Both         │               10 │
│ Both    │ All         │ Both         │                2 │
│ Both    │ Asian       │ Not Hispanic │                4 │
│ Both    │ White       │ Not Hispanic │                9 │
│ Both    │ Multiracial │ Not Hispanic │                6 │
│ Both    │ All         │ Hispanic     │                3 │
│ Both    │ Black       │ Not Hispanic │                5 │
│ Both    │ NHPI        │ Not Hispanic │                7 │
│ Both    │ Other       │ Not Hispanic │                8 │
├─────────┴─────────────┴──────────────┴──────────────────┤
│ 11 rows                               