# Now that all the necessary data have been extracted, transformed, and dumped to parquet files, we can start our analyses by reading these parquet files as tables with DuckDB (an in-process OLAP database) and then using these tables to make our transformations and draw insights

In [1]:
import duckdb

%load_ext autoreload
%autoreload 2

# Create a database to persist the data if one does not already exist. (DO NOT create a `.duckdb` file manually, as a manually created file will not contain the instructions DuckDB needs to read data from and write data to it.)

In [2]:
# NOTE(review): the persistent connection below is disabled. The visible cells
# all call the module-level `duckdb.sql(...)`, which presumably runs on DuckDB's
# default in-memory connection, so the tables created later would not be written
# to `chronic_disease_analyses.duckdb` — confirm whether persistence is still
# intended before relying on that file.
# conn = duckdb.connect("chronic_disease_analyses.duckdb")

# Loading transformed population tables

In [13]:
# Glob patterns for the transformed population extracts, one parquet
# directory per span of years.
us_population_per_state_by_sex_age_race_ho_file_names = [
    f'./data/population-data-transformed/Population_{year_span}.parquet/*.parquet'
    for year_span in ('2000_2009', '2010_2019', '2020_2023')
]

In [14]:
# Load every population extract into one DuckDB table.
#
# The path list is rendered as an explicit SQL list literal (single-quoted
# strings, embedded single quotes doubled) instead of interpolating the Python
# list's repr(). repr() only happens to match DuckDB's list syntax while no
# path contains a quote or backslash; a path such as "it's.parquet" would be
# emitted in double quotes, which DuckDB parses as an identifier.
population_paths_sql = ", ".join(
    "'" + path.replace("'", "''") + "'"
    for path in us_population_per_state_by_sex_age_race_ho_file_names
)

# union_by_name=True asks DuckDB to align the files' columns by name;
# filename=False keeps the source-file column out of the table.
duckdb.sql(f"""
    CREATE OR REPLACE TABLE us_population_per_state_by_sex_age_race_ho AS
    SELECT *
    FROM read_parquet([{population_paths_sql}], union_by_name=True, filename=False)
""")

In [15]:
# Preview the combined population table; the relation is the cell's last
# expression, so the notebook renders it.
population_preview_sql = """
    SELECT *
    FROM us_population_per_state_by_sex_age_race_ho
"""
duckdb.sql(population_preview_sql)

┌─────────┬───────┬──────────────────┬───────┬────────────┐
│ StateID │  Age  │ StratificationID │ Year  │ Population │
│ varchar │ float │     varchar      │ int32 │   int64    │
├─────────┼───────┼──────────────────┼───────┼────────────┤
│ AL      │   0.0 │ NH_M_WHITE       │  2000 │      19270 │
│ AL      │   0.0 │ NH_M_WHITE       │  2001 │      19612 │
│ AL      │   0.0 │ NH_M_WHITE       │  2002 │      18731 │
│ AL      │   0.0 │ NH_M_WHITE       │  2003 │      18623 │
│ AL      │   0.0 │ NH_M_WHITE       │  2004 │      18659 │
│ AL      │   0.0 │ NH_M_WHITE       │  2005 │      18816 │
│ AL      │   0.0 │ NH_M_WHITE       │  2006 │      18877 │
│ AL      │   0.0 │ NH_M_WHITE       │  2007 │      19027 │
│ AL      │   0.0 │ NH_M_WHITE       │  2008 │      18937 │
│ AL      │   0.0 │ NH_M_WHITE       │  2009 │      18039 │
│ ·       │    ·  │     ·            │    ·  │          · │
│ ·       │    ·  │     ·            │    ·  │          · │
│ ·       │    ·  │     ·            │  

In [16]:
# Sanity check: total number of rows loaded into the population table.
population_row_count_sql = """
    SELECT COUNT(*)
    FROM us_population_per_state_by_sex_age_race_ho
"""
duckdb.sql(population_row_count_sql)

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│      2526336 │
└──────────────┘

In [20]:
# Mean population for each Age value, averaged over all rows of the table
# (every state / stratification / year combination) and rounded to 2 decimals.
# NOTE(review): the rendered mean for Age 85 is far above that of Age 84, which
# suggests 85 is an open-ended "85 and over" bucket — confirm against the
# source data before comparing it with the single-year ages.
mean_population_by_age_sql = """
    SELECT Age, ROUND(AVG(Population), 2) AS MeanPopulationPerAge
    FROM us_population_per_state_by_sex_age_race_ho
    GROUP BY Age
    ORDER BY Age
"""
duckdb.sql(mean_population_by_age_sql)

┌───────┬──────────────────────┐
│  Age  │ MeanPopulationPerAge │
│ float │        double        │
├───────┼──────────────────────┤
│   0.0 │              3230.22 │
│   1.0 │              3239.74 │
│   2.0 │              3248.47 │
│   3.0 │              3259.22 │
│   4.0 │              3271.03 │
│   5.0 │              3283.32 │
│   6.0 │              3293.71 │
│   7.0 │              3304.52 │
│   8.0 │              3314.64 │
│   9.0 │              3327.33 │
│    ·  │                  ·   │
│    ·  │                  ·   │
│    ·  │                  ·   │
│  76.0 │               1433.9 │
│  77.0 │              1336.63 │
│  78.0 │              1264.82 │
│  79.0 │              1191.36 │
│  80.0 │               1114.0 │
│  81.0 │              1030.56 │
│  82.0 │               953.07 │
│  83.0 │               876.16 │
│  84.0 │               799.87 │
│  85.0 │               4573.9 │
├───────┴──────────────────────┤
│      86 rows (20 shown)      │
└──────────────────────────────┘

In [21]:
# List the distinct stratification codes present in the population table.
# fetchall() returns them as a list of 1-tuples; no ORDER BY is given, so the
# order of the result is not guaranteed.
population_stratification_ids = duckdb.sql("""
    SELECT DISTINCT(StratificationID)
    FROM us_population_per_state_by_sex_age_race_ho
""")
population_stratification_ids.fetchall()

[('NH_F_ASIAN',),
 ('NH_F_NHPI',),
 ('H_F_BLACK',),
 ('H_F_WHITE',),
 ('NH_F_AIAN',),
 ('NH_M_BLACK',),
 ('NH_M_MULTI',),
 ('H_F_AIAN',),
 ('H_F_ASIAN',),
 ('H_M_MULTI',),
 ('NH_F_WHITE',),
 ('H_F_NHPI',),
 ('H_M_AIAN',),
 ('H_M_NHPI',),
 ('NH_F_BLACK',),
 ('H_M_WHITE',),
 ('NH_M_ASIAN',),
 ('H_M_ASIAN',),
 ('H_M_BLACK',),
 ('NH_F_MULTI',),
 ('NH_M_WHITE',),
 ('NH_M_AIAN',),
 ('H_F_MULTI',),
 ('NH_M_NHPI',)]

# Loading population dimension tables

In [23]:
# Build the state dimension table from its transformed parquet directory.
# The one-element path list is written directly as a SQL list literal rather
# than interpolated from a Python list; the resulting statement is the same.
create_population_state_sql = """
    CREATE OR REPLACE TABLE PopulationState AS
    SELECT *
    FROM read_parquet(['./data/population-data-transformed/State.parquet/*.parquet'], union_by_name=True, filename=False)
"""
duckdb.sql(create_population_state_sql)

In [24]:
# Preview the state dimension table (plain string: the query has no
# placeholders, so no f-string is needed).
# NOTE(review): the rendered output shows Massachusetts with a NULL StateID —
# worth checking the transform that writes State.parquet.
population_state_preview_sql = """
    SELECT *
    FROM PopulationState
"""
duckdb.sql(population_state_preview_sql)

┌───────────────┬─────────┐
│     State     │ StateID │
│    varchar    │ varchar │
├───────────────┼─────────┤
│ Mississippi   │ MS      │
│ South Dakota  │ SD      │
│ Utah          │ UT      │
│ Kentucky      │ KY      │
│ California    │ CA      │
│ Nebraska      │ NE      │
│ New Hampshire │ NH      │
│ Delaware      │ DE      │
│ Minnesota     │ MN      │
│ Nevada        │ NV      │
│   ·           │ ·       │
│   ·           │ ·       │
│   ·           │ ·       │
│ Michigan      │ MI      │
│ Illinois      │ IL      │
│ Maryland      │ MD      │
│ Alaska        │ AK      │
│ Connecticut   │ CT      │
│ Vermont       │ VT      │
│ Oklahoma      │ OK      │
│ West Virginia │ WV      │
│ Virginia      │ VA      │
│ Massachusetts │ NULL    │
├───────────────┴─────────┤
│   51 rows (20 shown)    │
└─────────────────────────┘

In [25]:
# Build the stratification dimension table (Sex / Ethnicity / Origin per
# StratificationID) from its transformed parquet directory. The path list is
# spelled out as a SQL list literal, giving the same statement as before.
create_population_stratification_sql = """
    CREATE OR REPLACE TABLE PopulationStratification AS
    SELECT *
    FROM read_parquet(['./data/population-data-transformed/Stratification.parquet/*.parquet'], union_by_name=True, filename=False)
"""
duckdb.sql(create_population_stratification_sql)

In [30]:
# Preview the stratification dimension table, sorted by Sex in descending
# order (plain string: the query has no placeholders).
population_stratification_preview_sql = """
    SELECT *
    FROM PopulationStratification
    ORDER BY Sex DESC
"""
duckdb.sql(population_stratification_preview_sql)

┌─────────┬─────────────┬──────────────┬──────────────────┐
│   Sex   │  Ethnicity  │    Origin    │ StratificationID │
│ varchar │   varchar   │   varchar    │     varchar      │
├─────────┼─────────────┼──────────────┼──────────────────┤
│ Male    │ AIAN        │ Hispanic     │ H_M_AIAN         │
│ Male    │ NHPI        │ Hispanic     │ H_M_NHPI         │
│ Male    │ Black       │ Not Hispanic │ NH_M_BLACK       │
│ Male    │ Asian       │ Not Hispanic │ NH_M_ASIAN       │
│ Male    │ AIAN        │ Not Hispanic │ NH_M_AIAN        │
│ Male    │ White       │ Not Hispanic │ NH_M_WHITE       │
│ Male    │ Black       │ Hispanic     │ H_M_BLACK        │
│ Male    │ NHPI        │ Not Hispanic │ NH_M_NHPI        │
│ Male    │ Multiracial │ Not Hispanic │ NH_M_MULTI       │
│ Male    │ White       │ Hispanic     │ H_M_WHITE        │
│  ·      │  ·          │    ·         │    ·             │
│  ·      │  ·          │    ·         │    ·             │
│  ·      │  ·          │    ·         │

# Loading transformed chronic disease indicators table

In [6]:
# Glob pattern matching every part file of the transformed chronic disease
# indicators (CDI) parquet directory.
cdi_file_name = "./data/cdi-data-transformed/CDI.parquet/*.parquet"

In [7]:
# SQL statement that (re)creates the `cdi` table from the CDI parquet files.
# The glob held in `cdi_file_name` is substituted into the read_parquet() call.
q = """
    CREATE OR REPLACE TABLE cdi AS
    SELECT *
    FROM read_parquet('{parquet_glob}')
""".format(parquet_glob=cdi_file_name)

In [8]:
# Run the CREATE OR REPLACE TABLE statement held in `q` to (re)build the `cdi` table.
duckdb.sql(q)

In [9]:
# Sanity check: total number of rows loaded into the cdi table.
cdi_row_count_sql = """
    SELECT COUNT(*)
    FROM cdi
"""
duckdb.sql(cdi_row_count_sql)

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│       678471 │
└──────────────┘

In [10]:
# Preview the cdi table; the relation is the cell's last expression, so the
# notebook renders it.
cdi_preview_sql = """
    SELECT *
    FROM cdi
"""
duckdb.sql(cdi_preview_sql)

┌───────────┬─────────┬────────────┬───────────────┬───────────┬────────────────────┬─────────────────────┬────────────┬─────────────────┬─────────────────────┬────────────────────┬──────────────────┐
│ YearStart │ YearEnd │ LocationID │ DataValueUnit │ DataValue │ LowConfidenceLimit │ HighConfidenceLimit │ QuestionID │ DataValueTypeID │      Latitude       │     Longitude      │ StratificationID │
│   int32   │  int32  │  varchar   │    varchar    │  double   │       double       │       double        │  varchar   │     varchar     │       double        │       double       │     varchar      │
├───────────┼─────────┼────────────┼───────────────┼───────────┼────────────────────┼─────────────────────┼────────────┼─────────────────┼─────────────────────┼────────────────────┼──────────────────┤
│      2015 │    2015 │ IL         │ %             │      10.6 │                4.8 │                22.2 │ ALC1_1     │ CRDPREV         │  -88.99771017799969 │  40.48501028300046 │ NH_B_ASIAN    

In [11]:
# Show the distinct stratification codes that occur in the cdi table.
cdi_stratification_ids_sql = """
    SELECT DISTINCT(StratificationID)
    FROM cdi
"""
duckdb.sql(cdi_stratification_ids_sql)

┌──────────────────┐
│ StratificationID │
│     varchar      │
├──────────────────┤
│ H_B_ALL          │
│ NH_B_MULTI       │
│ NH_B_NHPI        │
│ NH_B_ASIAN       │
│ B_M_ALL          │
│ B_B_ALL          │
│ NH_B_BLACK       │
│ B_F_ALL          │
│ NH_B_WHITE       │
│ NH_B_AIAN        │
│ NH_B_OTHER       │
├──────────────────┤
│     11 rows      │
└──────────────────┘

In [12]:
# Collect the distinct LocationID values in cdi as a list of 1-tuples.
# No ORDER BY is given, so the order of the list is not guaranteed.
distinct_location_rows = duckdb.sql("""
    SELECT DISTINCT(LocationID)
    FROM cdi
""")
cdi_unique_states = distinct_location_rows.fetchall()
cdi_unique_states

[('NC',),
 ('MN',),
 ('ID',),
 ('KS',),
 ('PA',),
 ('IA',),
 ('TX',),
 ('VT',),
 ('WV',),
 ('AK',),
 ('SC',),
 ('ND',),
 ('TN',),
 ('AZ',),
 ('DE',),
 ('NJ',),
 ('AL',),
 ('MI',),
 ('MT',),
 ('NY',),
 ('WY',),
 ('IL',),
 ('MS',),
 ('OK',),
 ('FL',),
 ('AR',),
 ('CO',),
 ('MD',),
 ('OH',),
 ('DC',),
 ('WA',),
 ('RI',),
 ('GA',),
 ('CT',),
 ('MA',),
 ('NE',),
 ('IN',),
 ('WI',),
 ('HI',),
 ('LA',),
 ('VA',),
 ('CA',),
 ('ME',),
 ('NH',),
 ('NV',),
 ('SD',),
 ('KY',),
 ('UT',),
 ('OR',),
 ('NM',),
 ('MO',)]