# Now that all the necessary data have been extracted, transformed, and dumped to Parquet files, we can start our analyses by reading these Parquet files as tables with DuckDB (an in-process OLAP database) and then using these tables to perform our transformations and draw insights

In [1]:
import duckdb
import os
from dotenv import load_dotenv
from pathlib import Path

# Enable IPython's autoreload extension; mode 2 reloads imported modules before
# each cell executes, so edits to .py files are picked up without a kernel restart.
# NOTE(review): no project-local module is imported in the cells visible here —
# confirm this is still needed.
%load_ext autoreload
%autoreload 2

# Load the AWS credentials so that DuckDB can read the Parquet files in the S3 bucket

In [2]:
# Load the .env file that sits in the notebook's current working directory.
# Intended for development only.
# The load_dotenv call stays as the final expression so its boolean result is
# displayed as the cell output.
env_dir = Path('./').resolve()
dotenv_file = env_dir / '.env'
load_dotenv(str(dotenv_file))

True

In [4]:
# Open the connection used by every later cell, targeting the MotherDuck
# database `chronic_disease_analyses_db`.
# The token comes from the MOTHERDUCK_TOKEN environment variable (KeyError if it
# is unset) and becomes part of the connection string, so never print that string.
# Alternative connection-string forms noted by the original author
# (presumably for JDBC and SQLAlchemy clients — TODO confirm):
#   jdbc:duckdb:md:chronic_disease_analyses_db
#   duckdb:///md:chronic_disease_analyses_db
conn = duckdb.connect(
    "md:chronic_disease_analyses_db?motherduck_token=" + os.environ['MOTHERDUCK_TOKEN']
)

In [5]:
# Gather the AWS settings consumed by the CREATE SECRET cell.
# Each entry maps the dictionary key used later to the environment variable that
# supplies its value; indexing os.environ raises KeyError on the first missing one.
credentials = {
    key: os.environ[env_name]
    for key, env_name in (
        ("aws_access_key_id", "AWS_ACCESS_KEY_ID"),
        ("aws_secret_access_key", "AWS_SECRET_ACCESS_KEY"),
        ("region_name", "AWS_REGION_NAME"),
    )
}

In [41]:
# Install DuckDB's httpfs extension, which provides access to s3:// URLs.
install_httpfs_sql = """
    INSTALL httpfs
"""
conn.sql(install_httpfs_sql)

In [42]:
# Load the httpfs extension into this connection so the s3:// reads below work.
load_httpfs_sql = """
    LOAD httpfs
"""
conn.sql(load_httpfs_sql)

In [43]:
# Register the AWS credentials as an (unnamed, default) DuckDB S3 secret so that
# read_parquet can authenticate against the bucket.
# OR REPLACE makes the cell idempotent: a plain CREATE SECRET fails when the cell
# is re-run in the same session because the default-named secret already exists.
# The values are interpolated straight into the SQL text, so this statement
# contains the secret key — never print or log it.
conn.sql(f"""
    CREATE OR REPLACE SECRET (
        TYPE s3,
        KEY_ID '{credentials["aws_access_key_id"]}',
        SECRET '{credentials["aws_secret_access_key"]}',
        REGION '{credentials["region_name"]}',
        ENDPOINT 's3.{credentials["region_name"]}.amazonaws.com'
    );
""")

┌─────────┐
│ Success │
│ boolean │
├─────────┤
│ true    │
└─────────┘

# Loading CDI and Population fact tables

In [44]:
# S3 location of the transformed CDI fact data. The trailing `*.parquet` glob
# selects every Parquet file stored under the `CDI.parquet/` prefix.
cdi_url = "s3://chronic-disease-analyses-bucket/cdi-data-transformed/CDI.parquet/*.parquet"
# Bare name as the last expression echoes the value as the cell output.
cdi_url

's3://chronic-disease-analyses-bucket/cdi-data-transformed/CDI.parquet/*.parquet'

In [45]:
# Build the statement that (re)creates the CDI table from the Parquet files.
# union_by_name=True aligns columns across files by name; filename=False keeps
# the source-file column out of the result.
cdi_parquet_scan = f"read_parquet('{cdi_url}', union_by_name=True, filename=False)"
query = f"""
    CREATE OR REPLACE TABLE CDI AS
    SELECT *
    FROM {cdi_parquet_scan}
"""
# Echo the rendered SQL for inspection.
query

"\n    CREATE OR REPLACE TABLE CDI AS\n    SELECT *\n    FROM read_parquet('s3://chronic-disease-analyses-bucket/cdi-data-transformed/CDI.parquet/*.parquet', union_by_name=True, filename=False)\n"

In [46]:
# Execute the statement currently held in `query` (the CDI CREATE OR REPLACE TABLE
# built in the preceding cell). `query` is reassigned by several later cells, so
# run this immediately after the cell that builds it.
conn.sql(query)

In [47]:
# Preview the CDI fact table; the returned relation is rendered as the cell output.
cdi_preview_sql = """
    SELECT * FROM CDI
"""
conn.sql(cdi_preview_sql)

┌───────────┬─────────┬────────────┬───────────────┬───────────┬────────────────────┬─────────────────────┬────────────┬─────────────────┬──────────────────┐
│ YearStart │ YearEnd │ LocationID │ DataValueUnit │ DataValue │ LowConfidenceLimit │ HighConfidenceLimit │ QuestionID │ DataValueTypeID │ StratificationID │
│   int32   │  int32  │  varchar   │    varchar    │  double   │       double       │       double        │  varchar   │     varchar     │     varchar      │
├───────────┼─────────┼────────────┼───────────────┼───────────┼────────────────────┼─────────────────────┼────────────┼─────────────────┼──────────────────┤
│      2015 │    2015 │ IL         │ %             │      10.6 │                4.8 │                22.2 │ ALC1_1     │ CRDPREV         │ NH_B_ASIAN       │
│      2015 │    2015 │ RI         │ %             │      12.6 │                7.3 │                21.0 │ ALC1_1     │ CRDPREV         │ NH_B_ASIAN       │
│      2013 │    2013 │ MA         │ %             │

In [48]:
# One glob per population dataset; each `*.parquet` pattern selects every Parquet
# file under the corresponding `Population_<years>.parquet/` prefix.
us_population_file_names = [
    f"s3://chronic-disease-analyses-bucket/population-data-transformed/Population_{year_span}.parquet/*.parquet"
    for year_span in ("2000_2009", "2010_2019", "2020_2023")
]

In [49]:
# Build the statement that (re)creates the Population table from all three
# population datasets at once.
# The list of paths is rendered explicitly as a DuckDB list literal of
# single-quoted strings (embedded quotes doubled). Interpolating the Python list
# directly relied on repr() happening to match DuckDB syntax, which breaks as soon
# as a path contains a single quote (repr then switches to double quotes).
population_paths_sql = ", ".join(
    "'" + path.replace("'", "''") + "'" for path in us_population_file_names
)
# union_by_name=True aligns columns across files by name; filename=False keeps
# the source-file column out of the result.
query = f"""
    CREATE OR REPLACE TABLE Population AS
    SELECT *
    FROM read_parquet([{population_paths_sql}], union_by_name=True, filename=False)
"""

In [50]:
# Execute the statement currently held in `query` (the Population CREATE OR REPLACE
# TABLE built in the preceding cell). `query` is reassigned by several cells, so
# run this immediately after the cell that builds it.
conn.sql(query)

In [51]:
# Preview the Population fact table; the returned relation is rendered as the
# cell output.
population_preview_sql = """
    SELECT * FROM Population
"""
conn.sql(population_preview_sql)

┌─────────┬───────┬──────────────────┬───────┬────────────┐
│ StateID │  Age  │ StratificationID │ Year  │ Population │
│ varchar │ float │     varchar      │ int32 │   int64    │
├─────────┼───────┼──────────────────┼───────┼────────────┤
│ AL      │   0.0 │ NH_M_WHITE       │  2000 │      19270 │
│ AL      │   0.0 │ NH_M_WHITE       │  2001 │      19612 │
│ AL      │   0.0 │ NH_M_WHITE       │  2002 │      18731 │
│ AL      │   0.0 │ NH_M_WHITE       │  2003 │      18623 │
│ AL      │   0.0 │ NH_M_WHITE       │  2004 │      18659 │
│ AL      │   0.0 │ NH_M_WHITE       │  2005 │      18816 │
│ AL      │   0.0 │ NH_M_WHITE       │  2006 │      18877 │
│ AL      │   0.0 │ NH_M_WHITE       │  2007 │      19027 │
│ AL      │   0.0 │ NH_M_WHITE       │  2008 │      18937 │
│ AL      │   0.0 │ NH_M_WHITE       │  2009 │      18039 │
│ ·       │    ·  │     ·            │    ·  │          · │
│ ·       │    ·  │     ·            │    ·  │          · │
│ ·       │    ·  │     ·            │  

# Loading CDI dimension tables

#### Location table

In [52]:
# S3 location of the CDI Location dimension. The trailing `*.parquet` glob selects
# every Parquet file stored under the `Location.parquet/` prefix.
cdi_location_url = "s3://chronic-disease-analyses-bucket/cdi-data-transformed/Location.parquet/*.parquet"
# Bare name as the last expression echoes the value as the cell output.
cdi_location_url

's3://chronic-disease-analyses-bucket/cdi-data-transformed/Location.parquet/*.parquet'

In [53]:
# Build the statement that (re)creates the CDILocation table.
# Unlike the list form, a single path passed to read_parquet has to be wrapped in
# single quotes inside the SQL so DuckDB parses it as a string literal.
# union_by_name=True aligns columns across files by name; filename=False keeps
# the source-file column out of the result.
location_parquet_scan = f"read_parquet('{cdi_location_url}', union_by_name=True, filename=False)"
query = f"""
    CREATE OR REPLACE TABLE CDILocation AS
    SELECT *
    FROM {location_parquet_scan}
"""

In [54]:
# Execute the statement currently held in `query` (the CDILocation CREATE OR
# REPLACE TABLE built in the preceding cell). `query` is reassigned by several
# cells, so run this immediately after the cell that builds it.
conn.sql(query)

In [55]:
# Preview the CDILocation dimension table.
# NOTE(review): in the rendered output the Latitude column holds values such as
# -106.24 (outside the valid [-90, 90] range) while Longitude holds 34.52 for
# New Mexico — the two columns look swapped upstream; confirm in the
# transformation step before using them for mapping.
location_preview_sql = """
    SELECT *
    FROM CDILocation
"""
conn.sql(location_preview_sql)

┌────────────┬────────────────┬─────────────────────┬────────────────────┐
│ LocationID │  LocationDesc  │      Latitude       │     Longitude      │
│  varchar   │    varchar     │       double        │       double       │
├────────────┼────────────────┼─────────────────────┼────────────────────┤
│ NM         │ New Mexico     │ -106.24058098499967 │  34.52088095200048 │
│ OK         │ Oklahoma       │  -97.52107021399968 │  35.47203135600046 │
│ GA         │ Georgia        │  -83.62758034599966 │  32.83968109300048 │
│ KY         │ Kentucky       │  -84.77497104799966 │ 37.645970271000465 │
│ ND         │ North Dakota   │ -100.11842104899966 │  47.47531977900047 │
│ WY         │ Wyoming        │ -108.10983035299967 │  43.23554134300048 │
│ TN         │ Tennessee      │  -85.77449091399967 │  35.68094058000048 │
│ LA         │ Louisiana      │  -92.44568007099969 │  31.31266064400046 │
│ NE         │ Nebraska       │  -99.36572062299967 │   41.6410409880005 │
│ AK         │ Alaska    

#### Stratification table

In [56]:
# S3 location of the CDI Stratification dimension. The trailing `*.parquet` glob
# selects every Parquet file stored under the `Stratification.parquet/` prefix.
cdi_stratification_url = "s3://chronic-disease-analyses-bucket/cdi-data-transformed/Stratification.parquet/*.parquet"
# Bare name as the last expression echoes the value as the cell output.
cdi_stratification_url

's3://chronic-disease-analyses-bucket/cdi-data-transformed/Stratification.parquet/*.parquet'

In [57]:
# Build the statement that (re)creates the CDIStratification table.
# union_by_name=True aligns columns across files by name; filename=False keeps
# the source-file column out of the result.
stratification_parquet_scan = f"read_parquet('{cdi_stratification_url}', union_by_name=True, filename=False)"
query = f"""
    CREATE OR REPLACE TABLE CDIStratification AS
    SELECT *
    FROM {stratification_parquet_scan}
"""

In [58]:
# Execute the statement currently held in `query` (the CDIStratification CREATE OR
# REPLACE TABLE built in the preceding cell). `query` is reassigned by several
# cells, so run this immediately after the cell that builds it.
conn.sql(query)

In [59]:
# Preview the CDIStratification dimension table; the returned relation is
# rendered as the cell output.
stratification_preview_sql = """
    SELECT *
    FROM CDIStratification
"""
conn.sql(stratification_preview_sql)

┌─────────┬─────────────┬──────────────┬──────────────────┐
│   Sex   │  Ethnicity  │    Origin    │ StratificationID │
│ varchar │   varchar   │   varchar    │     varchar      │
├─────────┼─────────────┼──────────────┼──────────────────┤
│ Male    │ All         │ Both         │ B_M_ALL          │
│ Both    │ White       │ Not Hispanic │ NH_B_WHITE       │
│ Both    │ Black       │ Not Hispanic │ NH_B_BLACK       │
│ Both    │ AIAN        │ Not Hispanic │ NH_B_AIAN        │
│ Both    │ Multiracial │ Not Hispanic │ NH_B_MULTI       │
│ Both    │ NHPI        │ Not Hispanic │ NH_B_NHPI        │
│ Both    │ Other       │ Not Hispanic │ NH_B_OTHER       │
│ Female  │ All         │ Both         │ B_F_ALL          │
│ Both    │ All         │ Hispanic     │ H_B_ALL          │
│ Both    │ All         │ Both         │ B_B_ALL          │
│ Both    │ Asian       │ Not Hispanic │ NH_B_ASIAN       │
├─────────┴─────────────┴──────────────┴──────────────────┤
│ 11 rows                               

#### Question table

In [60]:
# S3 location of the CDI Question dimension. The trailing `*.parquet` glob selects
# every Parquet file stored under the `Question.parquet/` prefix.
question_url = "s3://chronic-disease-analyses-bucket/cdi-data-transformed/Question.parquet/*.parquet"
# Bare name as the last expression echoes the value as the cell output.
question_url

's3://chronic-disease-analyses-bucket/cdi-data-transformed/Question.parquet/*.parquet'

In [61]:
# Build the statement that (re)creates the Question table.
# union_by_name=True aligns columns across files by name; filename=False keeps
# the source-file column out of the result.
question_parquet_scan = f"read_parquet('{question_url}', union_by_name=True, filename=False)"
query = f"""
    CREATE OR REPLACE TABLE Question AS
    SELECT *
    FROM {question_parquet_scan}
"""

In [62]:
# Execute the statement currently held in `query` (the Question CREATE OR REPLACE
# TABLE built in the preceding cell). `query` is reassigned by several cells, so
# run this immediately after the cell that builds it.
conn.sql(query)

In [63]:
# Preview the Question dimension table; the returned relation is rendered as the
# cell output.
question_preview_sql = """
    SELECT *
    FROM Question
"""
conn.sql(question_preview_sql)

┌────────────┬─────────┬──────────────────────────────────────────────────────────────────────────────────────────────────────────────┬─────────────────────────────────────────────────┬──────────┬────────┐
│ QuestionID │ TopicID │                                                   Question                                                   │                      Topic                      │ AgeStart │ AgeEnd │
│  varchar   │ varchar │                                                   varchar                                                    │                     varchar                     │  double  │ double │
├────────────┼─────────┼──────────────────────────────────────────────────────────────────────────────────────────────────────────────┼─────────────────────────────────────────────────┼──────────┼────────┤
│ ART1_1     │ ART     │ Arthritis among adults aged >= 18 years                                                                      │ Arthritis                               

#### DataValueType table

In [64]:
# S3 location of the CDI DataValueType dimension. The trailing `*.parquet` glob
# selects every Parquet file stored under the `DataValueType.parquet/` prefix.
data_value_type_url = "s3://chronic-disease-analyses-bucket/cdi-data-transformed/DataValueType.parquet/*.parquet"
# Bare name as the last expression echoes the value as the cell output.
data_value_type_url

's3://chronic-disease-analyses-bucket/cdi-data-transformed/DataValueType.parquet/*.parquet'

In [65]:
# Build the statement that (re)creates the DataValueType table.
# union_by_name=True aligns columns across files by name; filename=False keeps
# the source-file column out of the result.
data_value_type_parquet_scan = f"read_parquet('{data_value_type_url}', union_by_name=True, filename=False)"
query = f"""
    CREATE OR REPLACE TABLE DataValueType AS
    SELECT *
    FROM {data_value_type_parquet_scan}
"""

In [66]:
# Execute the statement currently held in `query` (the DataValueType CREATE OR
# REPLACE TABLE built in the preceding cell). `query` is reassigned by several
# cells, so run this immediately after the cell that builds it.
conn.sql(query)

In [67]:
# Preview the DataValueType dimension table; the returned relation is rendered as
# the cell output.
data_value_type_preview_sql = """
    SELECT *
    FROM DataValueType
"""
conn.sql(data_value_type_preview_sql)

┌───────────────────┬──────────────────────────────────────────┐
│  DataValueTypeID  │              DataValueType               │
│      varchar      │                 varchar                  │
├───────────────────┼──────────────────────────────────────────┤
│ MEDIAN            │ Median                                   │
│ NMBR              │ Number                                   │
│ AGEADJRATE        │ Age-adjusted Rate                        │
│ PERCAPALC         │ Per capita alcohol consumption           │
│ USD               │ US Dollars                               │
│ AGEADJMEAN        │ Age-adjusted Mean                        │
│ PREV              │ Prevalence                               │
│ MEAN              │ Mean                                     │
│ AVGANNCRDRATE     │ Average Annual Crude Rate                │
│ CRDRATE           │ Crude Rate                               │
│ AGEADJPREV        │ Age-adjusted Prevalence                  │
│ AVGANNAGEADJRATE  │ Ave

# Loading Population dimension tables

#### State table 

In [68]:
# S3 location of the Population State dimension. The trailing `*.parquet` glob
# selects every Parquet file stored under the `State.parquet/` prefix.
population_state_url = "s3://chronic-disease-analyses-bucket/population-data-transformed/State.parquet/*.parquet"
# Bare name as the last expression echoes the value as the cell output.
population_state_url

's3://chronic-disease-analyses-bucket/population-data-transformed/State.parquet/*.parquet'

In [69]:
# Build the statement that (re)creates the PopulationState table.
# union_by_name=True aligns columns across files by name; filename=False keeps
# the source-file column out of the result.
state_parquet_scan = f"read_parquet('{population_state_url}', union_by_name=True, filename=False)"
query = f"""
    CREATE OR REPLACE TABLE PopulationState AS
    SELECT *
    FROM {state_parquet_scan}
"""

In [70]:
# Execute the statement currently held in `query` (the PopulationState CREATE OR
# REPLACE TABLE built in the preceding cell). `query` is reassigned by several
# cells, so run this immediately after the cell that builds it.
conn.sql(query)

In [71]:
# Preview the PopulationState dimension table.
# NOTE(review): the rendered output shows Massachusetts with a NULL StateID —
# joins on StateID will silently drop that state; confirm and fix upstream.
state_preview_sql = """
    SELECT *
    FROM PopulationState
"""
conn.sql(state_preview_sql)

┌───────────────┬─────────┐
│     State     │ StateID │
│    varchar    │ varchar │
├───────────────┼─────────┤
│ Mississippi   │ MS      │
│ South Dakota  │ SD      │
│ Utah          │ UT      │
│ Kentucky      │ KY      │
│ California    │ CA      │
│ Nebraska      │ NE      │
│ New Hampshire │ NH      │
│ Delaware      │ DE      │
│ Minnesota     │ MN      │
│ Nevada        │ NV      │
│   ·           │ ·       │
│   ·           │ ·       │
│   ·           │ ·       │
│ Michigan      │ MI      │
│ Illinois      │ IL      │
│ Maryland      │ MD      │
│ Alaska        │ AK      │
│ Connecticut   │ CT      │
│ Vermont       │ VT      │
│ Oklahoma      │ OK      │
│ West Virginia │ WV      │
│ Virginia      │ VA      │
│ Massachusetts │ NULL    │
├───────────────┴─────────┤
│   51 rows (20 shown)    │
└─────────────────────────┘

#### Stratification table

In [72]:
# S3 location of the Population Stratification dimension. The trailing `*.parquet`
# glob selects every Parquet file stored under the `Stratification.parquet/` prefix.
population_stratification_url = "s3://chronic-disease-analyses-bucket/population-data-transformed/Stratification.parquet/*.parquet"
# Bare name as the last expression echoes the value as the cell output.
population_stratification_url

's3://chronic-disease-analyses-bucket/population-data-transformed/Stratification.parquet/*.parquet'

In [73]:
# Build the statement that (re)creates the PopulationStratification table.
# union_by_name=True aligns columns across files by name; filename=False keeps
# the source-file column out of the result.
population_stratification_scan = f"read_parquet('{population_stratification_url}', union_by_name=True, filename=False)"
query = f"""
    CREATE OR REPLACE TABLE PopulationStratification AS
    SELECT *
    FROM {population_stratification_scan}
"""

In [74]:
# Execute the statement currently held in `query` (the PopulationStratification
# CREATE OR REPLACE TABLE built in the preceding cell). `query` is reassigned by
# several cells, so run this immediately after the cell that builds it.
conn.sql(query)

In [75]:
# Preview the PopulationStratification dimension table; the returned relation is
# rendered as the cell output.
population_stratification_preview_sql = """
    SELECT *
    FROM PopulationStratification
"""
conn.sql(population_stratification_preview_sql)

┌─────────┬─────────────┬──────────────┬──────────────────┐
│   Sex   │  Ethnicity  │    Origin    │ StratificationID │
│ varchar │   varchar   │   varchar    │     varchar      │
├─────────┼─────────────┼──────────────┼──────────────────┤
│ Female  │ Black       │ Not Hispanic │ NH_F_BLACK       │
│ Female  │ Asian       │ Not Hispanic │ NH_F_ASIAN       │
│ Female  │ NHPI        │ Hispanic     │ H_F_NHPI         │
│ Male    │ Asian       │ Not Hispanic │ NH_M_ASIAN       │
│ Male    │ White       │ Not Hispanic │ NH_M_WHITE       │
│ Male    │ AIAN        │ Not Hispanic │ NH_M_AIAN        │
│ Male    │ Multiracial │ Hispanic     │ H_M_MULTI        │
│ Male    │ Black       │ Not Hispanic │ NH_M_BLACK       │
│ Male    │ Black       │ Hispanic     │ H_M_BLACK        │
│ Female  │ Asian       │ Hispanic     │ H_F_ASIAN        │
│  ·      │   ·         │    ·         │     ·            │
│  ·      │   ·         │    ·         │     ·            │
│  ·      │   ·         │    ·         │