# Now that all necessary data have now been extracted, transformed, and dumped to parquet files we can start doing our analyses by reading these parquet files as tables with duckdb (an in process OLAP) and then use these tables to make our transformations and draw insights

In [2]:
import duckdb

%load_ext autoreload
%autoreload 2

# Create database in order to persist data if one does not already exist. (DO NOT create a `.duckdb` file manually as this will not contain the instructions for duckdb to read and write data from and to)

In [3]:
# conn = duckdb.connect("chronic_disease_analyses.duckdb")

# loading transformed population tables

In [4]:
us_populations_per_state_file_names = ['./data/population-data-transformed/us_populations_per_state.parquet/*.parquet']

In [5]:
duckdb.sql(f"""
    CREATE OR REPLACE TABLE us_populations_per_state AS
    SELECT *
    FROM read_parquet({us_populations_per_state_file_names}, union_by_name=True, filename=False)
""")

In [6]:
us_populations_per_state_by_sex_age_file_names = ['./data/population-data-transformed/us_populations_per_state_by_sex_age_2000_2009.parquet/*.parquet',
    './data/population-data-transformed/us_populations_per_state_by_sex_age_2010_2019.parquet/*.parquet',
    './data/population-data-transformed/us_populations_per_state_by_sex_age_2020_2023.parquet/*.parquet']

In [7]:
duckdb.sql(f"""
    CREATE OR REPLACE TABLE us_populations_per_state_by_sex_age AS
    SELECT *
    FROM read_parquet({us_populations_per_state_by_sex_age_file_names}, union_by_name=True, filename=False)
""")

In [8]:
us_populations_per_state_by_sex_race_ho_file_names = ['./data/population-data-transformed/us_populations_per_state_by_sex_race_ho_2000_2009.parquet/*.parquet',
    './data/population-data-transformed/us_populations_per_state_by_sex_race_ho_2010_2019.parquet/*.parquet',
    './data/population-data-transformed/us_populations_per_state_by_sex_race_ho_2020_2023.parquet/*.parquet']

In [9]:
duckdb.sql(f"""
    CREATE OR REPLACE TABLE us_populations_per_state_by_sex_race_ho AS
    SELECT *
    FROM read_parquet({us_populations_per_state_by_sex_race_ho_file_names}, union_by_name=True, filename=False)
""")

In [10]:
us_populations_per_state_by_sex_age_race_ho_file_names = ['./data/population-data-transformed/us_populations_per_state_by_sex_age_race_ho_2000_2009.parquet/*.parquet',
    './data/population-data-transformed/us_populations_per_state_by_sex_age_race_ho_2010_2019.parquet/*.parquet',
    './data/population-data-transformed/us_populations_per_state_by_sex_age_race_ho_2020_2023.parquet/*.parquet']

In [11]:
duckdb.sql(f"""
    CREATE OR REPLACE TABLE us_populations_per_state_by_sex_age_race_ho AS
    SELECT *
    FROM read_parquet({us_populations_per_state_by_sex_age_race_ho_file_names}, union_by_name=True, filename=False)
""")

In [12]:
duckdb.sql("""
    SELECT COUNT(*)
    FROM us_populations_per_state
""")

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│         1071 │
└──────────────┘

In [13]:
duckdb.sql("""
    SELECT *
    FROM us_populations_per_state_by_sex_age
""")

┌────────────────┬───────┬────────────┬─────────┬──────────┬──────────┬────────┐
│    Bracket     │ Year  │ Population │   Sex   │  State   │ AgeStart │ AgeEnd │
│    varchar     │ int32 │   int64    │ varchar │ varchar  │  double  │ double │
├────────────────┼───────┼────────────┼─────────┼──────────┼──────────┼────────┤
│ under 5 years  │  2000 │     150609 │ Male    │ Alabama  │      0.0 │    5.0 │
│ under 5 years  │  2001 │     151410 │ Male    │ Alabama  │      0.0 │    5.0 │
│ under 5 years  │  2002 │     150856 │ Male    │ Alabama  │      0.0 │    5.0 │
│ under 5 years  │  2003 │     150594 │ Male    │ Alabama  │      0.0 │    5.0 │
│ under 5 years  │  2004 │     150699 │ Male    │ Alabama  │      0.0 │    5.0 │
│ under 5 years  │  2005 │     150960 │ Male    │ Alabama  │      0.0 │    5.0 │
│ under 5 years  │  2006 │     151442 │ Male    │ Alabama  │      0.0 │    5.0 │
│ under 5 years  │  2007 │     153128 │ Male    │ Alabama  │      0.0 │    5.0 │
│ under 5 years  │  2008 │  

In [14]:
duckdb.sql("""
    SELECT COUNT(*)
    FROM us_populations_per_state_by_sex_race_ho
""")

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│        29376 │
└──────────────┘

In [15]:
duckdb.sql("""
    SELECT DISTINCT(Ethnicity)
    FROM us_populations_per_state_by_sex_race_ho
""")

┌─────────────┐
│  Ethnicity  │
│   varchar   │
├─────────────┤
│ White       │
│ Black       │
│ Aian        │
│ AIAN        │
│ Multiracial │
│ Nhpi        │
│ Asian       │
│ NHPI        │
└─────────────┘

In [16]:
duckdb.sql("""
    SELECT *
    FROM us_populations_per_state_by_sex_age_race_ho
""")

┌─────────┬───────┬─────────────┬──────────────┬─────────┬───────┬────────────┐
│  State  │  Age  │  Ethnicity  │    Origin    │   Sex   │ Year  │ Population │
│ varchar │ int32 │   varchar   │   varchar    │ varchar │ int32 │   int64    │
├─────────┼───────┼─────────────┼──────────────┼─────────┼───────┼────────────┤
│ Alabama │     0 │ White       │ Not Hispanic │ Male    │  2000 │      19270 │
│ Alabama │     0 │ White       │ Not Hispanic │ Male    │  2001 │      19612 │
│ Alabama │     0 │ White       │ Not Hispanic │ Male    │  2002 │      18731 │
│ Alabama │     0 │ White       │ Not Hispanic │ Male    │  2003 │      18623 │
│ Alabama │     0 │ White       │ Not Hispanic │ Male    │  2004 │      18659 │
│ Alabama │     0 │ White       │ Not Hispanic │ Male    │  2005 │      18816 │
│ Alabama │     0 │ White       │ Not Hispanic │ Male    │  2006 │      18877 │
│ Alabama │     0 │ White       │ Not Hispanic │ Male    │  2007 │      19027 │
│ Alabama │     0 │ White       │ Not Hi

In [17]:
duckdb.sql("""
    SELECT Age, COUNT(Age) AS AgeOcc
    FROM us_populations_per_state_by_sex_age_race_ho
    GROUP BY Age
""")

┌───────┬────────┐
│  Age  │ AgeOcc │
│ int32 │ int64  │
├───────┼────────┤
│     0 │  29376 │
│     1 │  29376 │
│     2 │  29376 │
│     3 │  29376 │
│     4 │  29376 │
│     5 │  29376 │
│     6 │  29376 │
│     7 │  29376 │
│     8 │  29376 │
│     9 │  29376 │
│     · │    ·   │
│     · │    ·   │
│     · │    ·   │
│    76 │  29376 │
│    77 │  29376 │
│    78 │  29376 │
│    79 │  29376 │
│    80 │  29376 │
│    81 │  29376 │
│    82 │  29376 │
│    83 │  29376 │
│    84 │  29376 │
│    85 │  29376 │
├───────┴────────┤
│    86 rows     │
│   (20 shown)   │
└────────────────┘

In [18]:
population_unique_states = duckdb.sql("""
    SELECT DISTINCT(State)
    FROM us_populations_per_state
""").fetchall()

In [19]:
population_state_names = list(zip(*population_unique_states))[0]
population_state_names

('Arizona',
 'Montana',
 'Nebraska',
 'North Carolina',
 'Alabama',
 'California',
 'Colorado',
 'Illinois',
 'Mississippi',
 'Indiana',
 'Maryland',
 'New Mexico',
 'Washington',
 'Michigan',
 'Missouri',
 'Pennsylvania',
 'Georgia',
 'South Dakota',
 'Vermont',
 'Oklahoma',
 'Utah',
 'Virginia',
 'Wyoming',
 'Delaware',
 'Hawaii',
 'North Dakota',
 'Arkansas',
 'Idaho',
 'South Carolina',
 'West Virginia',
 'Connecticut',
 'Maine',
 'Massachusetts',
 'Oregon',
 'New Jersey',
 'Tennessee',
 'Wisconsin',
 'Iowa',
 'Kansas',
 'Louisiana',
 'Ohio',
 'Rhode Island',
 'Texas',
 'Kentucky',
 'New York',
 'Alaska',
 'District of Columbia',
 'Florida',
 'Minnesota',
 'Nevada',
 'New Hampshire')

# Loading transformed chronic disease indicators table

In [20]:
cdi_file_name = './data/cdi-data-transformed/cdi.parquet/*.parquet'

In [21]:
q = f"""
    CREATE OR REPLACE TABLE cdi AS
    SELECT *
    FROM read_parquet('{cdi_file_name}')
"""

In [22]:
duckdb.sql(q)

In [23]:
duckdb.sql("""
    SELECT COUNT(*)
    FROM cdi
""")

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│       678471 │
└──────────────┘

In [24]:
duckdb.sql("""
    SELECT *
    FROM cdi
""")

┌───────────┬─────────┬──────────────┬────────────────┬─────────┬─────────────────────────────────────────────────────────────────────────────────────────┬───────────────┬─────────────────────────┬───────────┬────────────────────┬─────────────────────┬─────────────────────────┬────────────┬─────────┬────────────┬─────────────────┬───────────────────────────┬───────────────────┬─────────────────────┬────────────────────┬──────────┬────────┬─────────┬───────────┬──────────────┐
│ YearStart │ YearEnd │ LocationAbbr │  LocationDesc  │  Topic  │                                        Question                                         │ DataValueUnit │      DataValueType      │ DataValue │ LowConfidenceLimit │ HighConfidenceLimit │ StratificationCategory1 │ LocationID │ TopicID │ QuestionID │ DataValueTypeID │ StratificationCategory1ID │ Stratification1ID │      Latitude       │     Longitude      │ AgeStart │ AgeEnd │   Sex   │ Ethnicity │    Origin    │
│   int32   │  int32  │   varchar    │

#### unique pairs of question id and topic id

In [25]:
duckdb.sql("""
    SELECT TopicID, QuestionID
    FROM cdi
    GROUP BY TopicID, QuestionID
""")

┌─────────┬────────────┐
│ TopicID │ QuestionID │
│ varchar │  varchar   │
├─────────┼────────────┤
│ COPD    │ COPD5_4    │
│ COPD    │ COPD6_2    │
│ CVD     │ CVD4_0     │
│ CVD     │ CVD7_0     │
│ DIA     │ DIA11_1    │
│ DIA     │ DIA12_1    │
│ IMM     │ IMM1_0     │
│ NPAW    │ NPAW1_2    │
│ ORH     │ ORH4_2     │
│ OVC     │ OVC6_2     │
│  ·      │   ·        │
│  ·      │   ·        │
│  ·      │   ·        │
│ COPD    │ COPD5_2    │
│ CVD     │ CVD1_5     │
│ DIA     │ DIA7_0     │
│ NPAW    │ NPAW4_1    │
│ NPAW    │ NPAW5_2    │
│ OVC     │ OVC1_1     │
│ OVC     │ OVC7_1     │
│ AST     │ AST4_1     │
│ AST     │ AST5_1     │
│ CAN     │ CAN9_1     │
├─────────┴────────────┤
│ 192 rows (20 shown)  │
└──────────────────────┘

In [26]:
cdi_unique_states = duckdb.sql("""
    SELECT LocationAbbr, LocationDesc
    FROM cdi
    GROUP BY LocationAbbr, LocationDesc
""").fetchall()
cdi_unique_states

[('UT', 'Utah'),
 ('RI', 'Rhode Island'),
 ('AK', 'Alaska'),
 ('KS', 'Kansas'),
 ('TN', 'Tennessee'),
 ('OH', 'Ohio'),
 ('MS', 'Mississippi'),
 ('WY', 'Wyoming'),
 ('MN', 'Minnesota'),
 ('NV', 'Nevada'),
 ('HI', 'Hawaii'),
 ('ND', 'North Dakota'),
 ('FL', 'Florida'),
 ('NM', 'New Mexico'),
 ('WI', 'Wisconsin'),
 ('AZ', 'Arizona'),
 ('IL', 'Illinois'),
 ('MD', 'Maryland'),
 ('NH', 'New Hampshire'),
 ('NE', 'Nebraska'),
 ('WV', 'West Virginia'),
 ('CO', 'Colorado'),
 ('IN', 'Indiana'),
 ('DC', 'District of Columbia'),
 ('ME', 'Maine'),
 ('DE', 'Delaware'),
 ('GA', 'Georgia'),
 ('VT', 'Vermont'),
 ('MT', 'Montana'),
 ('AR', 'Arkansas'),
 ('LA', 'Louisiana'),
 ('CA', 'California'),
 ('KY', 'Kentucky'),
 ('MO', 'Missouri'),
 ('TX', 'Texas'),
 ('CT', 'Connecticut'),
 ('NY', 'New York'),
 ('SC', 'South Carolina'),
 ('SD', 'South Dakota'),
 ('OK', 'Oklahoma'),
 ('IA', 'Iowa'),
 ('AL', 'Alabama'),
 ('MI', 'Michigan'),
 ('WA', 'Washington'),
 ('ID', 'Idaho'),
 ('OR', 'Oregon'),
 ('MA', 'Massachu

In [27]:
cdi_state_codes, cdi_state_names = zip(*cdi_unique_states)
cdi_state_codes

('UT',
 'RI',
 'AK',
 'KS',
 'TN',
 'OH',
 'MS',
 'WY',
 'MN',
 'NV',
 'HI',
 'ND',
 'FL',
 'NM',
 'WI',
 'AZ',
 'IL',
 'MD',
 'NH',
 'NE',
 'WV',
 'CO',
 'IN',
 'DC',
 'ME',
 'DE',
 'GA',
 'VT',
 'MT',
 'AR',
 'LA',
 'CA',
 'KY',
 'MO',
 'TX',
 'CT',
 'NY',
 'SC',
 'SD',
 'OK',
 'IA',
 'AL',
 'MI',
 'WA',
 'ID',
 'OR',
 'MA',
 'NJ',
 'NC',
 'PA',
 'VA')

In [28]:
set(cdi_state_names) - set(population_state_names)

set()

In [29]:
duckdb.sql("""
    SELECT Sex, Ethnicity, Origin
    FROM cdi
    GROUP BY Sex, Ethnicity, Origin
""")

┌─────────┬─────────────┬──────────────┐
│   Sex   │  Ethnicity  │    Origin    │
│ varchar │   varchar   │   varchar    │
├─────────┼─────────────┼──────────────┤
│ Both    │ All         │ Both         │
│ Female  │ All         │ Both         │
│ Both    │ Other       │ Not Hispanic │
│ Both    │ NHPI        │ Not Hispanic │
│ Both    │ Multiracial │ Not Hispanic │
│ Both    │ Black       │ Not Hispanic │
│ Both    │ Asian       │ Not Hispanic │
│ Both    │ All         │ Hispanic     │
│ Both    │ White       │ Not Hispanic │
│ Both    │ AIAN        │ Not Hispanic │
│ Male    │ All         │ Both         │
├─────────┴─────────────┴──────────────┤
│ 11 rows                    3 columns │
└──────────────────────────────────────┘

In [30]:
duckdb.sql("""
    SELECT DISTINCT(Sex)
    FROM cdi
""")

┌─────────┐
│   Sex   │
│ varchar │
├─────────┤
│ Both    │
│ Male    │
│ Female  │
└─────────┘

In [31]:
duckdb.sql("""
    SELECT DISTINCT(Ethnicity)
    FROM cdi
""")

┌─────────────┐
│  Ethnicity  │
│   varchar   │
├─────────────┤
│ Other       │
│ Asian       │
│ White       │
│ Black       │
│ Multiracial │
│ AIAN        │
│ All         │
│ NHPI        │
└─────────────┘

In [32]:
duckdb.sql("""
    SELECT DISTINCT(Origin)
    FROM cdi
""")

┌──────────────┐
│    Origin    │
│   varchar    │
├──────────────┤
│ Hispanic     │
│ Not Hispanic │
│ Both         │
└──────────────┘