# Now that all the necessary data have been extracted, transformed, and dumped to parquet files, we can start our analyses by reading these parquet files as tables with DuckDB (an in-process OLAP database) and then using these tables to make our transformations and draw insights.

In [430]:
import duckdb

# Enable IPython's autoreload extension; mode 2 asks it to reload imported modules
# before each cell is executed, so edits to local .py files are picked up live.
# NOTE(review): re-running this cell in a live kernel prints an "already loaded"
# notice; `%reload_ext autoreload` is the variant that message itself suggests.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Create a database to persist the data if one does not already exist. (DO NOT create the `.duckdb` file manually: a hand-made file will not contain what DuckDB needs in order to read data from it and write data to it.)

In [431]:
# NOTE(review): the persistent connection below is currently disabled, so the
# `duckdb.sql(...)` calls in this notebook run without an explicit connection —
# presumably against DuckDB's default in-memory database, in which case the tables
# are not persisted across kernel restarts. TODO confirm / re-enable.
# conn = duckdb.connect("chronic_disease_analyses.duckdb")

# Loading transformed population tables

In [432]:
# Glob pattern(s) for the parquet files of the per-state population table.
# Kept as a list (even with a single entry) because the list itself is
# interpolated into the read_parquet(...) call.
us_populations_per_state_file_names = [
    './data/population-data-transformed/us_populations_per_state.parquet/*.parquet',
]

In [433]:
# (Re)create the `us_populations_per_state` table from the parquet glob(s) held in
# `us_populations_per_state_file_names`. The Python list is interpolated through its
# repr, which reads as a DuckDB list literal of single-quoted paths.
# NOTE(review): that only holds while no path contains a quote character.
# Per DuckDB's read_parquet options: union_by_name=True unifies columns across files
# by name rather than position; filename=False leaves out the extra source-file column.
duckdb.sql(f"""
    CREATE OR REPLACE TABLE us_populations_per_state AS
    SELECT *
    FROM read_parquet({us_populations_per_state_file_names}, union_by_name=True, filename=False)
""")

In [434]:
# One parquet glob per year range of the population-by-sex-and-age data.
# The three paths differ only in the year-range suffix, so they are generated from it.
us_populations_per_state_by_sex_age_file_names = [
    f'./data/population-data-transformed/us_populations_per_state_by_sex_age_{year_range}.parquet/*.parquet'
    for year_range in ('2000_2009', '2010_2019', '2020_2023')
]

In [435]:
# (Re)create the `us_populations_per_state_by_sex_age` table from all globs listed in
# `us_populations_per_state_by_sex_age_file_names`. The Python list is interpolated
# through its repr, which reads as a DuckDB list literal of single-quoted paths.
# NOTE(review): that only holds while no path contains a quote character.
# Per DuckDB's read_parquet options: union_by_name=True unifies columns across files
# by name rather than position; filename=False leaves out the extra source-file column.
duckdb.sql(f"""
    CREATE OR REPLACE TABLE us_populations_per_state_by_sex_age AS
    SELECT *
    FROM read_parquet({us_populations_per_state_by_sex_age_file_names}, union_by_name=True, filename=False)
""")

In [436]:
# One parquet glob per year range of the population-by-sex/race/`ho` data.
# The three paths differ only in the year-range suffix, so they are generated from it.
us_populations_per_state_by_sex_race_ho_file_names = [
    f'./data/population-data-transformed/us_populations_per_state_by_sex_race_ho_{year_range}.parquet/*.parquet'
    for year_range in ('2000_2009', '2010_2019', '2020_2023')
]

In [437]:
# (Re)create the `us_populations_per_state_by_sex_race_ho` table from all globs listed
# in `us_populations_per_state_by_sex_race_ho_file_names`. The Python list is
# interpolated through its repr, which reads as a DuckDB list literal of single-quoted paths.
# NOTE(review): that only holds while no path contains a quote character.
# Per DuckDB's read_parquet options: union_by_name=True unifies columns across files
# by name rather than position; filename=False leaves out the extra source-file column.
duckdb.sql(f"""
    CREATE OR REPLACE TABLE us_populations_per_state_by_sex_race_ho AS
    SELECT *
    FROM read_parquet({us_populations_per_state_by_sex_race_ho_file_names}, union_by_name=True, filename=False)
""")

In [438]:
# Sanity check: number of rows in the `us_populations_per_state` table.
duckdb.sql("""
    SELECT COUNT(*)
    FROM us_populations_per_state
""")

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│         1071 │
└──────────────┘

In [439]:
# Sanity check: number of rows in the `us_populations_per_state_by_sex_age` table.
duckdb.sql("""
    SELECT COUNT(*)
    FROM us_populations_per_state_by_sex_age
""")

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│       152388 │
└──────────────┘

In [440]:
# Sanity check: number of rows in the `us_populations_per_state_by_sex_race_ho` table.
duckdb.sql("""
    SELECT COUNT(*)
    FROM us_populations_per_state_by_sex_race_ho
""")

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│        29376 │
└──────────────┘

In [469]:
# Distinct `State` values from the per-state population table. `.fetchall()`
# materialises the result as a list of row tuples (one single-element tuple per state).
# NOTE(review): there is no ORDER BY, so the row order is not guaranteed to be stable
# between runs.
population_unique_states = duckdb.sql("""
    SELECT DISTINCT(State)
    FROM us_populations_per_state
""").fetchall()

In [477]:
# Flatten the single-element row tuples into one flat tuple of state names.
# Taking element 0 of every row (instead of `list(zip(*rows))[0]`) gives the same
# tuple for a non-empty result, and an empty tuple — rather than an IndexError —
# when the query returned no rows.
population_state_names = tuple(row[0] for row in population_unique_states)
population_state_names

('Iowa',
 'Kansas',
 'Louisiana',
 'Ohio',
 'Rhode Island',
 'Texas',
 'Arkansas',
 'Idaho',
 'South Carolina',
 'West Virginia',
 'Alabama',
 'California',
 'Colorado',
 'Illinois',
 'Mississippi',
 'Kentucky',
 'New York',
 'New Jersey',
 'Tennessee',
 'Wisconsin',
 'Indiana',
 'Maryland',
 'Michigan',
 'Missouri',
 'Pennsylvania',
 'Delaware',
 'Hawaii',
 'North Dakota',
 'New Mexico',
 'Washington',
 'Connecticut',
 'Maine',
 'Massachusetts',
 'Oregon',
 'Montana',
 'Nebraska',
 'North Carolina',
 'Oklahoma',
 'Utah',
 'Virginia',
 'Wyoming',
 'Arizona',
 'Alaska',
 'District of Columbia',
 'Florida',
 'Minnesota',
 'Nevada',
 'New Hampshire',
 'Georgia',
 'South Dakota',
 'Vermont')

# Loading transformed chronic disease indicators table

In [447]:
# Glob pattern for the parquet files of the transformed chronic disease indicators data.
cdi_file_name = "./data/cdi-data-transformed/cdi.parquet/*.parquet"

In [455]:
# SQL text that (re)creates the `cdi` table from every parquet file matched by
# `cdi_file_name`. The glob is interpolated inside single quotes, so it reaches
# DuckDB as a string literal.
q = f"""
    CREATE OR REPLACE TABLE cdi AS
    SELECT *
    FROM read_parquet('{cdi_file_name}')
"""

In [456]:
# Run the statement held in `q`.
duckdb.sql(q)

In [457]:
# Sanity check: number of rows in the `cdi` table.
duckdb.sql("""
    SELECT COUNT(*)
    FROM cdi
""")

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│       699340 │
└──────────────┘

In [458]:
# Display the `cdi` relation to inspect its columns and their types.
duckdb.sql("""
    SELECT *
    FROM cdi
""")

┌───────────┬─────────┬──────────────┬────────────────┬─────────┬─────────────────────────────────────────────────────────────────────────────────────────┬───────────────┬─────────────────────────┬───────────┬────────────────────┬─────────────────────┬─────────────────────────┬────────────┬─────────┬────────────┬─────────────────┬───────────────────────────┬───────────────────┬─────────────────────┬────────────────────┬──────────┬────────┬─────────┬───────────┬──────────────┐
│ YearStart │ YearEnd │ LocationAbbr │  LocationDesc  │  Topic  │                                        Question                                         │ DataValueUnit │      DataValueType      │ DataValue │ LowConfidenceLimit │ HighConfidenceLimit │ StratificationCategory1 │ LocationID │ TopicID │ QuestionID │ DataValueTypeID │ StratificationCategoryID1 │ StratificationID1 │      Latitude       │     Longitude      │ AgeStart │ AgeEnd │   Sex   │ Ethnicity │    Origin    │
│   int32   │  int32  │   varchar    │

#### Unique pairs of topic ID and question ID

In [459]:
# GROUP BY with no aggregate collapses the table to one row per distinct
# (TopicID, QuestionID) pair.
duckdb.sql("""
    SELECT TopicID, QuestionID
    FROM cdi
    GROUP BY TopicID, QuestionID
""")

┌─────────┬────────────┐
│ TopicID │ QuestionID │
│ varchar │  varchar   │
├─────────┼────────────┤
│ TOB     │ TOB3_0     │
│ ALC     │ ALC8_0_3   │
│ DIA     │ DIA12_2    │
│ MTH     │ MTH1_0     │
│ NPAW    │ NPAW1_1    │
│ NPAW    │ NPAW3_1    │
│ ORH     │ ORH1_1     │
│ TOB     │ TOB2_2     │
│ TOB     │ TOB8_0     │
│ ALC     │ ALC7_0     │
│  ·      │   ·        │
│  ·      │   ·        │
│  ·      │   ·        │
│ CAN     │ CAN8_1     │
│ CKD     │ CKD3_0     │
│ CVD     │ CVD3_1     │
│ NPAW    │ NPAW10_0   │
│ NPAW    │ NPAW11_3   │
│ NPAW    │ NPAW18_0   │
│ OLD     │ OLD3_1     │
│ OVC     │ OVC2_2     │
│ TOB     │ TOB11_1    │
│ TOB     │ TOB2_1     │
├─────────┴────────────┤
│ 192 rows (20 shown)  │
└──────────────────────┘

In [478]:
# One row per distinct (LocationAbbr, LocationDesc) pair in `cdi`, fetched as a
# list of 2-tuples: (abbreviation, full location name).
# NOTE(review): there is no ORDER BY, so the row order is not guaranteed to be stable
# between runs.
cdi_unique_states = duckdb.sql("""
    SELECT LocationAbbr, LocationDesc
    FROM cdi
    GROUP BY LocationAbbr, LocationDesc
""").fetchall()
cdi_unique_states

[('LA', 'Louisiana'),
 ('KY', 'Kentucky'),
 ('CA', 'California'),
 ('MO', 'Missouri'),
 ('PR', 'Puerto Rico'),
 ('ID', 'Idaho'),
 ('MT', 'Montana'),
 ('VT', 'Vermont'),
 ('AR', 'Arkansas'),
 ('DE', 'Delaware'),
 ('GA', 'Georgia'),
 ('TN', 'Tennessee'),
 ('ND', 'North Dakota'),
 ('FL', 'Florida'),
 ('US', 'United States'),
 ('AK', 'Alaska'),
 ('KS', 'Kansas'),
 ('IL', 'Illinois'),
 ('NM', 'New Mexico'),
 ('WI', 'Wisconsin'),
 ('NH', 'New Hampshire'),
 ('AZ', 'Arizona'),
 ('MD', 'Maryland'),
 ('SD', 'South Dakota'),
 ('OK', 'Oklahoma'),
 ('IA', 'Iowa'),
 ('SC', 'South Carolina'),
 ('UT', 'Utah'),
 ('RI', 'Rhode Island'),
 ('MS', 'Mississippi'),
 ('OH', 'Ohio'),
 ('WY', 'Wyoming'),
 ('HI', 'Hawaii'),
 ('NV', 'Nevada'),
 ('MN', 'Minnesota'),
 ('VI', 'Virgin Islands'),
 ('AL', 'Alabama'),
 ('TX', 'Texas'),
 ('NY', 'New York'),
 ('CT', 'Connecticut'),
 ('MA', 'Massachusetts'),
 ('NC', 'North Carolina'),
 ('OR', 'Oregon'),
 ('NJ', 'New Jersey'),
 ('GU', 'Guam'),
 ('WA', 'Washington'),
 ('MI',

In [480]:
# Split the (LocationAbbr, LocationDesc) rows into two parallel tuples.
# Each tuple is built column by column rather than by unpacking `zip(*rows)`, so an
# empty query result gives two empty tuples instead of raising ValueError; for a
# non-empty result the values are the same as before.
cdi_state_codes = tuple(abbr for abbr, _desc in cdi_unique_states)
cdi_state_names = tuple(desc for _abbr, desc in cdi_unique_states)
cdi_state_codes

('LA',
 'KY',
 'CA',
 'MO',
 'PR',
 'ID',
 'MT',
 'VT',
 'AR',
 'DE',
 'GA',
 'TN',
 'ND',
 'FL',
 'US',
 'AK',
 'KS',
 'IL',
 'NM',
 'WI',
 'NH',
 'AZ',
 'MD',
 'SD',
 'OK',
 'IA',
 'SC',
 'UT',
 'RI',
 'MS',
 'OH',
 'WY',
 'HI',
 'NV',
 'MN',
 'VI',
 'AL',
 'TX',
 'NY',
 'CT',
 'MA',
 'NC',
 'OR',
 'NJ',
 'GU',
 'WA',
 'MI',
 'VA',
 'PA',
 'NE',
 'WV',
 'DC',
 'IN',
 'ME',
 'CO')

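These are the other locations in the CDI table that we shouldn't include, since there are no matching entries for them in the population tables (they are territories or the national `United States` aggregate rather than states):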

In [481]:
# CDI location names that do not occur among the population-table state names.
set(cdi_state_names).difference(population_state_names)

{'Guam', 'Puerto Rico', 'United States', 'Virgin Islands'}