# Now that all the necessary data has been extracted, transformed, and dumped to parquet files, we can start our analyses by reading these parquet files as tables with DuckDB (an in-process OLAP database) and then using these tables to make our transformations and draw insights

In [375]:
import duckdb

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Create a database file to persist data if one does not already exist. (DO NOT create the `.duckdb` file manually: a hand-made file will not contain the internal structure that DuckDB needs in order to read data from it and write data to it.)

In [None]:
# Open a connection to the on-disk database file "chronic_disease_analyses.duckdb"
# and keep the handle in `conn` (it is closed by `conn.close()` further down).
# NOTE(review): the cells visible below call the module-level `duckdb.sql` and
# `duckdb.read_parquet` rather than methods on `conn`, so they presumably run on
# DuckDB's default in-memory connection and nothing is persisted to this file.
# Confirm whether `conn.sql(...)` / `conn.read_parquet(...)` was intended.
conn = duckdb.connect("chronic_disease_analyses.duckdb")

# Loading transformed population tables

In [377]:
# Glob pattern(s) matching every parquet file inside the
# `us_populations_per_state.parquet` directory.
us_populations_per_state_file_names = [
    './data/population-data-transformed/us_populations_per_state.parquet/*.parquet',
]

In [378]:
# Relation over the state-level population parquet files: columns are matched
# by name across files and no extra filename column is added.
us_populations_per_state = duckdb.read_parquet(
    us_populations_per_state_file_names,
    union_by_name=True,
    filename=False,
)

In [379]:
# One glob per year range (2000-2009, 2010-2019, 2020-2023) of the
# population-by-sex-and-age parquet directories.
us_populations_per_state_by_sex_age_file_names = [
    './data/population-data-transformed/us_populations_per_state_by_sex_age_2000_2009.parquet/*.parquet',
    './data/population-data-transformed/us_populations_per_state_by_sex_age_2010_2019.parquet/*.parquet',
    './data/population-data-transformed/us_populations_per_state_by_sex_age_2020_2023.parquet/*.parquet',
]

In [380]:
# Single relation spanning all three year-range directories: columns are
# matched by name across files and no extra filename column is added.
us_populations_per_state_by_sex_age = duckdb.read_parquet(
    us_populations_per_state_by_sex_age_file_names,
    union_by_name=True,
    filename=False,
)

In [381]:
# One glob per year range (2000-2009, 2010-2019, 2020-2023) of the
# population-by-sex-race-ho parquet directories.
us_populations_per_state_by_sex_race_ho_file_names = [
    './data/population-data-transformed/us_populations_per_state_by_sex_race_ho_2000_2009.parquet/*.parquet',
    './data/population-data-transformed/us_populations_per_state_by_sex_race_ho_2010_2019.parquet/*.parquet',
    './data/population-data-transformed/us_populations_per_state_by_sex_race_ho_2020_2023.parquet/*.parquet',
]

In [382]:
# Single relation spanning all three year-range directories: columns are
# matched by name across files and no extra filename column is added.
us_populations_per_state_by_sex_race_ho = duckdb.read_parquet(
    us_populations_per_state_by_sex_race_ho_file_names,
    union_by_name=True,
    filename=False,
)

# Creating tables to copy population tables into

In [383]:
# duckdb.sql("""
#     CREATE TABLE IF NOT EXISTS us_populations_per_state (
#         State VARCHAR,
#         Population BIGINT,
#         Year INT,
#     )
# """)

In [384]:
# duckdb.sql("""
#     CREATE TABLE IF NOT EXISTS us_populations_per_state_by_sex_age (
#         Bracket VARCHAR,
#         Year INT,
#         Population BIGINT,
#         Sex VARCHAR,
#         State VARCHAR,
#         AgeStart DOUBLE,
#         AgeEnd DOUBLE
#     )
# """)

In [385]:
# duckdb.sql("""
#     CREATE TABLE IF NOT EXISTS us_populations_per_state_by_sex_race_ho (
#         Ethnicity VARCHAR,
#         Origin VARCHAR,
#         Sex VARCHAR,
#         Year INT,
#         Population BIGINT,
#         State VARCHAR
#     )
# """)

# Insert data into table

#### us populations per state

In [386]:
# q = f"""
#     INSERT INTO us_populations_per_state
#     SELECT * FROM read_parquet({us_populations_per_state_file_names}, union_by_name=True, filename=False)
# """

In [387]:
# q

In [388]:
# duckdb.sql(q)

#### us populations per state by sex and age

In [389]:
# q = f"""
#     INSERT INTO us_populations_per_state_by_sex_age
#     SELECT * FROM read_parquet({us_populations_per_state_by_sex_age_file_names}, union_by_name=True, filename=False)
# """

In [390]:
# q

In [391]:
# duckdb.sql(q)

In [392]:
# duckdb.sql("""
#     SELECT Sex FROM us_populations_per_state_by_sex_age
# """)

#### us populations per state by sex, race, and ethnicity

In [393]:
# q = f"""
#     INSERT INTO us_populations_per_state_by_sex_race_ho
#     SELECT * FROM read_parquet({us_populations_per_state_by_sex_race_ho_file_names}, union_by_name=True, filename=False)
# """

In [394]:
# q

In [395]:
# duckdb.sql(q)

In [406]:
# Row-count check for `us_populations_per_state`.
state_population_count_sql = """
    SELECT COUNT(*)
    FROM us_populations_per_state
"""
duckdb.sql(state_population_count_sql)

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│         1071 │
└──────────────┘

In [405]:
# Row-count check for `us_populations_per_state_by_sex_age`.
sex_age_population_count_sql = """
    SELECT COUNT(*)
    FROM us_populations_per_state_by_sex_age
"""
duckdb.sql(sex_age_population_count_sql)

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│       152388 │
└──────────────┘

In [404]:
# Row-count check for `us_populations_per_state_by_sex_race_ho`.
sex_race_ho_population_count_sql = """
    SELECT COUNT(*)
    FROM us_populations_per_state_by_sex_race_ho
"""
duckdb.sql(sex_race_ho_population_count_sql)

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│        29376 │
└──────────────┘

In [401]:
# Clean up the `us_populations_per_state` table if one was created earlier.
# IF EXISTS keeps this cell from raising a catalog error on a fresh kernel,
# where the (currently commented-out) CREATE TABLE cell has not been run.
duckdb.sql("""
    DROP TABLE IF EXISTS us_populations_per_state
""")

In [402]:
# Clean up the `us_populations_per_state_by_sex_age` table if one was created
# earlier. IF EXISTS keeps this cell from raising a catalog error on a fresh
# kernel, where the (currently commented-out) CREATE TABLE cell has not been run.
duckdb.sql("""
    DROP TABLE IF EXISTS us_populations_per_state_by_sex_age
""")
    

In [403]:
# Clean up the `us_populations_per_state_by_sex_race_ho` table if one was
# created earlier. IF EXISTS keeps this cell from raising a catalog error on a
# fresh kernel, where the (currently commented-out) CREATE TABLE cell has not
# been run.
duckdb.sql("""
    DROP TABLE IF EXISTS us_populations_per_state_by_sex_race_ho
""")

# Loading transformed chronic disease indicators table

In [None]:
# Glob matching every parquet file inside the transformed `cdi.parquet` directory.
cdi_file_name = "./data/cdi-data-transformed/cdi.parquet/*.parquet"
# Read with the same options as the population loaders in this notebook
# (`union_by_name=True, filename=False`) so every parquet input is combined the
# same way: columns are matched by name across files rather than by position.
cdi = duckdb.read_parquet(cdi_file_name, union_by_name=True, filename=False)

In [None]:
# Row-count check for the `cdi` relation.
cdi_count_sql = """
    SELECT COUNT(*)
    FROM cdi
"""
duckdb.sql(cdi_count_sql)

In [None]:
# Display the `cdi` relation with all of its columns.
cdi_all_rows_sql = """
    SELECT *
    FROM cdi
"""
duckdb.sql(cdi_all_rows_sql)

In [407]:
# Close the connection opened with `duckdb.connect(...)` near the top of the notebook.
conn.close()