# Now that all the necessary data has been extracted, transformed, and dumped to Parquet files, we can start our analyses by reading these Parquet files as tables with DuckDB (an in-process OLAP database) and then using these tables to make our transformations and draw insights.

In [11]:
import duckdb

# Enable IPython's autoreload extension. Mode 2 re-imports all modules
# before each cell executes, so edits to imported .py files are picked up
# without restarting the kernel.
# NOTE: comments are kept on their own lines; text after a line magic is
# passed to the magic as part of its argument string.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Loading transformed population tables

In [12]:
# Glob pattern(s) for the part files of the per-state population parquet
# dataset. Kept as a list so it matches the shape of the other file-name lists.
us_populations_per_state_file_names = [
    './data/population-data-transformed/us_populations_per_state.parquet/*.parquet',
]

In [13]:
# Read every matched part file into a single DuckDB relation.
# union_by_name=True aligns columns by name across files;
# filename=False means no source-file column is added.
us_populations_per_state = duckdb.read_parquet(
    us_populations_per_state_file_names,
    union_by_name=True,
    filename=False,
)

In [14]:
# One glob per year range of the by-sex-and-age population parquet datasets.
us_populations_per_state_by_sex_age_file_names = [
    # 2000-2009
    './data/population-data-transformed/us_populations_per_state_by_sex_age_2000_2009.parquet/*.parquet',
    # 2010-2019
    './data/population-data-transformed/us_populations_per_state_by_sex_age_2010_2019.parquet/*.parquet',
    # 2020-2023
    './data/population-data-transformed/us_populations_per_state_by_sex_age_2020_2023.parquet/*.parquet',
]

In [15]:
# Combine the three year-range datasets into one DuckDB relation.
# union_by_name=True aligns columns by name across files;
# filename=False means no source-file column is added.
us_populations_per_state_by_sex_age = duckdb.read_parquet(
    us_populations_per_state_by_sex_age_file_names,
    union_by_name=True,
    filename=False,
)

In [16]:
# One glob per year range of the by-sex/race/origin population parquet datasets.
us_populations_per_state_by_sex_race_ho_file_names = [
    # 2000-2009
    './data/population-data-transformed/us_populations_per_state_by_sex_race_ho_2000_2009.parquet/*.parquet',
    # 2010-2019
    './data/population-data-transformed/us_populations_per_state_by_sex_race_ho_2010_2019.parquet/*.parquet',
    # 2020-2023
    './data/population-data-transformed/us_populations_per_state_by_sex_race_ho_2020_2023.parquet/*.parquet',
]

In [17]:
# Combine the three year-range datasets into one DuckDB relation.
# union_by_name=True aligns columns by name across files;
# filename=False means no source-file column is added.
us_populations_per_state_by_sex_race_ho = duckdb.read_parquet(
    us_populations_per_state_by_sex_race_ho_file_names,
    union_by_name=True,
    filename=False,
)

In [None]:
# Sanity check: total number of rows loaded into the by-sex/race/origin relation.
# DuckDB resolves the table name to the Python variable of the same name.
sex_race_ho_row_count_sql = """
    SELECT COUNT(*)
    FROM us_populations_per_state_by_sex_race_ho
"""
duckdb.sql(sex_race_ho_row_count_sql)

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│        29376 │
└──────────────┘

In [None]:
# Sanity check: total number of rows loaded into the by-sex-and-age relation.
# DuckDB resolves the table name to the Python variable of the same name.
sex_age_row_count_sql = """
    SELECT COUNT(*)
    FROM us_populations_per_state_by_sex_age
"""
duckdb.sql(sex_age_row_count_sql)

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│       152388 │
└──────────────┘

In [None]:
# Sanity check: total number of rows loaded into the per-state population relation.
# DuckDB resolves the table name to the Python variable of the same name.
per_state_row_count_sql = """
    SELECT COUNT(*)
    FROM us_populations_per_state
"""
duckdb.sql(per_state_row_count_sql)

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│         1071 │
└──────────────┘

In [21]:
# Preview the by-sex-and-age relation with all of its columns; the notebook
# renders the returned relation as the cell output.
sex_age_preview_sql = """
    SELECT *
    FROM us_populations_per_state_by_sex_age
"""
duckdb.sql(sex_age_preview_sql)

┌────────────────┬───────┬────────────┬─────────┬──────────┬──────────┬────────┐
│    Bracket     │ Year  │ Population │   Sex   │  State   │ AgeStart │ AgeEnd │
│    varchar     │ int32 │   int64    │ varchar │ varchar  │  double  │ double │
├────────────────┼───────┼────────────┼─────────┼──────────┼──────────┼────────┤
│ under 5 years  │  2000 │     150609 │ Male    │ Alabama  │      0.0 │    5.0 │
│ under 5 years  │  2001 │     151410 │ Male    │ Alabama  │      0.0 │    5.0 │
│ under 5 years  │  2002 │     150856 │ Male    │ Alabama  │      0.0 │    5.0 │
│ under 5 years  │  2003 │     150594 │ Male    │ Alabama  │      0.0 │    5.0 │
│ under 5 years  │  2004 │     150699 │ Male    │ Alabama  │      0.0 │    5.0 │
│ under 5 years  │  2005 │     150960 │ Male    │ Alabama  │      0.0 │    5.0 │
│ under 5 years  │  2006 │     151442 │ Male    │ Alabama  │      0.0 │    5.0 │
│ under 5 years  │  2007 │     153128 │ Male    │ Alabama  │      0.0 │    5.0 │
│ under 5 years  │  2008 │  

In [23]:
# Preview the by-sex/race/origin relation with all of its columns; the notebook
# renders the returned relation as the cell output.
sex_race_ho_preview_sql = """
    SELECT *
    FROM us_populations_per_state_by_sex_race_ho
"""
duckdb.sql(sex_race_ho_preview_sql)

┌───────────┬──────────────┬─────────┬───────┬────────────┬──────────────┐
│ Ethnicity │    Origin    │   Sex   │ Year  │ Population │    State     │
│  varchar  │   varchar    │ varchar │ int32 │   int64    │   varchar    │
├───────────┼──────────────┼─────────┼───────┼────────────┼──────────────┤
│ White     │ Not Hispanic │ Male    │  2000 │    1526743 │ Alabama      │
│ White     │ Not Hispanic │ Male    │  2001 │    1526185 │ Alabama      │
│ White     │ Not Hispanic │ Male    │  2002 │    1524987 │ Alabama      │
│ White     │ Not Hispanic │ Male    │  2003 │    1527793 │ Alabama      │
│ White     │ Not Hispanic │ Male    │  2004 │    1530095 │ Alabama      │
│ White     │ Not Hispanic │ Male    │  2005 │    1536529 │ Alabama      │
│ White     │ Not Hispanic │ Male    │  2006 │    1548231 │ Alabama      │
│ White     │ Not Hispanic │ Male    │  2007 │    1555756 │ Alabama      │
│ White     │ Not Hispanic │ Male    │  2008 │    1562282 │ Alabama      │
│ White     │ Not Hispani

In [24]:
# Preview the per-state population relation with all of its columns; the
# notebook renders the returned relation as the cell output.
per_state_preview_sql = """
    SELECT *
    FROM us_populations_per_state
"""
duckdb.sql(per_state_preview_sql)

┌──────────────────────┬────────────┬───────┐
│        State         │ Population │ Year  │
│       varchar        │   int64    │ int32 │
├──────────────────────┼────────────┼───────┤
│ Alabama              │    4467634 │  2001 │
│ Alaska               │     633714 │  2001 │
│ Arizona              │    5273477 │  2001 │
│ Arkansas             │    2691571 │  2001 │
│ California           │   34479458 │  2001 │
│ Colorado             │    4425687 │  2001 │
│ Connecticut          │    3432835 │  2001 │
│ Delaware             │     795699 │  2001 │
│ District of Columbia │     574504 │  2001 │
│ Florida              │   16356966 │  2001 │
│    ·                 │        ·   │    ·  │
│    ·                 │        ·   │    ·  │
│    ·                 │        ·   │    ·  │
│ South Dakota         │     896492 │  2021 │
│ Tennessee            │    6965740 │  2021 │
│ Texas                │   29570351 │  2021 │
│ Utah                 │    3339738 │  2021 │
│ Vermont              │     64721

# Loading transformed chronic disease indicators table

In [27]:
# Glob over the part files of the chronic disease indicators (CDI) parquet dataset.
cdi_file_name = "./data/cdi-data-transformed/cdi.parquet/*.parquet"

# Read all matched part files into one DuckDB relation (default reader options).
cdi = duckdb.read_parquet(
    cdi_file_name,
)

In [28]:
# Sanity check: total number of rows loaded into the CDI relation.
# DuckDB resolves the table name to the Python variable of the same name.
cdi_row_count_sql = """
    SELECT COUNT(*)
    FROM cdi
"""
duckdb.sql(cdi_row_count_sql)

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│       699340 │
└──────────────┘

In [29]:
# Preview the CDI relation with all of its columns; the notebook renders the
# returned relation as the cell output.
cdi_preview_sql = """
    SELECT *
    FROM cdi
"""
duckdb.sql(cdi_preview_sql)

┌───────────┬─────────┬──────────────┬────────────────┬─────────┬─────────────────────────────────────────────────────────────────────────────────────────┬───────────────┬─────────────────────────┬───────────┬────────────────────┬─────────────────────┬─────────────────────────┬────────────┬─────────┬────────────┬─────────────────┬───────────────────────────┬───────────────────┬─────────────────────┬────────────────────┬──────────┬────────┬─────────┬───────────┬──────────────┐
│ YearStart │ YearEnd │ LocationAbbr │  LocationDesc  │  Topic  │                                        Question                                         │ DataValueUnit │      DataValueType      │ DataValue │ LowConfidenceLimit │ HighConfidenceLimit │ StratificationCategory1 │ LocationID │ TopicID │ QuestionID │ DataValueTypeID │ StratificationCategoryID1 │ StratificationID1 │      Latitude       │     Longitude      │ AgeStart │ AgeEnd │   Sex   │ Ethnicity │    Origin    │
│   int32   │  int32  │   varchar    │