# Now that all necessary data have now been extracted, transformed, dumped to parquet files in a lake (AWS S3) and then loaded into a DWH (DuckDB/Motherduck) we can start doing our/ analyses and do further transformations and draw insights through querying

In [1]:
import duckdb
import os

from dotenv import load_dotenv
from pathlib import Path
from duckdb.typing import *

%load_ext autoreload
%autoreload 2

# Local connection

In [2]:
# # C:\Users\LARRY\Documents\Scripts\data-engineering-path\chronic-disease-analyses\chronic_disease_analyses_db.db
# conn = duckdb.connect("chronic_disease_analyses_db.db")

# remote connection

In [3]:
# Build paths inside the project like this: BASE_DIR / 'subdir'.
# use this only in development
print("loading env variables...")
env_dir = Path('./').resolve()
load_dotenv(os.path.join(env_dir, '.env'))
print("env variables loaded.\n")

loading env variables...
env variables loaded.



In [4]:
# jdbc:duckdb:md:chronic_disease_analyses_db
# duckdb:///md:chronic_disease_analyses_db
print("connecting to duckdb...")
conn = duckdb.connect(f"md:chronic_disease_analyses_db?motherduck_token={os.environ['MOTHERDUCK_TOKEN']}")
print("connected to duckdb.\n")

connecting to duckdb...
connected to duckdb.



In [5]:
tables = [
    table
    for table_tuple in conn.sql("""SHOW TABLES""").fetchall()
    for table in table_tuple 
]
tables

['CDI',
 'CDILocation',
 'CDIStratification',
 'DataValueType',
 'Population',
 'PopulationState',
 'PopulationStratification',
 'Question',
 'Stratification']

In [None]:
for table in tables:
    count = conn.sql(f"""SELECT COUNT(*) FROM {table}""").fetchall()[0][0]
    print(f"table {table} count: {count}")

678471
51
11
15
2947392
51
28
192
39


In [7]:
conn.sql("""
    SELECT * FROM CDILocation
""")

┌────────────┬────────────────┬─────────────────────┬────────────────────┐
│ LocationID │  LocationDesc  │      Latitude       │     Longitude      │
│  varchar   │    varchar     │       double        │       double       │
├────────────┼────────────────┼─────────────────────┼────────────────────┤
│ NM         │ New Mexico     │ -106.24058098499967 │  34.52088095200048 │
│ OK         │ Oklahoma       │  -97.52107021399968 │  35.47203135600046 │
│ GA         │ Georgia        │  -83.62758034599966 │  32.83968109300048 │
│ KY         │ Kentucky       │  -84.77497104799966 │ 37.645970271000465 │
│ ND         │ North Dakota   │ -100.11842104899966 │  47.47531977900047 │
│ WY         │ Wyoming        │ -108.10983035299967 │  43.23554134300048 │
│ TN         │ Tennessee      │  -85.77449091399967 │  35.68094058000048 │
│ LA         │ Louisiana      │  -92.44568007099969 │  31.31266064400046 │
│ NE         │ Nebraska       │  -99.36572062299967 │   41.6410409880005 │
│ AK         │ Alaska    

In [8]:
conn.sql("""
    SELECT * FROM CDIStratification
""")

┌─────────┬─────────────┬──────────────┬──────────────────┐
│   Sex   │  Ethnicity  │    Origin    │ StratificationID │
│ varchar │   varchar   │   varchar    │     varchar      │
├─────────┼─────────────┼──────────────┼──────────────────┤
│ Male    │ All         │ Both         │ B_M_ALL          │
│ Both    │ White       │ Not Hispanic │ NH_B_WHITE       │
│ Both    │ Black       │ Not Hispanic │ NH_B_BLACK       │
│ Both    │ AIAN        │ Not Hispanic │ NH_B_AIAN        │
│ Both    │ Multiracial │ Not Hispanic │ NH_B_MULTI       │
│ Both    │ NHPI        │ Not Hispanic │ NH_B_NHPI        │
│ Both    │ Other       │ Not Hispanic │ NH_B_OTHER       │
│ Female  │ All         │ Both         │ B_F_ALL          │
│ Both    │ All         │ Hispanic     │ H_B_ALL          │
│ Both    │ All         │ Both         │ B_B_ALL          │
│ Both    │ Asian       │ Not Hispanic │ NH_B_ASIAN       │
├─────────┴─────────────┴──────────────┴──────────────────┤
│ 11 rows                               

In [9]:
conn.sql("""
    SELECT * FROM DataValueType
""")

┌───────────────────┬──────────────────────────────────────────┐
│  DataValueTypeID  │              DataValueType               │
│      varchar      │                 varchar                  │
├───────────────────┼──────────────────────────────────────────┤
│ MEDIAN            │ Median                                   │
│ NMBR              │ Number                                   │
│ AGEADJRATE        │ Age-adjusted Rate                        │
│ PERCAPALC         │ Per capita alcohol consumption           │
│ USD               │ US Dollars                               │
│ AGEADJMEAN        │ Age-adjusted Mean                        │
│ PREV              │ Prevalence                               │
│ MEAN              │ Mean                                     │
│ AVGANNCRDRATE     │ Average Annual Crude Rate                │
│ CRDRATE           │ Crude Rate                               │
│ AGEADJPREV        │ Age-adjusted Prevalence                  │
│ AVGANNAGEADJRATE  │ Ave

In [10]:
conn.sql("""
    SELECT * FROM Question
""")

┌────────────┬─────────┬──────────────────────────────────────────────────────────────────────────────────────────────────────────────┬─────────────────────────────────────────────────┬──────────┬────────┐
│ QuestionID │ TopicID │                                                   Question                                                   │                      Topic                      │ AgeStart │ AgeEnd │
│  varchar   │ varchar │                                                   varchar                                                    │                     varchar                     │  double  │ double │
├────────────┼─────────┼──────────────────────────────────────────────────────────────────────────────────────────────────────────────┼─────────────────────────────────────────────────┼──────────┼────────┤
│ ART1_1     │ ART     │ Arthritis among adults aged >= 18 years                                                                      │ Arthritis                               

In [11]:
conn.sql("""
    SELECT * FROM Population
""")

┌─────────┬───────┬──────────────────┬───────┬────────────┐
│ StateID │  Age  │ StratificationID │ Year  │ Population │
│ varchar │ float │     varchar      │ int32 │   int64    │
├─────────┼───────┼──────────────────┼───────┼────────────┤
│ AL      │   0.0 │ NH_M_WHITE       │  2000 │      19270 │
│ AL      │   0.0 │ NH_M_WHITE       │  2001 │      19612 │
│ AL      │   0.0 │ NH_M_WHITE       │  2002 │      18731 │
│ AL      │   0.0 │ NH_M_WHITE       │  2003 │      18623 │
│ AL      │   0.0 │ NH_M_WHITE       │  2004 │      18659 │
│ AL      │   0.0 │ NH_M_WHITE       │  2005 │      18816 │
│ AL      │   0.0 │ NH_M_WHITE       │  2006 │      18877 │
│ AL      │   0.0 │ NH_M_WHITE       │  2007 │      19027 │
│ AL      │   0.0 │ NH_M_WHITE       │  2008 │      18937 │
│ AL      │   0.0 │ NH_M_WHITE       │  2009 │      18039 │
│ ·       │    ·  │     ·            │    ·  │          · │
│ ·       │    ·  │     ·            │    ·  │          · │
│ ·       │    ·  │     ·            │  

In [12]:
conn.sql("""
    SELECT * FROM PopulationState
""")

┌───────────────┬─────────┐
│     State     │ StateID │
│    varchar    │ varchar │
├───────────────┼─────────┤
│ Mississippi   │ MS      │
│ South Dakota  │ SD      │
│ Utah          │ UT      │
│ Kentucky      │ KY      │
│ California    │ CA      │
│ Nebraska      │ NE      │
│ New Hampshire │ NH      │
│ Delaware      │ DE      │
│ Minnesota     │ MN      │
│ Nevada        │ NV      │
│   ·           │ ·       │
│   ·           │ ·       │
│   ·           │ ·       │
│ Michigan      │ MI      │
│ Illinois      │ IL      │
│ Maryland      │ MD      │
│ Alaska        │ AK      │
│ Connecticut   │ CT      │
│ Vermont       │ VT      │
│ Massachusetts │ MA      │
│ Oklahoma      │ OK      │
│ West Virginia │ WV      │
│ Virginia      │ VA      │
├───────────────┴─────────┤
│   51 rows (20 shown)    │
└─────────────────────────┘

In [13]:
conn.sql("""
    SELECT * FROM PopulationStratification
""")

┌─────────┬─────────────┬──────────────┬──────────────────┐
│   Sex   │  Ethnicity  │    Origin    │ StratificationID │
│ varchar │   varchar   │   varchar    │     varchar      │
├─────────┼─────────────┼──────────────┼──────────────────┤
│ Male    │ Other       │ Not Hispanic │ NH_M_OTHER       │
│ Female  │ Black       │ Not Hispanic │ NH_F_BLACK       │
│ Female  │ Asian       │ Not Hispanic │ NH_F_ASIAN       │
│ Female  │ NHPI        │ Hispanic     │ H_F_NHPI         │
│ Male    │ Other       │ Hispanic     │ H_M_OTHER        │
│ Male    │ Asian       │ Not Hispanic │ NH_M_ASIAN       │
│ Male    │ White       │ Not Hispanic │ NH_M_WHITE       │
│ Male    │ AIAN        │ Not Hispanic │ NH_M_AIAN        │
│ Male    │ Multiracial │ Hispanic     │ H_M_MULTI        │
│ Male    │ Black       │ Not Hispanic │ NH_M_BLACK       │
│  ·      │   ·         │    ·         │     ·            │
│  ·      │   ·         │    ·         │     ·            │
│  ·      │   ·         │    ·         │

In [14]:
conn.sql("""
    SELECT COUNT(*) 
    FROM PopulationState
    -- WHERE COLUMNS(*) IS NOT NULL
""")


┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│           51 │
└──────────────┘

In [15]:
conn.sql("""
    SELECT *
    FROM CDI
    ORDER BY LogID ASC
    LIMIT 10
""")

┌───────────┬─────────┬────────────┬───────────────┬───────────┬────────────────────┬─────────────────────┬────────────┬─────────────────┬──────────────────┬───────┐
│ YearStart │ YearEnd │ LocationID │ DataValueUnit │ DataValue │ LowConfidenceLimit │ HighConfidenceLimit │ QuestionID │ DataValueTypeID │ StratificationID │ LogID │
│   int32   │  int32  │  varchar   │    varchar    │  double   │       double       │       double        │  varchar   │     varchar     │     varchar      │ int32 │
├───────────┼─────────┼────────────┼───────────────┼───────────┼────────────────────┼─────────────────────┼────────────┼─────────────────┼──────────────────┼───────┤
│      2015 │    2015 │ IL         │ %             │      10.6 │                4.8 │                22.2 │ ALC1_1     │ CRDPREV         │ NH_B_ASIAN       │     1 │
│      2015 │    2015 │ RI         │ %             │      12.6 │                7.3 │                21.0 │ ALC1_1     │ CRDPREV         │ NH_B_ASIAN       │     2 │
│   

In [16]:
conn.sql("""
    -- Create a 2nd CTE that will join the necessary
    -- values from the dimension tables to the fact
    -- table
    WITH MergedCDI AS (
        SELECT
            c.LogID AS ID,
            c.YearStart AS YearStart, 
            c.YearEnd AS YearEnd, 
            cl.LocationDesc AS LocationDesc, 
            c.DataValueUnit AS DataValueUnit,
            c.DataValue AS DataValue,
            q.Question AS Question,
            q.AgeStart AS AgeStart,
            q.AgeEnd AS AgeEnd,
            dvt.DataValueType AS DataValueType,
            cs.Sex AS Sex,
            cs.Ethnicity AS Ethnicity,
            cs.Origin AS Origin
        FROM CDI c
        LEFT JOIN CDILocation cl
        ON c.LocationID = cl.LocationID
        LEFT JOIN Question q
        ON c.QuestionID = q.QuestionID
        LEFT JOIN DataValueType dvt
        ON c.DataValueTypeID = dvt.DataValueTypeID
        LEFT JOIN CDIStratification cs
        ON c.StratificationID = cs.StratificationID
        ORDER BY ID ASC
    )
    SELECT * FROM MergedCDI;
""")

┌───────┬───────────┬─────────┬────────────────┬───────────────┬───────────┬─────────────────────────────────────────────────────────────────────────────────────────┬──────────┬────────┬─────────────────────────┬─────────┬───────────┬──────────────┐
│  ID   │ YearStart │ YearEnd │  LocationDesc  │ DataValueUnit │ DataValue │                                        Question                                         │ AgeStart │ AgeEnd │      DataValueType      │   Sex   │ Ethnicity │    Origin    │
│ int32 │   int32   │  int32  │    varchar     │    varchar    │  double   │                                         varchar                                         │  double  │ double │         varchar         │ varchar │  varchar  │   varchar    │
├───────┼───────────┼─────────┼────────────────┼───────────────┼───────────┼─────────────────────────────────────────────────────────────────────────────────────────┼──────────┼────────┼─────────────────────────┼─────────┼───────────┼──────────────┤


In [17]:
conn.sql("""
    -- joins necessary values to Population table 
    -- via primary keys of its dimension tables
    WITH MergedPopulation AS (
        SELECT
            ps.StateID,
            ps.State,
            p.Age,
            p.Year,
            s.Sex,
            s.Ethnicity,
            s.Origin,
            p.Population
        FROM Population p
        LEFT JOIN PopulationState ps
        ON p.StateID = ps.StateID
        LEFT JOIN Stratification s
        ON p.StratificationID = s.StratificationID
    )

    SELECT 
        StateID, 
        Sex, 
        Origin, 
        Ethnicity,
        Age, 
        Year
    FROM MergedPopulation
    WHERE (Sex = 'Male' OR Sex = 'Female') 
    AND (Origin = 'Hispanic' OR Origin = 'Not Hispanic') 
    AND (Ethnicity = 'White')
    AND (Age BETWEEN 0 AND 85)
    AND (State = 'Alabama')
    AND (Year = 2000)
""")

┌─────────┬─────────┬──────────────┬───────────┬───────┬───────┐
│ StateID │   Sex   │    Origin    │ Ethnicity │  Age  │ Year  │
│ varchar │ varchar │   varchar    │  varchar  │ float │ int32 │
├─────────┼─────────┼──────────────┼───────────┼───────┼───────┤
│ AL      │ Male    │ Not Hispanic │ White     │   0.0 │  2000 │
│ AL      │ Male    │ Not Hispanic │ White     │   1.0 │  2000 │
│ AL      │ Male    │ Not Hispanic │ White     │   2.0 │  2000 │
│ AL      │ Male    │ Not Hispanic │ White     │   3.0 │  2000 │
│ AL      │ Male    │ Not Hispanic │ White     │   4.0 │  2000 │
│ AL      │ Male    │ Not Hispanic │ White     │   5.0 │  2000 │
│ AL      │ Male    │ Not Hispanic │ White     │   6.0 │  2000 │
│ AL      │ Male    │ Not Hispanic │ White     │   7.0 │  2000 │
│ AL      │ Male    │ Not Hispanic │ White     │   8.0 │  2000 │
│ AL      │ Male    │ Not Hispanic │ White     │   9.0 │  2000 │
│ ·       │  ·      │    ·         │   ·       │    ·  │    ·  │
│ ·       │  ·      │    

In [18]:
conn.sql("""
    -- Create a 2nd CTE that will join the necessary
    -- values from the dimension tables to the fact
    -- table
    WITH MergedCDI AS (
        SELECT
            c.LogID AS ID,
            c.YearStart AS YearStart, 
            c.YearEnd AS YearEnd, 
            cl.LocationDesc AS LocationDesc, 
            c.DataValueUnit AS DataValueUnit,
            c.DataValue AS DataValue,
            q.Question AS Question,
            q.AgeStart AS AgeStart,
            q.AgeEnd AS AgeEnd,
            dvt.DataValueType AS DataValueType,
            cs.Sex AS Sex,
            cs.Ethnicity AS Ethnicity,
            cs.Origin AS Origin
        FROM CDI c
        LEFT JOIN CDILocation cl
        ON c.LocationID = cl.LocationID
        LEFT JOIN Question q
        ON c.QuestionID = q.QuestionID
        LEFT JOIN DataValueType dvt
        ON c.DataValueTypeID = dvt.DataValueTypeID
        LEFT JOIN CDIStratification cs
        ON c.StratificationID = cs.StratificationID
    )

    -- groups the rows with the same YearStart, YearEnd, 
    -- LocationDesc, AgeStart, AgeEnd, Sex, Ethnicity, and Origin
    -- as this will simulate the rows taking on different values
    -- which we will need to dynamically aggregate the Population
    -- table
    SELECT 
        MAX(YearStart) AS YearStart, 
        MAX(YearEnd) AS YearEnd, 
        MAX(LocationDesc) AS LocationDesc, 
        MAX(AgeStart) AS AgeStart, 
        MAX(AgeEnd) AS AgeEnd, 
        MAX(Sex) AS Sex, 
        MAX(Ethnicity) AS Ethnicity, 
        MAX(Origin) AS Origin
    FROM MergedCDI
    GROUP BY (
        YearStart, 
        YearEnd, 
        LocationDesc, 
        AgeStart, 
        AgeEnd,
        Sex,
        Ethnicity,
        Origin
    )
""")

┌───────────┬─────────┬──────────────────────┬──────────┬────────┬─────────┬───────────┬──────────────┐
│ YearStart │ YearEnd │     LocationDesc     │ AgeStart │ AgeEnd │   Sex   │ Ethnicity │    Origin    │
│   int32   │  int32  │       varchar        │  double  │ double │ varchar │  varchar  │   varchar    │
├───────────┼─────────┼──────────────────────┼──────────┼────────┼─────────┼───────────┼──────────────┤
│      2011 │    2011 │ New Mexico           │     NULL │   NULL │ Both    │ White     │ Not Hispanic │
│      2018 │    2018 │ Michigan             │     NULL │   NULL │ Both    │ Black     │ Not Hispanic │
│      2016 │    2016 │ Texas                │     NULL │   NULL │ Male    │ All       │ Both         │
│      2011 │    2011 │ Iowa                 │     NULL │   NULL │ Male    │ All       │ Both         │
│      2020 │    2020 │ North Carolina       │     NULL │   NULL │ Male    │ All       │ Both         │
│      2011 │    2011 │ South Dakota         │     NULL │   NULL

In [19]:
conn.sql("""
    WITH MergedPopulation AS (
        SELECT 
            ps.State AS State,
            p.Age AS Age,
            p.Year AS Year,
            pstr.Sex AS Sex,
            pstr.Ethnicity AS Ethnicity,
            pstr.Origin AS Origin,
            p.Population AS Population
        FROM Population p
        LEFT JOIN PopulationState ps
        ON p.StateID = ps.StateID
        LEFT JOIN PopulationStratification pstr
        ON p.StratificationID = pstr.StratificationID
    )

    SELECT 
        Year, 
        SUM(Population) AS TotalPopulation
    FROM MergedPopulation
    WHERE (Year BETWEEN 2011 AND 2013) AND
    (Age BETWEEN 18 AND 85) AND
    State = 'Wyoming' AND
    Sex IN ('Male') AND
    Ethnicity IN ('White', 'Black', 'AIAN', 'Asian', 'NHPI', 'Multiracial') AND
    Origin IN ('Hispanic',)
    GROUP BY Year
    ORDER BY Year ASC
""")

┌───────┬─────────────────┐
│ Year  │ TotalPopulation │
│ int32 │     int128      │
├───────┼─────────────────┤
│  2011 │           17565 │
│  2012 │           18734 │
│  2013 │           19147 │
└───────┴─────────────────┘

I have loaded two fact tables CDI and Population and a couple dimension tables in DuckDB. I did joins on the CDI fact table and its respective dimension tables which yields a snippet of the table below CDI table merged

And below is the Population fact table merged with its other dimension tables yielding this snippet below Population table merged enter image description here

Now what I want to basically do is filter out the Population table based only on the values of this particular row of the CDI table. In this case the current row outlined in green will somehow do this query
```
SELECT Year, SUM(Population) AS TotalPopulation
FROM Population
WHERE (Year BETWEEN 2018 AND 2018) AND
(Age BETWEEN 18 AND 85) AND
State = 'Pennsylvania' AND
Sex IN ('Male', 'Female') AND
Ethnicity IN ('Multiracial') AND
Origin IN ('Not Hispanic')
GROUP BY Year
ORDER BY Year ASC
```

This query aggregates the Population column values based on the row values of the CDI table. What I'm just at a loss in trying to implement is doing this aggregation operation for all row values in the CDI table. Here is a full visualization of what I'm trying to do. enter image description here

How would I implement this type of varying filtering aggregation based on each row column values of the CDI table? I'm using DuckDB as the OLAP DB here so ANSI SQL is what I'm trying to use to implement this task. Could it be possible only using this kind of SQL?

In [20]:
conn.sql("""
    SELECT current_schema();
""")

┌──────────────────┐
│ current_schema() │
│     varchar      │
├──────────────────┤
│ main             │
└──────────────────┘

In [21]:
conn.sql("""
    SELECT * FROM duckdb_columns();
""")

┌─────────────────────────────┬──────────────┬─────────────┬────────────┬────────────────────┬───────────┬─────────────────────┬──────────────┬─────────┬──────────┬────────────────┬─────────────┬───────────┬──────────────┬──────────────────────────┬───────────────────┬─────────────────────────┬───────────────┐
│        database_name        │ database_oid │ schema_name │ schema_oid │     table_name     │ table_oid │     column_name     │ column_index │ comment │ internal │ column_default │ is_nullable │ data_type │ data_type_id │ character_maximum_length │ numeric_precision │ numeric_precision_radix │ numeric_scale │
│           varchar           │    int64     │   varchar   │   int64    │      varchar       │   int64   │       varchar       │    int32     │ varchar │ boolean  │    varchar     │   boolean   │  varchar  │    int64     │          int32           │       int32       │          int32          │     int32     │
├─────────────────────────────┼──────────────┼─────────────┼────

In [22]:
conn.sql("""
    SELECT * FROM Stratification
""")

┌─────────┬─────────────┬──────────────┬──────────────────┐
│   Sex   │  Ethnicity  │    Origin    │ StratificationID │
│ varchar │   varchar   │   varchar    │     varchar      │
├─────────┼─────────────┼──────────────┼──────────────────┤
│ Both    │ Black       │ Not Hispanic │ NH_B_BLACK       │
│ Both    │ NHPI        │ Not Hispanic │ NH_B_NHPI        │
│ Both    │ Multiracial │ Not Hispanic │ NH_B_MULTI       │
│ Both    │ Other       │ Not Hispanic │ NH_B_OTHER       │
│ Female  │ Other       │ Hispanic     │ H_F_OTHER        │
│ Female  │ Other       │ Not Hispanic │ NH_F_OTHER       │
│ Both    │ All         │ Both         │ B_B_ALL          │
│ Male    │ Black       │ Hispanic     │ H_M_BLACK        │
│ Male    │ Multiracial │ Not Hispanic │ NH_M_MULTI       │
│ Male    │ White       │ Hispanic     │ H_M_WHITE        │
│  ·      │   ·         │    ·         │     ·            │
│  ·      │   ·         │    ·         │     ·            │
│  ·      │   ·         │    ·         │

In [38]:
conn.sql("""
    -- Creates a CTE that will join the necessary
    -- values from the dimension tables to the fact
    -- table
    WITH MergedCDI AS (
        SELECT
            c.LogID,
            c.DataValueUnit,
            c.DataValue,
            c.YearStart, 
            c.YearEnd,
            cl.LocationID,
            cl.LocationDesc, 
            q.QuestionID,
            q.AgeStart,
            q.AgeEnd,
            dvt.DataValueTypeID,
            dvt.DataValueType,
            s.StratificationID,
            s.Sex,
            s.Ethnicity,
            s.Origin
        FROM CDI c
        LEFT JOIN CDILocation cl
        ON c.LocationID = cl.LocationID
        LEFT JOIN Question q
        ON c.QuestionID = q.QuestionID
        LEFT JOIN DataValueType dvt
        ON c.DataValueTypeID = dvt.DataValueTypeID
        LEFT JOIN Stratification s
        ON c.StratificationID = s.StratificationID
    ),

    -- joins necessary values to Population table 
    -- via primary keys of its dimension tables
    MergedPopulation AS (
        SELECT
            ps.StateID,
            ps.State,
            p.Age,
            p.Year,
            s.Sex,
            s.Ethnicity,
            s.Origin,
            p.Population
        FROM Population p
        LEFT JOIN PopulationState ps
        ON p.StateID = ps.StateID
        LEFT JOIN Stratification s
        ON p.StratificationID = s.StratificationID
    ),

    -- this is just a sample variable
    LeftTable AS (
        SELECT *
        FROM MergedCDI
        WHERE AgeStart IS NULL AND AgeEnd IS NULL 
        LIMIT 1
    )

    SELECT 
        lt.LogID,
        lt.YearStart, 
        lt.YearEnd,  
        lt.LocationDesc, 
        lt.AgeStart, 
        lt.AgeEnd, 
        lt.Sex, 
        lt.Ethnicity, 
        lt.Origin,
        mp.Population,
        mp.State,
        mp.Age,
        mp.Year,
        mp.Sex,
        mp.Ethnicity,
        mp.Origin
    FROM MergedPopulation mp
    INNER JOIN LeftTable lt
    ON (mp.Year BETWEEN lt.YearStart AND lt.YearEnd) AND
    (mp.State = lt.LocationDesc) AND
    ((mp.Age BETWEEN lt.AgeStart AND (CASE WHEN lt.AgeEnd = 'infinity' THEN 85 ELSE lt.AgeEnd END)) OR (lt.AgeStart IS NULL AND lt.AgeEnd IS NULL)) AND
    (mp.Sex = lt.Sex OR lt.Sex = 'Both') AND
    (mp.Ethnicity = lt.Ethnicity OR lt.Ethnicity = 'All') AND
    (mp.Origin = lt.Origin OR lt.Origin = 'Both')
    ORDER BY lt.LogID ASC
""")

┌───────┬───────────┬─────────┬──────────────┬──────────┬────────┬─────────┬───────────┬──────────────┬────────────┬──────────┬───────┬───────┬─────────┬───────────┬──────────────┐
│ LogID │ YearStart │ YearEnd │ LocationDesc │ AgeStart │ AgeEnd │   Sex   │ Ethnicity │    Origin    │ Population │  State   │  Age  │ Year  │   Sex   │ Ethnicity │    Origin    │
│ int32 │   int32   │  int32  │   varchar    │  double  │ double │ varchar │  varchar  │   varchar    │   int64    │ varchar  │ float │ int32 │ varchar │  varchar  │   varchar    │
├───────┼───────────┼─────────┼──────────────┼──────────┼────────┼─────────┼───────────┼──────────────┼────────────┼──────────┼───────┼───────┼─────────┼───────────┼──────────────┤
│    81 │      2018 │    2018 │ Maryland     │     NULL │   NULL │ Both    │ NHPI      │ Not Hispanic │         21 │ Maryland │   3.0 │  2018 │ Male    │ NHPI      │ Not Hispanic │
│    81 │      2018 │    2018 │ Maryland     │     NULL │   NULL │ Both    │ NHPI      │ Not Hi

In [None]:
conn.sql("""
  -- Create a 2nd CTE that will join the necessary
  -- values from the dimension tables to the fact
  -- table
  WITH MergedCDI AS (
    SELECT
      c.LogID AS ID,
      c.YearStart AS YearStart, 
      c.YearEnd AS YearEnd, 
      cl.LocationDesc AS LocationDesc, 
      c.DataValueUnit AS DataValueUnit,
      c.DataValue AS DataValue,
      q.Question AS Question,
      q.AgeStart AS AgeStart,
      q.AgeEnd AS AgeEnd,
      dvt.DataValueType AS DataValueType,
      cs.Sex AS Sex,
      cs.Ethnicity AS Ethnicity,
      cs.Origin AS Origin
    FROM CDI c
    LEFT JOIN CDILocation cl
    ON c.LocationID = cl.LocationID
    LEFT JOIN Question q
    ON c.QuestionID = q.QuestionID
    LEFT JOIN DataValueType dvt
    ON c.DataValueTypeID = dvt.DataValueTypeID
    LEFT JOIN CDIStratification cs
    ON c.StratificationID = cs.StratificationID
  ),
      
  LeftTable AS (
    SELECT *
    FROM MergedCDI
    WHERE AgeStart IS NULL AND AgeEnd IS NULL 
    LIMIT 1
  ),

  MergedPopulation AS (
    SELECT 
      ps.State AS State,
      p.Age AS Age,
      p.Year AS Year,
      pstr.Sex AS Sex,
      pstr.Ethnicity AS Ethnicity,
      pstr.Origin AS Origin,
      p.Population AS Population
    FROM Population p
    LEFT JOIN PopulationState ps
    ON p.StateID = ps.StateID
    LEFT JOIN PopulationStratification pstr
    ON p.StratificationID = pstr.StratificationID
  )

  SELECT 
    lt.YearStart, 
    lt.YearEnd,  
    lt.LocationDesc, 
    lt.AgeStart, 
    lt.AgeEnd, 
    lt.Sex, 
    lt.Ethnicity, 
    lt.Origin,
    mp.Population,
    mp.State,
    mp.Age,
    mp.Year,
    mp.Sex,
    mp.Ethnicity,
    mp.Origin
  FROM MergedPopulation mp
  INNER JOIN LeftTable lt
  ON (mp.Year BETWEEN lt.YearStart AND lt.YearEnd) AND
  (mp.State = lt.LocationDesc) AND
  ((mp.Age BETWEEN lt.AgeStart AND (CASE WHEN lt.AgeEnd = 'infinity' THEN 85 ELSE lt.AgeEnd END)) OR (lt.AgeStart IS NULL AND lt.AgeEnd IS NULL)) AND
  (mp.Sex = lt.Sex OR lt.Sex = 'Both') AND
  (mp.Ethnicity = lt.Ethnicity OR lt.Ethnicity = 'All') AND
  (mp.Origin = lt.Origin OR lt.Origin = 'Both')
""")

┌───────────┬─────────┬──────────────┬──────────┬────────┬─────────┬───────────┬──────────────┬────────────┬──────────┬───────┬───────┬─────────┬───────────┬──────────────┐
│ YearStart │ YearEnd │ LocationDesc │ AgeStart │ AgeEnd │   Sex   │ Ethnicity │    Origin    │ Population │  State   │  Age  │ Year  │   Sex   │ Ethnicity │    Origin    │
│   int32   │  int32  │   varchar    │  double  │ double │ varchar │  varchar  │   varchar    │   int64    │ varchar  │ float │ int32 │ varchar │  varchar  │   varchar    │
├───────────┼─────────┼──────────────┼──────────┼────────┼─────────┼───────────┼──────────────┼────────────┼──────────┼───────┼───────┼─────────┼───────────┼──────────────┤
│      2018 │    2018 │ Maryland     │     NULL │   NULL │ Both    │ NHPI      │ Not Hispanic │         22 │ Maryland │   0.0 │  2018 │ Male    │ NHPI      │ Not Hispanic │
│      2018 │    2018 │ Maryland     │     NULL │   NULL │ Both    │ NHPI      │ Not Hispanic │         25 │ Maryland │   1.0 │  2018 │

In [None]:
conn.sql("""
    -- Creates a CTE that will join the necessary
    -- values from the dimension tables to the fact
    -- table
    WITH MergedCDI AS (
        SELECT
            c.LogID,
            c.DataValueUnit,
            c.DataValue,
            c.YearStart, 
            c.YearEnd,
            cl.LocationID,
            cl.LocationDesc, 
            q.QuestionID,
            q.AgeStart,
            q.AgeEnd,
            dvt.DataValueTypeID,
            dvt.DataValueType,
            s.StratificationID,
            s.Sex,
            s.Ethnicity,
            s.Origin
        FROM CDI c
        LEFT JOIN CDILocation cl
        ON c.LocationID = cl.LocationID
        LEFT JOIN Question q
        ON c.QuestionID = q.QuestionID
        LEFT JOIN DataValueType dvt
        ON c.DataValueTypeID = dvt.DataValueTypeID
        LEFT JOIN Stratification s
        ON c.StratificationID = s.StratificationID
    ),

    -- joins necessary values to Population table 
    -- via primary keys of its dimension tables
    MergedPopulation AS (
        SELECT
            ps.StateID,
            ps.State,
            p.Age,
            p.Year,
            s.Sex,
            s.Ethnicity,
            s.Origin,
            p.Population
        FROM Population p
        LEFT JOIN PopulationState ps
        ON p.StateID = ps.StateID
        LEFT JOIN Stratification s
        ON p.StratificationID = s.StratificationID
    ),


    LeftTable AS (
        SELECT *
        FROM MergedCDI
        WHERE AgeStart IS NULL AND AgeEnd IS NULL 
        LIMIT 1
    ),

    -- performs an inner join on both CDI and Population
    -- tables based
    CDIWithPop AS (
        SELECT 
            mcdi.LogID AS LogID,
            mcdi.DataValueUnit AS DataValueUnit,
            mcdi.DataValue AS DataValue,
            mcdi.YearStart AS YearStart, 
            mcdi.YearEnd AS YearEnd,
            mcdi.LocationID AS LocationID,
            mcdi.LocationDesc AS LocationDesc, 
            mcdi.QuestionID as QuestionID,
            mcdi.AgeStart AS AgeStart,
            mcdi.AgeEnd AS AgeEnd,
            mcdi.DataValueTypeID AS DataValueTypeID,
            mcdi.DataValueType AS DataValueType,
            mcdi.StratificationID AS StratificationID,
            mcdi.Sex AS Sex,
            mcdi.Ethnicity AS Ethnicity,
            mcdi.Origin AS Origin,
        
            mp.Population,
            mp.State PState,
            mp.Age AS PAge,
            mp.Year AS PYear,
            mp.Sex AS PSex,
            mp.Ethnicity AS PEthnicity,
            mp.Origin AS POrigin
        FROM MergedPopulation mp
        INNER JOIN LeftTable mcdi
        ON (mp.Year BETWEEN mcdi.YearStart AND mcdi.YearEnd) AND
        (mp.StateID = mcdi.LocationID) AND
        ((mp.Age BETWEEN mcdi.AgeStart AND (CASE WHEN mcdi.AgeEnd = 'infinity' THEN 85 ELSE mcdi.AgeEnd END)) OR (mcdi.AgeStart IS NULL AND mcdi.AgeEnd IS NULL)) AND
        (mp.Sex = mcdi.Sex OR mcdi.Sex = 'Both') AND
        (mp.Ethnicity = mcdi.Ethnicity OR mcdi.Ethnicity = 'All') AND
        (mp.Origin = mcdi.Origin OR mcdi.Origin = 'Both')
    )

    SELECT * FROM CDIWithPop
""")

┌───────┬───────────────┬───────────┬───────────┬─────────┬────────────┬──────────────┬────────────┬──────────┬────────┬─────────────────┬──────────────────┬──────────────────┬─────────┬───────────┬──────────────┬────────────┬──────────┬───────┬───────┬─────────┬────────────┬──────────────┐
│ LogID │ DataValueUnit │ DataValue │ YearStart │ YearEnd │ LocationID │ LocationDesc │ QuestionID │ AgeStart │ AgeEnd │ DataValueTypeID │  DataValueType   │ StratificationID │   Sex   │ Ethnicity │    Origin    │ Population │  PState  │ PAge  │ PYear │  PSex   │ PEthnicity │   POrigin    │
│ int32 │    varchar    │  double   │   int32   │  int32  │  varchar   │   varchar    │  varchar   │  double  │ double │     varchar     │     varchar      │     varchar      │ varchar │  varchar  │   varchar    │   int64    │ varchar  │ float │ int32 │ varchar │  varchar   │   varchar    │
├───────┼───────────────┼───────────┼───────────┼─────────┼────────────┼──────────────┼────────────┼──────────┼────────┼────

# The following query will be query to the full join

In [49]:
conn.sql("""
    -- Creates a CTE that will join the necessary
    -- values from the dimension tables to the fact
    -- table
    WITH MergedCDI AS (
        SELECT
            c.LogID,
            c.DataValueUnit,
            c.DataValue,
            c.YearStart, 
            c.YearEnd,
            cl.LocationID,
            cl.LocationDesc, 
            q.QuestionID,
            q.AgeStart,
            q.AgeEnd,
            dvt.DataValueTypeID,
            dvt.DataValueType,
            s.StratificationID,
            s.Sex,
            s.Ethnicity,
            s.Origin
        FROM CDI c
        LEFT JOIN CDILocation cl
        ON c.LocationID = cl.LocationID
        LEFT JOIN Question q
        ON c.QuestionID = q.QuestionID
        LEFT JOIN DataValueType dvt
        ON c.DataValueTypeID = dvt.DataValueTypeID
        LEFT JOIN Stratification s
        ON c.StratificationID = s.StratificationID
    ),

    -- joins necessary values to Population table 
    -- via primary keys of its dimension tables
    MergedPopulation AS (
        SELECT
            ps.StateID,
            ps.State,
            p.Age,
            p.Year,
            s.Sex,
            s.Ethnicity,
            s.Origin,
            p.Population
        FROM Population p
        LEFT JOIN PopulationState ps
        ON p.StateID = ps.StateID
        LEFT JOIN Stratification s
        ON p.StratificationID = s.StratificationID
    ),


    LeftTable AS (
        SELECT *
        FROM MergedCDI
        WHERE AgeStart IS NULL AND AgeEnd IS NULL 
        LIMIT 1
    ),

    -- performs an inner join on both CDI and Population
    -- tables based
    CDIWithPop AS (
        SELECT 
            mcdi.LogID AS LogID,
            mcdi.DataValueUnit AS DataValueUnit,
            mcdi.DataValue AS DataValue,
            mcdi.YearStart AS YearStart, 
            mcdi.YearEnd AS YearEnd,
            mcdi.LocationID AS LocationID,
            mcdi.LocationDesc AS LocationDesc, 
            mcdi.QuestionID as QuestionID,
            mcdi.AgeStart AS AgeStart,
            mcdi.AgeEnd AS AgeEnd,
            mcdi.DataValueTypeID AS DataValueTypeID,
            mcdi.DataValueType AS DataValueType,
            mcdi.StratificationID AS StratificationID,
            mcdi.Sex AS Sex,
            mcdi.Ethnicity AS Ethnicity,
            mcdi.Origin AS Origin,
        
            mp.Population,
            mp.State PState,
            mp.Age AS PAge,
            mp.Year AS PYear,
            mp.Sex AS PSex,
            mp.Ethnicity AS PEthnicity,
            mp.Origin AS POrigin
        FROM MergedPopulation mp
        INNER JOIN MergedCDI mcdi
        ON (mp.Year BETWEEN mcdi.YearStart AND mcdi.YearEnd) AND
        (mp.StateID = mcdi.LocationID) AND
        ((mp.Age BETWEEN mcdi.AgeStart AND (CASE WHEN mcdi.AgeEnd = 'infinity' THEN 85 ELSE mcdi.AgeEnd END)) OR (mcdi.AgeStart IS NULL AND mcdi.AgeEnd IS NULL)) AND
        (mp.Sex = mcdi.Sex OR mcdi.Sex = 'Both') AND
        (mp.Ethnicity = mcdi.Ethnicity OR mcdi.Ethnicity = 'All') AND
        (mp.Origin = mcdi.Origin OR mcdi.Origin = 'Both')
    )

    SELECT 
      LogID,
      SUM(Population) AS TotalPopulation
    FROM CDIWithPop
    GROUP BY LogID
    ORDER BY LogID
""")

┌───────┬─────────────────┐
│ LogID │ TotalPopulation │
│ int32 │     int128      │
├───────┼─────────────────┤
│     1 │           67510 │
│     2 │            5939 │
│     3 │           51277 │
│     4 │           50628 │
│     5 │          144434 │
│     6 │          481644 │
│     7 │          165930 │
│     8 │          172065 │
│     9 │          178905 │
│    10 │           25075 │
│     · │             ·   │
│     · │             ·   │
│     · │             ·   │
│  9991 │         2037516 │
│  9992 │          321693 │
│  9993 │          268620 │
│  9994 │         3496103 │
│  9995 │          376642 │
│  9996 │          406092 │
│  9997 │         1731126 │
│  9998 │          147772 │
│  9999 │         2103647 │
│ 10000 │         1037268 │
├───────┴─────────────────┤
│ ? rows        2 columns │
└─────────────────────────┘

In [50]:
conn.sql("""
    -- Creates a CTE that will join the necessary
    -- values from the dimension tables to the fact
    -- table
    WITH MergedCDI AS (
        SELECT
            c.LogID,
            c.DataValueUnit,
            c.DataValue,
            c.YearStart, 
            c.YearEnd,
            cl.LocationID,
            cl.LocationDesc, 
            q.QuestionID,
            q.AgeStart,
            q.AgeEnd,
            dvt.DataValueTypeID,
            dvt.DataValueType,
            s.StratificationID,
            s.Sex,
            s.Ethnicity,
            s.Origin
        FROM CDI c
        LEFT JOIN CDILocation cl
        ON c.LocationID = cl.LocationID
        LEFT JOIN Question q
        ON c.QuestionID = q.QuestionID
        LEFT JOIN DataValueType dvt
        ON c.DataValueTypeID = dvt.DataValueTypeID
        LEFT JOIN Stratification s
        ON c.StratificationID = s.StratificationID
    ),

    -- joins necessary values to Population table 
    -- via primary keys of its dimension tables
    MergedPopulation AS (
        SELECT
            ps.StateID,
            ps.State,
            p.Age,
            p.Year,
            s.Sex,
            s.Ethnicity,
            s.Origin,
            p.Population
        FROM Population p
        LEFT JOIN PopulationState ps
        ON p.StateID = ps.StateID
        LEFT JOIN Stratification s
        ON p.StratificationID = s.StratificationID
    ),


    LeftTable AS (
        SELECT *
        FROM MergedCDI
        WHERE AgeStart IS NULL AND AgeEnd IS NULL 
        LIMIT 1
    ),

    -- performs an inner join on both CDI and Population
    -- tables based
    CDIWithPop AS (
        SELECT 
            mcdi.LogID AS LogID,
            mcdi.DataValueUnit AS DataValueUnit,
            mcdi.DataValue AS DataValue,
            mcdi.YearStart AS YearStart, 
            mcdi.YearEnd AS YearEnd,
            mcdi.LocationID AS LocationID,
            mcdi.LocationDesc AS LocationDesc, 
            mcdi.QuestionID as QuestionID,
            mcdi.AgeStart AS AgeStart,
            mcdi.AgeEnd AS AgeEnd,
            mcdi.DataValueTypeID AS DataValueTypeID,
            mcdi.DataValueType AS DataValueType,
            mcdi.StratificationID AS StratificationID,
            mcdi.Sex AS Sex,
            mcdi.Ethnicity AS Ethnicity,
            mcdi.Origin AS Origin,
        
            mp.Population,
            mp.State PState,
            mp.Age AS PAge,
            mp.Year AS PYear,
            mp.Sex AS PSex,
            mp.Ethnicity AS PEthnicity,
            mp.Origin AS POrigin
        FROM MergedPopulation mp
        INNER JOIN MergedCDI mcdi
        ON (mp.Year BETWEEN mcdi.YearStart AND mcdi.YearEnd) AND
        (mp.StateID = mcdi.LocationID) AND
        ((mp.Age BETWEEN mcdi.AgeStart AND (CASE WHEN mcdi.AgeEnd = 'infinity' THEN 85 ELSE mcdi.AgeEnd END)) OR (mcdi.AgeStart IS NULL AND mcdi.AgeEnd IS NULL)) AND
        (mp.Sex = mcdi.Sex OR mcdi.Sex = 'Both') AND
        (mp.Ethnicity = mcdi.Ethnicity OR mcdi.Ethnicity = 'All') AND
        (mp.Origin = mcdi.Origin OR mcdi.Origin = 'Both')
    )

    SELECT COUNT(*) FROM CDIWithPop
""")

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│    784737296 │
└──────────────┘

#### confirming below that those with logid `<n>` before group by would yield the same total population based on stratification, state, yearstart, yearend, agestart, ageend 

In [59]:
cdi_log_id_1 = conn.sql("""
    -- Creates a CTE that will join the necessary
    -- values from the dimension tables to the fact
    -- table
    WITH MergedCDI AS (
        SELECT
            c.LogID,
            c.DataValueUnit,
            c.DataValue,
            c.YearStart, 
            c.YearEnd,
            cl.LocationID,
            cl.LocationDesc, 
            q.QuestionID,
            q.AgeStart,
            q.AgeEnd,
            dvt.DataValueTypeID,
            dvt.DataValueType,
            s.StratificationID,
            s.Sex,
            s.Ethnicity,
            s.Origin
        FROM CDI c
        LEFT JOIN CDILocation cl
        ON c.LocationID = cl.LocationID
        LEFT JOIN Question q
        ON c.QuestionID = q.QuestionID
        LEFT JOIN DataValueType dvt
        ON c.DataValueTypeID = dvt.DataValueTypeID
        LEFT JOIN Stratification s
        ON c.StratificationID = s.StratificationID
    ),

    -- joins necessary values to Population table 
    -- via primary keys of its dimension tables
    MergedPopulation AS (
        SELECT
            ps.StateID,
            ps.State,
            p.Age,
            p.Year,
            s.Sex,
            s.Ethnicity,
            s.Origin,
            p.Population
        FROM Population p
        LEFT JOIN PopulationState ps
        ON p.StateID = ps.StateID
        LEFT JOIN Stratification s
        ON p.StratificationID = s.StratificationID
    ),


    LeftTable AS (
        SELECT *
        FROM MergedCDI
        WHERE AgeStart IS NULL AND AgeEnd IS NULL 
        LIMIT 1
    ),

    -- performs an inner join on both CDI and Population
    -- tables based
    CDIWithPop AS (
        SELECT 
            mcdi.LogID AS LogID,
            mcdi.DataValueUnit AS DataValueUnit,
            mcdi.DataValue AS DataValue,
            mcdi.YearStart AS YearStart, 
            mcdi.YearEnd AS YearEnd,
            mcdi.LocationID AS LocationID,
            mcdi.LocationDesc AS LocationDesc, 
            mcdi.QuestionID as QuestionID,
            mcdi.AgeStart AS AgeStart,
            mcdi.AgeEnd AS AgeEnd,
            mcdi.DataValueTypeID AS DataValueTypeID,
            mcdi.DataValueType AS DataValueType,
            mcdi.StratificationID AS StratificationID,
            mcdi.Sex AS Sex,
            mcdi.Ethnicity AS Ethnicity,
            mcdi.Origin AS Origin,
        
            mp.Population,
            mp.State PState,
            mp.Age AS PAge,
            mp.Year AS PYear,
            mp.Sex AS PSex,
            mp.Ethnicity AS PEthnicity,
            mp.Origin AS POrigin
        FROM MergedPopulation mp
        INNER JOIN MergedCDI mcdi
        ON (mp.Year BETWEEN mcdi.YearStart AND mcdi.YearEnd) AND
        (mp.StateID = mcdi.LocationID) AND
        ((mp.Age BETWEEN mcdi.AgeStart AND (CASE WHEN mcdi.AgeEnd = 'infinity' THEN 85 ELSE mcdi.AgeEnd END)) OR (mcdi.AgeStart IS NULL AND mcdi.AgeEnd IS NULL)) AND
        (mp.Sex = mcdi.Sex OR mcdi.Sex = 'Both') AND
        (mp.Ethnicity = mcdi.Ethnicity OR mcdi.Ethnicity = 'All') AND
        (mp.Origin = mcdi.Origin OR mcdi.Origin = 'Both')
    )

    SELECT *
    FROM CDIWithPop
    WHERE LogID = 82
""").fetchall()

In [60]:
log_id, _, _, year_start, year_end, loc_id, loc_desc, question_id, age_start, age_end, dvt_id, dvt, strat_id, sex, ethnicity, origin, population, state_desc, age, year, _, _, _ = zip(*cdi_log_id_1)

In [70]:
age_start[-1]

In [66]:
population

(5,
 5,
 4,
 5,
 5,
 5,
 3,
 0,
 3,
 1,
 0,
 1,
 0,
 0,
 2,
 1,
 3,
 3,
 1,
 2,
 4,
 3,
 4,
 4,
 1,
 2,
 4,
 3,
 9,
 4,
 7,
 2,
 1,
 2,
 0,
 3,
 3,
 4,
 1,
 1,
 1,
 3,
 0,
 4,
 3,
 2,
 2,
 2,
 0,
 2,
 2,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 2,
 3,
 3,
 2,
 0,
 2,
 3,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 3,
 5,
 5,
 4,
 2,
 4,
 3,
 4,
 2,
 1,
 1,
 1,
 3,
 4,
 4,
 0,
 2,
 0,
 0,
 3,
 1,
 0,
 4,
 2,
 7,
 2,
 0,
 7,
 3,
 3,
 2,
 1,
 5,
 2,
 1,
 3,
 3,
 4,
 1,
 1,
 1,
 1,
 5,
 2,
 2,
 2,
 3,
 6,
 1,
 0,
 1,
 2,
 2,
 5,
 1,
 1,
 1,
 3,
 5,
 3,
 3,
 1,
 3,
 4,
 0,
 1,
 1,
 0,
 4,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 1,
 3)

In [71]:
loc_desc[-1]

'Delaware'

In [72]:
strat_id[-1]

'NH_B_NHPI'

In [63]:
age

(0.0,
 1.0,
 2.0,
 3.0,
 4.0,
 5.0,
 6.0,
 7.0,
 8.0,
 9.0,
 10.0,
 11.0,
 12.0,
 13.0,
 14.0,
 15.0,
 16.0,
 17.0,
 18.0,
 19.0,
 20.0,
 21.0,
 22.0,
 23.0,
 24.0,
 25.0,
 26.0,
 27.0,
 28.0,
 29.0,
 30.0,
 31.0,
 32.0,
 33.0,
 34.0,
 35.0,
 36.0,
 37.0,
 38.0,
 39.0,
 40.0,
 41.0,
 42.0,
 43.0,
 44.0,
 45.0,
 46.0,
 47.0,
 48.0,
 49.0,
 50.0,
 51.0,
 52.0,
 53.0,
 54.0,
 55.0,
 56.0,
 57.0,
 58.0,
 59.0,
 60.0,
 61.0,
 62.0,
 63.0,
 64.0,
 65.0,
 66.0,
 67.0,
 68.0,
 69.0,
 70.0,
 71.0,
 72.0,
 73.0,
 74.0,
 75.0,
 76.0,
 77.0,
 78.0,
 79.0,
 80.0,
 81.0,
 82.0,
 83.0,
 84.0,
 85.0,
 0.0,
 1.0,
 2.0,
 3.0,
 4.0,
 5.0,
 6.0,
 7.0,
 8.0,
 9.0,
 10.0,
 11.0,
 12.0,
 13.0,
 14.0,
 15.0,
 16.0,
 17.0,
 18.0,
 19.0,
 20.0,
 21.0,
 22.0,
 23.0,
 24.0,
 25.0,
 26.0,
 27.0,
 28.0,
 29.0,
 30.0,
 31.0,
 32.0,
 33.0,
 34.0,
 35.0,
 36.0,
 37.0,
 38.0,
 39.0,
 40.0,
 41.0,
 42.0,
 43.0,
 44.0,
 45.0,
 46.0,
 47.0,
 48.0,
 49.0,
 50.0,
 51.0,
 52.0,
 53.0,
 54.0,
 55.0,
 56.0,
 57.0,
 58.0,
 59.0

In [76]:
year_start[-1], year_end[-1]

(2016, 2016)

In [65]:
sum(population)

339

#### confirmed that total population of those with id 1 is the same as the table that has been grouped

# full join, now with aggregation of necessary rows according to their agestart, ageend, yearstart, yearend, ethnicity, sex, state, origin, 

In [27]:
total_population = conn.sql("""
    -- Creates a CTE that will join the necessary
    -- values from the dimension tables to the fact
    -- table
    WITH MergedCDI AS (
        SELECT
            c.LogID,
            c.DataValueUnit,
            c.DataValue,
            c.YearStart, 
            c.YearEnd,
            cl.LocationID,
            cl.LocationDesc, 
            q.QuestionID,
            q.AgeStart,
            q.AgeEnd,
            dvt.DataValueTypeID,
            dvt.DataValueType,
            s.StratificationID,
            s.Sex,
            s.Ethnicity,
            s.Origin
        FROM CDI c
        LEFT JOIN CDILocation cl
        ON c.LocationID = cl.LocationID
        LEFT JOIN Question q
        ON c.QuestionID = q.QuestionID
        LEFT JOIN DataValueType dvt
        ON c.DataValueTypeID = dvt.DataValueTypeID
        LEFT JOIN Stratification s
        ON c.StratificationID = s.StratificationID
    ),

    -- joins necessary values to Population table 
    -- via primary keys of its dimension tables
    MergedPopulation AS (
        SELECT
            ps.StateID,
            ps.State,
            p.Age,
            p.Year,
            s.Sex,
            s.Ethnicity,
            s.Origin,
            p.Population
        FROM Population p
        LEFT JOIN PopulationState ps
        ON p.StateID = ps.StateID
        LEFT JOIN Stratification s
        ON p.StratificationID = s.StratificationID
    ),


    LeftTable AS (
        SELECT *
        FROM MergedCDI
        WHERE AgeStart IS NULL AND AgeEnd IS NULL 
        LIMIT 1
    ),

    -- performs an inner join on both CDI and Population
    -- tables based
    CDIWithPop AS (
        SELECT 
            mcdi.LogID AS LogID,
            mcdi.DataValueUnit AS DataValueUnit,
            mcdi.DataValue AS DataValue,
            mcdi.YearStart AS YearStart, 
            mcdi.YearEnd AS YearEnd,
            mcdi.LocationID AS LocationID,
            mcdi.LocationDesc AS LocationDesc, 
            mcdi.QuestionID as QuestionID,
            mcdi.AgeStart AS AgeStart,
            mcdi.AgeEnd AS AgeEnd,
            mcdi.DataValueTypeID AS DataValueTypeID,
            mcdi.DataValueType AS DataValueType,
            mcdi.StratificationID AS StratificationID,
            mcdi.Sex AS Sex,
            mcdi.Ethnicity AS Ethnicity,
            mcdi.Origin AS Origin,
        
            mp.Population,
            mp.State PState,
            mp.Age AS PAge,
            mp.Year AS PYear,
            mp.Sex AS PSex,
            mp.Ethnicity AS PEthnicity,
            mp.Origin AS POrigin
        FROM MergedPopulation mp
        INNER JOIN MergedCDI mcdi
        ON (mp.Year BETWEEN mcdi.YearStart AND mcdi.YearEnd) AND
        (mp.StateID = mcdi.LocationID) AND
        ((mp.Age BETWEEN mcdi.AgeStart AND (CASE WHEN mcdi.AgeEnd = 'infinity' THEN 85 ELSE mcdi.AgeEnd END)) OR (mcdi.AgeStart IS NULL AND mcdi.AgeEnd IS NULL)) AND
        (mp.Sex = mcdi.Sex OR mcdi.Sex = 'Both') AND
        (mp.Ethnicity = mcdi.Ethnicity OR mcdi.Ethnicity = 'All') AND
        (mp.Origin = mcdi.Origin OR mcdi.Origin = 'Both')
    )

    SELECT 
        LogID,
        SUM(Population) AS TotalPopulation
    FROM CDIWithPop
    GROUP BY LogID
    ORDER BY LogID ASC
""").fetchall()

In [28]:
log_id, total_population_count = zip(*total_population)
log_id[0], log_id[-1]

(1, 678471)

In [29]:
len(log_id)

678471

In [30]:
missing_ids = list(set(range(1, log_id[-1] + 1)) - set(log_id))
missing_ids

[]

In [31]:
len(missing_ids)

0

In [32]:
conn.sql(f"""
    -- Creates a CTE that will join the necessary
    -- values from the dimension tables to the fact
    -- table
    WITH MergedCDI AS (
        SELECT
            c.LogID,
            c.DataValueUnit,
            c.DataValue,
            c.YearStart, 
            c.YearEnd,
            cl.LocationID,
            cl.LocationDesc, 
            q.QuestionID,
            q.AgeStart,
            q.AgeEnd,
            dvt.DataValueTypeID,
            dvt.DataValueType,
            s.StratificationID,
            s.Sex,
            s.Ethnicity,
            s.Origin
        FROM CDI c
        LEFT JOIN CDILocation cl
        ON c.LocationID = cl.LocationID
        LEFT JOIN Question q
        ON c.QuestionID = q.QuestionID
        LEFT JOIN DataValueType dvt
        ON c.DataValueTypeID = dvt.DataValueTypeID
        LEFT JOIN Stratification s
        ON c.StratificationID = s.StratificationID
    )
         
    SELECT *
    FROM MergedCDI
    WHERE LogID IN {missing_ids}
""")

┌───────┬───────────────┬───────────┬───────────┬─────────┬────────────┬──────────────┬────────────┬──────────┬────────┬─────────────────┬───────────────┬──────────────────┬─────────┬───────────┬─────────┐
│ LogID │ DataValueUnit │ DataValue │ YearStart │ YearEnd │ LocationID │ LocationDesc │ QuestionID │ AgeStart │ AgeEnd │ DataValueTypeID │ DataValueType │ StratificationID │   Sex   │ Ethnicity │ Origin  │
│ int32 │    varchar    │  double   │   int32   │  int32  │  varchar   │   varchar    │  varchar   │  double  │ double │     varchar     │    varchar    │     varchar      │ varchar │  varchar  │ varchar │
├───────┴───────────────┴───────────┴───────────┴─────────┴────────────┴──────────────┴────────────┴──────────┴────────┴─────────────────┴───────────────┴──────────────────┴─────────┴───────────┴─────────┤
│                                                                                                  0 rows                                                                       

In [33]:
conn.sql(f"""
    -- Creates a CTE that will join the necessary
    -- values from the dimension tables to the fact
    -- table
    WITH MergedCDI AS (
        SELECT
            c.LogID,
            c.DataValueUnit,
            c.DataValue,
            c.YearStart, 
            c.YearEnd,
            cl.LocationID,
            cl.LocationDesc, 
            q.QuestionID,
            q.AgeStart,
            q.AgeEnd,
            dvt.DataValueTypeID,
            dvt.DataValueType,
            s.StratificationID,
            s.Sex,
            s.Ethnicity,
            s.Origin
        FROM CDI c
        LEFT JOIN CDILocation cl
        ON c.LocationID = cl.LocationID
        LEFT JOIN Question q
        ON c.QuestionID = q.QuestionID
        LEFT JOIN DataValueType dvt
        ON c.DataValueTypeID = dvt.DataValueTypeID
        LEFT JOIN Stratification s
        ON c.StratificationID = s.StratificationID
    )
         
    SELECT DISTINCT(Ethnicity)
    FROM MergedCDI
    WHERE LogID IN {missing_ids}
""")

┌───────────┐
│ Ethnicity │
│  varchar  │
├───────────┤
│  0 rows   │
└───────────┘

In [81]:
# conn.close()