In [1]:
import duckdb
import os

from dotenv import load_dotenv
from pathlib import Path
from duckdb.typing import *

from utilities.utils import get_flat_table_rows

%load_ext autoreload
%autoreload 2

# Remote connection

In [2]:
# Build paths inside the project like this: BASE_DIR / 'subdir'.
# use this only in development
print("loading env variables...")
env_dir = Path('./').resolve()
load_dotenv(os.path.join(env_dir, '.env'))
print("env variables loaded.\n")

loading env variables...
env variables loaded.



In [3]:
# jdbc:duckdb:md:chronic_disease_analyses_db
# duckdb:///md:chronic_disease_analyses_db
print("connecting to duckdb...")
conn = duckdb.connect(f"md:chronic_disease_analyses_db?motherduck_token={os.environ['MOTHERDUCK_TOKEN']}")
print("connected to duckdb.\n")

connecting to duckdb...
connected to duckdb.



In [4]:
tables = get_flat_table_rows(conn.sql("""SHOW TABLES""").fetchall())
tables

['CDI',
 'CDILocation',
 'CDIStratification',
 'DataValueType',
 'Population',
 'PopulationState',
 'PopulationStratification',
 'Question',
 'Stratification',
 'Topic']

In [5]:
for table in tables:
    count = conn.sql(f"""SELECT COUNT(*) FROM {table}""").fetchall()[0][0]
    print(f"table {table} count: {count}")

table CDI count: 678471
table CDILocation count: 51
table CDIStratification count: 11
table DataValueType count: 15
table Population count: 2947392
table PopulationState count: 51
table PopulationStratification count: 28
table Question count: 192
table Stratification count: 39
table Topic count: 17


In [6]:
conn.sql("""
    SELECT * FROM Question
""")

┌────────────┬─────────┬──────────────────────────────────────────────────────────────────────────────────────────────────────────────┬──────────┬────────┐
│ QuestionID │ TopicID │                                                   Question                                                   │ AgeStart │ AgeEnd │
│  varchar   │ varchar │                                                   varchar                                                    │  double  │ double │
├────────────┼─────────┼──────────────────────────────────────────────────────────────────────────────────────────────────────────────┼──────────┼────────┤
│ ART1_1     │ ART     │ Arthritis among adults aged >= 18 years                                                                      │     18.0 │    inf │
│ ALC4_0     │ ALC     │ Binge drinking intensity among adults aged >= 18 years who binge drink                                       │     18.0 │    inf │
│ NPAW2_3    │ NPAW    │ Overweight or obesity among women aged 

# Unique topics

In [7]:
topic_id = get_flat_table_rows(conn.sql("""
    SELECT DISTINCT(Question.TopicID, Topic)
    FROM Question
    JOIN Topic
    ON Question.TopicID = Topic.TopicID
""").fetchall())
topic_id

[('CAN', 'Cancer'),
 ('MTH', 'Mental Health'),
 ('IMM', 'Immunization'),
 ('OLD', 'Older Adults'),
 ('CVD', 'Cardiovascular Disease'),
 ('ART', 'Arthritis'),
 ('NPAW', 'Nutrition, Physical Activity, and Weight Status'),
 ('CKD', 'Chronic Kidney Disease'),
 ('AST', 'Asthma'),
 ('DIS', 'Disability'),
 ('TOB', 'Tobacco'),
 ('DIA', 'Diabetes'),
 ('ORH', 'Oral Health'),
 ('OVC', 'Overarching Conditions'),
 ('ALC', 'Alcohol'),
 ('RPH', 'Reproductive Health'),
 ('COPD', 'Chronic Obstructive Pulmonary Disease')]

In [8]:
dvt = get_flat_table_rows(conn.sql("""
    SELECT DISTINCT(DataValueType, DataValueTypeID)
    FROM DataValueType
""").fetchall())
dvt

[('Number', 'NMBR'),
 ('Crude Prevalence', 'CRDPREV'),
 ('Percent', 'PRCT'),
 ('Average Annual Crude Rate', 'AVGANNCRDRATE'),
 ('Median', 'MEDIAN'),
 ('Age-adjusted Mean', 'AGEADJMEAN'),
 ('Per capita alcohol consumption', 'PERCAPALC'),
 ('Mean', 'MEAN'),
 ('Prevalence', 'PREV'),
 ('Crude Rate', 'CRDRATE'),
 ('Age-adjusted Prevalence', 'AGEADJPREV'),
 ('Adjusted by age, sex, race and ethnicity', 'AGESEXRACEADJRATE'),
 ('Age-adjusted Rate', 'AGEADJRATE'),
 ('US Dollars', 'USD'),
 ('Average Annual Age-adjusted Rate', 'AVGANNAGEADJRATE')]

# unique questions under each unique topic

In [9]:
for id, topic in topic_id:
    topic_id_questions = get_flat_table_rows(conn.sql(f"""
        SELECT Question
        FROM Question
        WHERE TopicID = '{id}'
    """).fetchall())
    print(f"topic {topic}, questions: {topic_id_questions}\n")

topic Cancer, questions: ['Papanicolaou smear use among adult women aged 21-65 years', 'Recent Papanicolaou smear use among women aged 21-44 years', 'Mammography use among women aged 50-74 years', 'Fecal occult blood test, sigmoidoscopy, or colonoscopy among adults aged 50-75 years', 'Invasive cancer of the prostate, incidence', 'Cancer of the colon and rectum (colorectal), incidence', 'Cancer of the oral cavity and pharynx, mortality', 'Invasive cancer of the oral cavity or pharynx, incidence', 'Cancer of the female breast, mortality', 'Melanoma, mortality', 'Cancer of the prostate, mortality', 'Invasive cancer (all sites combined), incidence', 'Cancer of the lung and bronchus, mortality', 'Invasive cancer of the female breast, incidence', 'Cancer of the lung and bronchus, incidence', 'Cancer of the colon and rectum (colorectal), mortality', 'Invasive melanoma, incidence', 'Invasive cancer (all sites combined), mortality', 'Invasive cancer of the cervix, incidence', 'Cancer of the fem

# Unique combinations of datavaluetype and datavalueunit

In [10]:
dvt_dvu_pairs = conn.sql(f"""
    SELECT max(cdi.DataValueUnit), MAX(dvt.DataValueType)
    FROM CDI cdi
    LEFT JOIN DataValueType dvt
    ON cdi.DataValueTypeID = dvt.DataValueTypeID
    GROUP BY cdi.DataValueTypeID, cdi.DataValueUnit                                   
""")

In [11]:
sorted(dvt_dvu_pairs.fetchall(), key=lambda value: value[0])

[('$', 'US Dollars'),
 ('%', 'Age-adjusted Prevalence'),
 ('%', 'Percent'),
 ('%', 'Prevalence'),
 ('%', 'Crude Prevalence'),
 ('Number', 'Median'),
 ('Number', 'Number'),
 ('Number', 'Age-adjusted Mean'),
 ('Number', 'Mean'),
 ('Years', 'Number'),
 ('cases per 1,000', 'Crude Rate'),
 ('cases per 1,000', 'Age-adjusted Rate'),
 ('cases per 1,000,000', 'Number'),
 ('cases per 1,000,000', 'Age-adjusted Rate'),
 ('cases per 1,000,000', 'Adjusted by age, sex, race and ethnicity'),
 ('cases per 1,000,000', 'Crude Rate'),
 ('cases per 10,000', 'Crude Rate'),
 ('cases per 10,000', 'Age-adjusted Rate'),
 ('cases per 100,000', 'Crude Rate'),
 ('cases per 100,000', 'Age-adjusted Rate'),
 ('cases per 100,000', 'Average Annual Age-adjusted Rate'),
 ('cases per 100,000', 'Number'),
 ('cases per 100,000', 'Average Annual Crude Rate'),
 ('gallons', 'Per capita alcohol consumption'),
 ('pack sales per capita', 'Number')]

# What we have to do here now is to identify which questions are more likely to make use of the population values from the Population table we've just calculated and from there 

## Alcohol use among youth

![intial powerbi analyses (1).png](./figures%20&%20images/intial%20powerbi%20analyses%20(1).png)

it seems that `alcohol use among youth` question is something that can't make use of the `Population` table at first glance as what is measured here is alcohol use. But if we look closely the data value can't exactly say for sure whether alcohol amount is used e.g. 3.6% isn't exactly a measurement of alcohol amount but more likely a percentage of population namely the youth using alcohol

![intial powerbi analyses (2).png](./figures%20&%20images/intial%20powerbi%20analyses%20(2).png)

We also have other information like the datavaluetype used for the question. This is basically how PowerBI works when we put a slicer in our workspace we essentially get the unique values a unique value can take in, in this case a topic of `alcohol` has its unique questions like `alcohol use among youth` and then under it it has the unique `datavaluetype's` it uses like `crude prevalence`, and we know `crude prevalence` has exactly one corresponding `datavalueunit` which is the `%` symbol

because this question can make use of the `Population` table we can calculate the population of youth using alcohol. $\frac{datavalue}{100} \cdot youth\_population$

## amount of alcohol excise tax by beverage type (beer)

![](./figures%20&%20images/intial%20powerbi%20analyses%20(4).png)

in this case `amount of alcohol excise tax by beverage type (beer)` can't make use of the Population table as what is measured here is `amount of alcohol excise tax`

![](./figures%20&%20images/intial%20powerbi%20analyses%20(3).png)

calculation will be made here and the `datavalue` alone would just be used. 

## binge drinking frequency among adults aged >= 18 years who binge drink

![](./figures%20&%20images/intial%20powerbi%20analyses%20(5).png)

it seems that `binge drinking frequency among adults aged >= 18 years who binge drink` question is something that can't make use of the `Population` table as is only specific to the population of those 18+ **who binge drink**. To get a tangible number namely the drinking frequency of these people aged 18+ who binge drink, we need to get the prevalence of these people aged 18+ who indeed binge drink, which we know can be derived from other questions namely `binge drinking prevalence among adults aged >= 18 years` and then from there multiply it by the binge drinking frequency, since it is assumed that each person that binge drinks has this frequency.

![](./figures%20&%20images/intial%20powerbi%20analyses%20(6).png)

calculation would be $prevalence\_of\_binge\_drinkers\_aged\_18+ \cdot datavalue$ but since we don't have prevalence of binge drinkers aged 18+ until we further calculate it the alternative calculation would be $mean\_binge\_drinking\_frequency\_among\_adults\_aged\_18+\_who\_binge\_drink$, since its datavalueunit is just Number meaning this number as it is will be the representation of this indicator

## binge drinking prevalence among adults aged >=  18 years

![](./figures%20&%20images/intial%20powerbi%20analyses%20(7).png)

here `binge drinking prevalence among adults aged 18+` can make use of the `Population` table since the only demographic being focused on are adults aged 18+

![](./figures%20&%20images/intial%20powerbi%20analyses%20(8).png)

we can calculate the number of cases or the so called prevalence of these adults aged 18+ that have binge drinking by $ \frac{datavalue}{100} \cdot population\_of\_adults\_aged\_18+$

## chronic liver disease mortality

![](./figures%20&%20images/intial%20powerbi%20analyses%20(9).png)

There is not explicit age group listed in the question of `chronic liver disease mortality` so it is assumed that persons of all age groups will be in this pool, so we look to the stratification details of this data point. But because through secondary SQL transformation we have already created the table containing the `Population` table values based on age, sex, race, origin, state, and year. Again we can make use of the `Population` table here as we and calculate a tangible number pertaining not to the prevalence of a chronic disease indicator but the mortality rate of those having chronic disease.

Again like previous questions each data point has its stratification which we need to take into account and we had taken into account during the creation of the `Population` table, all we have to do now is do calculations to this specific `Population` value coupled with the given `datavalue`

![](./figures%20&%20images/intial%20powerbi%20analyses%20(10).png)

however we are looking now at the rate and we know for `datavaluetype`s of Age-Adjusted Rate and Crude Rate, these don't use `datavalueunit`s of % or percentages, where we can use our `datavalue` and divide it by mere 100. Since age-adjusted rate and crude rate vary in `datavalueunit`s namely cases per 1000, cases per 10000, cases per 100000, and cases per 1000000, we will have to conditionally change our denominator that our datavalue will divide to based on these `datavalueunit`s 

in this question the datavalueunit is not % but cases per 100000, so we should use 100000 as the denominator. Final calculation would be $\frac{datavalue}{100000} \cdot population\_value(considering age, state, year, sex, race, ethnicity)$ 

## Per capita alcohol consumption among persons aged >= 14 years 

![](./figures%20&%20images/intial%20powerbi%20analyses%20(11).png)

Again this wouldn't make sense for us to use the population value of those persons aged 14+ to calculate the number of cases of something as its not really the cases being measured here but the per capita alcohol consumption, but we can still make use of the `Population` table value for this data point in order to get the total amount of alcohol consumed by the whole population for this demographic of 14+ 

![](./figures%20&%20images/intial%20powerbi%20analyses%20(12).png)

To get the total number of "pack sales" for this population, you would multiply the per capita consumption by the total population: $Per Capita Alcohol Consumption (DataValue) * Total Population (>= 14 years)$

## We can actually do this in SQL instead of doing it in powerbi

* '$', 'US Dollars', 'USD': $datavalue$
* '%', 'Age-adjusted Prevalence', 'AGEADJPREV': $\frac{datavalue}{100} \cdot populationvalue$
* '%', 'Crude Prevalence', 'CRDPREV': $\frac{datavalue}{100} \cdot populationvalue$
* '%', 'Percent', 'PRCT': $\frac{datavalue}{100} \cdot populationvalue$
* '%', 'Prevalence', 'PREV': $\frac{datavalue}{100} \cdot populationvalue$
-----
* 'Number', 'Median', 'MEDIAN': $datavalue$
* 'Number', 'Age-adjusted Mean', 'AGEADJMEAN': $datavalue$ 
* 'Number', 'Mean', 'MEAN': $datavalue$
* 'Number', 'Number', 'NMBR': $datavalue$
* 'Years', 'Number', 'NMBR': $datavalue$
-----
* 'cases per 1,000', 'Crude Rate', 'CRDRATE': $\frac{datavalue}{1000} \cdot populationvalue$
* 'cases per 1,000', 'Age-adjusted Rate', 'AGEADJRATE': $\frac{datavalue}{1000} \cdot populationvalue$
-----
* 'cases per 1,000,000', 'Number', 'NMBR': 
* 'cases per 1,000,000', 'Age-adjusted Rate', 'AGEADJRATE': $\frac{datavalue}{1000000} \cdot populationvalue$
* 'cases per 1,000,000', 'Adjusted by age, sex, race and ethnicity', 'AGESEXRACEADJRATE': $\frac{datavalue}{1000000} \cdot populationvalue$
* 'cases per 1,000,000', 'Crude Rate', 'CRDRATE': $\frac{datavalue}{1000000} \cdot populationvalue$
-----
* 'cases per 10,000', 'Crude Rate', 'CRDRATE': $\frac{datavalue}{10000} \cdot populationvalue$
* 'cases per 10,000', 'Age-adjusted Rate', 'AGEADJRATE': $\frac{datavalue}{10000} \cdot populationvalue$
-----
* 'cases per 100,000', 'Average Annual Crude Rate', 'AVGANNCRDRATE': $\frac{datavalue}{100000} \cdot populationvalue$
* 'cases per 100,000', 'Crude Rate', 'CRDRATE': $\frac{datavalue}{100000} \cdot populationvalue$
* 'cases per 100,000', 'Age-adjusted Rate', 'AGEADJRATE': $\frac{datavalue}{100000} \cdot populationvalue$
* 'cases per 100,000', 'Average Annual Age-adjusted Rate', 'AVGANNAGEADJRATE: $\frac{datavalue}{100000} \cdot populationvalue$
* 'cases per 100,000', 'Number', 'NMBR':
-----
* 'gallons', 'Per capita alcohol consumption', 'PERCAPALC': 
* 'pack sales per capita', 'Number', 'NMBR': depends on question if what is measured can make use of the number of cases of people having the CDI

# Asking questions

## 1. What is the prevalence of alcohol use among youth (male and female) in listed year ranges?

![](./figures%20&%20images/initial%20analyses%20powerbi%20(2).png)

![](./figures%20&%20images/initial%20analyses%20powerbi.png)

what we want to do now is to filter the chronic disease table into the `alcohol use among youth` question, with the stratification across hispanic and non hispanic, female and male, and all races

we have this slicer here and it basically filters everything in the CDI table that has the alcohol use among youth question and so we want to calculate some sort of average of alcohol use among youth or any kind of calculation with the table resulting from the slicer filtering this question. Question is how can we use this calculated table visual to make aggregations to it and using some kind of DAX/SQL to queery from this table?

**DAX query** to create the column `TotalEvents` representing the tangible number of a chronic disease indicator:
```
TotalEvents = SWITCH(
    TRUE(),
    'chronic_disease_analyses_db   main   CDI'[DataValueTypeID] IN {"USD", "MEDIAN", "AGEADJMEAN", "NMBR", "MEAN"},
    'chronic_disease_analyses_db   main   CDI'[DataValue],

    'chronic_disease_analyses_db   main   CDI'[DataValueTypeID] IN {"AGEADJPREV", "CRDPREV", "PRCT", "PREV"} && 'chronic_disease_analyses_db   main   CDI'[DataValueUnit] = "%",
    ('chronic_disease_analyses_db   main   CDI'[DataValue] / 100) * RELATED('chronic_disease_analyses_db   main   CalculatedPopulation'[Population]),

    'chronic_disease_analyses_db   main   CDI'[DataValueTypeID] IN {"CRDRATE", "AGEADJRATE"} && 'chronic_disease_analyses_db   main   CDI'[DataValueUnit] = "cases per 1,000",
    ('chronic_disease_analyses_db   main   CDI'[DataValue] / 1000) * RELATED('chronic_disease_analyses_db   main   CalculatedPopulation'[Population]),

    'chronic_disease_analyses_db   main   CDI'[DataValueTypeID] IN {"CRDRATE", "AGEADJRATE"} && 'chronic_disease_analyses_db   main   CDI'[DataValueUnit] = "cases per 10,000",
    ('chronic_disease_analyses_db   main   CDI'[DataValue] / 10000) * RELATED('chronic_disease_analyses_db   main   CalculatedPopulation'[Population]),

    'chronic_disease_analyses_db   main   CDI'[DataValueTypeID] IN {"CRDRATE", "AGEADJRATE", "AVGANNCRDRATE", "AVGANNAGEADJRATE"} && 'chronic_disease_analyses_db   main   CDI'[DataValueUnit] = "cases per 100,000",
    ('chronic_disease_analyses_db   main   CDI'[DataValue] / 100000) * RELATED('chronic_disease_analyses_db   main   CalculatedPopulation'[Population]),

    'chronic_disease_analyses_db   main   CDI'[DataValueTypeID] IN {"CRDRATE", "AGEADJRATE", "AGESEXRACEADJRATE"} && 'chronic_disease_analyses_db   main   CDI'[DataValueUnit] = "cases per 1,000,000",
    ('chronic_disease_analyses_db   main   CDI'[DataValue] / 1000000) * RELATED('chronic_disease_analyses_db   main   CalculatedPopulation'[Population]),

    'chronic_disease_analyses_db   main   CDI'[DataValueTypeID] IN {"PERCAPALC"} && 'chronic_disease_analyses_db   main   CDI'[DataValueUnit] = "gallons",
    'chronic_disease_analyses_db   main   CDI'[DataValue] * RELATED('chronic_disease_analyses_db   main   CalculatedPopulation'[Population])
)
```

Now that we have the `TotalEvents` column we can use it in a visual that shows the top states in a specific year range that has the most youth population estimates using alcohol

![](./figures%20&%20images/initial%20analyses%20powerbi%20(4).png)

this answers our question above in multiple ways
- the states with the most estimated youth populations recorded in 2013 using alcohol were in Texas, New York, Florida, Illinois, and Ohio
- the states with the most estimated youth populations recorded in 2015 using alcohol were in California, Florida, New York, Illinois, and Pennsylvania
- the states with the most estimated youth populations recorded in 2017 using alcohol were in California, Texas, New York, Florida, and Pennsylvania
- the states with the most estimated youth populations recorded in 2019 using alcohol were in California, Texas, New York, Florida, and Illinois

# 2. 