In [26]:
import duckdb
import os

from dotenv import load_dotenv
from pathlib import Path
from duckdb.typing import *

from utilities.utils import get_flat_table_rows

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Remote connection

In [27]:
# Load variables from the .env file in the notebook's working directory.
# Use this only in development.
print("loading env variables...")
env_dir = Path('./').resolve()
# load_dotenv returns False when it found nothing to set (missing or empty
# file), so report that instead of announcing a successful load.
if load_dotenv(env_dir / '.env'):
    print("env variables loaded.\n")
else:
    print("no variables found in .env; relying on the existing environment.\n")

loading env variables...
env variables loaded.



In [28]:
# Other URI spellings for the same database, kept for reference:
#   jdbc:duckdb:md:chronic_disease_analyses_db
#   duckdb:///md:chronic_disease_analyses_db
print("connecting to duckdb...")
# The token is read from the environment at connect time; a missing
# MOTHERDUCK_TOKEN raises KeyError after the progress message above.
conn = duckdb.connect(
    "md:chronic_disease_analyses_db?motherduck_token={token}".format(
        token=os.environ['MOTHERDUCK_TOKEN']
    )
)
print("connected to duckdb.\n")

connecting to duckdb...
connected to duckdb.



In [29]:
# Fetch the table listing, then flatten the fetched rows; the rendered result
# is a plain list of table names.
table_rows = conn.sql("""SHOW TABLES""").fetchall()
tables = get_flat_table_rows(table_rows)
tables

['CDI',
 'CDILocation',
 'CDIStratification',
 'CalculatedPopulation',
 'DataValueType',
 'Population',
 'PopulationState',
 'PopulationStratification',
 'Question',
 'Stratification',
 'Topic']

In [30]:
# Print the row count of every table in `tables`.
for table in tables:
    count_query = f"""SELECT COUNT(*) FROM {table}"""
    # COUNT(*) yields exactly one row with one column, so the first fetched
    # row's first field is the count.
    count = conn.sql(count_query).fetchone()[0]
    print(f"table {table} count: {count}")

table CDI count: 678471
table CDILocation count: 51
table CDIStratification count: 11
table CalculatedPopulation count: 678471
table DataValueType count: 15
table Population count: 2947392
table PopulationState count: 51
table PopulationStratification count: 28
table Question count: 192
table Stratification count: 39
table Topic count: 17


In [31]:
# Preview the Question table; the relation is left as the cell's last
# expression so the notebook renders it.
question_preview = conn.sql("""
    SELECT * FROM Question
""")
question_preview

┌────────────┬─────────┬──────────────────────────────────────────────────────────────────────────────────────────────────────────────┬──────────┬────────┐
│ QuestionID │ TopicID │                                                   Question                                                   │ AgeStart │ AgeEnd │
│  varchar   │ varchar │                                                   varchar                                                    │  double  │ double │
├────────────┼─────────┼──────────────────────────────────────────────────────────────────────────────────────────────────────────────┼──────────┼────────┤
│ ART1_1     │ ART     │ Arthritis among adults aged >= 18 years                                                                      │     18.0 │    inf │
│ ALC4_0     │ ALC     │ Binge drinking intensity among adults aged >= 18 years who binge drink                                       │     18.0 │    inf │
│ NPAW2_3    │ NPAW    │ Overweight or obesity among women aged 

In [36]:
# Collect the distinct (TopicID, Topic) pairs for topics that have questions.
# NOTE(review): the parenthesised column list after DISTINCT appears to be read
# as a single row value, which is why each flattened entry is a pair. Confirm
# how get_flat_table_rows treats multi-column rows before rewriting this as
# `SELECT DISTINCT Question.TopicID, Topic`.
topic_pairs_query = """
    SELECT DISTINCT(Question.TopicID, Topic)
    FROM Question
    JOIN Topic
    ON Question.TopicID = Topic.TopicID
"""
topic_pair_rows = conn.sql(topic_pairs_query).fetchall()
topic_id = get_flat_table_rows(topic_pair_rows)
topic_id

[('CAN', 'Cancer'),
 ('MTH', 'Mental Health'),
 ('IMM', 'Immunization'),
 ('OLD', 'Older Adults'),
 ('CVD', 'Cardiovascular Disease'),
 ('DIA', 'Diabetes'),
 ('ART', 'Arthritis'),
 ('COPD', 'Chronic Obstructive Pulmonary Disease'),
 ('ALC', 'Alcohol'),
 ('RPH', 'Reproductive Health'),
 ('NPAW', 'Nutrition, Physical Activity, and Weight Status'),
 ('CKD', 'Chronic Kidney Disease'),
 ('TOB', 'Tobacco'),
 ('AST', 'Asthma'),
 ('DIS', 'Disability'),
 ('ORH', 'Oral Health'),
 ('OVC', 'Overarching Conditions')]

In [42]:
# Print every question that belongs to each topic.
# `topic_id` holds (TopicID, Topic) pairs. The loop variable is deliberately
# not called `id`, so the builtin id() stays usable in later cells.
# The topic code is bound as a query parameter rather than interpolated into
# the SQL text, so a code containing a quote cannot break the statement.
questions_by_topic_query = """
    SELECT Question
    FROM Question
    WHERE TopicID = ?
"""
for topic_code, topic in topic_id:
    topic_id_questions = get_flat_table_rows(
        conn.execute(questions_by_topic_query, [topic_code]).fetchall()
    )
    print(f"topic {topic}, questions: {topic_id_questions}\n")

topic Cancer, questions: ['Papanicolaou smear use among adult women aged 21-65 years', 'Recent Papanicolaou smear use among women aged 21-44 years', 'Mammography use among women aged 50-74 years', 'Fecal occult blood test, sigmoidoscopy, or colonoscopy among adults aged 50-75 years', 'Invasive cancer of the prostate, incidence', 'Cancer of the colon and rectum (colorectal), incidence', 'Cancer of the oral cavity and pharynx, mortality', 'Invasive cancer of the oral cavity or pharynx, incidence', 'Cancer of the female breast, mortality', 'Melanoma, mortality', 'Cancer of the prostate, mortality', 'Invasive cancer (all sites combined), incidence', 'Cancer of the lung and bronchus, mortality', 'Invasive cancer of the female breast, incidence', 'Cancer of the lung and bronchus, incidence', 'Cancer of the colon and rectum (colorectal), mortality', 'Invasive melanoma, incidence', 'Invasive cancer (all sites combined), mortality', 'Invasive cancer of the cervix, incidence', 'Cancer of the fem

#### The next job is to identify, one question at a time if necessary, which questions are compatible with my CalculatedPopulation table, which contains the populations by sex, age, race, and origin.