# Analysis

## Setup

Zuerst laden wir die benötigten Daten herunter und initialisieren die genutzten Python Objekte.

In [149]:
tables = [
    "careplans",
    "conditions",
    "observations",
    "patients",
]

files = [
    "data/others/",
    "data/asthma/",
    "data/gallstones/",
    "data/hypertension/",
]

In [150]:
!mkdir -p data/allergy

Syntaxfehler.


In [151]:
from urllib.request import urlopen
import os

def ensure_file_has_been_downloaded(filename):
    full_filename = "../" + filename

    url = "https://raw.githubusercontent.com/Fuenfgeld/DMA2022DataProjectC/main/" + filename
    if os.path.isfile(full_filename):
        print("File {} already exists, skipping download".format(filename))
    else:
        print("Downloading {}".format(filename))
        download_file(url, full_filename)

def download_file(url, filename):
    with open(filename, 'wb') as out_file:
        with urlopen(url) as file:
            out_file.write(file.read())

if not os.path.isfile("extract.py"):
    download_file(
        "https://raw.githubusercontent.com/Fuenfgeld/DMA2022DataProjectC/main/src/extract.py",
        "extract.py"
    )

for file in files:
    for table in tables:
        ensure_file_has_been_downloaded(file+table+".csv")

File data/others/careplans.csv already exists, skipping download
File data/others/conditions.csv already exists, skipping download
File data/others/observations.csv already exists, skipping download
File data/others/patients.csv already exists, skipping download
File data/asthma/careplans.csv already exists, skipping download
File data/asthma/conditions.csv already exists, skipping download
File data/asthma/observations.csv already exists, skipping download
File data/asthma/patients.csv already exists, skipping download
File data/gallstones/careplans.csv already exists, skipping download
File data/gallstones/conditions.csv already exists, skipping download
File data/gallstones/observations.csv already exists, skipping download
File data/gallstones/patients.csv already exists, skipping download
File data/hypertension/careplans.csv already exists, skipping download
File data/hypertension/conditions.csv already exists, skipping download
File data/hypertension/observations.csv already exis

In [152]:
from logger import Logger
from test_executer import TestExecutor

logger = Logger()
testExecutor = TestExecutor(logger)

## Mit Datenbank verbinden

In [153]:
import extract
import time

databaseFile = "data.sqlite"

logger.startTimeMeasurement('open-db', 'Connected to db and created tables')
connection = extract.connect_to_db(logger, databaseFile)  # create table patients, observations, conditions, careplans
logger.endTimeMeasurement('open-db')

In [154]:
def test_sqliteConnection(_logger):
    cursor = connection.cursor()
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
    tablesInDb = list(map(lambda tableResult: tableResult[0], cursor.fetchall()))
    tablesInDb.sort()

    for table in tables:
        if not(table in tablesInDb):
            raise Exception('Table not found:', table)

testExecutor.execute('Test connection to database', test_sqliteConnection)

{"type": "info", "time": 1656318806675, "message": "✅ Test ran successfully: Test connection to database", "params": null}


## Daten in Datenbank laden

Lade der verwendete Daten in die Datenbank:

-   careplans
-   conditions
-   observations
-   patients

In [155]:
logger.startTimeMeasurement('load-data', 'Loading data into db')
for file in files:
    for table in tables:
        extract.insert_values_to_table(logger, connection.cursor(), table, "../"+ file + table + ".csv")  # TODO: insert ALL values in the right tables 
        connection.commit()
    logger.endTimeMeasurement('load-data')

{"type": "info", "time": 1656318809505, "message": "🏗 Extracting data from ../data/others/careplans.csv", "params": null}
{"type": "info", "time": 1656318809524, "message": "🏗 Extracting data from ../data/others/conditions.csv", "params": null}
{"type": "info", "time": 1656318809645, "message": "🏗 Extracting data from ../data/others/observations.csv", "params": null}
{"type": "info", "time": 1656318810430, "message": "🏗 Extracting data from ../data/others/patients.csv", "params": null}
{"type": "info", "time": 1656318810442, "message": "🏗 Extracting data from ../data/asthma/careplans.csv", "params": null}
{"type": "info", "time": 1656318810465, "message": "🏗 Extracting data from ../data/asthma/conditions.csv", "params": null}
{"type": "info", "time": 1656318810608, "message": "🏗 Extracting data from ../data/asthma/observations.csv", "params": null}
{"type": "info", "time": 1656318813414, "message": "🏗 Extracting data from ../data/asthma/patients.csv", "params": null}
{"type": "info", "

## Messung der Datenfehler

Für unsere Forschungsfrage sind nur alle Daten mit gemessenen BMI relevant. Wurde dieser nicht vermessen oder eingetragen können die Daten für die Forschungsfrage nicht verwendet werden und sind somit unbrauchbar.

In [156]:
all_patients_query = """
SELECT COUNT(id) FROM patients;"""
count_bmi_query = """
SELECT COUNT(distinct id) FROM patients JOIN observations on patients.id == observations.patient WHERE observations.Code = '59576-9'"""

count_all_bmi_query = f"""
SELECT COUNT(patient) FROM observations WHERE observations.Code = '59576-9'"""

patient_all_count = connection.execute(all_patients_query).fetchall()[0][0]
patient_bmi_count = connection.execute(count_bmi_query).fetchall()[0][0]
bmi_count = connection.execute(count_all_bmi_query).fetchall()[0][0]
ratio = round(patient_bmi_count/patient_all_count, 3) * 100

logger.log(f"Total num of patients {patient_all_count}.")
logger.log(f"Found {patient_bmi_count} patients ({ratio}%) with {bmi_count} BMI-values.")

{"type": "info", "time": 1656318820072, "message": "Total num of patients 396.", "params": null}
{"type": "info", "time": 1656318820074, "message": "Found 140 patients (35.4%) with 1157 BMI-values.", "params": null}


In [157]:
import pandas as pd
cursor = connection.cursor()

In [158]:
print(pd.read_sql_query('''SELECT COUNT(DISTINCT ID) FROM PATIENTS''', connection))

   COUNT(DISTINCT ID)
0                 396


In [159]:
# print tables of the database
cursor.execute('SELECT name FROM sqlite_master where type="table"') 
print(cursor.fetchall())

[('patients',), ('encounters',), ('careplans',), ('conditions',), ('procedures',), ('observations',), ('immunizations',)]


# Star Schema

#### Erstellen Dimensionstabellen

Tabelle patients_

In [None]:
# table patients_
cursor.execute('''DROP TABLE IF EXISTS patients_;''')
cursor.execute('''
        CREATE TABLE patients_ ( 
        ID STRING PRIMARY KEY UNIQUE,
        BIRTHDATE DATE,
        RACE STRING, 
        ETHNICITY STRING,
        GENDER STRING
        );''')
   
cursor.execute('INSERT INTO patients_ (ID, BIRTHDATE, RACE, ETHNICITY, GENDER) SELECT ID, BIRTHDATE, RACE, ETHNICITY, GENDER FROM PATIENTS;')

print(pd.read_sql_query("PRAGMA table_info(patients_)", connection))

Tabelle observations_

In [None]:
# table observations_
cursor.execute('''DROP TABLE IF EXISTS observations_;''')
cursor.execute('''
        CREATE TABLE observations_ ( 
        CODE STRING PRIMARY KEY UNIQUE,
        DESCRIPTION STRING,
        UNITS STRING, 
        TYPE STRING
        );''')

#cursor.execute('''INSERT INTO observations_ (CODE) SELECT DISTINCT CODE FROM OBSERVATIONS;''')
cursor.execute('''INSERT INTO observations_ (CODE, DESCRIPTION, UNITS, TYPE) SELECT CODE, DESCRIPTION, UNITS, TYPE FROM OBSERVATIONS;''')

print(pd.read_sql_query("PRAGMA table_info(observations_)", connection))
#print(pd.read_sql_query("SELECT * FROM observations_", connection))

Tabelle careplans_code

In [None]:
# table careplans_code
cursor.execute('''DROP TABLE IF EXISTS careplans_code;''')
cursor.execute('''
        CREATE TABLE careplans_code ( 
        CODE STRING PRIMARY KEY UNIQUE,
        DESCRIPTION STRING
        );''')
   
cursor.execute('INSERT INTO careplans_code (CODE, DESCRIPTION) SELECT CODE, DESCRIPTION FROM careplans;')

print(pd.read_sql_query("PRAGMA table_info(careplans_code)", connection))
#print(pd.read_sql_query("SELECT * FROM careplans_code", connection))

Tabelle careplans_reasoncode

In [None]:
# table careplans_reasoncode
cursor.execute('''DROP TABLE IF EXISTS careplans_reasoncode;''')
cursor.execute('''
        CREATE TABLE careplans_reasoncode ( 
        REASONCODE STRING PRIMARY KEY UNIQUE,
        REASONDESCRIPTION STRING
        );''')
   
cursor.execute('INSERT INTO careplans_reasoncode (REASONCODE, REASONDESCRIPTION) SELECT REASONCODE, REASONDESCRIPTION FROM careplans;')

print(pd.read_sql_query("PRAGMA table_info(careplans_reasoncode)", connection))
#print(pd.read_sql_query("SELECT * FROM careplans_reasoncode", connection))

Tabelle conditions_

In [None]:
# table conditions
cursor.execute('''DROP TABLE IF EXISTS conditions_;''')
cursor.execute('''
        CREATE TABLE conditions_ ( 
        CODE STRING PRIMARY KEY UNIQUE,
        DESCRIPTION STRING
        );''')
   
cursor.execute('INSERT INTO conditions_ (CODE, DESCRIPTION) SELECT CODE, DESCRIPTION FROM conditions;')

print(pd.read_sql_query("PRAGMA table_info(conditions_)", connection))
#print(pd.read_sql_query("SELECT * FROM conditions_", connection))

#### Erstelle Faktentabelle

In [None]:
cursor.execute('''DROP TABLE IF EXISTS fact_table;''')
cursor.execute('''
        CREATE TABLE fact_table ( 
        PATIENT_ID STRING,
        OBSERVATION_CODE STRING,
        VALUE STRING,
        DATE DATE,
        CONDITIONS_CODE STRING,
        ENDDATE DATE,
        CAREPLANS_CODE STRING,
        CAREPLANS_REASONCODE STRING,
        FOREIGN KEY (PATIENT_ID)
            REFERENCES patients_(ID),
        FOREIGN KEY (OBSERVATION_CODE)
            REFERENCES observations_(CODE),
        FOREIGN KEY (CONDITIONS_CODE)
            REFERENCES conditions_(CODE),
        FOREIGN KEY (CAREPLANS_CODE)
            REFERENCES careplans_code(CODE),
        FOREIGN KEY (CAREPLANS_REASONCODE)
            REFERENCES careplans_reasoncode(REASONCODE)
        );''')

cursor.execute('''INSERT INTO fact_table        
                    (PATIENT_ID, OBSERVATION_CODE, VALUE, DATE) 
                    SELECT PATIENT, CODE, VALUE, DATE 
                    FROM OBSERVATIONS
                    ;''')

cursor.execute('''INSERT INTO fact_table        
                    (PATIENT_ID, CAREPLANS_CODE, DATE, ENDDATE) 
                    SELECT PATIENT, CODE, START, STOP 
                    FROM CAREPLANS
                    ;''')

cursor.execute('''INSERT INTO fact_table        
                    (PATIENT_ID, CAREPLANS_REASONCODE, DATE, ENDDATE) 
                    SELECT PATIENT, REASONCODE, START, STOP 
                    FROM CAREPLANS
                    ;''')

cursor.execute('''INSERT INTO fact_table        
                    (PATIENT_ID, CONDITIONS_CODE, DATE, ENDDATE) 
                    SELECT PATIENT, CODE, START, STOP 
                    FROM CONDITIONS
                    ;''')

In [None]:
print(pd.read_sql_query("PRAGMA foreign_key_list(fact_table)", connection))

In [None]:
cursor.execute('SELECT name FROM sqlite_master where type="table"')
print(cursor.fetchall())

## Analysis

In [None]:
print(pd.read_sql_query("SELECT reasoncode FROM careplans_reasoncode WHERE reasondescription='Diabetes'", connection))

In [None]:
print(pd.read_sql_query('''SELECT fact_table.patient_ID
                        FROM fact_table
                        WHERE careplans_reasoncode=
                        (SELECT reasoncode FROM careplans_reasoncode WHERE reasondescription='Diabetes' OR reasondescription='Prediabetes')
                        ''', connection))

## Aufräumen & Logs speichern

In [35]:
connection.close()
#logger.logTimings()
#logger.writeToFile("../artefacts-for-release/analysis-log.json")