# ETL-Process

In [31]:
%reset -f

# Genutzte Umgebung

In [32]:
import pandas as pd
from logger import Logger
from test_executer import TestExecutor
import extract
import sys
import numpy as np
import hashlib

logger = Logger()
testExecutor = TestExecutor(logger)

dependencies = [
    ('system', sys.version_info),
    ('pandas', pd.__version__),
    ('numpy', np.__version__),
]
for dependency in dependencies:
    logger.log(f"{dependency[0]} is installed with version {dependency[1]}")

{"type": "info", "time": 1657818070484, "message": "system is installed with version sys.version_info(major=3, minor=8, micro=10, releaselevel='final', serial=0)", "params": null}
{"type": "info", "time": 1657818070484, "message": "pandas is installed with version 1.4.3", "params": null}
{"type": "info", "time": 1657818070484, "message": "numpy is installed with version 1.23.0", "params": null}


## Setup der Daten

Zuerst laden wir die benötigten Daten herunter und initialisieren die genutzten Python Objekte.

In [33]:
tables = [
    "careplans",
    "conditions",
    "observations",
    "patients",
]

files = [
    "data/others/",
    "data/asthma/",
    "data/gallstones/",
    "data/hypertension/",
]

md5Hashes = {
    "data/others/careplans.csv": "365403e27541792755361bc0f6125506",
    "data/others/conditions.csv": "ce0034e9ed9185b7d4c408ee9916de18",
    "data/others/observations.csv": "b9e3bf1b033dc4af7f7ade78a48a50a4",
    "data/others/patients.csv": "530570c8e30b77a822b37e927d1486b2",
    "data/asthma/careplans.csv": "3c4aff1d0d576de6624c3726ec2dd544",
    "data/asthma/conditions.csv": "e7965095ec41ef88498540341c79c49e",
    "data/asthma/observations.csv": "1b8583de62d4d9e80c224005d74dd736",
    "data/asthma/patients.csv": "b139ef00c850308c3d3f8e7fa0f97724",
    "data/gallstones/careplans.csv": "35399fb01b2771b770c2f9f312e62dc2",
    "data/gallstones/conditions.csv": "8a19bf13191cf074c64534c2fa01f15c",
    "data/gallstones/observations.csv": "9d3807dc05cd7b4ccc3f0ee7b4f7b55e",
    "data/gallstones/patients.csv": "3766f46941ee2155e0d1ed6e749e8ba7",
    "data/hypertension/careplans.csv": "ddf053c4e56f24c4ce28e6d57edfd8b1",
    "data/hypertension/conditions.csv": "8310cdc07924b48e07aa841f9075b488",
    "data/hypertension/observations.csv": "f7564c732eebe9ace17a46e50b3cc857",
    "data/hypertension/patients.csv": "2ebdf6b168e9c968ffa949463cd074e7",
}

In [34]:
!mkdir -p data/allergy

In [35]:
from urllib.request import urlopen
import os

def ensure_file_has_been_downloaded(filename):
    full_filename = "../" + filename

    url = "https://raw.githubusercontent.com/Fuenfgeld/DMA2022DataProjectC/main/" + filename
    if os.path.isfile(full_filename):
        logger.log("File {} already exists, skipping download".format(filename))
    else:
        logger.log("Downloading {}".format(filename))
        download_file(url, full_filename)

def download_file(url, filename):
    with open(filename, 'wb') as out_file:
        with urlopen(url) as file:
            out_file.write(file.read())

if not os.path.isfile("extract.py"):
    download_file(
        "https://raw.githubusercontent.com/Fuenfgeld/DMA2022DataProjectC/main/src/extract.py",
        "extract.py"
    )

dataChanged = False
for file in files:
    for table in tables:
        filename = file+table+".csv"
        ensure_file_has_been_downloaded(filename)

        with open("../" + filename) as fileHandle:
            fileContent = fileHandle.read()
            fileHandle.close()

        md5Hash = hashlib.md5(fileContent.encode()).hexdigest()
        if md5Hashes[filename] != md5Hash:
            dataChanged = True
    
if dataChanged:
    logger.log("❌ Data set changed")
else:
    logger.log("✅ Using original data set")

{"type": "info", "time": 1657818070713, "message": "File data/others/careplans.csv already exists, skipping download", "params": null}
{"type": "info", "time": 1657818070714, "message": "File data/others/conditions.csv already exists, skipping download", "params": null}
{"type": "info", "time": 1657818070715, "message": "File data/others/observations.csv already exists, skipping download", "params": null}
{"type": "info", "time": 1657818070739, "message": "File data/others/patients.csv already exists, skipping download", "params": null}
{"type": "info", "time": 1657818070739, "message": "File data/asthma/careplans.csv already exists, skipping download", "params": null}
{"type": "info", "time": 1657818070740, "message": "File data/asthma/conditions.csv already exists, skipping download", "params": null}
{"type": "info", "time": 1657818070742, "message": "File data/asthma/observations.csv already exists, skipping download", "params": null}
{"type": "info", "time": 1657818070844, "message

## Mit Datenbank verbinden

In [36]:
databaseFile = "data.sqlite"

logger.startTimeMeasurement('open-db', 'Connected to db and created tables')
connection = extract.connect_to_db(logger, databaseFile)  # create table patients, observations, conditions, careplans
logger.endTimeMeasurement('open-db')

In [37]:
def test_sqliteConnection(_logger):
    cursor = connection.cursor()
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
    tablesInDb = list(map(lambda tableResult: tableResult[0], cursor.fetchall()))
    tablesInDb.sort()

    for table in tables:
        if not(table in tablesInDb):
            raise Exception('Table not found:', table)

testExecutor.execute('Test connection to database', test_sqliteConnection)

{"type": "info", "time": 1657818071067, "message": "✅ Test ran successfully: Test connection to database", "params": null}


## Daten in Datenbank laden

Lade der verwendete Daten in die Datenbank:

-   careplans
-   conditions
-   observations
-   patients

In [38]:
logger.startTimeMeasurement('load-data', 'Loading data into db')
for file in files:
    for table in tables:
        extract.insert_values_to_table(logger, connection.cursor(), table, "../"+ file + table + ".csv")  # TODO: insert ALL values in the right tables 
        connection.commit()

logger.endTimeMeasurement('load-data')

{"type": "info", "time": 1657818071089, "message": "🏗 Extracting data from ../data/others/careplans.csv", "params": null}
{"type": "info", "time": 1657818071095, "message": "🏗 Extracting data from ../data/others/conditions.csv", "params": null}
{"type": "info", "time": 1657818071111, "message": "🏗 Extracting data from ../data/others/observations.csv", "params": null}
{"type": "info", "time": 1657818071415, "message": "🏗 Extracting data from ../data/others/patients.csv", "params": null}
{"type": "info", "time": 1657818071420, "message": "🏗 Extracting data from ../data/asthma/careplans.csv", "params": null}
{"type": "info", "time": 1657818071429, "message": "🏗 Extracting data from ../data/asthma/conditions.csv", "params": null}
{"type": "info", "time": 1657818071499, "message": "🏗 Extracting data from ../data/asthma/observations.csv", "params": null}
{"type": "info", "time": 1657818072487, "message": "🏗 Extracting data from ../data/asthma/patients.csv", "params": null}
{"type": "info", "

# Annonymisierung

In [39]:
import pandas as pd
import random
import hashlib

patientDf = pd.read_sql_query('SELECT * FROM patients;', connection)
careplansDf = pd.read_sql_query('SELECT * FROM careplans;', connection)
conditionsDf = pd.read_sql_query('SELECT * FROM conditions;', connection)
observationsDf = pd.read_sql_query('SELECT * FROM observations;', connection)

patientIds = [*patientDf.Id, *careplansDf.PATIENT, *conditionsDf.PATIENT, *observationsDf.PATIENT]

def test_sanityCheckCombiningIds(_logger): 
    expectedLen = len(patientDf) + len(careplansDf) + len(conditionsDf) + len(observationsDf)
    actualLen = len(patientIds)
    if actualLen != expectedLen:
        raise Exception('Not all patient ids were concatenated')

testExecutor.execute('Sanity check: extracting all ids worked', test_sanityCheckCombiningIds)

# Converts list to a set with only unique values
uniqueIds = set(patientIds)
logger.log(f"{len(uniqueIds)} unique patient ids found")
if len(uniqueIds) >= len(patientDf.Id):
    logger.log(f"⚠️ The dataset contains {len(uniqueIds)} unique patientIds but only {len(patientDf.Id)} patients.")

annonymizedIds = {}
for id in uniqueIds:
    # Use uppercase here so it is easy to see if annonymized ids are used.
    annonymizedIds[id] = hashlib.sha256(f"{id}={random.random()}".encode()).hexdigest().upper()

def test_sanityEnsureAllIdsAreAnnonymized(_logger): 
    for id in patientIds:
        if id in annonymizedIds:
            raise Exception('A origin id still exists in anonnymized id list')

testExecutor.execute('Sanity check: no origin ids exist anymore', test_sanityCheckCombiningIds)

logger.startTimeMeasurement('annonymizedPatients', 'Writing annonymized patients')
patientDf = patientDf.replace({"Id": annonymizedIds})
patientDf.to_sql(name="anonnymized_patients", con=connection, if_exists='replace')
logger.endTimeMeasurement('annonymizedPatients')

logger.startTimeMeasurement('annonymizedCareplans', 'Writing annonymized careplans')
careplansDf = careplansDf.replace({"PATIENT": annonymizedIds})
careplansDf.to_sql(name="anonnymized_careplans", con=connection, if_exists='replace')
logger.endTimeMeasurement('annonymizedCareplans')

logger.startTimeMeasurement('annonymizedConditions', 'Writing annonymized conditions')
conditionsDf = conditionsDf.replace({"PATIENT": annonymizedIds})
conditionsDf.to_sql(name="anonnymized_conditions", con=connection, if_exists='replace')
logger.endTimeMeasurement('annonymizedConditions')

logger.startTimeMeasurement('annonymizedObservations', 'Writing annonymized Observations')
observationsDf = observationsDf.replace({"PATIENT": annonymizedIds})
observationsDf.to_sql(name="anonnymized_observations", con=connection, if_exists='replace')
logger.endTimeMeasurement('annonymizedObservations')

{"type": "info", "time": 1657818075967, "message": "✅ Test ran successfully: Sanity check: extracting all ids worked", "params": null}
{"type": "info", "time": 1657818076001, "message": "1330 unique patient ids found", "params": null}
{"type": "info", "time": 1657818076002, "message": "⚠️ The dataset contains 1330 unique patientIds but only 1326 patients.", "params": null}
{"type": "info", "time": 1657818076003, "message": "✅ Test ran successfully: Sanity check: no origin ids exist anymore", "params": null}


## Messung der Datenfehler

Für unsere Forschungsfrage sind nur alle Daten mit gemessenen BMI relevant. Wurde dieser nicht vermessen oder eingetragen können die Daten für die Forschungsfrage nicht verwendet werden und sind somit unbrauchbar.

### NULL-Values

Die Rohdaten werden zuvor auf die Anzahl an NULL-Values überprüft. Weisen mehr als **ein drittel der Daten**   Lücken in der Codierung auf, wird ein Fehler in der Verfassung angenommen und die Daten müssen manuell Überprüft werden.

In [40]:
null_counter = 0
num_of_elements = 0
for table in tables:
    querie = f"SELECT * from {table};"
    df = pd.read_sql_query(querie,connection)
    result_string = str(df.isna().sum()).replace("\n"," NULL-Values in Column ")
    logger.log(f"Found {result_string} null-values in {table}.")
    null_counter = df.isna().sum().sum() + null_counter
    num_of_elements = num_of_elements + df.size
perc_null_val = round(null_counter / num_of_elements,3)

if perc_null_val > 0.33:
    logger.log(f"Found {perc_null_val} null-values.",type='Warning')
else:
    logger.log(f"Found {perc_null_val} null-values.")

{"type": "info", "time": 1657818129142, "message": "Found Id                   0 NULL-Values in Column START                0 NULL-Values in Column STOP                 0 NULL-Values in Column PATIENT              0 NULL-Values in Column ENCOUNTER            0 NULL-Values in Column CODE                 0 NULL-Values in Column DESCRIPTION          0 NULL-Values in Column REASONCODE           0 NULL-Values in Column REASONDESCRIPTION    0 NULL-Values in Column dtype: int64 null-values in careplans.", "params": null}
{"type": "info", "time": 1657818129212, "message": "Found START          0 NULL-Values in Column STOP           0 NULL-Values in Column PATIENT        0 NULL-Values in Column ENCOUNTER      0 NULL-Values in Column CODE           0 NULL-Values in Column DESCRIPTION    0 NULL-Values in Column dtype: int64 null-values in conditions.", "params": null}
{"type": "info", "time": 1657818130965, "message": "Found DATE                0 NULL-Values in Column PATIENT             0 NULL-V

### Prüfung auf Duplikate

Duplikate verfälschen die  Ergebnisse des Anlyseteil durch Steigerung der Grundgesamheit mit gleichen Werten. Somit müssen die Daten auf Duplikate in den einzelnen Files überprüft werden, um gleiche Messungen zu finden und gegebenfalls im ETL-Process zu entfernen.


In [41]:
num_of_duplicates = 0
num_of_elements = 0
for table in tables:
    querie = f"SELECT * from {table};"
    df = pd.read_sql_query(querie,connection)
    duplicates = df.groupby(df.columns.tolist()).size().reset_index().\
    rename(columns={0:'records'})
    curr_num_duplicate = (duplicates.records -1).sum() 
    num_of_duplicates = num_of_duplicates + curr_num_duplicate
    logger.log(f"Found {curr_num_duplicate} duplicate-values in {table}.")
    num_of_elements = num_of_elements + df.size
perc_duplicates = round(num_of_duplicates / num_of_elements,3)
logger.log(f"Found {perc_duplicates} duplicate-values.")

{"type": "info", "time": 1657818131815, "message": "Found 0 duplicate-values in careplans.", "params": null}
{"type": "info", "time": 1657818131924, "message": "Found 0 duplicate-values in conditions.", "params": null}
{"type": "info", "time": 1657818134069, "message": "Found 20608 duplicate-values in observations.", "params": null}
{"type": "info", "time": 1657818134234, "message": "Found 0 duplicate-values in patients.", "params": null}
{"type": "info", "time": 1657818134234, "message": "Found 0.003 duplicate-values.", "params": null}


### Prozentuales Anzahl von Gewichts und BMI Werten für Patieten

Für unsere Forschungsfrage sind BMI-Werte relevant und müssen für den Patienten mindestes einmal codiert worden sein. Um die Forschungsfrage mit den zur vorliegenden Daten zu beanworten, sollten auch hier mindestens **ein drittel der Daten** mit einen BMI Codiert worden sein.


In [42]:
all_patients_query = """
SELECT COUNT(id) FROM patients;"""
count_bmi_query = """
SELECT COUNT(distinct id) FROM patients JOIN observations on patients.id == observations.patient WHERE observations.Code = '59576-9'"""

count_all_bmi_query = f"""
SELECT COUNT(patient) FROM observations WHERE observations.Code = '59576-9'"""

patient_all_count = connection.execute(all_patients_query).fetchall()[0][0]
patient_bmi_count = connection.execute(count_bmi_query).fetchall()[0][0]
bmi_count = connection.execute(count_all_bmi_query).fetchall()[0][0]
ratio = round(patient_bmi_count/patient_all_count, 3) 

logger.log(f"Total num of patients {patient_all_count}.")

if ratio > 0.33:
    logger.log(f"Found {patient_bmi_count} patients ({round(ratio*100,3)}%) with {bmi_count} BMI-values.")
else:
    logger.log(f"Found {patient_bmi_count} patients ({round(ratio*100,3)}%) with {bmi_count} BMI-values.")


{"type": "info", "time": 1657818134384, "message": "Total num of patients 1326.", "params": null}
{"type": "info", "time": 1657818134384, "message": "Found 450 patients (33.9%) with 3539 BMI-values.", "params": null}


# Star Schema

In [43]:
cursor = connection.cursor()

#### Erstellen Dimensionstabellen

Tabelle patients_

In [44]:
# table patients_
cursor.execute('''DROP TABLE IF EXISTS patients_;''')
cursor.execute('''
        CREATE TABLE patients_ ( 
        ID STRING PRIMARY KEY UNIQUE,
        RACE STRING, 
        ETHNICITY STRING,
        GENDER STRING,
        AGE INT64
        );''')

<sqlite3.Cursor at 0x7f36dd0c3650>

In [45]:
# create df patients
cursor.execute('''SELECT ID, BIRTHDATE, DEATHDATE, RACE, ETHNICITY, GENDER FROM PATIENTS;''')
df_patients = pd.DataFrame(cursor.fetchall(), columns=['ID', 'BIRTHDATE', 'DEATHDATE', 'RACE', 'ETHNICITY', 'GENDER'])

In [46]:
# convert to date
df_patients["DEATHDATE"] = pd.to_datetime(df_patients["DEATHDATE"])
df_patients["BIRTHDATE"] = pd.to_datetime(df_patients["BIRTHDATE"])
# fill null values withh todays date
df_patients['DEATHDATE'] = df_patients.DEATHDATE.fillna(pd.to_datetime("today"))
# calculate age
df_patients["AGE"] = df_patients.DEATHDATE.dt.year - df_patients.BIRTHDATE.dt.year
# drop unnecessary variables
df_patients = df_patients.drop(['BIRTHDATE', 'DEATHDATE'], axis=1)

In [47]:
df_patients.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1326 entries, 0 to 1325
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ID         1326 non-null   object
 1   RACE       1326 non-null   object
 2   ETHNICITY  1326 non-null   object
 3   GENDER     1326 non-null   object
 4   AGE        1326 non-null   int64 
dtypes: int64(1), object(4)
memory usage: 51.9+ KB


In [48]:
print("Number of Duplicated Rows", df_patients.duplicated(df_patients.columns).sum())

Number of Duplicated Rows 0


In [49]:
df_patients.to_sql('df_patients', connection, if_exists='replace', index=False)
cursor.execute('INSERT INTO patients_ (ID, RACE, ETHNICITY, GENDER, AGE) SELECT ID, RACE, ETHNICITY, GENDER, RACE FROM df_patients;')
cursor.execute('''DROP TABLE IF EXISTS df_patients;''')

print(pd.read_sql_query("PRAGMA table_info(patients_)", connection))

   cid       name    type  notnull dflt_value  pk
0    0         ID  STRING        0       None   1
1    1       RACE  STRING        0       None   0
2    2  ETHNICITY  STRING        0       None   0
3    3     GENDER  STRING        0       None   0
4    4        AGE   INT64        0       None   0


Tabelle observations_

In [50]:
# table observations_
cursor.execute('''DROP TABLE IF EXISTS observations_;''')
cursor.execute('''
        CREATE TABLE observations_ ( 
        CODE STRING PRIMARY KEY UNIQUE,
        DESCRIPTION STRING,
        UNITS STRING, 
        TYPE STRING
        );''')

# create df observation
cursor.execute('''SELECT CODE, DESCRIPTION, UNITS, TYPE FROM OBSERVATIONS;''')
df_observations = pd.DataFrame(cursor.fetchall(), columns=['CODE','DESCRIPTION', 'UNITS', 'TYPE'])

df_observations = df_observations.drop_duplicates(subset='CODE')

# transform dt in table
df_observations.to_sql('df_observations', connection, if_exists='replace', index=False)

cursor.execute('''INSERT INTO observations_ (CODE, DESCRIPTION, UNITS, TYPE) SELECT CODE, DESCRIPTION, UNITS, TYPE FROM df_observations;''')

cursor.execute('''DROP TABLE IF EXISTS df_observations;''')
print(pd.read_sql_query("PRAGMA table_info(observations_)", connection))
#print(pd.read_sql_query("SELECT * FROM observations_", connection))

   cid         name    type  notnull dflt_value  pk
0    0         CODE  STRING        0       None   1
1    1  DESCRIPTION  STRING        0       None   0
2    2        UNITS  STRING        0       None   0
3    3         TYPE  STRING        0       None   0


Tabelle careplans_code

In [51]:
# table careplans_code
cursor.execute('''DROP TABLE IF EXISTS careplans_code;''')
cursor.execute('''
        CREATE TABLE careplans_code ( 
        CODE STRING PRIMARY KEY UNIQUE,
        DESCRIPTION STRING
        );''')

# create df careplans_code
cursor.execute('''SELECT CODE, DESCRIPTION FROM CAREPLANS;''')
df_careplans_code = pd.DataFrame(cursor.fetchall(), columns=['CODE','DESCRIPTION'])

df_careplans_code = df_careplans_code.drop_duplicates(subset='CODE')

# transform dt in table
df_careplans_code.to_sql('df_careplans_code', connection, if_exists='replace', index=False)

cursor.execute('''INSERT INTO careplans_code (CODE, DESCRIPTION) SELECT CODE, DESCRIPTION FROM df_careplans_code;''')

cursor.execute('''DROP TABLE IF EXISTS df_careplans_code;''')
print(pd.read_sql_query("PRAGMA table_info(careplans_code)", connection))
#print(pd.read_sql_query("SELECT * FROM careplans_code", connection))

   cid         name    type  notnull dflt_value  pk
0    0         CODE  STRING        0       None   1
1    1  DESCRIPTION  STRING        0       None   0


Tabelle careplans_reasoncode

In [52]:
# table careplans_reasoncode
cursor.execute('''DROP TABLE IF EXISTS careplans_reasoncode;''')
cursor.execute('''
        CREATE TABLE careplans_reasoncode ( 
        REASONCODE STRING PRIMARY KEY UNIQUE,
        REASONDESCRIPTION STRING
        );''')

# create df careplans_code
cursor.execute('''SELECT REASONCODE, REASONDESCRIPTION FROM CAREPLANS;''')
df_careplans_reasoncode = pd.DataFrame(cursor.fetchall(), columns=['REASONCODE','REASONDESCRIPTION'])

df_careplans_reasoncode = df_careplans_reasoncode.drop_duplicates(subset='REASONCODE')

# transform dt in table
df_careplans_reasoncode.to_sql('df_careplans_reasoncode', connection, if_exists='replace', index=False)
   
cursor.execute('INSERT INTO careplans_reasoncode (REASONCODE, REASONDESCRIPTION) SELECT REASONCODE, REASONDESCRIPTION FROM df_careplans_reasoncode;')

cursor.execute('''DROP TABLE IF EXISTS df_careplans_reasoncode;''')
print(pd.read_sql_query("PRAGMA table_info(careplans_reasoncode)", connection))
#print(pd.read_sql_query("SELECT * FROM careplans_reasoncode", connection))

   cid               name    type  notnull dflt_value  pk
0    0         REASONCODE  STRING        0       None   1
1    1  REASONDESCRIPTION  STRING        0       None   0


Tabelle conditions_

In [53]:
# table conditions
cursor.execute('''DROP TABLE IF EXISTS conditions_;''')
cursor.execute('''
        CREATE TABLE conditions_ ( 
        CODE STRING PRIMARY KEY UNIQUE,
        DESCRIPTION STRING
        );''')

# create df conditions
cursor.execute('''SELECT CODE, DESCRIPTION FROM CONDITIONS;''')
df_conditions = pd.DataFrame(cursor.fetchall(), columns=['CODE','DESCRIPTION'])

df_conditions = df_conditions.drop_duplicates(subset='CODE')

# transform dt in table
df_conditions.to_sql('df_conditions', connection, if_exists='replace', index=False)

cursor.execute('INSERT INTO conditions_ (CODE, DESCRIPTION) SELECT CODE, DESCRIPTION FROM df_conditions;')

cursor.execute('''DROP TABLE IF EXISTS df_conditions;''')
print(pd.read_sql_query("PRAGMA table_info(conditions_)", connection))
#print(pd.read_sql_query("SELECT * FROM conditions_", connection))

   cid         name    type  notnull dflt_value  pk
0    0         CODE  STRING        0       None   1
1    1  DESCRIPTION  STRING        0       None   0


Datumstabelle

In [54]:
from datetime import datetime
def create_date_table(start='1900-01-01', end=datetime.today().strftime('%Y-%m-%d')):
    
    df_date = pd.DataFrame({"Date": pd.date_range(start, end)})

    days_names = {
        i: name
        for i, name
        in enumerate(['Monday', 'Tuesday', 'Wednesday',
                      'Thursday', 'Friday', 'Saturday', 
                      'Sunday'])
    }
   
    df_date["Day"] = df_date.Date.dt.dayofweek.map(days_names.get)
    df_date["Week"] = df_date.Date.dt.weekofyear
    df_date["Quarter"] = df_date.Date.dt.quarter
    df_date["Year"] = df_date.Date.dt.year
    df_date["Year_half"] = (df_date.Quarter + 1) // 2
    
    return df_date

In [55]:
# table date_table
cursor.execute('''DROP TABLE IF EXISTS date_table;''')
cursor.execute('''
        CREATE TABLE date_table ( 
        DATE DATE PRIMARY KEY UNIQUE,
        DAY STRING,
        WEEK INT16,
        QUARTER INT16,
        YEAR INT16,
        YEAR_HALF INT16
        );''')

<sqlite3.Cursor at 0x7f36dd0c3650>

In [56]:
df_date = create_date_table()

# transform dt in table
df_date.to_sql('df_date', connection, if_exists='replace', index=False)

cursor.execute('INSERT INTO date_table (DATE, DAY, WEEK, QUARTER, YEAR, YEAR_HALF) SELECT DATE, DAY, WEEK, QUARTER, YEAR, YEAR_HALF FROM df_date;')

cursor.execute('''DROP TABLE IF EXISTS df_date;''')
print(pd.read_sql_query("PRAGMA table_info(date_table)", connection))

   cid       name    type  notnull dflt_value  pk
0    0       DATE    DATE        0       None   1
1    1        DAY  STRING        0       None   0
2    2       WEEK   INT16        0       None   0
3    3    QUARTER   INT16        0       None   0
4    4       YEAR   INT16        0       None   0
5    5  YEAR_HALF   INT16        0       None   0


  df_date["Week"] = df_date.Date.dt.weekofyear


#### Erstelle Faktentabelle

In [57]:
cursor.execute('''DROP TABLE IF EXISTS fact_table;''')
cursor.execute('''
        CREATE TABLE fact_table ( 
        PATIENT_ID STRING,
        OBSERVATION_CODE STRING,
        VALUE STRING,
        DATE DATE,
        CONDITIONS_CODE STRING,
        ENDDATE DATE,
        CAREPLANS_CODE STRING,
        CAREPLANS_REASONCODE STRING,
        FOREIGN KEY (PATIENT_ID)
            REFERENCES patients_(ID),
        FOREIGN KEY (OBSERVATION_CODE)
            REFERENCES observations_(CODE),
        FOREIGN KEY (CONDITIONS_CODE)
            REFERENCES conditions_(CODE),
        FOREIGN KEY (CAREPLANS_CODE)
            REFERENCES careplans_code(CODE),
        FOREIGN KEY (CAREPLANS_REASONCODE)
            REFERENCES careplans_reasoncode(REASONCODE)
        );''')

cursor.execute('''INSERT INTO fact_table        
                    (PATIENT_ID, OBSERVATION_CODE, VALUE, DATE) 
                    SELECT PATIENT, CODE, VALUE, DATE 
                    FROM OBSERVATIONS
                    ;''')

cursor.execute('''INSERT INTO fact_table        
                    (PATIENT_ID, CAREPLANS_CODE, DATE, ENDDATE) 
                    SELECT PATIENT, CODE, START, STOP 
                    FROM CAREPLANS
                    ;''')

cursor.execute('''INSERT INTO fact_table        
                    (PATIENT_ID, CAREPLANS_REASONCODE, DATE, ENDDATE) 
                    SELECT PATIENT, REASONCODE, START, STOP 
                    FROM CAREPLANS
                    ;''')

cursor.execute('''INSERT INTO fact_table        
                    (PATIENT_ID, CONDITIONS_CODE, DATE, ENDDATE) 
                    SELECT PATIENT, CODE, START, STOP 
                    FROM CONDITIONS
                    ;''')
connection.commit()

In [58]:
print(pd.read_sql_query("PRAGMA foreign_key_list(fact_table)", connection))

   id  seq                 table                  from          to  on_update  \
0   0    0  careplans_reasoncode  CAREPLANS_REASONCODE  REASONCODE  NO ACTION   
1   1    0        careplans_code        CAREPLANS_CODE        CODE  NO ACTION   
2   2    0           conditions_       CONDITIONS_CODE        CODE  NO ACTION   
3   3    0         observations_      OBSERVATION_CODE        CODE  NO ACTION   
4   4    0             patients_            PATIENT_ID          ID  NO ACTION   

   on_delete match  
0  NO ACTION  NONE  
1  NO ACTION  NONE  
2  NO ACTION  NONE  
3  NO ACTION  NONE  
4  NO ACTION  NONE  


In [59]:
cursor.execute('SELECT name FROM sqlite_master where type="table"')
print(cursor.fetchall())

[('patients',), ('careplans',), ('conditions',), ('observations',), ('anonnymized_patients',), ('anonnymized_careplans',), ('anonnymized_conditions',), ('anonnymized_observations',), ('patients_',), ('observations_',), ('careplans_code',), ('careplans_reasoncode',), ('conditions_',), ('date_table',), ('fact_table',)]


## Aufräumen & Logs speichern

In [60]:
connection.close()
logger.logTimings()
logger.writeToFile("../artefacts-for-release/etl-log.json")

{"type": "info", "time": 1657818136086, "message": "⏳ Connected to db and created tables in 1853261ms", "params": {"timingInMilliseconds": 1853261}}
{"type": "info", "time": 1657818136087, "message": "⏳ Loading data into db in 1856849ms", "params": {"timingInMilliseconds": 1856849}}
{"type": "info", "time": 1657818136087, "message": "⏳ Writing annonymized patients in 1852616ms", "params": {"timingInMilliseconds": 1852616}}
{"type": "info", "time": 1657818136087, "message": "⏳ Writing annonymized careplans in 1852679ms", "params": {"timingInMilliseconds": 1852679}}
{"type": "info", "time": 1657818136087, "message": "⏳ Writing annonymized conditions in 1853792ms", "params": {"timingInMilliseconds": 1853792}}
{"type": "info", "time": 1657818136087, "message": "⏳ Writing annonymized Observations in 1904065ms", "params": {"timingInMilliseconds": 1904065}}


TypeError: unsupported operand type(s) for -: 'NoneType' and 'int'