<a href="https://colab.research.google.com/github/Fuenfgeld/DMA2023TeamB/blob/main/Implementierung_des_COVID_19_Data_warehouse.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Datawarehouse creation and transformation process for data from COVID-19 source database to datawarehouse db


Importing required libraries and mountig googledrive with scharedfolder location containig source database

In [39]:
import sqlite3
from sqlite3 import Error

In [40]:
# mount drive to access database
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Defining input path for Data form source database and  and output data path for Datawarehouse 

In [41]:
# type of patients
patient_type = "covid19"

In [42]:
# path of source database in shared googledrive Folder
DB_SOURCE_PATH = f"/content/drive/Shareddrives/DMA_2023_D/source_dbs/source_covid19_test.db"

In [43]:
# path of datawarehouse with Filename DWH_COVID-19_2023.db
DB_DWH_PATH = f"/content/drive/Shareddrives/DMA_2023_D/source_dbs/DWH_COVID-19_2023.db"

Creating Datawarehouse 

In [44]:
class DB(object):
  def __init__(self, db_file):
    self.conn = sqlite3.connect(db_file)
    self.cur = self.conn.cursor()
    self.__init_db()
  
  def __del__(self):
      self.conn.commit()
      self.conn.close()

  def __init_db(self):
    # sql queries to create tables in Datawarehouse 

    #  sql query to create patients_info table -> Isoliert ID, Geburts-/Todesdatum, Gender
    create_patients_info = """CREATE TABLE IF NOT EXISTS patients_info (
                           Id STRING PRIMARY KEY,
                           BIRTHDATE DATE,
                           DEATHDATE DATE,
                           GENDER STRING
                       );"""

    #  sql query to create disease table -> Disease Table nicht benötigt


    # sql query to create condition table -> Encounter ex, Encounters Table nicht gebraucht, Description as Code
    create_conditions_info = """CREATE TABLE IF NOT EXISTS conditions_info (
                           START DATE,
                           STOP DATE,
                           PATIENT STRING,
                           CODE STRING,
                           DESCRIPTION STRING,
                           FOREIGN KEY (PATIENT)
                              REFERENCES patients_info (Id) 
                       );"""

    # sql query to create careplans table -> nicht benötigt
  

    # sql query to create procedures table -> encounter ex, Reduktion auf Pat.ID und SNOMED Code
    create_procedures_info = """CREATE TABLE IF NOT EXISTS procedures_info (
                           DATE DATE,
                           PATIENT STRING,
                           CODE STRING,
                           DESCRIPTION STRING,
                           FOREIGN KEY (PATIENT)
                              REFERENCES patients_info (Id) 
                       );"""

    # sql query to create medications_info table -> DESCRIPTION BELASSEN, DA MULTIPLE Medikamente
    create_medications_info = """CREATE TABLE IF NOT EXISTS medications_info (
                           START DATE,
                           STOP DATE,
                           PATIENT STRING,
                           CODE STRING,
                           DESCRIPTION STRING,
                           FOREIGN KEY (PATIENT)
                              REFERENCES patients_info (Id)   
                       );"""

    # sql query to create observations table -> Table nicht benötigt


    # sql query to create devices table
    create_devices_info = """CREATE TABLE IF NOT EXISTS devices_info (
                           START DATE,
                           STOP DATE,
                           PATIENT STRING,
                           CODE STRING,
                           DESCRIPTION STRING,
                           FOREIGN KEY (PATIENT)
                              REFERENCES patients_info (Id)                 
                       );"""
   
   # sql query to create Imaging_studies table -> ist leer
   # zusätzlich sollte ein immunizations table angelegt werden:
   # sql query to create immunizations table
    create_immunizations_info = """CREATE TABLE IF NOT EXISTS immunizations_info (
                           DATE DATE,
                           ENCOUNTER STRING,
                           PATIENT STRING,
                           CODE STRING,
                           DESCRIPTION STRING,
                           FOREIGN KEY (PATIENT)
                              REFERENCES patients_info (Id)                 
                       );"""


    create_tables = [create_patients_info, # patient data
                     create_conditions_info, create_procedures_info, create_devices_info,  # symptoms and procedure data
                     create_medications_info, # medication data
                     create_immunizations_info # immunization data
                     ]
     

    if self.conn is not None:
      # self.cur.execute(f"drop table if exists medications_info")
      for query in create_tables:
          self.cur.execute(query)
    else:
      print('Connection to database failed')



ETL/ELT (Extract, transform, load )

In [45]:
# exporting queries
class SqlQuery:
  def __init__(self, source_table, column_names, sink_table):
    self.source_table = source_table
    self.column_numbers = len(column_names)
    self.column_names = ', '.join(column_names)
    self.sink_table = sink_table

  def extract_query(self):
    return 'SELECT ' + self.column_names + ' FROM ' + self.source_table 

  def load_query(self):
    values_str = '?,' * self.column_numbers
    # print("*****", values_str, column_names, column_numbers)
    values_str = values_str[:-1]
    return 'INSERT OR REPLACE INTO ' + self.sink_table + ' VALUES (' + values_str + ')'

    # return 'INSERT INTO ' + self.sink_table + '(' + self.column_names + ') VALUES (' + values_str + ')'
  



In [46]:
def etl(query, source_cnx, target_cnx):
  # extract data from source db
  source_cursor = source_cnx.cursor()
  source_cursor.execute(query.extract_query())
  data = source_cursor.fetchall()
  source_cursor.close()

  # load data into warehouse db
  if data:
    target_cursor = target_cnx.cursor()
    target_cursor.executemany(query.load_query(), data)
    print('data loaded to warehouse db') 
    target_cnx.commit()
    target_cursor.close()
  else:
    print('data is empty')


def etl_process(queries, target_cnx, db_source):
  """
  queries: list
        a list of queries
  target_cnx: SQLite connection
  db_source: str
        path of source database      
  
  """  
  # establish source db connection
  try:
    source_cnx = sqlite3.connect(db_source)
  except Error as err:
    print(err)
  
  # loop through sql queries
  for query in etl_queue:
    etl(query, source_cnx, target_cnx)
    
  # close the source db connection
  source_cnx.close()

Datawarehouse Creation 

In [47]:
# create Datawarehouse
dwh_db = DB(DB_DWH_PATH)

In [48]:
print('starting etl')   
# list for iteration
etl_queue = []

# patient table
patients_columns = ['Id', 'BIRTHDATE', 'DEATHDATE', 'GENDER']
sql_query_patients = SqlQuery("patients", patients_columns, "patients_info")
etl_queue.append(sql_query_patients)

# symptoms and procedures table
conditions_columns = ['START', 'STOP', 'PATIENT', 'CODE', 'DESCRIPTION']
sql_query_conditions = SqlQuery("conditions", conditions_columns, "conditions_info")
etl_queue.append(sql_query_conditions)

# procedures table
procedures_columns = ['DATE', 'PATIENT', 'CODE', 'DESCRIPTION']
sql_query_procedures = SqlQuery("procedures", procedures_columns, "procedures_info")
etl_queue.append(sql_query_procedures)

# devices table
devices_columns = ['START', 'STOP', 'PATIENT', 'CODE', 'DESCRIPTION']
sql_query_devices = SqlQuery("devices", devices_columns, "devices_info")
etl_queue.append(sql_query_devices)

# medications table
medications_columns = ['START', 'STOP', 'PATIENT', 'CODE', 'DESCRIPTION']
sql_query_medications = SqlQuery("medications", medications_columns, "medications_info")
etl_queue.append(sql_query_medications)

# immunizations table
immunizations_columns = ['DATE', 'ENCOUNTER', 'PATIENT', 'CODE','DESCRIPTION']
sql_query_immunizations = SqlQuery("immunizations", immunizations_columns, "immunizations_info")
etl_queue.append(sql_query_immunizations)

# list for iteration
etl_queue

starting etl


[<__main__.SqlQuery at 0x7f62d0d470a0>,
 <__main__.SqlQuery at 0x7f62d0d472b0>,
 <__main__.SqlQuery at 0x7f62d0d47970>,
 <__main__.SqlQuery at 0x7f62d0d47340>,
 <__main__.SqlQuery at 0x7f62d0d47ac0>,
 <__main__.SqlQuery at 0x7f62cd41e100>]

In [49]:
# establish connection for target database (sql-server)
target_cnx = dwh_db.conn
etl_process(etl_queue, target_cnx, DB_SOURCE_PATH)

data loaded to warehouse db
data loaded to warehouse db

Exception ignored in: <function DB.__del__ at 0x7f62d46ab310>
Traceback (most recent call last):
  File "<ipython-input-32-e422f2702e6e>", line 8, in __del__
sqlite3.ProgrammingError: SQLite objects created in a thread can only be used in that same thread. The object was created in thread id 140062728399936 and this is thread id 140062397892352.



data loaded to warehouse db
data loaded to warehouse db
data loaded to warehouse db
data loaded to warehouse db


In [50]:
target_cnx.commit()

In [51]:
#target_cnx.close()