<a href="https://colab.research.google.com/github/Fuenfgeld/DMA2024TeamC/blob/main/Code/Datenbank.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Kursarbeit Datenmanagement und -Archivierung WS 23/24**

Master-Code Name-CALICO:MA


# Skript wurde geändert bzgl. des Ausfüllens der Faktentabelle. Die aktuelle Version befindet sich unter ETL2Datewarehouse.ipynb

**Erstellung der Datenbank**


*   Datenbankinitialisierung:



> Laden der Libraries

In [None]:
import pandas as pd
from functools import reduce
import sqlite3
from sqlite3 import Error
import csv
import requests

> Verbindung zu Google Drive herstellen (Ablageort der CSV-Dateien)

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive; the cells
# below read and write files under '/content/drive/My Drive/...'.
# force_remount=True is passed so the mount is redone even when the drive is
# already mounted (per the google.colab API -- TODO confirm against Colab docs).
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive




> Verbindung zu SQLite herstellen und lokale Datenbankinitialisierung




In [None]:
def create_connection_local(local_path):
  """Open (or create) the SQLite data warehouse file inside ``local_path``.

  Args:
    local_path: Directory that contains, or should receive, the file
      ``datawarehouse.db``.

  Returns:
    An open ``sqlite3.Connection`` on success, or ``None`` when sqlite3
    raises an error (the error is printed, not re-raised).
  """
  try:
    # Establishing the connection
    return sqlite3.connect(local_path+'/datawarehouse.db')
  except Error as e:
    print("Error while connecting to sqlite", e)
    # Make the failure result explicit so callers can test for it.
    return None
# Open the warehouse database stored in the project folder on Google Drive.
conn = create_connection_local('/content/drive/My Drive/Datenmanagement_und_Archivierung_im_Umfeld_der_Forschung/CALICO_MA')
# create_connection_local returns None when the connection attempt fails;
# stop here with a clear message instead of an AttributeError on conn.cursor().
if conn is None:
  raise RuntimeError("Could not connect to the SQLite data warehouse; check that Google Drive is mounted and the project folder exists")
# Creating a cursor object using the cursor() method
cur = conn.cursor()
print("Successfully Connected to SQLite Public Data Warehouse")


Successfully Connected to SQLite Public Data Warehouse


> Quelldatenimport

In [None]:
# Load the six source CSV files from Google Drive, one DataFrame per file.
# NOTE(review): the patients preview printed further down shows values such as
# '42,359925869' for LAT/LON, i.e. decimal commas kept as text -- confirm
# whether these columns should be parsed as numbers.
source_dir = '/content/drive/My Drive/Data_source'
procedures, encounters, immunizations, medications, observations, patients = (
  pd.read_csv(f'{source_dir}/{table_name}.csv', sep=",")
  for table_name in (
    'procedures',
    'encounters',
    'immunizations',
    'medications',
    'observations',
    'patients',
  )
)

In [None]:
#print(pd.read_sql_query("PRAGMA table_info('patients')", conn))

In [None]:
# Drop the source tables and the fact table if they already exist, so the
# CREATE statements in the next cell start from an empty schema.
# The table names are fixed literals, so building the statement with an
# f-string is safe here. A loop (instead of a bare trailing expression) also
# keeps the meaningless Cursor repr out of the cell output.
for table_name in (
  "procedures",
  "encounters",
  "immunizations",
  "medications",
  "observations",
  "patients",
  "facts_table",
):
  cur.execute(f"DROP TABLE IF EXISTS {table_name}")

<sqlite3.Cursor at 0x7c1e72600d40>

In [None]:
# Create tables including facts_table.
# The CREATE statements are downloaded from the project repository.
create_statements_response = requests.get(
  'https://raw.githubusercontent.com/Fuenfgeld/DMA2024TeamC/main/Code/create_statements_db.sql',
  timeout=30,  # do not hang the notebook forever if GitHub is unreachable
)
# Fail loudly on 404/5xx instead of passing an HTTP error page to SQLite as SQL.
create_statements_response.raise_for_status()
sql_create_source_data = create_statements_response.text
cur.executescript(sql_create_source_data)
conn.commit()
print("Successfully created tables in the database")

Successfully created tables in the database


In [None]:
# Append each source DataFrame to the database table of the same name.
frames_by_table = {
  'procedures': procedures,
  'encounters': encounters,
  'immunizations': immunizations,
  'medications': medications,
  'observations': observations,
  'patients': patients,
}
inserted_counts = [
  frame.to_sql(table_name, conn, if_exists='append', index=False)
  for table_name, frame in frames_by_table.items()
]
# Show the return value of the last insert (patients) as the cell output.
inserted_counts[-1]

3084


> Daten in Faktentabelle übertragen



In [None]:
# Populate facts_table from the source tables: one INSERT per source table,
# each filling only the fact columns that this source provides.
# NOTE: these statements append; re-running this cell without recreating
# facts_table first duplicates every fact row.
fact_insert_statements = (
  # Patient-level attributes: exactly one fact row per patient.
  # The former "LEFT JOIN encounters" selected no encounter column and only
  # repeated each patient row once per encounter, inflating any aggregate over
  # the patient measures. Encounter rows are inserted by the next statement.
  '''INSERT INTO facts_table
                    ( cancer_type, patient_ID, encounter_ID, patient_LAT, patient_LON, patient_HEALTHCARE_EXPENSES , patient_HEALTHCARE_COVERAGE, patient_BIRTHDATE)
                    SELECT
                        p.cancer_type AS cancer_type,
                        p.Id AS patient_ID,
                        NULL AS encounter_ID,  -- patient-level rows are not tied to an encounter
                        p.LAT AS patient_LAT,
                        p.LON AS patient_LON,
                        p.HEALTHCARE_EXPENSES AS patient_HEALTHCARE_EXPENSES,
                        p.HEALTHCARE_COVERAGE AS patient_HEALTHCARE_COVERAGE,
                        p.BIRTHDATE AS patient_BIRTHDATE
                    FROM patients p
                    ;''',
  # One fact row per encounter with its base cost.
  '''INSERT INTO facts_table
                    (cancer_type, patient_ID, encounter_ID, encounter_BASE_ENCOUNTER_COST)
                    SELECT cancer_type, PATIENT, Id, BASE_ENCOUNTER_COST
                    FROM encounters
                    ;''',
  # One fact row per observation (value and date; no encounter_ID is filled).
  '''INSERT INTO facts_table
                    ( cancer_type, patient_ID, observations_VALUE, observations_DATE)
                    SELECT cancer_type, PATIENT, VALUE, DATE
                    FROM observations
                    ;''',
  # One fact row per procedure with its base cost.
  '''INSERT INTO facts_table
                    (cancer_type, patient_ID, encounter_ID,  procedures_BASE_COST )
                    SELECT cancer_type, PATIENT, ENCOUNTER, BASE_COST
                    FROM procedures
                    ;''',
  # One fact row per medication with its total cost.
  '''INSERT INTO facts_table
                    (cancer_type, patient_ID, encounter_ID, medications_TOTALCOST )
                    SELECT cancer_type, PATIENT, ENCOUNTER, TOTALCOST
                    FROM medications
                    ;''',
  # One fact row per immunization with its base cost.
  '''INSERT INTO facts_table
                    (cancer_type, patient_ID, encounter_ID, immunizations_BASE_COST)
                    SELECT cancer_type, PATIENT,  ENCOUNTER, BASE_COST
                    FROM immunizations
                    ;''',
)

# Using the connection as a context manager commits when every statement
# succeeds and rolls back if one raises, so a failure cannot leave a partly
# filled fact table waiting to be committed by a later cell.
with conn:
  for fact_insert_statement in fact_insert_statements:
    cur.execute(fact_insert_statement)


In [None]:
# Spot check: read every row of the patients table (all columns) and print
# the first four.
records = cur.execute("SELECT * FROM patients").fetchall()
for patient_row in records[:4]:
  print(patient_row)

('d2061cc7-bee0-0e6c-3ac4-15c197c474e0', '22/06/1956', None, '999-51-6528', 'S99944910', 'X29408602X', 'Mr.', 'Lucio648', 'Simonis280', None, None, 'M', 'white', 'nonhispanic', 'M', 'Worcester  Massachusetts  US', '636 Wiegand Loaf', 'Cambridge', 'Massachusetts', 'Middlesex County', 2140, '42,359925869', '-71,113260249', 1475230, '4244,64', 'breast_cancer')
('073d8e80-ff90-1c8d-57e4-29bfca52c87f', '28/08/1964', None, '999-90-4728', 'S99976204', 'X17497441X', 'Mrs.', 'Buffy238', 'Wolf938', None, 'Williamson769', 'M', 'white', 'nonhispanic', 'F', 'Somerville  Massachusetts  US', '972 Satterfield Trafficway Apt 71', 'North Brookfield', 'Massachusetts', 'Worcester County', 1535, '42,257845470', '-72,026316805', 1489125, '4016,36', 'breast_cancer')
('e1ff7e68-4097-9faf-514d-e4cfcfdf252e', '28/08/1998', None, '999-82-3645', 'S99973929', 'X88553898X', 'Ms.', 'Debora709', 'Klocko335', None, None, None, 'white', 'nonhispanic', 'F', 'Williamstown  Massachusetts  US', '589 Koss Station', 'Worcest

In [None]:
# Print the column names of facts_table, one per line.
cur.execute("select * from facts_table")
col_names = cur.description
# Only the first item of each description entry (the column name) is printed.
for column_name, *_ in col_names:
  print(column_name)


cancer_type
patient_ID
encounter_ID
patient_LAT
patient_LON
patient_HEALTHCARE_EXPENSES
patient_HEALTHCARE_COVERAGE
patient_BIRTHDATE
observations_VALUE
observations_DATE
procedures_BASE_COST
medications_TOTALCOST
immunizations_BASE_COST
encounter_BASE_ENCOUNTER_COST


In [None]:
# Print SQLite's table_info for facts_table (the columns and their declared types).
facts_table_info = pd.read_sql_query("PRAGMA table_info('facts_table')", conn)
print(facts_table_info)

    cid                           name          type  notnull dflt_value  pk
0     0                    cancer_type        STRING        0       None   0
1     1                     patient_ID        STRING        0       None   0
2     2                   encounter_ID        STRING        0       None   0
3     3                    patient_LAT  DECIMAL(6,2)        0       None   0
4     4                    patient_LON  DECIMAL(6,2)        0       None   0
5     5    patient_HEALTHCARE_EXPENSES  DECIMAL(6,2)        0       None   0
6     6    patient_HEALTHCARE_COVERAGE  DECIMAL(6,2)        0       None   0
7     7              patient_BIRTHDATE          DATE        0       None   0
8     8             observations_VALUE        STRING        0       None   0
9     9              observations_DATE          DATE        0       None   0
10   10           procedures_BASE_COST  DECIMAL(6,2)        0       None   0
11   11          medications_TOTALCOST  DECIMAL(6,2)        0       None   0