<a href="https://colab.research.google.com/github/Fuenfgeld/DMA2024TeamC/blob/main/Code/Datenbank.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Kursarbeit Datenmanagement und -Archivierung WS 23/24**

Master-Code Name-CALICO:MA


**Erstellung der Datenbank**


*   Datenbankinitialisierung:



> Importing Libraries

In [47]:
import pandas as pd
from functools import reduce
import sqlite3
from sqlite3 import Error
import csv
import requests

> Verbindung zu Googledrive herstellen (Ablageort der CSV-Dateien)

In [48]:
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [49]:
material_path = "/content/drive/My Drive/csv_data"

In [50]:
!ls "/content/drive/My Drive/csv_data"

 allergy       asthma	       colorectal_cancer   dermatitis	 metabolic_syndrome_disease
 allergy.csv   breast_cancer   covid19		   lung_cancer	'Outline COVID-19.gdoc'


> Verbindung zu SQLite herstellen und Datebankinitialisierung im Memory


In [51]:
# SQLite connection
# Create a SQLite database in memory from python

def create_connection_memory():
    """ create a database connection to a database that resides
        in the memory
    """
    conn = None;
    try:
       #Establishing the connection
        conn = sqlite3.connect(':memory:')
        return conn
        print(sqlite3.version)
    except Error as e:
        print("Error while connecting to sqlite", e)
    # finally:
    #     if conn:
    #         conn.close()

In [52]:
# Creating a temporary empty database in the memory.
conn = create_connection_memory()
# Creating a cursor object using the cursor() method
cur = conn.cursor()
print("Successfully Connected to SQLite")


Successfully Connected to SQLite




> Verbindung zu SQLite herstellen und Datebankinitialisierung Local




In [53]:
def create_connection_local(local_path):
  conn = None;
  try:
    #Establishing the connection
    conn = sqlite3.connect(local_path+'/datawarehouse.db')
    return conn
    print(sqlite3.version)
  except Error as e:
    print("Error while connecting to sqlite", e)
conn = create_connection_local('/content/drive/My Drive/Datenmanagement_und_Archivierung_im_Umfeld_der_Forschung/CALICO_MA')
# Creating a cursor object using the cursor() method
cur = conn.cursor()
print("Successfully Connected to SQLite Public Data Warehouse")


Successfully Connected to SQLite Public Data Warehouse


> Quelldatenimport

In [54]:
# Load cancer CSVs
procedures = pd.read_csv('/content/drive/My Drive/Data_source/procedures.csv', sep=",")
encounters = pd.read_csv('/content/drive/My Drive/Data_source/encounters.csv', sep=",")
immunizations = pd.read_csv('/content/drive/My Drive/Data_source/immunizations.csv', sep=",")
medications = pd.read_csv('/content/drive/My Drive/Data_source/medications.csv', sep=",")
observations = pd.read_csv('/content/drive/My Drive/Data_source/observations.csv', sep=",")
patients = pd.read_csv('/content/drive/My Drive/Data_source/patients.csv', sep=",")

In [55]:
#Delete Tables in case they exist
cur.execute("DROP TABLE IF EXISTS procedures")
cur.execute("DROP TABLE IF EXISTS encounters")
cur.execute("DROP TABLE IF EXISTS immunizations")
cur.execute("DROP TABLE IF EXISTS medications")
cur.execute("DROP TABLE IF EXISTS observations")
cur.execute("DROP TABLE IF EXISTS patients")
cur.execute("DROP TABLE IF EXISTS facts_table")

<sqlite3.Cursor at 0x7bf212cebc40>

In [56]:
# Create tables including facts_table
sql_create_source_data = requests.get('https://raw.githubusercontent.com/Fuenfgeld/DMA2024TeamC/main/Code/create_statements_db.sql').text
cur.executescript(sql_create_source_data)
conn.commit()
print("Successfully created tables in the database")

Successfully created tables in the database


In [57]:
#Insert data into tables

procedures.to_sql('procedures', conn, if_exists='append', index=False)
encounters.to_sql('encounters', conn, if_exists='append', index=False)
immunizations.to_sql('immunizations', conn, if_exists='append', index=False)
medications.to_sql('medications', conn, if_exists='append', index=False)
observations.to_sql('observations', conn, if_exists='append', index=False)
patients.to_sql('patients', conn, if_exists='append', index=False)

3084


> Daten in Faktentabelle übertragen



In [58]:
cur.execute('''INSERT INTO facts_table
                    (cancer_type, patient_ID, patient_BIRTHDATE, patient_LAT, patient_LON, patient_HEALTHCARE_EXPENSES , patient_HEALTHCARE_COVERAGE)
                    SELECT Table_Names, Id, BIRTHDATE, LAT, LON, HEALTHCARE_EXPENSES, HEALTHCARE_COVERAGE
                    FROM patients
                    ;''')


cur.execute('''INSERT INTO facts_table
                    ( cancer_type, patient_ID, observation_code, observations_VALUE, observations_DATE)
                    SELECT Table_Names, PATIENT, CODE, VALUE, DATE
                    FROM observations
                    ;''')


cur.execute('''INSERT INTO facts_table
                    (cancer_type, patient_ID, encounter_ID,  procedures_BASE_COST )
                    SELECT Table_Names, PATIENT, ENCOUNTER, BASE_COST
                    FROM procedures
                    ;''')


cur.execute('''INSERT INTO facts_table
                    (cancer_type, patient_ID, encounter_ID, medications_TOTALCOST )
                    SELECT Table_Names, PATIENT, ENCOUNTER, TOTALCOST
                    FROM medications
                    ;''')

cur.execute('''INSERT INTO facts_table
                    (cancer_type, patient_ID, encounter_ID, immunizations_BASE_COST)
                    SELECT Table_Names, PATIENT,  ENCOUNTER, BASE_COST
                    FROM immunizations
                    ;''')

cur.execute('''INSERT INTO facts_table
                    (cancer_type, patient_ID, encounter_ID, encounter_BASE_ENCOUNTER_COST)
                    SELECT Table_Names, PATIENT, Id, BASE_ENCOUNTER_COST
                    FROM encounters
                    ;''')



<sqlite3.Cursor at 0x7bf212cebc40>

In [59]:
conn.commit()

In [60]:
#connect to DB
conn = sqlite3.connect('/content/drive/My Drive/Datenmanagement_und_Archivierung_im_Umfeld_der_Forschung/CALICO_MA/datawarehouse.db')
cursor = conn.cursor()


In [61]:
#Select data
# show table: retrieve all the column of each records and 4 rows
cur.execute("SELECT * FROM patients")
records = cur.fetchall()
# show 4 rows
for row in records[:4]:
  print(row)

('d2061cc7-bee0-0e6c-3ac4-15c197c474e0', '22/06/1956', None, '999-51-6528', 'S99944910', 'X29408602X', 'Mr.', 'Lucio648', 'Simonis280', None, None, 'M', 'white', 'nonhispanic', 'M', 'Worcester  Massachusetts  US', '636 Wiegand Loaf', 'Cambridge', 'Massachusetts', 'Middlesex County', 2140, '42,359925869', '-71,113260249', 1475230, '4244,64', 'patients_bc')
('073d8e80-ff90-1c8d-57e4-29bfca52c87f', '28/08/1964', None, '999-90-4728', 'S99976204', 'X17497441X', 'Mrs.', 'Buffy238', 'Wolf938', None, 'Williamson769', 'M', 'white', 'nonhispanic', 'F', 'Somerville  Massachusetts  US', '972 Satterfield Trafficway Apt 71', 'North Brookfield', 'Massachusetts', 'Worcester County', 1535, '42,257845470', '-72,026316805', 1489125, '4016,36', 'patients_bc')
('e1ff7e68-4097-9faf-514d-e4cfcfdf252e', '28/08/1998', None, '999-82-3645', 'S99973929', 'X88553898X', 'Ms.', 'Debora709', 'Klocko335', None, None, None, 'white', 'nonhispanic', 'F', 'Williamstown  Massachusetts  US', '589 Koss Station', 'Worcester',

In [62]:
# List of columns in a table
cur.execute ("select * from facts_table")
col_names = cur.description
for row in col_names:
  print(row[0])


cancer_type
patient_ID
observation_code
encounter_ID
patient_LAT
patient_LON
patient_HEALTHCARE_EXPENSES
patient_HEALTHCARE_COVERAGE
patient_BIRTHDATE
observations_VALUE
observations_DATE
procedures_BASE_COST
medications_TOTALCOST
immunizations_BASE_COST
encounter_BASE_ENCOUNTER_COST
