<a href="https://colab.research.google.com/github/Fuenfgeld/DMA2023TeamC/blob/main/Datenbank/creating_db.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Creating a Database
and importing the data from .csv files

First, reset all possible variables

In [None]:
# IPython magic: clear all names from the interactive namespace; -f skips the confirmation prompt
%reset -f

## Libraries

### Import the libraries

In [None]:
import csv
import os
import sqlite3
from sqlite3 import Error

import pandas as pd

### Check the versions of the libraries

The versions of the packages when working on the project are:  
csv:&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;1.0  
sqlite3: &nbsp;3.31.1  
pandas: 1.3.5

In [None]:
# report the versions actually in use so they can be compared with the ones listed above
print(f'The current version of the csv module is {csv.__version__}')
print(f'The current version of sqlite3 is {sqlite3.sqlite_version}')
print(f'The current version of pandas is {pd.__version__}')

## Creating the database

Connect to the google drive

In [None]:
# Colab-only: mount Google Drive under /content/drive so the files on the
# shared drive can be reached through ordinary file paths
from google.colab import drive
drive.mount("/content/drive")

# show the directory the notebook is running in ("!" hands the command to the shell)
print(f'The current working directory is:')
!pwd

Store the following paths in variables for later use

In [None]:
# name of the sub-folder of csv_data that holds the csv files to import
disease = 'metabolic_syndrome_disease'
# root folder of the team's shared drive; it must be absolute (leading "/")
# because the drive is mounted at /content/drive
path_teamc = "/content/drive/Shareddrives/TeamC"
# folder that contains the csv files of the chosen dataset
path_csv_files = f"{path_teamc}/Material/csv_data/{disease}"

Define a function to create a database file and then run it.

In [None]:
# define a function to create a database file in the specified path

def create_db_file(db_file):
  """Create an empty SQLite database file at ``db_file`` unless it already exists.

  Parameters:
    db_file: path of the database file.

  Returns:
    None. Progress and errors are reported with print(); sqlite3 errors are
    caught and printed, not raised.
  """
  # check the file system: testing the path string itself would be true for
  # every non-empty path and the file would never be created
  if os.path.exists(db_file):
    print('file already exists')
    return

  conn = None
  try:
    # connecting to a file that does not exist creates it
    conn = sqlite3.connect(db_file)
    print('Everything went ok')
  except Error as err:
    print(err)
  finally:
    # close the connection again; only the file itself is wanted here
    if conn:
      conn.close()

# run create_db_file for the database file teamc_db.db, located in the
# TeamC folder of the shared drive

create_db_file(r'/content/drive/Shareddrives/TeamC/teamc_db.db')

## Explore the csv files

Get a column list for every csv file

In [None]:
# list of columns from a chosen csv file in the path_csv_files directory
def column_list(csv_file_name, base_dir=None):
  """Print and return the header row (first row) of a csv file.

  Parameters:
    csv_file_name: file name of the csv file, including the extension.
    base_dir: folder that contains the file; defaults to the notebook-level
      path_csv_files when omitted.

  Returns:
    The list of column names, or an empty list when the file has no rows.
  """
  if base_dir is None:
    base_dir = path_csv_files
  # newline='' is what the csv module asks for when opening files for a reader
  with open(f'{base_dir}/{csv_file_name}', newline='') as csv_file:
    reader_obj = csv.reader(csv_file, delimiter = ',')
    # only the first row is needed; the default [] covers an empty file
    columns_list = next(reader_obj, [])
  print(columns_list)
  return columns_list

# base names (without the .csv extension) of the csv files handled in this notebook
list_csv = [
    'patients', 'careplans', 'organizations', 'payers', 'providers',
    'encounters', 'conditions', 'devices', 'immunizations', 'medications',
    'observations', 'payer_transitions', 'procedures',
]

# show the header row of each of these csv files
for n in list_csv:
  column_list(n + '.csv')


## Load the data into the database

Create a function to connect to the database

In [None]:
# define a function to create a connection with the database

def create_connection(db_file):
  """Open a connection to the SQLite database stored in ``db_file``.

  Parameters:
    db_file: path of the database file (or ':memory:').

  Returns:
    The sqlite3 connection object, or None when sqlite3 reports an error
    (the error is printed).
  """
  # warn if the specified file does not exist on disk; the connection is
  # still opened afterwards, which makes sqlite3 create the file
  if not db_file or (db_file != ':memory:' and not os.path.exists(db_file)):
    print('file not found')

  try:
    return sqlite3.connect(db_file)
  except Error as err:
    print(err)
    return None

Store the connection object in a variable, then create a cursor and store it in a variable.

In [None]:
 # use the predefined function to create a connection with the db file and store
 # the connection object in a variable
# open the connection to the database file; create_connection prints the error
# and yields None if sqlite3 fails, in which case conn.cursor() below raises
conn = create_connection('/content/drive/Shareddrives/TeamC/teamc_db.db')
# cursor used by the following cells to execute the SQL statements
c = conn.cursor()

Drop the existing tables, if they exist

In [None]:
# make the notebook re-runnable: remove any table left over from an earlier run
# (one table per entry in list_csv)
for tab in list_csv:
  c.execute(f'''DROP TABLE IF EXISTS {tab}''')

Create empty tables

In [None]:
# Create table patients (primary key: Id); the statement is skipped when the
# table already exists
# NOTE(review): per SQLite's type-affinity rules a declared type of STRING gets
# NUMERIC affinity, not TEXT, so digit-only values (e.g. ZIP) may be stored as
# numbers -- confirm this is acceptable

c.execute(
    '''CREATE TABLE IF NOT EXISTS patients (
      Id STRING PRIMARY KEY, 
      BIRTHDATE DATE, 
      DEATHDATE DATE,
      SSN STRING,
      DRIVERS STRING,
      PASSPORT STRING,
      PREFIX STRING,
      FIRST STRING,
      LAST STRING,
      SUFFIX STRING,
      MAIDEN STRING,
      MARITAL STRING,
      RACE STRING,
      ETHNICITY STRING,
      GENDER STRING,
      BIRTHPLACE STRING,
      ADDRESS STRING,
      CITY STRING,
      STATE STRING,
      COUNTY STRING,
      ZIP STRING,
      LAT STRING,
      LON STRING,
      HEALTHCARE_EXPENSES INTEGER,
      HEALTHCARE_COVERAGE INTEGER
      )''' 
    )

In [None]:
# create table organizations (primary key: Id); the statement is skipped when
# the table already exists

c.execute(
    '''CREATE TABLE IF NOT EXISTS organizations (
      Id STRING PRIMARY KEY,
      NAME STRING,
      ADDRESS STRING,
      CITY STRING,
      STATE STRING,
      ZIP STRING,
      LAT STRING,
      LON STRING,
      PHONE STRING,
      REVENUE FLOAT, 
      UTILIZATION INTEGER
    )'''
)

In [None]:
# create table payers (primary key: Id); the statement is skipped when the
# table already exists
# Id is declared STRING like the Id column of every other table in this notebook
c.execute(
    '''CREATE TABLE IF NOT EXISTS payers (
      Id STRING PRIMARY KEY,
      NAME STRING,
      ADDRESS STRING,
      CITY STRING,
      STATE_HEADQUARTERED STRING,
      ZIP STRING,
      PHONE STRING,
      AMOUNT_COVERED FLOAT,
      AMOUNT_UNCOVERED FLOAT,
      REVENUE FLOAT,
      COVERED_ENCOUNTERS INTEGER,
      UNCOVERED_ENCOUNTERS INTEGER,
      COVERED_MEDICATIONS INTEGER,
      UNCOVERED_MEDICATIONS INTEGER,
      COVERED_PROCEDURES INTEGER,
      UNCOVERED_PROCEDURES INTEGER,
      COVERED_IMMUNIZATIONS INTEGER,
      UNCOVERED_IMMUNIZATIONS INTEGER,
      UNIQUE_CUSTOMERS INTEGER,
      QOLS_AVG REAL,
      MEMBER_MONTHS INTEGER
    )'''
)

In [None]:
# create table providers (primary key: Id); ORGANIZATION is declared as a
# foreign key to organizations (Id)

c.execute(
    '''CREATE TABLE IF NOT EXISTS providers (
      Id STRING PRIMARY KEY,
      ORGANIZATION STRING, 
      NAME STRING,
      GENDER STRING,
      SPECIALITY STRING,
      ADDRESS STRING,
      CITY STRING,
      STATE STRING,
      ZIP STRING,
      LAT STRING,
      LON STRING,
      UTILIZATION INTEGER,
      FOREIGN KEY (ORGANIZATION) REFERENCES organizations (Id)
    )'''
)

In [None]:
# create table encounters (primary key: Id); PATIENT, ORGANIZATION, PROVIDER
# and PAYER are declared as foreign keys to the Id column of their tables
# the FOREIGN KEY constraints are comma-separated, as the CREATE TABLE grammar specifies

c.execute(
    '''CREATE TABLE IF NOT EXISTS encounters (
      Id STRING PRIMARY KEY,
      START DATE,
      STOP DATE,
      PATIENT STRING,
      ORGANIZATION STRING,
      PROVIDER STRING,
      PAYER STRING,
      ENCOUNTERCLASS STRING,
      CODE STRING,
      DESCRIPTION STRING,
      BASE_ENCOUNTER_COST FLOAT,
      TOTAL_CLAIM_COST FLOAT,
      PAYER_COVERAGE FLOAT,
      REASONCODE STRING,
      REASONDESCRIPTION STRING,
      FOREIGN KEY (PATIENT) REFERENCES patients (Id),
      FOREIGN KEY (ORGANIZATION) REFERENCES organizations (Id),
      FOREIGN KEY (PROVIDER) REFERENCES providers (Id),
      FOREIGN KEY (PAYER) REFERENCES payers (Id)
    )'''
)

In [None]:
# create table conditions (no primary key); PATIENT and ENCOUNTER are declared
# as foreign keys to patients (Id) and encounters (Id)
# the FOREIGN KEY constraints are comma-separated, as the CREATE TABLE grammar specifies

c.execute(
    '''CREATE TABLE IF NOT EXISTS conditions (
      START DATE,
      STOP DATE,
      PATIENT STRING,
      ENCOUNTER STRING,
      CODE STRING,
      DESCRIPTION STRING,
      FOREIGN KEY (PATIENT) REFERENCES patients (Id),
      FOREIGN KEY (ENCOUNTER) REFERENCES encounters (Id)
    )'''
)

In [None]:
# create table devices (no primary key); PATIENT and ENCOUNTER are declared
# as foreign keys to patients (Id) and encounters (Id)
# the FOREIGN KEY constraints are comma-separated, as the CREATE TABLE grammar specifies

c.execute(
    '''CREATE TABLE IF NOT EXISTS devices (
      START DATE,
      STOP DATE,
      PATIENT STRING,
      ENCOUNTER STRING,
      CODE STRING,
      DESCRIPTION STRING,
      UDI STRING,
      FOREIGN KEY (PATIENT) REFERENCES patients (Id),
      FOREIGN KEY (ENCOUNTER) REFERENCES encounters (Id)
    )'''
)

In [None]:
# create table careplans (primary key: Id); PATIENT and ENCOUNTER are declared
# as foreign keys to patients (Id) and encounters (Id)
# the FOREIGN KEY constraints are comma-separated, as the CREATE TABLE grammar specifies

c.execute(
    '''CREATE TABLE IF NOT EXISTS careplans (
      Id STRING PRIMARY KEY,
      START DATE,
      STOP DATE,
      PATIENT STRING,
      ENCOUNTER STRING,
      CODE STRING,
      DESCRIPTION STRING,
      REASONCODE STRING,
      REASONDESCRIPTION STRING,
      FOREIGN KEY (PATIENT) REFERENCES patients (Id),
      FOREIGN KEY (ENCOUNTER) REFERENCES encounters (Id)
    )'''
)

In [None]:
# create table immunizations (no primary key); PATIENT and ENCOUNTER are
# declared as foreign keys to patients (Id) and encounters (Id)
# the FOREIGN KEY constraints are comma-separated, as the CREATE TABLE grammar specifies

c.execute(
    '''CREATE TABLE IF NOT EXISTS immunizations (
      DATE DATE,
      PATIENT STRING,
      ENCOUNTER STRING,
      CODE STRING,
      DESCRIPTION STRING,
      BASE_COST INTEGER,
      FOREIGN KEY (PATIENT) REFERENCES patients (Id),
      FOREIGN KEY (ENCOUNTER) REFERENCES encounters (Id)
    )'''
)

In [None]:
# create table medications (no primary key); PATIENT, PAYER and ENCOUNTER are
# declared as foreign keys to patients (Id), payers (Id) and encounters (Id)
# the FOREIGN KEY constraints are comma-separated, as the CREATE TABLE grammar specifies

c.execute(
    '''CREATE TABLE IF NOT EXISTS medications (
      START DATE,
      STOP DATE,
      PATIENT STRING,
      PAYER STRING,
      ENCOUNTER STRING,
      CODE STRING,
      DESCRIPTION STRING,
      BASE_COST INTEGER,
      PAYER_COVERAGE FLOAT,
      DISPENSES INTEGER,
      TOTALCOST FLOAT,
      REASONCODE STRING,
      REASONDESCRIPTION STRING,
      FOREIGN KEY (PATIENT) REFERENCES patients (Id),
      FOREIGN KEY (PAYER) REFERENCES payers (Id),
      FOREIGN KEY (ENCOUNTER) REFERENCES encounters (Id)
    )'''
)

In [None]:
# create table observations (no primary key); PATIENT and ENCOUNTER are
# declared as foreign keys to patients (Id) and encounters (Id)
# the FOREIGN KEY constraints are comma-separated, as the CREATE TABLE grammar specifies

c.execute(
    '''CREATE TABLE IF NOT EXISTS observations (
      DATE DATE,
      PATIENT STRING,
      ENCOUNTER STRING,
      CODE STRING,
      DESCRIPTION STRING,
      VALUE FLOAT,
      UNITS STRING,
      TYPE STRING,
      FOREIGN KEY (PATIENT) REFERENCES patients (Id),
      FOREIGN KEY (ENCOUNTER) REFERENCES encounters (Id)
    )'''
)

In [None]:
# create table payer_transitions (no primary key); PATIENT and PAYER are
# declared as foreign keys to patients (Id) and payers (Id)
# the FOREIGN KEY constraints are comma-separated, as the CREATE TABLE grammar specifies

c.execute(
    '''CREATE TABLE IF NOT EXISTS payer_transitions (
      PATIENT STRING,
      START_YEAR INTEGER,
      END_YEAR INTEGER,
      PAYER STRING,
      OWNERSHIP STRING,
      FOREIGN KEY (PATIENT) REFERENCES patients (Id),
      FOREIGN KEY (PAYER) REFERENCES payers (Id)
    )'''
)

In [None]:
# create table procedures (no primary key); PATIENT and ENCOUNTER are declared
# as foreign keys to patients (Id) and encounters (Id)
# the FOREIGN KEY constraints are comma-separated, as the CREATE TABLE grammar specifies

c.execute(
    '''CREATE TABLE IF NOT EXISTS procedures (
      DATE DATE,
      PATIENT STRING,
      ENCOUNTER STRING,
      CODE STRING,
      DESCRIPTION STRING,
      BASE_COST FLOAT,
      REASONCODE STRING,
      REASONDESCRIPTION STRING,
      FOREIGN KEY (PATIENT) REFERENCES patients (Id),
      FOREIGN KEY (ENCOUNTER) REFERENCES encounters (Id)
    )'''
)

Using pandas, load the csv data into the database

In [None]:
# populate the tables: every entry of list_csv names both a csv file in
# path_csv_files and the table that receives its rows
for n in list_csv:
  df = pd.read_csv(f'{path_csv_files}/{n}.csv')
  # append to the table created above; the DataFrame index is not written
  df.to_sql(name=n, con=conn, if_exists='append', index=False)

Print out a list of all the tables in the newly created database

In [None]:
# read the names of all tables from sqlite_master and print them as a check
print(c.execute("SELECT name FROM sqlite_master WHERE type='table'").fetchall())

Create indexes on the columns used most often in the project (in the patients and conditions tables)

In [None]:
# create index on the columns that we'll be most often using in our project
# IF NOT EXISTS lets this cell be run again without an "index already exists" error
c.execute('''CREATE INDEX IF NOT EXISTS index_patients ON patients (RACE, ETHNICITY, GENDER);''')
c.execute('''CREATE INDEX IF NOT EXISTS index_conditions ON conditions (PATIENT, CODE);''')
# print the names of all indexes recorded in sqlite_master as a check
c.execute('''SELECT name FROM sqlite_master WHERE TYPE = 'index' ''')
print(c.fetchall())

## Commit and close

In [None]:
# commit the open transaction so that the changes are saved in the database file
conn.commit()
# close the connection; conn and the cursor c cannot be used after this
conn.close()