In [1]:
# install the newest version 
# !pip3 install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip

# Importing Libraries

In [2]:
import pandas as pd
import sqlite3
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from pandas_profiling import ProfileReport

In [3]:
# will make plot outputs appear and stored within the notebook.
%matplotlib inline

In [4]:
from google.colab import drive
# mount drive to access database
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
material_path = "/content/drive/MyDrive"

In [6]:
def connect_to_db(db_file):
    sqlite3_conn = None
    try:
        sqlite3_conn = sq.connect(db_file)
        return sqlite3_conn

    except Error as err:
        print(err)

        if sqlite3_conn is not None:
            sqlite3_conn.close()
  
dwh_conn = sqlite3.connect('/content/drive/MyDrive/DWH.db')

In [7]:
# list of tables in db
if dwh_conn is not None:
  dwh_cursor = dwh_conn.cursor()
  dwh_cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
  print("List of Tables", dwh_cursor.fetchall())


List of Tables [('Zentrum',), ('dimObservations',), ('dimEncounters',), ('dimProcedures',), ('dimConditions',), ('dimPatients',)]


Erstellen eines Ursprungsdataframes aus dem Datawarehouse

In [8]:
df = pd.read_sql_query("""
select dimEncounters.id as ENCOUNTER_ID, dimEncounters.start, dimEncounters.stop, dimEncounters.encounterclass, dimEncounters.code, dimEncounters.description,dimConditions.patient_id, dimConditions.code as DISEASE, dimConditions.description from dimConditions
left JOIN dimEncounters
on dimEncounters.id = dimConditions.encounter_id
;""", dwh_conn
)

In [9]:
df.head(6)


Unnamed: 0,ENCOUNTER_ID,START,STOP,ENCOUNTERCLASS,CODE,DESCRIPTION,PATIENT_ID,DISEASE,DESCRIPTION.1
0,6e0279b7-c477-a2c6-e75c-a911f03a264c,1946-01-11T11:40:19Z,1946-01-11T11:55:19Z,ambulatory,185345009,Encounter for symptom,3575b903-dbd0-1d55-6146-9e8aa4ed52a5,232353008,Perennial allergic rhinitis with seasonal vari...
1,40ca71e3-5668-dd3a-d7e1-209a43835384,1955-01-17T11:40:19Z,1955-01-17T11:55:19Z,wellness,410620009,Well child visit (procedure),3575b903-dbd0-1d55-6146-9e8aa4ed52a5,162864005,Body mass index 30+ - obesity (finding)
2,dbaad3dc-c5a8-9daf-5d5f-330a8022263f,1999-06-06T16:56:37Z,1999-06-06T17:11:37Z,wellness,162673000,General examination of patient (procedure),17f0c6d9-8931-8839-66cb-3ca6fb066d3e,162864005,Body mass index 30+ - obesity (finding)
3,459c64dc-6783-3e8c-91d1-177c47c7258f,1999-11-25T00:48:18Z,1999-11-25T03:01:18Z,emergency,50849002,Emergency Room Admission,aff157cc-b6d3-412b-ccbe-bfd5fac1c2d5,128613002,Seizure disorder
4,459c64dc-6783-3e8c-91d1-177c47c7258f,1999-11-25T00:48:18Z,1999-11-25T03:01:18Z,emergency,50849002,Emergency Room Admission,aff157cc-b6d3-412b-ccbe-bfd5fac1c2d5,703151001,History of single seizure (situation)
5,be256df1-75cb-cbf7-0109-e0e1ae2c6348,2010-01-20T00:48:18Z,2010-01-20T01:03:18Z,wellness,162673000,General examination of patient (procedure),aff157cc-b6d3-412b-ccbe-bfd5fac1c2d5,59621000,Hypertension


In [68]:
#Entfernen von Duplikaten
df.drop_duplicates(keep="first",inplace=True) 
print("Size of dataset after removing duplicated rows", df.shape)

Size of dataset after removing duplicated rows (9514, 9)


In [10]:
# Übersicht 
df.nunique(axis=0)

ENCOUNTER_ID      3446
START             3400
STOP              3417
ENCOUNTERCLASS       6
CODE                18
DESCRIPTION         21
PATIENT_ID        1947
DISEASE            140
DESCRIPTION        141
dtype: int64

In [81]:
#Ursachen des Encounter
df.groupby(["DISEASE"]).size()

DISEASE
10509002           59
109838007          12
124171000119105     1
126906006           7
127013003           3
                   ..
88805009            6
92691004            5
93761005            4
94260004            1
95417003            2
Length: 140, dtype: int64

Erstellen eines Dataframes, der nur Angaben zu COVID-positiven Patienten enthält. 

In [11]:
print(str(["DISEASE"]))


['DISEASE']


In [12]:
DISEASE = map(int, "DISEASE")

In [22]:
#Selektion der enc Testergebnisse
dfencount = df.loc[(df["ENCOUNTERCLASS"] == "inpatient")] 

In [30]:
dfencount.head(3)


Unnamed: 0,ENCOUNTER_ID,START,STOP,ENCOUNTERCLASS,CODE,DESCRIPTION,PATIENT_ID,DISEASE,DESCRIPTION.1
11,e31a4c7a-a35c-0df1-60b2-b6c7ef012bce,1975-12-10T08:03:01Z,1975-12-11T08:54:01Z,inpatient,183452005,Encounter Inpatient,9bbbcada-7a45-92f0-6ae6-d197bcefc0d4,428251008,History of appendectomy
74,ccf7943f-b369-6ab8-b9e7-dafd2526804d,2020-02-28T00:30:05Z,2020-03-08T01:02:05Z,inpatient,1505002,Hospital admission for isolation (procedure),28124841-1cf3-2818-d4ee-8574fac23298,233604007,Pneumonia (disorder)
75,ccf7943f-b369-6ab8-b9e7-dafd2526804d,2020-02-28T00:30:05Z,2020-03-08T01:02:05Z,inpatient,1505002,Hospital admission for isolation (procedure),28124841-1cf3-2818-d4ee-8574fac23298,389087006,Hypoxemia (disorder)


In [40]:
#Selektion der positiven Testergebnisse
dfpos = df.loc[(df["DISEASE"] == "840544004") | (df["DISEASE"] == "840539006")]

In [41]:
print(dfpos.shape)
dfpos.head(20)

(1946, 9)


Unnamed: 0,ENCOUNTER_ID,START,STOP,ENCOUNTERCLASS,CODE,DESCRIPTION,PATIENT_ID,DISEASE,DESCRIPTION.1
30,3c7f472a-2602-bcdd-76d0-53f967923ded,2020-03-10T11:40:19Z,2020-03-10T12:31:19Z,ambulatory,185345009,Encounter for symptom (procedure),3575b903-dbd0-1d55-6146-9e8aa4ed52a5,840544004,Suspected COVID-19
31,3c7f472a-2602-bcdd-76d0-53f967923ded,2020-03-10T11:40:19Z,2020-03-10T12:31:19Z,ambulatory,185345009,Encounter for symptom (procedure),3575b903-dbd0-1d55-6146-9e8aa4ed52a5,840539006,COVID-19
42,dcee0f8b-907d-69cf-c998-5e73711f2276,2020-03-03T16:56:37Z,2020-03-03T18:08:37Z,ambulatory,185345009,Encounter for symptom (procedure),17f0c6d9-8931-8839-66cb-3ca6fb066d3e,840544004,Suspected COVID-19
65,61c250a0-1668-e4ce-aefc-77a7842574c2,2020-03-07T00:48:18Z,2020-03-07T02:02:18Z,ambulatory,185345009,Encounter for symptom (procedure),aff157cc-b6d3-412b-ccbe-bfd5fac1c2d5,840544004,Suspected COVID-19
66,61c250a0-1668-e4ce-aefc-77a7842574c2,2020-03-07T00:48:18Z,2020-03-07T02:02:18Z,ambulatory,185345009,Encounter for symptom (procedure),aff157cc-b6d3-412b-ccbe-bfd5fac1c2d5,840539006,COVID-19
72,58a7a1b7-df6d-9bad-d58d-72bd4fe627cd,2020-02-27T23:37:05Z,2020-02-28T00:30:05Z,ambulatory,185345009,Encounter for symptom (procedure),28124841-1cf3-2818-d4ee-8574fac23298,840544004,Suspected COVID-19
73,58a7a1b7-df6d-9bad-d58d-72bd4fe627cd,2020-02-27T23:37:05Z,2020-02-28T00:30:05Z,ambulatory,185345009,Encounter for symptom (procedure),28124841-1cf3-2818-d4ee-8574fac23298,840539006,COVID-19
84,28b87b45-d287-5e9d-6a3d-c2e83ab9662e,2020-03-06T15:01:44Z,2020-03-06T15:56:44Z,ambulatory,185345009,Encounter for symptom (procedure),27b0d72c-f2fb-7e25-38c0-7d5120ebbedf,840544004,Suspected COVID-19
85,28b87b45-d287-5e9d-6a3d-c2e83ab9662e,2020-03-06T15:01:44Z,2020-03-06T15:56:44Z,ambulatory,185345009,Encounter for symptom (procedure),27b0d72c-f2fb-7e25-38c0-7d5120ebbedf,840539006,COVID-19
96,fbe97c06-afa3-050c-9a70-2523051b3bff,2020-03-10T18:24:13Z,2020-03-10T19:32:13Z,ambulatory,185345009,Encounter for symptom (procedure),bdf7af8e-c765-1e02-dc9d-24b49ad290a3,840544004,Suspected COVID-19


In [42]:
dfpos.groupby(["ENCOUNTERCLASS"]).size()

ENCOUNTERCLASS
ambulatory    1946
dtype: int64

In [37]:
#Selektion der positiven Testergebnisse+stationärer Aufenthalt--> nicht vorhanden deswegen kein weitere Auswertung möglich
dfposin = df.loc[(df["DISEASE"] == "840544004") & (df["ENCOUNTERCLASS"] == "inpatient")| (df["DISEASE"] == "840539006") & (df["ENCOUNTERCLASS"] == "inpatient")]


In [38]:
print(dfposin.shape)
dfposin.head(6)

(0, 9)


Unnamed: 0,ENCOUNTER_ID,START,STOP,ENCOUNTERCLASS,CODE,DESCRIPTION,PATIENT_ID,DISEASE,DESCRIPTION.1
