In [25]:
import functools
import numpy as np
import pandas as pd
from scipy.stats import kstest
import matplotlib.pyplot as plt
import pylab as pl
import psycopg2
from tableone import TableOne
%matplotlib inline
import datetime
plt.style.use('ggplot')

In [26]:
# Load the extracted table from disk; the first CSV column is used as the row index.
extracted_data = pd.read_csv(
    filepath_or_buffer="./extracted_data.csv",
    index_col=0,
)

In [27]:
# List every column label available in the extracted table.
extracted_data.columns

Index(['subject_id', 'hadm_id', 'icustay_id', 'icd9_code', 'age', 'gender',
       'icustay_seq', 'los_hospital', 'ethnicity', 'admission_type', 'los_icu',
       'mort_hosp', 'comobidities', 'diabetes', 'ckd', 'cih', 'arf',
       'cardiop_disease', 'liver_disease', 'ibd', 'malignancy', 'aniongap',
       'albumin', 'bands', 'bicarbonate', 'bilirubin', 'creatinine',
       'chloride', 'glucose_lab', 'hematocrit', 'hemoglobin', 'lactate',
       'platelet', 'potassium', 'ptt', 'inr', 'pt', 'sodium', 'bun', 'wbc',
       'calcium_1st', 'freecalcium_1st', 'heartrate', 'resprate', 'glucose',
       'spo2', 'tempc', 'sysbp', 'diasbp', 'meanbp', 'vent', 'urineoutput',
       'oasis', 'gcs'],
      dtype='object')

In [28]:
# Preview the first five rows of the extracted table (5 is the default for head()).
extracted_data.head(n=5)

Unnamed: 0,subject_id,hadm_id,icustay_id,icd9_code,age,gender,icustay_seq,los_hospital,ethnicity,admission_type,...,glucose,spo2,tempc,sysbp,diasbp,meanbp,vent,urineoutput,oasis,gcs
0,46489,152416,200035,845,30.6612,1,1,18.4188,BLACK/AFRICAN AMERICAN,EMERGENCY,...,83.0,92.0,36.277778,126.0,74.0,83.0,0,2950.0,24,15.0
1,69995,164810,200153,845,51.5414,0,1,10.5861,WHITE,EMERGENCY,...,736.0,100.0,36.888889,87.0,46.0,56.0,0,1820.0,21,15.0
2,16687,114879,200206,845,58.7653,1,1,10.6549,WHITE,EMERGENCY,...,85.0,100.0,36.222221,86.0,36.0,52.666698,0,,29,15.0
3,67348,179548,200550,845,79.5328,1,1,21.9326,WHITE,EMERGENCY,...,194.0,97.0,36.277778,130.0,75.0,90.0,0,2075.0,23,15.0
4,27875,145333,200608,845,71.1797,0,1,9.6931,WHITE,EMERGENCY,...,178.0,97.0,38.555556,138.0,55.0,76.0,0,2585.0,28,15.0


In [29]:
# (number of rows, number of columns) of the extracted table.
extracted_data.shape

(1315, 54)

In [48]:
# Columns kept when building `data` from `extracted_data`.
# The group labels below are inferred from the column names only.
# NOTE: 'comobidities' (sic) is the literal column name in the CSV -- keep the spelling.
# NOTE: later cells rebind the name `columns`; re-run this cell before re-running
# the cell that builds `data`.
columns = [
    # ICU stay identifier
    "icustay_id",
    # demographics
    "age",
    "gender",
    # comorbidity indicators (presumably 0/1 flags -- confirm against the extraction step)
    "comobidities",
    "ckd",
    "cih",
    "arf",
    "diabetes",
    "cardiop_disease",
    "liver_disease",
    "ibd",
    "malignancy",
    # laboratory measurements
    "aniongap",
    "albumin",
    "bands",
    "bicarbonate",
    "bilirubin",
    "creatinine",
    "chloride",
    "glucose_lab",
    "hematocrit",
    "hemoglobin",
    "lactate",
    "platelet",
    "potassium",
    "ptt",
    "inr",
    "pt",
    "sodium",
    "bun",
    "wbc",
    "calcium_1st",
    "freecalcium_1st",
    # bedside measurements
    "heartrate",
    "resprate",
    "glucose",
    "spo2",
    "tempc",
    "sysbp",
    "diasbp",
    "meanbp",
    "vent",
    "urineoutput",
    # scores
    "oasis",
    "gcs",
    # outcome column (used as the grouping variable in the summary tables)
    "mort_hosp",
]

In [49]:
# Restrict the extracted table to the columns listed in `columns` (same order).
# Take an explicit copy so `data` is independent of `extracted_data`: later edits
# to `data` then neither raise SettingWithCopyWarning nor depend on view/copy rules.
data = extracted_data[columns].copy()

In [50]:
# Persist the selected columns; the row index is written as the first CSV column.
data.to_csv(path_or_buf="./final_data.csv", index=True)

In [51]:
# Preview the first five rows of the reduced table (5 is the default for head()).
data.head(n=5)

Unnamed: 0,icustay_id,age,gender,comobidities,ckd,cih,arf,diabetes,cardiop_disease,liver_disease,...,spo2,tempc,sysbp,diasbp,meanbp,vent,urineoutput,oasis,gcs,mort_hosp
0,200035,30.6612,1,1,0,0,0,1,0,0,...,92.0,36.277778,126.0,74.0,83.0,0,2950.0,24,15.0,1
1,200153,51.5414,0,0,0,0,0,0,0,0,...,100.0,36.888889,87.0,46.0,56.0,0,1820.0,21,15.0,0
2,200206,58.7653,1,0,0,0,1,0,0,0,...,100.0,36.222221,86.0,36.0,52.666698,0,,29,15.0,1
3,200550,79.5328,1,0,0,0,0,0,0,0,...,97.0,36.277778,130.0,75.0,90.0,0,2075.0,23,15.0,0
4,200608,71.1797,0,1,1,0,1,1,0,0,...,97.0,38.555556,138.0,55.0,76.0,0,2585.0,28,15.0,0


In [55]:
# Overall (ungrouped) summary table: no `groupby` is passed to TableOne, so each
# variable is described for the whole sample and `mort_hosp` is reported as a
# categorical row of its own.
# NOTE(review): a saved output under this cell that shows "Grouped by mort_hosp"
# columns with p-values was produced by an earlier, grouped version of this
# call -- re-run the cell to refresh it.

# Columns to include in the summary table
columns = ['age', 'gender', 'comobidities',
           'aniongap', 'albumin', 'bands', 'bicarbonate', 'bilirubin',
           'creatinine', 'chloride',
           'glucose_lab', 'hematocrit', 'hemoglobin', 'lactate',
           'platelet', 'potassium', 'ptt', 'inr', 'pt', 'sodium',
           'bun', 'wbc', 'calcium_1st', 'freecalcium_1st',
           'heartrate', 'resprate', 'glucose', 'spo2', 'tempc', 'sysbp',
           'diasbp', 'meanbp', 'vent', 'urineoutput',
           'oasis', 'gcs', 'mort_hosp']

# List of categorical variables
categorical = ['gender', 'vent', 'comobidities', 'mort_hosp']

# Show only the `limit` most frequent levels of each categorical variable
limit = 4

# No p-values: the table is not grouped, so there are no groups to compare
pval = False

# Display a count of null values
isnull = True

t = TableOne(data, columns=columns, categorical=categorical, limit=limit,
             pval=pval, isnull=isnull)

t.tableone

Unnamed: 0_level_0,Unnamed: 1_level_0,Grouped by mort_hosp,Grouped by mort_hosp,Grouped by mort_hosp,Grouped by mort_hosp,Grouped by mort_hosp
Unnamed: 0_level_1,Unnamed: 1_level_1,isnull,0,1,pval,ptest
variable,level,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
n,,,1074,241,,
age,,0.0,"69.20 [57.64,79.08]","73.56 [61.35,81.27]",0.010,Kruskal-Wallis
gender,0.0,0.0,545 (50.74),104 (43.15),0.039,Chi-squared
gender,1.0,,529 (49.26),137 (56.85),,
comobidities,0.0,0.0,699 (65.08),134 (55.6),0.007,Chi-squared
comobidities,1.0,,375 (34.92),107 (44.4),,
aniongap,,2.0,"15.00 [13.00,18.00]","16.00 [13.00,20.00]",<0.001,Kruskal-Wallis
albumin,,305.0,"2.80 [2.40,3.20]","2.50 [2.00,2.90]",<0.001,Kruskal-Wallis
bands,,786.0,"5.00 [2.00,11.00]","4.00 [2.00,11.00]",0.184,Kruskal-Wallis
bicarbonate,,2.0,"24.00 [21.00,27.00]","21.00 [18.00,25.00]",<0.001,Kruskal-Wallis


In [53]:
# Summary table stratified by `mort_hosp`, with p-values comparing the groups.
# No `nonnormal` list is passed to TableOne here; the saved output for this cell
# reports continuous variables as mean (SD) with two-sample t-tests.

# --- table options ---
groupby = "mort_hosp"  # group the data by this column
pval = True            # compute p-values
isnull = True          # display a count of null values
limit = 4              # show only the `limit` most frequent levels per categorical variable

# Variables summarised as categories
categorical = ["gender", "vent", "comobidities"]

# Columns to include in the summary table (the grouping column is not listed)
columns = [
    "age", "gender", "comobidities",
    "aniongap", "albumin", "bands", "bicarbonate", "bilirubin",
    "creatinine", "chloride",
    "glucose_lab", "hematocrit", "hemoglobin", "lactate",
    "platelet", "potassium", "ptt", "inr", "pt", "sodium",
    "bun", "wbc", "calcium_1st", "freecalcium_1st",
    "heartrate", "resprate", "glucose", "spo2", "tempc",
    "sysbp", "diasbp", "meanbp", "vent", "urineoutput",
    "oasis", "gcs",
]

t = TableOne(
    data,
    columns=columns,
    categorical=categorical,
    limit=limit,
    groupby=groupby,
    pval=pval,
    isnull=isnull,
)

t.tableone

Unnamed: 0_level_0,Unnamed: 1_level_0,Grouped by mort_hosp,Grouped by mort_hosp,Grouped by mort_hosp,Grouped by mort_hosp,Grouped by mort_hosp
Unnamed: 0_level_1,Unnamed: 1_level_1,isnull,0,1,pval,ptest
variable,level,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
n,,,1074,241,,
age,,0.0,66.96 (15.04),69.52 (14.50),0.014,Two Sample T-test
gender,0.0,0.0,545 (50.74),104 (43.15),0.039,Chi-squared
gender,1.0,,529 (49.26),137 (56.85),,
comobidities,0.0,0.0,699 (65.08),134 (55.6),0.007,Chi-squared
comobidities,1.0,,375 (34.92),107 (44.4),,
aniongap,,2.0,15.73 (4.68),16.99 (5.27),0.001,Two Sample T-test
albumin,,305.0,2.84 (0.64),2.53 (0.71),<0.001,Two Sample T-test
bands,,786.0,9.07 (10.37),8.25 (9.36),0.388,Two Sample T-test
bicarbonate,,2.0,23.77 (5.64),21.60 (6.11),<0.001,Two Sample T-test
