In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pymc3 as pm

%matplotlib inline

In [2]:
%%bash
ls data

Outcomes-a.txt
PhysioNet_Computing in_Cardiology_Challenge_2012.pdf
Reference.txt
seta_data.csv


## Exploratory Data Analysis

Questions:

- What is the more common outcome? Survival or death?
- What are the general ranges of each field?
- Which fields are most predictive of the outcome?
- When does missingness happen?
- Does missingness tell us inform us about the probability of survival? Or is it missing completely at random?
- Is there a subtype of people that the model is doing poorly on? Is there something in common among people that are mislabeled?
- Does the length of time tell us something about the measurements tell us something about survival?
- Should we see the times?
- What are the "normal" ranges for variable, given the literature? How much does an individual stay in (or outside) the "normal" range? given the 48 hours???



In [3]:
seta_data = pd.read_csv('data/seta_data.csv')
seta_data

Unnamed: 0,PATIENT_ID,Parameter,Time,Value
0,132539,RecordID,00:00,132539.00
1,132539,Age,00:00,54.00
2,132539,Gender,00:00,0.00
3,132539,Height,00:00,-1.00
4,132539,ICUType,00:00,4.00
5,132539,Weight,00:00,-1.00
6,132539,GCS,00:07,15.00
7,132539,HR,00:07,73.00
8,132539,NIDiasABP,00:07,65.00
9,132539,NIMAP,00:07,92.33


In [37]:
outcomes.groupby('Survival').count()

Unnamed: 0_level_0,RecordID,SAPS-I,SOFA,Length_of_stay,In-hospital_death
Survival,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
-1,2526,2526,2526,2526,2526
0,3,3,3,3,3
1,26,26,26,26,26
2,57,57,57,57,57
3,52,52,52,52,52
4,39,39,39,39,39
5,45,45,45,45,45
6,27,27,27,27,27
7,30,30,30,30,30
8,28,28,28,28,28


In [33]:
seta_data['Time'] > '00:00'

PATIENT_ID
132539    False
132539    False
132539    False
132539    False
132539    False
132539    False
132539     True
132539     True
132539     True
132539     True
132539     True
132539     True
132539     True
132539     True
132539     True
132539     True
132539     True
132539     True
132539     True
132539     True
132539     True
132539     True
132539     True
132539     True
132539     True
132539     True
132539     True
132539     True
132539     True
132539     True
          ...  
142673     True
142673     True
142673     True
142673     True
142673     True
142673     True
142673     True
142673     True
142673     True
142673     True
142673     True
142673     True
142673     True
142673     True
142673     True
142673     True
142673     True
142673     True
142673     True
142673     True
142673     True
142673     True
142673     True
142673     True
142673     True
142673     True
142673     True
142673     True
142673     True
142673     True
Name: Time, L

In [29]:
outcomes = pd.read_csv('data/Outcomes-a.txt')

In [5]:
num_survivals = outcomes.groupby('In-hospital_death').count()['Survival']

In [6]:
num_survivals

In-hospital_death
0    3446
1     554
Name: Survival, dtype: int64

In [26]:
in_hospital_death_outcomes = outcomes[outcomes['In-hospital_death'] == 1]
not_in_hospital_death_outcomes = outcomes[outcomes['In-hospital_death'] == 0]

In [9]:
print(str(num_survivals[0] / (num_survivals[0] + num_survivals[1])) + ' of the sample survive.')

0.8615 of the sample survive.


In [10]:
seta_data.index = seta_data.PATIENT_ID

In [11]:
unique_patient_ids = seta_data.index.unique()

We could compare the values of those who did survive vs those who didn't survive, for each category. See if there are any patterns.

When there's a "-1", maybe we could just skip it?

Prior knowledge: 

1. People are more likely to die if their measurements are too high or too low (i.e. not normal).
2. Huge variability of measurements within an individual could indicate severity of a problem. Severity obviously influences mortality.
3. Where they end up affects the types of measurements that are collected, which might give us information about severity.

We could maybe find the optimal number of samples. Then we could take the average (or some statistic) of that certain section so we have a standardized data set.

In [13]:
{
    'HR': [60, 100] # https://www.mayoclinic.org/healthy-lifestyle/fitness/expert-answers/heart-rate/faq-20057979
    
}

{'HR': [60, 100]}

In [14]:
unique_parameters = seta_data['Parameter'].unique()
params_without_record_id = list(set(unique_parameters) - set(['RecordID']))

In [30]:
in_hospital_death_data = seta_data[seta_data.index.isin(in_hospital_death_outcomes['RecordID'])]
not_in_hospital_death_data = seta_data[seta_data.index.isin(not_in_hospital_death_outcomes['RecordID'])]

### plot data

In [19]:
def plot(parameter, data, ax):
    for patient_id in data.index.unique():
        patient = data.loc[patient_id]
        param = patient['Parameter'] == parameter
        if param.sum() > 0:
            # patient[param ]
            patient[param].plot(x='Time', y='Value', ax=ax, alpha=0.3, legend=False)

In [22]:
params_without_record_id

['ALT',
 'SysABP',
 'HCT',
 'Platelets',
 'TroponinT',
 'HR',
 'PaO2',
 'Lactate',
 'Temp',
 'Glucose',
 'GCS',
 'K',
 'Albumin',
 'MechVent',
 'HCO3',
 'MAP',
 'NIMAP',
 'SaO2',
 'Height',
 'Bilirubin',
 'FiO2',
 'Creatinine',
 'Urine',
 'TroponinI',
 'NIDiasABP',
 'Na',
 'NISysABP',
 'pH',
 'Weight',
 'ALP',
 'Gender',
 'Cholesterol',
 'WBC',
 'PaCO2',
 'AST',
 'ICUType',
 'Mg',
 'RespRate',
 'BUN',
 'DiasABP',
 'Age']

In [60]:
mean = seta_data.groupby((seta_data.index, 'Parameter')).mean()
mean

  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,Unnamed: 1_level_0,PATIENT_ID,Value
PATIENT_ID,Parameter,Unnamed: 2_level_1,Unnamed: 3_level_1
132539,Age,132539,54.000000
132539,BUN,132539,10.500000
132539,Creatinine,132539,0.750000
132539,GCS,132539,14.923077
132539,Gender,132539,0.000000
132539,Glucose,132539,160.000000
132539,HCO3,132539,27.000000
132539,HCT,132539,32.500000
132539,HR,132539,70.810811
132539,Height,132539,-1.000000


Convert -1 to None so when we take aggregates, we don't make it seem that missing is basically almost the same as 0.

In [103]:
mean_reset = mean[['Value']].reset_index()

In [104]:
unique_parameters

array(['RecordID', 'Age', 'Gender', 'Height', 'ICUType', 'Weight', 'GCS',
       'HR', 'NIDiasABP', 'NIMAP', 'NISysABP', 'RespRate', 'Temp',
       'Urine', 'HCT', 'BUN', 'Creatinine', 'Glucose', 'HCO3', 'Mg',
       'Platelets', 'K', 'Na', 'WBC', 'pH', 'PaCO2', 'PaO2', 'DiasABP',
       'FiO2', 'MAP', 'MechVent', 'SysABP', 'SaO2', 'Albumin', 'ALP',
       'ALT', 'AST', 'Bilirubin', 'Lactate', 'Cholesterol', 'TroponinI',
       'TroponinT'], dtype=object)

In [121]:
def get_series(data, parameter, prefix, patient_id_column='PATIENT_ID'):
    """
        Returns a dataframe where the only column is the parameter. 
        Indexed by patient_id_column.
        
        Parameters:
            data: pandas.df
            
            parameter: string.
                name of the parameter (e.g. Age)
            
            patient_id_column: string. (Optional)
                the name of the column that identifies the patient
    """
    
    df = data[data['Parameter'] == parameter]
    grouped = df.groupby(patient_id_column).sum()
    grouped.columns = [prefix + parameter]
    
    return grouped

In [122]:
get_series(mean_reset, 'Age', prefix='mean_')

Unnamed: 0_level_0,mean_Age
PATIENT_ID,Unnamed: 1_level_1
132539,54.0
132540,76.0
132541,44.0
132543,68.0
132545,88.0
132547,64.0
132548,68.0
132551,78.0
132554,64.0
132555,74.0


In [125]:
mean_reset

Unnamed: 0,PATIENT_ID,Parameter,Value
0,132539,Age,54.000000
1,132539,BUN,10.500000
2,132539,Creatinine,0.750000
3,132539,GCS,14.923077
4,132539,Gender,0.000000
5,132539,Glucose,160.000000
6,132539,HCO3,27.000000
7,132539,HCT,32.500000
8,132539,HR,70.810811
9,132539,Height,-1.000000


In [128]:
def generate_aggregated_table(data, parameters, prefix, patient_id_column='PATIENT_ID'):
    """
        Produces a dataframe where columns are of aggregated quantities
        
        parameters:
            data: pandas.DataFrame
                Contains a patient id column (indicated by patient_id_column parameter),
                a Parameter column, and a Value column
                
            parameters: array of strings
                Contains the names of the parameters
                
            prefix: string
                The prefix to prepend to the column names. Used to indicate what type
                of aggregation there is (e.g. 'mean', 'var')
                
            patient_id_column: string (Optional)
                The name of the column that stands for the patient ID.
                Defaults to 'PATIENT_ID'
                
        returns: pandas.DataFrame
            Columns are of aggregated quantities
            
    """
    
    collection_df = get_series(data, parameters[0], prefix=prefix)
    
    for i in range(1, len(parameters)):
        
        collection_df = pd.merge(
            left=collection_df,
            right=get_series(data, parameters[i], prefix=prefix),
            how='left',
            on=patient_id_column
        )
        
    return pd.DataFrame(collection_df)

In [130]:
mean_data = generate_aggregated_table(mean_reset, unique_parameters, prefix='mean')
mean_data

Unnamed: 0_level_0,meanRecordID,meanAge,meanGender,meanHeight,meanICUType,meanWeight,meanGCS,meanHR,meanNIDiasABP,meanNIMAP,...,meanSaO2,meanAlbumin,meanALP,meanALT,meanAST,meanBilirubin,meanLactate,meanCholesterol,meanTroponinI,meanTroponinT
PATIENT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
132539,132539.0,54.0,0.0,-1.0,4.0,-1.000000,14.923077,70.810811,50.147059,71.559118,...,,,,,,,,,,
132540,132540.0,76.0,1.0,175.3,2.0,80.670588,13.333333,80.794118,56.714286,75.308571,...,96.833333,,,,,,,,,
132541,132541.0,44.0,0.0,-1.0,3.0,56.700000,5.923077,83.759259,79.000000,96.751316,...,95.000000,2.50,116.000000,83.000000,199.500000,2.900000,1.366667,,,
132543,132543.0,68.0,1.0,180.3,3.0,84.600000,14.944444,70.983333,65.051724,83.885517,...,,4.40,105.000000,12.000000,15.000000,0.200000,,,,
132545,132545.0,88.0,0.0,-1.0,3.0,-1.000000,15.000000,74.958333,45.720930,74.946512,...,,3.30,,,,,,,,
132547,132547.0,64.0,1.0,180.3,1.0,114.000000,8.666667,88.531915,70.500000,81.985000,...,97.000000,,101.000000,52.500000,104.500000,0.400000,,212.0,1.300000,
132548,132548.0,68.0,0.0,162.6,3.0,87.000000,15.000000,68.338983,72.000000,102.147143,...,,,,,,,,,0.750000,
132551,132551.0,78.0,0.0,162.6,3.0,48.400000,11.846154,70.945205,30.697674,55.177907,...,96.400000,1.90,47.000000,46.000000,82.000000,0.300000,1.637500,,3.300000,
132554,132554.0,64.0,0.0,-1.0,3.0,60.700000,15.000000,127.239130,64.478261,84.477391,...,,,,,,,,,,
132555,132555.0,74.0,1.0,175.3,2.0,68.582759,14.083333,85.189655,53.000000,75.670000,...,97.333333,,,,,,,,,


In [133]:
mean_data.isnull().describe()

Unnamed: 0,meanRecordID,meanAge,meanGender,meanHeight,meanICUType,meanWeight,meanGCS,meanHR,meanNIDiasABP,meanNIMAP,...,meanSaO2,meanAlbumin,meanALP,meanALT,meanAST,meanBilirubin,meanLactate,meanCholesterol,meanTroponinI,meanTroponinT
count,4000,4000,4000,4000,4000,4000,4000,4000,4000,4000,...,4000,4000,4000,4000,4000,4000,4000,4000,4000,4000
unique,1,1,1,1,1,1,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
top,False,False,False,False,False,False,False,False,False,False,...,True,True,True,True,True,True,False,True,True,True
freq,4000,4000,4000,4000,4000,4000,3936,3937,3483,3481,...,2208,2385,2310,2279,2275,2282,2183,3695,3795,3137


In [131]:
mean_data.describe()

Unnamed: 0,meanRecordID,meanAge,meanGender,meanHeight,meanICUType,meanWeight,meanGCS,meanHR,meanNIDiasABP,meanNIMAP,...,meanSaO2,meanAlbumin,meanALP,meanALT,meanAST,meanBilirubin,meanLactate,meanCholesterol,meanTroponinI,meanTroponinT
count,4000.0,4000.0,4000.0,4000.0,4000.0,4000.0,3936.0,3937.0,3483.0,3481.0,...,1792.0,1615.0,1690.0,1721.0,1725.0,1718.0,2183.0,305.0,205.0,863.0
mean,137605.122,64.2475,0.56075,88.919475,2.76,76.809694,11.604248,87.159565,57.351445,76.300441,...,96.564943,2.971867,104.634113,171.741809,240.465689,1.902148,2.319835,156.57377,6.968969,1.065439
std,2923.608886,17.560946,0.497867,86.531203,1.002572,31.903953,3.25569,14.603828,11.702949,12.106704,...,3.393059,0.636383,106.377076,642.7564,887.235095,4.426709,1.628966,46.333551,9.602334,2.622287
min,132539.0,15.0,-1.0,-1.0,1.0,-1.0,3.0,42.784314,0.0,0.0,...,38.8,1.1,12.0,3.0,6.0,0.1,0.4,28.0,0.3,0.01
25%,135075.75,52.75,0.0,-1.0,2.0,63.2,9.25,77.066667,49.264232,67.9335,...,96.0,2.5,57.0,17.0,25.0,0.4,1.35,123.0,0.8,0.0495
50%,137592.5,67.0,1.0,152.4,3.0,78.300357,12.428571,86.325581,56.275862,74.786,...,97.25,3.0,78.0,31.0,46.0,0.7,1.883333,153.0,2.133333,0.14
75%,140100.25,78.0,1.0,170.2,4.0,93.058712,14.770105,96.701754,64.591667,83.405435,...,98.0,3.4,110.0,70.0,104.5,1.4,2.75,188.0,9.55,0.765
max,142673.0,90.0,1.0,431.8,4.0,300.0,15.0,137.842105,107.5,132.54,...,100.0,5.3,1472.333333,9143.428571,15680.0,46.366667,24.8,330.0,49.2,24.04


In [None]:
params_to_plot = params_without_record_id
fig, ax = plt.subplots(len(params_to_plot),2, figsize=(20,len(params_to_plot) * 3))


for index, param in enumerate(params_to_plot):
    survive_axis = ax[index, 0]
    plot(parameter=param, data=survivor_data, ax=survive_axis)
#     survive_axis.set_xlim(0,100)
    survive_axis.set_title('Survived: ' + param)
    
    die_axis = ax[index, 1]
    plot(parameter=param, data=dead_data, ax=die_axis)
#     die_axis.set_xlim(0,100)
    die_axis.set_title('Died: ' + param)

plt.tight_layout()


in singular transformations; automatically expanding.
left=0.0, right=0.0
  ax.set_xlim(left, right)


KeyboardInterrupt: 

ERROR:root:Invalid alias: The name clear can't be aliased because it is another magic command.
ERROR:root:Invalid alias: The name more can't be aliased because it is another magic command.
ERROR:root:Invalid alias: The name less can't be aliased because it is another magic command.
ERROR:root:Invalid alias: The name man can't be aliased because it is another magic command.


In [26]:
pd.merge(left=seta_data, right=outcomes, left_on='PATIENT_ID', right_on='RecordID')

Unnamed: 0,PATIENT_ID,Parameter,Time,Value,RecordID,SAPS-I,SOFA,Length_of_stay,Survival,In-hospital_death
0,132539,RecordID,00:00,132539.00,132539,6,1,5,-1,0
1,132539,Age,00:00,54.00,132539,6,1,5,-1,0
2,132539,Gender,00:00,0.00,132539,6,1,5,-1,0
3,132539,Height,00:00,-1.00,132539,6,1,5,-1,0
4,132539,ICUType,00:00,4.00,132539,6,1,5,-1,0
5,132539,Weight,00:00,-1.00,132539,6,1,5,-1,0
6,132539,GCS,00:07,15.00,132539,6,1,5,-1,0
7,132539,HR,00:07,73.00,132539,6,1,5,-1,0
8,132539,NIDiasABP,00:07,65.00,132539,6,1,5,-1,0
9,132539,NIMAP,00:07,92.33,132539,6,1,5,-1,0


In [7]:
seta_data['Parameter'].unique()

array(['RecordID', 'Age', 'Gender', 'Height', 'ICUType', 'Weight', 'GCS',
       'HR', 'NIDiasABP', 'NIMAP', 'NISysABP', 'RespRate', 'Temp',
       'Urine', 'HCT', 'BUN', 'Creatinine', 'Glucose', 'HCO3', 'Mg',
       'Platelets', 'K', 'Na', 'WBC', 'pH', 'PaCO2', 'PaO2', 'DiasABP',
       'FiO2', 'MAP', 'MechVent', 'SysABP', 'SaO2', 'Albumin', 'ALP',
       'ALT', 'AST', 'Bilirubin', 'Lactate', 'Cholesterol', 'TroponinI',
       'TroponinT'], dtype=object)