In [46]:
# This notebook reads in the discretised input data and then preprocesses the model features
# Firstly, values deemed excessively high/low are capped
# Relevant binary features and normally/log-normally features are standardised accordingly
# Training and test sets are split - 70% train, 10% validation, 20% test
# Resulting datasets are saved to file.

In [47]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas import DataFrame

In [48]:
disc_inp_data = pd.read_csv("../data/discretised_input_data.csv")   #

In [58]:
disc_inp_data.items

<bound method DataFrame.items of         bloc  icustayid   charttime  gender           age  elixhauser  \
0          1          3  7245486000       0  17639.826435           0   
1          2          3  7245500400       0  17639.826435           0   
2          3          3  7245514800       0  17639.826435           0   
3          4          3  7245529200       0  17639.826435           0   
4          5          3  7245543600       0  17639.826435           0   
...      ...        ...         ...     ...           ...         ...   
278881     9      99995  4612888440       0   8538.739340           0   
278882    10      99995  4612902840       0   8538.739340           0   
278883    11      99995  4612917240       0   8538.739340           0   
278884    12      99995  4612931640       0   8538.739340           0   
278885    13      99995  4612946040       0   8538.739340           0   

        re_admission  died_in_hosp  died_within_48h_of_out_time  \
0                  0   

In [49]:
# add rewards - sparsely for now; reward function shaping comes in a separate script
disc_inp_data['reward'] = 0
for i in disc_inp_data.index:
    if i == 0:
        continue
    else:
        if disc_inp_data.loc[i, 'icustayid'] != disc_inp_data.loc[i-1, 'icustayid']:
            if disc_inp_data.loc[i-1, 'died_in_hosp'] == 1:
                disc_inp_data.loc[i-1,'reward'] = -100
            elif disc_inp_data.loc[i-1, 'died_in_hosp'] == 0:
                disc_inp_data.loc[i-1,'reward'] = 100
            else:
                print ("error in row", i-1)
if disc_inp_data.loc[len(disc_inp_data)-1, 'died_in_hosp'] == 1:
    disc_inp_data.loc[len(disc_inp_data)-1, 'reward'] = -100
elif disc_inp_data.loc[len(disc_inp_data)-1, 'died_in_hosp'] == 0:
     disc_inp_data.loc[len(disc_inp_data)-1, 'reward'] = 100
print (disc_inp_data['reward'].value_counts())

 0      257927
 100     18070
-100      2889
Name: reward, dtype: int64


In [51]:
# now split into train/validation/test sets
import random
unique_ids = disc_inp_data['icustayid'].unique()
random.shuffle(unique_ids)
train_sample = 0.7
val_sample = 0.1
test_sample = 0.2
train_num = int(len(unique_ids) * 0.7)
val_num = int(len(unique_ids)*0.1) + train_num
train_ids = unique_ids[:train_num]
val_ids = unique_ids[train_num:val_num]
test_ids = unique_ids[val_num:]

In [52]:
train_set = DataFrame()
train_set = disc_inp_data.loc[disc_inp_data['icustayid'].isin(train_ids)]

val_set = DataFrame()
val_set = disc_inp_data.loc[disc_inp_data['icustayid'].isin(val_ids)]

test_set = DataFrame()
test_set = disc_inp_data.loc[disc_inp_data['icustayid'].isin(test_ids)]

In [62]:
#cap values in train and test
caps = pd.read_csv("capping_values.csv")
print(caps.index)
for i in caps.index:
    param = caps.loc[i,'Parameter'][1:-1]
    print(param)
    maxval = caps.loc[i,'maxval']
    minval = caps.loc[i,'minval']
    train_set[param][train_set[param] >= maxval] = maxval
    train_set[param][train_set[param] <= minval] = minval
    val_set[param][val_set[param] >= maxval] = maxval
    val_set[param][val_set[param] <= minval] = minval
    test_set[param][test_set[param] >= maxval] = maxval
    test_set[param][test_set[param] <= minval] = minval

RangeIndex(start=0, stop=44, step=1)
g


KeyError: 'g'

In [34]:
binary_fields = ['gender','mechvent','re_admission']
norm_fields= ['age','Weight_kg','GCS','HR','SysBP','MeanBP','DiaBP','RR','Temp_C','FiO2_1',
    'Potassium','Sodium','Chloride','Glucose','Magnesium','Calcium',
    'Hb','WBC_count','Platelets_count','PTT','PT','Arterial_pH','paO2','paCO2',
    'Arterial_BE','HCO3','Arterial_lactate','SOFA','SIRS','Shock_Index',
    'PaO2_FiO2','cumulated_balance', 'elixhauser', 'Albumin', u'CO2_mEqL', 'Ionised_Ca']
log_fields = ['max_dose_vaso','SpO2','BUN','Creatinine','SGOT','SGPT','Total_bili','INR',
              'input_total','input_4hourly','output_total','output_4hourly', 'bloc']

In [35]:
# normalise binary fields
train_set[binary_fields] = train_set[binary_fields] - 0.5 
val_set[binary_fields] = val_set[binary_fields] - 0.5 
test_set[binary_fields] = test_set[binary_fields] - 0.5 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [36]:
train_set[0:50]

Unnamed: 0,bloc,icustayid,charttime,gender,age,elixhauser,re_admission,died_in_hosp,died_within_48h_of_out_time,mortality_90d,...,input_total,input_4hourly,output_total,output_4hourly,cumulated_balance,SOFA,SIRS,vaso_input,iv_input,reward
7,1,11,6898241400,0.5,30766.069028,6,0.5,0,0,0,...,0.0,0.0,0.0,0.0,0.0,12,0,0.0,0.0,0
8,2,11,6898255800,0.5,30766.069028,6,0.5,0,0,0,...,0.0,0.0,460.0,460.0,-460.0,10,0,0.0,0.0,0
9,3,11,6898270200,0.5,30766.069028,6,0.5,0,0,0,...,0.0,0.0,1020.0,560.0,-1020.0,10,1,0.0,0.0,0
10,4,11,6898284600,0.5,30766.069028,6,0.5,0,0,0,...,0.0,0.0,1600.0,580.0,-1600.0,11,1,0.0,0.0,0
11,5,11,6898299000,0.5,30766.069028,6,0.5,0,0,0,...,0.0,0.0,2020.0,420.0,-2020.0,11,1,0.0,0.0,0
12,6,11,6898313400,0.5,30766.069028,6,0.5,0,0,0,...,0.0,0.0,2630.0,610.0,-2630.0,10,1,0.0,0.0,0
13,7,11,6898327800,0.5,30766.069028,6,0.5,0,0,0,...,0.0,0.0,2865.0,235.0,-2865.0,7,1,0.0,0.0,0
14,8,11,6898342200,0.5,30766.069028,6,0.5,0,0,0,...,0.0,0.0,3085.0,220.0,-3085.0,5,1,0.0,0.0,0
15,9,11,6898356600,0.5,30766.069028,6,0.5,0,0,0,...,0.0,0.0,3605.0,520.0,-3605.0,3,1,0.0,0.0,0
16,10,11,6898371000,0.5,30766.069028,6,0.5,0,0,0,...,0.0,0.0,4015.0,410.0,-4015.0,3,1,0.0,0.0,0


In [37]:
# normal distn fields
for item in norm_fields:
    av = train_set[item].mean()
    std = train_set[item].std()
    print(item,av,std)
    train_set[item] = (train_set[item] - av) / std
    val_set[item] = (val_set[item] - av) / std
    test_set[item] = (test_set[item] - av) / std

age 23565.21912320851 6097.14157755618
Weight_kg 83.0633366756617 24.63589579651438
GCS 12.586981516063545 3.424210536774359
HR 87.25062976826653 16.902307444122616
SysBP 119.89554867962566 20.29458238268682
MeanBP 78.18095633712906 13.410841709559834
DiaBP 57.10853240465548 13.242557666734902
RR 20.2658890299689 5.211091455027511
Temp_C 36.91209774582588 2.3649232808054186
FiO2_1 0.4596264919786729 0.18311680275032374
Potassium 4.074671100959275 0.5562551252040637
Sodium 138.73120278732713 4.894762262567318
Chloride 104.76768141037621 6.244858108293621
Glucose 138.94623267352364 50.86820572524071
Magnesium 2.0508813570946276 0.3464138617032827
Calcium 8.310606992635241 0.7972679750761802
Hb 10.286812701315602 1.7344910901788975
WBC_count 12.169611132068125 7.947125371524313
Platelets_count 228.5554703349824 138.22780160549382
PTT 37.72840535832849 19.14744624329115
PT 16.19104321270956 6.673724204279878
Arterial_pH 7.391196141361279 0.07406499068517902
paO2 124.89094002820522 72.24107

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [38]:
# log normal fields
train_set[log_fields] = np.log(0.1 + train_set[log_fields])
val_set[log_fields] = np.log(0.1 + val_set[log_fields])
test_set[log_fields] = np.log(0.1 + test_set[log_fields])
for item in log_fields:
    av = train_set[item].mean()
    std = train_set[item].std()
    print(item,av,std)
    train_set[item] = (train_set[item] - av) / std
    val_set[item] = (val_set[item] - av) / std
    test_set[item] = (test_set[item] - av) / std

max_dose_vaso -2.124978940109556 0.5232597314781121
SpO2 4.5744212777263815 0.028532582654383274
BUN 3.128095549924698 0.7043999391666751
Creatinine 0.19929128030993806 0.6337280491148798
SGOT 3.96685210327164 1.1249114614678657
SGPT 3.700331489766721 1.2107758904239274
Total_bili 0.11158711413831304 1.0985019770680287
INR 0.4097010226713994 0.3211790970813749
input_total 7.625791260622102 2.983955464658244
input_4hourly 2.9857540280049255 3.5022917025285096
output_total 6.782433345845891 3.5135998056453044
output_4hourly 3.94594084942043 3.313139627631117
bloc 1.8864857111668978 0.7814379976754602


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer

In [39]:
train_set.head()

Unnamed: 0,bloc,icustayid,charttime,gender,age,elixhauser,re_admission,died_in_hosp,died_within_48h_of_out_time,mortality_90d,...,input_total,input_4hourly,output_total,output_4hourly,cumulated_balance,SOFA,SIRS,vaso_input,iv_input,reward
7,-2.292153,11,6898241400,0.5,1.181021,0.921676,0.5,0,0,0,...,-3.327254,-1.509965,-2.585673,-1.885983,-0.117188,1.636881,-1.55524,0.0,0.0,0
8,-1.464669,11,6898255800,0.5,1.181021,0.921676,0.5,0,0,0,...,-3.327254,-1.509965,-0.185277,0.659647,-0.151334,1.063631,-1.55524,0.0,0.0,0
9,-0.966274,11,6898270200,0.5,1.181021,0.921676,0.5,0,0,0,...,-3.327254,-1.509965,0.041332,0.719008,-0.192903,1.063631,-0.598577,0.0,0.0,0
10,-0.608492,11,6898284600,0.5,1.181021,0.921676,0.5,0,0,0,...,-3.327254,-1.509965,0.169452,0.729598,-0.235956,1.350256,-0.598577,0.0,0.0,0
11,-0.329195,11,6898299000,0.5,1.181021,0.921676,0.5,0,0,0,...,-3.327254,-1.509965,0.235789,0.632195,-0.267132,1.350256,-0.598577,0.0,0.0,0


In [40]:
train_set.to_csv('../data/rl_train_set_unscaled.csv',index = False)
val_set.to_csv('../data/rl_val_set_unscaled.csv', index = False)
test_set.to_csv('../data/rl_test_set_unscaled.csv', index = False)

In [41]:
# scale features to [0,1] in train set, similar in val and test
import copy
scalable_fields = copy.deepcopy(binary_fields)
scalable_fields.extend(norm_fields)
scalable_fields.extend(log_fields)
for col in scalable_fields:
    minimum = min(train_set[col])
    maximum = max(train_set[col])
    train_set[col] = (train_set[col] - minimum)/(maximum-minimum)
    val_set[col] = (val_set[col] - minimum)/(maximum-minimum)
    test_set[col] = (test_set[col] - minimum)/(maximum-minimum)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [45]:
train_set[0:50]

Unnamed: 0,bloc,icustayid,charttime,gender,age,elixhauser,re_admission,died_in_hosp,died_within_48h_of_out_time,mortality_90d,...,input_total,input_4hourly,output_total,output_4hourly,cumulated_balance,SOFA,SIRS,vaso_input,iv_input,reward
7,0.0,11,6898241400,1.0,0.902327,0.428571,1.0,0,0,0,...,0.0,0.0,0.0,0.0,0.223817,0.5,0.0,0.0,0.0,0
8,0.22256,11,6898255800,1.0,0.902327,0.428571,1.0,0,0,0,...,0.0,0.0,0.574861,0.707254,0.223281,0.416667,0.0,0.0,0.0,0
9,0.356608,11,6898270200,1.0,0.902327,0.428571,1.0,0,0,0,...,0.0,0.0,0.629131,0.723746,0.222629,0.416667,0.25,0.0,0.0,0
10,0.452837,11,6898284600,1.0,0.902327,0.428571,1.0,0,0,0,...,0.0,0.0,0.659814,0.726688,0.221953,0.458333,0.25,0.0,0.0,0
11,0.527957,11,6898299000,1.0,0.902327,0.428571,1.0,0,0,0,...,0.0,0.0,0.675701,0.699627,0.221464,0.458333,0.25,0.0,0.0,0
12,0.589582,11,6898313400,1.0,0.902327,0.428571,1.0,0,0,0,...,0.0,0.0,0.693687,0.730916,0.220754,0.416667,0.25,0.0,0.0,0
13,0.641832,11,6898327800,1.0,0.902327,0.428571,1.0,0,0,0,...,0.0,0.0,0.69952,0.650949,0.22048,0.291667,0.25,0.0,0.0,0
14,0.687185,11,6898342200,1.0,0.902327,0.428571,1.0,0,0,0,...,0.0,0.0,0.704562,0.645421,0.220224,0.208333,0.25,0.0,0.0,0
15,0.727252,11,6898356600,1.0,0.902327,0.428571,1.0,0,0,0,...,0.0,0.0,0.715179,0.717533,0.219618,0.125,0.25,0.0,0.0,0
16,0.763137,11,6898371000,1.0,0.902327,0.428571,1.0,0,0,0,...,0.0,0.0,0.722521,0.697606,0.219141,0.125,0.25,0.0,0.0,0


In [43]:
train_set.to_csv('../data/rl_train_set_scaled.csv',index = False)
val_set.to_csv('../data/rl_val_set_scaled.csv', index = False)
test_set.to_csv('../data/rl_test_set_scaled.csv', index = False)