## Data loading and imports

In [254]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
from matplotlib.pyplot import cm
import seaborn as sns

In [255]:
def trunc_length(ds, nb_hours):
#function that truncates the data to only consider the first nb_hours hours
    df = ds.loc[ds.hour_from_intime <=  nb_hours]
    df = df.loc[df.hour_from_intime > 0]
    return df

def create_batchs(ds):
    batchs = []
    ids = ds.stay_id.unique()
    for i in ids:
        batchs.append(ds.loc[ds['stay_id'] == i])
    return batchs

def remove_missing(df, var, threshold):
#remove from batch the entries where a too large proportion of the variables var are missing 
    res = df
    
    
    percent_missing = df.isnull().sum() * 100 / len(df)
    missing_value_df = pd.DataFrame({'column_name': df.columns,
                                    'percent_missing': percent_missing})
    for vital in var: 
        criterion = missing_value_df.loc[missing_value_df.column_name == vital].percent_missing >= threshold 
        if criterion:
            print('entry removed')
            print(missing_value_df.loc[missing_value_df.column_name == vital].percent_missing)
            df.drop([vital], axis = 1)
        else:
            res.append(batch[i])
    return res

def get_column_name(df):
    listn = [col for col in df.columns]
    return listn

def aggregation(batch, rate):
    'function that takes a batch of patients and returns the aggregated vitals with the correct aggregation rate'
    if rate == 1:
        return batch
    elif rate == 24:
        bch = []
        for df in batch:
            df['hour_slice'] = 0
            df['hour_slice'][range(25,49)] = 1
            df = df.groupby('hour_slice').mean()
            bch.append(df)
        return bch
    elif rate == 48:
        bch = []
        for df in batch:
            df['hour_slice'] = 0
            df = df.groupby('hour_slice').mean()
            bch.append(df)
        return bch

def arrange_ids(df1, df2, df3, df4, df5):
    ids1 = df1.stay_id.unique()
    ids2 = df2.stay_id.unique()
    ids3 = df3.stay_id.unique()
    ids4 = df4.stay_id.unique()
    ids5 = df5.stay_id.unique()

    min_ids = list(set(ids1) & set(ids2) & set(ids3) & set(ids4) & set(ids5))
    return df1.loc[df1['stay_id'].isin(min_ids)], df2.loc[df2['stay_id'].isin(min_ids)], df3.loc[df3['stay_id'].isin(min_ids)], df4.loc[df4['stay_id'].isin(min_ids)], df5.loc[df5['stay_id'].isin(min_ids)]


In [256]:
df_hourly = pd.read_csv(r'C:\Users\USER\Documents\Imperial\Summer_project\Azure\data\preprocessed_mimic4_hour.csv', delimiter=',')
df_24h = pd.read_csv(r'C:\Users\USER\Documents\Imperial\Summer_project\Azure\data\preprocessed_mimic4_24hour.csv', delimiter=',')
df_48h = pd.read_csv(r'C:\Users\USER\Documents\Imperial\Summer_project\Azure\data\preprocessed_mimic4_48hour.csv', delimiter=',')
df_med = pd.read_csv(r'C:\Users\USER\Documents\Imperial\Summer_project\Azure\data\preprocessed_mimic4_med.csv', delimiter=',')
df_demographic = pd.read_csv(r'C:\Users\USER\Documents\Imperial\Summer_project\Azure\data\demographics_mimic4.csv', delimiter=',')

df_hourly = df_hourly.drop(columns = ['icu_intime'])
df_24h = df_24h.drop(columns = ['icu_intime'])
df_48h = df_48h.drop(columns = ['icu_intime'])





In [257]:

print(labels)


0      0.0
1      0.0
3      1.0
4      1.0
5      1.0
      ... 
944    0.0
945    1.0
946    1.0
947    0.0
948    1.0
Name: los, Length: 928, dtype: float64


In [258]:
#truncate to only get 48 hours of stay.
df_hourly = trunc_length(df_hourly, 48)
df_24h = trunc_length(df_24h, 2)
df_demographic, df_med, df_hourly, df_24h, df_48h = arrange_ids(df_demographic, df_med, df_hourly, df_24h, df_48h)

#label extraction 
labels = df_demographic.pop('los')
labels[labels < 4] = 0
labels[labels > 4] = 1
print(labels)

#pivot the tables 
df_hourly = df_hourly.pivot_table(index = ['stay_id', 'hour_from_intime'], columns = 'feature_name', values = 'feature_mean_value')
df_24h = df_24h.pivot_table(index = ['stay_id', 'hour_from_intime'], columns = 'feature_name', values = 'feature_mean_value')
df_48h = df_48h.pivot_table(index = ['stay_id'], columns = 'feature_name', values = 'feature_mean_value')
df_med = df_med.pivot_table(index = ['stay_id'], columns = 'med_name', values = 'amount')

#one-hot encoding for the medication and the sex
df_med = df_med.fillna(value = 0)
df_med[df_med > 0] = 1
df_demographic.gender[df_demographic.gender == 'F'] = 1
df_demographic.gender[df_demographic.gender == 'M'] = 0


#create batches 
df_hourly = df_hourly.reset_index(level=['stay_id'])
df_24h = df_24h.reset_index(level=['stay_id'])
df_48h = df_48h.reset_index(level=['stay_id'])
df_med = df_med.reset_index(level=['stay_id'])

batch_hourly = create_batchs(df_hourly)
batch_24h = create_batchs(df_24h)
batch_48h = create_batchs(df_48h)
batch_med = create_batchs(df_med)
batch_demographic = create_batchs(df_demographic)

#reindex for patients that don't have entries at the begginning of their stays 
for i in range(len(batch_24h)):
    batch_hourly[i] = batch_hourly[i].reindex(range(1, 49), fill_value = None) 
    batch_24h[i] = batch_24h[i].reindex(range(1, 3), fill_value = None) 
    batch_hourly[i] = batch_hourly[i].drop(columns = 'stay_id')
    batch_24h[i] = batch_24h[i].drop(columns = 'stay_id')
    batch_48h[i] = batch_48h[i].drop(columns = 'stay_id')
    batch_med[i] = batch_med[i].drop(columns = 'stay_id')
    batch_demographic[i] = batch_demographic[i].drop(columns = 'stay_id')


df_hourly = pd.concat(batch_hourly)
df_24h = pd.concat(batch_24h)

#the stay ids column are dropped since we alreasy took care of them being in the same order for all datasets

df_48h = df_48h.drop(columns = 'stay_id')
df_med = df_med.drop(columns = 'stay_id')


0      0.0
1      0.0
3      1.0
4      1.0
5      1.0
      ... 
944    0.0
945    1.0
946    1.0
947    0.0
948    1.0
Name: los, Length: 928, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labels[labels > 4] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_demographic.gender[df_demographic.gender == 'M'] = 0


In [259]:
#get vitals names
vitals_hourly = get_column_name(df_hourly)
vitals_24h = get_column_name(df_24h)
vitals_48h = get_column_name(df_48h)
med = get_column_name(df_med)

### Data imputation

In [260]:
#first linear inputation and then replaced by mean when it's not possible 
#pas incroyable de recalculer le mean à chaque itération... é changer 

for i in range(len(batch_hourly)):
   batch_hourly[i] = batch_hourly[i].interpolate(limit = 15)
   batch_24h[i] = batch_24h[i].interpolate(limit = 15)

for i in range(len(batch_hourly)):
   batch_hourly[i] = batch_hourly[i].interpolate(limit = 15)
   batch_24h[i] = batch_24h[i].interpolate(limit = 15)
   batch_48h[i] = batch_48h[i].fillna(df_48h.mean())
   batch_48h[i] = batch_24h[i].fillna(df_24h.mean())
   batch_hourly[i] = batch_hourly[i].fillna(df_hourly.mean())
   batch_demographic[i].bmi = batch_demographic[i].bmi.fillna(0)
   batch_demographic[i].gcs = batch_demographic[i].gcs.fillna(df_demographic.gcs.mean())





In [263]:
print(batch_demographic[0])

[  gender  age     bmi  death  gcs
0      0   53  22.784      0    6,   gender  age  bmi  death  gcs
1      1   56  0.0      0    6,   gender  age     bmi  death  gcs
3      1   50  24.285      0    9,   gender  age     bmi  death  gcs
4      1   19  27.588      0    9,   gender  age  bmi  death  gcs
5      1   89  0.0      0    7,   gender  age     bmi  death  gcs
6      1   91  18.819      0   10,   gender  age  bmi  death  gcs
7      0   65  0.0      0   12,   gender  age  bmi  death  gcs
8      1   40  0.0      0    8,   gender  age     bmi  death  gcs
9      1   88  21.479      0   10,    gender  age  bmi  death  gcs
10      0   85  0.0      0    9,    gender  age  bmi  death  gcs
11      0   75  0.0      0   10,    gender  age    bmi  death  gcs
12      1   74  24.94      0   10,    gender  age  bmi  death  gcs
13      1   62  0.0      1   13,    gender  age     bmi  death  gcs
14      0   63  33.927      0   10,    gender  age  bmi  death  gcs
15      1   71  0.0      0    9,   

In [264]:
#feature concatenation 
final_data = np.array([[np.concatenate([np.concatenate(batch_demographic[i].values), np.concatenate(batch_hourly[i].values), np.concatenate(batch_24h[i].values), np.concatenate(batch_48h[i].values), np.concatenate(batch_med[i].values)])] for i in range(len(batch_hourly))])
final_data = np.squeeze(final_data)

In [268]:
# print(np.concatenate(batch_hourly[1].values))
print(final_data.shape)
print(labels.values.shape)

(928, 426)
(928,)
