<a href="https://colab.research.google.com/github/Enrico-Call/RL-AKI/blob/main/2_Data_Aggregation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<img src="https://github.com/AmsterdamUMC/AmsterdamUMCdb/blob/master/img/logo_amds.png?raw=1" alt="Logo" width=128px/>

# VUmc Research Project - Reinforcement Learning for Sepsis Prevention
# Data Aggregation

AmsterdamUMCdb version 1.0.2 March 2020  
Copyright &copy; 2003-2022 Amsterdam UMC - Amsterdam Medical Data Science

## 1. Set up the environment variables for Colab and GoogleBigQuery to access

In [46]:
import os
from google.colab import auth
from IPython.display import display
from google.colab import drive
import os
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
from IPython.display import display
import numpy as np
import warnings
warnings.filterwarnings('ignore')
pd.set_option('use_inf_as_na', True)

drive.mount('/content/drive', force_remount=True)
os.chdir('/content/drive/MyDrive/MLRFH')
 
#sets dateset
PROJECT_ID = 'rl-aki'
DATASET_ID = 'version1_0_2'
LOCATION = 'eu'
 
#all libraries check this environment variable, so set it:
os.environ["GOOGLE_CLOUD_PROJECT"] = PROJECT_ID
 
auth.authenticate_user()
print('Authenticated')

Mounted at /content/drive
Authenticated


## 2. Define Preprocessing Functions

In [52]:
#Some preprocessing functions 

def to_cols(data):

  grouped = data.pivot_table(index=['admissionid', 'time'], 
          columns=['item'], values='value')

  return grouped
  

def to_cols_action(data):

  grouped = data.pivot_table(index=['admissionid', 'time'], 
            columns=['item'], values='administered')

  return grouped

def remove_outliers(data):
  #delete outliers
  data = data.reset_index() #return to single index

  #select outlier cols
  all_cols = ['Kreatinine', 'Kreatinine (bloed)', 'KREAT enzym. (bloed)',
       'UrineSupraPubis', 'UrineIncontinentie', 'Nefrodrain re Uit',
       'Nefrodrain li Uit', 'UrineSpontaan', 'UrineUP', 'UrineSplint Re',
       'UrineSplint Li', 'UrineCAD', 'Chloor (bloed)', 'Natrium (bloed)',
       'Kalium (bloed)', 'HCO3', 'Natrium', 'Natrium Astrup',
       'Kalium Astrup', 'Chloor Astrup', 'Chloor', 'Kalium',
       'Act.HCO3 (bloed)', 'Na (onv.ISE) (bloed)', 'K (onv.ISE) (bloed)',
       'Cl (onv.ISE) (bloed)', 'Niet invasieve bloeddruk gemiddeld',
       'ABP gemiddeld II', 'ABP gemiddeld']
  
  # Natrium
  data['Natrium'][(data['Natrium'] < 65.) & (data['Natrium'] > 165.)] = np.nan
  data['Natrium (bloed)'][(data['Natrium (bloed)'] < 65.) & (data['Natrium (bloed)'] > 165.)] = np.nan
  data['Natrium Astrup'][(data['Natrium Astrup'] < 65.) & (data['Natrium Astrup'] > 165.)] = np.nan
  data['Na (onv.ISE) (bloed)'][(data['Na (onv.ISE) (bloed)'] < 65.) & (data['Na (onv.ISE) (bloed)'] > 165.)] = np.nan
  
  # Mean Blood Pressure
  data['ABP gemiddeld'][(data['ABP gemiddeld'] < 30.) & (data['ABP gemiddeld'] > 165.)] = np.nan
  data['Niet invasieve bloeddruk gemiddeld'][(data['Niet invasieve bloeddruk gemiddeld'] < 30.) & (data['Niet invasieve bloeddruk gemiddeld'] > 165.)] = np.nan
  data['ABP gemiddeld II'][(data['ABP gemiddeld II'] < 30) & (data['ABP gemiddeld II'] > 165)]

  # Kalium
  data['Kalium'][data['Kalium'] > 12.] = np.nan
  data['Kalium (bloed)'][data['Kalium (bloed)'] > 12.] = np.nan
  data['Kalium Astrup'][data['Kalium Astrup'] > 12.] = np.nan
  data['K (onv.ISE) (bloed)'][data['K (onv.ISE) (bloed)'] > 12.] = np.nan

  # Kreatinine
  data['Kreatinine'][data['Kreatinine'] < 30.] = np.nan
  data['Kreatinine (bloed)'][data['Kreatinine (bloed)'] < 30.] = np.nan
  data['KREAT enzym. (bloed)'][data['KREAT enzym. (bloed)'] < 30.] = np.nan

  # Bicarbonate
  data['HCO3'][(data['HCO3'] < 5.) & (data['HCO3'] > 45.)] = np.nan
  data['Act.HCO3 (bloed)'][(data['Act.HCO3 (bloed)'] < 5.) & (data['Act.HCO3 (bloed)'] > 45.)] = np.nan

  # Chloor
  data['Chloor'][(data['Chloor'] < 50.) & (data['Chloor'] > 200.)] = np.nan
  data['Chloor (bloed)'][(data['Chloor (bloed)'] < 50.) & (data['Chloor (bloed)'] > 200.)] = np.nan
  data['Chloor Astrup'][(data['Chloor Astrup'] < 50.) & (data['Chloor Astrup'] > 200.)] = np.nan
  data['Cl (onv.ISE) (bloed)'][(data['Cl (onv.ISE) (bloed)'] < 50.) & (data['Cl (onv.ISE) (bloed)'] > 200.)] = np.nan

  #make nans of all negative vals
  data[all_cols] = data[all_cols].applymap(lambda x: np.nan if x < 0 else x)
  return data


def remove_outliers_action(data):

  #delete outliers
  outliers = data.reset_index() #return to single index

  cols = ['Noradrenaline (Norepinefrine)', 'NaCl 0,45%/Glucose 2,5%']

  #select outlier cols
  data['Noradrenaline (Norepinefrine)'][data['Noradrenaline (Norepinefrine)'] > 10.] = np.nan
  data['NaCl 0,45%/Glucose 2,5%'][data['NaCl 0,45%/Glucose 2,5%'] > 500.] = np.nan
  

  data = data[cols].applymap(lambda x: np.nan if x < 0 else x)

  return data

def get_4h(data):
  #per patient, average the values in 4h timeslots

  data = data.sort_values('time')
  res = data.groupby([pd.Grouper('admissionid'),
                        pd.Grouper(key ='time', freq='4H')
                              
  ]).mean()

  return res

def get_4h_urine(data):
  #per patient, average the values in 4h timeslots

  data = data.sort_values('time')
  res = data.groupby([pd.Grouper('admissionid'),
                        pd.Grouper(key ='time', freq='4H')
                              
  ]).sum()

  return res

def aggregate_col(data, colname):

  #create new columns with cumulative count for consecutive nans
  data['nancount'] = np.zeros(len(data))
  data['nancount'] = data[colname].groupby((data[colname].notnull()).cumsum()).cumcount()
                           
  #manually set first row to 1 if nan since this is excluded in cumsum/count from line before
  for i, v in enumerate(data[colname]):
    if i == 0:
      if np.isnan(v):
        data["nancount"] += 1
  
  #and all other ones +1, except non-nan values
  data["nancount"][data[colname] == np.nan] += 1

  #set non-null values as 0 for nancount
  data["nancount"] = np.where(~data[colname].isnull(), 0, data["nancount"])

  #if value is not nan, then use previous value + 1 to get total cumulative nan count including the non-nan value
  #this is the value we want to divide through to get the right value per hour

  group_val = pd.DataFrame(data[colname]).reset_index()[colname]
  group_count = pd.DataFrame(data["nancount"]).reset_index()["nancount"]

  for i, v in enumerate(group_count):

    if v == 0: #where no null values
 
      if (i != 0): #first row: do nothing
        if group_val[i] != 0: #value 0: do not divide
          if group_count[i-1] != 0: #value before is not missing: do nothing
          
            group_val[i] = group_val[i] / (group_count[i-1]) #otherwise: divide through nancount of row before
  

  #and then fill backwards and return
  return group_val.bfill()

def sum_urine(data):
  # urine_cols = ['UrineCAD']
  urine_cols = ['UrineCAD', 'UrineSupraPubis', 'UrineUP', 'UrineSpontaan', 'UrineIncontinentie', 'UrineSplint Re', 'UrineSplint Li']
  data['Urine_summed'] = data[urine_cols].sum(axis=1)
  data['Urine_summed'] = np.where(data['Urine_summed'] == 0, np.nan, data['Urine_summed'])
  return data.drop(columns=urine_cols)

def aggregate_all_cols(data, space):

  if space == 'state':

    cols_to_agg = ['time', 'admissionid', 'Kreatinine', 'Kreatinine (bloed)', 'KREAT enzym. (bloed)',
       'Chloor (bloed)', 'Natrium (bloed)',
       'Kalium (bloed)', 'HCO3', 'Natrium', 'Natrium Astrup',
       'Kalium Astrup', 'Chloor Astrup', 'Chloor', 'Kalium',
       'Act.HCO3 (bloed)', 'Na (onv.ISE) (bloed)', 'K (onv.ISE) (bloed)',
       'Cl (onv.ISE) (bloed)', 'Niet invasieve bloeddruk gemiddeld',
       'ABP gemiddeld II', 'ABP gemiddeld']

    #group urine (sum)
    grouped = data.groupby('admissionid', as_index = False).apply(lambda x: aggregate_col(x, 'Urine_summed')).reset_index()['Urine_summed']
    data['Urine'] = list(grouped.head(len(grouped)))
    data = pd.DataFrame(data).reset_index()
    urine_aggr = get_4h_urine(data[['admissionid', 'time', 'Urine']])

    #group other variables (mean)
    data[cols_to_agg] = data[cols_to_agg].bfill()
    df_aggr = get_4h(data[cols_to_agg])

    #combine both aggregations
    combined = pd.concat([urine_aggr, df_aggr], axis=1)

    return combined

  if space == 'action':

    data = data.reset_index()
    cols_to_agg = ['time', 'admissionid', 'Dobutamine (Dobutrex)',
                   'Adrenaline (Epinefrine)', 'Dopamine (Inotropin)',
                   'Noradrenaline (Norepinefrine)', 'NaCl 0,45%/Glucose 2,5%']
    data[cols_to_agg] = data[cols_to_agg].bfill()
    df_aggr = get_4h(data[cols_to_agg])

    return df_aggr

  else:

    print("ERROR INVALID SPACE TYPE: options for space: state, action")


def interpolate(data_agg):
  #interpolate null values
  return data_agg.interpolate(limit_direction='forward')

def transform_df(data: pd.DataFrame = None,
                 time_col: str = 'time',
                 bins: list = None,
                 bin_labels: list = None,
                 group_cols: list = ['admissionid', 'binn'],
                 agg_func: dict = None):
    """
    Transforms the input data from the AmsterdamUMCdb and return a dataframe with bins assigned to each record based on the time column
    :param data: dataframe with single timestamps as integers, patientid and values
    :param bins: list of bins to divide the timestamps in
    :param bin_labels: list of labels to name the bins with
    :param group_cols: list of column to group by, including the newly created 'binn'
    :param agg_func: dictionary of kwargs passed to the .agg() method
    """
    
    data['binn'] = pd.cut(data[time_col], bins=bins, labels=bin_labels)
    data = data[data[time_col]>=0]
    grouped_data = data.groupby(group_cols).agg(**agg_func).reset_index().sort_values(by=group_cols, ascending=True)
    
    return grouped_data


def transform_daterange(data: pd.DataFrame,
                        time_col: str = 'time',
                        infer_start_time: bool = True,
                        multi_source: bool = False,
                        multi_source_col: str = None,
                        start_time: str = 'start_time',
                        end_time: str = 'end_time',
                        time_unit: str = 'm',
                        value_col: str = 'value',
                        group_col: list = None,
                        fill_method: str = 'backfill',
                        fill_lim: int = 540
                        ):
    """
    Transform interval data with single timestamps to time range, calculate production, resample and backward fill
    :param data: dataframe with id, value and timestamp
    :param time_col: string representing the column name for the time of registration in a single timestamp dataframe
    :param infer_start_time: boolean representing whether the start time should be inferred from the previous record
    :param start_time: string representing the column name with the record start time
    :param end_time: string representing the column name with the record end time
    :param time_unit: interpret the integer timestamp as the given time unit and convert back to this unit at the end
    :param value_col: string representing the column name with the values of the measurements
    :param group_col: list representing the ids of patients and/or products
    :param fill_method: string to represent the method as used in pandas.series.fillna
    :param fill_lim: integer to represent the number of time units to be filled
    """
    
    if group_col is None:
        group_col = ['admissionid'] # PM: defining a list as default will keep alterations when rerunning the function
    
    # convert to datetime and set index to time column
    data[time_col] = pd.to_datetime(data[time_col], unit=time_unit)
    data[start_time] = pd.to_datetime(data[start_time], unit=time_unit)
    
    if infer_start_time:
        # get start time from previous record
        data['start_time'] = data.groupby(group_col)[time_col].shift(1)
        start_time = 'start_time'
        end_time = time_col
    else:
        # transform other columns to datetime if they exist and are still integer type, otherwise leave as is
        for t_col in [start_time, end_time]:
            if t_col in data:
                if pd.api.types.is_integer_dtype(data[start_time]):
                    data[t_col] = pd.to_datetime(data[t_col], unit=time_unit)
    
    # get time difference from start and end times   
    data['time_diff'] = (data[end_time] - data[start_time]) / np.timedelta64(1, time_unit)

    if multi_source:
        if multi_source_col is None:
            # give each record a unique id to group by in order to handle simultaneous records
            data['administrationid'] = range(data.shape[0])
            group_col += ['administrationid']
        else:
            group_col += [multi_source_col]
    
    # get production per time unit
    data['prod'] = data[value_col] / data['time_diff']
    
    # if start and end time are registered in the same record, create a new record with the other value as index
    if infer_start_time:
        data_merged = data.copy()
        data_merged.index = data_merged[time_col]
    else:
        data_end = data.copy()
        data.index = data.start_time
        data_end.index = data_end.stop_time
        data_merged = pd.concat([data, data_end]).sort_values(group_col + [start_time, end_time])
    
    # resample for each unit
    res = data_merged[group_col + ['prod', start_time, end_time]].groupby(group_col).resample('1T').mean().drop(group_col, axis=1, errors='ignore').reset_index().copy()
    
    # fill missing values
    res['prod_fill'] = res.groupby(group_col)['prod'].fillna(method=fill_method, limit=fill_lim) #9 hours
    
    # reset time column to integer values
    transform_time_col = {'s': 1, 'm': 60, 'h': 3600, 'd': 86_400}
    if infer_start_time:
        res[time_col] = (res[time_col].view(np.int64) / (transform_time_col.get(time_unit) * 1_000_000_000)).astype(int)
    else:
        if multi_source:
            level_col = 'level_2'
        else:
            level_col = 'level_1'
        res[time_col] = (res[level_col].view(np.int64) / (transform_time_col.get(time_unit) * 1_000_000_000)).astype(int)
    
    return res
    

def get_demograhics(data, admissionid):
  # Get gender
  gender = data['gender'][data['admissionid'] == admissionid].head(1).item()
  if gender == 'Man':
    gender = 0
  elif gender == 'Vrouw':
    gender = 1
  else:
    gender = 'Unknown'

  # Get Age
  age = data['agegroup'][data['admissionid'] == admissionid].head(1).item()

  # Get Weight
  weight = data['weightgroup'][data['admissionid'] == admissionid].head(1).item()

  # Get Height
  height = data['heightgroup'][data['admissionid'] == admissionid].head(1).item()

  # Get PatientID
  patientID = data['patientid'][data['admissionid'] == admissionid].head(1).item()

  # Get Date of Death
  death = data['dateofdeath'][data['admissionid'] == admissionid].head(1).item()
  if death == np.nan:
    death = 0
  else:
    death = 1

  return gender, age, weight, height, patientID, death


def complete_state(data, df):
  data = data.sort_values(by=['admissionid', 'time']).reset_index()
  genders = []
  ages = []
  weights = []
  heights = []
  patientids = []
  deaths = []

  # Get demographics for each admission id
  for x in data['admissionid']:
    gender, age, weight, height, patientID, death = get_demograhics(df, x)
    genders.append(gender)
    ages.append(age)
    weights.append(weight)
    heights.append(height)
    patientids.append(patientID)
    deaths.append(death)

  data['gender'] = genders
  data['agegroup'] = ages
  data['weightgroup'] = weights
  data['heightgroup'] = heights
  data['patientid'] = patientids
  data['death'] = deaths

  # Transform categorical variables into numerical
  categories = {"agegroup": {"18-39": 1, "40-49": 2, "50-59": 3, "60-69":4, "70-79":5, "80+":6}, 
          "weightgroup": {'59-': 1,'60-69': 2, '80-89': 3, '70-79': 4, '90-99': 5, '100-109': 6, '110+': 7}, 
          "heightroup": {'159-': 1, '160-169': 2, '170-179': 3, '180-189': 4, '190+': 5}}
  data = data.replace(categories)
  
  return data


def process_statespace(data):
  data['time'] = pd.to_datetime(data['time'], unit='m', origin = 'unix')
  grouped = to_cols(data)
  grouped = remove_outliers(grouped)
  data_sum = sum_urine(grouped)
  data_agg = aggregate_all_cols(data_sum, space="state")
  data_agg = complete_state(data_agg, data)
  #data_filled = interpolate(data_agg)
  return data_agg.reset_index()

  
def process_actionspace(data):
  # data['time'] = pd.to_datetime(data['stop'] - data['start'], unit='ms')
  # data = data.drop(columns = ['start', 'stop'])
  # data['time'] = pd.to_datetime(data['time'], unit='ms', origin = 'unix')
  # grouped = to_cols_action(data)
  # #grouped = remove_outliers_action(grouped)
  # data_agg = aggregate_all_cols(grouped, space="action")
  # #data_filled = interpolate(data_agg)

  # Extract Fluids and Vasopressors
  fluids = data.loc[~data['itemid'].isin([7179,7178,6818,7229])]
  vasop = data.loc[data['itemid'].isin([7179,7178,6818,7229])]
  
  # Perform Aggregation
  df_aggr_fluids = transform_df(data=transform_daterange(fluids[['admissionid',
                                                                 'fluidin',
                                                                 'start_time',
                                                                 'stop_time']].sort_values(['admissionid', 'start_time']).copy(),
                                                     time_col = 'stop_time',
                                                     infer_start_time=False,
                                                     multi_source=False,
                                                     start_time = 'start_time',
                                                     end_time = 'stop_time',
                                                     value_col = 'fluidin',
                                                     group_col = ['admissionid']),
                                 time_col='stop_time',
                                 bins=range(0, 76*60, 4*60),
                                 bin_labels=range(0, 72*60, 4*60),
                                 group_cols=['admissionid', 'binn'],
                                 agg_func={'fluid_sum': ('prod_fill', 'sum')})
  df_aggr_vasops = transform_df(data=transform_daterange(vasop[['admissionid',
                                                                 'fluidin',
                                                                 'start_time',
                                                                 'stop_time']].sort_values(['admissionid', 'start_time']).copy(),
                                                     time_col = 'stop_time',
                                                     infer_start_time=False,
                                                     multi_source=False,
                                                     start_time = 'start_time',
                                                     end_time = 'stop_time',
                                                     value_col = 'fluidin',
                                                     group_col = ['admissionid']),
                                 time_col='stop_time',
                                 bins=range(0, 76*60, 4*60),
                                 bin_labels=range(0, 72*60, 4*60),
                                 group_cols=['admissionid', 'binn'],
                                 agg_func={'vasops_sum': ('prod_fill', 'sum')})
    
  df_aggr_fluids['fluid_sum'] = df_aggr_fluids['fluid_sum'].fillna(0)
  df_aggr_vasops['vasops_sum'] = df_aggr_vasops['vasops_sum'].fillna(0)
  df_aggr = pd.merge(df_aggr_fluids, df_aggr_vasops, how='outer', on=['admissionid', 'binn'])
  df_aggr['time'] = pd.to_datetime(df_aggr['binn'].astype(float), unit='m', origin = 'unix')

  return df_aggr

## 3. Import Dataframes and Perform Aggregation

In [53]:
statespace = pd.read_csv('final_state_space.csv')
actionspace = pd.read_csv('final_action_space.csv')

state = process_statespace(statespace)
action = process_actionspace(actionspace)

#merge datasets --> left merge on actionspace, as states without actions are not useful for our model
aggregated = action.merge(state, on=["admissionid", "time"], how="left")

#all the null values are patients that are not in the state space and only in the action space --> we cannot use them so they are dropped
aggregated.isnull().sum()
aggregated = aggregated.dropna()

In [54]:
aggregated

Unnamed: 0,admissionid,binn,fluid_sum,vasops_sum,time,index,Urine,Kreatinine,Kreatinine (bloed),KREAT enzym. (bloed),...,Cl (onv.ISE) (bloed),Niet invasieve bloeddruk gemiddeld,ABP gemiddeld II,ABP gemiddeld,gender,agegroup,weightgroup,heightgroup,patientid,death
0,11,0,1564.114410,127.987797,1970-01-01 00:00:00,0.0,10.666667,439.0,342.2,331.000000,...,114.0,70.0,61.0,64.000000,0,4.0,3.0,180-189,11.0,1.0
1,11,240,1081.298935,189.248434,1970-01-01 04:00:00,1.0,10.666667,439.0,337.0,331.000000,...,114.0,70.0,61.0,72.400000,0,4.0,3.0,180-189,11.0,1.0
2,11,480,564.661661,198.434211,1970-01-01 08:00:00,2.0,10.666667,439.0,316.0,331.000000,...,114.0,70.0,61.0,71.200000,0,4.0,3.0,180-189,11.0,1.0
3,11,720,4650.132862,155.536134,1970-01-01 12:00:00,3.0,9.633333,439.0,302.0,331.000000,...,114.0,70.0,61.0,66.750000,0,4.0,3.0,180-189,11.0,1.0
4,11,960,1158.447309,160.000000,1970-01-01 16:00:00,4.0,15.500000,439.0,302.0,331.000000,...,114.0,70.0,61.0,63.800000,0,4.0,3.0,180-189,11.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45737,21038,4080,6249.225680,0.000000,1970-01-03 20:00:00,39364.0,860.000000,281.0,81.0,60.000000,...,118.0,72.0,99.0,90.000000,1,1.0,4.0,160-169,18163.0,1.0
45738,21049,0,8151.945003,13.393176,1970-01-01 00:00:00,39365.0,200.000000,281.0,81.0,59.285714,...,118.0,72.0,99.0,68.857143,0,5.0,2.0,160-169,18171.0,1.0
45739,21049,240,590.869801,18.032787,1970-01-01 04:00:00,39366.0,218.000000,281.0,81.0,62.000000,...,118.0,72.0,99.0,67.500000,0,5.0,2.0,160-169,18171.0,1.0
45740,21049,480,2821.231277,17.821038,1970-01-01 08:00:00,39367.0,830.000000,281.0,81.0,71.714286,...,118.0,72.0,99.0,69.714286,0,5.0,2.0,160-169,18171.0,1.0


## 4. Save Aggregated Dataframe on Drive

In [55]:
aggregated.to_csv('aggregated.csv')

## 5. Plot Statistics

In [56]:
# Plot some stats, df with 1 row per patient for demographic stats
stats = state.drop_duplicates(subset=['admissionid'], keep='first')