<a href="https://colab.research.google.com/github/Enrico-Call/RL-AKI/blob/main/Data%20Aggregation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Set the Google Cloud project id (exposed as a Colab form field).
# Note: PROJECT_ID is assigned again, to a different value, in the setup cell below.
PROJECT_ID = "rl-aki" #@param {type:"string"}

In [12]:
import os
from google.colab import auth
from IPython.display import display
from google.colab import drive
import os
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
import numpy as np
import warnings
warnings.filterwarnings('ignore')
pd.set_option('use_inf_as_na', True)

# Mount Google Drive and make the shared project folder the working directory,
# so the relative CSV paths used later resolve against it.
drive.mount('/content/drive', force_remount=True)
os.chdir('/content/drive/MyDrive/MLRFH')
 
# Set the dataset; this assignment overrides any PROJECT_ID set by the form cell above.
PROJECT_ID = 'mlrh-330919'
DATASET_ID = 'version1_0_2'
LOCATION = 'eu'
 
#all libraries check this environment variable, so set it:
os.environ["GOOGLE_CLOUD_PROJECT"] = PROJECT_ID
 
# Run the Colab user authentication flow before any cloud access.
auth.authenticate_user()
print('Authenticated')

Mounted at /content/drive
Authenticated


In [20]:
#Some preprocessing functions 

def to_cols(data):
  """Pivot long-format measurements into one column per item.

  Rows of the result are keyed by (admissionid, time) and every distinct
  `item` becomes a column holding its `value`. Duplicate
  (admissionid, time, item) entries are collapsed by pivot_table's default
  aggregation (the mean).
  """
  return pd.pivot_table(
      data,
      values='value',
      index=['admissionid', 'time'],
      columns=['item'],
  )
  

def to_cols_action(data):
  """Pivot long-format administrations into one column per item.

  Same reshaping as the state-space pivot, but the cell values come from the
  `administered` column. Rows are keyed by (admissionid, time); duplicates
  are collapsed by pivot_table's default aggregation (the mean).
  """
  return pd.pivot_table(
      data,
      values='administered',
      index=['admissionid', 'time'],
      columns=['item'],
  )

def remove_outliers(data):
  """Replace implausible and negative state-space measurements with NaN.

  The index is flattened into ordinary columns first (reset_index returns a
  new frame, so the caller's frame is left untouched). Then:

  * values outside the plausible range of a bounded column become NaN;
  * negative values in the measurement columns listed below become NaN.

  Columns that are absent from `data` are skipped.

  Fixes relative to the previous version: the range checks combined the two
  bounds with `&` (a value can never be below the lower AND above the upper
  bound, so nothing was ever removed), the non-invasive blood pressure check
  had no assignment at all, and the writes used chained indexing.

  :param data: pivoted frame indexed by (admissionid, time)
  :return: new frame with the index reset and outliers set to NaN
  """
  data = data.reset_index()  # return to single index

  # (column, lowest plausible value, highest plausible value)
  plausible_ranges = [
      ('ABP gemiddeld', 30., 165.),
      ('Natrium (bloed)', 65., 165.),
      ('Niet invasieve bloeddruk gemiddeld', 30., 165.),
  ]
  for col, low, high in plausible_ranges:
    if col in data.columns:
      out_of_range = (data[col] < low) | (data[col] > high)
      # mask() returns a new column, so integer columns upcast cleanly to float
      data[col] = data[col].mask(out_of_range)

  # columns in which a negative value is treated as a recording error
  non_negative_cols = ['Kalium (bloed)', 'ABP gemiddeld', 'Kreatinine (bloed)', 'Natrium (bloed)', 'UrineCAD', 'UrineSupraPubis', 'UrineSpontaan', 'UrineUP', 'Kreatinine', 'Nefrodrain re Uit', 'Nefrodrain li Uit', 'UrineIncontinentie']
  present = [col for col in non_negative_cols if col in data.columns]
  if present:
    data[present] = data[present].mask(data[present] < 0)

  return data


def remove_outliers_action(data):
  """Return the two cleaned action columns with outliers set to NaN.

  Values above the per-column upper limit, and any negative values, are
  replaced with NaN. Only the two columns listed below are returned, on the
  same index as `data`.

  Fixes relative to the previous version: the writes used chained indexing
  on the caller's frame (which either mutates the input or, under pandas
  copy-on-write, silently does nothing), and an unused reset-index copy was
  created. The cleaning now happens on a private copy, so `data` is not
  modified.

  :param data: frame containing the two action columns
  :return: frame with only those two columns, outliers replaced by NaN
  """
  # column -> highest value that is still accepted
  upper_limits = {
      'Noradrenaline (Norepinefrine)': 10.,
      'NaCl 0,45%/Glucose 2,5%': 500.,
  }
  cols = list(upper_limits)

  cleaned = data[cols].copy()
  for col, limit in upper_limits.items():
    cleaned[col] = cleaned[col].mask(cleaned[col] > limit)

  # negative administrations are treated as recording errors
  return cleaned.mask(cleaned < 0)


def get_4h(data):
  """Average every column per admission within 4-hour time slots.

  Rows are grouped by `admissionid` and by 4-hour bins of the datetime
  column `time`; the mean of each remaining column is returned on an
  (admissionid, time) index, where `time` is the start of the bin.

  The frequency is spelled '4h': the upper-case 'H' alias is deprecated in
  recent pandas releases, while the lower-case form is accepted by both old
  and new versions.

  :param data: frame with `admissionid`, a datetime `time` column and values
  :return: frame of per-slot means
  """
  ordered = data.sort_values('time')
  slots = [
      pd.Grouper(key='admissionid'),
      pd.Grouper(key='time', freq='4h'),
  ]
  return ordered.groupby(slots).mean()

def get_4h_urine(data):
  """Sum every column per admission within 4-hour time slots.

  Rows are grouped by `admissionid` and by 4-hour bins of the datetime
  column `time`; the sum of each remaining column is returned on an
  (admissionid, time) index, where `time` is the start of the bin.

  The frequency is spelled '4h': the upper-case 'H' alias is deprecated in
  recent pandas releases, while the lower-case form is accepted by both old
  and new versions.

  :param data: frame with `admissionid`, a datetime `time` column and values
  :return: frame of per-slot totals
  """
  ordered = data.sort_values('time')
  slots = [
      pd.Grouper(key='admissionid'),
      pd.Grouper(key='time', freq='4h'),
  ]
  return ordered.groupby(slots).sum()

def aggregate_col(data, colname):
  """Spread each measurement evenly over the missing rows that precede it.

  A measurement that follows k consecutive missing rows is taken to cover
  k + 1 rows (the gap plus its own row). It is divided by k + 1 and that
  per-row value is written to all k + 1 rows, so the column total is
  preserved. Missing rows after the last measurement stay NaN.

  Fixes relative to the previous version:
  * `data[colname] == np.nan` is always False, so the intended "+1" on the
    gap length was never applied and values were divided by k, not k + 1;
  * the leading-NaN correction added 1 to every row instead of only the
    leading gap, giving inconsistent divisors;
  * a scratch `nancount` column was written onto the input frame. The input
    is no longer modified.

  :param data: frame holding `colname`, rows in chronological order
  :param colname: name of the column to redistribute
  :return: Series named `colname` on a fresh 0..n-1 index
  """
  values = data[colname].reset_index(drop=True)
  observed = values.notna()

  # A new span starts on the row right after each measurement, so every span
  # is "zero or more missing rows followed by (at most) one measurement".
  span_id = observed.shift(1, fill_value=False).cumsum()
  span_len = span_id.map(span_id.value_counts())

  # Only the measurement row of a span is non-NaN here; bfill then copies the
  # per-row share back over the gap that precedes it.
  return (values / span_len).bfill()

def sum_urine(data):
  """Collapse the individual urine-output columns into `Urine_summed`.

  The row-wise total of the urine columns is stored in a new `Urine_summed`
  column on `data` (the input frame is modified); a total of exactly 0 is
  stored as NaN. The returned frame is `data` without the individual urine
  columns.
  """
  # urine_cols = ['UrineCAD']
  urine_cols = ['UrineCAD', 'UrineSupraPubis', 'UrineUP', 'UrineSpontaan', 'UrineIncontinentie', 'UrineSplint Re', 'UrineSplint Li']
  row_total = data[urine_cols].sum(axis=1)
  # a zero total is recorded as missing rather than as a measurement
  data['Urine_summed'] = np.where(row_total == 0, np.nan, row_total)
  trimmed = data.drop(columns=urine_cols)
  return trimmed

def aggregate_all_cols(data, space):
  """Aggregate a pivoted frame into time slots for the given space.

  :param data: for 'state', a frame with `admissionid`, `time`,
      `Urine_summed` and the columns in `cols_to_agg`; for 'action', a frame
      from which `admissionid`, `fluidin`, `start_time` and `stop_time` are
      selected after the index is reset
  :param space: 'state' or 'action'
  :return: for 'state', the 4-hour urine sums and 4-hour means concatenated
      column-wise; for 'action', the binned frame with `fluid_sum`. For any
      other value an error message is printed and None is returned.
  """

  if space == 'state':

    cols_to_agg = ['time', 'admissionid', 'Kreatinine', 'Kreatinine (bloed)', 'KREAT enzym. (bloed)',
       'Nefrodrain re Uit', 'Nefrodrain li Uit', 'Chloor (bloed)', 'Natrium (bloed)',
       'Kalium (bloed)', 'HCO3', 'Natrium', 'Natrium Astrup',
       'Kalium Astrup', 'Chloor Astrup', 'Chloor', 'Kalium',
       'Act.HCO3 (bloed)', 'Na (onv.ISE) (bloed)', 'K (onv.ISE) (bloed)',
       'Cl (onv.ISE) (bloed)', 'Niet invasieve bloeddruk gemiddeld',
       'ABP gemiddeld II', 'ABP gemiddeld']

    #group urine (sum)
    # aggregate_col is applied per admission; the per-admission results are
    # flattened and written back positionally as a plain list.
    # NOTE(review): this relies on the groupby output order matching the row
    # order of `data` (i.e. rows already ordered by admissionid) - TODO confirm.
    grouped = data.groupby('admissionid', as_index = False).apply(lambda x: aggregate_col(x, 'Urine_summed')).reset_index()['Urine_summed']
    data['Urine'] = list(grouped.head(len(grouped)))
    data = pd.DataFrame(data).reset_index()
    urine_aggr = get_4h_urine(data[['admissionid', 'time', 'Urine']])

    #group other variables (mean)
    # NOTE(review): bfill runs over the whole frame, not per admission, so
    # trailing gaps of one admission are filled from the rows that follow it
    # (which may belong to another admission) - confirm this is intended.
    data[cols_to_agg] = data[cols_to_agg].bfill()
    df_aggr = get_4h(data[cols_to_agg])

    #combine both aggregations
    combined = pd.concat([urine_aggr, df_aggr], axis=1)

    return combined

  if space == 'action':

    data = data.reset_index()
    # cols_to_agg = ['time', 'admissionid', 'Dobutamine (Dobutrex)',
    #                'Adrenaline (Epinefrine)', 'Dopamine (Inotropin)',
    #                'Noradrenaline (Norepinefrine)', 'NaCl 0,45%/Glucose 2,5%']
    # data[cols_to_agg] = data[cols_to_agg].bfill()
    # df_aggr = get_4h(data[cols_to_agg])
    # NOTE(review): transform_daterange, bin_values and bin_labels are not
    # defined in the visible cells; they must exist in the kernel before this
    # branch runs, otherwise it raises NameError.
    df_aggr = transform_df(data=transform_daterange(data[['admissionid',
                                                                 'fluidin',
                                                                 'start_time',
                                                                 'stop_time']].sort_values(['admissionid', 'start_time']).copy(),
                                                     time_col = 'stop_time',
                                                     infer_start_time=False,
                                                     multi_source=True,
                                                     start_time = 'start_time',
                                                     end_time = 'stop_time',
                                                     value_col = 'fluidin'),
                                 time_col='stop_time',
                                 bins=bin_values,
                                 bin_labels=bin_labels,
                                 group_cols=['admissionid', 'binn'],
                                 agg_func={'fluid_sum': ('prod_fill', 'sum')})
    
    # groups without any fluid record are reported as 0 rather than NaN
    df_aggr['fluid_sum'] = df_aggr['fluid_sum'].fillna(0)

    return df_aggr

  else:

    # reached only for an unrecognized `space`; falls through and returns None
    print("ERROR INVALID SPACE TYPE: options for space: state, action")


def interpolate(data_agg):
  """Fill missing values by interpolation, filling in the forward direction only.

  Uses the default interpolation method of the object's own `interpolate`.
  """
  filled = data_agg.interpolate(limit_direction='forward')
  return filled


def process_statespace(data):
  """Run the full state-space pipeline on the long-format state table.

  Steps: convert `time` to timestamps, pivot items into columns, blank out
  outliers, sum the urine columns, and aggregate into time slots. The
  aggregated frame is returned with its index reset.

  Note that the converted `time` column is written back onto `data`.
  """
  # `time` is read as minutes since the Unix epoch
  data['time'] = pd.to_datetime(data['time'], unit='m', origin = 'unix')

  aggregated = (
      to_cols(data)
      .pipe(remove_outliers)
      .pipe(sum_urine)
      .pipe(aggregate_all_cols, space="state")
  )
  # interpolation of the aggregated frame is currently not applied
  return aggregated.reset_index()

  
def process_actionspace(data):
  """Run the action-space pipeline on the long-format action table.

  Steps: derive a `time` column from `stop` - `start`, drop `start`/`stop`,
  pivot items into columns and aggregate with the 'action' branch. The
  aggregated frame is returned with its index reset.

  Note that the derived `time` column is written back onto `data`.
  """
  # NOTE(review): `time` is built from the duration stop - start (read as
  # milliseconds), not from an absolute timestamp - confirm this is intended.
  duration = data['stop'] - data['start']
  data['time'] = pd.to_datetime(duration, unit='ms')

  trimmed = data.drop(columns = ['start', 'stop'])
  # second conversion is applied to the already-converted column
  trimmed['time'] = pd.to_datetime(trimmed['time'], unit='ms', origin = 'unix')

  # outlier removal and interpolation are currently not applied here
  aggregated = to_cols_action(trimmed).pipe(aggregate_all_cols, space="action")
  return aggregated.reset_index()

def transform_df(data: pd.DataFrame = None,
                 time_col: str = 'time',
                 bins: list = None,
                 bin_labels: list = None,
                 group_cols: list = ['admissionid', 'binn'],
                 agg_func: dict = None):
    """
    Assign every record to a time bin and aggregate per group.

    A `binn` column is added to `data` (the input frame is modified) by
    cutting `time_col` into `bins`; rows with a negative `time_col` are then
    dropped and the remaining rows are grouped and aggregated.

    The grouping passes `observed=False` explicitly: `binn` is categorical,
    and callers rely on bins without any record still appearing in the
    result (they are filled afterwards). Relying on the implicit default
    would drop those rows on pandas versions where the default is True.

    :param data: dataframe with single timestamps as integers, patientid and values
    :param time_col: name of the column holding the timestamps to bin
    :param bins: list of bins to divide the timestamps in
    :param bin_labels: list of labels to name the bins with
    :param group_cols: list of column to group by, including the newly created 'binn'
    :param agg_func: dictionary of kwargs passed to the .agg() method
    :return: aggregated dataframe with the group columns as regular columns,
        sorted ascending by `group_cols`
    :raises TypeError: if `data` or `agg_func` is not provided
    """
    if data is None or agg_func is None:
        raise TypeError("transform_df requires both `data` and `agg_func`")

    # copy so the shared default list can never be altered downstream
    keys = list(group_cols)

    data['binn'] = pd.cut(data[time_col], bins=bins, labels=bin_labels)
    data = data[data[time_col] >= 0]

    grouped_data = (
        data.groupby(keys, observed=False)
        .agg(**agg_func)
        .reset_index()
        .sort_values(by=keys, ascending=True)
    )

    return grouped_data

In [14]:
# Load the exported state- and action-space tables; the relative paths resolve
# against the working directory chosen in the setup cell.
statespace = pd.read_csv('final_state_space.csv')
actionspace = pd.read_csv('final_action_space.csv')

In [21]:
# Build the aggregated state and action tables.
# NOTE(review): the recorded run of this cell ended in a NameError. The 'action'
# branch of aggregate_all_cols references transform_daterange, bin_values and
# bin_labels, none of which are defined in the visible cells - presumably the
# cause; TODO confirm and define them before this cell.
state = process_statespace(statespace)
action = process_actionspace(actionspace)

NameError: ignored