In [None]:
# Render matplotlib figures directly below the cell that produces them
# (the bar charts of the label counts further down rely on this backend).
%matplotlib inline

In [None]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

In [None]:
# Register tqdm's progress_apply/progress_map helpers on pandas objects.
# NOTE(review): none of the cells visible here call them -- confirm whether
# this registration is still needed.
tqdm.pandas()

In [None]:
import os
import json
import pickle
from pathlib import Path

In [None]:
# Root of the in-hospital-mortality task produced by the MIMIC-III benchmark
# pipeline -- replace with the location of your own copy.
raw_dir = Path('/data/mimiciii_benchmark/in-hospital-mortality/')
out_dir = Path('./data/')
# Create the output folder up front so the to_csv() calls further down do not
# fail on a fresh checkout after the slow loading loops have already run.
out_dir.mkdir(parents=True, exist_ok=True)

# Folders holding the per-stay time-series files. Validation deliberately
# reads from 'train' as well: only the listfile tells the two splits apart.
# NOTE(review): this matches a benchmark train/val split that only writes
# listfiles -- confirm it is true for your data layout.
train_dir = raw_dir / 'train'
val_dir = raw_dir / 'train'
test_dir = raw_dir / 'test'

# Listfiles naming the stays (and their y_true label) of each split.
train_listfile = raw_dir / 'train_listfile.csv'
val_listfile = raw_dir / 'val_listfile.csv'
test_listfile = raw_dir / 'test_listfile.csv'

## Restructuring data to LS


**Output**

Data Config: 

```yaml
tgt_col: 'y_true'
idx_cols: ['stay'] 
time_order_col: ['Hours', 'seqnum']
```

In [None]:
def process_ts(fname, t_data=48.0, index_col='Hours', timestep=1.0):
    """Read one stay's time series and collapse it to one row per time bin.

    Parameters
    ----------
    fname : path or file-like
        CSV source, handed straight to ``pd.read_csv``.
    t_data : float, optional
        Only rows with ``index_col < t_data`` are kept. Any falsy value
        (``None``, ``0``) disables the cut-off.
    index_col : str
        Name of the time column the binning is based on.
    timestep : float
        Width of one bin, in the units of ``index_col``.

    Returns
    -------
    pd.DataFrame
        One row per non-empty bin, ordered by ``seqnum``, where
        ``seqnum = floor(index_col / timestep)`` (kept as float). Each column
        holds the most recent non-null value observed inside the bin. Bins
        without any rows are absent; nothing is padded or imputed here.
    """
    df = pd.read_csv(fname)

    if t_data:
        df = df[df[index_col] < t_data]

    # assign() builds a new frame, so the filtered slice above is never
    # written to in place (avoids pandas' SettingWithCopyWarning).
    df = df.assign(seqnum=np.floor(df[index_col] / timestep))

    # Order rows by time rather than by their position in the file, so that
    # last() really picks the latest measurement of each bin. The stable sort
    # keeps the file order for rows sharing a timestamp, which leaves the
    # result unchanged for inputs that are already time-sorted.
    df = df.sort_values(index_col, kind='mergesort')

    return df.groupby('seqnum', as_index=False).last()

### Training

In [None]:
# Training labels: one row per stay; the bar chart shows the number of rows
# for each distinct y_true value.
df_y_train = pd.read_csv(train_listfile)
df_y_train['y_true'].value_counts().plot(kind='barh')
df_y_train.head()

In [None]:
# Bin every training stay and tag its rows with the stay id. A missing file is
# not caught here, so it aborts the cell with FileNotFoundError.
frames_train = []
for stay in tqdm(df_y_train.stay):
    frame = process_ts(train_dir / stay)
    frame['stay'] = stay
    frames_train.append(frame)

# Fail with a clear message instead of a NameError/concat error when the
# listfile contained no stays at all.
if not frames_train:
    raise ValueError(f'No training stays listed in {train_listfile}')

# Stack all stays and index by (stay, seqnum) -- the layout written to disk below.
df_x_train = pd.concat(frames_train, ignore_index=True).set_index(['stay', 'seqnum'])
print('Done')
df_x_train.head()

In [None]:
# Persist the training split: labels ordered by stay (row index dropped),
# features ordered by their (stay, seqnum) index.
cohort_train_csv = out_dir / 'IHM_V0_COHORT_OUT_EXP-SPLIT0-train.csv'
feat_train_csv = out_dir / 'IHM_V0_FEAT_EXP-SPLIT0-train.csv'

df_y_train.sort_values(by=['stay']).to_csv(cohort_train_csv, index=False)
df_x_train.sort_index().to_csv(feat_train_csv)

### Validation

In [None]:
# Validation labels: one row per stay; the bar chart shows the number of rows
# for each distinct y_true value.
df_y_val = pd.read_csv(val_listfile)
df_y_val['y_true'].value_counts().plot(kind='barh')
df_y_val.head()

In [None]:
# Bin every validation stay and tag its rows with the stay id. Stays whose
# time-series file is missing are reported and skipped (best effort).
frames_val = []
missing_val = []
for stay in tqdm(df_y_val.stay):
    try:
        frame = process_ts(val_dir / stay)
    except FileNotFoundError:
        print(f'{val_dir / stay} not found.... skipping')
        missing_val.append(stay)
        continue
    frame['stay'] = stay
    frames_val.append(frame)

# If nothing could be loaded (e.g. val_dir points to the wrong folder), say so
# explicitly instead of failing later with an unrelated NameError.
if not frames_val:
    raise ValueError(
        f'No validation time series loaded from {val_dir} '
        f'({len(missing_val)} of {len(df_y_val)} files missing)'
    )
if missing_val:
    # The skipped stays are still present in df_y_val.
    print(f'{len(missing_val)} validation stays skipped because their file is missing')

# Stack all stays and index by (stay, seqnum) -- the layout written to disk below.
df_x_val = pd.concat(frames_val, ignore_index=True).set_index(['stay', 'seqnum'])
print('Done')
df_x_val.head()

In [None]:
# Persist the validation split: labels ordered by stay (row index dropped),
# features ordered by their (stay, seqnum) index.
cohort_val_csv = out_dir / 'IHM_V0_COHORT_OUT_EXP-SPLIT0-val.csv'
feat_val_csv = out_dir / 'IHM_V0_FEAT_EXP-SPLIT0-val.csv'

df_y_val.sort_values(by=['stay']).to_csv(cohort_val_csv, index=False)
df_x_val.sort_index().to_csv(feat_val_csv)

### Test

In [None]:
# Test labels: one row per stay; the bar chart shows the number of rows for
# each distinct y_true value.
df_y_test = pd.read_csv(test_listfile)
df_y_test['y_true'].value_counts().plot(kind='barh')
df_y_test.head()

In [None]:
# Bin every test stay and tag its rows with the stay id. Stays whose
# time-series file is missing are reported and skipped (best effort).
frames_test = []
missing_test = []
for stay in tqdm(df_y_test.stay):
    try:
        frame = process_ts(test_dir / stay)
    except FileNotFoundError:
        print(f'{test_dir / stay} not found.... skipping')
        missing_test.append(stay)
        continue
    frame['stay'] = stay
    frames_test.append(frame)

# If nothing could be loaded (e.g. test_dir points to the wrong folder), say so
# explicitly instead of failing later with an unrelated NameError.
if not frames_test:
    raise ValueError(
        f'No test time series loaded from {test_dir} '
        f'({len(missing_test)} of {len(df_y_test)} files missing)'
    )
if missing_test:
    # The skipped stays are still present in df_y_test.
    print(f'{len(missing_test)} test stays skipped because their file is missing')

# Stack all stays and index by (stay, seqnum) -- the layout written to disk below.
df_x_test = pd.concat(frames_test, ignore_index=True).set_index(['stay', 'seqnum'])
print('Done')
df_x_test.head()

In [None]:
# Persist the test split: labels ordered by stay (row index dropped),
# features ordered by their (stay, seqnum) index.
cohort_test_csv = out_dir / 'IHM_V0_COHORT_OUT_EXP-SPLIT0-test.csv'
feat_test_csv = out_dir / 'IHM_V0_FEAT_EXP-SPLIT0-test.csv'

df_y_test.sort_values(by=['stay']).to_csv(cohort_test_csv, index=False)
df_x_test.sort_index().to_csv(feat_test_csv)