In [None]:
import os
# Move the working directory one level up so that the relative paths used in the
# cells below ('notebooks/...', 'mimic3-benchmarks/...', 'embeddings/...') resolve.
# NOTE(review): this is not idempotent — re-running the cell moves up one more level.
os.chdir('..')

In [None]:
import re
import torch
import utils
import pandas as pd
import yaml
from glob import glob
from collections import defaultdict

import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from dataloader.data import MIMICDataset, get_tables, JointTabularFeature
from dataloader.labels import get_labels
from dataloader.utils import BinnedEvent, get_vocab
from utils import prepare_batch, load_class, load_model, load_config

In [None]:
# Device string handed to get_labels, load_model and prepare_batch in later cells.
DEVICE = 'cpu'
# NOTE(review): data_path is not referenced by any other cell visible in this notebook.
data_path = 'data/multitask'

In [None]:
# Load the configuration identified by '2xdwyub7'
# (presumably an experiment/run id — confirm against utils.load_config).
params = load_config('2xdwyub7')

In [None]:
# Build the joint vocabulary and the table objects from the loaded configuration.
joint_vocab = get_vocab(**params)
tables = get_tables(load=True, event_class=BinnedEvent, vocab=joint_vocab, **params)

labels = get_labels(DEVICE)

# Keyword arguments that are identical for all three dataset splits.
_dataset_kwargs = dict(mode='EVAL', tables=tables, labels=labels, numericalize=True)

# Only the training split is constructed with limit=2000; the others pass limit=None.
train_set = MIMICDataset(datalist_file='train_listfile.csv', limit=2000, **_dataset_kwargs)

val_set = MIMICDataset(datalist_file='val_listfile.csv', limit=None, **_dataset_kwargs)

# The test split is the only one that passes datasplit explicitly.
test_set = MIMICDataset(datalist_file='test_listfile.csv', datasplit='test', limit=None,
                        **_dataset_kwargs)

In [None]:
# Value counts recorded for the 'Heart-Rhythm' event in the first sub-table of tables[1].
tables[1].tables[0].value_counter['Heart-Rhythm']

In [None]:
# Counts stored for the 'FARR-10-OUTPUT-TOTAL' event in tables[2].
tables[2].counts['FARR-10-OUTPUT-TOTAL']

In [None]:
# Event labels of tables[2] that have bins and whose name contains 'biliary'.
[key for key in tables[2].bins.keys() if 'biliary' in key]

In [None]:
# tables = tables[1].tables
# For every table, plot the per-bin counts of a few randomly chosen events.
# NOTE(review): no random seed is set, so the chosen events differ between runs.
N_EXAMPLES = 3        # randomly drawn events per table (one subplot column each)
MIN_TOTAL_COUNT = 10  # events with fewer observations in total are not plotted
fig, axes = plt.subplots(len(tables), N_EXAMPLES, figsize=(10, 5), squeeze=False)
for i, t in enumerate(tables):
    axes[i][0].set_ylabel(f'{t.table}')
    # Filter first, then draw without replacement: the previous rejection loop never
    # terminated for a table without any eligible event and could plot the same
    # event more than once.
    eligible = [key for key in t.bins.keys() if sum(t.counts[key]) >= MIN_TOTAL_COUNT]
    if not eligible:
        continue
    picks = np.random.choice(len(eligible), size=min(N_EXAMPLES, len(eligible)), replace=False)
    for j, ind in enumerate(picks):
        key = eligible[ind]
        counts = t.counts[key]
        n_bins = len(counts)  # derived from the data instead of the former hard-coded 7
        panel = axes[i][j]
        panel.bar(range(n_bins), counts)
        panel.set_title(f'{key:.20s}')
        panel.tick_params(axis='y', labelrotation=45)
        panel.set_xticks(range(n_bins))
        panel.set_xticklabels([f'#{b}' for b in range(1, n_bins + 1)])
plt.tight_layout()
plt.savefig('figures/histograms.pdf')

In [None]:
# This attribute is also used by write_event_counts to name its output CSV.
train_set.datalist_filename

In [None]:
import pandas as pd
# Read the training split's demogfile.csv, indexed by its 'filename' column.
# NOTE(review): `demog` is not used by any later cell visible in this notebook.
demog = pd.read_csv('mimic3-benchmarks/data/multitask/train/demogfile.csv').set_index('filename')

In [None]:
# Instantiate the model from the loaded config, vocabulary and tables on DEVICE.
model = utils.load_model(params, joint_vocab, tables, DEVICE)

In [None]:
from functools import partial


def _make_loader(dataset, limit):
    """Build the DataLoader used for one dataset split.

    All splits share the same settings (batch size from ``params``, no shuffling,
    no worker processes, pinned memory, incomplete last batch dropped); only the
    dataset and the ``limit`` forwarded to ``utils.min_batch`` differ.

    NOTE(review): ``drop_last=True`` silently omits the final incomplete batch,
    which also affects the validation/test statistics whenever batch_size > 1.
    """
    collate = partial(utils.min_batch, tables=tables, labels=labels, limit=limit)
    return torch.utils.data.DataLoader(dataset, batch_size=params['batch_size'],
                                       collate_fn=collate,
                                       shuffle=False, num_workers=0, pin_memory=True,
                                       drop_last=True)


train_loader = _make_loader(train_set, limit=720)

val_loader = _make_loader(val_set, limit=None)

test_loader = _make_loader(test_set, limit=None)

In [None]:
def write_event_counts(loader):
    """Count events per time step for the batches of ``loader`` and write them to CSV.

    For every batch and every table except 'dem', the entries of feature 0 that are
    non-zero are counted per time step. The result has one row per (stay, time step)
    and one column per event table, and is written to
    ``notebooks/<loader.dataset.datalist_filename>_n_events.csv``.

    NOTE(review): only the first sample of each batch (index 0) is inspected, so
    with batch_size > 1 the remaining samples are not counted — confirm this is
    intended.

    Args:
        loader: DataLoader whose batches are accepted by ``prepare_batch`` and whose
            dataset exposes ``datalist_filename``.
    """
    summary = {}
    event_tables = [t for t in tables if t.table != 'dem']

    for sample in loader:
        x, y, extra = prepare_batch(sample, DEVICE)
        filename = extra['filename'][0]
        for table in event_tables:
            events_per_step = (x[table.table][0, :, :, 0] != 0).sum(1).tolist()
            for step, n_events in enumerate(events_per_step):
                summary[table.table, filename, step] = n_events

    # Index levels are (table, filename, step); moving the table level into the
    # columns leaves one row per (filename, step).
    mux = pd.MultiIndex.from_tuples(list(summary.keys()))
    df = pd.DataFrame(list(summary.values()), index=mux)
    df = df.unstack(0)
    df.columns = df.columns.get_level_values(1)
    # The index levels are unnamed, so reset_index names the new columns
    # 'level_1' (step) and 'index' (filename). Renaming by keyword replaces the
    # former drop(label, 1) calls, whose positional axis pandas 2.x rejects.
    df = df.reset_index(1).rename(columns={'level_1': 'period_length'})
    df = df.reset_index().rename(columns={'index': 'stay'})
    df = df.loc[:, ['stay', 'period_length', 'CHARTEVENTS', 'LABEVENTS', 'OUTPUTEVENTS', 'INPUTEVENTS_*', 'PRESCRIPTIONS']]

    df.to_csv(f'notebooks/{loader.dataset.datalist_filename}_n_events.csv', index=False)

In [None]:
# Write the per-time-step event counts of the test split to CSV.
write_event_counts(test_loader)

In [None]:
# NOTE(review): the train call below is commented out, yet
# './notebooks/train_listfile_n_events.csv' is read in the N_EVENTS section further
# down; on a fresh checkout that file must already exist or this call be re-enabled.
# write_event_counts(train_loader)
write_event_counts(val_loader)

In [None]:
print('DONE')

In [None]:
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# NOTE(review): CHART_SUMMARY is not defined in any cell visible in this notebook;
# this and the following two cells raise NameError on a fresh kernel.
# Histogram of column 0 of CHART_SUMMARY['lengths'] (titled as number of time steps).
plt.hist(np.array(CHART_SUMMARY['lengths'])[:,0], bins=20)
plt.title(f'#TIMESTEPS min_word_count={params["min_word_count"]}, batch_size={params["batch_size"]}')
plt.savefig(f'TIMESTEPS_min_word_count-{params["min_word_count"]}_batch_size-{params["batch_size"]}.png')

In [None]:
# Histogram of column 1 of CHART_SUMMARY['lengths'] (titled as number of events).
plt.hist(np.array(CHART_SUMMARY['lengths'])[:,1], bins=20)
plt.title(f'#EVENTS min_word_count={params["min_word_count"]}, batch_size={params["batch_size"]}')
plt.savefig(f'EVENTS_min_word_count-{params["min_word_count"]}_batch_size-{params["batch_size"]}.png')

In [None]:
# sum(list_of_lists, []) flattens CHART_SUMMARY['event_lengths'] into one list before plotting.
plt.hist(sum(CHART_SUMMARY['event_lengths'], []), bins=20)
plt.title(f'CHART: #TIMESTEPLENGTHS min_word_count={params["min_word_count"]}, batch_size={params["batch_size"]}')
plt.savefig(f'CHART_TIMESTEPLENGTHS_min_word_count-{params["min_word_count"]}_batch_size-{params["batch_size"]}.png')

In [None]:
embeddings = []
filenames = []

In [None]:
for batch in train_loader:
    x, y_true, extra = prepare_batch(batch, DEVICE)

    preds, outputs = model(*x)
    output = {"y_pred": preds,
              "y_true": y_true}
    
    embeddings.append(outputs['patient'][0][-1].detach().numpy())
    filenames.append(extra['filename'])
    losses = {}
    for label in labels.values():
        losses[label.task] = label.loss(output)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# `outputs` is the value left over from the last iteration of the embedding loop above.
outputs['timesteps'][0].shape

In [None]:
# Heatmap of the first element of outputs['timesteps'][0], detached from autograd.
plt.figure(figsize=(20,20))
sns.heatmap(outputs['timesteps'][0][0].detach())

# N_EVENTS

In [None]:
tables = ['CHARTEVENTS', 'LABEVENTS', 'OUTPUTEVENTS', 'INPUTEVENTS_*', 'PRESCRIPTIONS']

In [None]:
n_events = pd.read_csv('./notebooks/train_listfile_n_events.csv')

In [None]:
n_events['TOTAL'] = n_events[tables].sum(1)

In [None]:
agg_n_events = n_events.groupby('stay').agg(['count', 'max', 'median', 'mean', 'sum'])

In [None]:
agg_n_events[[(t, 'sum') for t in tables]].sum(1).size

In [None]:
agg_n_events[('period_length', 'count')].hist(bins=36, grid=False, range=(0, 864))

In [None]:
agg_n_events.TOTAL

In [None]:
fig.get_yaxis

In [None]:
from matplotlib.ticker import StrMethodFormatter
import seaborn as sns
sns.set_theme(style="darkgrid")

fig, axes = plt.subplots(1, 3, figsize=(7, 2.7), dpi=300, sharey=True)

axes[0].set_title('Length of stay')
agg_n_events[('period_length', 'count')].hist(bins=36, grid='y', range=(0, 29 * 24), ax=axes[0])
axes[0].set_xlabel('Days')
axes[0].set_xticks(range(0, 29 * 24, 24 * 7))
axes[0].set_xticklabels(range(0, 35, 7))
axes[0].set_ylabel('Number of ICU stays')
axes[0].set_yscale('log')

agg_n_events[[(t, 'sum') for t in tables]].sum(1).hist(bins=30, range=(0, 30000), grid='y', ax=axes[1])
axes[1].set_title('#Events per stay')
axes[1].xaxis.set_major_locator(plt.MaxNLocator(4))
axes[1].set_xlabel('Events')
axes[1].set_yscale('log')

agg_n_events[('TOTAL', 'median')].hist(bins=30, range=(0, 200), grid='y', ax=axes[2])
axes[2].set_title('Median of events per hour')
axes[2].set_xlabel('Median of events/h')
axes[2].xaxis.set_major_locator(plt.MaxNLocator(4))
axes[2].set_yscale('log')

plt.tight_layout(pad=0.2)
plt.savefig('notebooks/figures/timesteps-events.pdf')

In [None]:
# Median over stays of the largest period_length recorded per stay in the counts CSV.
((pd.read_csv('./notebooks/train_listfile_n_events.csv').groupby('stay').max()))['period_length'].median()

In [None]:
# Median of the 'length of stay' column of the benchmark list file, for comparison.
pd.read_csv('mimic3-benchmarks/data/multitask/train_listfile.csv')['length of stay'].median()

## Event types per table

In [None]:
# For every table after the first, count the distinct "event=value" tokens whose
# value was observed at least 10 times. A 'scalar' value contributes one token per
# bin of that event instead of a single token.
for table in tables[1:]:
    print(table.table)
    all_values = {}
    for event_label, values in table.value_counter.items():
        for value_label, count in values.items():
            if count >= 10:
                if value_label != 'scalar':
                    all_values[event_label+'='+value_label] = count
                else:
                    for bin_, bin_count in enumerate(table.counts[event_label]):
                        all_values[event_label+'='+str(bin_)] = int(bin_count)
    print(len(all_values))
            
            

In [None]:
# `all_values`, `table` and `event_label` are left over from the last iteration of
# the loop in the previous cell.
len(all_values)

In [None]:
list(range(len(table.counts[event_label])))

In [None]:
# For every table after the first: number of frequent "event=value" pairs, number
# of events overall, number of events with at least one frequent value, and the
# five most frequent pairs.
# NOTE(review): this cell keeps counts > 10, whereas the previous section skips
# only counts < 10 — a count of exactly 10 is treated differently.
for table in tables[1:]:
    print(table.table)
    frequent_pairs = [(event_label + '=' + value, count)
                      for event_label, v in table.value_counter.items()
                      for value, count in v.items()
                      if count > 10]
    events = sorted(frequent_pairs, key=lambda pair: pair[1], reverse=True)
    print(len(events))
    print(len(table.value_counter))
    dist_events = {event_label
                   for event_label, v in table.value_counter.items()
                   for value, count in v.items()
                   if count > 10}
    print(len(dist_events))
    print(events[:5])


In [None]:
# Numericalized OUTPUTEVENTS of sample 292 at index 2.
train_set.numericalize = True
train_set[292]['inputs']['OUTPUTEVENTS'][2]

In [None]:
# Frequency of the token 'Jackson-Pratt-#1=3' in the joint vocabulary.
joint_vocab.freqs['Jackson-Pratt-#1=3']

In [None]:
# Map the ids in column 0 of that entry back to their vocabulary strings.
train_set.numericalize = True
for i in train_set[292]['inputs']['OUTPUTEVENTS'][2][:, 0].tolist():
    print(i, joint_vocab.itos[i])

In [None]:
# NOTE(review): numericalize stays False after this cell, which affects every
# later cell that reads from train_set.
train_set.numericalize = False
train_set[292]['inputs']['OUTPUTEVENTS']

In [None]:
# Left over from the last iteration of the loop over tables[1:] above.
dist_events

In [None]:
# NOTE(review): hand-copied numbers, presumably the per-table counts printed
# above — confirm before reusing the total.
11594 + 1082 + 290 + 1339 + 863

# Bins

In [None]:
# NOTE(review): this requires `tables` to still hold the table objects; the
# N_EVENTS section above rebinds `tables` to a list of table-name strings.
tables[1].value_counter

In [None]:
# NOTE(review): overwritten by the plotting cell below before it is used.
event_labels = ['Respiratory-Rate', 'Heart-Rate']

In [None]:
params['strategy'] = 'uniform'
uniform_table = get_tables(load=True,
                    event_class=BinnedEvent,
                    vocab=joint_vocab,
                    **params)[1]

In [None]:
params['strategy'] = 'kmeans'
kmeans_table = get_tables(load=True,
                    event_class=BinnedEvent,
                    vocab=joint_vocab,
                    **params)[1]

In [None]:
sns.set_theme(style="darkgrid")
import matplotlib.ticker as ticker

from matplotlib.ticker import ScalarFormatter

# Back-to-back horizontal histograms per event: k-means bin counts are drawn to the
# left (negated), uniform bin counts to the right, over the observed value range.
fig = plt.figure(figsize=(5.6, 3), dpi=300)

event_labels = ['Respiratory-Rate', 'Glucose', 'Xigris']
for i, event_label in enumerate(event_labels, 1):
    ax = plt.subplot(1, len(event_labels), i)
    
    # Bars start at each bin's lower edge and are as high as the bin is wide.
    _bins = kmeans_table.bins[event_label]
    _counts = -np.array(kmeans_table.counts[event_label])
    ax.barh(_bins[:-1], _counts, np.diff(_bins), align='edge', label='k-means')#, alpha=0.5)    
    _bins = uniform_table.bins[event_label]
    _counts = np.array(uniform_table.counts[event_label])
    ax.barh(_bins[:-1], _counts, np.diff(_bins), align='edge', label='uniform')#, alpha=0.5)
    
    if i == 1:
        ax.set_ylabel('Observed value')
    if i == 2:
        ax.set_xlabel('Number of occurrences')  # label typo fixed (was 'occurance')
    ax.set_title(f'{event_label}')
    ax.set_ymargin(0)

    # Make the x-axis symmetric around zero before the ticks are computed.
    xabs_max = abs(max(ax.get_xlim(), key=abs))
    ax.set_xlim(-xabs_max, xabs_max)

    ax.xaxis.set_major_locator(plt.MaxNLocator(1))
    ax.xaxis.set_major_formatter(ScalarFormatter())
    ax.get_xaxis().get_offset_text().set_visible(False)
    ax.ticklabel_format(axis='x', style='sci', useMathText=True, scilimits=(-3,3))
    if i < 3:
        # Label ticks as "<mantissa> x 10^<exponent>" of the absolute count.
        ax_max = max(ax.get_xticks())
        exponent_axis = np.floor(np.log10(ax_max)).astype(int)
        # The exponent is wrapped in braces so that mathtext raises all of its
        # digits, not only the first one.
        ax.set_xticklabels([f'${c//(10**(exponent_axis-1))}\\times10^{{{exponent_axis-1}}}$' if c != 0 else '0' for c in np.abs(ax.get_xticks()).round(0).astype(int)])
    else:
        # Third panel: plain absolute counts.
        ax.set_xticklabels(np.abs(ax.get_xticks()).round(0).astype(int))
    # Re-apply the symmetric limits after the tick manipulation.
    xabs_max = abs(max(ax.get_xlim(), key=abs))
    ax.set_xlim(-xabs_max, xabs_max)
    

plt.tight_layout(pad=0, h_pad=0)
plt.savefig('notebooks/figures/discritization.pdf')

In [None]:
# Left over from the plotting loop above (value computed for the last panel with i < 3).
exponent_axis

In [None]:
# Tick labels of the last axes created by the plotting loop.
ticks = ax.get_xticklabels()

In [None]:
ticks

In [None]:
# Time input
# Compare two candidate encodings of the elapsed hours h over 720 hours.
fig, ax = plt.subplots()
ax.set_title('Time Input')
hours = np.arange(720)
# Raw strings keep the LaTeX backslashes literal; '\l' and '\e' are invalid escape
# sequences in ordinary string literals and trigger a warning.
ax.plot(hours, np.log(hours + 1), label=r'$\log(h+1)$')
# The legend states the function that is actually plotted, exp(h / 1000) - 1.
ax.plot(hours, np.exp(hours / 1000) - 1, label=r'$\exp(h/1000)-1$')
ax.set_xlabel('Hours ($h$)')
ax.set_ylabel('Time Feature ($F_h$)')
ax.legend()

In [None]:
# Per patient: the number of entries in CHARTEVENTS (stored in n_hours) and, per
# table, a (filename, [number of entries per hour]) tuple.
n_hours = []
n_measures = {name: [] for name in ('CHARTEVENTS',
                                    'LABEVENTS',
                                    'OUTPUTEVENTS',
                                    'INPUTEVENTS_*',
                                    'PRESCRIPTIONS')}
for patient in train_set:
    n_hours.append(len(patient['inputs']['CHARTEVENTS']))
    for table, per_table in n_measures.items():
        hourly_counts = [len(hour) for hour in patient['inputs'][table]]
        per_table.append((patient['extra']['filename'], hourly_counts))



In [None]:
np.mean(n_hours), np.median(n_hours)

In [None]:
for tabl, patients in n_measures.items():
    print('mean number of measures in', tabl, np.mean([np.mean([hour for hour in hours if hour > 0]) for hours in patients if [hour for hour in hours if hour > 0]]))
    print('rate of eventful hours', tabl, np.mean([np.mean(np.array(hours)>0) for hours in patients]))

In [None]:
np.mean(np.array(patients[0]) > 0)

In [None]:
len(n_hours)

# Benchmark features

In [None]:
from dataloader.utils import feature_string

In [None]:
df = pd.read_csv('mimic3-benchmarks/mimic3benchmark/resources/itemid_to_variable_map.csv')

In [None]:
df[df.STATUS == 'ready']['MIMIC LABEL']\
    .apply(feature_string).apply(lambda x: '^' + x + '(_?.*?)?\s')\
    .to_csv('embeddings/benchmark_features_greppatterns3', index=False, header=False)


In [None]:
import os
# NOTE(review): hard-coded absolute path to one user's home directory; this only
# works on that machine and overrides the relative chdir from the first cell.
os.chdir('/home/oserbetci/EffiCare')

In [None]:
import pandas as pd

In [None]:
# Space-separated embedding file; header=0 makes the first line the column header
# (presumably the "<count> <dim>" line of the .vec format — confirm).
vecs = pd.read_csv('embeddings/sentences.mimic3.txt.100d.Fasttext.15ws.onlybenchmark.vec', sep=' ', header=0)

In [None]:
vecs

# Sampler

In [None]:
import samplers
# Build the project's sampler on the training set.
# NOTE(review): train_set.numericalize was set to False in an earlier cell.
sampler = samplers.DiagnoseAgeSubjectRandomSampler(train_set)


In [None]:
# Show only the first item yielded by the sampler.
for sample in sampler:
    print(sample)
    break

In [None]:
# NOTE(review): rebinds `df`, which previously held the itemid-to-variable map.
df = sampler.sorted_df

In [None]:
sampler.df