In [None]:
import os
# Switch the working directory to the parent folder before anything else runs.
# NOTE(review): presumably the notebook lives one level below the project root
# and the relative paths / project imports below rely on this — confirm.
# NOTE: a relative chdir is not idempotent; re-running this cell moves up one
# more level each time, so run it exactly once per kernel.
os.chdir('..')

In [None]:
import re
import torch
import utils
import yaml
from glob import glob

import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd

from dataloader.data import MIMICDataset, get_tables, JointTabularFeature
from dataloader.labels import get_labels
from dataloader.utils import BinnedEvent, get_vocab
from utils import prepare_batch, load_class, load_model, load_config

In [None]:
# Torch device used for the labels and the restored model.
# Falls back to the CPU when CUDA is unavailable so the notebook still runs on
# machines without a GPU instead of failing at the first `.to(DEVICE)`.
DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'

In [None]:
# Identifier of the experiment whose configuration is inspected in this notebook
# (presumably the wandb run id — see params['wandb_id'] below; confirm).
RUN_ID = '3hn50mmn'
params = load_config(RUN_ID)

In [None]:
# Display the 'wandb_id' entry of the loaded configuration as a sanity check.
params['wandb_id']

In [None]:
# Local overrides applied on top of the loaded configuration.
# NOTE: the bare `params` below is not the last statement of the cell, so it is
# evaluated but nothing is displayed.
params
# Optional override, currently disabled:
# params['min_word_count'] = 10000
# Override the configured batch size with 1 for this notebook.
params['batch_size'] = 1
# Optional override, currently disabled:
# params['vocab_file'] = 'embeddings/sentences.mimic3.hourly.random.binned.train.counts'

In [None]:
# Display the 'joint_tables' entry of the configuration.
params['joint_tables']

In [None]:
# Vocabulary shared by all tables, built from the run configuration.
joint_vocab = get_vocab(**params)

# Table objects bound to that vocabulary, using binned events.
tables = get_tables(
    vocab=joint_vocab,
    load=True,
    event_class=BinnedEvent,
    **params,
)

labels = get_labels(DEVICE)

# Dataset over the validation list file.
# NOTE(review): mode='TRAIN' is passed together with the validation list file —
# confirm this is intentional.
val_set = MIMICDataset(
    datalist_file='val_listfile.csv',
    mode='TRAIN',
    tables=tables,
    labels=labels,
    limit=None,
    numericalize=True,
)

In [None]:
# Restore the model for the run described by `params`.
model = utils.load_model(params, joint_vocab, tables, DEVICE)

# Epoch number parsed from the checkpoint path ("checkpoint_<epoch>_...").
# re.search + group(1) yields the plain digit string; re.findall returned a
# list, whose repr (e.g. "['12']") leaked into the log directory name that is
# built from `loaded_epoch` later on.
_epoch_match = re.search(r'checkpoint_(\d+)_', params['model_path'])
loaded_epoch = _epoch_match.group(1) if _epoch_match else 'unknown'

# Write vocab to tensorboard

In [None]:

# Export the vocabulary vectors to TensorBoard's embedding projector.
# The export is skipped whenever the log directory already exists, so it is
# written at most once per embedding prefix/suffix combination.
log_dir=f'runs/{params["emb_prefix"]}{params["emb_suffix"]}/fasttext_events'
if not os.path.exists(log_dir):
    print("writing embedding")
    os.makedirs(log_dir, exist_ok=True)

    from torch.utils.tensorboard import SummaryWriter
    # The context manager closes the writer even if add_embedding raises,
    # so no event-file handle is leaked on failure.
    with SummaryWriter(log_dir) as writer:
        writer.add_embedding(joint_vocab.vectors, metadata=joint_vocab.itos, tag='fasttext_events')

In [None]:
import numpy as np
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Reduce the vocabulary vectors to their first two principal components.
ft_pca = PCA(n_components=2)
projection = ft_pca.fit_transform(joint_vocab.vectors)

# Print the share of variance captured by each component, then the
# corresponding singular values.
for pca_stat in (ft_pca.explained_variance_ratio_, ft_pca.singular_values_):
    print(pca_stat)

In [None]:
# One row per vocabulary token: 2-D coordinates, the token text, and a flag
# marking tokens that match 'Creatinine'.
df = (
    pd.DataFrame(projection, columns=['x', 'y'])
    .assign(token=joint_vocab.itos)
    .assign(focus=lambda frame: frame['token'].str.contains('Creatinine'))
)
px.scatter(
    df,
    x='x',
    y='y',
    color='focus',
    hover_data=['token'],
)

In [None]:
def get_table_vocabs(tables):
    """Collect ``(vocab_entry, table_name)`` pairs from a collection of tables.

    Tables whose ``.table`` name is ``'dem'`` are skipped. A
    ``JointTabularFeature`` is treated as a container: its member tables
    (``.tables``) are traversed recursively and their pairs are added to the
    result.

    Args:
        tables: iterable of table objects. Plain tables expose ``.table`` (the
            table name) and ``.value_counter`` (a mapping whose keys are the
            vocabulary entries).

    Returns:
        list of ``(vocab_entry, table_name)`` tuples in traversal order.
    """
    vocabs = []
    for table in tables:
        if table.table == 'dem':
            continue
        if isinstance(table, JointTabularFeature):
            # Accumulate the nested result and keep iterating. Returning here
            # would discard pairs gathered so far and skip every table that
            # follows the joint feature.
            vocabs.extend(get_table_vocabs(table.tables))
            continue
        vocabs.extend((vocab, table.table) for vocab in table.value_counter.keys())
    return vocabs

In [None]:
# Look-up frame mapping each vocabulary label to the table it comes from.
table_vocabs = get_table_vocabs(tables)
vocab_df = pd.DataFrame(table_vocabs, columns=['label', 'source'])

# Label = the part of the token before its first '=' (the whole token when it
# contains none).
# NOTE(review): other cells use keys that themselves contain '=' — confirm the
# labels extracted here match the keys returned by get_table_vocabs.
df['label'] = df['token'].str.extract('(.+?)(?:=.*)?$')

# Drop any 'source' column left over from a previous execution so that
# re-running this cell does not produce 'source_x' / 'source_y' columns and
# break the plots that read df['source'].
df = df.drop(columns=['source'], errors='ignore').merge(vocab_df, how='left', on='label')

In [None]:
# Flag the rows to highlight in the scatter plot.
# Alternative highlight, currently disabled:
# df['focus'] = df['source'] == 'LABEVENTS'
df['focus'] = df['label'].eq('Heart-Rhythm')
px.scatter(
    df,
    x='x',
    y='y',
    color='focus',
    hover_data=['token', 'label', 'source'],
    opacity=0.5,
)

# Use learned model

In [None]:
# model.timestep_encoder.model.linear.weight.data = model.timestep_encoder.model.linear.weight[:, :100]

In [None]:
# Input matrix fed to the learned encoder: the vocabulary vectors, with two
# zero columns appended when the event encoder has `include_time` set.
# NOTE: the name `input` shadows the Python builtin; it is kept because later
# cells read it.
input = (
    torch.cat([joint_vocab.vectors, torch.zeros(len(joint_vocab), 2)], 1)
    if model.timestep_encoder.event_encoder.include_time
    else joint_vocab.vectors
)

In [None]:
# Inspect the timestep encoder's inner model.
# NOTE(review): `parameters` is referenced without being called, so the cell
# shows the repr of the attribute itself rather than iterating the parameters —
# confirm whether `.parameters()` or the module itself was intended.
model.timestep_encoder.model.parameters

In [None]:
# Export the vocabulary vectors, after passing them through the timestep
# encoder's linear layer, to TensorBoard's embedding projector.
# The export is skipped whenever the log directory already exists.
log_dir=f'runs/{params["emb_prefix"]}{params["emb_suffix"]}/{params["wandb_id"]}_e{loaded_epoch}_events'
if not os.path.exists(log_dir):
    print("writing embedding")
    os.makedirs(log_dir, exist_ok=True)

    from torch.utils.tensorboard import SummaryWriter
    # The context manager closes the writer even if add_embedding raises,
    # so no event-file handle is leaked on failure.
    with SummaryWriter(log_dir) as writer:
        # Transformer variant, currently disabled:
        # writer.add_embedding(model.timestep_encoder.model.transformer_encoder(input[None]).detach()[0], metadata=joint_vocab.itos, tag=f'{params["wandb_id"]}_events')
        writer.add_embedding(model.timestep_encoder.model.linear(input).detach(), metadata=joint_vocab.itos, tag=f'{params["wandb_id"]}_events')

In [None]:
# 2-D t-SNE layout of the vocabulary vectors after the timestep encoder's
# linear layer.
# NOTE: despite its name, `momentum_pca` holds a TSNE estimator, not a PCA; the
# name is kept so existing references keep working.
# random_state is fixed so the layout is reproducible across kernel restarts.
momentum_pca = TSNE(n_components=2, random_state=0)

# Hand scikit-learn a CPU NumPy array (a no-op move when already on the CPU).
learned_vectors = model.timestep_encoder.model.linear(input).detach().cpu().numpy()
projection = momentum_pca.fit_transform(learned_vectors)

# TSNE exposes neither `explained_variance_ratio_` nor `singular_values_`
# (those are PCA attributes and raised AttributeError here); report the final
# KL divergence of the fit instead.
print(momentum_pca.kl_divergence_)

In [None]:
# Look-up frame mapping each vocabulary label to its source table.
table_vocabs = get_table_vocabs(tables)
vocab_df = pd.DataFrame(table_vocabs, columns=['label', 'source'])

# Rebuild the plotting frame from the current 2-D projection: coordinates,
# token text, label (token text before its first '=') and source table.
df = (
    pd.DataFrame(projection, columns=['x', 'y'])
    .assign(token=joint_vocab.itos)
    .assign(label=lambda frame: frame['token'].str.extract('(.+?)(?:=.*)?$'))
    .merge(vocab_df, how='left', on='label')
)

# Alternative highlight, currently disabled:
# df['focus'] = df['token'].str.contains('Glucose')
df['focus'] = df['label'] == 'Heart-Rhythm'
px.scatter(
    df,
    x='x',
    y='y',
    color='focus',
    hover_data=['token', 'label', 'source'],
    opacity=0.5,
)

In [None]:
# NOTE(review): this cell is identical to the preceding one. It was probably
# copied in order to highlight a different label or source; either change the
# `focus` criterion below or remove the cell.

# Plotting frame built from the current 2-D projection, one row per token.
df = pd.DataFrame(projection, columns=['x', 'y'])
df['token'] = joint_vocab.itos

# Attach the source table of each token via its label (token text before the
# first '=').
table_vocabs = get_table_vocabs(tables)
vocab_df = pd.DataFrame(table_vocabs, columns=['label', 'source'])
df['label'] = df['token'].str.extract('(.+?)(?:=.*)?$')
df = df.merge(pd.DataFrame(vocab_df, columns=['label', 'source']), how='left', on='label')

# Alternative highlight, currently disabled:
# df['focus'] = df['token'].str.contains('Glucose')
df['focus'] = df['label'] == 'Heart-Rhythm'
px.scatter(df, x='x', y='y', color='focus', hover_data=['token', 'label', 'source'], opacity=0.5)

# Bins

In [None]:
# Spot-check the binning of individual features.
# NOTE: only the last expression of a cell is displayed; the first two lines
# are evaluated (and would raise on a missing key) but show nothing.
# NOTE(review): tables[2] / tables[1] are positional look-ups — presumably the
# tables holding the respective prefixed features; confirm against the order
# returned by get_tables.
joint_vocab.stoi['🅛🅔=Glucose=NEG']
tables[2].bins['🅛🅔=Glucose']
tables[1].plot_bin('🅒🅔=Respiratory-Rate')