In [None]:
# Load the autoreload extension and set mode 2, so imported modules are
# re-imported before each cell runs and edits to local helper modules take effect.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
import os
import sys

# Make the local MeTaL checkout importable. The location can be overridden with the
# METAL_REPO environment variable; the default is the original hard-coded path.
metal_repo_path = os.environ.get('METAL_REPO', '/home/jdunnmon/repos/metal')
if metal_repo_path not in sys.path:  # avoid piling up duplicate entries when the cell is re-run
    sys.path.append(metal_repo_path)

# Restrict CUDA to device 0. This is set before torch is imported further down the notebook.
os.environ['CUDA_VISIBLE_DEVICES']='0'
import pandas as pd

In [None]:
# Location of the EEG report export
eeg_data_path = '/home/tsy935/Docs/RubinLab/Data/EEG/Reports'
eeg_data_file = 'reports_unique_for_hl_mm.csv'
data_path = os.path.join(eeg_data_path, eeg_data_file)

# Read the reports (first CSV column is the index), drop rows that are entirely
# empty, and give the two verbose columns short names.
column_aliases = {
    'Note': 'note',
    'Hand Label (1 for seizure, -1 for no seizure, 0 for unsure)': 'hand_label',
}
df_eeg = (
    pd.read_csv(data_path, index_col=0)
    .dropna(how='all')
    .rename(columns=column_aliases)
)

In [None]:
# Smoke test: build one EEGNote from a single report and display it
from eeg_utils import EEGNote

sample_row = 100
note_id = df_eeg['note_uuid'][sample_row]
note_text = df_eeg['note'][sample_row]
noteObj = EEGNote(note_id, note_text)

noteObj

In [None]:
import dill
from eeg_utils import parse_eeg_docs

# Parsed notes are cached next to the raw data so the parse only has to run once.
# Note that 1 = abnormal, 2 = normal!
eeg_note_dill = 'parsed_eeg_notes.dill'
eeg_note_dill_path = os.path.join(eeg_data_path, eeg_note_dill)

if not os.path.exists(eeg_note_dill_path):
    # No cache yet: parse from the dataframe and write the cache
    print('Parsing EEG notes...')
    docs = parse_eeg_docs(df_eeg, use_dask=False)
    with open(eeg_note_dill_path, 'wb') as cache_file:
        dill.dump(docs, cache_file)
else:
    # Cache hit: reuse the previously parsed notes
    print('Loading pre-parsed EEG notes...')
    with open(eeg_note_dill_path, 'rb') as cache_file:
        docs = dill.load(cache_file)

In [None]:
# Collect the parsed docs that get_empty_docs flags as having empty sections --
# most look like they're not EEG reports! They are removed from the working set
# in the next cell.
from eeg_utils import get_empty_docs
empty_docs = get_empty_docs(docs)

In [None]:
# Removing empty EEG docs.
# A set difference would return the survivors in arbitrary order, which makes the
# seeded shuffle that follows non-reproducible. Filter in original order instead;
# dict.fromkeys keeps the de-duplication the set-based version performed.
empty_doc_set = set(empty_docs)
eeg_docs = [doc for doc in dict.fromkeys(docs) if doc not in empty_doc_set]
print(f'Number of EEG Reports with Sections: {len(eeg_docs)}')

In [None]:
import numpy as np
from eeg_utils import create_data_split

# Seed, then shuffle a *copy* of the document list. Shuffling eeg_docs in place
# would make this cell non-idempotent: re-running it would apply the same
# permutation to an already-permuted list and silently change the split.
np.random.seed(1701)
shuffled_docs = list(eeg_docs)
np.random.shuffle(shuffled_docs)

# Creating data split; docs_list keeps the splits in (train, dev, test) order
train_docs, dev_docs, test_docs = create_data_split(shuffled_docs)
docs_list = [train_docs, dev_docs, test_docs]

In [None]:
# Gold labels for the two evaluation splits
Y_dev = np.array([doc.gold_label for doc in dev_docs])
Y_test = np.array([doc.gold_label for doc in test_docs])

# Fraction of documents in each split whose gold label equals 1
dev_balance = np.mean(Y_dev == 1)
test_balance = np.mean(Y_test == 1)

print(f'Dev positive percentage: {dev_balance}')
print(f'Test positive percentage: {test_balance}')

In [None]:
import re
from eeg_lfs import *
from eeg_utils import get_section_with_name

In [None]:
from metal.analysis import single_lf_summary, confusion_matrix

# Pick one labeling function to inspect in isolation
lf_test = lf_impression_section_positive

# Apply it to every dev document and summarise its output against the dev gold labels
Y_lf = np.array(list(map(lf_test, dev_docs)))
single_lf_summary(Y_lf, Y=Y_dev)

In [None]:
# Confusion matrix of the dev gold labels (first argument) against the outputs of
# the single labeling function under test (second argument); the result is kept in `conf`.
conf = confusion_matrix(Y_dev, Y_lf)

In [None]:
# Labeling functions used to build the label matrices. The order of this list
# is also the order of the names later derived from it via `lf.__name__`.
lfs = [
    lf_normal_interp_not_seizure,
    lf_abnormal_interp_with_seizure,
    lf_findall_interp_with_seizure,
    lf_findall_abnl_interp_without_seizure,
    lf_abnl_interp_negexsp_seizure,
    lf_findall_interp_negex_seizure,
    lf_seizure_section,
    lf_impression_section_negative,
    lf_impression_section_positive,
    lf_spikes_in_impression,
    lf_extreme_words_in_impression
]

In [None]:
from scipy.sparse import csr_matrix
import dask
from dask.diagnostics import ProgressBar
from eeg_utils import evaluate_lf_on_docs, create_label_matrix
import pickle

# Set clobber_lfs to True to ignore any cached label matrices and recompute them
clobber_lfs = True
Ls_file = 'Ls_0p3.pkl'
Ys_file = 'Ys_0p3.pkl'

# Get lf names (same order as the columns produced from `lfs`)
lf_names = [lf.__name__ for lf in lfs]

# Gold label vectors per split, in (train, dev, test) order; train has no gold labels
Ys = [[], Y_dev, Y_test]

if clobber_lfs or (not os.path.exists(Ls_file)):
    print('Computing label matrices...')
    # One label matrix per split. The comprehension variable is deliberately not
    # called `docs`, so the global list of parsed documents is left intact.
    Ls = [
        create_label_matrix(lfs, split_docs)
        for split_docs in [train_docs, dev_docs, test_docs]
    ]
    with open(Ls_file,'wb') as af:
        pickle.dump(Ls, af)

    print('Creating label vectors...')
    # Persist the label vectors (previously the label matrices were written here by mistake)
    with open(Ys_file,'wb') as af:
        pickle.dump(Ys, af)
else:
    print('Loading pre-computed label matrices...')
    # NOTE: pickle.load executes arbitrary code; only load cache files you created yourself.
    # NOTE(review): a cached file may be stale if `lfs` changed since it was written.
    with open(Ls_file,'rb') as af:
        Ls = pickle.load(af)

In [None]:
from metal.analysis import lf_summary

# Analyzing LF stats: summarise the dev label matrix (Ls[1]) against the dev gold
# labels, one entry per labeling-function name, and display the result.
df_lf = lf_summary(Ls[1], Y=Y_dev, lf_names=lf_names)
df_lf

In [None]:
from metal.contrib.visualization.analysis import view_label_matrix, view_overlaps

# Viewing the label matrix of the first split (Ls[0], built from train_docs)
view_label_matrix(Ls[0])

In [None]:
from  metal.contrib.visualization.analysis import view_conflicts

# Viewing conflicts in the dev label matrix (Ls[1]), with normalize=True
view_conflicts(Ls[1], normalize=True)

In [None]:
from metal.label_model import LabelModel
from metal.utils import LogWriter
from metal.tuners import RandomSearchTuner

# Hyperparameter search space for the label model.
# Both the L2 penalty and the learning rate use a log scale.
search_space = dict(
    l2=dict(range=[0.0001, 0.1], scale='log'),
    lr=dict(range=[0.0001, 0.01], scale='log'),
)

# Random-search tuner over LabelModel; run artefacts go under ./run_logs
searcher = RandomSearchTuner(
    LabelModel,
    log_dir='./run_logs',
    log_writer_class=None,
)

In [None]:
%%time
# Random search for the label model: each candidate is trained on the train
# label matrix (Ls[0]) and the dev pair (Ls[1], Ys[1]) is passed as the evaluation data.
label_model = searcher.search(
    search_space,
    (Ls[1], Ys[1]),
    train_args=[Ls[0]],
    init_args=[],
    init_kwargs={'k': 2, 'seed': 123},
    train_kwargs={'n_epochs': 100},
    max_search=20,
)

In [None]:
# Saving best model.
# NOTE(review): this calls a private (underscore-prefixed) method of the tuner,
# which may change between MeTaL versions -- confirm there is no public equivalent.
searcher._save_best_model(label_model)

In [None]:
# Getting scores: evaluate the selected label model on the dev split
# (label matrix Ls[1], gold labels Ys[1]) for the four listed metrics.
scores = label_model.score((Ls[1], Ys[1]), metric=['accuracy','precision', 'recall', 'f1'])

In [None]:
from metal.label_model.baselines import MajorityLabelVoter

# Checking if we beat majority vote: score the majority-vote baseline on the dev
# split (Ls[1], Ys[1]) with accuracy, precision, recall and f1.
# NOTE: this rebinds `scores`, replacing whatever was stored under that name before.
mv = MajorityLabelVoter(seed=123)
scores = mv.score((Ls[1], Ys[1]), metric=['accuracy', 'precision', 'recall', 'f1'])

In [None]:
# Probabilistic ("soft") labels from the label model for every split.
# Naming: Y_train_ps = "Y[labels]_train[split]_p[redicted]s[oft]".
# Y_ps holds them in (train, dev, test) order.
Y_ps = [label_model.predict_proba(L_split) for L_split in Ls[:3]]
Y_train_ps, Y_dev_ps, Y_test_ps = Y_ps

In [None]:
# Running some analysis: label-model predictions on the dev label matrix,
# plotted as a histogram together with the dev gold labels (Ys[1]).
from metal.contrib.visualization.analysis import plot_predictions_histogram
Y_dev_p = label_model.predict(Ls[1])
plot_predictions_histogram(Y_dev_p, Ys[1], title="Label Distribution")

In [None]:
from  metal.contrib.visualization.analysis  import plot_probabilities_histogram

# Looking at the probability histogram for the *dev* soft labels: column 0 of Y_dev_ps
# (presumably the probability of the first class -- TODO confirm the column order).
plot_probabilities_histogram(Y_dev_ps[:,0], title="Probablistic Label Distribution")

In [None]:
from  metal.analysis import confusion_matrix

# Confusion matrix of the dev gold labels (Ys[1]) against the label-model
# predictions on dev (Y_dev_p); the result is kept in `cm`.
cm = confusion_matrix(Ys[1], Y_dev_p)

In [None]:
from metal.contrib.featurizers.embedding_featurizer import TrainableEmbeddingFeaturizer

# Featurizer for the token sequences
# TODO: use a different one for IdentityModule!
featurizer = TrainableEmbeddingFeaturizer()

# Raw token sequences, one list per split in (train, dev, test) order
Xs = [
    [doc.tokens for doc in doc_split]
    for doc_split in (train_docs, dev_docs, test_docs)
]

# Concatenate the splits so the featurizer is fit and applied once; the
# cumulative split sizes mark where each split ends in the flat list.
X_flat = Xs[0] + Xs[1] + Xs[2]
lens = np.cumsum([len(X) for X in Xs])

# Fit on all documents, then transform them in one pass
featurizer.fit(X_flat, min_freq=100)
X_trans = featurizer.transform(X_flat).float()

# Cut the transformed data back into (train, dev, test)
train_end, dev_end, test_end = lens
X_trans = [X_trans[:train_end], X_trans[train_end:dev_end], X_trans[dev_end:test_end]]

# Report the length of one transformed row
print(f'Embedding size: {len(X_trans[0][0])}')

In [None]:
from metal.end_model import EndModel
from metal.modules import LSTMModule, IdentityModule

# LSTM parameters
hidden_size = 50
embed_size = 100
# NOTE(review): len(X_trans[0][0]) is the length of the first transformed training row;
# confirm this is the value LSTMModule expects for vocab_size.
vocab_size = len(X_trans[0][0]) # Update Metal to handle this more gracefully!
input_module = LSTMModule(embed_size, hidden_size, vocab_size = vocab_size)

# Identity parameters (alternative input module, currently disabled)
#feature_size = len(X_trans[0][0])
#hidden_size = 1000
#input_module = IdentityModule()

# Defining end model with layer sizes [embed_size, 100, 2], seeded and run on the GPU.
# NOTE(review): the first layer size is embed_size (100); presumably it has to match the
# output width of input_module -- confirm against LSTMModule before changing hidden_size.
end_model = EndModel([embed_size,100,2], input_module=input_module, seed=123, use_cuda=True)

In [None]:
import torch
from torch.utils.data import DataLoader
from metal.utils import MetalDataset

batch_size = 256

# Train on the label model's probabilistic labels; validate on the dev gold labels
train_tensors = (X_trans[0].long(), torch.Tensor(Y_train_ps))
dev_tensors = (X_trans[1].long(), torch.Tensor(Ys[1]))

# Wrap both splits in shuffled mini-batch loaders
train_data = DataLoader(MetalDataset(*train_tensors), shuffle=True, batch_size=batch_size)
dev_data = DataLoader(MetalDataset(*dev_tensors), shuffle=True, batch_size=batch_size)

# Training end model
end_model.train_model(
    train_data,
    dev_data=dev_data,
    l2=0.00001,
    lr=0.001,
    batch_size=batch_size,
    num_workers=8,
    n_epochs=10,
    print_every=1,
    validation_metric='accuracy',
)

# Emptying cuda cache (add this to metal?)
torch.cuda.empty_cache()

In [None]:
# Evaluate both models on the test split with the same set of metrics
eval_metrics = ['accuracy', 'precision', 'recall', 'f1']
test_labels = Ys[2]

print("Label Model:")
score = label_model.score((Ls[2], test_labels), metric=eval_metrics)

print()

print("End Model:")
score = end_model.score((X_trans[2].long(), test_labels), metric=eval_metrics)

In [None]:
# Write the label-model marginals for every split next to the saved best model
save_dest = os.path.dirname(searcher.save_path)
splits = ['train','dev','test']
save_csvs = True
results_df = {}
for split, split_docs, split_marginals in zip(splits, docs_list, Y_ps):
    # One row per document: its id and its label-model marginal
    frame = pd.DataFrame({
        'id': [doc.doc_id for doc in split_docs],
        'gm_marginals': list(split_marginals),
    })
    results_df[split] = frame

    # Writing dataframe
    if save_csvs:
        frame.to_csv(os.path.join(save_dest, f'metal_results_{split}.csv'))

## SANDBOX