# Create demographic tables for identical-burst vs. non-identical-burst patients
### Edit the variables "similarity_function" (dtw or xcorr), "percentile", and "good_vs_bad_similarity_cutoff" as desired

In [1]:
import math
import os
import json
import matplotlib.pyplot as plt
import IPython
import numpy as np
import pandas as pd
import cPickle as pickle
import sys
import h5py
import copy

from similarity_output_collectors import *
from similarity_output_analysis import *
from similarity_scaling import scale_arr

sys.path.append('..')
from histogram import *
sys.path.append('../../')
import utils
import readers
from readers.patient_info import PatientInfo
from readers.similarity_output_reader import SimilarityOutputReader

### Define the directory containing the similarity output

In [2]:
# Directory holding the similarity output; it is handed to SimilarityOutputReader below.
# NOTE(review): this is an absolute, user-specific path -- edit it before running elsewhere.
similarity_output_dir = '/afs/csail.mit.edu/u/t/tzhan/NFS/script_output/similarity_clip/'
# A short tag for the directory. It is embedded in the names of the pickled
# collector files and in image_info, so results from different output
# directories do not overwrite each other.
pickle_tag = 'clip500ms'

### Create the required readers

In [3]:
# Path (relative to the notebook's working directory) passed to PatientInfo.
outcomes_path = '../../../../patient_outcome_info/'
# Notebook-level reader objects; the functions defined further down use
# patientInfo and similarityReader as globals rather than taking them as arguments.
patientInfo = PatientInfo(outcomes_path)
similarityReader = SimilarityOutputReader(similarity_output_dir)

## Define processing

In [4]:
def get_similarity_info(similarity_fn, min_episode_length, collectors, scale_fn, scale_param, max_iter=None, iter_start=None):
    """Feed each included patient's scaled similarity values to every collector.

    Uses the notebook-level ``patientInfo`` and ``similarityReader`` objects.

    Parameters
    ----------
    similarity_fn : str
        Name of the similarity measure; forwarded to
        ``similarityReader.get_similarities`` and ``scale_arr``. This notebook
        calls the function with 'dtw' and 'xcorr'.
    min_episode_length :
        Forwarded unchanged to ``similarityReader.get_similarities``.
    collectors : list
        Objects exposing ``add_patient_all_sims(sims, outcome)`` and ``done()``.
    scale_fn, scale_param :
        Forwarded to ``scale_arr`` as ``scale_fn`` and ``param``.
    max_iter : int or None
        Index into the patient list at which to stop (exclusive). ``None``
        means process through the end of the list.
    iter_start : int or None
        Index of the first patient to process. ``None`` means start at 0.

    Returns
    -------
    (collectors, processed_patients)
        The same collectors (after ``done()`` has been called on each) and the
        list of patient ids that were actually fed to them, in order. Excluded
        and skipped patients are NOT in this list, so it stays aligned with
        any per-patient rows the collectors accumulated.
    """
    patients = patientInfo.get_all_sids()
    # Only patients that reach the collectors are recorded here; returning the
    # full sid list would misalign it with the collectors' per-patient data.
    processed_patients = []
    for i, patient in enumerate(patients):
        if iter_start is not None and i < iter_start:
            continue
        # Checked before the exclusion test (and with >=) so that an excluded
        # patient sitting exactly at index max_iter cannot cause the stop
        # condition to be skipped.
        if max_iter is not None and i >= max_iter:
            break
        print(i, patient)
        if patientInfo.is_excluded(patient):
            print('excluded, continuing')
            continue
        patient_all_sims = []
        for (edf_name, _, _) in patientInfo.get_edfs_and_indices(patient, max_num_hours=72):
            episode_to_sims_list = similarityReader.get_similarities(edf_name, similarity_fn,
                                                                     convert_to_np=True,
                                                                     min_episode_length=min_episode_length)
            edf_all_sims = similarityReader.flatten_similarities_list(episode_to_sims_list)
            patient_all_sims.extend(edf_all_sims)
        patient_all_sims = np.array(patient_all_sims)
        patient_all_sims = scale_arr(patient_all_sims, similarity_fn, scale_fn=scale_fn, param=scale_param)
        outcome = patientInfo.get_outcome(patient)
        for collector in collectors:
            collector.add_patient_all_sims(patient_all_sims, outcome)
        processed_patients.append(patient)

    # Let every collector finalize its accumulated state.
    for collector in collectors:
        collector.done()
    return collectors, processed_patients

In [5]:
def create_collectors(bin_size):
    """Build a fresh set of collectors for one pass over the patients.

    ``bin_size`` is forwarded to both histogram collectors. The returned list
    is ordered: percentile, mean, good-outcome histogram, bad-outcome
    histogram, burst-suppression outcome, outcome.
    """
    return [
        PercentileCollector(),
        MeanCollector(),
        # The histogram collectors take a predicate selecting which outcomes to include.
        HistogramCollector(lambda outcome: utils.is_good_outcome(outcome), bin_size=bin_size),
        HistogramCollector(lambda outcome: utils.is_bad_outcome(outcome), bin_size=bin_size),
        BSOutcomeCollector(),
        OutcomeCollector(),
    ]

In [6]:
def get_pickled_histogram_collector(histogramC):
    """Return a deep copy of ``histogramC`` whose ``outcome_predicate`` is None.

    The collector passed in is left untouched. The predicates installed by
    create_collectors are lambdas, which pickle cannot serialize, so the copy
    that goes to disk has the predicate cleared.
    """
    picklable_copy = copy.deepcopy(histogramC)
    picklable_copy.outcome_predicate = None
    return picklable_copy
    
def pickle_dtw_info(percentileC, meanC, goodHistogramC, badHistogramC, bsOutcomeC, outcomeC, patients, save_path):
    """Pickle the DTW collectors together with the patient list to ``save_path``.

    The file contains the tuple
    ``([percentileC, meanC, goodHistogramC, badHistogramC, bsOutcomeC, outcomeC], patients)``.
    The two histogram collectors are saved as copies with their
    ``outcome_predicate`` cleared; the objects passed in are not modified.
    """
    goodHistogramC_pkl = get_pickled_histogram_collector(goodHistogramC)
    badHistogramC_pkl = get_pickled_histogram_collector(badHistogramC)
    payload = ([percentileC, meanC, goodHistogramC_pkl, badHistogramC_pkl, bsOutcomeC, outcomeC], patients)
    # Binary mode matches the "rb" used when the file is read back, and the
    # with-block guarantees the handle is flushed and closed.
    with open(save_path, "wb") as pkl_file:
        pickle.dump(payload, pkl_file)
    
def pickle_xcorr_info(percentileC, meanC, goodHistogramC, badHistogramC, save_path):
    """Pickle the cross-correlation collectors to ``save_path``.

    The file contains the list
    ``[percentileC, meanC, goodHistogramC, badHistogramC]``. The two histogram
    collectors are saved as copies with their ``outcome_predicate`` cleared;
    the objects passed in are not modified.
    """
    goodHistogramC_pkl = get_pickled_histogram_collector(goodHistogramC)
    badHistogramC_pkl = get_pickled_histogram_collector(badHistogramC)
    payload = [percentileC, meanC, goodHistogramC_pkl, badHistogramC_pkl]
    # Binary mode matches the "rb" used when the file is read back, and the
    # with-block guarantees the handle is flushed and closed.
    with open(save_path, "wb") as pkl_file:
        pickle.dump(payload, pkl_file)

In [7]:
def single_patient_histogram(patient, similarity_fn, scale_fn, scale_param, min_episode_length):
    """Describe/plot the scaled similarity distribution of a single patient.

    Gathers the similarity values of every EDF returned by
    ``patientInfo.get_edfs_and_indices(patient, max_num_hours=72)``, scales
    them with ``scale_arr`` and passes the result to ``describe_array`` with a
    500-bin normalized histogram. Returns whatever ``describe_array`` returns.

    Uses the notebook-level ``patientInfo`` and ``similarityReader`` objects.
    """
    patient_all_sims = []
    for (edf_name, _, _) in patientInfo.get_edfs_and_indices(patient, max_num_hours=72):
        episode_to_sims_list = similarityReader.get_similarities(edf_name, similarity_fn,
                                                                 convert_to_np=True,
                                                                 min_episode_length=min_episode_length)
        patient_all_sims.extend(similarityReader.flatten_similarities_list(episode_to_sims_list))
    patient_all_sims = np.array(patient_all_sims)
    patient_all_sims = scale_arr(patient_all_sims, similarity_fn, scale_fn=scale_fn, param=scale_param)
    # The patient's outcome is not needed for the histogram, so it is no
    # longer looked up here (the value was previously fetched and never used).
    return describe_array(patient_all_sims, '{} similarity histogram'.format(patient),
                          'similarity {}, {} scaling'.format(similarity_fn, scale_fn),
                          'density', rug=False, norm_hist=True, bins=500)

In [8]:
def get_total_eeg_recording_length(patient_sid):
    """Return the patient's total EEG recording length, converted by utils.samples_to_hour.

    The length is the sum of (end index - start index) over every EDF entry
    reported by ``patientInfo.get_edfs_and_indices(patient_sid)``.
    """
    edf_spans = patientInfo.get_edfs_and_indices(patient_sid)
    total_samples = sum(end_ind - start_ind for (_, start_ind, end_ind) in edf_spans)
    return utils.samples_to_hour(total_samples)

In [9]:
def create_demographic_table(similarity_function, percentile, similarity_cutoff=None):
    """Build the demographic tables for non-identical vs. identical burst patients.

    Reads the notebook-level globals ``patients``, ``dtw_percentileC``,
    ``xcorr_percentileC`` and ``patientInfo``.

    Parameters
    ----------
    similarity_function : str
        'dtw' or 'xcorr'; selects which percentile collector is used.
    percentile : int
        Multiple of 10 in [0, 100]. ``percentile / 10`` is used as the column
        index into the collector's ``percentiles_list``.
        NOTE(review): this assumes the columns are the deciles 0, 10, ..., 100
        -- confirm against PercentileCollector.
    similarity_cutoff : float or None
        Patients with similarity <= cutoff go to the non-identical table, the
        rest to the identical table. When None (the default) the global
        ``good_vs_bad_similarity_cutoff`` is used, as before.

    Returns
    -------
    (non_identical_table, identical_table) : pandas.DataFrame pair
        Both sorted by ascending similarity with a fresh 0..n-1 index.

    Raises
    ------
    AssertionError
        If ``percentile`` or ``similarity_function`` is invalid.
    """
    # The range is checked as well as the multiple-of-10 rule: a negative
    # percentile would otherwise silently index from the end of the array.
    assert(percentile%10==0 and 0 <= percentile <= 100), 'percentile must be multiple of 10 between 0 and 100'
    assert(similarity_function in ['dtw', 'xcorr']), 'similarity function must be one of "dtw" or "xcorr"'
    if similarity_cutoff is None:
        similarity_cutoff = good_vs_bad_similarity_cutoff
    decile = int(percentile/10)
    if similarity_function=='dtw':
        similarity_x = dtw_percentileC.percentiles_list[:, decile]
    else:
        similarity_x = xcorr_percentileC.percentiles_list[:, decile]

    column_names = ['patient_id', 'similarity', 'age', 'sex',
                    'vfib', 'cpc', 'ROSCmin', 'eeg duration']
    df_rows = []
    for pt, pt_similarity in zip(patients, similarity_x):
        # Patients without a similarity value are left out of the table.
        if np.isnan(pt_similarity):
            continue
        # Outcome -1 is skipped -- presumably "outcome unknown"; confirm with PatientInfo.
        if patientInfo.get_outcome(pt)==-1:
            continue
        patient_info = patientInfo.get_patient_clinical_info(pt)
        df_rows.append({'patient_id': pt,
                        'similarity': pt_similarity,
                        'age': patient_info['age'],
                        'sex': patient_info['sex'],
                        'vfib': patient_info['vfib'],
                        'cpc': patient_info['bestCpcBy6Mo'],
                        'ROSCmin': patient_info['ROSCmin'],
                        'eeg duration': get_total_eeg_recording_length(pt)})
    demographic_table = pd.DataFrame(df_rows, columns=column_names)
    demographic_table = demographic_table.sort_values('similarity')

    non_identical = demographic_table['similarity'] <= similarity_cutoff
    identical = demographic_table['similarity'] > similarity_cutoff

    non_identical_table = (demographic_table[non_identical]).reset_index(drop=True)
    identical_table = (demographic_table[identical]).reset_index(drop=True)
    return non_identical_table, identical_table

## Read and process the results

### Define the parameters used when computing the results

In [10]:
# Forwarded to similarityReader.get_similarities as min_episode_length.
min_episode_length = 30
# Scaling applied to the raw similarities: passed to scale_arr as scale_fn / param.
scale_fn = 'exp'
scale_param = 50.0
# Set run_results to True to recompute everything from similarity_output_dir
# (and overwrite the pickles); leave it False to load previously pickled results.
run_results = False

### Derive parameters from the above named parameters

In [11]:
# Tag describing the scaling, used in the pickle file names: the parameter is
# spelled out only for the exp/50 combination, every other setting is tagged
# with the bare scale function name.
scale_tag = '{}_{}'.format(scale_fn, scale_param) if (scale_fn == 'exp' and scale_param == 50) else scale_fn
image_info = '{}_{}'.format(scale_tag, pickle_tag)

# Make sure the directory for the pickled collectors exists.
_pickled_collectors_dir = 'saved_output/pickled_collectors/'
if not os.path.exists(_pickled_collectors_dir):
    os.makedirs(_pickled_collectors_dir)

### Get the "Collector" objects containing the output data, either by reading the output directory or by loading previously saved (pickled) objects

In [12]:
# One file-name template shared by the save and load paths, so both branches
# are guaranteed to agree on where the pickles live.
save_path = 'saved_output/pickled_collectors/{}_similarity_info_{}_{}_{}.pkl'
dtw_save_path = save_path.format('dtw', pickle_tag, min_episode_length, scale_tag)
xcorr_save_path = save_path.format('xcorr', pickle_tag, min_episode_length, scale_tag)

if run_results:
    # Recompute from the similarity output directory and refresh the pickles.
    # Unscaled similarities get a coarser histogram bin than scaled ones.
    if scale_fn=='noscale':
        bin_size = 1.0
    else:
        bin_size = 0.002
    ([dtw_percentileC, dtw_meanC, dtw_goodHistogramC, dtw_badHistogramC,
     bsOutcomeC, outcomeC], patients) = get_similarity_info('dtw', min_episode_length, create_collectors(bin_size), scale_fn=scale_fn, scale_param=scale_param)
    print('pickle dtw')
    pickle_dtw_info(dtw_percentileC, dtw_meanC, dtw_goodHistogramC, dtw_badHistogramC,
                    bsOutcomeC, outcomeC, patients,
                    save_path=dtw_save_path)
    # The outcome collectors and patient list of the xcorr pass are discarded;
    # the ones from the dtw pass above are kept.
    ([xcorr_percentileC, xcorr_meanC, xcorr_goodHistogramC, xcorr_badHistogramC,
    _, _], _) = get_similarity_info('xcorr', min_episode_length, create_collectors(bin_size), scale_fn=scale_fn, scale_param=scale_param)
    print('pickle xcorr')
    pickle_xcorr_info(xcorr_percentileC, xcorr_meanC, xcorr_goodHistogramC, xcorr_badHistogramC,
                    save_path=xcorr_save_path)
else:
    # Load previously pickled results. Only load files written by this
    # notebook: unpickling untrusted data can execute arbitrary code.
    with open(dtw_save_path, "rb") as dtw_pkl_file:
        ([dtw_percentileC, dtw_meanC, dtw_goodHistogramC, dtw_badHistogramC,
         bsOutcomeC, outcomeC], patients) = pickle.load(dtw_pkl_file)
    with open(xcorr_save_path, "rb") as xcorr_pkl_file:
        [xcorr_percentileC, xcorr_meanC, xcorr_goodHistogramC, xcorr_badHistogramC] = pickle.load(xcorr_pkl_file)
    # The predicates were cleared before pickling; put them back.
    dtw_badHistogramC.outcome_predicate = utils.is_bad_outcome
    dtw_goodHistogramC.outcome_predicate = utils.is_good_outcome
    xcorr_goodHistogramC.outcome_predicate = utils.is_good_outcome
    xcorr_badHistogramC.outcome_predicate = utils.is_bad_outcome

# Create the demographics table

### Define the parameters for the "similarity" column of the table

In [13]:
# Which percentile collector supplies the "similarity" column.
similarity_function = 'dtw' # must be 'dtw' or 'xcorr'
# Which percentile of each patient's similarity distribution is used.
percentile = 20  # must be multiple of 10

# Split point between the two tables: patients with similarity <= this value
# go to the non-identical table, the rest to the identical table.
# create_demographic_table reads this name as a global, so this cell must be
# run before the table is created.
good_vs_bad_similarity_cutoff = 0.26

In [14]:
# Build both tables from the parameters chosen in the previous cell.
non_identical_table, identical_table = create_demographic_table(similarity_function, percentile)

In [15]:
# Patients whose similarity is at or below good_vs_bad_similarity_cutoff.
non_identical_table

Unnamed: 0,patient_id,similarity,age,sex,vfib,cpc,ROSCmin,eeg duration
0,ynh59,0.103975,73.0,M,0.0,5.0,60.0,50.688000
1,ynh124,0.105912,65.0,M,0.0,5.0,30.0,45.685333
2,ynh16,0.111029,54.0,M,0.0,5.0,17.0,71.431110
3,ynh87,0.118686,85.0,F,1.0,4.0,12.0,64.784000
4,mgh167,0.121172,50.0,M,0.0,5.0,7.0,71.942054
5,ynh78,0.124738,78.0,F,0.0,5.0,20.0,67.121165
6,mgh87,0.125183,20.0,F,1.0,1.0,27.0,42.820833
7,mgh162,0.125871,71.0,F,1.0,5.0,40.0,23.228444
8,bwh64,0.128493,46.0,F,1.0,5.0,10.0,71.937110
9,mgh97,0.130289,65.0,F,0.0,5.0,15.0,71.907221


In [16]:
# Patients whose similarity is above good_vs_bad_similarity_cutoff.
identical_table

Unnamed: 0,patient_id,similarity,age,sex,vfib,cpc,ROSCmin,eeg duration
0,bwh1,0.261024,61.0,M,0.0,5.0,15.0,33.152000
1,ynh70,0.262109,21.0,F,0.0,5.0,20.0,70.609778
2,mgh153,0.263981,55.0,M,1.0,5.0,60.0,24.275556
3,bwh116,0.264276,46.0,M,0.0,5.0,8.0,59.228610
4,mgh171,0.265500,58.0,F,0.0,5.0,30.0,63.945499
5,ynh65,0.265591,59.0,M,1.0,5.0,20.0,59.576889
6,mgh115,0.267878,56.0,F,0.0,5.0,47.0,70.002943
7,bwh210,0.268768,82.0,M,0.0,5.0,,69.763556
8,ynh51,0.269541,71.0,F,0.0,5.0,10.0,71.587943
9,mgh125,0.271707,69.0,M,0.0,5.0,10.0,71.869610
