In [None]:
import math
import os
import json
import matplotlib.pyplot as plt
import IPython
import numpy as np
import pandas as pd
import cPickle as pickle
import sys
import h5py
import copy

from similarity_output_collectors import *
from similarity_output_analysis import *
from similarity_scaling import scale_arr

sys.path.append('..')
from histogram import *
sys.path.append('../../')
import utils
import readers
from readers.patient_info import PatientInfo
from readers.similarity_output_reader import SimilarityOutputReader

### Define the directory containing the similarity output

In [None]:
# Directory holding the similarity output; it is passed to SimilarityOutputReader.
# NOTE(review): absolute, user-specific AFS path — must be edited to run elsewhere.
similarity_output_dir = '/afs/csail.mit.edu/u/t/tzhan/NFS/script_output/similarity_no_clip/'
# A short tag for the directory above. It is embedded in the file names of the
# pickled collectors and of the saved figures.
pickle_tag = 'no_clip_new'

### Create the required readers

In [None]:
# Relative path to the patient outcome information handed to PatientInfo.
outcomes_path = '../../../../patient_outcome_info/'
# Notebook-level readers; the processing functions defined in this notebook
# read these two globals directly instead of taking them as arguments.
patientInfo = PatientInfo(outcomes_path)
similarityReader = SimilarityOutputReader(similarity_output_dir)

## Define processing

In [None]:
def get_similarity_info(similarity_fn, min_episode_length, collectors, scale_fn, scale_param, max_iter=None, iter_start=None):
    """Feed each included patient's scaled similarities into every collector.

    Uses the notebook-level ``patientInfo`` and ``similarityReader`` globals.

    Args:
        similarity_fn: name of the similarity function to read; this notebook
            calls it with 'dtw' and 'xcorr'.
        min_episode_length: forwarded to ``similarityReader.get_similarities``.
        collectors: objects exposing ``add_patient_all_sims(sims, outcome)``
            and ``done()``.
        scale_fn: forwarded to ``scale_arr`` as ``scale_fn``.
        scale_param: forwarded to ``scale_arr`` as ``param``.
        max_iter: if given, stop before the patient at this index (exclusive
            upper bound on the index into the patient list).
        iter_start: if given, skip patients whose index is below this value.

    Returns:
        ``(collectors, patients)``. ``patients`` is the full list returned by
        ``patientInfo.get_all_sids()``; it also contains patients that were
        excluded or skipped by ``iter_start`` / ``max_iter``.
    """
    patients = patientInfo.get_all_sids()
    for i, patient in enumerate(patients):
        if iter_start is not None and i < iter_start:
            continue
        # The bound is checked before the exclusion test: previously an
        # excluded patient sitting exactly at index max_iter hit `continue`
        # first, the `==` test never fired again, and the loop ran to the end.
        # `>=` also covers iter_start > max_iter.
        if max_iter is not None and i >= max_iter:
            break
        print(i, patient)
        if patientInfo.is_excluded(patient):
            print('excluded, continuing')
            continue
        # Gather the similarities of all of this patient's EDFs into one flat list.
        patient_all_sims = []
        for (edf_name, _, _) in patientInfo.get_edfs_and_indices(patient, max_num_hours=72):
            episode_to_sims_list = similarityReader.get_similarities(edf_name, similarity_fn,
                                                                     convert_to_np=True,
                                                                     min_episode_length=min_episode_length)
            edf_all_sims = similarityReader.flatten_similarities_list(episode_to_sims_list)
            patient_all_sims.extend(edf_all_sims)
        patient_all_sims = np.array(patient_all_sims)
        patient_all_sims = scale_arr(patient_all_sims, similarity_fn, scale_fn=scale_fn, param=scale_param)
        outcome = patientInfo.get_outcome(patient)
        for collector in collectors:
            collector.add_patient_all_sims(patient_all_sims, outcome)

    # Let every collector finalize its accumulated state.
    for collector in collectors:
        collector.done()
    return collectors, patients

In [None]:
def create_collectors(bin_size):
    """Build a fresh list of collectors for one pass over the patients.

    The returned list is ordered as: percentile, mean, good-outcome histogram,
    bad-outcome histogram, BS-outcome and outcome collector. Callers unpack it
    positionally, so the order matters. ``bin_size`` is passed to both
    histogram collectors.
    """
    # Predicates handed to the two histogram collectors.
    is_good = lambda outcome: utils.is_good_outcome(outcome)
    is_bad = lambda outcome: utils.is_bad_outcome(outcome)
    return [
        PercentileCollector(),
        MeanCollector(),
        HistogramCollector(is_good, bin_size=bin_size),
        HistogramCollector(is_bad, bin_size=bin_size),
        BSOutcomeCollector(),
        OutcomeCollector(),
    ]

In [None]:
def get_pickled_histogram_collector(histogramC):
    """Return a deep copy of ``histogramC`` whose ``outcome_predicate`` is None.

    The original collector is left untouched. The load path of this notebook
    re-attaches the predicate after unpickling.
    """
    picklable_copy = copy.deepcopy(histogramC)
    # Clear the predicate on the copy only.
    setattr(picklable_copy, 'outcome_predicate', None)
    return picklable_copy
    
def pickle_dtw_info(percentileC, meanC, goodHistogramC, badHistogramC, bsOutcomeC, outcomeC, patients, save_path):
    """Pickle the DTW collectors and the patient list to ``save_path``.

    The stored object is ``([percentileC, meanC, goodHistogramC, badHistogramC,
    bsOutcomeC, outcomeC], patients)``. The two histogram collectors are stored
    as copies with ``outcome_predicate`` cleared; the collectors passed in are
    not modified.
    """
    goodHistogramC_pkl = get_pickled_histogram_collector(goodHistogramC)
    badHistogramC_pkl = get_pickled_histogram_collector(badHistogramC)
    payload = ([percentileC, meanC, goodHistogramC_pkl, badHistogramC_pkl, bsOutcomeC, outcomeC], patients)
    # Binary mode matches the "rb" used when the file is loaded again, and the
    # with-block guarantees the file is flushed and closed.
    with open(save_path, "wb") as pickle_file:
        pickle.dump(payload, pickle_file)
    
def pickle_xcorr_info(percentileC, meanC, goodHistogramC, badHistogramC, save_path):
    """Pickle the cross-correlation collectors to ``save_path``.

    The stored object is the list ``[percentileC, meanC, goodHistogramC,
    badHistogramC]``. The two histogram collectors are stored as copies with
    ``outcome_predicate`` cleared; the collectors passed in are not modified.
    """
    goodHistogramC_pkl = get_pickled_histogram_collector(goodHistogramC)
    badHistogramC_pkl = get_pickled_histogram_collector(badHistogramC)
    payload = [percentileC, meanC, goodHistogramC_pkl, badHistogramC_pkl]
    # Binary mode matches the "rb" used when the file is loaded again, and the
    # with-block guarantees the file is flushed and closed.
    with open(save_path, "wb") as pickle_file:
        pickle.dump(payload, pickle_file)

In [None]:
def single_patient_histogram(patient, similarity_fn, scale_fn, scale_param, min_episode_length):
    """Plot the histogram of one patient's scaled similarities.

    Uses the notebook-level ``patientInfo`` and ``similarityReader`` globals.

    Args:
        patient: patient identifier passed to ``patientInfo.get_edfs_and_indices``.
        similarity_fn: name of the similarity function to read.
        scale_fn: forwarded to ``scale_arr`` as ``scale_fn``.
        scale_param: forwarded to ``scale_arr`` as ``param``.
        min_episode_length: forwarded to ``similarityReader.get_similarities``.

    Returns:
        Whatever ``describe_array`` returns; callers in this notebook use its
        ``.figure`` attribute to save the plot.
    """
    # Gather the similarities of all of this patient's EDFs into one flat list.
    patient_all_sims = []
    for (edf_name, _, _) in patientInfo.get_edfs_and_indices(patient, max_num_hours=72):
        episode_to_sims_list = similarityReader.get_similarities(edf_name, similarity_fn,
                                                                 convert_to_np=True,
                                                                 min_episode_length=min_episode_length)
        patient_all_sims.extend(similarityReader.flatten_similarities_list(episode_to_sims_list))
    patient_all_sims = scale_arr(np.array(patient_all_sims), similarity_fn, scale_fn=scale_fn, param=scale_param)
    # The patient's outcome was previously looked up here but never used, so
    # that lookup has been removed.
    return describe_array(patient_all_sims, '{} similarity histogram'.format(patient),
                          'similarity {}, {} scaling'.format(similarity_fn, scale_fn),
                          'density', rug=False, norm_hist=True, bins=500)

## Read and process the results

### Define the desired parameters for getting the results

In [None]:
# Forwarded to similarityReader.get_similarities as min_episode_length
# (units are not visible in this notebook — TODO confirm).
min_episode_length = 30
# Name of the scaling that scale_arr applies to the similarities. This notebook
# branches on the values 'noscale' and 'exp'.
scale_fn = 'noscale'
# Forwarded to scale_arr as `param`.
scale_param = 50.0
# set run_results to False if there are saved pickled results you want to use
run_results = False

### Derive parameters from the above named parameters

In [None]:
# scale_tag names the scaling inside saved file names. The parameter is only
# appended for 'exp' scaling with parameter 50; every other combination is
# tagged with the scaling name alone.
if scale_fn=='exp' and scale_param==50:
    scale_tag = '{}_{}'.format(scale_fn, scale_param)
else:
    scale_tag = scale_fn
# Suffix shared by all figure file names written by this notebook.
image_info = '{}_{}'.format(scale_tag, pickle_tag)
# Create every directory this notebook writes into. Previously only the pickle
# directory was created, so the savefig calls in later cells failed whenever
# their target directory did not already exist.
for output_dir in ('saved_output/pickled_collectors/',
                   'saved_output/similarity_histogram_good_vs_bad/',
                   'saved_output/bar_plots/',
                   'saved_output/roc/'):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

### Get the "Collector" objects containing the output data, either by reading the output directory or by loading previously saved objects

In [None]:
# Path template shared by saving and loading. Its placeholders are: similarity
# function, directory tag, minimum episode length, scale tag.
save_path = 'saved_output/pickled_collectors/{}_similarity_info_{}_{}_{}.pkl'
dtw_save_path = save_path.format('dtw', pickle_tag, min_episode_length, scale_tag)
xcorr_save_path = save_path.format('xcorr', pickle_tag, min_episode_length, scale_tag)

if run_results:
    # Recompute everything from the similarity output directory, then cache it.
    if scale_fn=='noscale':
        bin_size = 1.0
    else:
        bin_size = 0.002
    ([dtw_percentileC, dtw_meanC, dtw_goodHistogramC, dtw_badHistogramC,
      bsOutcomeC, outcomeC], patients) = get_similarity_info(
        'dtw', min_episode_length, create_collectors(bin_size),
        scale_fn=scale_fn, scale_param=scale_param)
    print('pickle dtw')
    pickle_dtw_info(dtw_percentileC, dtw_meanC, dtw_goodHistogramC, dtw_badHistogramC,
                    bsOutcomeC, outcomeC, patients,
                    save_path=dtw_save_path)
    # Only the first four xcorr collectors are kept; the outcome collectors and
    # the patient list are taken from the DTW pass above.
    xcorr_collectors = get_similarity_info(
        'xcorr', min_episode_length, create_collectors(bin_size),
        scale_fn=scale_fn, scale_param=scale_param)[0]
    (xcorr_percentileC, xcorr_meanC,
     xcorr_goodHistogramC, xcorr_badHistogramC) = xcorr_collectors[:4]
    print('pickle xcorr')
    pickle_xcorr_info(xcorr_percentileC, xcorr_meanC, xcorr_goodHistogramC, xcorr_badHistogramC,
                      save_path=xcorr_save_path)
else:
    # Load the pickled results written by an earlier run of the branch above.
    # NOTE: unpickling can execute arbitrary code — only load files that this
    # notebook produced itself.
    with open(dtw_save_path, "rb") as dtw_file:
        ([dtw_percentileC, dtw_meanC, dtw_goodHistogramC, dtw_badHistogramC,
          bsOutcomeC, outcomeC], patients) = pickle.load(dtw_file)
    with open(xcorr_save_path, "rb") as xcorr_file:
        [xcorr_percentileC, xcorr_meanC,
         xcorr_goodHistogramC, xcorr_badHistogramC] = pickle.load(xcorr_file)
    # The predicates were cleared before pickling, so re-attach them here.
    dtw_badHistogramC.outcome_predicate = utils.is_bad_outcome
    dtw_goodHistogramC.outcome_predicate = utils.is_good_outcome
    xcorr_goodHistogramC.outcome_predicate = utils.is_good_outcome
    xcorr_badHistogramC.outcome_predicate = utils.is_bad_outcome

# Histograms of similarity for good vs bad

In [None]:
# Labels shared by the good-vs-bad histogram figures. `title` has one '{}'
# placeholder that the plotting cells fill with the similarity function name.
title = 'Similarities ({}) of patients with good vs bad outcomes'
xlabel = 'Similarity'
ylabel = 'Density of burst pairs'

In [None]:
# Percentiles reported for the aggregated good/bad histograms.
percentiles = [0, 20,40, 50, 60,70, 80, 85, 90, 95, 100]
def print_percentiles(percentiles, results):
    """Print ``percentile: result`` pairs on a single line, separated by ", ".

    ``percentiles`` and ``results`` are paired positionally with ``zip``, so
    surplus items in the longer sequence are ignored. The line no longer
    starts with a stray ", ".
    """
    pairs = [str(percentile) + ": " + str(result)
             for percentile, result in zip(percentiles, results)]
    # Single-argument call form, matching the print(...) calls used elsewhere
    # in this notebook.
    print(", ".join(pairs))

In [None]:
# Print the percentiles of the aggregated DTW histograms: bad outcomes first,
# then good outcomes.
for dtw_histogramC in (dtw_badHistogramC, dtw_goodHistogramC):
    dtw_aggregated = dtw_histogramC.aggregate_histograms(normalize_row=True)
    print_percentiles(percentiles,
                      precounted_percentiles(dtw_aggregated, dtw_histogramC.bins, percentiles))

# Overlay the good and bad aggregated histograms in one figure and save it.
dtw_plot_collectors = [dtw_goodHistogramC, dtw_badHistogramC]
dtw_plot_histograms = [collector.aggregate_histograms(normalize_row=True)
                       for collector in dtw_plot_collectors]
dtw_plot_bins = [collector.bins for collector in dtw_plot_collectors]
ax = precounted_histograms(dtw_plot_histograms, dtw_plot_bins, title.format('DTW'),
                           xlabel, ylabel, ['dtw good', 'dtw bad'])
dtw_histogram_path = 'saved_output/similarity_histogram_good_vs_bad/similarity_histogram_good_vs_bad_dtw_{}.png'.format(image_info)
ax.figure.savefig(dtw_histogram_path, bbox_inches='tight')

In [None]:
# Print the percentiles of the aggregated cross-correlation histograms: bad
# outcomes first, then good outcomes.
for xcorr_histogramC in (xcorr_badHistogramC, xcorr_goodHistogramC):
    xcorr_aggregated = xcorr_histogramC.aggregate_histograms(normalize_row=True)
    print_percentiles(percentiles,
                      precounted_percentiles(xcorr_aggregated, xcorr_histogramC.bins, percentiles))

# Overlay the good and bad aggregated histograms in one figure and save it.
xcorr_plot_collectors = [xcorr_goodHistogramC, xcorr_badHistogramC]
xcorr_plot_histograms = [collector.aggregate_histograms(normalize_row=True)
                         for collector in xcorr_plot_collectors]
xcorr_plot_bins = [collector.bins for collector in xcorr_plot_collectors]
ax = precounted_histograms(xcorr_plot_histograms, xcorr_plot_bins, title.format('xcorr'),
                           xlabel, ylabel, ['xcorr good', 'xcorr bad'])
xcorr_histogram_path = 'saved_output/similarity_histogram_good_vs_bad/similarity_histogram_good_vs_bad_xcorr_{}.png'.format(image_info)
ax.figure.savefig(xcorr_histogram_path, bbox_inches='tight')

# Histograms of a sample good patient and a sample bad patient

In [None]:
# This gets the histograms for a single patient and saves it
# Plot the similarity histogram of two individual patients and save each figure.
# NOTE(review): these calls hard-code 'dtw', 'exp' scaling with parameter 50.0
# and a minimum episode length of 30 instead of using the notebook-level
# settings, and the PNGs are written to the current working directory rather
# than under saved_output/.
a = single_patient_histogram('mgh112', 'dtw', 'exp', 50.0, 30)
a.figure.savefig('mgh112_histogram.png')
b = single_patient_histogram('ynh34', 'dtw', 'exp', 50.0, 30)
b.figure.savefig('ynh34_histogram.png')

# Similarity/Distance vs outcome

## Define the summary measure

In [None]:
# For each patient, we reduce their entire similarities vector to one single
# summary measure. summary_measure is either 'mean' or 'percentile':
# if 'mean', we use the mean of the similarities vector;
# if 'percentile', we use the 'summary_percentile_decile' decile of the similarities vector.
summary_measure = 'mean'
# summary_percentile_decile is an integer between 0 and 10. It is ignored if 'summary_measure' is 'mean'.
# A summary_percentile_decile of 2, for instance, means the 20th percentile.
summary_percentile_decile = 2

In [None]:
# Pick the per-patient summary vectors (one for DTW, one for cross-correlation)
# that the rest of the notebook analyses, plus a tag used in figure file names.
if summary_measure=='mean':
    dtw_x = dtw_meanC.means_list
    xcorr_x = xcorr_meanC.means_list
    summary_tag = 'mean'
elif summary_measure=='percentile':
    # percentiles_list belongs to the percentile collectors; this branch used
    # to read it from the mean collectors.
    dtw_x = dtw_percentileC.percentiles_list[:, summary_percentile_decile]
    xcorr_x = xcorr_percentileC.percentiles_list[:, summary_percentile_decile]
    summary_tag = 'perc{}'.format(summary_percentile_decile)
else:
    # Fail loudly: merely printing let later cells run with undefined or stale
    # dtw_x / xcorr_x / summary_tag.
    raise ValueError('summary_measure must be either "mean" or "percentile"')
outcomes = outcomeC.outcomes

## Bar plots

#### DTW

In [None]:
# Settings for the DTW bar plot; they are also encoded in the saved file name.
nbins = 7
binarize = False
transpose = False
bin_by_percentile = False

p = plot_xbins_vs_percent_outcomes(
    dtw_x, outcomeC.outcomes,
    nbins=nbins, normalize=True, binarize=binarize, transpose=transpose,
    scale_xaxis=True, bin_by_percentile=bin_by_percentile, similarity_fn='dtw')

# The boolean settings are written into the file name as 0/1.
dtw_bar_plot_path = 'saved_output/bar_plots/{}_{}_dtw_{}bins_binarize{}_binperc{}_trans{}.png'.format(
    image_info, summary_tag, nbins, int(binarize), int(bin_by_percentile), int(transpose))
p.figure.savefig(dtw_bar_plot_path, bbox_inches='tight')

#### Cross-correlation

In [None]:
# Settings for the cross-correlation bar plot; they are also encoded in the
# saved file name.
nbins = 4
binarize = False
transpose = False
bin_by_percentile = True

p = plot_xbins_vs_percent_outcomes(
    xcorr_x, outcomeC.outcomes,
    nbins=nbins, normalize=True, binarize=binarize, transpose=transpose,
    scale_xaxis=True, bin_by_percentile=bin_by_percentile, similarity_fn='cross-correlation')

# The boolean settings are written into the file name as 0/1.
xcorr_bar_plot_path = 'saved_output/bar_plots/{}_{}_xcorr_{}bins_binarize{}_binperc{}_trans{}.png'.format(
    image_info, summary_tag, nbins, int(binarize), int(bin_by_percentile), int(transpose))
p.figure.savefig(xcorr_bar_plot_path, bbox_inches='tight')

## Statistical Correlation

In [None]:
# Run statistical_correlation on each summary measure against the outcomes:
# DTW first, then cross-correlation.
# NOTE(review): if statistical_correlation returns its result instead of
# printing it, only the second call's value is displayed by the notebook —
# TODO confirm.
statistical_correlation(dtw_x, outcomes)
statistical_correlation(xcorr_x, outcomes)

## Plot of similarity vs outcome

In [None]:
# Plot the per-patient DTW summary measure against outcome.
# The effect of filter=True is defined by plot_x_vs_outcome, which is not visible here.
plot_x_vs_outcome(dtw_x, outcomes, filter=True)

In [None]:
# Plot the per-patient cross-correlation summary measure against outcome.
# The effect of filter=True is defined by plot_x_vs_outcome, which is not visible here.
plot_x_vs_outcome(xcorr_x, outcomes, filter=True)

## ROC Curves

In [None]:
# The DTW summary is passed to roc as a 'distance' when no scaling is applied
# and as a 'similarity' otherwise.
if scale_fn=='noscale':
    dtw_roc_similarity_type = 'distance'
else:
    dtw_roc_similarity_type = 'similarity'
plot = roc(dtw_x, outcomes, similarity_fn='dtw', similarity_type=dtw_roc_similarity_type)
dtw_roc_path = 'saved_output/roc/roc_{}_{}_dtw.png'.format(image_info, summary_tag)
plot.savefig(dtw_roc_path, bbox_inches='tight')

In [None]:
# ROC curve for the cross-correlation summary; similarity_type is left at
# roc's default.
plot = roc(xcorr_x, outcomes, similarity_fn='cross-correlation')
xcorr_roc_path = 'saved_output/roc/roc_{}_{}_xcorr.png'.format(image_info, summary_tag)
plot.savefig(xcorr_roc_path, bbox_inches='tight')

## For each summary measure available, print the number of bad patients which we could distinguish using the most similar good patient score as a cutoff

In [None]:
# The DTW summary is treated as a 'distance' when no scaling is applied and as
# a 'similarity' otherwise; cross-correlation is always a 'similarity'.
dtw_threshold_type = 'distance' if scale_fn=='noscale' else 'similarity'
threshold_inputs = (
    ('dtw', dtw_meanC, dtw_percentileC, dtw_threshold_type),
    ('xcorr', xcorr_meanC, xcorr_percentileC, 'similarity'),
)
for threshold_label, mean_collector, percentile_collector, threshold_type in threshold_inputs:
    # First the result for the mean summary, then one line per decile (0..10).
    # Single-argument print(...) matches the call form used elsewhere in this
    # notebook and emits the same text as the former print statements.
    mean_result = threshold(mean_collector.means_list, outcomes, threshold_type)
    print('{} {}'.format('{} \n mean - '.format(threshold_label), mean_result))
    for i in range(11):
        cutoff, num_cutoff = threshold(percentile_collector.percentiles_list[:, i], outcomes, threshold_type)
        print('percentile {}, cutoff {}, num bad beyond cutoff {}'.format(i, cutoff, num_cutoff))

# Heatmap of distribution similarity for good, bad patients

In [None]:
# Heatmap built from the cross-correlation good and bad histogram collectors.
# Saving the figure is currently disabled (see the commented-out line).
img = heatmap_distribution_similarity(xcorr_goodHistogramC, xcorr_badHistogramC, figsize=(30, 30))
#img.figure.savefig('saved_output/heatmap_matrix/heatmap_similarity_dists_xcorr_{}.png'.format(image_info))

In [None]:
# Heatmap built from the DTW good and bad histogram collectors.
# Saving the figure is currently disabled (see the commented-out line).
img = heatmap_distribution_similarity(dtw_goodHistogramC, dtw_badHistogramC, figsize=(30, 30))
#img.figure.savefig('saved_output/heatmap_matrix/heatmap_similarity_dists_dtw_{}.png'.format(image_info))

In [None]:
# Compare two patients' DTW distributions and plot the comparison.
# NOTE(review): 'g11' and 'g12' presumably identify the two patients within
# the collectors — TODO confirm against compare_patient_dists.
compare_patient_dists(dtw_goodHistogramC, dtw_badHistogramC, 'g11', 'g12', plot=True)