In [22]:
import sys
sys.path.append('..')

In [23]:
import pandas as pd
import numpy as np
import utils

In [64]:
class PatientInfo:
    def __init__(self,
                merge_info_csv='../../../patient_outcome_info/merge_info.csv',
                exclude_list='../../../patient_outcome_info/exclude_patient_sids.txt',
                outcome_csv='../../../patient_outcome_info/clinical_outcomes.csv'
                ):
        self.merge_info_df = pd.read_csv(merge_info_csv)
        # exclude
        self.exclude_patients = utils.read_list(exclude_list)
        # new id to outcomes
        self.outcomes_df = pd.read_csv(outcome_csv, index_col=0)
    
    def is_excluded(self, new_sid):
        return new_sid in self.exclude_patients
    
    def get_all_sids(self):
        # returns list of all new sids in patient_merge_info (none of these are excluded)
        all_sids = list(set(list(self.merge_info_df['sid'])))
        all_sids.sort(key=utils.parse_patient_name)
        return all_sids
    
    def get_edfs_and_indices(self, new_sid, max_num_hours=72):
        # for a new sid, returns all the edfs and start/end indices of each edf
        # in the merged data record, taking the first occurring timestamp as index 0, and
        # assuming sample rate of 200. 
        #
        # max_num_hours: maximum duration of the merged data record. 
        #
        # Note: edfs with unreliable timestamps are ignored. "First occuring timestamp" is the first
        # reliable timestamp for a patient.
        
        rows = self.merge_info_df[self.merge_info_df['sid']==new_sid]
        max_index = utils.hour_to_samples(max_num_hours) - 1
        edfs_and_indices = []
        if len(rows)==0:
            return edfs_and_indices
        reference_timestamp = self._get_reference_timestamp(rows)
        if reference_timestamp is None:
            return edfs_and_indices
        for i, row in rows.iterrows():
            start_timestamp = row['timestamp']
            nsamples = row['nsamples']
            timestamp_guessed = row['is_timestamp_guessed_from_old_edf_name']
            if not self._is_timestamp_okay(row):
                continue
            start_index = self._convert_timestamp_to_index(reference_timestamp, start_timestamp)
            if start_index >= max_index:
                continue
            end_index = min(start_index + nsamples, max_index)
            edfs_and_indices.append((row['csail_edf_name'], start_index, end_index))
        edfs_and_indices = [x for x in edfs_and_indices if not self._is_edf_redundant(x, edfs_and_indices)]
        return edfs_and_indices
    
    def has_good_outcome(self, patient_sid, vb=False):
        outcome = self.get_outcome(patient_sid, vb)
        if outcome==-1:
            return outcome
        return self.is_good_outcome(outcome)
    
    def is_good_outcome(self, outcome):
        return utils.is_good_outcome(outcome)
    
    def is_bad_outcome(self, outcome):
        return utils.is_bad_outcome(outcome)
    
    def get_patient_clinical_info(self, patient_sid):
        return self.outcomes_df.loc[patient_sid]
    
    def get_outcome(self, patient_sid, vb=False):
        try:
            outcome = self.outcomes_df['bestCpcBy6Mo'][patient_sid]
        except KeyError:
            if vb:
                if patient_sid in self.exclude_sids:
                    print('{} excluded'.format(patient_filename))
                else:
                    print('KeyError on {}'.format(patient_filename))
            return -1
        if np.isnan(outcome):
            if vb:
                if patient_sid in self.exclude_sids:
                    print('{} excluded'.format(patient_filename))
                else:
                    print('Isnan outcome on {}'.format(patient_filename))
            return -1
        return outcome
    
    def _convert_timestamp_to_index(self, reference_timestamp, timestamp):
        # converts timestamp to index relative to reference_timestamp being index 0
        reference_datetime = utils.convert_timestamp_to_datetime(reference_timestamp)
        datetime = utils.convert_timestamp_to_datetime(timestamp)
        timediff_secs = (datetime-reference_datetime).total_seconds()
        timediff_samples = utils.sec_to_samples(timediff_secs)
        return timediff_samples
    
    def _get_reference_timestamp(self, rows):
        for i, row in rows.iterrows():
            if _is_timestamp_okay(row):
                return row['timestamp']
            
        list(rows.iterrows())[0][1]['timestamp']
        
    def _is_timestamp_okay(self, row):
        timestamp_guessed = row['is_timestamp_guessed_from_old_edf_name']
        if timestamp_guessed:
            guessed_timestamp_okay = row['is_guessed_timestamp_correct_seeming']
            assert(not pd.isnull(guessed_timestamp_okay)), 'guessed_timestamp_null {}'.format(row)
            if not guessed_timestamp_okay:
                return False
        return True
    
    def _is_edf_redundant(self, edf_and_index, edfs_and_indices):
        # for edf, returns whether or not its range [start_index-end_index] is contained entirely 
        # inside another edf's range
        edf, start_index, end_index = edf_and_index
        for edf_other, start_index_other, end_index_other in edfs_and_indices:
            if edf==edf_other:
                continue
            if start_index >= start_index_other and end_index<=end_index_other:
                return True
        return False
            

## Check edfs_and_indices

In [56]:
# bi 34 has edfs a year apart, we want second year

In [65]:
patientInfo = PatientInfo()

In [67]:
patientInfo.get_patient_info(sids[0])

site                      bidmc
age                          85
sex                           F
vfib                          0
dateArrest      1/17/2011 14:53
ROSCmin                       7
cpc0                          1
cpc3                        NaN
cpc6                        NaN
bestCpcBy6Mo                  1
Name: bi1, dtype: object

In [27]:
sids = patientInfo.get_all_sids()
i = -1

In [43]:
merge_info_df = patientInfo.merge_info_df

In [33]:
i+=1
while True:
    sid = sids[i]
    rows = merge_info_df[merge_info_df['sid']==sid]
    edfs_and_indices = patientInfo.get_edfs_and_indices(sid)
    edfs_and_indices_df = pd.DataFrame(edfs_and_indices, columns=['edf', 'start_index', 'end_index'])
    if len(rows)==len(edfs_and_indices_df):
        i+=1
    else:
        if edfs_and_indices[-1][-1]==51839999.0:
            i+=1
        else:
            break

IndexError: list index out of range

In [58]:
sid = 'bwh51'
rows = merge_info_df[merge_info_df['sid']==sid]
edfs_and_indices = patientInfo.get_edfs_and_indices(sid)
edfs_and_indices_df = pd.DataFrame(edfs_and_indices, columns=['edf', 'start_index', 'end_index'])

timestamp_guessed sid                                                            bwh51
timestamp                                            20130221_122730
end_timestamp                                        20130222_022050
csail_edf_name                            bwh_51_1_0_20130221T122730
duration_(hours)                                             13.8889
nsamples                                                    10000000
time_to_next_edf_(hours)                                           0
is_timestamp_guessed_from_old_edf_name                          True
is_guessed_timestamp_correct_seeming                           False
Name: 1263, dtype: object
timestamp okay False
timestamp not okay sid                                                            bwh51
timestamp                                            20130221_122730
end_timestamp                                        20130222_022050
csail_edf_name                            bwh_51_1_0_20130221T122730
duration_(hours)   

In [59]:
rows

Unnamed: 0,sid,timestamp,end_timestamp,csail_edf_name,duration_(hours),nsamples,time_to_next_edf_(hours),is_timestamp_guessed_from_old_edf_name,is_guessed_timestamp_correct_seeming
1263,bwh51,20130221_122730,20130222_022050,bwh_51_1_0_20130221T122730,13.888889,10000000,0,True,False
1264,bwh51,20130222_022050,20130222_132155,bwh_51_1_1_20130222T022050,11.018056,7933000,6359.870278,True,False
1265,bwh51,20131114_131408,20131114_183928,bwh_51_1_0_20131114T131408,5.422222,3904000,0.003055556,False,
1266,bwh51,20131114_183939,20131115_000459,bwh_51_1_1_20131114T183939,5.422222,3904000,0.003055556,False,
1267,bwh51,20131115_000510,20131115_053030,bwh_51_1_2_20131115T000510,5.422222,3904000,0.003333333,False,
1268,bwh51,20131115_053042,20131115_075546,bwh_51_1_3_20131115T053042,2.417778,1740800,3.028333333,False,
1269,bwh51,20131115_105728,20131115_162248,bwh_51_2_0_20131115T105728,5.422222,3904000,0.003055556,False,
1270,bwh51,20131115_162259,20131115_214819,bwh_51_2_1_20131115T162259,5.422222,3904000,0.003055556,False,
1271,bwh51,20131115_214830,20131115_224228,bwh_51_2_2_20131115T214830,0.899556,647680,-,False,


In [60]:
edfs_and_indices_df

Unnamed: 0,edf,start_index,end_index
