# Create test data set

## Import libraries

In [None]:
import os
import copy
import pickle
import datetime
import pandas as pd
import editdistance
from tqdm import tqdm
from Bio import AlignIO
import scipy.cluster.hierarchy as shc


## General utilities

In [None]:
# Suffix appended to the names of the pickles saved in later cells.
file_suffix = '_test'
# Directory prefix (relative to the working directory) used by the pickle helpers.
output_path = "output/"


In [None]:
# Remove alignment artifacts from a previous run (sequence_to_fasta does not
# overwrite FASTA files that already exist).
# NOTE(review): this targets ../scripts/output/stays-test/ while later cells
# write to output/stays-test/ relative to the working directory - confirm both
# resolve to the same folder.
!rm -r ../scripts/output/stays-test/

In [None]:
def save_as_pickle(data, file_name, path=output_path):
    """Serialise ``data`` with pickle into the file ``path + file_name``.

    Args:
        data: Any picklable object.
        file_name: Name of the target file; appended to ``path`` verbatim.
        path: Directory prefix (including its trailing separator).

    The file is opened in binary write mode, so an existing file is replaced.
    """
    # The context manager closes the handle even when pickle.dump raises.
    with open(path + file_name, 'wb') as file:
        pickle.dump(data, file)


def get_pickle(file_name, path=output_path):
    """Load and return the object pickled in the file ``path + file_name``.

    Args:
        file_name: Name of the pickle file; appended to ``path`` verbatim.
        path: Directory prefix (including its trailing separator).

    Returns:
        The unpickled object.

    Raises:
        FileNotFoundError: If the file does not exist.
    """
    # Binary *read* mode: the previous append mode ('ab') is not readable, so
    # pickle.load could never succeed (and it created missing files).
    # NOTE: unpickling can execute arbitrary code - only load trusted files.
    with open(path + file_name, 'rb') as file:
        return pickle.load(file)


## Create test set from sequences

In [None]:
# Synthetic event sequences: every character is one event and every string is
# turned into one stay in the next cell.
# Alternative sets kept for reference:
#   ["aaaaaaaaab", "aaaaaaaab", "aaaaaaab", "aaaaaab", "aaaaab"]
#   ["bbb", "aba", "abaa", "abbaa"]
seqs_1 = [
    # long runs of "a"
    "aaaaaaaaa",
    "aaaaaaaya",
    # short a/b mixes
    "abb",
    "bbb",
    "aba",
    "abaa",
    # variations around "dbbdd"
    "dbbddd",
    "dbbdd",
    "eedbbdd",
    "dbbddgh",
    # long runs of "c"
    "ccccccccc",
    "cccccccgc",
]




In [None]:
# Expand every sequence into one row per character. Sequence number `idx`
# becomes subject 10201 + idx / admission 20201 + idx, transfer ids start at
# 100 * idx, and the events of a sequence are spaced one second apart starting
# at `date_time`.
test_data = pd.DataFrame()
date_time = datetime.datetime(2022, 1, 30, 18, 31)

for idx, seq in enumerate(seqs_1):
    event_times = [date_time + datetime.timedelta(seconds=offset)
                   for offset in range(len(seq))]
    data_seq = pd.DataFrame({
        'eventtype': list(seq),
        'subject_id': 10201 + idx,
        'hadm_id': 20201 + idx,
        'transfer_id': list(range(100 * idx, 100 * idx + len(seq))),
        'careunit': 'Emergency Department',
        'intime': event_times,
        'outtime': event_times,
        'charttime': event_times,
    }).set_index('transfer_id')
    test_data = pd.concat([test_data, data_seq])
    

    


In [None]:
# Display the assembled frame for a visual check.
test_data

In [None]:
# Event label: the event type prefixed with "transfer: ".
test_data['event'] = 'transfer: ' + test_data['eventtype']


In [None]:
# Empty frame that only carries the full column set; merging against it below
# gives the test data the same columns.
all_data_types = pd.DataFrame(
    columns=[
        # identifiers
        'subject_id', 'hadm_id', 'transfer_id',
        # transfer information
        'eventtype', 'careunit', 'intime', 'outtime',
        # charted event
        'charttime', 'event', 'value', 'valuenum', 'valueuom',
        # measurement description
        'label', 'category', 'param_type', 'value_categorical',
    ]
)
all_data_types


In [None]:
# Left-merge against the empty all_data_types frame: all test rows are kept and
# the columns that only exist in all_data_types are added without values.
# NOTE(review): transfer_id was moved to the index of test_data above, so it is
# presumably not used as a join key and comes back empty from the right-hand
# frame - confirm this is intended.
data = test_data.merge(all_data_types, how='left')
data.head()

In [None]:
# The single-character event type is used directly as the encoded event.
data['event_encoded'] = data['eventtype']
# Number of distinct encoded events.
len(data.event_encoded.unique())


In [None]:
# Preview the first rows after adding event_encoded.
data.head()

In [None]:
# Order all events by chart time. Events of one stay have distinct timestamps
# (one second apart), so their relative order within the stay is kept.
data = data.sort_values(by=['charttime'])
data.head()


In [None]:
# Number the events 0..n-1 in their current (chart-time) order.
# event_id deliberately stays a regular column: later cells select it by name
# (e.g. when building distance_data). The former `data.set_index('event_id')`
# call discarded its result and therefore had no effect, so it was removed.
data.insert(0, 'event_id', range(len(data)))
data.head()


In [None]:
# Persist the full event table both as a pickle (under output_path) and as CSV.
save_as_pickle(data, 'data_test')
# NOTE(review): the CSV path is hard-coded; it matches output_path only while
# output_path is "output/".
data.to_csv("output/data_test.csv")


In [None]:
# Keep only the columns used by the distance computation: event id, stay id
# and the encoded event.
distance_data = data[['event_id', 'hadm_id', 'event_encoded']]
save_as_pickle(distance_data, 'distance_test_data')
# NOTE(review): hard-coded path; equals output_path only while it is "output/".
distance_data.to_csv('output/distance_test_data.csv')
distance_data.head()


## Calculate distance matrix

In [None]:
# Settings for the distance-matrix, clustering and alignment steps.
# output_path and file_suffix repeat the values set at the top of the notebook.
output_path = "output/"
output_folder = "stays-test/"
alignments_output = output_folder + "alignments/"
file_suffix = '_test'
number_of_stays = 'test'  # tag used in the names of the saved files
start_index = 0
dist_data = distance_data


In [None]:
# Distinct stay ids in order of first appearance; the distance matrix, the
# sequences list and the alignment bookkeeping are all indexed in this order.
stays = list(dist_data['hadm_id'].unique())

In [None]:
# Store the event table again under the name 'events' + file_suffix.
save_as_pickle(data, 'events' + file_suffix)


In [None]:
def compute_distance_matrix():
    """Build the pairwise normalised edit-distance matrix between all stays.

    Reads the module-level ``dist_data`` and ``stays``. The ``event_encoded``
    values of each stay, in row order, form one sequence. The distance between
    two sequences is ``editdistance.eval`` of the pair divided by the length of
    the longer sequence; two empty sequences get distance 0.0.

    Returns:
        list[list[float]]: Symmetric ``len(stays) x len(stays)`` matrix whose
        rows and columns follow the order of ``stays``.
    """
    sequences = [
        dist_data[dist_data['hadm_id'] == hadm_id]['event_encoded'].tolist()
        for hadm_id in stays
    ]

    print("[INFO] Data Loaded ")

    length = len(sequences)
    outputMatrix = [[0] * length for _ in range(length)]

    # Only the upper triangle (diagonal included) is computed and mirrored:
    # n * (n + 1) / 2 comparisons, which is also the exact progress total.
    with tqdm(total=length * (length + 1) // 2) as pbar:
        for idxA in range(length):
            for idxB in range(idxA, length):
                max_length = max(len(sequences[idxA]), len(sequences[idxB]))
                if max_length:
                    distance = editdistance.eval(
                        sequences[idxA], sequences[idxB]) / max_length
                else:
                    # Both sequences are empty: identical, and nothing to
                    # normalise by (avoids ZeroDivisionError).
                    distance = 0.0
                outputMatrix[idxA][idxB] = distance
                outputMatrix[idxB][idxA] = distance
                # tqdm throttles redraws itself, so a per-pair update is fine.
                pbar.update(1)

    return outputMatrix


In [None]:
# Compute the matrix once and store it under two file names.
# NOTE(review): presumably different consumers expect different names - confirm
# whether both copies are still needed.
dist_matrix = compute_distance_matrix()
save_as_pickle(dist_matrix, 'distance_matrix_' + str(number_of_stays))
save_as_pickle(dist_matrix, 'dist_matrix_' + str(number_of_stays))

In [None]:
# Manual spot checks of the normalised edit distance (edit distance divided by
# the length of the longer string); only the last one is active.
# print(editdistance.eval('bbb', 'aba')/3)
# print(editdistance.eval('bbb', 'abaa')/4)
# print(editdistance.eval('bbb', 'abbaa')/5)
# print(editdistance.eval('aba', 'abaa')/4)
# print(editdistance.eval('aba', 'abbaa')/5)
print(editdistance.eval('abaa', 'abbaa')/5)


In [None]:
# Render the distance matrix as a table for inspection.
pd.DataFrame(dist_matrix)

## Calculate hierarchical clustering

In [None]:
def get_sequence_distance_matrix(u, v):
    """Return the precomputed distance between the stays identified by u and v.

    The first component of each argument is looked up in the module-level
    ``stays`` list; the two positions select the entry of ``dist_matrix``.
    Raises ``ValueError`` (from ``list.index``) when an id is not in ``stays``.
    """
    row = stays.index(u[0])
    column = stays.index(v[0])
    return dist_matrix[row][column]

In [None]:
# One row per stay (the first row of each hadm_id is kept).
clust_data = data.drop_duplicates(subset=['hadm_id'])

# Drop the listed columns so that the stay id is what is handed to linkage().
# NOTE(review): assumes `data` has no columns besides hadm_id and the ones
# listed here - confirm.
clust_data = clust_data.drop(columns=['event_id', 'subject_id', 'transfer_id', 'eventtype',
                                      'careunit', 'intime', 'outtime', 'charttime', 'event',
                                      'value', 'valuenum', 'valueuom',
                                      'label', 'category', 'param_type',
                                      'value_categorical',
                                      'event_encoded'])

# Hierarchical clustering with a callable metric that looks distances up in
# dist_matrix; the linkage method is left at SciPy's default.
# links = shc.linkage(clust_data, metric=get_sequence_distance_list)
links = shc.linkage(clust_data, metric=get_sequence_distance_matrix)
# Draw the dendrogram, labelling the leaves with the stay ids.
dend = shc.dendrogram(links, labels=stays, leaf_rotation=-90)


## Calculate alignment

In [None]:
def sequence_to_fasta(sequences: list, sequence_ids, id, path=output_path, folder=output_folder):
    """Write sequences to ``<path><folder>sequences-<id>.fa`` in FASTA layout.

    Each record is a ``>`` header line holding the sequence id followed by one
    line holding the sequence.

    Args:
        sequences: Sequences to write.
        sequence_ids: Ids used for the header lines, parallel to ``sequences``.
        id: Tag inserted into the file name. It shadows the builtin ``id`` but
            is kept because callers pass it by keyword.
        path: Base output directory (with trailing separator).
        folder: Sub-folder below ``path`` (with trailing separator).

    The target directory is created when missing. An already existing file is
    left untouched, so stale files from an earlier run are reused.
    """
    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    os.makedirs(path + folder, exist_ok=True)

    file_name = f"{path + folder}sequences-{id}.fa"

    if os.path.exists(file_name):
        return

    # The context manager closes the file even if a write fails.
    with open(file_name, 'w') as file:
        for sequence_id, sequence in zip(sequence_ids, sequences):
            file.write(f">{sequence_id}\n{sequence}\n")


In [None]:
# One string per stay (same order as `stays`): the stay's encoded events
# concatenated in row order.
sequences = [
    ''.join(data.loc[data['hadm_id'] == stay, 'event_encoded'])
    for stay in stays
]


In [None]:
# List the stay ids, one per line.
for stay in stays:
    print(stay)

In [None]:
def sort_by_indexes(list_data, indexes, reverse=False):
    """Return the items of ``list_data`` ordered by their matching ``indexes``.

    Items are paired positionally with ``indexes`` (surplus entries of the
    longer input are ignored) and sorted on the index only, so items sharing
    an index keep their original relative order.
    """
    keyed = list(zip(indexes, list_data))
    keyed.sort(key=lambda pair: pair[0], reverse=reverse)
    return [item for _key, item in keyed]


def get_clusters_by_level(level, links):
    """Cut the linkage ``links`` at distance ``level``.

    Returns a list with one flat-cluster label per observation, as produced by
    ``fcluster`` with the "distance" criterion.
    """
    flat_labels = shc.fcluster(links, level, criterion="distance")
    return list(flat_labels)


def get_aggregated_sequence(al_seq):
    """Collapse an alignment column by column into one aggregated sequence.

    Args:
        al_seq: Iterable of records whose ``seq`` attribute is the aligned
            sequence; all sequences are read position by position (``zip``
            stops at the shortest one).

    Returns:
        list: One entry per alignment column. A column in which every record
        has the same symbol yields that symbol; otherwise it yields the list of
        distinct symbols in order of first appearance (top to bottom).
    """
    columns = zip(*[sequence.seq for sequence in al_seq])

    aggregated = []
    for column in columns:
        # dict.fromkeys de-duplicates while keeping first-seen order; the
        # former list(set(...)) order changed between interpreter runs.
        distinct = list(dict.fromkeys(column))
        # Only keep a list when the column really holds several symbols.
        aggregated.append(distinct[0] if len(distinct) == 1 else distinct)

    return aggregated


In [None]:
# indices = [dend['ivl'].index(i) for i in stays]


In [None]:
def cluster_events(level, stays, links):
    """Align the sequences of every cluster obtained by cutting at ``level``.

    The dendrogram ``links`` is cut at distance ``level``. For every resulting
    cluster one of four things happens:

    * a single stay: nothing to do;
    * several stays, none aligned on the previous level: a fresh MAFFT
      alignment is computed;
    * all stays already share one alignment from the previous level: that
      alignment label is carried over;
    * otherwise: the existing alignments and the still unaligned sequences are
      merged with ``mafft --merge``.

    Args:
        level: Cut height; must be an element of the module-level
            ``branch_depths`` (``list.index`` raises ``ValueError`` otherwise).
        stays: Stay ids, in the order used by ``links`` and ``sequences``.
        links: Linkage matrix handed to ``get_clusters_by_level``.

    Besides its arguments the function reads the module-level names
    ``branch_depths``, ``sequences``, ``output_path``, ``output_folder``,
    ``alignments_output`` and ``number_of_stays``, and it writes the row of
    ``sequence_alignments`` that belongs to ``level``. It also writes FASTA,
    alignment and pickle files and the scratch files ``subMSAtable`` and
    ``inputFile`` in the working directory. Returns ``None``.

    The ``!$command`` lines are IPython shell escapes, so this function only
    runs inside IPython/Jupyter.
    """
    clusters = get_clusters_by_level(level, links)
    unique_levels = list(set(clusters))
    print(f"clust: {clusters}, ul: {unique_levels}")

    # Keep the requested cut height: the loop below reuses the name `level`
    # for the cluster label.
    cluster_level = copy.deepcopy(level)

    for count, level in enumerate(unique_levels):
        # Positions (into `stays`) of the members of this cluster.
        cluster = [i for i, x in enumerate(clusters) if x == level]
        print(f"clust: {cluster}")
        # Row of sequence_alignments belonging to this cut height.
        bidx = branch_depths.index(cluster_level)

        sequence_ids = [int(stays[i]) for i in cluster]
        print(f"sids: {sequence_ids}")

        # Distinct alignment labels the members had on the previous level
        # (-1 means "not aligned yet"); left empty for cut heights <= 0.
        alignment_levels = []
        if (cluster_level > 0):
            alignment_levels = list(dict.fromkeys([
                sequence_alignments[bidx - 1][stays.index(s)] for s in sequence_ids]))

        if len(sequence_ids) == 1:
            # base case: sequence need not to be merged
            # print("[INFO] base case, no alignment")
            pass

        elif (len(alignment_levels) == 1 and alignment_levels[0] == -1) or (len(alignment_levels) == 0 and cluster_level == 0):
            # case sequences need to be merged, no prior alignment
            print("[INFO] sequences need to be aligned, no alignment present")

            sequence_to_fasta(sequences=[sequences[stays.index(s)] for s in sequence_ids],
                              sequence_ids=sequence_ids, id=f"{cluster_level}-{count}")

            sequences_file_path = f"{output_path + output_folder}sequences-{cluster_level}-{count}.fa"
            base_alignment_file_path = f"{output_path + output_folder}alignment-{cluster_level}-{count}.fasta"

            # Run MAFFT on the FASTA file and redirect its output to the
            # alignment file (IPython shell escape).
            mafft_align = f"/usr/local/bin/mafft --text --reorder --maxiterate 0 --retree 1 --6merpair --quiet --thread 4 {sequences_file_path} > {base_alignment_file_path}"
            !$mafft_align
            # os.system(mafft_align)

            alignment = AlignIO.read(base_alignment_file_path, "fasta")
            aggregated_sequence = get_aggregated_sequence(alignment)

            # Everything about this alignment, stored as one pickle.
            cluster_alignment = {
                'file': base_alignment_file_path,
                'stays': sequence_ids,
                'sequence': aggregated_sequence,
                'alignment': [{'hadm_id': aligned_seq.id, "sequence": [
                    event for event in aligned_seq.seq]} for aligned_seq in alignment]
            }

            save_as_pickle(
                cluster_alignment, f"alignment-info-{number_of_stays}-level-{cluster_level}-count-{count}.p", path=output_path + alignments_output)

            # Record the "<level>-<count>" label of the new alignment for
            # every member of the cluster.
            for s in sequence_ids:
                sidx = stays.index(s)
                sequence_alignments[bidx][sidx] = f"{cluster_level}-{count}"

        elif len(alignment_levels) == 1 and alignment_levels[0] != -1:
            # case sequences have already been merged, no action needed
            print("[INFO] have been merged, no action needed")
            # Carry the label of the previous level over unchanged.
            for s in sequence_ids:
                sidx = stays.index(s)
                sequence_alignments[bidx][sidx] = sequence_alignments[bidx - 1][sidx]

        else:
            # merging and or alignment needs to happen
            # print("[INFO] merge needed and optional alignment")
            # NOTE(review): this branch reads sequence_alignments[bidx - 1];
            # for bidx == 0 that would be the last row. Presumably it is never
            # reached at the lowest level - confirm.

            # Members without an alignment on the previous level go into a
            # plain FASTA file.
            not_aligned_sequences = [
                s for s in sequence_ids if sequence_alignments[bidx - 1][stays.index(s)] == -1]

            sequence_to_fasta(sequences=[sequences[stays.index(s)] for s in not_aligned_sequences],
                              sequence_ids=not_aligned_sequences, id=f"{cluster_level}-{count}")

            sequences_file_path = f"{output_path + output_folder}sequences-{cluster_level}-{count}.fa"
            base_alignment_file_path = f"{output_path + output_folder}alignment-{cluster_level}-{count}.fasta"

            # Get alignment files of previous merged
            aligned_sequences = [
                s for s in sequence_ids if sequence_alignments[bidx - 1][stays.index(s)] != -1]

            print(
                f"[INFO] merge needed ({len(aligned_sequences)}) and optional alignment ({len(not_aligned_sequences)})")

            aligned_files = []

            # Rebuild the alignment file name from the stored "<level>-<count>"
            # label.
            # NOTE(review): split('-') assumes the level part contains no '-'
            # (a level rendered in exponent notation such as 1e-05 would
            # break this) - confirm.
            for a in aligned_sequences:
                file_details = sequence_alignments[bidx -
                                                   1][stays.index(a)].split('-')
                aligned_files.append(
                    f"{output_path + output_folder}alignment-{file_details[0]}-{file_details[1]}.fasta")

            aligned_files = list(dict.fromkeys(
                aligned_files))  # Remove duplicates
            # The merge table covers only the existing alignments; the MAFFT
            # input additionally contains the unaligned sequences file.
            table_files = " ".join(aligned_files)
            aligned_files.append(sequences_file_path)
            input_files = " ".join(aligned_files)

            # Create merge table for MAFFT
            merge_table = f"/usr/bin/ruby makemergetable.rb {table_files} > subMSAtable"
            # os.system(merge_table)
            !$merge_table

            # Create input file
            input_command = f"cat {input_files} > inputFile"
            !$input_command

            # os.system(input_command)

            mafft_merge = f"/usr/local/bin/mafft --merge subMSAtable --text --reorder --maxiterate 0 --retree 1 --6merpair --quiet --thread 4 inputFile > {base_alignment_file_path}"
            # os.system(mafft_merge)
            !$mafft_merge

            alignment = AlignIO.read(base_alignment_file_path, "fasta")
            aggregated_sequence = get_aggregated_sequence(alignment)

            # Same record layout as for a fresh alignment.
            cluster_alignment = {
                'file': base_alignment_file_path,
                'stays': sequence_ids,
                'sequence': aggregated_sequence,
                'alignment': [{'hadm_id': aligned_seq.id, "sequence": [
                    event for event in aligned_seq.seq]} for aligned_seq in alignment]
            }

            save_as_pickle(
                cluster_alignment, f"alignment-info-{number_of_stays}-level-{cluster_level}-count-{count}.p", path=output_path + alignments_output)

            # All members now belong to the merged alignment.
            for s in sequence_ids:
                sidx = stays.index(s)
                sequence_alignments[bidx][sidx] = f"{cluster_level}-{count}"


In [None]:
# Alignment levels: the sentinel -1 plus the second y-coordinate of every
# dendrogram link, de-duplicated and in ascending order.
branch_depths = sorted({-1, *(coords[1] for coords in dend['dcoord'])})

# Folder that receives the alignment-info pickles.
os.makedirs(output_path + alignments_output, exist_ok=True)

# One row per level and one column per stay; -1 marks "not aligned yet".
sequence_alignments = [[-1] * len(stays) for _ in branch_depths]

# Process the levels from lowest to highest so that each level can build on
# the alignments recorded for the previous one.
for index, branch_depth in enumerate(tqdm(branch_depths)):
    print(f"Aligning level {index, branch_depth}")
    cluster_events(branch_depth, stays, links)

# save_as_pickle(sequence_alignments, 'alignments_' + str(number_of_stays))


In [None]:
# Build the label records describing what every encoded event stands for.
labels = []

transfer_label_data = data[['eventtype', 'careunit', 'event_encoded']]
icu_label_data = data[['label', 'value', 'valuenum', 'valueuom', 'event_encoded']]

# One "Transfer" record per distinct event type.
# NOTE(review): 'care_unit' is filled with the event type and the nested
# 'event_type' entries with the care unit - looks swapped, confirm with the
# consumer of this file before changing it.
for event_type in tqdm(transfer_label_data['eventtype'].unique()):
    # values = data[data['eventtype'] == event_type].drop_duplicates()
    matches = transfer_label_data['eventtype'] == event_type
    values = transfer_label_data[matches].drop_duplicates()
    if values.empty:
        continue

    label = {
        'type': 'Transfer',
        'care_unit': event_type,
        'value_enc': values['event_encoded'].iloc[0],
        'values': [{'event_type': care_unit} for care_unit in values['careunit']],
    }
    labels.append(label)

# One "ICU measurement" record per distinct measurement label; a missing unit
# is stored as -1.
for label in tqdm(icu_label_data['label'].unique()):
    values = icu_label_data[icu_label_data['label'] == label].drop_duplicates()
    if values.empty:
        continue

    first_unit = values['valueuom'].iloc[0]
    label = {
        'type': 'ICU measurement',
        'measurement': label,
        'value_enc': values['event_encoded'].iloc[0],
        'unit': -1 if pd.isnull(first_unit) else first_unit,
        'values': [{'value': measured} for measured in values['value']],
    }
    labels.append(label)


In [None]:
# Persist the clustering and alignment results under output_path.
save_as_pickle(links, 'links' + file_suffix)
# With file_suffix '_test' and number_of_stays 'test' this is the same file
# name ('dist_matrix_test') that was already written after computing the matrix.
save_as_pickle(dist_matrix, 'dist_matrix' + file_suffix) 
save_as_pickle(stays, 'stays' + file_suffix)
save_as_pickle(sequence_alignments, 'alignments_' + str(number_of_stays))
save_as_pickle(labels, 'labels' + file_suffix)


In [None]:
# Alignment label of every stay at the last (highest) branch depth.
sequence_alignments[-1]