# Clustering validation

## General utilities

In [None]:
import io
import json
import pickle
import matplotlib
import math
import numpy as np
import pandas as pd
import scipy.cluster.hierarchy as shc
from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
from tqdm import tqdm
import os
import copy


In [None]:
# Notebook-wide settings.
# NOTE(review): start_index, stay_length, display_matrix and run_test_data are
# not referenced by any cell visible in this notebook export - confirm whether
# they are still needed.
start_index = 0
# Used further down to build the suffix of the pickle file names that are loaded
number_of_stays = 'test'
stay_length = 20
display_matrix = True
# Default directory prefix of the pickle/FASTA helpers. They concatenate it with
# the file name as plain strings, so the trailing slash is required.
output_path = "../scripts/output/"
run_test_data = False


In [None]:
def save_as_pickle(data, file_name, path=output_path):
    """Pickle ``data`` into the file ``path + file_name``.

    Args:
        data: Any picklable object.
        file_name: Name of the target file; appended to ``path`` verbatim.
        path: Directory prefix. It is joined by plain string concatenation,
            so it must end with a path separator. Defaults to ``output_path``.
    """
    # The context manager closes the handle even when pickling raises
    with open(path + file_name, 'wb') as file:
        pickle.dump(data, file)


def get_pickle(file_name, path=output_path):
    """Load and return the object pickled in the file ``path + file_name``.

    Args:
        file_name: Name of the pickle file; appended to ``path`` verbatim.
        path: Directory prefix. It is joined by plain string concatenation,
            so it must end with a path separator. Defaults to ``output_path``.

    Returns:
        The unpickled object.
    """
    # NOTE(review): unpickling can execute arbitrary code - only load files
    # that were produced by this project.
    # The context manager closes the handle instead of leaking it.
    with open(path + file_name, 'rb') as file:
        return pickle.load(file)


def sequence_to_fasta(sequences: list, file_name, path=output_path):
    """Write ``sequences`` to ``path + file_name + '.fa'`` in FASTA layout.

    Each sequence becomes a header line ``>sequence_<i>`` (``i`` is its
    zero-based position) followed by the sequence itself on the next line.

    Args:
        sequences: Sequences to write, in output order.
        file_name: Target file name without extension; ``.fa`` is appended.
        path: Directory prefix. It is joined by plain string concatenation,
            so it must end with a path separator. Defaults to ``output_path``.
    """
    # The context manager closes the handle even when a write fails
    with open(path + file_name + '.fa', 'w') as file:
        for i, sequence in enumerate(sequences):
            file.write(f">sequence_{i}\n{sequence}\n")


def get_sequence_distance_list(u, v):
    """Return the stored distance between two observations (offset layout).

    The first element of each observation is located in the global ``stays``
    list. The global ``dist_matrix`` is then indexed as
    ``dist_matrix[smaller_position][larger_position - smaller_position]``.
    NOTE(review): this presumably expects each row to store only the
    distances from a stay to itself and to the stays after it - confirm
    against the code that writes the distance data.

    Args:
        u: Observation whose first element is an entry of ``stays``.
        v: Observation whose first element is an entry of ``stays``.
    """
    low, high = sorted((stays.index(u[0]), stays.index(v[0])))
    return dist_matrix[low][high - low]


def get_sequence_distance_matrix(u, v):
    """Return the stored distance between two observations (square layout).

    The first element of each observation is located in the global ``stays``
    list, and the two positions index the global ``dist_matrix`` as
    ``dist_matrix[position_of_u][position_of_v]``.

    Args:
        u: Observation whose first element is an entry of ``stays``.
        v: Observation whose first element is an entry of ``stays``.
    """
    row = stays.index(u[0])
    column = stays.index(v[0])
    return dist_matrix[row][column]


## Recalculate the data as a cross-check

In [None]:
print("[INFO] Loading data")

# Both inputs are unpickled from the default output_path directory.
# NOTE(review): later cells call drop_duplicates()/drop() on `data` and
# ['hadm_id'].unique() on `dist_data`, so both are presumably pandas
# DataFrames - confirm.
data = get_pickle('data_test')
dist_data = get_pickle("distance_test_data")


In [None]:
# Suffix shared by the pickle files that are loaded further down
file_suffix = f"_{number_of_stays}"

print(f"[INFO] Number of sequences: {number_of_stays}")

print("[INFO] Using complete dist data")
dist_matrix = get_pickle("distance_matrix_test")

# Distinct stay ids in order of first appearance; the distance look-ups
# translate an id into its position in this list
stays = list(dist_data['hadm_id'].unique())

print("[INFO] Data loaded")

# One row per stay, with the listed event-level columns removed
clust_data = data.drop_duplicates(subset=['hadm_id']).drop(columns=[
    'event_id', 'subject_id', 'transfer_id', 'eventtype',
    'careunit', 'intime', 'outtime', 'charttime', 'event',
    'value', 'valuenum', 'valueuom',
    'label', 'category', 'param_type',
    'value_categorical',
    'event_encoded',
])

# links = shc.linkage(clust_data, metric=get_sequence_distance_list)

### Hierarchical clustering

In [None]:
# Build the linkage over clust_data; pairwise distances come from the
# precomputed dist_matrix via the callable metric (no linkage method is given,
# so scipy's default applies)
links = shc.linkage(clust_data, metric=get_sequence_distance_matrix)
# Draw the tree with the stay ids as leaf labels, rotated for readability
dend = shc.dendrogram(links, labels=stays, leaf_rotation=-90)

In [None]:
print("Dend info")
# dend['ivl'] is the list of leaf labels in the order the dendrogram draws
# them; printed next to the original stays list to compare the two orderings
print(dend['ivl'])
print(stays)


## Calculate clusters from dendrogram

In [None]:
# Load the pickled artifacts whose file names end in file_suffix. This rebinds
# links, stays and dend, replacing the values computed in the earlier cells.
print('[INFO] Loading links')
links = get_pickle('links' + file_suffix)
print('[INFO] Loading clusters')
# NOTE(review): `clusters` is read from the 'alignments' pickle - confirm that
# this is the intended file.
clusters = get_pickle('alignments' + file_suffix)
print('[INFO] Loading stays')
stays = get_pickle('stays' + file_suffix)
print('[INFO] Loading events')
events = get_pickle('events' + file_suffix)
print('[INFO] Data loaded')
# Rebuild the dendrogram from the loaded linkage, labelled with the loaded stays
dend = shc.dendrogram(links, labels=stays, leaf_rotation=-90)


In [None]:
print("Dend info")
# Leaf labels in dendrogram order versus the stays list, now for the
# dendrogram rebuilt from the pickled linkage
print(dend['ivl'])
print(stays)


In [None]:
def get_all_levels(dend):
    """Return the distinct link heights of a dendrogram in ascending order.

    The height of a link is taken from the second y-coordinate of its entry
    in ``dend['dcoord']``. The list always starts with an extra ``-1`` entry.
    NOTE(review): the ``-1`` presumably stands for a cut below every merge -
    confirm with the consumers of this list.

    Args:
        dend: Mapping with a ``'dcoord'`` entry, as returned by
            ``scipy.cluster.hierarchy.dendrogram``.

    Returns:
        A sorted list: ``-1`` followed by the unique heights.
    """
    unique_heights = {link[1] for link in dend['dcoord']}
    return sorted([-1, *unique_heights])


def sort_by_indexes(list_data, indexes, reverse=False):
    """Return the items of ``list_data`` ordered by their paired sort keys.

    ``indexes[k]`` is the sort key of ``list_data[k]``. Only the keys are
    compared, so the items themselves need not be orderable, and items with
    equal keys keep their original relative order.

    Args:
        list_data: Items to reorder.
        indexes: Sort keys, paired with ``list_data`` position by position.
        reverse: Sort by descending key when true.

    Returns:
        A new list with the reordered items.
    """
    keyed_items = sorted(
        zip(indexes, list_data),
        key=lambda pair: pair[0],
        reverse=reverse,
    )
    return [item for _, item in keyed_items]


In [None]:
# Sorted cut levels of the current dendrogram (a leading -1 followed by the
# distinct link heights)
levels = get_all_levels(dend)
print(levels)

In [None]:
# Position of every stay within the dendrogram's list of leaf labels
indices = list(map(dend['ivl'].index, stays))
print(indices)

In [None]:
# Build one string per stay by concatenating the 'event_encoded' values of
# that stay's rows in `events`, in row order
sequences = []
for stay in stays:
    # Rows of `events` that belong to the current stay
    e = events[events['hadm_id'] == stay]
    sequences.append(''.join(list(e['event_encoded'])))
print(sequences)


In [None]:

def get_clusters_by_level(level):
    """Return the flat cluster labels obtained by cutting the tree at ``level``.

    The global ``links`` linkage is cut with scipy's ``fcluster`` using the
    ``"distance"`` criterion and ``level`` as the threshold.

    Args:
        level: Distance threshold passed to ``fcluster`` as ``t``.

    Returns:
        A list holding one cluster label per observation.
    """
    flat_labels = shc.fcluster(links, t=level, criterion="distance")
    return list(flat_labels)


In [None]:
def get_available_levels():
    """Collect the flat clustering for every cut level of the dendrogram.

    Reads the globals ``dend`` and ``stays`` and prints both label lists as a
    debugging aid before building the result.

    Returns:
        A dict with two entries:
        ``'sequences'`` - the dendrogram leaf labels converted to ``int``;
        ``'levels'`` - one ``{'level': ..., 'cluster': [...]}`` dict per value
        returned by ``get_all_levels(dend)``, where ``'cluster'`` holds the
        labels from ``get_clusters_by_level`` converted to ``int``.
    """
    print(f"stays: {dend['ivl']}")
    print(f"og: {stays}")

    leaf_ids = [int(label) for label in dend['ivl']]
    level_entries = [
        {
            'level': depth,
            'cluster': [int(label) for label in get_clusters_by_level(depth)],
        }
        for depth in get_all_levels(dend)
    ]
    return {'sequences': leaf_ids, 'levels': level_entries}


In [None]:
# Flat cluster label of every observation when the tree is cut at this
# hard-coded distance.
# NOTE(review): 0.1111111111111111 looks like a value copied from the printed
# levels - confirm it still matches the current data.
cluster_level = get_clusters_by_level(0.1111111111111111)
# Despite the name, these are the distinct cluster labels, not tree levels
unique_levels = list(set(cluster_level))

# Print the member positions of each cluster. `cluster` stays bound to the
# last cluster after the loop ends.
for count, level in enumerate(unique_levels):
    cluster = [i for i, x in enumerate(cluster_level) if x == level]
    print(f"clust: {cluster}")


In [None]:
# Same listing for the cluster labels loaded from disk; this rebinds
# unique_levels, count, level and cluster
unique_levels = list(set(clusters))

# Print the member positions of each cluster. `cluster` stays bound to the
# last cluster after the loop and is read by a later cell (sequence_ids).
for count, level in enumerate(unique_levels):
    cluster = [i for i, x in enumerate(clusters) if x == level]
    print(f"clust: {cluster}")


In [None]:
# Display the y-coordinates of the dendrogram links for inspection
dend['dcoord']

In [None]:
# Manual recomputation of the level list, mirroring get_all_levels: start with
# the -1 entry and collect the second y-coordinate of every dendrogram link
branch_depths = [-1]
for d in dend['dcoord']:
    branch_depths.append(d[1])
# Drop duplicates (dict keys are unique), then sort ascending
branch_depths = list(dict.fromkeys(branch_depths))
branch_depths.sort()

print(branch_depths)

In [None]:
# Translate the member positions in `cluster` into stay ids. `cluster` is
# whatever the most recently executed cluster-printing loop left behind, i.e.
# only the last cluster that loop iterated over.
sequence_ids = [int(stays[i]) for i in cluster]


In [None]:
# Display the full dendrogram result dict for inspection
dend