# Cache MIMIC distances (code, patient)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import time
import threading
import datetime
import multiprocessing as mp
from multiprocessing import Process, Manager
from itertools import repeat
import multiprocessing as mp
import os
import pprint
from magictree import * #tree layout

In [None]:
# ICD-9-CM ontology table; the graph-building code reads its 'Class ID' and
# 'Parents' columns.
raw = pd.read_csv('ICD9CM.csv')
# The rest of the notebook iterates `mimic` as patient -> visit -> code, i.e.
# nested Python lists stored in an object array. NumPy >= 1.16.3 refuses to
# load object arrays unless allow_pickle=True is passed explicitly.
# NOTE: this unpickles the file's contents, so only load a file you trust.
mimic = sorted(np.load('mimic_sequences.npy', allow_pickle=True))

In [None]:
import pickle

def pickle_save(obj, name):
    """Pickle ``obj`` into the file ``name + '.pkl'``.

    The file is opened in binary write mode (an existing file is overwritten)
    and the highest available pickle protocol is used.
    """
    target = name + '.pkl'
    with open(target, 'wb') as sink:
        pickle.dump(obj, sink, protocol=pickle.HIGHEST_PROTOCOL)

def pickle_load(name):
    """Return the object stored in the pickle file ``name + '.pkl'``.

    NOTE: unpickling can execute arbitrary code; only use this on files
    produced by a trusted source (e.g. by ``pickle_save``).
    """
    source = name + '.pkl'
    with open(source, 'rb') as stream:
        loaded = pickle.load(stream)
    return loaded

In [None]:
# Build the two hierarchy graphs. Every edge points parent -> child.

# Full ICD-9 hierarchy, hung below an artificial 'ROOT' node.
icd9_G = nx.DiGraph()
icd9_G.add_edge('ROOT', 'owl#Thing')
for _, record in raw.iterrows():
    # Identifiers are the last '/'-separated component of each cell.
    child = record['Class ID'].split('/')[-1]
    parents_cell = record['Parents']
    # Rows whose 'Parents' cell is not a string are attached directly to ROOT.
    if isinstance(parents_cell, str):
        parent = parents_cell.split('/')[-1]
    else:
        parent = 'ROOT'
    icd9_G.add_edge(parent, child)
# Edge added by hand (presumably absent from the CSV -- verify).
icd9_G.add_edge('719.7', '719.70')

# Every distinct code found in any visit of any patient.
mimic_leaves = sorted({code for patient in mimic for visit in patient for code in visit})

# All nodes lying on a shortest ROOT -> code path for the codes above.
reachable = set()
for leaf in mimic_leaves:
    reachable.update(nx.shortest_path(icd9_G, 'ROOT', leaf))
mimic_nodes = sorted(reachable)

# Sub-hierarchy restricted to those nodes: each non-root node is linked to the
# first parent that icd9_G reports for it.
mimic_G = nx.DiGraph()
for node in mimic_nodes:
    if node != 'ROOT':
        first_parent = list(icd9_G.in_edges(node))[0][0]
        mimic_G.add_edge(first_parent, node)

# Cache code2code dist

In [None]:
# Distinct codes occurring in the data, in sorted order.
all_codes = sorted({code for patient in mimic for visit in patient for code in visit})
# For each code, the full node path ROOT -> ... -> code inside mimic_G.
verbose_mimic_codes = [
    nx.shortest_path(mimic_G, source='ROOT', target=code)
    for code in all_codes
]

# Wu-Palmer similarity of two nodes, each given as its PATH FROM ROOT.
# The length of the common *prefix* of the two paths determines the depth of
# their lowest common ancestor (LCA).
def wup_chain(a1,a2):
    """Return the Wu-Palmer similarity of the end nodes of two root paths.

    Parameters
    ----------
    a1, a2 : sequences of node labels, each starting at the same root and
        ending at the node of interest.

    Returns
    -------
    float
        ``2*dr / (da + db + 2*dr)`` where ``dr`` is the depth of the LCA
        (the root has depth 0) and ``da`` / ``db`` are the numbers of steps
        from the LCA down to each end node. Identical paths give 1.0; paths
        that share only the root give 0.0.

    Paths that do not share their first element are outside the intended
    domain; for them the formula is applied as-is (``dr == -1``).
    """
    # Length of the common prefix of the two paths.
    lca = 0
    for n1, n2 in zip(a1, a2):
        if n1 != n2:
            break
        lca += 1
    dr = lca - 1
    da = len(a1) - lca
    db = len(a2) - lca
    if da == 0 and db == 0:
        # Identical paths. Without this guard the root-vs-root case
        # (dr == 0) would divide by zero instead of returning 1.0.
        return 1.0
    return (2 * dr) / (da + db + 2 * dr)

In [None]:
# Table of wup_chain values for every ordered pair of codes, keyed by
# (code_a, code_b) -- the last element of each root path is the code itself.
c2c_dict = {}
for row_idx, path_a in enumerate(verbose_mimic_codes):
    # Progress indicator: one number every 100 codes.
    if row_idx % 100 == 0:
        print(row_idx, end=' ')
    code_a = path_a[-1]
    for path_b in verbose_mimic_codes:
        c2c_dict[(code_a, path_b[-1])] = wup_chain(path_a, path_b)

In [None]:
# Persist the code-pair table to 'c2c_dist.pkl' (pickle_save appends '.pkl').
pickle_save(c2c_dict,'c2c_dist')

In [None]:
# Spot checks on the cached table. c2c_dict stores the values returned by
# wup_chain; wup_visit later uses `1 - value` as a substitution cost.
# Same code on both sides of the key.
print('self-dist?',c2c_dict[('228.02','228.02')])
# Both key orders of one pair.
print('symmetry?',c2c_dict[('228.02','518.81')],c2c_dict[('518.81','228.02')])
# Presumably 519.1 sits closer to 518.81 in the hierarchy than V55.0 does -- verify.
print('near vs far?',c2c_dict[('518.81','519.1')],c2c_dict[('518.81','V55.0')])

# Distance functions

In [None]:
# Label of the artificial top node of the hierarchy.
root = 'ROOT'
# One-character stand-ins for codes: the 95 characters with code points
# 33 through 127.
symbols = list(map(chr, range(33, 128)))

In [None]:
from weighted_levenshtein import lev, osa, dam_lev

def wup_visit(graph,v1,v2,verbose=False):
    """Weighted Levenshtein distance between two visits.

    Each distinct code of the two visits is mapped to one character taken
    from the module-level ``symbols`` list. Both visits are then written as
    strings of their sorted codes and compared with ``lev``. Insertions and
    deletions cost 1; replacing a code of ``v1`` by a code of ``v2`` costs
    ``1 - c2c_dict[(code_1, code_2)]``.

    Parameters
    ----------
    graph : unused; kept so existing callers keep working.
    v1, v2 : lists of codes (they are concatenated with ``+``).
    verbose : when true, print every substitution cost that is set.

    Returns
    -------
    The value returned by ``lev`` for the two encoded visits.

    Raises
    ------
    IndexError
        If the two visits together hold more distinct codes than there are
        entries in ``symbols``.
    KeyError
        If a (code_1, code_2) pair is missing from ``c2c_dict``.
    """
    # encode: one symbol per distinct code, assigned in sorted-code order.
    # symbols is indexed explicitly so that running out of symbols still
    # raises IndexError rather than silently truncating.
    all_icd9 = sorted(set(v1 + v2))
    encoder = {code: symbols[pos] for pos, code in enumerate(all_icd9)}
    coded_v1 = ''.join(encoder[c] for c in sorted(v1))
    coded_v2 = ''.join(encoder[c] for c in sorted(v2))
    # raw matrices: unit cost everywhere, indexed by character code point
    icd9_substitute_costs = np.ones((128, 128), dtype=np.float64)
    icd9_insert_costs = np.ones(128, dtype=np.float64)
    icd9_delete_costs = np.ones(128, dtype=np.float64)
    # now, discount substitutions between codes that are close in the hierarchy
    for icd9_1 in v1:
        indx_1 = ord(encoder[icd9_1])
        for icd9_2 in v2:
            indx_2 = ord(encoder[icd9_2])
            icd9_substitute_costs[indx_1, indx_2] = 1 - c2c_dict[(icd9_1, icd9_2)]
            if verbose:
                print(icd9_1,'->',icd9_2,': cost',icd9_substitute_costs[indx_1][indx_2])
    return lev(coded_v1, coded_v2,
               substitute_costs=icd9_substitute_costs,
               insert_costs=icd9_insert_costs,
               delete_costs=icd9_delete_costs)

In [None]:
def wup_patient(graph, p1, p2):
    """Dynamic-time-warping distance between two patients.

    A patient is a sequence of visits; the local cost of matching two visits
    is ``wup_visit`` rounded to 3 decimals. The accumulated cost follows the
    usual DTW recurrence

        D[i, j] = cost(i, j) + min(D[i-1, j], D[i, j-1], D[i-1, j-1])

    Predecessors outside the matrix count as +inf, so every warping path
    starts by matching the first visit of ``p1`` with the first visit of
    ``p2``. (Treating them as 0 would let an alignment skip any prefix of
    either patient at no cost, and the first row/column would never
    accumulate.)

    Parameters
    ----------
    graph : passed through to ``wup_visit``.
    p1, p2 : sequences of visits.

    Returns
    -------
    numpy.float64
        Accumulated cost of the best alignment of the two full sequences.
        0.0 when both sequences are empty, ``inf`` when exactly one is.
    """
    n, m = len(p1), len(p2)
    # Row 0 and column 0 form a +inf border; dtw[0, 0] = 0 is the only
    # admissible starting point.
    dtw = np.full((n + 1, m + 1), np.inf, dtype=np.float64)
    dtw[0, 0] = 0
    for i, v1 in enumerate(p1, start=1):
        for j, v2 in enumerate(p2, start=1):
            cost = np.round(wup_visit(graph, v1, v2), 3)
            dtw[i, j] = cost + min(dtw[i - 1, j], dtw[i, j - 1], dtw[i - 1, j - 1])
    return dtw[n, m]

# Cache patient2patient dist

In [None]:
# Manager-backed dictionary: the pool workers started by distrib_tasks write
# their results into it through `slave`.
manager = mp.Manager()
p2p_dict = manager.dict()
# Number of unordered patient pairs, n*(n-1)/2. Not referenced again in the
# visible part of the notebook.
max_dict = int((len(mimic)**2 - len(mimic))/2)

In [None]:
# One task per patient: slave(i) handles every pair (i, j) with j > i.
def slave(i):
    """Store the distance from patient ``i`` to every later patient.

    For each index ``j > i`` the value of ``wup_patient`` for the two
    patients is written to ``p2p_dict[(i, j)]``. Every 100th index is
    printed as a progress marker once its pairs are done.
    """
    for j, other in enumerate(mimic):
        if j <= i:
            continue
        p2p_dict[(i, j)] = wup_patient(icd9_G, mimic[i], other)
    if i % 100 == 0:
        print(str(i), end='\n')

def distrib_tasks(num_parallel):
    """Run ``slave(i)`` for every patient index on a process pool.

    Parameters
    ----------
    num_parallel : number of worker processes in the pool.

    Blocks until every task has finished, then prints an empty line followed
    by the elapsed wall-clock time in seconds.

    Raises
    ------
    Exception
        The first exception raised inside any worker task is re-raised here
        after the timing has been printed. Without this, failed tasks would
        go unnoticed and the result dictionary would be silently incomplete.
    """
    start = time.time()
    # The context manager guarantees the workers are torn down even if
    # submitting or joining is interrupted.
    with mp.Pool(num_parallel) as pool:
        # One asynchronous task per patient index; keep the handles so that
        # worker failures can be surfaced below.
        pending = [pool.apply_async(slave, args=(i,)) for i in range(len(mimic))]
        # No more tasks will be submitted; wait for all of them to finish.
        pool.close()
        pool.join()
    print()
    print(time.time() - start)
    # AsyncResult.get() re-raises whatever the task raised in the worker.
    for task in pending:
        task.get()

In [None]:
# Fill p2p_dict for all patient pairs using a pool of 80 worker processes.
distrib_tasks(num_parallel=80) #parallelise

In [None]:
# Copy the manager-backed dictionary into a plain dict, then pickle it to
# 'p2p_holy_2.pkl' (pickle_save appends '.pkl').
pickle_save(dict(p2p_dict), 'p2p_holy_2')

# minitest

In [None]:
# Reload the cached table from 'p2p_holy_2.pkl'; this rebinds p2p_dict to the
# unpickled object, replacing the manager-backed dictionary.
p2p_dict = pickle_load('p2p_holy_2')

In [None]:
# Display the patient at index 7330 (first of the two compared in this minitest).
mimic[7330]

In [None]:
# Display the patient at index 3670 (second of the two compared in this minitest).
mimic[3670]

In [None]:
# Compute the distance for this pair directly, to compare with the cached value.
wup_patient(icd9_G,mimic[7330],mimic[3670])

In [None]:
# Cached value for the same pair. `slave` only stores keys (i, j) with i < j,
# hence the smaller index comes first.
p2p_dict[(3670,7330)]