# Distance matrix computation from an index patient 
For each of the other patients, a distance matrix of size (n_label, n_label) is computed between the index patient and that patient. 

The per-label and cross-label distance computations are implemented in the module distance.py.

Input : 
A pickle file containing a dataframe of patients with the following information: 
source (file/clinical note id), patient_id, terms, FastText embedding, BERT embedding, label. 
Optionally: an index patient file; if none is given, the index patient is selected from the dataset.


Output : Distance matrix of size (number of patients to compare with, n_label, n_label)


In [1]:
import os
import numpy as np
import pandas as pd 

import sklearn
from scipy.spatial.distance import cosine
from sklearn.metrics import euclidean_distances
from sklearn.feature_extraction.text import CountVectorizer

# import local module of EMD computation along each label and between labels 
import distance 

# Input / output locations used throughout the notebook.
file_path = './out/02022022_train_val_local_annotated_accent_lower_FastText_BERT_mean2l_Emb.pkl'
img_path = "./out/img_annot_Pheno_attributes/"
file_CLASS_path = './results/Gold_classif_expe_pheno_patient_id.csv'

# Name of the term embedding handed to the distance functions.
Embedding = "Fasttext"  # one of 'Fasttext', 'local_BERT' or 'EDS_BERT'
if Embedding not in ('Fasttext', 'local_BERT', 'EDS_BERT'):
    # Explicit raise rather than `assert`: assertions are removed when Python runs with -O.
    raise ValueError(
        "Unsupported Embedding {!r}; expected 'Fasttext', 'local_BERT' or 'EDS_BERT'".format(Embedding)
    )

In [2]:
# Load the dataframe of annotated terms.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files you trust.
df_init = pd.read_pickle(file_path)

# Optional exclusion of the 'etatsosy' rows (the "first experiment" setting).
# The former `df.drop(..., inplace=False)` call threw its result away, so those
# rows were never actually removed. The switch below makes the choice explicit
# and keeps the behaviour the notebook really had; set it to True to drop them.
exclude_etatsosy = False
df = df_init.copy()
if exclude_etatsosy:
    df = df.loc[df['label'] != 'etatsosy'].copy()

# Sorted list of the distinct labels carried by the terms
labels = sorted(set(df['label']))
n_lab = len(labels)
print('\n number of labels', n_lab, '\nLabels', labels)

# Distinct source files
files = list(set(df['source']))
# print("number of files", len(files))

# Dictionary {file: set of labels present in that file}, built in a single
# groupby pass instead of filtering the whole dataframe once per file.
files_dict = {source: set(group) for source, group in df.groupby('source')['label']}

# Positional row number, independent of the dataframe's own index
df['idx'] = range(len(df))
df.sample(5)



 number of labels 23 
Labels ['ORL', 'blessures', 'cardiovasculaires', 'chimiques', 'digestif', 'endocriniennes', 'etatsosy', 'femme', 'genetique', 'hemopathies', 'homme', 'immunitaire', 'infections', 'nerveux', 'nutritionnelles', 'oeil', 'osteomusculaires', 'parasitaires', 'peau', 'respiratoire', 'stomatognathique', 'tumeur', 'virales']


Unnamed: 0,idx,source,term,fastext_embeddings,BERT_embeddings,label,label_weight
3201,3201,CRH_scl_sample_30.ann,kystes ovariens,"[0.81571275, 12.604499, -25.561928, -6.7058015...","[-0.14426988, 0.15276805, 0.20322166, -0.00078...",femme,0.044944
1740,1740,CRH_scl_sample_42.ann,anti centromères positifs,"[2.812077, -6.981257, -45.759495, -79.20795, 1...","[-0.23235501, 0.52911335, 0.40961814, -0.30308...",etatsosy,0.447761
1617,1617,CRH_taka_sample_10.ann,dents manquantes,"[-3.3379304, 17.830576, -24.00285, -34.287975,...","[-0.30510905, 0.28690282, 0.13931705, -0.29548...",stomatognathique,0.005988
4799,4799,CRH_taka_sample_0.ann,majoration du nombre d'infarctus lacunaires en...,"[-3.7410843, 15.352729, -21.703049, -18.029694...","[-0.12426093, 0.060462743, -0.15853764, 0.1929...",nerveux,0.175
1197,1197,CRH_sapl_sample_10.ann,syndrome sec oculaire,"[29.312986, 10.587681, -32.925625, -25.076454,...","[0.09900762, 0.57523936, 0.624562, -0.32313663...",etatsosy,0.424581


In [129]:
# Choose the index patient; every remaining file is a comparison candidate.
file_index = "CRH_scl_sample_46.ann"
files_lab = list(files)
files_lab.remove(file_index)  # raises ValueError when the index file is not in the dataset
print(len(files_lab))

150


In [130]:
# Compute the distance matrix between the index patient and every other patient.
#
# dist_dict maps (file, label_1, label_2) -> distance. dist_matrix stores the same
# values at [file position, label_1 position, label_2 position], the positions
# being those of the sorted files and sorted labels.
dist_dict = {}
dist_matrix = np.zeros([len(files_lab), len(labels), len(labels)])

ordered_files = sorted(set(files_lab))
ordered_labels = sorted(labels)

for file_pos, file in enumerate(ordered_files):
    for row, lab_1 in enumerate(ordered_labels):
        for col, lab_2 in enumerate(ordered_labels):
            key = (file, lab_1, lab_2)
            if key in dist_dict:
                continue
            if lab_1 == lab_2:
                # same label on both sides: within-label distance (diagonal entry)
                value = distance.distance_files_by_lab(
                    df, file_index, file, Embedding, lab_1,
                    Distance_type='cosine', verbose=False)
            else:
                # two different labels: cross-label distance
                value = distance.distance_files_cross_lab(
                    df, file_index, file, Embedding, lab_1, lab_2,
                    Distance_type='cosine', verbose=False)
            dist_dict[key] = value
            dist_matrix[file_pos, row, col] = value

    # progress report every 30 files
    processed = file_pos + 1
    if processed % 30 == 0:
        print('i', processed)
    

i 30
i 60
i 90
i 120
i 150


In [131]:
# print(dist_dict[('CRH_lupus_sample_0.ann', 'etatsosy', 'osteomusculaires')])

In [132]:
# Lookup tables to navigate between label / file names and their positions.
# Positions follow the sorted order of the labels and of the comparison files.
# Each list is sorted once here rather than once per loop iteration.
sorted_labels = sorted(labels)
sorted_files = sorted(files_lab)

# label name -> 0-based position
lab_index = {lab: pos for pos, lab in enumerate(sorted_labels)}

# file name -> 0-based position
files_index = {name: pos for pos, name in enumerate(sorted_files)}

# 1-based rank -> file name (offset by one with respect to files_index)
index_files = {rank: name for rank, name in enumerate(sorted_files, start=1)}


In [133]:
# dist_matrix[files_index['CRH_lupus_sample_2.ann'], lab_index['etatsosy'], lab_index['immunitaire']]

In [134]:
# Weight matrix used to keep only the labels of interest.
label_1 = 'respiratoire'
label_2 = 'immunitaire'

# their respective positions along the label axes
idx_1, idx_2 = lab_index[label_1], lab_index[label_2]
# idx_3 = lab_index[label_3]

# associated weight (not referenced by the cells visible in this notebook excerpt)
beta = 5

# zeros everywhere, 1 on the two selected diagonal entries, 0.5 on the two cross terms
weights = np.zeros((n_lab, n_lab))  # +100
weights[[idx_1, idx_2], [idx_1, idx_2]] = 1
weights[[idx_1, idx_2], [idx_2, idx_1]] = 0.5



In [135]:
# weights[0:idx_1+1, 0: idx_2+1]

In [136]:
# Stack one copy of the weight matrix per comparison patient.
weights_all = np.tile(weights, (len(files_lab), 1, 1))

In [137]:
# Sanity check: shape of the stacked weights and the cross-label weight read back at patient position 50.
weights_all.shape, weights_all[50, idx_1, idx_2]

((150, 23, 23), 0.5)

In [138]:
# The two 3D arrays are about to be multiplied element by element to get the distance
# from the index patient to every other patient on the selected labels, so their
# shapes are printed side by side first.
shape_dist, shape_weights = dist_matrix.shape, weights_all.shape
print('Correponding shapes of the two matrices : distance matrix =', shape_dist,
      'and weights =', shape_weights)

# the end goal is a 150*1 array (one value per compared patient)

Correponding shapes of the two matrices : distance matrix = (150, 23, 23) and weights = (150, 23, 23)


In [139]:
# Element-wise product of the distances with their weights
result = dist_matrix * weights_all

In [140]:
# Shape of the weighted distances and their largest value, NaN entries being ignored by nanmax.
result.shape, np.nanmax(result)

((150, 23, 23), 0.9162385383060004)

In [141]:
# Replace NaN by 0 on a copy so the sums below are defined (infinities keep numpy's default handling).
# NOTE(review): a zero-filled entry adds nothing to a patient's total, so patients with
# many NaN distances may look artificially close — confirm this is the intended treatment.
result_no_nan = np.nan_to_num(result, nan=0)

In [142]:
# Collapse the last label axis: one partial sum per (patient, first label)
reduce_result = result_no_nan.sum(axis=2)
reduce_result.shape

(150, 23)

In [143]:
# Collapse the remaining label axis: one aggregated weighted distance per patient
reduce_result_2 = reduce_result.sum(axis=1)
reduce_result_2.shape

(150,)

In [144]:
# Print the file whose aggregated distance to the index patient is the smallest.
# The argmin does not depend on the loop variable, so it is computed once up front.
closest_position = np.argmin(reduce_result_2)
for file, index in files_index.items():
    if index == closest_position:
        print(file)

CRH_lupus_sample_23.ann


In [145]:
# Turn the array into {1-based rank: aggregated distance} so values can be sorted
# to find the closest files; the ranks line up with the keys of index_files.
d = {rank: dist for rank, dist in enumerate(reduce_result_2.flatten(), start=1)}

In [146]:
# Replace the numeric ranks by the matching file names: {file name: aggregated distance}
file_dist_dict = {index_files[rank]: dist for rank, dist in d.items()}

#file_dist_dict

In [147]:
# Rename the files of the class of interest so that they stand out in the ranking
# (used to check whether the files of that class come first).
# act_class = 'osteoporose'
# act_class = 'nephro_lupus'
act_class = 'PINS_Sclerodermie'

# Load the class assigned to each file
class_df = pd.read_csv(file_CLASS_path)

# {source: set of its class labels}, built once instead of re-scanning class_df
# (a list conversion plus a full filter) for every single file.
labels_by_source = {source: set(group) for source, group in class_df.groupby('source')['label']}

# Files of the class of interest are prefixed with the first five characters of the
# class name followed by '_'; files absent from the class table are left out.
prefix = act_class[0:5] + '_'
file_class_dict = {}
for k, v in file_dist_dict.items():
    if k not in labels_by_source:
        continue
    if act_class in labels_by_source[k]:
        file_class_dict[prefix + k] = v
    else:
        file_class_dict[k] = v

In [148]:
# class_df.sample(10)

# Display the index patient file that the distances were computed against.
file_index

'CRH_scl_sample_46.ann'

In [149]:
# Files ordered by increasing aggregated distance, keeping only distances above 1.
# NOTE(review): presumably the `> 1` cut-off discards patients whose total is mostly
# zero-filled NaN entries — confirm the threshold.
dict(sorted(((name, dist) for name, dist in file_class_dict.items() if dist > 1),
            key=lambda pair: pair[1]))

{'PINS__CRH_scl_sample_47.ann': 1.2369788912040007,
 'PINS__CRH_scl_sample_26.ann': 1.3249479008505007,
 'CRH_scl_sample_52.ann': 1.4253454251615008,
 'PINS__CRH_scl_sample_44.ann': 1.4654074442395009,
 'PINS__CRH_scl_sample_9.ann': 1.5581567693205005,
 'PINS__CRH_scl_sample_20.ann': 1.6137180796715007,
 'PINS__CRH_scl_sample_37.ann': 1.6210630802075012,
 'PINS__CRH_scl_sample_5.ann': 1.6763391310650007,
 'PINS__CRH_scl_sample_6.ann': 1.6884360980770006,
 'PINS__CRH_scl_sample_45.ann': 1.6902605706335012,
 'PINS__CRH_scl_sample_27.ann': 1.6938177010000006,
 'PINS__CRH_scl_sample_14.ann': 1.7066654958505005,
 'PINS__CRH_scl_sample_10.ann': 1.719246261970501,
 'PINS__CRH_scl_sample_38.ann': 1.741313020669001,
 'CRH_lupus_sample_20.ann': 1.799654607146001,
 'CRH_scl_sample_48.ann': 1.837390735360001,
 'PINS__CRH_scl_sample_19.ann': 1.8447255088055008,
 'CRH_scl_sample_24.ann': 1.845861473932501,
 'CRH_scl_sample_17.ann': 1.8499118870210007,
 'PINS__CRH_scl_sample_30.ann': 1.85650561881700