# Network-based approach for drug repurposing using drug signature and disease phenotype
### Ariane ALIX
#### ENS Paris-Saclay, Department of Mathematics

In [29]:
%reload_ext autoreload
%autoreload 2

In [30]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

dir_path = os.path.abspath('')
os.chdir(dir_path)

## Importation of the data and reformatting

### Importing 

#### To pandas Dataframes

In [31]:
for file in os.listdir('data'):
    print(file)

disease_phenotypes1.csv
disease_phenotypes2.csv
drug_disease_associations.csv
drug_signatures.csv


In [32]:
disease_phenotypes1=pd.read_csv('data/disease_phenotypes1.csv')
disease_phenotypes2=pd.read_csv('data/disease_phenotypes2.csv')
drug_signatures=pd.read_csv('data/drug_signatures.csv')
drug_disease_associations=pd.read_csv('data/drug_disease_associations.csv')

### Re-formatting

#### Dropping first column if needed, replace index first good column, replace NaN to 0

In [33]:
disease_phen1=disease_phenotypes1.drop(['Unnamed: 0'],axis=1).set_index(['gene']).fillna(0)
disease_phen2=disease_phenotypes2.drop(['Unnamed: 0'],axis=1).set_index(['gene']).fillna(0)
drug_sign=drug_signatures.rename(columns={"Unnamed: 0": "gene"}).set_index(['gene']).fillna(0)
drug_disease_asso=drug_disease_associations.drop(['Unnamed: 0'],axis=1).set_index(['drug_name'])

#### Fusion of the two tables of disease phenotypes

In [34]:
disease_phen=pd.concat([disease_phen1,disease_phen2],sort=False).fillna(0)

#### Switch everything to lower case

In [35]:
drug_disease_asso['ind_id']=drug_disease_asso['ind_id'].str.lower()
drug_disease_asso['drug_id']=drug_disease_asso['drug_id'].str.lower()
drug_disease_asso['ind_name']=drug_disease_asso['ind_name'].str.lower()
drug_disease_asso.index=drug_disease_asso.index.str.lower()

drug_sign.columns=drug_sign.columns.str.lower()

In [36]:
list(drug_sign.columns)

['methyl 2,5-dihydroxycinnamate',
 'compound 10',
 'gsk-3-inhibitor-ii',
 'hy-10456',
 'raf 265',
 'c3930',
 'pac 1',
 'az20',
 'brd-k44432556',
 '32937',
 's1018',
 'piperlongumine (hplc)',
 'apigenin triacetate',
 'y-27632',
 'mapp, l-erythro',
 'idelalisib',
 'brd-a36010170',
 'bi 2536',
 'ag 957',
 'brd-k96799727',
 'dichlorobenzamil',
 'ag 556',
 'salermide',
 'entinostat',
 'fit',
 'c8273',
 'plx-4720',
 'brd-k56411643',
 'brd-a35588707',
 'brd-k32582686',
 'wz-4-145',
 'stk397047',
 'calyculin a',
 'cyclosporine',
 'ro 90-7501',
 'k784-3187',
 'brd-k46373671',
 'lapatinib',
 'ncgc00188536-01',
 'brd-k35708212',
 'brd-k05977355',
 'mls-0091944.0001',
 'gw405833 hydrochloride',
 'brd-k52522949',
 'idarubicin hcl',
 'selumetinib',
 'daunorubicin',
 'hy-50895',
 '1,25-dihydroxyvitamin d3',
 'syk-inhibitor',
 'ly-2183240',
 'vx-680',
 '6-diazo-5-oxo-l-norleucine',
 'chaetocin',
 'minoxidil',
 'hy-50295',
 'gr 103691',
 'brd-a13964793',
 'brd-k87158025',
 'brd-k11634954',
 'sb-216763'

#### Averaging values for duplicates drugs in drug_sign tables

In [37]:
drug_sign = drug_sign.transpose()
drug_sign = drug_sign.groupby(by=drug_sign.index, sort=False).mean()
drug_sign = drug_sign.transpose()

#### Averaging values for duplicates genes in the tables

In [38]:
disease_phen=disease_phen.groupby(['gene'],sort=False).mean()
drug_sign=drug_sign.groupby(['gene'],sort=False).mean()

#### Preview of modified tables

In [39]:
display(disease_phen.head(3))
display(drug_sign.head(3))
display(drug_disease_asso.head(3))

Unnamed: 0_level_0,c0153195,c0016053,c0018916,c0346993,c0349790,c0376358,c0520577,c0037011,c0473527,c0036202,...,c0020428,c1274470,c0038363,c0278553,c0152973,c0004352,c0004096,c0149776,c0034065,c0038358
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MMP1,-0.02739,0.010917,0.0,0.0,-0.099726,0.0,0.0,0.0,0.0,0.02091,...,0.0,0.0,0.0,0.017132,0.0,0.0,0.0,0.030745,0.016403,0.013822
SMG1P3,-0.025112,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SMG1P1,-0.024621,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.008416,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0_level_0,"methyl 2,5-dihydroxycinnamate",compound 10,gsk-3-inhibitor-ii,hy-10456,raf 265,c3930,pac 1,az20,brd-k44432556,32937,...,pha-793887,f3103-0039,beta-escin,clofarabine,pik-90,dovitinib,brd-k24632213,betamethasone acetate,brd-k17953061,"2,4-dideoxy-dc-45-a2"
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
RNF14,0.009657,0.011228,0.01354,0.007519,0.004846,0.000957,-0.002974,-0.001293,0.00756,-0.003778,...,-0.00455,-0.000549,0.000281,0.000462,0.001925,0.003712,0.005116,0.005032,-0.003062,0.0053
UBE2Q1,0.007538,-0.003624,-0.003987,-0.001868,0.009035,-0.004862,-0.00238,0.000557,-0.002815,0.003299,...,0.012603,0.007133,0.011412,0.001781,0.000659,0.003501,-0.002974,0.009794,-0.000961,-0.006942
RNF17,0.003182,-0.002379,-0.013635,0.00025,-0.003551,-0.000438,-0.002424,-0.002837,0.001408,0.002562,...,-0.011662,0.003717,0.002052,0.002235,0.000363,-0.022385,-0.002264,0.00352,-0.002205,-0.002528


Unnamed: 0_level_0,drug_id,ind_name,ind_id,status
drug_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
lepirudin,db00001,heparin-induced thrombocytopenia with thrombosis,c0272275,Approved
cetuximab,db00002,squamous cell carcinoma of mouth,c0585362,Approved
cetuximab,db00002,squamous cell carcinoma of nose,c3163899,Approved


### Check drugs and diseases present in tables and filter

#### Diseases

In [40]:
diseases=drug_disease_asso['ind_id'].values

diseases2=disease_phen.columns

n_diseases=len(diseases)
n_diseases2=len(diseases2)
n_diseases_unique=len(np.unique(diseases))

print('There are',n_diseases,'occurence of diseases (',n_diseases_unique, 'distincts) in the drug-disease associations table.')
print('There are',n_diseases2,'diseases in the diseases phenotypes table.')

# Inclusion
count=0
for d in diseases2:
    if d in diseases:
        count+=1
        
print(count,'out of',n_diseases2,'of diseases in the phenotypes table are in the drug-disease associations table.')

There are 7325 occurence of diseases ( 1465 distincts) in the drug-disease associations table.
There are 48 diseases in the diseases phenotypes table.
48 out of 48 of diseases in the phenotypes table are in the drug-disease associations table.


##### List of all diseases

In [41]:
all_diseases=list(set(diseases))
print(len(all_diseases))

1465


#### Drugs 

In [42]:
drugs=drug_disease_asso.index.values

drugs2=drug_sign.columns

n_drugs=len(drugs)
n_drugs2=len(np.unique(drugs2))
n_drugs_unique=len(np.unique(drugs))

print('There are',n_drugs,'occurence of drugs (',n_drugs_unique, 'distincts ) in the drug-disease associations table.')
print('There are',n_drugs2,'drugs in the drugs signatures table.')


# Inclusion
count=0
for d in np.unique(drugs):
    if d in drugs2:
        count+=1
        
print(count,'out of',n_drugs_unique,'of drugs in the drug-disease associations table are in the signatures table.')

There are 7325 occurence of drugs ( 1543 distincts ) in the drug-disease associations table.
There are 617 drugs in the drugs signatures table.
45 out of 1543 of drugs in the drug-disease associations table are in the signatures table.


##### List of all drugs

In [43]:
all_drugs=list(set(np.concatenate((drugs,drugs2))))
print(len(all_drugs))

2115


#### Keeping only lines of approved drugs in the drug association table

In [44]:
drug_disease_asso=drug_disease_asso[drug_disease_asso['status']=='Approved']

### Final modified tables

In [45]:
display(drug_disease_asso.head(3),drug_disease_asso.shape)
display(drug_sign.head(3),drug_sign.shape)
display(disease_phen.head(3),disease_phen.shape)

Unnamed: 0_level_0,drug_id,ind_name,ind_id,status
drug_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
lepirudin,db00001,heparin-induced thrombocytopenia with thrombosis,c0272275,Approved
cetuximab,db00002,squamous cell carcinoma of mouth,c0585362,Approved
cetuximab,db00002,squamous cell carcinoma of nose,c3163899,Approved


(6677, 4)

Unnamed: 0_level_0,"methyl 2,5-dihydroxycinnamate",compound 10,gsk-3-inhibitor-ii,hy-10456,raf 265,c3930,pac 1,az20,brd-k44432556,32937,...,pha-793887,f3103-0039,beta-escin,clofarabine,pik-90,dovitinib,brd-k24632213,betamethasone acetate,brd-k17953061,"2,4-dideoxy-dc-45-a2"
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
RNF14,0.009657,0.011228,0.01354,0.007519,0.004846,0.000957,-0.002974,-0.001293,0.00756,-0.003778,...,-0.00455,-0.000549,0.000281,0.000462,0.001925,0.003712,0.005116,0.005032,-0.003062,0.0053
UBE2Q1,0.007538,-0.003624,-0.003987,-0.001868,0.009035,-0.004862,-0.00238,0.000557,-0.002815,0.003299,...,0.012603,0.007133,0.011412,0.001781,0.000659,0.003501,-0.002974,0.009794,-0.000961,-0.006942
RNF17,0.003182,-0.002379,-0.013635,0.00025,-0.003551,-0.000438,-0.002424,-0.002837,0.001408,0.002562,...,-0.011662,0.003717,0.002052,0.002235,0.000363,-0.022385,-0.002264,0.00352,-0.002205,-0.002528


(12717, 617)

Unnamed: 0_level_0,c0153195,c0016053,c0018916,c0346993,c0349790,c0376358,c0520577,c0037011,c0473527,c0036202,...,c0020428,c1274470,c0038363,c0278553,c0152973,c0004352,c0004096,c0149776,c0034065,c0038358
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MMP1,-0.02739,0.010917,0.0,0.0,-0.099726,0.0,0.0,0.0,0.0,0.02091,...,0.0,0.0,0.0,0.017132,0.0,0.0,0.0,0.030745,0.016403,0.013822
SMG1P3,-0.025112,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SMG1P1,-0.024621,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.008416,0.0,0.0,0.0,0.0,0.0,0.0,0.0


(11393, 48)

## Computation of matrices (adjacency, similarity, weights...)

### Adjacency matrix between diseases and drugs from drug_disease association table


In [46]:
Adj = pd.crosstab(drug_disease_asso['ind_id'], drug_disease_asso.index)
display(Adj.head())

col_0,abacavir,abarelix,abatacept,abiraterone,acamprosate,acarbose,acebutolol,aceprometazine,acetaminophen,acetazolamide,...,ziconotide,zidovudine,zileuton,zinc oxide,zinc sulfate,ziprasidone,zoledronic acid,zolmitriptan,zolpidem,zopiclone
ind_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
c0000810,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
c0001126,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
c0001144,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
c0001206,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
c0001207,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Filling the adjacency matrix with 0
Adding columns of 0 for drugs and lines of 0 for diseases that were in one table and not another

In [47]:
A=Adj.copy()

i=0
for drug in all_drugs:
    if drug not in A.columns:
        A[drug]=0
        
    i+=1
    if i%100==0 or  i==len(all_drugs):
        print(i/len(all_drugs)*100,'%')
        
        
i=0
for disease in all_diseases:
    if disease not in A.index:
        size=len(A.columns)
        A.loc[disease]=[0]*size
    
    i+=1
    if i%100==0 or  i==len(all_diseases):
        print(i/len(all_diseases)*100,'%')

4.7281323877068555 %
9.456264775413711 %
14.184397163120568 %
18.912529550827422 %
23.64066193853428 %
28.368794326241137 %
33.09692671394799 %
37.825059101654844 %
42.5531914893617 %
47.28132387706856 %
52.00945626477541 %
56.73758865248227 %
61.46572104018912 %
66.19385342789597 %
70.92198581560284 %
75.65011820330969 %
80.37825059101655 %
85.1063829787234 %
89.83451536643025 %
94.56264775413712 %
99.29078014184397 %
100.0 %
6.825938566552901 %
13.651877133105803 %
20.477815699658702 %
27.303754266211605 %
34.129692832764505 %
40.955631399317404 %
47.781569965870304 %
54.60750853242321 %
61.43344709897611 %
68.25938566552901 %
75.0853242320819 %
81.91126279863481 %
88.73720136518772 %
95.56313993174061 %
100.0 %


In [48]:
print(Adj.shape)
print(A.shape)

A.to_csv("./saved_tables/save_A.csv")

(1229, 1519)
(1465, 2115)


### Similarity matrices

Using functions written in the *similarity_and_weight.py* file

In [49]:
from similarity_and_weight import *

#### ssCMap similarity metric (drug-drug and disease-disease)

$$C(R_1,R_2) = \frac{ \sum_{i=1}^N R_1(g_i)R_2(g_i)}{ \sum_{i=1}^N (N−i+ 1)^2}$$
where $N$ is the number of genes is the signatures.<br/>
Used for drug-drug and disease-disease similarity.


The function *sscmap_sim* iw written in *similarity_and_weight.py* and uses another function computing the ranks of the genes for the input siganture table.

In [50]:
S_drug=sscmap_sim(drug_sign)
print('done')
display(S_drug.head())


S_disease=sscmap_sim(disease_phen)
print('done')
display(S_disease.head())

done


Unnamed: 0,"methyl 2,5-dihydroxycinnamate",compound 10,gsk-3-inhibitor-ii,hy-10456,raf 265,c3930,pac 1,az20,brd-k44432556,32937,...,pha-793887,f3103-0039,beta-escin,clofarabine,pik-90,dovitinib,brd-k24632213,betamethasone acetate,brd-k17953061,"2,4-dideoxy-dc-45-a2"
"methyl 2,5-dihydroxycinnamate",1.0,-0.079078,0.011069,0.045914,0.01325,0.054595,0.066196,0.005727,0.058488,0.013551,...,0.010054,0.009653,-0.04354,-0.048055,0.120992,0.007086,-0.155303,-0.006056,0.042996,-0.120609
compound 10,-0.079078,1.0,0.00128,-0.034669,0.001418,-0.025445,0.171377,-0.005865,0.162315,-0.014576,...,0.000791,0.061062,0.008925,-0.057728,0.040739,0.006312,-0.034556,0.048947,0.102945,-0.008408
gsk-3-inhibitor-ii,0.011069,0.00128,1.0,-0.023739,-0.032191,0.005975,-0.009017,-0.031645,0.001386,0.013682,...,-0.034424,-0.010489,-0.01013,-0.003687,-0.008613,0.110046,-0.003124,0.001482,0.020082,-0.009423
hy-10456,0.045914,-0.034669,-0.023739,1.0,-0.002892,-0.008449,-0.069491,-0.007576,-0.054895,0.053289,...,-0.003694,-0.025658,0.014702,0.040022,0.133916,-0.005931,-0.061039,0.064056,0.123981,0.018226
raf 265,0.01325,0.001418,-0.032191,-0.002892,1.0,-0.006833,-0.009467,0.083214,0.009237,-0.004891,...,0.103247,0.00367,-0.002867,0.000518,0.005358,-0.01528,0.012512,-0.001413,-0.00808,-0.002894


done


Unnamed: 0,c0153195,c0016053,c0018916,c0346993,c0349790,c0376358,c0520577,c0037011,c0473527,c0036202,...,c0020428,c1274470,c0038363,c0278553,c0152973,c0004352,c0004096,c0149776,c0034065,c0038358
c0153195,1.0,0.026403,0.005408,0.013999,0.001326,0.006735,0.008344,0.014625,-0.001432,0.001619,...,-0.016828,0.01498,-0.016631,0.00129,-0.004807,-0.008281,0.006655,-0.018248,0.020208,-0.015558
c0016053,0.026403,1.0,0.009996,0.003622,-0.005052,0.002038,-0.0084,-0.026293,-0.016667,-0.013361,...,-0.008194,0.005168,-0.003547,-0.003051,-0.001789,-0.021669,-0.028688,0.063043,0.047896,0.032517
c0018916,0.005408,0.009996,1.0,0.009578,0.001619,0.0034,-0.003032,-0.011772,-0.004865,-0.016677,...,0.035639,0.008438,0.010342,-0.015363,-0.009806,0.003343,-0.008647,-0.001696,0.056651,0.030648
c0346993,0.013999,0.003622,0.009578,1.0,-0.014217,0.032665,-0.054179,-0.001323,0.001586,0.010418,...,0.047255,0.042233,0.010452,0.024338,-0.01587,0.006824,-0.025995,0.015571,0.031778,-0.003041
c0349790,0.001326,-0.005052,0.001619,-0.014217,1.0,-0.00367,0.008683,0.005183,0.005219,-0.01324,...,-0.052735,-0.006665,0.008812,-0.013019,0.006772,-0.006445,-0.008218,0.036542,-0.019089,0.028623


In [51]:
S_drug.to_csv('./saved_tables/save_drug_sim.csv')
S_disease.to_csv('./saved_tables/save_disease_sim.csv')

### Changing the scale and mean to get coefficient between 0 and 1
Advantages :
    - It is distance-like (positivity, symmetricity, triangle inequality)
    - 0 is a worst-case scenario, which makes sens for the following step of filling with 0 fo unkwown drugs and diseases
    
    
In this format, a score below 0.5 means a negative correlation between the 2 drugs or diseases.


In [52]:
S_drug=(S_drug+1)/2
S_disease=(S_disease+1)/2

### Filling the similarity matrices with 0
Adding columns and lines of 0 (and 1 on the diagonal) for diseases and drugs that were in one table and not another

S_drug=pd.read_csv('save_drug_sim.csv',index_col=0)
S_disease=pd.read_csv('save_disease_sim.csv',index_col=0)

In [53]:
S_drug_0=S_drug.copy()

i=0
for drug in all_drugs:
    if drug not in S_drug_0.columns:
        S_drug_0[drug]=0
        size=len(S_drug_0.columns)
        S_drug_0.loc[drug]=[0]*(size-1)+[1]

    i+=1
    if i%200==0 or  i==len(all_drugs):
        print(i/len(all_drugs)*100,'%')

9.456264775413711 %
18.912529550827422 %
28.368794326241137 %
37.825059101654844 %
47.28132387706856 %
56.73758865248227 %
66.19385342789597 %
75.65011820330969 %
85.1063829787234 %
94.56264775413712 %
100.0 %


In [54]:
S_disease_0=S_disease.copy()

i=0
for disease in all_diseases:
    if disease not in S_disease_0.index:
        S_disease_0[disease]=0
        size=len(S_disease_0.columns)
        S_disease_0.loc[disease]=[0]*(size-1)+[1]

    i+=1
    if i%200==0 or i==len(all_diseases):
        print(i/len(all_diseases)*100,'%')


13.651877133105803 %
27.303754266211605 %
40.955631399317404 %
54.60750853242321 %
68.25938566552901 %
81.91126279863481 %
95.56313993174061 %
100.0 %


In [55]:
S_drug_0.to_csv('./saved_tables/save_drug_sim_norm.csv')
S_disease_0.to_csv('./saved_tables/save_disease_sim_norm.csv')

#### Example visualization

In [1]:
import seaborn as sns

plt.figure(figsize=(8,8))
plt.imshow(S_drug.iloc[25:45,25:45],cmap='gray', linewidths=.5)

plt.rcParams['xtick.bottom'] = plt.rcParams['xtick.labelbottom'] = False
plt.rcParams['xtick.top'] = plt.rcParams['xtick.labeltop'] = True

plt.xticks(np.arange(20), list(S_drug.iloc[25:45,25:45].columns),rotation=90)
plt.yticks(np.arange(20), list(S_drug.iloc[25:45,25:45].columns))

#plt.title('Similarity between 20 drugs',y=1.25,fontsize=16)
plt.show()

NameError: name 'plt' is not defined

### Reordering lines and columns of the tables the same way

In [56]:
A = A.reindex(sorted(A.columns), axis=1)
A = A.reindex(sorted(A.index), axis=0)


S_drug_0 = S_drug_0.reindex(sorted(S_drug_0.columns), axis=1)
S_drug_0 = S_drug_0.reindex(sorted(S_drug_0.index), axis=0)

S_disease_0 = S_disease_0.reindex(sorted(S_disease_0.columns), axis=1)
S_disease_0 = S_disease_0.reindex(sorted(S_disease_0.index), axis=0)


A.to_csv('./saved_tables/save_A.csv')
S_drug_0.to_csv('./saved_tables/save_drug_sim_norm.csv')
S_disease_0.to_csv('./saved_tables/save_disease_sim_norm.csv')

### Saving for c++ treatment
A_for_c=A.copy()
A_for_c['endline']='.'
A_for_c.to_csv("./C++ script/saved_tables/save_A.csv",index=False,header=False)

S_drug_for_c=S_drug_0.copy()
S_drug_for_c['endline']='.'
S_drug_for_c.to_csv("./C++ script/saved_tables/save_drug_sim_norm.csv",index=False,header=False)

### Final similarity matrix
As a combination of the drug-drug similarity matrix and the one ($S^{td}$) computed by considering the connections between diseases passing through drugs :

$$S^{td}_{ij} =\frac{\sum_{k=1}^{N_d} \sum_{l=1}^{N_d} (a_{il} a_{jk} s^d_{lk}) }{\sum_{k=1}^{N_d} \sum_{l=1}^{N_d} (a_{il} a_{jk})}$$

Final matrix S such that:
$$S_{ij}= \alpha s^t_{ij} + (1-\alpha)s^{td}_{ij}$$

#### Computing $S^{td}$

The *A_keep* and *S_keep* arguments are there to simplify the computation by not looking at components of the matrices that we know are 0.

With the high volume of the matrices, the computation was done with a C++ script, and the result stored in a *.csv* table. We must add the inices and column names since it was not supported by C++.

In [6]:
A=pd.read_csv('./saved_tables/save_A.csv',index_col=0)

In [7]:
S_td=pd.read_csv("./C++ script/saved_tables/save_connection_sim.csv",header=None)
S_td.columns=A.index
S_td.index=A.index
display(S_td.head(),S_td.shape)

ind_id,c0000810,c0001126,c0001144,c0001206,c0001207,c0001261,c0001263,c0001264,c0001403,c0001418,...,c3897513,c3897515,c3897516,c3897517,c3897520,c3897522,c3897523,c3898069,c4038730,c4083212
ind_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
c0000810,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,...,0,0,0,0,0,0,0,0.0,0.0,0.0
c0001126,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,...,0,0,0,0,0,0,0,0.0,0.0,0.0
c0001144,0.0,0.0,0.041651,0.0,0.0,0.025,0.0,0.0,0.015846,0,...,0,0,0,0,0,0,0,0.0,0.0,0.0
c0001206,0.0,0.0,0.0,0.2,0.2,0.0,0.0,0.0,0.0,0,...,0,0,0,0,0,0,0,0.0,0.0,0.0
c0001207,0.0,0.0,0.0,0.2,1.0,0.0,0.0,0.0,0.0,0,...,0,0,0,0,0,0,0,0.0,0.0,0.0


(1465, 1465)

#### Computing the final S depending on $\mathbb{\alpha}$

Computing 10 versions with different $\alpha$ so we could compare the results.

In [9]:
S_disease_0=pd.read_csv('./saved_tables/save_disease_sim_norm.csv',index_col=0)

In [13]:
alphas=[0.05,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]

They are modifed so they can be easily used by C++: no header, no index and an additionnal column that is used to indicate the end of a row

In [19]:
for alpha in alphas:
    S= alpha * S_disease_0 + (1-alpha) * S_td
    S['endline']='.'
    S.to_csv("./C++ script/S_tables/S_"+'{:.2f}'.format(round(alpha, 2))+".csv",index=False,header=False)
    
display(S.head())

Unnamed: 0,c0000810,c0001126,c0001144,c0001206,c0001207,c0001261,c0001263,c0001264,c0001403,c0001418,...,c3897515,c3897516,c3897517,c3897520,c3897522,c3897523,c3898069,c4038730,c4083212,endline
c0000810,0.933333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,.
c0001126,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,.
c0001144,0.0,0.0,0.904165,0.0,0.0,0.0025,0.0,0.0,0.001585,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,.
c0001206,0.0,0.0,0.0,0.92,0.02,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,.
c0001207,0.0,0.0,0.0,0.02,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,.


### The computations of the weight matrices is done in C++ for computational reasons
(810 matrices of size 1465x1465 to compute)