In [1]:
from scipy.spatial.distance import squareform, cdist
from functools import partial

import os
import graco
import numpy as np
import pandas as pd
import networkx as nx

In [2]:
pd.set_option("display.max_columns", 50)

DATA_DIRECTORY = "/media/clusterduck123/joe/data"
HUMAN_DIRECTORY = f"{DATA_DIRECTORY}/processed-data/organisms/human"
NETWORK_DIRECTORY = f"{HUMAN_DIRECTORY}/networks"
MATRIX_DIRECTORY  = f"{HUMAN_DIRECTORY}/distance-matrices"

# Distance matrices

In [3]:
PPI_nx = nx.read_edgelist(f"{NETWORK_DIRECTORY}/PPI_BioGRID.txt")
GDV = graco.orbits(PPI_nx)
GCV = graco.coefficients(GDV)

In [4]:
def normalizer(distance, length):
    if   distance == 'normalized1_l1'  : return length         
    elif distance == 'normalized1_l2'  : return np.sqrt(length)
    elif distance == 'normalized1_linf': return 1              
    
    elif distance == 'cityblock'   : return 2         
    elif distance == 'euclidean'   : return np.sqrt(2)
    elif distance == 'sqeuclidean' : return 2         
    elif distance == 'chebyshev'   : return 1         
        
    elif distance == 'cosine'      : return 1
    elif distance == 'correlation' : return 2
        
    elif distance == 'canberra'   : return length
    elif distance == 'braycurtis' : return 1     
     
#    'seuclidean', , 'mahalanobis'

## GDV

In [4]:
if not os.path.exists(f"{MATRIX_DIRECTORY}/GDV"):
    os.makedirs(f"{MATRIX_DIRECTORY}/GDV/")

### graco

In [7]:
# GDV similarity
D = graco.distances.GDV_similarity(GDV)
np.savetxt(f"{MATRIX_DIRECTORY}/GDV/GDV-similarity_BioGRID.txt", D, 
           fmt='%.7f', header=' '.join(PPI_nx), comments='')

In [8]:
# Normalized L_p
D = graco.distances.normalized1_lp(GDV,1)
np.savetxt(f"{MATRIX_DIRECTORY}/GDV/normalized1-l1_BioGRID.txt", D, 
           fmt='%.7f', header=' '.join(PPI_nx), comments='')

D = graco.distances.normalized1_lp(GDV,2)
np.savetxt(f"{MATRIX_DIRECTORY}/GDV/normalized1-l2_BioGRID.txt", D, 
           fmt='%.7f', header=' '.join(PPI_nx), comments='')

D = graco.distances.normalized1_lp(GDV,np.inf)
np.savetxt(f"{MATRIX_DIRECTORY}/GDV/normalized1-linf_BioGRID.txt", D, 
           fmt='%.7f', header=' '.join(PPI_nx), comments='')

### cdist

In [9]:
all_distances = ['euclidean', 'cityblock', 'seuclidean', 'sqeuclidean', 
                 'cosine', 'correlation', 'chebyshev', 'canberra', 
                 'braycurtis', 'mahalanobis']

In [10]:
for distance in all_distances:
    D = cdist(GDV.values, GDV.values, distance)
    np.savetxt(f"{MATRIX_DIRECTORY}/GDV/{distance}_BioGRID.txt", D, 
               fmt='%.7f', header=' '.join(PPI_nx), comments='')

# GCV-D0

In [7]:
GCV_D0 = GCV['D']

if not os.path.exists(f"{MATRIX_DIRECTORY}/GCV-D0"):
    os.makedirs(f"{MATRIX_DIRECTORY}/GCV-D0/")

In [8]:
source_D0 = sorted({source for source,target in GCV_D0.columns})

### graco

In [7]:
# Normalized1 L_1
D_all = np.zeros([len(GCV_D0), len(GCV_D0)])
for source in source_D0:
    length = len(GCV_D0[source].T)
    D_i = graco.distances.normalized1_lp(GCV_D0[source], 1) / normalizer('normalized1_l1',length)
    D_i = np.nan_to_num(D_i, nan=0.0)
    D_all += D_i
    print(length, np.max(D_i))
    
np.savetxt(f"{MATRIX_DIRECTORY}/GCV-D0/normalized1-l1_BioGRID.txt", D_all/len(source_D0), 
           fmt='%.7f', header=' '.join(PPI_nx), comments='')

2 1.0
4 1.0
3 1.0
3 1.0


In [9]:
# Normalized1 L_2
D_all = np.zeros([len(GCV_D0), len(GCV_D0)])
for source in source_D0:
    length = len(GCV_D0[source].T)
    D_i = graco.distances.normalized1_lp(GCV_D0[source], 2) / normalizer('normalized1_l2',length)
    D_i = np.nan_to_num(D_i, nan=0.0)
    D_all += D_i
    print(length, np.max(D_i))
    
np.savetxt(f"{MATRIX_DIRECTORY}/GCV-D0/normalized1-l2_BioGRID.txt", D_all/len(source_D0), 
           fmt='%.7f', header=' '.join(PPI_nx), comments='')

2 0.9999974810218273
4 1.0
3 0.9999995337498914
3 0.9999995337498914


In [10]:
# Normalized1 L_inf
D_all = np.zeros([len(GCV_D0), len(GCV_D0)])
for source in source_D0:
    length = len(GCV_D0[source].T)
    D_i = graco.distances.normalized1_lp(GCV_D0[source], np.inf) / normalizer('normalized1_linf',length)
    D_i = np.nan_to_num(D_i, nan=0.0)
    D_all += D_i
    print(length, np.max(D_i))
    
np.savetxt(f"{MATRIX_DIRECTORY}/GCV-D0/normalized1-linf_BioGRID.txt", D_all/len(source_D0), 
           fmt='%.7f', header=' '.join(PPI_nx), comments='')

2 1.0
4 1.0
3 1.0
3 1.0


### cdist

In [12]:
all_distances = ['cityblock', 'euclidean', 'sqeuclidean', 'chebyshev',
                 'cosine', 'correlation', 
                 'canberra', 'braycurtis', 
                 #'mahalanobis', 'seuclidean'
                ]

In [13]:
for distance in all_distances:
    print(distance)
    D_all = np.zeros([len(GCV_D0), len(GCV_D0)])
    for source in source_D0:
        length = len(GCV_D0[source].T)
        D_i = cdist(GCV_D0[source].values, GCV_D0[source].values, distance) / normalizer(distance,length)
        D_i = np.nan_to_num(D_i, nan=0.0)
        D_all += D_i
        print(length, np.max(D_i))

    np.savetxt(f"{MATRIX_DIRECTORY}/GCV-D0/{distance}_BioGRID.txt", D_all/len(source_D0), 
               fmt='%.7f', header=' '.join(PPI_nx), comments='')
    print()

cityblock
2 1.0
4 1.0
3 1.0
3 1.0

euclidean
2 1.0
4 0.9988459376986899
3 1.0
3 1.0

sqeuclidean
2 1.0
4 0.9976932072571753
3 1.0
3 1.0

chebyshev
2 1.0
4 1.0
3 1.0
3 1.0

cosine
2 1.0
4 1.0
3 1.0
3 1.0

correlation
2 1.0
4 0.9998535209089816
3 1.0
3 1.0

canberra
2 1.0
4 1.0
3 1.0
3 1.0

braycurtis
2 1.0
4 1.0
3 1.0
3 1.0



## GCV-A

In [5]:
GCV_A = GCV['A']

if not os.path.exists(f"{MATRIX_DIRECTORY}/GCV-A"):
    os.makedirs(f"{MATRIX_DIRECTORY}/GCV-A/")

In [6]:
source_A = sorted({source for source,target in GCV_A.columns})

### graco

In [9]:
# Normalized1 L_1
D_dict = {}
for source in source_A:
    length = len(GCV_A[source].T)
    D = graco.distances.normalized1_lp(GCV_A[source], 1) / normalizer('normalized1_l1',length)
    D_dict[source] = D 
    print(length, np.max(D))
    
D = np.nanmean([D_i for D_i in D_dict.values()], axis=0)
np.savetxt(f"{MATRIX_DIRECTORY}/GCV-A/normalized1-l1_BioGRID.txt", D, 
           fmt='%.7f', header=' '.join(PPI_nx), comments='')

2 0.83986
3 0.9640933333333334
4 0.9771975
3 0.9869866666666667


In [10]:
# Normalized1 L_2
D_dict = {}
for source in source_A:
    length = len(GCV_A[source].T)
    D = graco.distances.normalized1_lp(GCV_A[source], 2) / normalizer('normalized1_l2',length)
    D_dict[source] = D 
    print(length, np.max(D))
    
D = np.nanmean([D_i for D_i in D_dict.values()], axis=0)
np.savetxt(f"{MATRIX_DIRECTORY}/GCV-A/normalized1-l2_BioGRID.txt", D, 
           fmt='%.7f', header=' '.join(PPI_nx), comments='')

2 0.8549910934039021
3 0.9654277996308165
4 0.977995
3 0.9871592637631141


In [11]:
# Normalized1 L_inf
D_dict = {}
for source in source_A:
    length = len(GCV_A[source].T)
    D = graco.distances.normalized1_lp(GCV_A[source], np.inf) / normalizer('normalized1_linf',length)
    D_dict[source] = D 
    print(length, np.max(D))
    
D = np.nanmean([D_i for D_i in D_dict.values()], axis=0)
np.savetxt(f"{MATRIX_DIRECTORY}/GCV-A/normalized1-linf_BioGRID.txt", D, 
           fmt='%.7f', header=' '.join(PPI_nx), comments='')

2 1.0
3 1.0
4 1.0
3 1.0


### cdist

In [7]:
all_distances = ['cityblock', 'euclidean', 'sqeuclidean', 'chebyshev',
                 'cosine', 'correlation', 
                 'canberra', 'braycurtis', 
                 #'mahalanobis', 'seuclidean'
                ]

In [10]:
for distance in all_distances:
    print(distance)
    D_dict = {}
    for source in source_A:
        length = len(GCV_A[source].T)
        D = cdist(GCV_A[source].values, GCV_A[source].values, distance) / normalizer(distance,length)
        D_dict[source] = D 
        print(length, np.max(D))

    D = np.nanmean([D_i for D_i in D_dict.values()], axis=0)
    np.savetxt(f"{MATRIX_DIRECTORY}/GCV-A/{distance}_BioGRID.txt", D, 
               fmt='%.7f', header=' '.join(PPI_nx), comments='')
    print()

cityblock
2 0.8093264320893911
3 0.9430716971275914
4 0.769331987867176
3 0.6719674409720175

euclidean
2 0.8093264320893909
3 0.8264211655912044
4 0.7252102456752573
3 0.7199663059860392

sqeuclidean
2 0.6550092736785438
3 0.6829719429371248
4 0.525929900432367
3 0.518351481755183

chebyshev
2 0.8093264320893911
3 0.9430716971275914
4 0.9522149563282507
3 0.9800902969906579

cosine
2 0.7706828246032714
3 0.917813990757563
4 0.875548762584852
3 0.92802683899653

correlation
2 1.0
3 0.9999987195021924
4 0.7803105799574437
3 0.9999929541007613

canberra
2 0.8398607535689212
3 0.9640919806842181
4 0.9771971150187941
3 0.9869859701301028

braycurtis
2 0.8093264320893911
3 0.9430716971275914
4 0.9409390942152436
3 0.971223643991514



# GCV-D based

In [5]:
GCV_D = GCV['D']

if not os.path.exists(f"{MATRIX_DIRECTORY}/GCV-D"):
    os.makedirs(f"{MATRIX_DIRECTORY}/GCV-D/")

In [6]:
source_D = sorted({source for source,target in GCV_D.columns})

### graco

In [8]:
# Normalized1 L_1
D_dict = {}
for source in source_D:
    length = len(GCV_D[source].T)
    D = graco.distances.normalized1_lp(GCV_D[source], 1) / normalizer('normalized1_l1',length)
    D_dict[source] = D 
    print(length, np.nanmax(D))
    
D = np.nanmean([D_i for D_i in D_dict.values()], axis=0)
np.savetxt(f"{MATRIX_DIRECTORY}/GCV-D/normalized1-l1_BioGRID.txt", D, 
           fmt='%.7f', header=' '.join(PPI_nx), comments='')

2 1.0
4 1.0
3 1.0
3 1.0


  if __name__ == '__main__':


In [11]:
# Normalized1 L_2
D_dict = {}
for source in source_D:
    length = len(GCV_D[source].T)
    D = graco.distances.normalized1_lp(GCV_D[source], 2) / normalizer('normalized1_l2',length)
    D_dict[source] = D 
    print(length, np.nanmax(D))
    
D = np.nanmean([D_i for D_i in D_dict.values()], axis=0)
np.savetxt(f"{MATRIX_DIRECTORY}/GCV-D/normalized1-l2_BioGRID.txt", D, 
           fmt='%.7f', header=' '.join(PPI_nx), comments='')

2 0.9999974810218273
4 1.0
3 0.9999995337498914
3 0.9999995337498914


  if __name__ == '__main__':


In [12]:
# Normalized1 L_inf
D_dict = {}
for source in source_D:
    length = len(GCV_D[source].T)
    D = graco.distances.normalized1_lp(GCV_D[source], np.inf) / normalizer('normalized1_linf',length)
    D_dict[source] = D 
    print(length, np.nanmax(D))
    
D = np.nanmean([D_i for D_i in D_dict.values()], axis=0)
np.savetxt(f"{MATRIX_DIRECTORY}/GCV-D/normalized1-linf_BioGRID.txt", D, 
           fmt='%.7f', header=' '.join(PPI_nx), comments='')

2 1.0
4 1.0
3 1.0
3 1.0


  if __name__ == '__main__':


# GCV-G

In [5]:
GCV_G = GCV['G']

if not os.path.exists(f"{MATRIX_DIRECTORY}/GCV-G"):
    os.makedirs(f"{MATRIX_DIRECTORY}/GCV-G/")

In [6]:
source_G = sorted({source for source,target in GCV_G.columns})

### graco

In [7]:
# Normalized1 L_1
D_dict = {}
for source in source_G:
    length = len(GCV_G[source].T)
    D = graco.distances.normalized1_lp(GCV_G[source], 1) / normalizer('normalized1_l1',length)
    D_dict[source] = D 
    print(length, np.nanmax(D))
    
D = np.nanmean([D_i for D_i in D_dict.values()], axis=0)
np.savetxt(f"{MATRIX_DIRECTORY}/GCV-G/normalized1-l1_BioGRID.txt", D, 
           fmt='%.7f', header=' '.join(PPI_nx), comments='')

2 0.83986
4 0.91754
4 0.875085
4 1.0
4 1.0


In [8]:
# Normalized1 L_2
D_dict = {}
for source in source_G:
    length = len(GCV_G[source].T)
    D = graco.distances.normalized1_lp(GCV_G[source], 2) / normalizer('normalized1_l2',length)
    D_dict[source] = D 
    print(length, np.nanmax(D))
    
D = np.nanmean([D_i for D_i in D_dict.values()], axis=0)
np.savetxt(f"{MATRIX_DIRECTORY}/GCV-G/normalized1-l2_BioGRID.txt", D, 
           fmt='%.7f', header=' '.join(PPI_nx), comments='')

2 0.8549910934039021
4 0.92787
4 0.894845
4 1.0
4 1.0


In [9]:
# Normalized1 L_inf
D_dict = {}
for source in source_G:
    length = len(GCV_G[source].T)
    D = graco.distances.normalized1_lp(GCV_G[source], np.inf) / normalizer('normalized1_linf',length)
    D_dict[source] = D 
    print(length, np.nanmax(D))
    
D = np.nanmean([D_i for D_i in D_dict.values()], axis=0)
np.savetxt(f"{MATRIX_DIRECTORY}/GCV-G/normalized1-linf_BioGRID.txt", D, 
           fmt='%.7f', header=' '.join(PPI_nx), comments='')

2 1.0
4 1.0
4 1.0
4 1.0
4 1.0


### cdist

In [7]:
all_distances = ['cityblock', 'euclidean', 'sqeuclidean', 'chebyshev',
                 'cosine', 'correlation', 
                 'canberra', 'braycurtis', 
                 #'mahalanobis', 'seuclidean'
                ]

In [None]:
distance = chebyshev
D_dict = {}
for source in source_G:
    length = len(GCV_G[source].T)
    D = cdist(GCV_G[source].values, GCV_G[source].values, distance) / normalizer(distance,length)
    D_dict[source] = D 
    print(length, np.nanmax(D), np.isnan(D).any())

D = np.nanmean([D_i for D_i in D_dict.values()], axis=0)
np.savetxt(f"{MATRIX_DIRECTORY}/GCV-G/{distance}_BioGRID.txt", D, 
           fmt='%.7f', header=' '.join(PPI_nx), comments='')
print()

In [18]:
source = source_G[3]

In [52]:
distance = 'chebyshev'

D_all = pd.DataFrame(0., index=GCV_G.index, columns=GCV_G.index)
Count = pd.DataFrame(0 , index=GCV_G.index, columns=GCV_G.index)

length = len(GCV_G[source].T)

In [58]:
assert (GCV_G[source].T.isna().any() == GCV_G[source].T.isna().all()).all()
D_i = pd.DataFrame(cdist(GCV_G[source].values, GCV_G[source].values, distance) / normalizer(distance,length),
                   index=GCV_G.index, columns=GCV_G.index)

In [62]:
assert (GCV_G[source].T.isna().any() == GCV_G[source].T.isna().all()).all()
mask_index = GCV_G.index[~GCV_G[source].T.isna().any()]

In [67]:
D_all.loc[mask_index,mask_index] += D_i.loc[mask_index,mask_index]

In [69]:
D_

Unnamed: 0,YLR418C,YOL145C,YOR123C,YBR279W,YML069W,YGL244W,YGL207W,YDR167W,YDL140C,YGR104C,YAL021C,YOL051W,YDR138W,YGR005C,YML010W,YIL035C,YLR150W,YNL189W,YMR186W,YBR221C,YGL017W,YGR043C,YGR252W,YJR148W,YOR061W,...,YJL062W,YNL130C,YNL320W,YNL336W,YNR061C,YOL092W,YOR067C,YJL077W-A,YDL159W-A,YMR170C,YNL067W-B,YGR204C-A,YLR157C,YPL223C,YJL077W-B,YAL067C,YMR158C-A,YJL077C,YNL146C-A,YGL188C-A,YFR018C,YBR056W-A,YOR278W,YCR010C,YBL029C-A
YLR418C,0.000000,0.021066,0.020024,0.019538,0.003553,0.011633,0.047340,0.005836,0.044163,0.021083,0.445725,0.017348,0.037850,0.024923,0.071713,0.052942,0.013274,0.023243,0.265731,0.016132,0.071381,0.113822,0.119176,0.034335,0.028644,...,0.214102,0.120994,0.0,0.116801,0.163777,0.0,0.0,0.0,0.0,0.033480,0.058055,0.0,0.110804,0.0,0.134106,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
YOL145C,0.021066,0.000000,0.023240,0.001528,0.022282,0.013365,0.068406,0.021659,0.065229,0.042149,0.466791,0.008713,0.016785,0.003857,0.092779,0.074007,0.034339,0.035130,0.286796,0.019287,0.050315,0.092756,0.140242,0.031251,0.049710,...,0.235168,0.142060,0.0,0.095736,0.184842,0.0,0.0,0.0,0.0,0.030396,0.061544,0.0,0.089738,0.0,0.113040,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
YOR123C,0.020024,0.023240,0.000000,0.021712,0.023577,0.009875,0.045166,0.025860,0.045548,0.033082,0.443551,0.025652,0.040024,0.027097,0.069539,0.050768,0.025285,0.039529,0.263557,0.004352,0.073555,0.115996,0.117002,0.014311,0.031294,...,0.211928,0.118820,0.0,0.118975,0.161603,0.0,0.0,0.0,0.0,0.013456,0.038304,0.0,0.112978,0.0,0.136280,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
YBR279W,0.019538,0.001528,0.021712,0.000000,0.020754,0.011837,0.066878,0.020131,0.063701,0.040621,0.465263,0.010039,0.018313,0.005385,0.091251,0.072480,0.032811,0.034186,0.285268,0.017759,0.051843,0.094284,0.138714,0.029924,0.048182,...,0.233640,0.140532,0.0,0.097263,0.183314,0.0,0.0,0.0,0.0,0.029070,0.060016,0.0,0.091266,0.0,0.114568,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
YML069W,0.003553,0.022282,0.023577,0.020754,0.000000,0.015186,0.046124,0.002294,0.042947,0.019867,0.444509,0.018564,0.039067,0.026140,0.070496,0.051725,0.012057,0.022402,0.264514,0.019685,0.072598,0.115039,0.117959,0.037888,0.027427,...,0.212885,0.119777,0.0,0.118018,0.162560,0.0,0.0,0.0,0.0,0.037033,0.061608,0.0,0.112021,0.0,0.135323,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
YFR018C,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
YBR056W-A,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
YOR278W,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
YCR010C,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
for distance in all_distances:
    print(distance)
    D_dict = {}
    for source in source_G:
        length = len(GCV_G[source].T)
        D = cdist(GCV_G[source].values, GCV_G[source].values, distance) / normalizer(distance,length)
        D_dict[source] = D 
        print(length, np.nanmax(D), np.isnan(D).any())

    D = np.nanmean([D_i for D_i in D_dict.values()], axis=0)
    np.savetxt(f"{MATRIX_DIRECTORY}/GCV-G/{distance}_BioGRID.txt", D, 
               fmt='%.7f', header=' '.join(PPI_nx), comments='')
    print()

cityblock
2 0.8093264320893911 False
4 0.8075409455443076 False
4 0.7549106801870417 False
4 0.6151361314930966 True
4 0.7887025900275332 True

euclidean
2 0.8093264320893909 False
4 0.8061449172901909 False
4 0.7417992051710224 False
4 0.6392917858289597 True
4 0.7615003551376018 True

sqeuclidean
2 0.6550092736785438 False
4 0.6498696276728086 False
4 0.5502660607923605 False
4 0.40869398742838065 True
4 0.5798827908746939 True

chebyshev
2 0.8093264320893911 False
4 0.8075409455443076 False
4 0.7533777915944772 False
4 0.8201815086574622 False
4 0.7887025900275332 False

cosine
2 0.7706828246032714 False
4 0.8999994436765109 False
4 0.7907524261027041 False
4 0.6269304238392925 True
4 0.8829081117925971 True

correlation
2 1.0 False
4 0.7743774448536456 False
4 0.7316981676689341 False
4 0.603187078718982 True
4 0.7739425932745478 True

canberra
2 0.8398607535689212 False
4 0.9175402415759402 False
4 0.8750855972220317 False
4 0.9237940019324842 False
4 0.9127805408346124 False

bra

# GCV-AD

In [8]:
GCV_AD = GCV[['A','D']]

if not os.path.exists(f"{MATRIX_DIRECTORY}/GCV-AD"):
    os.makedirs(f"{MATRIX_DIRECTORY}/GCV-AD/")

In [9]:
order_source_AD = sorted({(order,source) for order,source,target in GCV_AD.columns})

### graco

In [10]:
# Normalized1 L_1
D_dict = {}
for order,source in order_source_AD:
    length = len(GCV_AD[order][source].T)
    D = graco.distances.normalized1_lp(GCV_AD[order][source], 1) / normalizer('normalized1_l1',length)
    D_dict[(order,source)] = D 
    print(length, np.nanmax(D))
    
D = np.nanmean([D_i for D_i in D_dict.values()], axis=0)
np.savetxt(f"{MATRIX_DIRECTORY}/GCV-AD/normalized1-l1_BioGRID.txt", D, 
           fmt='%.7f', header=' '.join(PPI_nx), comments='')

2 0.83986
3 0.9640933333333334
4 0.9771975
3 0.9869866666666667
2 1.0
4 1.0
3 1.0
3 1.0


In [13]:
# Normalized1 L_2
D_dict = {}
for order,source in order_source_AD:
    length = len(GCV_AD[order][source].T)
    D = graco.distances.normalized1_lp(GCV_AD[order][source], 2) / normalizer('normalized1_l2',length)
    D_dict[(order,source)] = D 
    print(length, np.nanmax(D))
    
D = np.nanmean([D_i for D_i in D_dict.values()], axis=0)
np.savetxt(f"{MATRIX_DIRECTORY}/GCV-AD/normalized1-l2_BioGRID.txt", D, 
           fmt='%.7f', header=' '.join(PPI_nx), comments='')

2 0.8549910934039021
3 0.9654277996308165
4 0.977995
3 0.9871592637631141
2 0.9999974810218273
4 1.0
3 0.9999995337498914
3 0.9999995337498914


In [14]:
# Normalized1 L_inf
D_dict = {}
for order,source in order_source_AD:
    length = len(GCV_AD[order][source].T)
    D = graco.distances.normalized1_lp(GCV_AD[order][source], np.inf) / normalizer('normalized1_linf',length)
    D_dict[(order,source)] = D 
    print(length, np.nanmax(D))
    
D = np.nanmean([D_i for D_i in D_dict.values()], axis=0)
np.savetxt(f"{MATRIX_DIRECTORY}/GCV-AD/normalized1-linf_BioGRID.txt", D, 
           fmt='%.7f', header=' '.join(PPI_nx), comments='')

2 1.0
3 1.0
4 1.0
3 1.0
2 1.0
4 1.0
3 1.0
3 1.0


# GCV-DG

In [5]:
GCV_DG = GCV[['D', 'G']]

if not os.path.exists(f"{MATRIX_DIRECTORY}/GCV-DG"):
    os.makedirs(f"{MATRIX_DIRECTORY}/GCV-DG/")

In [6]:
order_source_DG = sorted({(order,source) for order,source,target in GCV_DG.columns})

### graco

In [8]:
# Normalized1 L_1
D_dict = {}
for order,source in order_source_DG:
    length = len(GCV_DG[order][source].T)
    D = graco.distance_matrix.normalized1_lp(GCV_DG[order][source], 1) / normalizer('normalized1_l1',length)
    D_dict[(order,source)] = D 
    print(length, np.nanmax(D))
    
D = np.nanmean([D_i for D_i in D_dict.values()], axis=0)
np.savetxt(f"{MATRIX_DIRECTORY}/GCV-DG/normalized1-l1_BioGRID.txt", D, 
           fmt='%.7f', header=' '.join(PPI_nx), comments='')

2 1.0
4 1.0
3 1.0
3 1.0
2 1.0
4 1.0
4 1.0


MemoryError: Unable to allocate array with shape (17125, 1, 17125) and data type float64

In [None]:
# Normalized1 L_2
D_dict = {}
for order,source in order_source_DG:
    length = len(GCV_DG[order][source].T)
    D = graco.distance_matrix.normalized1_lp(GCV_DG[order][source], 2) / normalizer('normalized1_l2',length)
    D_dict[(order,source)] = D 
    print(length, np.nanmax(D))
    
D = np.nanmean([D_i for D_i in D_dict.values()], axis=0)
np.savetxt(f"{MATRIX_DIRECTORY}/GCV-DG/normalized1-l2_BioGRID.txt", D, 
           fmt='%.7f', header=' '.join(PPI_nx), comments='')

In [None]:
# Normalized1 L_inf
D_dict = {}
for order,source in order_source_DG:
    length = len(GCV_DG[order][source].T)
    D = graco.distance_matrix.normalized1_lp(GCV_DG[order][source], np.inf) / normalizer('normalized1_linf',length)
    D_dict[(order,source)] = D 
    print(length, np.nanmax(D))
    
D = np.nanmean([D_i for D_i in D_dict.values()], axis=0)
np.savetxt(f"{MATRIX_DIRECTORY}/GCV-DG/normalized1-linf_BioGRID.txt", D, 
           fmt='%.7f', header=' '.join(PPI_nx), comments='')