In [1]:
from scipy.spatial.distance import squareform, cdist
from functools import partial

import os
import graco
import numpy as np
import pandas as pd
import networkx as nx

In [2]:
PROCESSEDDATA_DIRECTORY = "/home/clusterduck123/Desktop/git/supplements/data/processed_data/"
PPI_DIRECTORY = f"{PROCESSEDDATA_DIRECTORY}/PPIs"
ORGANISM = "human"

In [3]:
FEATURE_DIRECTORY = f"{PROCESSEDDATA_DIRECTORY}/{ORGANISM}/features"
    
if not os.path.exists(FEATURE_DIRECTORY):
    os.makedirs(FEATURE_DIRECTORY)

# Distance matrices

In [4]:
PPI_nx = nx.read_edgelist(f"{PPI_DIRECTORY}/hs_BioGRID.txt")
GDV = graco.orbits(PPI_nx)
GCV = graco.coefficients(GDV)

## GDV-based

In [5]:
if not os.path.exists(f"{FEATURE_DIRECTORY}/GDV/distance_matrices"):
    os.makedirs(f"{FEATURE_DIRECTORY}/GDV/distance_matrices/")

### graco

In [6]:
# GDV similaritz
D = graco.distances.GDV_similarity(GDV)
np.savetxt(f"{FEATURE_DIRECTORY}/GDV/distance_matrices/GDV-similarity_BioGRID.txt", D, 
           fmt='%.7f', header=' '.join(PPI_nx), comments='')

In [7]:
# Normalized L_p
D = graco.distances.normalized1_lp(GDV,1)
np.savetxt(f"{FEATURE_DIRECTORY}/GDV/distance_matrices/normalized1-l1_BioGRID.txt", D, 
           fmt='%.7f', header=' '.join(PPI_nx), comments='')

D = graco.distances.normalized1_lp(GDV,2)
np.savetxt(f"{FEATURE_DIRECTORY}/GDV/distance_matrices/normalized1-l2_BioGRID.txt", D, 
           fmt='%.7f', header=' '.join(PPI_nx), comments='')

D = graco.distances.normalized1_lp(GDV,np.inf)
np.savetxt(f"{FEATURE_DIRECTORY}/GDV/distance_matrices/normalized1-linf_BioGRID.txt", D, 
           fmt='%.7f', header=' '.join(PPI_nx), comments='')

### cdist

In [8]:
all_distances = ['euclidean', 'cityblock', 'seuclidean', 'sqeuclidean', 
                 'cosine', 'correlation', 'chebyshev', 'canberra', 
                 'braycurtis', 'mahalanobis']

In [9]:
for distance in all_distances:
    D = cdist(GDV.values, GDV.values, distance)
    np.savetxt(f"{FEATURE_DIRECTORY}/GDV/distance_matrices/{distance}_BioGRID.txt", D, 
               fmt='%.7f', header=' '.join(PPI_nx), comments='')

## GCV-based

In [11]:
PPI_nx = nx.read_edgelist(f"{PPI_DIRECTORY}/BioGRID_sc.txt")
GCV = graco.coefficients(PPI_nx)

FileNotFoundError: [Errno 2] No such file or directory: '/home/clusterduck123/Desktop/git/supplements/data/processed_data//PPIs/BioGRID_sc.txt'

### TVD - individual

In [8]:
for order,source in set((order,source) for order,source,target in GCV.columns):
    t1 = time.time()
    D = cdist(np.array(GCV[order][source]), np.array(GCV[order][source]), graco.functions.tvd)
    t2 = time.time()
    print(f'{order}-{source}: {t2-t1:.2f}sec')  
    np.savetxt(f"{MATRIX_DIRECTORY}/sc_BioGRID_{order}GCV{source}_tvd.txt", D, 
               fmt='%.7f', header=' '.join(PPI_nx), comments='')

0-0: 165.49sec
-1-3: 161.96sec
3-3: 162.53sec
1-1: 162.82sec
-1-2: 162.89sec
-1-0: 161.32sec
-1-1: 162.55sec
1-2: 161.95sec
2-1: 160.59sec


### TVD - combination

In [11]:
D_list = []

for order,source in set((order,source) for order,source,target in GCV.columns):
    df = pd.read_csv(f"{MATRIX_DIRECTORY}/sc_BioGRID_{order}GCV{source}_tvd.txt", delimiter=' ')
    D_list.append(np.array(df))

D = np.nanmean(D_list, axis=0)
np.savetxt(f"{MATRIX_DIRECTORY}/sc_BioGRID_GCV_TVD.txt", D, 
               fmt='%.7f', header=' '.join(PPI_nx), comments='')

### Rest

In [7]:
GCV = GCV.fillna(0)
all_distances = ['euclidean', 'cityblock', 'seuclidean', 'sqeuclidean', 
                 'cosine', 'correlation', 'chebyshev', 'canberra', 
                 'braycurtis', 'mahalanobis']

In [16]:
for distance in all_distances:
    D = cdist(GCV.values, GCV.values, distance)
    np.savetxt(f"{MATRIX_DIRECTORY}/sc_BioGRID_GCV_{distance}.txt", D, 
               fmt='%.7f', header=' '.join(PPI_nx), comments='')

### Rest - individual

In [7]:
all_distances = ['euclidean', 'cityblock', 'sqeuclidean', 
                 'cosine', 'correlation', 'chebyshev', 'canberra', 
                 'braycurtis']

In [8]:
for distance in all_distances:
    for order,source in set((order,source) for order,source,target in GCV.columns):
        D = cdist(np.array(GCV[order][source]), np.array(GCV[order][source]), distance)
        print(f'{distance} {order: <2} {source}')  
        np.savetxt(f"{MATRIX_DIRECTORY}/sc_BioGRID_{order}GCV{source}_{distance}.txt", D, 
                   fmt='%.7f', header=' '.join(PPI_nx), comments='')

-1-3: 0.13sec
0-0: 0.07sec
1-1: 0.12sec
3-3: 0.11sec
-1-1: 0.12sec
-1-0: 0.09sec
1-2: 0.11sec
2-1: 0.12sec
-1-2: 0.13sec
-1-3: 0.08sec
0-0: 0.07sec
1-1: 0.09sec
3-3: 0.10sec
-1-1: 0.10sec
-1-0: 0.06sec
1-2: 0.09sec
2-1: 0.10sec
-1-2: 0.09sec
-1-3: 0.22sec
0-0: 0.16sec
1-1: 0.25sec
3-3: 0.35sec
-1-1: 0.23sec
-1-0: 0.21sec
1-2: 0.25sec
2-1: 0.34sec
-1-2: 0.21sec
-1-3: 0.22sec
0-0: 0.07sec
1-1: 0.14sec
3-3: 0.16sec
-1-1: 0.18sec
-1-0: 0.08sec
1-2: 0.16sec
2-1: 0.16sec
-1-2: 0.15sec
-1-3: 0.10sec
0-0: 0.15sec
1-1: 0.22sec
3-3: 0.13sec
-1-1: 0.19sec
-1-0: 0.16sec
1-2: 0.20sec
2-1: 0.11sec
-1-2: 0.18sec
-1-3: 0.19sec
0-0: 0.19sec
1-1: 0.19sec
3-3: 0.21sec
-1-1: 0.11sec
-1-0: 0.22sec
1-2: 0.22sec
2-1: 0.19sec
-1-2: 0.20sec
-1-3: 0.09sec
0-0: 0.14sec
1-1: 0.19sec
3-3: 0.11sec
-1-1: 0.22sec
-1-0: 0.07sec
1-2: 0.16sec
2-1: 0.12sec
-1-2: 0.08sec
-1-3: 0.16sec
0-0: 0.13sec
1-1: 0.21sec
3-3: 0.23sec
-1-1: 0.30sec
-1-0: 0.13sec
1-2: 0.27sec
2-1: 0.32sec
-1-2: 0.18sec
-1-3: 0.19sec
0-0: 0.15sec
1-1: 

### Rest - combination

In [10]:
for distance in all_distances:
    D_list = []
    for order,source in set((order,source) for order,source,target in GCV.columns):
        df = pd.read_csv(f"{MATRIX_DIRECTORY}/sc_BioGRID_{order}GCV{source}_{distance}.txt", delimiter=' ')
        D_list.append(np.array(df))

    D = np.nanmean(D_list, axis=0)
    print(distance)
    np.savetxt(f"{MATRIX_DIRECTORY}/sc_BioGRID_gGCV_{distance}.txt", D, 
                   fmt='%.7f', header=' '.join(PPI_nx), comments='')

euclidean
cityblock
seuclidean
sqeuclidean
cosine
correlation
chebyshev
canberra
braycurtis
mahalanobis


### Normalized $L_p$

In [21]:
def normalized_lp(P,Q,p=1):
    v1 = np.divide(P, P+Q, out=np.zeros_like(P), where=(P+Q)!=0)
    v2 = np.divide(Q, P+Q, out=np.zeros_like(Q), where=(P+Q)!=0)
    return np.linalg.norm(v1-v2,p)

def normalized_l1(P,Q):
    return normalized_lp(P,Q,1)

def normalized_l2(P,Q):
    return normalized_lp(P,Q,2)

def normalized_linf(P,Q):
    return normalized_lp(P,Q,np.inf)

In [67]:
p = 2

In [68]:
list_ = []
for a,b in product(np.linspace(0,1,10), repeat=2):
    v1 = np.array([a,1-b])
    v2 = np.array([b,1-b])
    
    v1 = np.where(v1<0,0,v1)
    v2 = np.where(v2<0,0,v2)
    
    list_.append(normalized_lp(v1,v2,p))
print(max(list_))

1.0


In [69]:
list_ = []
for a,b,c,d in product(np.linspace(0,1,10), repeat=4):
    if (a+b>1) or (c+d)>1:
        continue
    else:
        v1 = np.array([a,b,1-a-b])
        v2 = np.array([c,d,1-c-d])
        
        v1 = np.where(v1<0,0,v1)
        v2 = np.where(v2<0,0,v2)
        
        list_.append(normalized_lp(v1,v2,p))
print(max(list_))

1.7320508075688772


In [70]:
list_ = []
for a,b,c,d,e,f in product(np.linspace(0,1,10), repeat=6):
    if (a+b+c>1) or (d+e+f)>1:
        continue
    else:
        v1 = np.array([a,b,c,1-a-b-c])
        v2 = np.array([d,e,f,1-d-e-f])
        
        v1 = np.where(v1<0,0,v1)
        v2 = np.where(v2<0,0,v2)
        
        list_.append(normalized_lp(v1,v2,p))
print(max(list_))

2.0


#### Normalizes $L_p$ - individual

In [11]:
# p=1
for order,source in set((order,source) for order,source,target in GCV.columns):
    D = cdist(np.array(GCV[order][source]), np.array(GCV[order][source]), normalized_l1) / GCV[order][source].shape[1]
    print(f'{order: <2} {source}')  
    np.savetxt(f"{MATRIX_DIRECTORY}/sc_BioGRID_{order}GCV{source}_normalizedl1.txt", D, 
               fmt='%.7f', header=' '.join(PPI_nx), comments='')

3  3
1  2
-1 0
-1 1
-1 2
1  1
0  0
2  1
-1 3


In [12]:
# p=1
D_list = []
for order,source in set((order,source) for order,source,target in GCV.columns):
    df = pd.read_csv(f"{MATRIX_DIRECTORY}/sc_BioGRID_{order}GCV{source}_normalizedl1.txt", delimiter=' ')
    D_list.append(np.array(df))
    print(np.nanmax(np.array(df)))

D = np.nanmean(D_list, axis=0)
print(np.max(D))
print()
np.savetxt(f"{MATRIX_DIRECTORY}/sc_BioGRID_gGCV_normalizedl1.txt", D, 
               fmt='%.7f', header=' '.join(PPI_nx), comments='')

0.9127796
0.8750902
1.0
1.0
1.0
0.9175424999999999
0.8398652
0.9237966000000001
1.0
0.8864963166666667



In [74]:
# p=2
for order,source in set((order,source) for order,source,target in GCV.columns):
    D = cdist(np.array(GCV[order][source]), np.array(GCV[order][source]), normalized_l2) / np.sqrt(GCV[order][source].shape[1])
    print(f'{order: <2} {source}')  
    np.savetxt(f"{MATRIX_DIRECTORY}/sc_BioGRID_{order}GCV{source}_normalizedl2.txt", D, 
               fmt='%.7f', header=' '.join(PPI_nx), comments='')

3  3
2  1
-1 2
1  1
0  0
-1 1
1  2
-1 0
-1 3


In [75]:
# p=2
D_list = []
for order,source in set((order,source) for order,source,target in GCV.columns):
    df = pd.read_csv(f"{MATRIX_DIRECTORY}/sc_BioGRID_{order}GCV{source}_normalizedl2.txt", delimiter=' ')
    D_list.append(np.array(df))
    print(np.nanmax(np.array(df)))

D = np.nanmean(D_list, axis=0)
print(np.max(D))
print()
np.savetxt(f"{MATRIX_DIRECTORY}/sc_BioGRID_gGCV_normalizedl2.txt", D, 
               fmt='%.7f', header=' '.join(PPI_nx), comments='')

0.9251966
0.9331779
1.0
0.9278732
0.8549950999999999
1.0
0.8948482
1.0
1.0
0.9001188599999999

