In [1]:
from scipy.spatial.distance import squareform, pdist, cdist
from itertools import islice, combinations, product
from pyclustering.cluster.kmedoids import kmedoids
from functools import partial
from random import sample

import os
import time
import graco
import numpy as np
import pandas as pd
import seaborn as sns
import networkx as nx
import matplotlib.pyplot as plt

In [1]:
%matplotlib inline
sns.set()
pd.set_option("display.max_columns", 50)

DATA_DIRECTORY = "/home/clusterduck123/Desktop/git/supplements/data"
CPP_DIRECTORY = "/home/clusterduck123/Desktop/git/graco/graco/cpp"

NameError: name 'sns' is not defined

In [2]:
RAW_DATA_DIRECTORY = f"{DATA_DIRECTORY}/raw_data"
PPI_DIRECTORY = f"{DATA_DIRECTORY}/PPI"
ANNOTATIONS_DIRECTORY = f"{DATA_DIRECTORY}/annotations"
MATRIX_DIRECTORY = f"{DATA_DIRECTORY}/matrix"
CLUSTERS_DIRECTORY = f"{DATA_DIRECTORY}/clusters"

if not os.path.exists(DATA_DIRECTORY):
    os.makedirs(DATA_DIRECTORY)
        
if not os.path.exists(RAW_DATA_DIRECTORY):
    os.makedirs(RAW_DATA_DIRECTORY)     
    
if not os.path.exists(PPI_DIRECTORY):
    os.makedirs(PPI_DIRECTORY)
    
if not os.path.exists(ANNOTATIONS_DIRECTORY):
    os.makedirs(ANNOTATIONS_DIRECTORY)
    
if not os.path.exists(MATRIX_DIRECTORY):
    os.makedirs(MATRIX_DIRECTORY)
    
if not os.path.exists(CLUSTERS_DIRECTORY):
    os.makedirs(CLUSTERS_DIRECTORY)
    
if not os.path.exists(f"{CLUSTERS_DIRECTORY}/GDV"):
    os.makedirs(f"{CLUSTERS_DIRECTORY}/GDV")
    
if not os.path.exists(f"{CLUSTERS_DIRECTORY}/GCV"):
    os.makedirs(f"{CLUSTERS_DIRECTORY}/GCV")

NameError: name 'DATA_DIRECTORY' is not defined

# Distance matrices

In [4]:
PPI_nx = nx.read_edgelist(f"{PPI_DIRECTORY}/BioGRID_hs.txt")
GDV = graco.orbits(PPI_nx)
GCV = graco.coefficients(GDV)

## GDV-based

### GDV similarity

In [5]:
np.savetxt(f"{CPP_DIRECTORY}/matrix.in", GDV, 
           header=f"{len(GDV)} 15", fmt='%d')

In [6]:
D = pd.read_csv(f"{CPP_DIRECTORY}/matrix.out", delimiter=' ')

In [7]:
D.head()

Unnamed: 0,0,0.199184,0.239149,0.354714,0.0587975,0.385507,0.0873124,0.212972,0.266331,0.300813,0.185758,0.0739606,0.23172,0.280199,0.0899579,0.297289,0.175067,0.283429,0.156968,0.437139,0.097347,0.137529,0.27406,0.422582,0.464079,...,0.882992,0.857618,0.802483,0.957711,0.856631,0.90841,0.919822,0.919822.1,0.94225,0.94225.1,0.914607,0.917211,0.883508,0.91211,0.933742,0.904973,0.870043,0.833599,0.890393,0.843447,0.943999,0.951078,0.916924,0.725108,Unnamed: 17119
0,0.199184,0.0,0.066414,0.205421,0.202249,0.242221,0.129885,0.063548,0.095421,0.130681,0.03837,0.154811,0.060044,0.110278,0.123358,0.127937,0.307605,0.106102,0.062832,0.310305,0.117314,0.086884,0.107912,0.290066,0.346313,...,0.898118,0.874384,0.830893,0.967332,0.873327,0.921529,0.93195,0.93195,0.952954,0.952954,0.927112,0.929694,0.898459,0.924767,0.944997,0.918216,0.885717,0.852036,0.904832,0.861218,0.954611,0.961178,0.929897,0.764276,
1,0.239149,0.066414,0.0,0.172368,0.23846,0.208949,0.169065,0.040588,0.05943,0.085341,0.073775,0.201424,0.021013,0.07911,0.165145,0.086868,0.327209,0.060221,0.105346,0.279039,0.159442,0.123117,0.060038,0.260087,0.317689,...,0.898798,0.875401,0.834771,0.967612,0.874533,0.92197,0.932612,0.932612,0.953287,0.953287,0.927729,0.930211,0.899214,0.925442,0.945431,0.91895,0.886779,0.853368,0.905471,0.862475,0.954771,0.961399,0.929876,0.763507,
2,0.354714,0.205421,0.172368,0.0,0.355317,0.051065,0.298509,0.199756,0.133253,0.098078,0.219081,0.32778,0.177532,0.117004,0.296652,0.095837,0.427996,0.12338,0.250221,0.143393,0.294039,0.264519,0.134217,0.1185,0.192987,...,0.906374,0.883527,0.850431,0.972929,0.882408,0.929385,0.939061,0.939061,0.959332,0.959332,0.934579,0.93695,0.90677,0.932281,0.951689,0.925817,0.894529,0.861863,0.913087,0.870614,0.961142,0.967227,0.937808,0.784209,
3,0.058798,0.202249,0.23846,0.355317,0.0,0.385488,0.107058,0.212189,0.267476,0.301334,0.189328,0.066508,0.231619,0.282569,0.0926,0.297329,0.138718,0.2809,0.157261,0.438388,0.099843,0.135666,0.270848,0.42242,0.465766,...,0.882526,0.856781,0.801513,0.958433,0.855804,0.908337,0.919971,0.919971,0.942722,0.942722,0.914664,0.917293,0.883063,0.912134,0.934088,0.904885,0.869421,0.832405,0.890048,0.842406,0.944478,0.951682,0.916913,0.722175,
4,0.385507,0.242221,0.208949,0.051065,0.385488,0.0,0.333132,0.234492,0.176471,0.138504,0.256398,0.360402,0.215896,0.158814,0.330196,0.137706,0.453568,0.161014,0.286754,0.097367,0.324621,0.297232,0.170498,0.071777,0.149906,...,0.909449,0.887366,0.856177,0.973817,0.886311,0.931623,0.941048,0.941048,0.960633,0.960633,0.936683,0.93899,0.909825,0.93447,0.953242,0.928244,0.897998,0.866453,0.915902,0.874926,0.962339,0.968264,0.939709,0.792541,


In [5]:
D = np.genfromtxt(f"{CPP_DIRECTORY}/matrix.out")
np.savetxt(f"{MATRIX_DIRECTORY}/hs_BioGRID_GDV_similarity.txt", D, 
           fmt='%.7f', header=' '.join(PPI_nx), comments='')

ValueError: could not convert string to float: 

#### Minimal code

In [None]:
import numpy as np

DATA_DIRECTORY = "/home/clusterduck123/Desktop/git/supplements/data"
CPP_DIRECTORY = "/home/clusterduck123/Desktop/git/graco/graco/cpp"
MATRIX_DIRECTORY = f"{DATA_DIRECTORY}/matrix"

D = np.genfromtxt(f"{CPP_DIRECTORY}/matrix.out")
np.savetxt(f"{MATRIX_DIRECTORY}/hs_BioGRID_GDV_similarity.txt", D, 
           fmt='%.7f', header=' '.join(PPI_nx), comments='')

#### Rest

In [20]:
all_distances = ['euclidean', 'cityblock', 'seuclidean', 'sqeuclidean', 
                 'cosine', 'correlation', 'chebyshev', 'canberra', 
                 'braycurtis', 'mahalanobis']

In [21]:
for distance in all_distances:
    D = cdist(GDV.values, GDV.values, distance)
    np.savetxt(f"{MATRIX_DIRECTORY}/sc_BioGRID_GDV_{distance}.txt", D, 
               fmt='%.7f', header=' '.join(PPI_nx), comments='')

## GCV-based

In [6]:
PPI_nx = nx.read_edgelist(f"{PPI_DIRECTORY}/BioGRID_sc.txt")
GCV = graco.coefficients(PPI_nx)

### Hellinger - single

In [6]:
_SQRT2 = np.sqrt(2) 

def hellinger(p, q):
    return np.sqrt(np.sum((np.sqrt(p) - np.sqrt(q)) ** 2)) / _SQRT2

In [7]:
for order,source in set((order,source) for order,source,target in GCV.columns):
    t1 = time.time()
    D = cdist(np.array(GCV[order][source]), np.array(GCV[order][source]), hellinger)
    t2 = time.time()
    print(f'{order}-{source}: {t2-t1:.2f}sec')  
    np.savetxt(f"{MATRIX_DIRECTORY}/sc_BioGRID_{order}GCV{source}_hellinger.txt", D, 
               fmt='%.7f', header=' '.join(PPI_nx), comments='')

0-0: 242.75sec
-1-3: 245.51sec
3-3: 240.31sec
1-1: 236.50sec
-1-2: 239.88sec
-1-0: 237.58sec
-1-1: 236.89sec
1-2: 239.48sec
2-1: 239.23sec


### Hellinger - combination

In [9]:
D_list = []

for order,source in set((order,source) for order,source,target in GCV.columns):
    df = pd.read_csv(f"{MATRIX_DIRECTORY}/sc_BioGRID_{order}GCV{source}_hellinger.txt", delimiter=' ')
    D_list.append(np.array(df))

D = np.nanmean(D_list, axis=0)
np.savetxt(f"{MATRIX_DIRECTORY}/sc_BioGRID_GCV_hellinger.txt", D, 
               fmt='%.7f', header=' '.join(PPI_nx), comments='')

### TVD - individual

In [8]:
for order,source in set((order,source) for order,source,target in GCV.columns):
    t1 = time.time()
    D = cdist(np.array(GCV[order][source]), np.array(GCV[order][source]), graco.functions.tvd)
    t2 = time.time()
    print(f'{order}-{source}: {t2-t1:.2f}sec')  
    np.savetxt(f"{MATRIX_DIRECTORY}/sc_BioGRID_{order}GCV{source}_tvd.txt", D, 
               fmt='%.7f', header=' '.join(PPI_nx), comments='')

0-0: 165.49sec
-1-3: 161.96sec
3-3: 162.53sec
1-1: 162.82sec
-1-2: 162.89sec
-1-0: 161.32sec
-1-1: 162.55sec
1-2: 161.95sec
2-1: 160.59sec


### TVD - combination

In [11]:
D_list = []

for order,source in set((order,source) for order,source,target in GCV.columns):
    df = pd.read_csv(f"{MATRIX_DIRECTORY}/sc_BioGRID_{order}GCV{source}_tvd.txt", delimiter=' ')
    D_list.append(np.array(df))

D = np.nanmean(D_list, axis=0)
np.savetxt(f"{MATRIX_DIRECTORY}/sc_BioGRID_GCV_TVD.txt", D, 
               fmt='%.7f', header=' '.join(PPI_nx), comments='')

### Rest

In [7]:
GCV = GCV.fillna(0)
all_distances = ['euclidean', 'cityblock', 'seuclidean', 'sqeuclidean', 
                 'cosine', 'correlation', 'chebyshev', 'canberra', 
                 'braycurtis', 'mahalanobis']

In [16]:
for distance in all_distances:
    D = cdist(GCV.values, GCV.values, distance)
    np.savetxt(f"{MATRIX_DIRECTORY}/sc_BioGRID_GCV_{distance}.txt", D, 
               fmt='%.7f', header=' '.join(PPI_nx), comments='')

### Rest - individual

In [7]:
all_distances = ['euclidean', 'cityblock', 'sqeuclidean', 
                 'cosine', 'correlation', 'chebyshev', 'canberra', 
                 'braycurtis']

In [8]:
for distance in all_distances:
    for order,source in set((order,source) for order,source,target in GCV.columns):
        D = cdist(np.array(GCV[order][source]), np.array(GCV[order][source]), distance)
        print(f'{distance} {order: <2} {source}')  
        np.savetxt(f"{MATRIX_DIRECTORY}/sc_BioGRID_{order}GCV{source}_{distance}.txt", D, 
                   fmt='%.7f', header=' '.join(PPI_nx), comments='')

-1-3: 0.13sec
0-0: 0.07sec
1-1: 0.12sec
3-3: 0.11sec
-1-1: 0.12sec
-1-0: 0.09sec
1-2: 0.11sec
2-1: 0.12sec
-1-2: 0.13sec
-1-3: 0.08sec
0-0: 0.07sec
1-1: 0.09sec
3-3: 0.10sec
-1-1: 0.10sec
-1-0: 0.06sec
1-2: 0.09sec
2-1: 0.10sec
-1-2: 0.09sec
-1-3: 0.22sec
0-0: 0.16sec
1-1: 0.25sec
3-3: 0.35sec
-1-1: 0.23sec
-1-0: 0.21sec
1-2: 0.25sec
2-1: 0.34sec
-1-2: 0.21sec
-1-3: 0.22sec
0-0: 0.07sec
1-1: 0.14sec
3-3: 0.16sec
-1-1: 0.18sec
-1-0: 0.08sec
1-2: 0.16sec
2-1: 0.16sec
-1-2: 0.15sec
-1-3: 0.10sec
0-0: 0.15sec
1-1: 0.22sec
3-3: 0.13sec
-1-1: 0.19sec
-1-0: 0.16sec
1-2: 0.20sec
2-1: 0.11sec
-1-2: 0.18sec
-1-3: 0.19sec
0-0: 0.19sec
1-1: 0.19sec
3-3: 0.21sec
-1-1: 0.11sec
-1-0: 0.22sec
1-2: 0.22sec
2-1: 0.19sec
-1-2: 0.20sec
-1-3: 0.09sec
0-0: 0.14sec
1-1: 0.19sec
3-3: 0.11sec
-1-1: 0.22sec
-1-0: 0.07sec
1-2: 0.16sec
2-1: 0.12sec
-1-2: 0.08sec
-1-3: 0.16sec
0-0: 0.13sec
1-1: 0.21sec
3-3: 0.23sec
-1-1: 0.30sec
-1-0: 0.13sec
1-2: 0.27sec
2-1: 0.32sec
-1-2: 0.18sec
-1-3: 0.19sec
0-0: 0.15sec
1-1: 

### Rest - combination

In [10]:
for distance in all_distances:
    D_list = []
    for order,source in set((order,source) for order,source,target in GCV.columns):
        df = pd.read_csv(f"{MATRIX_DIRECTORY}/sc_BioGRID_{order}GCV{source}_{distance}.txt", delimiter=' ')
        D_list.append(np.array(df))

    D = np.nanmean(D_list, axis=0)
    print(distance)
    np.savetxt(f"{MATRIX_DIRECTORY}/sc_BioGRID_gGCV_{distance}.txt", D, 
                   fmt='%.7f', header=' '.join(PPI_nx), comments='')

euclidean
cityblock
seuclidean
sqeuclidean
cosine
correlation
chebyshev
canberra
braycurtis
mahalanobis


### Normalized $L_p$

In [5]:
def normalized_lp(P,Q,p=1):
    v1 = np.divide(P, P+Q, out=np.zeros_like(P), where=(P+Q)!=0)
    v2 = np.divide(Q, P+Q, out=np.zeros_like(Q), where=(P+Q)!=0)
    return np.linalg.norm(v1-v2,p)

def normalized_l1(P,Q):
    return normalized_lp(P,Q,1)

def normalized_l2(P,Q):
    return normalized_lp(P,Q,2)

def normalized_linf(P,Q):
    return normalized_lp(P,Q,np.inf)

#### Normalizes $L_p$ - individual

In [11]:
for order,source in set((order,source) for order,source,target in GCV.columns):
    D = cdist(np.array(GCV[order][source]), np.array(GCV[order][source]), normalized_l1) / GCV[order][source].shape[1]
    print(f'{order: <2} {source}')  
    np.savetxt(f"{MATRIX_DIRECTORY}/sc_BioGRID_{order}GCV{source}_normalized_l1.txt", D, 
               fmt='%.7f', header=' '.join(PPI_nx), comments='')

3  3
1  2
-1 0
-1 1
-1 2
1  1
0  0
2  1
-1 3


In [12]:
D_list = []
for order,source in set((order,source) for order,source,target in GCV.columns):
    df = pd.read_csv(f"{MATRIX_DIRECTORY}/sc_BioGRID_{order}GCV{source}_normalized_l1.txt", delimiter=' ')
    D_list.append(np.array(df))
    print(np.nanmax(np.array(df)))

D = np.nanmean(D_list, axis=0)
print(np.max(D))
print()
np.savetxt(f"{MATRIX_DIRECTORY}/sc_BioGRID_gGCV_normalized_l1.txt", D, 
               fmt='%.7f', header=' '.join(PPI_nx), comments='')

0.9127796
0.8750902
1.0
1.0
1.0
0.9175424999999999
0.8398652
0.9237966000000001
1.0
0.8864963166666667



# Clustering

In [5]:
def get_number_of_max_runs(GV, distance, n_clusters = 100):
    runs = max(int(run) for run,species,db,ncluster_txt in 
             map(partial(str.split, sep='_'), os.listdir(f"{CLUSTERS_DIRECTORY}/{GV}/{distance}"))
                if int(ncluster_txt.split('.')[0]) == n_clusters)
    return runs

## GDV

In [6]:
MIN_CLUSTERS = 2
MAX_CLUSTERS = 100

all_distances = ['mahalanobis', 'similarity']

In [7]:
# Automated
for run in range(50):
    for distance in all_distances:
        print(distance)

        if not os.path.exists(f"{CLUSTERS_DIRECTORY}/GDV/{distance}"):
            os.makedirs(f"{CLUSTERS_DIRECTORY}/GDV/{distance}")

        MATRIX_NAME = f"sc_BioGRID_GDV_{distance}"
        with open(f"{MATRIX_DIRECTORY}/{MATRIX_NAME}.txt", 'r') as f:
            line = f.readline()
        D = np.genfromtxt(f"{MATRIX_DIRECTORY}/{MATRIX_NAME}.txt", skip_header=1)

        int2gene = dict(enumerate(line.split()))

        t1 = time.time()
        for n_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS):
            initial_medoids = sample(range(len(D)), n_clusters)
            kmedoids_instance = kmedoids(D, initial_medoids, data_type='distance_matrix')
            kmedoids_instance.process()
            
            nr = get_number_of_max_runs('GDV', distance, n_clusters)

            with open(f"{CLUSTERS_DIRECTORY}/GDV/{distance}/{nr+1}_sc_BioGRID_{n_clusters}.txt", 'w') as f:
                for cluster in kmedoids_instance.get_clusters():
                    f.write(' '.join(map(int2gene.get,cluster)) + '\n')
            t2 = time.time()
            print(f'{n_clusters}: {t2-t1:.2f}sec', end='\r')
        print()

mahalanobis
99: 507.44sec
similarity
99: 501.11sec
mahalanobis
99: 497.43sec
similarity
99: 499.39sec
mahalanobis
99: 497.79sec
similarity
99: 499.23sec
mahalanobis
99: 498.46sec
similarity
99: 500.07sec
mahalanobis
99: 498.37sec
similarity
99: 500.44sec
mahalanobis
99: 500.16sec
similarity
99: 500.32sec
mahalanobis
99: 497.62sec
similarity
99: 499.23sec
mahalanobis
99: 498.95sec
similarity
99: 499.71sec
mahalanobis
99: 497.96sec
similarity
99: 499.15sec
mahalanobis
99: 498.80sec
similarity
99: 500.92sec
mahalanobis
99: 498.55sec
similarity
99: 500.23sec
mahalanobis
99: 499.02sec
similarity
99: 500.84sec
mahalanobis
99: 498.60sec
similarity
99: 499.89sec
mahalanobis
99: 498.61sec
similarity
99: 500.33sec
mahalanobis
99: 499.67sec
similarity
99: 500.79sec
mahalanobis
99: 499.14sec
similarity
99: 500.67sec
mahalanobis
99: 499.16sec
similarity
99: 500.90sec
mahalanobis
99: 499.08sec
similarity
99: 501.52sec
mahalanobis
99: 499.56sec
similarity
99: 500.99sec
mahalanobis
99: 498.98sec
simil

## GCV

In [8]:
MIN_CLUSTERS = 2
MAX_CLUSTERS = 100

all_distances = ['seuclidean', 'canberra', 'tvd', 'hellinger']

In [None]:
# Automated
for run in range(50):
    for distance in all_distances:
        print(distance)

        if not os.path.exists(f"{CLUSTERS_DIRECTORY}/GCV/{distance}"):
            os.makedirs(f"{CLUSTERS_DIRECTORY}/GCV/{distance}")

        MATRIX_NAME = f"sc_BioGRID_GCV_{distance}"
        with open(f"{MATRIX_DIRECTORY}/{MATRIX_NAME}.txt", 'r') as f:
            line = f.readline()
        D = np.genfromtxt(f"{MATRIX_DIRECTORY}/{MATRIX_NAME}.txt", skip_header=1)

        int2gene = dict(enumerate(line.split()))
        
        t1 = time.time()
        for n_clusters in range(2, 100):
            nr = get_number_of_max_runs('GCV', distance, n_clusters)
        
            initial_medoids = sample(range(len(D)), n_clusters)
            kmedoids_instance = kmedoids(D, initial_medoids, data_type='distance_matrix')
            kmedoids_instance.process()

            with open(f"{CLUSTERS_DIRECTORY}/GCV/{distance}/{nr+1}_sc_BioGRID_{n_clusters}.txt", 'w') as f:
                for cluster in kmedoids_instance.get_clusters():
                    f.write(' '.join(map(int2gene.get,cluster)) + '\n')
            t2 = time.time()
            print(f'{n_clusters}: {t2-t1:.2f}sec', end='\r')
        print()

seuclidean
99: 498.53sec
canberra
99: 499.63sec
tvd
99: 504.42sec
hellinger
99: 500.98sec
seuclidean
99: 502.19sec
canberra
99: 501.07sec
tvd
99: 506.65sec
hellinger
99: 500.29sec
seuclidean
99: 501.09sec
canberra
99: 498.53sec
tvd
99: 503.57sec
hellinger
99: 499.80sec
seuclidean
41: 206.82sec

### gGCV

In [6]:
MIN_CLUSTERS = 2
MAX_CLUSTERS = 100

all_distances = ['normalized_l1']

In [None]:
# Automated
for run in range(10):
    for distance in all_distances:
        print(distance)

        if not os.path.exists(f"{CLUSTERS_DIRECTORY}/gGCV/{distance}"):
            os.makedirs(f"{CLUSTERS_DIRECTORY}/gGCV/{distance}")

        MATRIX_NAME = f"sc_BioGRID_gGCV_{distance}"
        with open(f"{MATRIX_DIRECTORY}/{MATRIX_NAME}.txt", 'r') as f:
            line = f.readline()
        D = np.genfromtxt(f"{MATRIX_DIRECTORY}/{MATRIX_NAME}.txt", skip_header=1)

        int2gene = dict(enumerate(line.split()))
        
        t1 = time.time()
        for n_clusters in range(2, 100):
            nr = get_number_of_max_runs('gGCV', distance, n_clusters) # CAREFULL !!!!!
            initial_medoids = sample(range(len(D)), n_clusters)
            kmedoids_instance = kmedoids(D, initial_medoids, data_type='distance_matrix')
            kmedoids_instance.process()

            with open(f"{CLUSTERS_DIRECTORY}/gGCV/{distance}/{nr+1}_sc_BioGRID_{n_clusters}.txt", 'w') as f:
                for cluster in kmedoids_instance.get_clusters():
                    f.write(' '.join(map(int2gene.get,cluster)) + '\n')
            t2 = time.time()
            print(f'{n_clusters}: {t2-t1:.2f}sec', end='\r')
        print()

normalized_l1
17: 92.52sec