In [1]:
from scipy.spatial.distance import squareform, pdist, cdist
from itertools import islice, combinations, product
from pyclustering.cluster.kmedoids import kmedoids

import os
import time
import graco
import numpy as np
import pandas as pd
import seaborn as sns
import networkx as nx
import matplotlib.pyplot as plt

In [2]:
%matplotlib inline
sns.set()
pd.set_option("display.max_columns", 50)

DATA_DIRECTORY = "/home/clusterduck123/Desktop/git/supplements/data"
CPP_DIRECTORY = "/home/clusterduck123/Desktop/git/graco/graco/cpp"

In [3]:
RAW_DATA_DIRECTORY = f"{DATA_DIRECTORY}/raw_data"
PPI_DIRECTORY = f"{DATA_DIRECTORY}/PPI"
ANNOTATIONS_DIRECTORY = f"{DATA_DIRECTORY}/annotations"
MATRIX_DIRECTORY = f"{DATA_DIRECTORY}/matrix"
CLUSTERS_DIRECTORY = f"{DATA_DIRECTORY}/clusters"

if not os.path.exists(DATA_DIRECTORY):
    os.makedirs(DATA_DIRECTORY)
        
if not os.path.exists(RAW_DATA_DIRECTORY):
    os.makedirs(RAW_DATA_DIRECTORY)     
    
if not os.path.exists(PPI_DIRECTORY):
    os.makedirs(PPI_DIRECTORY)
    
if not os.path.exists(ANNOTATIONS_DIRECTORY):
    os.makedirs(ANNOTATIONS_DIRECTORY)
    
if not os.path.exists(MATRIX_DIRECTORY):
    os.makedirs(MATRIX_DIRECTORY)
    
if not os.path.exists(CLUSTERS_DIRECTORY):
    os.makedirs(CLUSTERS_DIRECTORY)

# Distance matrices

In [4]:
PPI_nx = nx.read_edgelist(f"{PPI_DIRECTORY}/BioGRID_sc.txt")
GDV = graco.orbits(PPI_nx)
GCV = graco.coefficients(GDV)

## GDV-based

### GDV similarity

In [22]:
np.savetxt(f"{CPP_DIRECTORY}/matrix.in", GDV, 
           header=f"{len(GDV)} 15", fmt='%d')

In [23]:
D = np.genfromtxt(f"{CPP_DIRECTORY}/matrix.out")
np.savetxt(f"{MATRIX_DIRECTORY}/sc_BioGRID_GDV_similarity.txt", D, 
           fmt='%.7f', header=' '.join(PPI_nx), comments='')

#### Rest

In [20]:
all_distances = ['euclidean', 'cityblock', 'seuclidean', 'sqeuclidean', 
                 'cosine', 'correlation', 'chebyshev', 'canberra', 
                 'braycurtis', 'mahalanobis']

In [21]:
for distance in all_distances:
    D = cdist(GDV.values, GDV.values, distance)
    np.savetxt(f"{MATRIX_DIRECTORY}/sc_BioGRID_GDV_{distance}.txt", D, 
               fmt='%.7f', header=' '.join(PPI_nx), comments='')

## GCV-based

In [5]:
PPI_nx = nx.read_edgelist(f"{PPI_DIRECTORY}/BioGRID_sc.txt")
GCV = graco.coefficients(PPI_nx)

### Hellinger - single

In [6]:
_SQRT2 = np.sqrt(2) 

def hellinger(p, q):
    return np.sqrt(np.sum((np.sqrt(p) - np.sqrt(q)) ** 2)) / _SQRT2

In [None]:
for order,source in set((order,source) for order,source,target in GCV.columns):
    t1 = time.time()
    D = cdist(np.array(GCV[order][source]), np.array(GCV[order][source]), hellinger)
    t2 = time.time()
    print(f'{order}-{source}: {t2-t1:.2f}sec')  
    np.savetxt(f"{MATRIX_DIRECTORY}/sc_BioGRID_{order}GCV{source}_hellinger.txt", D, 
               fmt='%.7f', header=' '.join(PPI_nx), comments='')

0-0: 242.75sec


### Hellinger - combination

In [None]:
D_list = []

for order,source in set((order,source) for order,source,target in GCV.columns):
    df = pd.read_csv(f"{MATRIX_DIRECTORY}/sc_BioGRID_{order}GCV{source}_hellinger.txt", delimiter=' ')
    D_list.append(np.array(df))

D = np.nanmean(D_list, axis=0)
np.savetxt(f"{MATRIX_DIRECTORY}/sc_BioGRID_GCV_hellinger.txt", D, 
               fmt='%.7f', header=' '.join(PPI_nx), comments='')

### TVD - single

In [None]:
for order,source in set((order,source) for order,source,target in GCV.columns):
    t1 = time.time()
    D = cdist(np.array(GCV[order][source]), np.array(GCV[order][source]), graco.functions.tvd)
    t2 = time.time()
    print(f'{order}-{source}: {t2-t1:.2f}sec')  
    np.savetxt(f"{MATRIX_DIRECTORY}/sc_BioGRID_{order}GCV{source}_tvd.txt", D, 
               fmt='%.7f', header=' '.join(PPI_nx), comments='')

### TVD - combination

In [None]:
D_list = []

for order,source in set((order,source) for order,source,target in GCV.columns):
    df = pd.read_csv(f"{MATRIX_DIRECTORY}/sc_BioGRID_{order}GCV{source}_TVD.txt", delimiter=' ')
    D_list.append(np.array(df))

D = np.nanmean(D_list, axis=0)
np.savetxt(f"{MATRIX_DIRECTORY}/sc_BioGRID_GCV_TVD.txt", D, 
               fmt='%.7f', header=' '.join(PPI_nx), comments='')

### Rest

In [7]:
GCV = GCV.fillna(0)
all_distances = ['euclidean', 'cityblock', 'seuclidean', 'sqeuclidean', 
                 'cosine', 'correlation', 'chebyshev', 'canberra', 
                 'braycurtis', 'mahalanobis']

In [16]:
for distance in all_distances:
    D = cdist(GCV.values, GCV.values, distance)
    np.savetxt(f"{MATRIX_DIRECTORY}/sc_BioGRID_GCV_{distance}.txt", D, 
               fmt='%.7f', header=' '.join(PPI_nx), comments='')

# Clustering