In [1]:
from scipy.spatial.distance import squareform, pdist, cdist
from itertools import islice, combinations, product
from pyclustering.cluster.kmedoids import kmedoids
from functools import partial
from random import sample

import os
import time
import graco
import numpy as np
import pandas as pd
import seaborn as sns
import networkx as nx
import matplotlib.pyplot as plt

In [2]:
%matplotlib inline
sns.set()
pd.set_option("display.max_columns", 50)

DATA_DIRECTORY = "/home/clusterduck123/Desktop/git/supplements/data"
CPP_DIRECTORY = "/home/clusterduck123/Desktop/git/graco/graco/cpp"

In [3]:
# Sub-folders of DATA_DIRECTORY referenced by the rest of the notebook.
RAW_DATA_DIRECTORY = f"{DATA_DIRECTORY}/raw_data"
PPI_DIRECTORY = f"{DATA_DIRECTORY}/PPI"
ANNOTATIONS_DIRECTORY = f"{DATA_DIRECTORY}/annotations"
MATRIX_DIRECTORY = f"{DATA_DIRECTORY}/matrix"
CLUSTERS_DIRECTORY = f"{DATA_DIRECTORY}/clusters"

# Create the folder layout. exist_ok=True keeps the cell safe to re-run and
# avoids the race between a separate "exists?" check and the creation itself.
for directory in (
    DATA_DIRECTORY,
    RAW_DATA_DIRECTORY,
    PPI_DIRECTORY,
    ANNOTATIONS_DIRECTORY,
    MATRIX_DIRECTORY,
    CLUSTERS_DIRECTORY,
    f"{CLUSTERS_DIRECTORY}/GDV",
    f"{CLUSTERS_DIRECTORY}/GCV",
):
    os.makedirs(directory, exist_ok=True)

# Clustering

In [4]:
def get_number_of_max_runs(GV, distance, n_clusters = 100):
    """Return the highest run index already stored for a given cluster count.

    Files in ``{CLUSTERS_DIRECTORY}/{GV}/{distance}`` are expected to be named
    ``<run>_<species>_<db>_<n_clusters>.<ext>``. Only files whose cluster count
    equals ``n_clusters`` are considered.

    Parameters
    ----------
    GV : str
        Sub-folder of ``CLUSTERS_DIRECTORY`` (the notebook uses 'GDV', 'GCV'
        and 'gGCV').
    distance : str
        Name of the distance; selects the sub-folder below ``GV``.
    n_clusters : int
        Cluster count the file names must carry.

    Returns
    -------
    int
        The largest ``<run>`` prefix among the matching files, or 0 when no
        file matches, so that callers computing ``nr + 1`` start at run 1
        instead of crashing on an empty folder.

    Raises
    ------
    FileNotFoundError
        If the directory itself does not exist (raised by ``os.listdir``).
    """
    directory = f"{CLUSTERS_DIRECTORY}/{GV}/{distance}"

    runs = []
    for filename in os.listdir(directory):
        fields = filename.split('_')
        # Ignore anything that does not follow the four-field naming scheme
        # (stray files, editor backups, ...) instead of failing on it.
        if len(fields) != 4:
            continue
        run_txt, _species, _db, ncluster_txt = fields
        try:
            file_clusters = int(ncluster_txt.split('.')[0])
            run = int(run_txt)
        except ValueError:
            continue
        if file_clusters == n_clusters:
            runs.append(run)

    # default=0 covers the first run, when nothing has been written yet.
    return max(runs, default=0)

## GDV

In [6]:
# Bounds of the cluster-count sweep for the GDV section.
MIN_CLUSTERS, MAX_CLUSTERS = 2, 100

# Distances whose precomputed matrices are clustered in the GDV section.
all_distances = ['mahalanobis']

In [7]:
# Automated
for run in range(50):
    for distance in all_distances:
        print(distance)

        if not os.path.exists(f"{CLUSTERS_DIRECTORY}/GDV/{distance}"):
            os.makedirs(f"{CLUSTERS_DIRECTORY}/GDV/{distance}")

        MATRIX_NAME = f"sc_BioGRID_GDV_{distance}"
        with open(f"{MATRIX_DIRECTORY}/{MATRIX_NAME}.txt", 'r') as f:
            line = f.readline()
        D = np.genfromtxt(f"{MATRIX_DIRECTORY}/{MATRIX_NAME}.txt", skip_header=1)

        int2gene = dict(enumerate(line.split()))

        t1 = time.time()
        for n_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS):
            initial_medoids = sample(range(len(D)), n_clusters)
            kmedoids_instance = kmedoids(D, initial_medoids, data_type='distance_matrix')
            kmedoids_instance.process()
            
            nr = get_number_of_max_runs('GDV', distance, n_clusters)

            with open(f"{CLUSTERS_DIRECTORY}/GDV/{distance}/{nr+1}_sc_BioGRID_{n_clusters}.txt", 'w') as f:
                for cluster in kmedoids_instance.get_clusters():
                    f.write(' '.join(map(int2gene.get,cluster)) + '\n')
            t2 = time.time()
            print(f'{n_clusters}: {t2-t1:.2f}sec', end='\r')
        print()

mahalanobis
99: 507.44sec
similarity
99: 501.11sec
mahalanobis
99: 497.43sec
similarity
99: 499.39sec
mahalanobis
99: 497.79sec
similarity
99: 499.23sec
mahalanobis
99: 498.46sec
similarity
99: 500.07sec
mahalanobis
99: 498.37sec
similarity
99: 500.44sec
mahalanobis
99: 500.16sec
similarity
99: 500.32sec
mahalanobis
99: 497.62sec
similarity
99: 499.23sec
mahalanobis
99: 498.95sec
similarity
99: 499.71sec
mahalanobis
99: 497.96sec
similarity
99: 499.15sec
mahalanobis
99: 498.80sec
similarity
99: 500.92sec
mahalanobis
99: 498.55sec
similarity
99: 500.23sec
mahalanobis
99: 499.02sec
similarity
99: 500.84sec
mahalanobis
99: 498.60sec
similarity
99: 499.89sec
mahalanobis
99: 498.61sec
similarity
99: 500.33sec
mahalanobis
99: 499.67sec
similarity
99: 500.79sec
mahalanobis
99: 499.14sec
similarity
99: 500.67sec
mahalanobis
99: 499.16sec
similarity
99: 500.90sec
mahalanobis
99: 499.08sec
similarity
99: 501.52sec
mahalanobis
99: 499.56sec
similarity
99: 500.99sec
mahalanobis
99: 498.98sec
simil

## GCV

In [8]:
# Cluster-count bounds for the GCV section.
MIN_CLUSTERS, MAX_CLUSTERS = 2, 100

# Distances whose precomputed matrices are clustered in the GCV section.
all_distances = ['canberra',]

In [9]:
# Automated
for run in range(50):
    for distance in all_distances:
        print(distance)

        if not os.path.exists(f"{CLUSTERS_DIRECTORY}/GCV/{distance}"):
            os.makedirs(f"{CLUSTERS_DIRECTORY}/GCV/{distance}")

        MATRIX_NAME = f"sc_BioGRID_GCV_{distance}"
        with open(f"{MATRIX_DIRECTORY}/{MATRIX_NAME}.txt", 'r') as f:
            line = f.readline()
        D = np.genfromtxt(f"{MATRIX_DIRECTORY}/{MATRIX_NAME}.txt", skip_header=1)

        int2gene = dict(enumerate(line.split()))
        
        t1 = time.time()
        for n_clusters in range(2, 100):
            nr = get_number_of_max_runs('GCV', distance, n_clusters)
        
            initial_medoids = sample(range(len(D)), n_clusters)
            kmedoids_instance = kmedoids(D, initial_medoids, data_type='distance_matrix')
            kmedoids_instance.process()

            with open(f"{CLUSTERS_DIRECTORY}/GCV/{distance}/{nr+1}_sc_BioGRID_{n_clusters}.txt", 'w') as f:
                for cluster in kmedoids_instance.get_clusters():
                    f.write(' '.join(map(int2gene.get,cluster)) + '\n')
            t2 = time.time()
            print(f'{n_clusters}: {t2-t1:.2f}sec', end='\r')
        print()

canberra
99: 506.22sec
canberra
99: 502.30sec
canberra
99: 504.05sec
canberra
99: 504.12sec
canberra
99: 503.22sec
canberra
99: 503.12sec
canberra
99: 502.59sec
canberra
99: 504.24sec
canberra
99: 501.88sec
canberra
99: 505.19sec
canberra
99: 503.14sec
canberra
99: 502.82sec
canberra
99: 493.82sec
canberra
99: 480.41sec
canberra
99: 480.19sec
canberra
99: 480.54sec
canberra
99: 480.99sec
canberra
99: 479.29sec
canberra
99: 479.52sec
canberra
99: 480.65sec
canberra
99: 480.17sec
canberra
99: 480.52sec
canberra
99: 479.34sec
canberra
99: 480.12sec
canberra
99: 479.85sec
canberra
99: 480.50sec
canberra
99: 481.70sec
canberra
99: 479.65sec
canberra
99: 480.18sec
canberra
99: 480.28sec
canberra
99: 480.39sec
canberra
99: 479.70sec
canberra
99: 479.83sec
canberra
99: 480.54sec
canberra
99: 480.62sec
canberra
99: 480.74sec
canberra
99: 480.56sec
canberra
99: 480.24sec
canberra
99: 480.46sec
canberra
99: 480.15sec
canberra
99: 479.94sec
canberra
99: 480.19sec
canberra
99: 480.32sec
canberra
99

### gGCV

In [6]:
# Cluster-count bounds for the gGCV section.
MIN_CLUSTERS, MAX_CLUSTERS = 2, 100

# Distances whose precomputed matrices are clustered in the gGCV section.
all_distances = ['normalizedlinf',]

In [8]:
# Automated
for run in range(49):
    for distance in all_distances:
        print(distance)

        if not os.path.exists(f"{CLUSTERS_DIRECTORY}/gGCV/{distance}"):
            os.makedirs(f"{CLUSTERS_DIRECTORY}/gGCV/{distance}")

        MATRIX_NAME = f"sc_BioGRID_gGCV_{distance}"
        with open(f"{MATRIX_DIRECTORY}/{MATRIX_NAME}.txt", 'r') as f:
            line = f.readline()
        D = np.loadtxt(f"{MATRIX_DIRECTORY}/{MATRIX_NAME}.txt", skiprows=1)

        int2gene = dict(enumerate(line.split()))
        
        t1 = time.time()
        for n_clusters in range(2, 100):
            nr = get_number_of_max_runs('gGCV', distance, MAX_CLUSTERS-1) # CAREFULL !!!!!
            initial_medoids = sample(range(len(D)), n_clusters)
            kmedoids_instance = kmedoids(D, initial_medoids, data_type='distance_matrix')
            kmedoids_instance.process()

            with open(f"{CLUSTERS_DIRECTORY}/gGCV/{distance}/{nr+1}_sc_BioGRID_{n_clusters}.txt", 'w') as f:
                for cluster in kmedoids_instance.get_clusters():
                    f.write(' '.join(map(int2gene.get,cluster)) + '\n')
            t2 = time.time()
            print(f'{n_clusters}: {t2-t1:.2f}sec', end='\r')
        print()

normalizedlinf
99: 569.67sec
normalizedlinf
99: 555.84sec
normalizedlinf
99: 554.14sec
normalizedlinf
99: 555.52sec
normalizedlinf
99: 554.21sec
normalizedlinf
99: 555.25sec
normalizedlinf
99: 554.59sec
normalizedlinf
99: 554.19sec
normalizedlinf
99: 554.66sec
normalizedlinf
99: 555.05sec
normalizedlinf
99: 554.81sec
normalizedlinf
99: 554.58sec
normalizedlinf
99: 554.33sec
normalizedlinf
99: 554.85sec
normalizedlinf
99: 555.31sec
normalizedlinf
99: 554.66sec
normalizedlinf
99: 554.98sec
normalizedlinf
99: 554.77sec
normalizedlinf
99: 555.27sec
normalizedlinf
99: 555.33sec
normalizedlinf
99: 554.48sec
normalizedlinf
99: 555.08sec
normalizedlinf
99: 555.41sec
normalizedlinf
99: 554.54sec
normalizedlinf
99: 554.56sec
normalizedlinf
99: 556.31sec
normalizedlinf
99: 554.94sec
normalizedlinf
99: 555.55sec
normalizedlinf
99: 554.78sec
normalizedlinf
99: 555.50sec
normalizedlinf
99: 555.09sec
normalizedlinf
99: 554.69sec
normalizedlinf
99: 554.88sec
normalizedlinf
99: 555.27sec
normalizedlinf