In [1]:
from pyclustering.cluster.kmedoids import kmedoids
from functools import partial
from random import sample

import os
import time
import numpy as np
import pandas as pd
import networkx as nx

In [2]:
# Root of the data tree; every other location is derived from it.
DATA_DIRECTORY = "/media/clusterduck123/joe/data"

# Yeast-specific processed data and its sub-folders.
YEAST_DIRECTORY = DATA_DIRECTORY + "/processed-data/yeast"
NETWORK_DIRECTORY = YEAST_DIRECTORY + "/networks"
MATRIX_DIRECTORY = YEAST_DIRECTORY + "/distance-matrices"
ANNOTATION_DIRECTORY = YEAST_DIRECTORY + "/annotations"

# Clustering

In [3]:
def get_number_of_pre_runs(CLUSTER_DIRECTORY, distance, n_clusters = 99):
    """Return the highest run index that already has a file for ``n_clusters``.

    File names in ``CLUSTER_DIRECTORY`` are parsed as
    ``<run>_<n_clusters>.txt`` or ``<run>_<n_clusters>_<suffix>.txt``
    (for example ``3_100.txt`` or ``3_100_BioGRID.txt``). Entries that do
    not follow this pattern are ignored instead of raising, so names with
    an extra ``_<suffix>`` part or unrelated files in the directory no
    longer break the lookup.

    Parameters
    ----------
    CLUSTER_DIRECTORY : str
        Directory whose entries are inspected (via ``os.listdir``).
    distance : str
        Not used by the lookup; kept so existing callers keep working.
    n_clusters : int, default 99
        Cluster count whose files identify a finished run.

    Returns
    -------
    int
        Largest ``<run>`` among the matching files, or ``-1`` if none match.
    """
    wanted = str(n_clusters)
    pre_runs = []
    for name in os.listdir(CLUSTER_DIRECTORY):
        stem, extension = os.path.splitext(name)
        if extension != '.txt':
            continue
        parts = stem.split('_')
        # Need at least "<run>_<n_clusters>"; anything after that is a suffix.
        if len(parts) < 2 or parts[1] != wanted:
            continue
        try:
            pre_runs.append(int(parts[0]))
        except ValueError:
            # First token is not a run index -> not one of our files.
            continue
    return max(pre_runs, default=-1)

# Random

## GDV

In [4]:
# Feature whose distance matrices are clustered in the next cell.
feature = 'GDV'

# Smallest and largest number of clusters tried (both ends are included
# by the sweep in the next cell).
MIN_CLUSTERS, MAX_CLUSTERS = 2, 100

In [None]:
network = 'systematic_CoEx_COEXPRESdb'
method  = 'kmedoid'

# Repeated k-medoids sweeps: every pass of the outer loop produces one more
# run (random initial medoids) per distance, for each cluster count from
# MIN_CLUSTERS to MAX_CLUSTERS inclusive.
for run in range(10):
    # Tuple instead of a set literal so the distances are always visited in
    # the same order.
    for distance in ('GDV_similarity', 'canberra', 'mahalanobis'):
        print(distance)

        CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/{network}/{feature}/{distance}/{method}"
        # exist_ok=True replaces the separate "does it exist?" check.
        os.makedirs(CLUSTER_DIRECTORY, exist_ok=True)

        # The column labels of this file are written out as cluster members
        # below; the values are handed to kmedoids as a distance matrix.
        df = pd.read_csv(f"{MATRIX_DIRECTORY}/{network}/{feature}/{distance}.txt", delimiter=' ')
        D  = df.values.astype(float)

        # Looked up once per sweep: the result only depends on files for
        # MAX_CLUSTERS, and this sweep writes that file last.
        nr = get_number_of_pre_runs(CLUSTER_DIRECTORY, distance, MAX_CLUSTERS)

        t1 = time.time()
        for n_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS+1):
            initial_medoids = sample(range(len(D)), n_clusters)
            kmedoids_instance = kmedoids(D, initial_medoids, data_type='distance_matrix')
            kmedoids_instance.process()

            # One line per cluster, members separated by single spaces.
            with open(f"{CLUSTER_DIRECTORY}/{nr+1}_{n_clusters}.txt", 'w') as f:
                for cluster in kmedoids_instance.get_clusters():
                    f.write(' '.join(df.columns[cluster]) + '\n')
            t2 = time.time()
            # '\r' keeps the progress on a single, overwritten line.
            print(f'{n_clusters}: {t2-t1:.2f}sec', end='\r')
        print()

canberra
100: 597.44sec
mahalanobis
81: 471.32sec

## GCV-A

In [6]:
# Feature whose distance matrices are clustered in the next cell.
feature = 'GCV-A'

# Smallest and largest number of clusters tried (both ends included).
MIN_CLUSTERS, MAX_CLUSTERS = 2, 100

# Distance names: each file name in the feature's matrix directory with its
# last '_'-separated token removed, in sorted order.
all_distances = sorted(
    filename.rpartition('_')[0]
    for filename in os.listdir(f"{MATRIX_DIRECTORY}/{feature}")
)

In [7]:
method = 'kmedoid'

# Repeated k-medoids sweeps: every pass of the outer loop produces one more
# run (random initial medoids) per distance in all_distances, for each
# cluster count from MIN_CLUSTERS to MAX_CLUSTERS inclusive.
for run in range(20):
    for distance in all_distances:
        print(distance)

        CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/{feature}/{distance}/{method}"
        # exist_ok=True replaces the separate "does it exist?" check.
        os.makedirs(CLUSTER_DIRECTORY, exist_ok=True)

        # The column labels of this file are written out as cluster members
        # below; the values are handed to kmedoids as a distance matrix.
        df = pd.read_csv(f"{MATRIX_DIRECTORY}/{feature}/{distance}_BioGRID.txt", delimiter=' ')
        D  = df.values.astype(float)

        # Looked up once per sweep instead of once per cluster count; the
        # file for MAX_CLUSTERS is the last one this sweep writes.
        nr = get_number_of_pre_runs(CLUSTER_DIRECTORY, distance, MAX_CLUSTERS)

        t1 = time.time()
        for n_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS+1):
            initial_medoids = sample(range(len(D)), n_clusters)
            kmedoids_instance = kmedoids(D, initial_medoids, data_type='distance_matrix')
            kmedoids_instance.process()

            # One line per cluster, members separated by single spaces.
            with open(f"{CLUSTER_DIRECTORY}/{nr+1}_{n_clusters}_BioGRID.txt", 'w') as f:
                for cluster in kmedoids_instance.get_clusters():
                    f.write(' '.join(df.columns[cluster]) + '\n')
            t2 = time.time()
            # '\r' keeps the progress on a single, overwritten line.
            print(f'{n_clusters}: {t2-t1:.2f}sec', end='\r')
        print()

braycurtis
100: 529.33sec
canberra
100: 528.15sec
chebyshev
100: 529.53sec
cityblock
100: 530.49sec
correlation
100: 527.20sec
cosine
100: 525.86sec
euclidean
100: 529.85sec
hellinger
100: 529.93sec
mahalanobis
100: 620.90sec
normalized1_l1
100: 528.36sec
normalized1_l2
100: 530.01sec
normalized1_linf
100: 532.74sec
normalized2_l1
100: 535.36sec
normalized2_l2
100: 532.05sec
normalized2_linf
100: 535.99sec
seuclidean
100: 534.11sec
sqeuclidean
100: 527.93sec
braycurtis
100: 532.14sec
canberra
100: 532.42sec
chebyshev
100: 532.79sec
cityblock
100: 533.83sec
correlation
100: 529.34sec
cosine
100: 530.91sec
euclidean
100: 532.54sec
hellinger
100: 533.69sec
mahalanobis
100: 612.19sec
normalized1_l1
100: 532.56sec
normalized1_l2
100: 533.48sec
normalized1_linf
100: 533.26sec
normalized2_l1
100: 532.84sec
normalized2_l2
100: 532.43sec
normalized2_linf
100: 534.67sec
seuclidean
100: 534.31sec
sqeuclidean
100: 528.00sec
braycurtis
100: 534.34sec
canberra
100: 533.20sec
chebyshev
100: 532.95sec

100: 557.23sec
normalized2_l2
100: 554.56sec
normalized2_linf
100: 549.87sec
seuclidean
100: 555.68sec
sqeuclidean
100: 547.22sec
braycurtis
100: 534.43sec
canberra
100: 542.00sec
chebyshev
100: 556.91sec
cityblock
100: 547.21sec
correlation
100: 549.89sec
cosine
100: 549.11sec
euclidean
100: 560.34sec
hellinger
100: 559.78sec
mahalanobis
100: 632.55sec
normalized1_l1
100: 557.74sec
normalized1_l2
100: 558.37sec
normalized1_linf
100: 556.11sec
normalized2_l1
100: 558.78sec
normalized2_l2
100: 530.77sec
normalized2_linf
100: 530.32sec
seuclidean
100: 533.70sec
sqeuclidean
100: 528.83sec
braycurtis
100: 552.61sec
canberra
100: 565.27sec
chebyshev
100: 561.27sec
cityblock
100: 534.50sec
correlation
100: 530.29sec
cosine
100: 537.72sec
euclidean
100: 561.12sec
hellinger
100: 567.32sec
mahalanobis
100: 635.30sec
normalized1_l1
100: 566.73sec
normalized1_l2
100: 563.89sec
normalized1_linf
100: 559.92sec
normalized2_l1
100: 564.82sec
normalized2_l2
100: 573.80sec
normalized2_linf
100: 556.70s

## GCV-G

In [8]:
# Feature whose distance matrices are clustered in the next cell.
feature = 'GCV-G'

# Smallest and largest number of clusters tried (both ends included).
MIN_CLUSTERS, MAX_CLUSTERS = 2, 100

# Distance names: each file name in the feature's matrix directory with its
# last '_'-separated token removed, in sorted order.
all_distances = sorted(
    filename.rpartition('_')[0]
    for filename in os.listdir(f"{MATRIX_DIRECTORY}/{feature}")
)

In [9]:
method = 'kmedoid'

# Repeated k-medoids sweeps: every pass of the outer loop produces one more
# run (random initial medoids) per distance in all_distances, for each
# cluster count from MIN_CLUSTERS to MAX_CLUSTERS inclusive.
for run in range(20):
    for distance in all_distances:
        print(distance)

        CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/{feature}/{distance}/{method}"
        # exist_ok=True replaces the separate "does it exist?" check.
        os.makedirs(CLUSTER_DIRECTORY, exist_ok=True)

        # The column labels of this file are written out as cluster members
        # below; the values are handed to kmedoids as a distance matrix.
        df = pd.read_csv(f"{MATRIX_DIRECTORY}/{feature}/{distance}_BioGRID.txt", delimiter=' ')
        D  = df.values.astype(float)

        # Looked up once per sweep instead of once per cluster count; the
        # file for MAX_CLUSTERS is the last one this sweep writes.
        nr = get_number_of_pre_runs(CLUSTER_DIRECTORY, distance, MAX_CLUSTERS)

        t1 = time.time()
        for n_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS+1):
            initial_medoids = sample(range(len(D)), n_clusters)
            kmedoids_instance = kmedoids(D, initial_medoids, data_type='distance_matrix')
            kmedoids_instance.process()

            # One line per cluster, members separated by single spaces.
            with open(f"{CLUSTER_DIRECTORY}/{nr+1}_{n_clusters}_BioGRID.txt", 'w') as f:
                for cluster in kmedoids_instance.get_clusters():
                    f.write(' '.join(df.columns[cluster]) + '\n')
            t2 = time.time()
            # '\r' keeps the progress on a single, overwritten line.
            print(f'{n_clusters}: {t2-t1:.2f}sec', end='\r')
        print()

braycurtis
100: 561.76sec
canberra
100: 531.72sec
chebyshev
100: 527.84sec
cityblock
100: 533.95sec
correlation
100: 524.77sec
cosine
100: 525.39sec
euclidean
100: 525.04sec
hellinger
100: 524.32sec
mahalanobis
100: 581.10sec
normalized1_l1
100: 524.61sec
normalized1_l2
100: 524.87sec
normalized1_linf
100: 524.35sec
normalized2_l1
100: 525.07sec
normalized2_l2
100: 525.60sec
normalized2_linf
100: 526.75sec
seuclidean
100: 527.51sec
sqeuclidean
100: 523.55sec
braycurtis
100: 526.82sec
canberra
100: 526.51sec
chebyshev
100: 528.18sec
cityblock
100: 526.37sec
correlation
100: 523.67sec
cosine
100: 525.07sec
euclidean
100: 526.84sec
hellinger
100: 526.88sec
mahalanobis
100: 601.67sec
normalized1_l1
100: 526.01sec
normalized1_l2
100: 525.22sec
normalized1_linf
100: 524.86sec
normalized2_l1
100: 524.54sec
normalized2_l2
100: 526.00sec
normalized2_linf
100: 528.11sec
seuclidean
100: 526.09sec
sqeuclidean
100: 517.60sec
braycurtis
100: 521.32sec
canberra
100: 523.22sec
chebyshev
100: 527.74sec

100: 521.13sec
normalized2_l2
100: 521.14sec
normalized2_linf
100: 524.20sec
seuclidean
100: 523.60sec
sqeuclidean
100: 519.12sec
braycurtis
100: 523.18sec
canberra
100: 521.60sec
chebyshev
100: 523.60sec
cityblock
100: 525.57sec
correlation
100: 521.81sec
cosine
100: 521.85sec
euclidean
100: 525.24sec
hellinger
100: 524.13sec
mahalanobis
100: 599.41sec
normalized1_l1
100: 521.87sec
normalized1_l2
100: 523.68sec
normalized1_linf
100: 523.22sec
normalized2_l1
100: 523.19sec
normalized2_l2
100: 523.49sec
normalized2_linf
100: 524.93sec
seuclidean
100: 523.97sec
sqeuclidean
100: 520.38sec
braycurtis
100: 523.19sec
canberra
100: 521.77sec
chebyshev
100: 523.47sec
cityblock
100: 522.36sec
correlation
100: 520.44sec
cosine
100: 520.31sec
euclidean
100: 522.86sec
hellinger
100: 521.90sec
mahalanobis
100: 596.20sec
normalized1_l1
100: 521.67sec
normalized1_l2
100: 521.57sec
normalized1_linf
100: 520.56sec
normalized2_l1
100: 521.65sec
normalized2_l2
100: 521.41sec
normalized2_linf
100: 522.45s

## GCV-DA

In [10]:
# Feature whose distance matrices are clustered in the next cell.
feature = 'GCV-DA'

# Smallest and largest number of clusters tried (both ends included).
MIN_CLUSTERS, MAX_CLUSTERS = 2, 100

# Distance names: each file name in MATRIX_DIRECTORY/<feature> with its last
# '_'-separated token removed, in sorted order.
# NOTE(review): the clustering cell that follows reads matrices from
# MATRIX_DIRECTORY/<network>/<feature> and iterates a fixed set of distances
# rather than all_distances - confirm this listing targets the right folder.
all_distances = sorted(
    filename.rpartition('_')[0]
    for filename in os.listdir(f"{MATRIX_DIRECTORY}/{feature}")
)

In [11]:
network = 'systematic_CoEx_COEXPRESdb'
method  = 'kmedoid'

# Repeated k-medoids sweeps: every pass of the outer loop produces one more
# run (random initial medoids) per distance, for each cluster count from
# MIN_CLUSTERS to MAX_CLUSTERS inclusive.
for run in range(10):
    # Tuple instead of a set literal so the distances are always visited in
    # the same order.
    for distance in ('canberra', 'cityblock', 'hellinger'):
        print(distance)

        CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/{network}/{feature}/{distance}/{method}"
        # exist_ok=True replaces the separate "does it exist?" check.
        os.makedirs(CLUSTER_DIRECTORY, exist_ok=True)

        # The column labels of this file are written out as cluster members
        # below; the values are handed to kmedoids as a distance matrix.
        df = pd.read_csv(f"{MATRIX_DIRECTORY}/{network}/{feature}/{distance}.txt", delimiter=' ')
        D  = df.values.astype(float)

        # Looked up once per sweep: the result only depends on files for
        # MAX_CLUSTERS, and this sweep writes that file last.
        nr = get_number_of_pre_runs(CLUSTER_DIRECTORY, distance, MAX_CLUSTERS)

        t1 = time.time()
        for n_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS+1):
            initial_medoids = sample(range(len(D)), n_clusters)
            kmedoids_instance = kmedoids(D, initial_medoids, data_type='distance_matrix')
            kmedoids_instance.process()

            # One line per cluster, members separated by single spaces.
            with open(f"{CLUSTER_DIRECTORY}/{nr+1}_{n_clusters}.txt", 'w') as f:
                for cluster in kmedoids_instance.get_clusters():
                    f.write(' '.join(df.columns[cluster]) + '\n')
            t2 = time.time()
            # '\r' keeps the progress on a single, overwritten line.
            print(f'{n_clusters}: {t2-t1:.2f}sec', end='\r')
        print()

braycurtis
100: 527.33sec
canberra
100: 518.36sec
chebyshev
100: 526.71sec
cityblock
100: 527.16sec
correlation
100: 543.70sec
cosine
100: 533.77sec
euclidean
100: 525.15sec
hellinger
100: 518.82sec
mahalanobis
100: 668.69sec
normalized1_l1
100: 518.27sec
normalized1_l2
100: 520.02sec
normalized1_linf
100: 519.67sec
normalized2_l1
100: 517.79sec
normalized2_l2
100: 520.53sec
normalized2_linf
100: 527.32sec
seuclidean
100: 523.70sec
sqeuclidean
100: 523.57sec
braycurtis
100: 526.37sec
canberra
100: 518.04sec
chebyshev
100: 527.64sec
cityblock
100: 526.18sec
correlation
100: 543.80sec
cosine
100: 532.41sec
euclidean
100: 525.68sec
hellinger
100: 520.99sec
mahalanobis
100: 648.86sec
normalized1_l1
100: 518.31sec
normalized1_l2
100: 517.83sec
normalized1_linf
100: 519.07sec
normalized2_l1
100: 517.51sec
normalized2_l2
100: 518.02sec
normalized2_linf
100: 524.27sec
seuclidean
100: 523.65sec
sqeuclidean
100: 524.23sec
braycurtis
100: 527.25sec
canberra
100: 518.91sec
chebyshev
100: 528.88sec

KeyboardInterrupt: 

## GCV-DG

In [4]:
# Feature whose distance matrices are clustered in the next cell.
feature = 'GCV-DG'

# Smallest and largest number of clusters tried (both ends included).
MIN_CLUSTERS, MAX_CLUSTERS = 2, 100

# Distance names: each file name in MATRIX_DIRECTORY/<feature> with its last
# '_'-separated token removed, in sorted order.
# NOTE(review): the clustering cell that follows reads matrices from
# MATRIX_DIRECTORY/<network>/<feature> and iterates a fixed set of distances
# rather than all_distances - confirm this listing targets the right folder.
all_distances = sorted(
    filename.rpartition('_')[0]
    for filename in os.listdir(f"{MATRIX_DIRECTORY}/{feature}")
)

In [None]:
network = 'systematic_CoEx_COEXPRESdb'
method  = 'kmedoid'

# Repeated k-medoids sweeps: every pass of the outer loop produces one more
# run (random initial medoids) per distance, for each cluster count from
# MIN_CLUSTERS to MAX_CLUSTERS inclusive.
for run in range(10):
    # Tuple instead of a set literal so the distances are always visited in
    # the same order.
    for distance in ('canberra', 'cityblock', 'hellinger'):
        print(distance)

        CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/{network}/{feature}/{distance}/{method}"
        # exist_ok=True replaces the separate "does it exist?" check.
        os.makedirs(CLUSTER_DIRECTORY, exist_ok=True)

        # The column labels of this file are written out as cluster members
        # below; the values are handed to kmedoids as a distance matrix.
        df = pd.read_csv(f"{MATRIX_DIRECTORY}/{network}/{feature}/{distance}.txt", delimiter=' ')
        D  = df.values.astype(float)

        # Looked up once per sweep: the result only depends on files for
        # MAX_CLUSTERS, and this sweep writes that file last.
        nr = get_number_of_pre_runs(CLUSTER_DIRECTORY, distance, MAX_CLUSTERS)

        t1 = time.time()
        for n_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS+1):
            initial_medoids = sample(range(len(D)), n_clusters)
            kmedoids_instance = kmedoids(D, initial_medoids, data_type='distance_matrix')
            kmedoids_instance.process()

            # One line per cluster, members separated by single spaces.
            with open(f"{CLUSTER_DIRECTORY}/{nr+1}_{n_clusters}.txt", 'w') as f:
                for cluster in kmedoids_instance.get_clusters():
                    f.write(' '.join(df.columns[cluster]) + '\n')
            t2 = time.time()
            # '\r' keeps the progress on a single, overwritten line.
            print(f'{n_clusters}: {t2-t1:.2f}sec', end='\r')
        print()

hellinger
100: 560.68sec
hellinger
100: 542.59sec
hellinger
100: 689.55sec
hellinger
100: 667.81sec
hellinger
100: 671.09sec
hellinger
100: 672.14sec
hellinger
100: 768.94sec
hellinger
100: 672.18sec
hellinger
100: 550.07sec
hellinger
17: 89.37sec

## GCV-all

In [None]:
# Feature whose distance matrices are clustered in the next cell.
feature = 'GCV-all'

# Smallest and largest number of clusters tried (both ends included).
MIN_CLUSTERS, MAX_CLUSTERS = 2, 100

# Distance names: each file name in the feature's matrix directory with its
# last '_'-separated token removed, in sorted order.
all_distances = sorted(
    filename.rpartition('_')[0]
    for filename in os.listdir(f"{MATRIX_DIRECTORY}/{feature}")
)

In [None]:
method = 'kmedoid'

# Repeated k-medoids sweeps: every pass of the outer loop produces one more
# run (random initial medoids) per distance in all_distances, for each
# cluster count from MIN_CLUSTERS to MAX_CLUSTERS inclusive.
for run in range(20):
    for distance in all_distances:
        print(distance)

        CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/{feature}/{distance}/{method}"
        # exist_ok=True replaces the separate "does it exist?" check.
        os.makedirs(CLUSTER_DIRECTORY, exist_ok=True)

        # The column labels of this file are written out as cluster members
        # below; the values are handed to kmedoids as a distance matrix.
        df = pd.read_csv(f"{MATRIX_DIRECTORY}/{feature}/{distance}_BioGRID.txt", delimiter=' ')
        D  = df.values.astype(float)

        # Looked up once per sweep instead of once per cluster count; the
        # file for MAX_CLUSTERS is the last one this sweep writes.
        nr = get_number_of_pre_runs(CLUSTER_DIRECTORY, distance, MAX_CLUSTERS)

        t1 = time.time()
        for n_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS+1):
            initial_medoids = sample(range(len(D)), n_clusters)
            kmedoids_instance = kmedoids(D, initial_medoids, data_type='distance_matrix')
            kmedoids_instance.process()

            # One line per cluster, members separated by single spaces.
            with open(f"{CLUSTER_DIRECTORY}/{nr+1}_{n_clusters}_BioGRID.txt", 'w') as f:
                for cluster in kmedoids_instance.get_clusters():
                    f.write(' '.join(df.columns[cluster]) + '\n')
            t2 = time.time()
            # '\r' keeps the progress on a single, overwritten line.
            print(f'{n_clusters}: {t2-t1:.2f}sec', end='\r')
        print()

## GCV-nonredundant

In [None]:
# Feature whose distance matrices are clustered in the next cell.
feature = 'GCV-nonredundant'

# Smallest and largest number of clusters tried (both ends included).
MIN_CLUSTERS, MAX_CLUSTERS = 2, 100

# Distance names: each file name in the feature's matrix directory with its
# last '_'-separated token removed, in sorted order.
all_distances = sorted(
    filename.rpartition('_')[0]
    for filename in os.listdir(f"{MATRIX_DIRECTORY}/{feature}")
)

In [None]:
method = 'kmedoid'

# Repeated k-medoids sweeps: every pass of the outer loop produces one more
# run (random initial medoids) per distance in all_distances, for each
# cluster count from MIN_CLUSTERS to MAX_CLUSTERS inclusive.
for run in range(20):
    for distance in all_distances:
        print(distance)

        CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/{feature}/{distance}/{method}"
        # exist_ok=True replaces the separate "does it exist?" check.
        os.makedirs(CLUSTER_DIRECTORY, exist_ok=True)

        # The column labels of this file are written out as cluster members
        # below; the values are handed to kmedoids as a distance matrix.
        df = pd.read_csv(f"{MATRIX_DIRECTORY}/{feature}/{distance}_BioGRID.txt", delimiter=' ')
        D  = df.values.astype(float)

        # Looked up once per sweep instead of once per cluster count; the
        # file for MAX_CLUSTERS is the last one this sweep writes.
        nr = get_number_of_pre_runs(CLUSTER_DIRECTORY, distance, MAX_CLUSTERS)

        t1 = time.time()
        for n_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS+1):
            initial_medoids = sample(range(len(D)), n_clusters)
            kmedoids_instance = kmedoids(D, initial_medoids, data_type='distance_matrix')
            kmedoids_instance.process()

            # One line per cluster, members separated by single spaces.
            with open(f"{CLUSTER_DIRECTORY}/{nr+1}_{n_clusters}_BioGRID.txt", 'w') as f:
                for cluster in kmedoids_instance.get_clusters():
                    f.write(' '.join(df.columns[cluster]) + '\n')
            t2 = time.time()
            # '\r' keeps the progress on a single, overwritten line.
            print(f'{n_clusters}: {t2-t1:.2f}sec', end='\r')
        print()

## GCV-orca

In [4]:
# Feature whose distance matrices are clustered in the next cell.
feature = 'GCV-orca'

# Smallest and largest number of clusters tried (both ends included).
MIN_CLUSTERS, MAX_CLUSTERS = 2, 100

# Distance names: each file name in the feature's matrix directory with its
# last '_'-separated token removed, in sorted order.
all_distances = sorted(
    filename.rpartition('_')[0]
    for filename in os.listdir(f"{MATRIX_DIRECTORY}/{feature}")
)

In [5]:
method = 'kmedoid'

# Repeated k-medoids sweeps: every pass of the outer loop produces one more
# run (random initial medoids) for the listed distance, for each cluster
# count from MIN_CLUSTERS to MAX_CLUSTERS inclusive.
for run in range(30):
    # Tuple instead of a set literal: keeps a fixed order if more distances
    # are added here.
    for distance in ('hellinger',):
        print(distance)

        CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/{feature}/{distance}/{method}"
        # exist_ok=True replaces the separate "does it exist?" check.
        os.makedirs(CLUSTER_DIRECTORY, exist_ok=True)

        # The column labels of this file are written out as cluster members
        # below; the values are handed to kmedoids as a distance matrix.
        df = pd.read_csv(f"{MATRIX_DIRECTORY}/{feature}/{distance}_BioGRID.txt", delimiter=' ')
        D  = df.values.astype(float)

        # Looked up once per sweep instead of once per cluster count; the
        # file for MAX_CLUSTERS is the last one this sweep writes.
        nr = get_number_of_pre_runs(CLUSTER_DIRECTORY, distance, MAX_CLUSTERS)

        t1 = time.time()
        for n_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS+1):
            initial_medoids = sample(range(len(D)), n_clusters)
            kmedoids_instance = kmedoids(D, initial_medoids, data_type='distance_matrix')
            kmedoids_instance.process()

            # One line per cluster, members separated by single spaces.
            with open(f"{CLUSTER_DIRECTORY}/{nr+1}_{n_clusters}_BioGRID.txt", 'w') as f:
                for cluster in kmedoids_instance.get_clusters():
                    f.write(' '.join(df.columns[cluster]) + '\n')
            t2 = time.time()
            # '\r' keeps the progress on a single, overwritten line.
            print(f'{n_clusters}: {t2-t1:.2f}sec', end='\r')
        print()

hellinger
100: 575.57sec
hellinger
100: 649.15sec
hellinger
100: 574.86sec
hellinger
100: 543.26sec
hellinger
100: 536.14sec
hellinger
100: 533.73sec
hellinger
100: 544.79sec
hellinger
100: 531.49sec
hellinger
100: 544.60sec
hellinger
100: 541.00sec
hellinger
100: 553.65sec
hellinger
100: 582.14sec
hellinger
100: 640.73sec
hellinger
100: 556.96sec
hellinger
100: 572.19sec
hellinger
13: 70.58sec

KeyboardInterrupt: 

## GCV-orca+

In [None]:
# Feature whose distance matrices are clustered in the next cell.
feature = 'GCV-orca+'

# Smallest and largest number of clusters tried (both ends included).
MIN_CLUSTERS, MAX_CLUSTERS = 2, 100

# Distance names: each file name in the feature's matrix directory with its
# last '_'-separated token removed, in sorted order.
all_distances = sorted(
    filename.rpartition('_')[0]
    for filename in os.listdir(f"{MATRIX_DIRECTORY}/{feature}")
)

In [None]:
method = 'kmedoid'

# Repeated k-medoids sweeps: every pass of the outer loop produces one more
# run (random initial medoids) per distance in all_distances, for each
# cluster count from MIN_CLUSTERS to MAX_CLUSTERS inclusive.
for run in range(10):
    for distance in all_distances:
        print(distance)

        CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/{feature}/{distance}/{method}"
        # exist_ok=True replaces the separate "does it exist?" check.
        os.makedirs(CLUSTER_DIRECTORY, exist_ok=True)

        # The column labels of this file are written out as cluster members
        # below; the values are handed to kmedoids as a distance matrix.
        df = pd.read_csv(f"{MATRIX_DIRECTORY}/{feature}/{distance}_BioGRID.txt", delimiter=' ')
        D  = df.values.astype(float)

        # Looked up once per sweep instead of once per cluster count; the
        # file for MAX_CLUSTERS is the last one this sweep writes.
        nr = get_number_of_pre_runs(CLUSTER_DIRECTORY, distance, MAX_CLUSTERS)

        t1 = time.time()
        for n_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS+1):
            initial_medoids = sample(range(len(D)), n_clusters)
            kmedoids_instance = kmedoids(D, initial_medoids, data_type='distance_matrix')
            kmedoids_instance.process()

            # One line per cluster, members separated by single spaces.
            with open(f"{CLUSTER_DIRECTORY}/{nr+1}_{n_clusters}_BioGRID.txt", 'w') as f:
                for cluster in kmedoids_instance.get_clusters():
                    f.write(' '.join(df.columns[cluster]) + '\n')
            t2 = time.time()
            # '\r' keeps the progress on a single, overwritten line.
            print(f'{n_clusters}: {t2-t1:.2f}sec', end='\r')
        print()