In [1]:
from pyclustering.cluster.kmedoids import kmedoids
from functools import partial
from random import sample

import os
import time
import numpy as np
import pandas as pd
import networkx as nx

In [2]:
# Root of the data tree. When the DATA_DIRECTORY environment variable is set it
# takes precedence, so the notebook is not tied to one machine's mount point;
# otherwise the original location is used.
DATA_DIRECTORY = os.environ.get("DATA_DIRECTORY", "/media/clusterduck123/joe/data")

# Every other location is derived from the yeast sub-tree of the data root.
YEAST_DIRECTORY = f"{DATA_DIRECTORY}/processed-data/yeast"
NETWORK_DIRECTORY = f"{YEAST_DIRECTORY}/networks"
# The clustering cells read "<MATRIX_DIRECTORY>/<feature>/<distance>_BioGRID.txt".
MATRIX_DIRECTORY  = f"{YEAST_DIRECTORY}/distance-matrices"
ANNOTATION_DIRECTORY = f"{YEAST_DIRECTORY}/annotations"

# Clustering

In [3]:
def get_number_of_pre_runs(CLUSTER_DIRECTORY, distance, n_clusters = 99):
    """Return the highest run index already stored for a given cluster count.

    Entries of CLUSTER_DIRECTORY are expected to be named
    "<run>_<n_clusters>_<suffix>". Only entries whose middle part equals
    str(n_clusters) are considered.

    Parameters
    ----------
    CLUSTER_DIRECTORY : str
        Directory whose entries are scanned (must exist).
    distance : str
        Unused; kept so that existing callers keep working.
    n_clusters : int
        Cluster count used to select the entries.

    Returns
    -------
    int
        The largest run index found, or -1 when there is no matching entry.

    Entries that do not follow the naming scheme (a different number of
    '_'-separated parts, or a non-integer run prefix) are ignored instead of
    raising ValueError, so stray files such as ".ipynb_checkpoints" do not
    break the scan.
    """
    wanted = str(n_clusters)
    pre_runs = []
    for name in os.listdir(CLUSTER_DIRECTORY):
        parts = name.split('_')
        if len(parts) != 3 or parts[1] != wanted:
            continue
        try:
            pre_runs.append(int(parts[0]))
        except ValueError:
            # Right shape but no numeric run prefix: not one of our files.
            continue
    return max(pre_runs, default=-1)

## GDV

In [9]:
feature = 'GDV'

# Inclusive range of cluster counts tried by the clustering cell.
MIN_CLUSTERS = 2
MAX_CLUSTERS = 100

# A distance name is a matrix file name with its last '_'-separated part removed.
all_distances = [matrix_file.rpartition('_')[0]
                 for matrix_file in os.listdir(f"{MATRIX_DIRECTORY}/{feature}")]
all_distances.sort()

In [10]:
method = 'kmedoid'

# Every pass of the outer loop stores one more complete run: one clustering per
# cluster count in MIN_CLUSTERS..MAX_CLUSTERS for each selected distance.
# NOTE(review): the recorded output of this cell is a FileNotFoundError for
# GDV/hellinger_BioGRID.txt - confirm whether that matrix still has to be
# generated or whether `all_distances` was meant here.
for run in range(30):
    for distance in {'hellinger'}:
        print(distance)

        # Read the matrix first: if the file is missing, the cell fails before
        # an empty output directory is created for that distance.
        df = pd.read_csv(f"{MATRIX_DIRECTORY}/{feature}/{distance}_BioGRID.txt", delimiter=' ')
        D  = df.values.astype(float)

        CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/{feature}/{distance}/{method}"
        # exist_ok replaces the racy "check, then create" pair.
        os.makedirs(CLUSTER_DIRECTORY, exist_ok=True)

        # The previous run index is looked up via the MAX_CLUSTERS files, and
        # this run writes its MAX_CLUSTERS file only in the last iteration
        # below, so one directory scan per run is enough.
        nr = get_number_of_pre_runs(CLUSTER_DIRECTORY, distance, MAX_CLUSTERS)

        t1 = time.time()
        for n_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS+1):
            # Initial medoids: n_clusters distinct random positions in D.
            initial_medoids = sample(range(len(D)), n_clusters)
            kmedoids_instance = kmedoids(D, initial_medoids, data_type='distance_matrix')
            kmedoids_instance.process()

            # One line per cluster: the column labels selected by that cluster.
            with open(f"{CLUSTER_DIRECTORY}/{nr+1}_{n_clusters}_BioGRID.txt", 'w') as f:
                for cluster in kmedoids_instance.get_clusters():
                    f.write(' '.join(df.columns[cluster]) + '\n')
            t2 = time.time()
            # '\r' makes each progress report overwrite the previous one.
            print(f'{n_clusters}: {t2-t1:.2f}sec', end='\r')
        print()

hellinger


FileNotFoundError: [Errno 2] File b'/media/clusterduck123/joe/data/processed-data/yeast/distance-matrices/GDV/hellinger_BioGRID.txt' does not exist: b'/media/clusterduck123/joe/data/processed-data/yeast/distance-matrices/GDV/hellinger_BioGRID.txt'

## GCV-A

In [4]:
feature = 'GCV-A'

# Inclusive range of cluster counts tried by the clustering cell.
MIN_CLUSTERS = 2
MAX_CLUSTERS = 100

# A distance name is a matrix file name with its last '_'-separated part removed.
all_distances = [matrix_file.rpartition('_')[0]
                 for matrix_file in os.listdir(f"{MATRIX_DIRECTORY}/{feature}")]
all_distances.sort()

In [5]:
method = 'kmedoid'

# Every pass of the outer loop stores one more complete run: one clustering per
# cluster count in MIN_CLUSTERS..MAX_CLUSTERS for each available distance.
for run in range(30):
    for distance in all_distances:
        print(distance)

        # Read the matrix first: if the file is missing, the cell fails before
        # an empty output directory is created for that distance.
        df = pd.read_csv(f"{MATRIX_DIRECTORY}/{feature}/{distance}_BioGRID.txt", delimiter=' ')
        D  = df.values.astype(float)

        CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/{feature}/{distance}/{method}"
        # exist_ok replaces the racy "check, then create" pair.
        os.makedirs(CLUSTER_DIRECTORY, exist_ok=True)

        # The previous run index is looked up via the MAX_CLUSTERS files, and
        # this run writes its MAX_CLUSTERS file only in the last iteration
        # below, so one directory scan per run is enough.
        nr = get_number_of_pre_runs(CLUSTER_DIRECTORY, distance, MAX_CLUSTERS)

        t1 = time.time()
        for n_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS+1):
            # Initial medoids: n_clusters distinct random positions in D.
            initial_medoids = sample(range(len(D)), n_clusters)
            kmedoids_instance = kmedoids(D, initial_medoids, data_type='distance_matrix')
            kmedoids_instance.process()

            # One line per cluster: the column labels selected by that cluster.
            with open(f"{CLUSTER_DIRECTORY}/{nr+1}_{n_clusters}_BioGRID.txt", 'w') as f:
                for cluster in kmedoids_instance.get_clusters():
                    f.write(' '.join(df.columns[cluster]) + '\n')
            t2 = time.time()
            # '\r' makes each progress report overwrite the previous one.
            print(f'{n_clusters}: {t2-t1:.2f}sec', end='\r')
        print()

normalized1-linf
100: 567.56sec
normalized1-l2
100: 553.66sec
normalized1-l1
100: 568.33sec
normalized1-linf
100: 566.88sec
normalized1-l2
100: 543.78sec
normalized1-l1
100: 542.83sec
normalized1-linf
100: 562.03sec
normalized1-l2
100: 561.84sec
normalized1-l1
100: 565.90sec
normalized1-linf
100: 585.38sec
normalized1-l2
100: 553.77sec
normalized1-l1
100: 569.14sec
normalized1-linf
100: 544.02sec
normalized1-l2
100: 557.46sec
normalized1-l1
100: 550.29sec
normalized1-linf
100: 561.53sec
normalized1-l2
100: 546.45sec
normalized1-l1
100: 565.12sec
normalized1-linf
100: 555.60sec
normalized1-l2
100: 548.69sec
normalized1-l1
100: 579.96sec
normalized1-linf
100: 568.23sec
normalized1-l2
100: 558.88sec
normalized1-l1
100: 575.31sec
normalized1-linf
100: 552.40sec
normalized1-l2
100: 550.22sec
normalized1-l1
100: 570.18sec
normalized1-linf
100: 589.95sec
normalized1-l2
100: 594.89sec
normalized1-l1
100: 600.24sec
normalized1-linf
100: 591.80sec
normalized1-l2
100: 576.68sec
normalized1-l1
100

## GCV-G

In [None]:
feature = 'GCV-G'

# Inclusive range of cluster counts tried by the clustering cell.
MIN_CLUSTERS = 2
MAX_CLUSTERS = 100

# A distance name is a matrix file name with its last '_'-separated part removed.
all_distances = [matrix_file.rpartition('_')[0]
                 for matrix_file in os.listdir(f"{MATRIX_DIRECTORY}/{feature}")]
all_distances.sort()

In [None]:
method = 'kmedoid'

# Every pass of the outer loop stores one more complete run: one clustering per
# cluster count in MIN_CLUSTERS..MAX_CLUSTERS for each available distance.
for run in range(30):
    for distance in all_distances:
        print(distance)

        # Read the matrix first: if the file is missing, the cell fails before
        # an empty output directory is created for that distance.
        df = pd.read_csv(f"{MATRIX_DIRECTORY}/{feature}/{distance}_BioGRID.txt", delimiter=' ')
        D  = df.values.astype(float)

        CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/{feature}/{distance}/{method}"
        # exist_ok replaces the racy "check, then create" pair.
        os.makedirs(CLUSTER_DIRECTORY, exist_ok=True)

        # The previous run index is looked up via the MAX_CLUSTERS files, and
        # this run writes its MAX_CLUSTERS file only in the last iteration
        # below, so one directory scan per run is enough.
        nr = get_number_of_pre_runs(CLUSTER_DIRECTORY, distance, MAX_CLUSTERS)

        t1 = time.time()
        for n_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS+1):
            # Initial medoids: n_clusters distinct random positions in D.
            initial_medoids = sample(range(len(D)), n_clusters)
            kmedoids_instance = kmedoids(D, initial_medoids, data_type='distance_matrix')
            kmedoids_instance.process()

            # One line per cluster: the column labels selected by that cluster.
            with open(f"{CLUSTER_DIRECTORY}/{nr+1}_{n_clusters}_BioGRID.txt", 'w') as f:
                for cluster in kmedoids_instance.get_clusters():
                    f.write(' '.join(df.columns[cluster]) + '\n')
            t2 = time.time()
            # '\r' makes each progress report overwrite the previous one.
            print(f'{n_clusters}: {t2-t1:.2f}sec', end='\r')
        print()

## GCV-DA

In [4]:
feature = 'GCV-DA'

# Inclusive range of cluster counts tried by the clustering cell.
MIN_CLUSTERS = 2
MAX_CLUSTERS = 100

# A distance name is a matrix file name with its last '_'-separated part removed.
all_distances = [matrix_file.rpartition('_')[0]
                 for matrix_file in os.listdir(f"{MATRIX_DIRECTORY}/{feature}")]
all_distances.sort()

In [5]:
method = 'kmedoid'

# Every pass of the outer loop stores one more complete run: one clustering per
# cluster count in MIN_CLUSTERS..MAX_CLUSTERS for each available distance.
for run in range(5):
    for distance in all_distances:
        print(distance)

        # Read the matrix first: if the file is missing, the cell fails before
        # an empty output directory is created for that distance.
        df = pd.read_csv(f"{MATRIX_DIRECTORY}/{feature}/{distance}_BioGRID.txt", delimiter=' ')
        D  = df.values.astype(float)

        CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/{feature}/{distance}/{method}"
        # exist_ok replaces the racy "check, then create" pair.
        os.makedirs(CLUSTER_DIRECTORY, exist_ok=True)

        # The previous run index is looked up via the MAX_CLUSTERS files, and
        # this run writes its MAX_CLUSTERS file only in the last iteration
        # below, so one directory scan per run is enough.
        nr = get_number_of_pre_runs(CLUSTER_DIRECTORY, distance, MAX_CLUSTERS)

        t1 = time.time()
        for n_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS+1):
            # Initial medoids: n_clusters distinct random positions in D.
            initial_medoids = sample(range(len(D)), n_clusters)
            kmedoids_instance = kmedoids(D, initial_medoids, data_type='distance_matrix')
            kmedoids_instance.process()

            # One line per cluster: the column labels selected by that cluster.
            with open(f"{CLUSTER_DIRECTORY}/{nr+1}_{n_clusters}_BioGRID.txt", 'w') as f:
                for cluster in kmedoids_instance.get_clusters():
                    f.write(' '.join(df.columns[cluster]) + '\n')
            t2 = time.time()
            # '\r' makes each progress report overwrite the previous one.
            print(f'{n_clusters}: {t2-t1:.2f}sec', end='\r')
        print()

braycurtis
100: 655.56sec
canberra
100: 648.68sec
chebyshev
100: 684.83sec
cityblock
100: 634.92sec
correlation
100: 553.61sec
cosine
100: 571.81sec
euclidean
100: 580.79sec
mahalanobis
100: 858.79sec
normalized1_l1
100: 562.67sec
normalized1_l2
100: 547.87sec
normalized1_linf
100: 549.53sec
normalized2_l1
100: 555.34sec
normalized2_l2
100: 556.45sec
normalized2_linf
100: 551.65sec
seuclidean
100: 594.77sec
sqeuclidean
100: 564.47sec
braycurtis
100: 582.12sec
canberra
100: 610.63sec
chebyshev
100: 606.04sec
cityblock
100: 598.08sec
correlation
100: 624.48sec
cosine
100: 604.16sec
euclidean
100: 589.17sec
mahalanobis
100: 839.64sec
normalized1_l1
100: 577.07sec
normalized1_l2
100: 596.52sec
normalized1_linf
100: 578.79sec
normalized2_l1
100: 558.39sec
normalized2_l2
100: 566.19sec
normalized2_linf
100: 596.35sec
seuclidean
100: 586.19sec
sqeuclidean
100: 639.95sec
braycurtis
100: 617.50sec
canberra
100: 605.71sec
chebyshev
100: 581.36sec
cityblock
100: 604.22sec
correlation
100: 645.82s

## GCV-DG

In [4]:
feature = 'GCV-DG'

# Inclusive range of cluster counts tried by the clustering cell.
MIN_CLUSTERS = 2
MAX_CLUSTERS = 100

# A distance name is a matrix file name with its last '_'-separated part removed.
all_distances = [matrix_file.rpartition('_')[0]
                 for matrix_file in os.listdir(f"{MATRIX_DIRECTORY}/{feature}")]
all_distances.sort()

In [5]:
method = 'kmedoid'

# Every pass of the outer loop stores one more complete run: one clustering per
# cluster count in MIN_CLUSTERS..MAX_CLUSTERS for each selected distance.
for run in range(30):
    for distance in {'hellinger'}:
        print(distance)

        # Read the matrix first: if the file is missing, the cell fails before
        # an empty output directory is created for that distance.
        df = pd.read_csv(f"{MATRIX_DIRECTORY}/{feature}/{distance}_BioGRID.txt", delimiter=' ')
        D  = df.values.astype(float)

        CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/{feature}/{distance}/{method}"
        # exist_ok replaces the racy "check, then create" pair.
        os.makedirs(CLUSTER_DIRECTORY, exist_ok=True)

        # The previous run index is looked up via the MAX_CLUSTERS files, and
        # this run writes its MAX_CLUSTERS file only in the last iteration
        # below, so one directory scan per run is enough.
        nr = get_number_of_pre_runs(CLUSTER_DIRECTORY, distance, MAX_CLUSTERS)

        t1 = time.time()
        for n_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS+1):
            # Initial medoids: n_clusters distinct random positions in D.
            initial_medoids = sample(range(len(D)), n_clusters)
            kmedoids_instance = kmedoids(D, initial_medoids, data_type='distance_matrix')
            kmedoids_instance.process()

            # One line per cluster: the column labels selected by that cluster.
            with open(f"{CLUSTER_DIRECTORY}/{nr+1}_{n_clusters}_BioGRID.txt", 'w') as f:
                for cluster in kmedoids_instance.get_clusters():
                    f.write(' '.join(df.columns[cluster]) + '\n')
            t2 = time.time()
            # '\r' makes each progress report overwrite the previous one.
            print(f'{n_clusters}: {t2-t1:.2f}sec', end='\r')
        print()

hellinger
100: 691.08sec
hellinger
100: 758.33sec
hellinger
100: 680.51sec
hellinger
100: 670.54sec
hellinger
100: 733.63sec
hellinger
100: 675.36sec
hellinger
100: 636.90sec
hellinger
100: 607.97sec
hellinger
100: 610.42sec
hellinger
100: 617.80sec
hellinger
100: 559.15sec
hellinger
100: 553.84sec
hellinger
100: 551.63sec
hellinger
100: 548.57sec
hellinger
100: 556.80sec
hellinger
100: 553.30sec
hellinger
100: 549.22sec
hellinger
100: 563.44sec
hellinger
100: 556.07sec
hellinger
100: 544.23sec
hellinger
100: 552.30sec
hellinger
100: 557.16sec
hellinger
100: 547.39sec
hellinger
100: 556.67sec
hellinger
100: 555.44sec
hellinger
100: 543.67sec
hellinger
100: 545.06sec
hellinger
100: 555.96sec
hellinger
100: 540.82sec
hellinger
100: 536.18sec


## GCV-all1

In [6]:
feature = 'GCV-all1'

# Inclusive range of cluster counts tried by the clustering cell.
MIN_CLUSTERS = 2
MAX_CLUSTERS = 100

# A distance name is a matrix file name with its last '_'-separated part removed.
all_distances = [matrix_file.rpartition('_')[0]
                 for matrix_file in os.listdir(f"{MATRIX_DIRECTORY}/{feature}")]
all_distances.sort()

In [7]:
method = 'kmedoid'

# Every pass of the outer loop stores one more complete run: one clustering per
# cluster count in MIN_CLUSTERS..MAX_CLUSTERS for each available distance.
for run in range(10):
    for distance in all_distances:
        print(distance)

        # Read the matrix first: if the file is missing, the cell fails before
        # an empty output directory is created for that distance.
        df = pd.read_csv(f"{MATRIX_DIRECTORY}/{feature}/{distance}_BioGRID.txt", delimiter=' ')
        D  = df.values.astype(float)

        CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/{feature}/{distance}/{method}"
        # exist_ok replaces the racy "check, then create" pair.
        os.makedirs(CLUSTER_DIRECTORY, exist_ok=True)

        # The previous run index is looked up via the MAX_CLUSTERS files, and
        # this run writes its MAX_CLUSTERS file only in the last iteration
        # below, so one directory scan per run is enough.
        nr = get_number_of_pre_runs(CLUSTER_DIRECTORY, distance, MAX_CLUSTERS)

        t1 = time.time()
        for n_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS+1):
            # Initial medoids: n_clusters distinct random positions in D.
            initial_medoids = sample(range(len(D)), n_clusters)
            kmedoids_instance = kmedoids(D, initial_medoids, data_type='distance_matrix')
            kmedoids_instance.process()

            # One line per cluster: the column labels selected by that cluster.
            with open(f"{CLUSTER_DIRECTORY}/{nr+1}_{n_clusters}_BioGRID.txt", 'w') as f:
                for cluster in kmedoids_instance.get_clusters():
                    f.write(' '.join(df.columns[cluster]) + '\n')
            t2 = time.time()
            # '\r' makes each progress report overwrite the previous one.
            print(f'{n_clusters}: {t2-t1:.2f}sec', end='\r')
        print()

braycurtis
100: 533.07sec
canberra
100: 528.90sec
chebyshev
100: 548.92sec
cityblock
100: 531.91sec
correlation
100: 530.68sec
cosine
100: 542.59sec
euclidean
100: 543.28sec
hellinger
100: 532.75sec
mahalanobis
100: 650.96sec
normalized1_l1
100: 537.94sec
normalized1_l2
100: 529.21sec
normalized1_linf
100: 553.13sec
normalized2_l1
100: 530.69sec
normalized2_l2
100: 529.82sec
normalized2_linf
100: 550.64sec
seuclidean
100: 530.16sec
sqeuclidean
100: 528.86sec
braycurtis
100: 539.73sec
canberra
100: 523.87sec
chebyshev
100: 529.24sec
cityblock
100: 536.14sec
correlation
100: 542.37sec
cosine
100: 524.20sec
euclidean
100: 541.84sec
hellinger
100: 538.05sec
mahalanobis
100: 632.77sec
normalized1_l1
100: 548.03sec
normalized1_l2
100: 537.49sec
normalized1_linf
100: 528.24sec
normalized2_l1
100: 543.54sec
normalized2_l2
100: 531.62sec
normalized2_linf
100: 529.66sec
seuclidean
100: 551.03sec
sqeuclidean
100: 528.48sec
braycurtis
100: 529.10sec
canberra
100: 537.26sec
chebyshev
100: 535.67sec

KeyboardInterrupt: 

## GCV-all2

In [6]:
feature = 'GCV-all2'

# Inclusive range of cluster counts tried by the clustering cell.
MIN_CLUSTERS = 2
MAX_CLUSTERS = 100

# A distance name is a matrix file name with its last '_'-separated part removed.
all_distances = [matrix_file.rpartition('_')[0]
                 for matrix_file in os.listdir(f"{MATRIX_DIRECTORY}/{feature}")]
all_distances.sort()

In [7]:
method = 'kmedoid'

# Every pass of the outer loop stores one more complete run: one clustering per
# cluster count in MIN_CLUSTERS..MAX_CLUSTERS for each selected distance.
for run in range(10):
    for distance in {'canberra'}:
        print(distance)

        # Read the matrix first: if the file is missing, the cell fails before
        # an empty output directory is created for that distance.
        df = pd.read_csv(f"{MATRIX_DIRECTORY}/{feature}/{distance}_BioGRID.txt", delimiter=' ')
        D  = df.values.astype(float)

        CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/{feature}/{distance}/{method}"
        # exist_ok replaces the racy "check, then create" pair.
        os.makedirs(CLUSTER_DIRECTORY, exist_ok=True)

        # The previous run index is looked up via the MAX_CLUSTERS files, and
        # this run writes its MAX_CLUSTERS file only in the last iteration
        # below, so one directory scan per run is enough.
        nr = get_number_of_pre_runs(CLUSTER_DIRECTORY, distance, MAX_CLUSTERS)

        t1 = time.time()
        for n_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS+1):
            # Initial medoids: n_clusters distinct random positions in D.
            initial_medoids = sample(range(len(D)), n_clusters)
            kmedoids_instance = kmedoids(D, initial_medoids, data_type='distance_matrix')
            kmedoids_instance.process()

            # One line per cluster: the column labels selected by that cluster.
            with open(f"{CLUSTER_DIRECTORY}/{nr+1}_{n_clusters}_BioGRID.txt", 'w') as f:
                for cluster in kmedoids_instance.get_clusters():
                    f.write(' '.join(df.columns[cluster]) + '\n')
            t2 = time.time()
            # '\r' makes each progress report overwrite the previous one.
            print(f'{n_clusters}: {t2-t1:.2f}sec', end='\r')
        print()

canberra
100: 562.88sec
canberra
100: 580.86sec
canberra
100: 557.69sec
canberra
100: 554.52sec
canberra
100: 568.93sec
canberra
100: 568.95sec
canberra
100: 593.80sec
canberra
100: 558.38sec
canberra
100: 559.47sec
canberra
100: 526.38sec


## GCV-orca

In [8]:
feature = 'GCV-orca'

# Inclusive range of cluster counts tried by the clustering cell.
MIN_CLUSTERS = 2
MAX_CLUSTERS = 100

# A distance name is a matrix file name with its last '_'-separated part removed.
all_distances = [matrix_file.rpartition('_')[0]
                 for matrix_file in os.listdir(f"{MATRIX_DIRECTORY}/{feature}")]
all_distances.sort()

In [9]:
method = 'kmedoid'

# Every pass of the outer loop stores one more complete run: one clustering per
# cluster count in MIN_CLUSTERS..MAX_CLUSTERS for each selected distance.
for run in range(10):
    for distance in {'canberra'}:
        print(distance)

        # Read the matrix first: if the file is missing, the cell fails before
        # an empty output directory is created for that distance.
        df = pd.read_csv(f"{MATRIX_DIRECTORY}/{feature}/{distance}_BioGRID.txt", delimiter=' ')
        D  = df.values.astype(float)

        CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/{feature}/{distance}/{method}"
        # exist_ok replaces the racy "check, then create" pair.
        os.makedirs(CLUSTER_DIRECTORY, exist_ok=True)

        # The previous run index is looked up via the MAX_CLUSTERS files, and
        # this run writes its MAX_CLUSTERS file only in the last iteration
        # below, so one directory scan per run is enough.
        nr = get_number_of_pre_runs(CLUSTER_DIRECTORY, distance, MAX_CLUSTERS)

        t1 = time.time()
        for n_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS+1):
            # Initial medoids: n_clusters distinct random positions in D.
            initial_medoids = sample(range(len(D)), n_clusters)
            kmedoids_instance = kmedoids(D, initial_medoids, data_type='distance_matrix')
            kmedoids_instance.process()

            # One line per cluster: the column labels selected by that cluster.
            with open(f"{CLUSTER_DIRECTORY}/{nr+1}_{n_clusters}_BioGRID.txt", 'w') as f:
                for cluster in kmedoids_instance.get_clusters():
                    f.write(' '.join(df.columns[cluster]) + '\n')
            t2 = time.time()
            # '\r' makes each progress report overwrite the previous one.
            print(f'{n_clusters}: {t2-t1:.2f}sec', end='\r')
        print()

canberra
100: 535.50sec
canberra
100: 536.39sec
canberra
100: 524.62sec
canberra
100: 540.29sec
canberra
100: 524.23sec
canberra
100: 525.40sec
canberra
100: 533.61sec
canberra
100: 539.02sec
canberra
100: 529.45sec
canberra
100: 545.67sec


## GCV-orca+

In [4]:
feature = 'GCV-orca+'

# Inclusive range of cluster counts tried by the clustering cell.
MIN_CLUSTERS = 2
MAX_CLUSTERS = 100

# A distance name is a matrix file name with its last '_'-separated part removed.
all_distances = [matrix_file.rpartition('_')[0]
                 for matrix_file in os.listdir(f"{MATRIX_DIRECTORY}/{feature}")]
all_distances.sort()

In [5]:
method = 'kmedoid'

# Every pass of the outer loop stores one more complete run: one clustering per
# cluster count in MIN_CLUSTERS..MAX_CLUSTERS for each selected distance.
for run in range(10):
    for distance in {'canberra'}:
        print(distance)

        # Read the matrix first: if the file is missing, the cell fails before
        # an empty output directory is created for that distance.
        df = pd.read_csv(f"{MATRIX_DIRECTORY}/{feature}/{distance}_BioGRID.txt", delimiter=' ')
        D  = df.values.astype(float)

        CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/{feature}/{distance}/{method}"
        # exist_ok replaces the racy "check, then create" pair.
        os.makedirs(CLUSTER_DIRECTORY, exist_ok=True)

        # The previous run index is looked up via the MAX_CLUSTERS files, and
        # this run writes its MAX_CLUSTERS file only in the last iteration
        # below, so one directory scan per run is enough.
        nr = get_number_of_pre_runs(CLUSTER_DIRECTORY, distance, MAX_CLUSTERS)

        t1 = time.time()
        for n_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS+1):
            # Initial medoids: n_clusters distinct random positions in D.
            initial_medoids = sample(range(len(D)), n_clusters)
            kmedoids_instance = kmedoids(D, initial_medoids, data_type='distance_matrix')
            kmedoids_instance.process()

            # One line per cluster: the column labels selected by that cluster.
            with open(f"{CLUSTER_DIRECTORY}/{nr+1}_{n_clusters}_BioGRID.txt", 'w') as f:
                for cluster in kmedoids_instance.get_clusters():
                    f.write(' '.join(df.columns[cluster]) + '\n')
            t2 = time.time()
            # '\r' makes each progress report overwrite the previous one.
            print(f'{n_clusters}: {t2-t1:.2f}sec', end='\r')
        print()

canberra
100: 568.72sec
canberra
100: 567.64sec
canberra
100: 570.65sec
canberra
100: 552.77sec
canberra
100: 553.76sec
canberra
100: 542.94sec
canberra
100: 535.07sec
canberra
100: 552.04sec
canberra
100: 554.58sec
canberra
100: 531.31sec
