In [1]:
from pyclustering.cluster.kmedoids import kmedoids
from functools import partial
from random import sample

import os
import time
import numpy as np
import pandas as pd
import networkx as nx

In [2]:
# Root of the data tree; every other location below is derived from it.
DATA_DIRECTORY = "/media/clusterduck123/joe/data"
# All yeast-specific inputs and outputs live under this directory.
YEAST_DIRECTORY = "/".join((DATA_DIRECTORY, "processed-data", "organisms", "yeast"))
NETWORK_DIRECTORY = YEAST_DIRECTORY + "/networks"
MATRIX_DIRECTORY = YEAST_DIRECTORY + "/distance-matrices"
ANNOTATION_DIRECTORY = YEAST_DIRECTORY + "/annotations"

# Clustering

In [3]:
def get_number_of_pre_runs(CLUSTER_DIRECTORY, distance, n_clusters = 99):
    """Return the highest run index already stored for ``n_clusters`` clusters.

    Entries of ``CLUSTER_DIRECTORY`` are split on ``'_'`` and interpreted as
    ``<run>_<n_clusters>_<rest>``. Entries that do not split into exactly
    three fields, or whose first field is not an integer, are ignored
    instead of raising, so stray files (hidden files, checkpoint folders,
    notes) do not break the scan.

    Parameters
    ----------
    CLUSTER_DIRECTORY : str
        Directory whose entries are scanned.
    distance : str
        Unused; kept so existing callers keep working.
    n_clusters : int, optional
        Cluster count to look for; compared as a string against the
        second field of each file name.

    Returns
    -------
    int
        Largest run index found, or -1 when no matching entry exists.
    """
    wanted = str(n_clusters)
    pre_runs = []
    for name in os.listdir(CLUSTER_DIRECTORY):
        fields = name.split('_')
        if len(fields) != 3 or fields[1] != wanted:
            continue
        try:
            pre_runs.append(int(fields[0]))
        except ValueError:
            # First field is not a run index, so this is not a clustering file.
            continue
    return max(pre_runs, default=-1)

## GDV

In [4]:
# Inclusive range of cluster counts swept for every distance.
MIN_CLUSTERS = 2
MAX_CLUSTERS = 100

# Distance names are the part of each matrix file name before "_BioGRID.txt".
# Only such files are considered (other directory entries cannot be loaded by
# the clustering loop anyway), and the result is de-duplicated and sorted so
# the processing order does not depend on the arbitrary order of os.listdir.
all_distances = sorted({filename[:-len("_BioGRID.txt")]
                        for filename in os.listdir(f"{MATRIX_DIRECTORY}/GDV")
                        if filename.endswith("_BioGRID.txt")})

In [5]:
method = 'kmedoid'

# Repeat the full sweep over cluster counts; each repetition uses fresh random
# initial medoids and is stored under its own run index.
for run in range(49):
    for distance in all_distances:
        print(distance)

        CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/GDV/{distance}/{method}"
        # exist_ok replaces the separate exists() check (no check-then-create gap).
        os.makedirs(CLUSTER_DIRECTORY, exist_ok=True)

        # The column labels of the matrix file are what gets written out as
        # cluster members below.
        df = pd.read_csv(f"{MATRIX_DIRECTORY}/GDV/{distance}_BioGRID.txt", delimiter=' ')
        D  = df.values.astype(float)

        # Only a file for MAX_CLUSTERS marks a run as finished, and that file is
        # the last one written in the sweep below, so the run index cannot change
        # while the sweep is in progress: look it up once instead of listing the
        # directory for every cluster count. An interrupted run is therefore
        # overwritten by the next one.
        nr = get_number_of_pre_runs(CLUSTER_DIRECTORY, distance, MAX_CLUSTERS)

        t1 = time.time()
        for n_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS+1):
            # Random, distinct row indices serve as the starting medoids.
            initial_medoids = sample(range(len(D)), n_clusters)
            kmedoids_instance = kmedoids(D, initial_medoids, data_type='distance_matrix')
            kmedoids_instance.process()

            # One line per cluster, members separated by single spaces.
            with open(f"{CLUSTER_DIRECTORY}/{nr+1}_{n_clusters}_BioGRID.txt", 'w') as f:
                for cluster in kmedoids_instance.get_clusters():
                    f.write(' '.join(df.columns[cluster]) + '\n')
            t2 = time.time()
            # Elapsed time since the sweep started, overwritten in place.
            print(f'{n_clusters}: {t2-t1:.2f}sec', end='\r')
        print()

mahalanobis


KeyboardInterrupt: 

## GCV-D0

In [4]:
# Inclusive range of cluster counts swept for every distance.
MIN_CLUSTERS = 2
MAX_CLUSTERS = 100

# Distance names are the part of each matrix file name before "_BioGRID.txt".
# Only such files are considered (other directory entries cannot be loaded by
# the clustering loop anyway), and the result is de-duplicated and sorted so
# the processing order does not depend on the arbitrary order of os.listdir.
all_distances = sorted({filename[:-len("_BioGRID.txt")]
                        for filename in os.listdir(f"{MATRIX_DIRECTORY}/GCV-D0")
                        if filename.endswith("_BioGRID.txt")})

In [None]:
method = 'kmedoid'

# Repeat the full sweep over cluster counts; each repetition uses fresh random
# initial medoids and is stored under its own run index.
for run in range(30):
    for distance in all_distances:
        print(distance)

        CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/GCV-D0/{distance}/{method}"
        # exist_ok replaces the separate exists() check (no check-then-create gap).
        os.makedirs(CLUSTER_DIRECTORY, exist_ok=True)

        # The column labels of the matrix file are what gets written out as
        # cluster members below.
        df = pd.read_csv(f"{MATRIX_DIRECTORY}/GCV-D0/{distance}_BioGRID.txt", delimiter=' ')
        D  = df.values.astype(float)

        # Only a file for MAX_CLUSTERS marks a run as finished, and that file is
        # the last one written in the sweep below, so the run index cannot change
        # while the sweep is in progress: look it up once instead of listing the
        # directory for every cluster count. An interrupted run is therefore
        # overwritten by the next one.
        nr = get_number_of_pre_runs(CLUSTER_DIRECTORY, distance, MAX_CLUSTERS)

        t1 = time.time()
        for n_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS+1):
            # Random, distinct row indices serve as the starting medoids.
            initial_medoids = sample(range(len(D)), n_clusters)
            kmedoids_instance = kmedoids(D, initial_medoids, data_type='distance_matrix')
            kmedoids_instance.process()

            # One line per cluster, members separated by single spaces.
            with open(f"{CLUSTER_DIRECTORY}/{nr+1}_{n_clusters}_BioGRID.txt", 'w') as f:
                for cluster in kmedoids_instance.get_clusters():
                    f.write(' '.join(df.columns[cluster]) + '\n')
            t2 = time.time()
            # Elapsed time since the sweep started, overwritten in place.
            print(f'{n_clusters}: {t2-t1:.2f}sec', end='\r')
        print()

sqeuclidean
47: 273.53sec

## GCV-A

In [4]:
# Inclusive range of cluster counts swept for every distance.
MIN_CLUSTERS = 2
MAX_CLUSTERS = 100

# Distance names are the part of each matrix file name before "_BioGRID.txt".
# Only such files are considered, and the result is de-duplicated and sorted so
# it does not depend on the arbitrary order of os.listdir.
all_distances = sorted({filename[:-len("_BioGRID.txt")]
                        for filename in os.listdir(f"{MATRIX_DIRECTORY}/GCV-A")
                        if filename.endswith("_BioGRID.txt")})

In [5]:
method = 'kmedoid'

# Repeat the full sweep over cluster counts; each repetition uses fresh random
# initial medoids and is stored under its own run index.
for run in range(40):
    # Explicit subset of distances for this feature set.
    for distance in ['normalized1-linf',
                     'normalized1-l2',
                     'normalized1-l1']:
        print(distance)

        CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/GCV-A/{distance}/{method}"
        # exist_ok replaces the separate exists() check (no check-then-create gap).
        os.makedirs(CLUSTER_DIRECTORY, exist_ok=True)

        # The column labels of the matrix file are what gets written out as
        # cluster members below.
        df = pd.read_csv(f"{MATRIX_DIRECTORY}/GCV-A/{distance}_BioGRID.txt", delimiter=' ')
        D  = df.values.astype(float)

        # Only a file for MAX_CLUSTERS marks a run as finished, and that file is
        # the last one written in the sweep below, so the run index cannot change
        # while the sweep is in progress: look it up once instead of listing the
        # directory for every cluster count. An interrupted run is therefore
        # overwritten by the next one.
        nr = get_number_of_pre_runs(CLUSTER_DIRECTORY, distance, MAX_CLUSTERS)

        t1 = time.time()
        for n_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS+1):
            # Random, distinct row indices serve as the starting medoids.
            initial_medoids = sample(range(len(D)), n_clusters)
            kmedoids_instance = kmedoids(D, initial_medoids, data_type='distance_matrix')
            kmedoids_instance.process()

            # One line per cluster, members separated by single spaces.
            with open(f"{CLUSTER_DIRECTORY}/{nr+1}_{n_clusters}_BioGRID.txt", 'w') as f:
                for cluster in kmedoids_instance.get_clusters():
                    f.write(' '.join(df.columns[cluster]) + '\n')
            t2 = time.time()
            # Elapsed time since the sweep started, overwritten in place.
            print(f'{n_clusters}: {t2-t1:.2f}sec', end='\r')
        print()

normalized1-linf
100: 567.56sec
normalized1-l2
100: 553.66sec
normalized1-l1
100: 568.33sec
normalized1-linf
100: 566.88sec
normalized1-l2
100: 543.78sec
normalized1-l1
100: 542.83sec
normalized1-linf
100: 562.03sec
normalized1-l2
100: 561.84sec
normalized1-l1
100: 565.90sec
normalized1-linf
100: 585.38sec
normalized1-l2
100: 553.77sec
normalized1-l1
100: 569.14sec
normalized1-linf
100: 544.02sec
normalized1-l2
100: 557.46sec
normalized1-l1
100: 550.29sec
normalized1-linf
100: 561.53sec
normalized1-l2
100: 546.45sec
normalized1-l1
100: 565.12sec
normalized1-linf
100: 555.60sec
normalized1-l2
100: 548.69sec
normalized1-l1
100: 579.96sec
normalized1-linf
100: 568.23sec
normalized1-l2
100: 558.88sec
normalized1-l1
100: 575.31sec
normalized1-linf
100: 552.40sec
normalized1-l2
100: 550.22sec
normalized1-l1
100: 570.18sec
normalized1-linf
100: 589.95sec
normalized1-l2
100: 594.89sec
normalized1-l1
100: 600.24sec
normalized1-linf
100: 591.80sec
normalized1-l2
100: 576.68sec
normalized1-l1
100

## GCV-G

In [None]:
# Inclusive range of cluster counts swept for every distance.
MIN_CLUSTERS = 2
MAX_CLUSTERS = 100

# Distance names are the part of each matrix file name before "_BioGRID.txt".
# Only such files are considered, and the result is de-duplicated and sorted so
# it does not depend on the arbitrary order of os.listdir.
all_distances = sorted({filename[:-len("_BioGRID.txt")]
                        for filename in os.listdir(f"{MATRIX_DIRECTORY}/GCV-G")
                        if filename.endswith("_BioGRID.txt")})

In [None]:
method = 'kmedoid'

# Repeat the full sweep over cluster counts; each repetition uses fresh random
# initial medoids and is stored under its own run index.
for run in range(10):
    # Explicit subset of distances for this feature set.
    for distance in ['normalized1-linf',
                     'normalized1-l2',
                     'normalized1-l1']:
        print(distance)

        CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/GCV-G/{distance}/{method}"
        # exist_ok replaces the separate exists() check (no check-then-create gap).
        os.makedirs(CLUSTER_DIRECTORY, exist_ok=True)

        # The column labels of the matrix file are what gets written out as
        # cluster members below.
        df = pd.read_csv(f"{MATRIX_DIRECTORY}/GCV-G/{distance}_BioGRID.txt", delimiter=' ')
        D  = df.values.astype(float)

        # Only a file for MAX_CLUSTERS marks a run as finished, and that file is
        # the last one written in the sweep below, so the run index cannot change
        # while the sweep is in progress: look it up once instead of listing the
        # directory for every cluster count. An interrupted run is therefore
        # overwritten by the next one.
        nr = get_number_of_pre_runs(CLUSTER_DIRECTORY, distance, MAX_CLUSTERS)

        t1 = time.time()
        for n_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS+1):
            # Random, distinct row indices serve as the starting medoids.
            initial_medoids = sample(range(len(D)), n_clusters)
            kmedoids_instance = kmedoids(D, initial_medoids, data_type='distance_matrix')
            kmedoids_instance.process()

            # One line per cluster, members separated by single spaces.
            with open(f"{CLUSTER_DIRECTORY}/{nr+1}_{n_clusters}_BioGRID.txt", 'w') as f:
                for cluster in kmedoids_instance.get_clusters():
                    f.write(' '.join(df.columns[cluster]) + '\n')
            t2 = time.time()
            # Elapsed time since the sweep started, overwritten in place.
            print(f'{n_clusters}: {t2-t1:.2f}sec', end='\r')
        print()

## GCV-AD

In [5]:
# Inclusive range of cluster counts swept for every distance.
MIN_CLUSTERS = 2
MAX_CLUSTERS = 100

# Distance names are the part of each matrix file name before "_BioGRID.txt".
# Only such files are considered, and the result is de-duplicated and sorted so
# it does not depend on the arbitrary order of os.listdir.
all_distances = sorted({filename[:-len("_BioGRID.txt")]
                        for filename in os.listdir(f"{MATRIX_DIRECTORY}/GCV-AD")
                        if filename.endswith("_BioGRID.txt")})

In [6]:
method = 'kmedoid'

# Repeat the full sweep over cluster counts; each repetition uses fresh random
# initial medoids and is stored under its own run index.
for run in range(50):
    # Explicit subset of distances for this feature set.
    for distance in ['normalized1-linf',
                     'normalized1-l2',
                     'normalized1-l1']:
        print(distance)

        CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/GCV-AD/{distance}/{method}"
        # exist_ok replaces the separate exists() check (no check-then-create gap).
        os.makedirs(CLUSTER_DIRECTORY, exist_ok=True)

        # The column labels of the matrix file are what gets written out as
        # cluster members below.
        df = pd.read_csv(f"{MATRIX_DIRECTORY}/GCV-AD/{distance}_BioGRID.txt", delimiter=' ')
        D  = df.values.astype(float)

        # Only a file for MAX_CLUSTERS marks a run as finished, and that file is
        # the last one written in the sweep below, so the run index cannot change
        # while the sweep is in progress: look it up once instead of listing the
        # directory for every cluster count. An interrupted run is therefore
        # overwritten by the next one.
        nr = get_number_of_pre_runs(CLUSTER_DIRECTORY, distance, MAX_CLUSTERS)

        t1 = time.time()
        for n_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS+1):
            # Random, distinct row indices serve as the starting medoids.
            initial_medoids = sample(range(len(D)), n_clusters)
            kmedoids_instance = kmedoids(D, initial_medoids, data_type='distance_matrix')
            kmedoids_instance.process()

            # One line per cluster, members separated by single spaces.
            with open(f"{CLUSTER_DIRECTORY}/{nr+1}_{n_clusters}_BioGRID.txt", 'w') as f:
                for cluster in kmedoids_instance.get_clusters():
                    f.write(' '.join(df.columns[cluster]) + '\n')
            t2 = time.time()
            # Elapsed time since the sweep started, overwritten in place.
            print(f'{n_clusters}: {t2-t1:.2f}sec', end='\r')
        print()

normalized1-linf
100: 552.05sec
normalized1-l2
100: 582.32sec
normalized1-l1
100: 591.60sec
normalized1-linf
100: 592.12sec
normalized1-l2
100: 615.60sec
normalized1-l1
100: 606.99sec
normalized1-linf
100: 596.02sec
normalized1-l2
100: 609.16sec
normalized1-l1
100: 601.76sec
normalized1-linf
100: 588.87sec
normalized1-l2
100: 603.07sec
normalized1-l1
100: 581.43sec
normalized1-linf
100: 561.65sec
normalized1-l2
100: 579.20sec
normalized1-l1
100: 568.50sec
normalized1-linf
100: 557.98sec
normalized1-l2
100: 570.04sec
normalized1-l1
100: 566.63sec
normalized1-linf
100: 557.91sec
normalized1-l2
100: 567.28sec
normalized1-l1
100: 564.90sec
normalized1-linf
100: 561.06sec
normalized1-l2
100: 569.79sec
normalized1-l1
100: 566.39sec
normalized1-linf
100: 557.00sec
normalized1-l2
100: 571.23sec
normalized1-l1
100: 571.42sec
normalized1-linf
100: 562.81sec
normalized1-l2
100: 569.62sec
normalized1-l1
100: 568.80sec
normalized1-linf
100: 557.26sec
normalized1-l2
100: 568.03sec
normalized1-l1
100

KeyboardInterrupt: 

## GCV-DG

In [None]:
# Inclusive range of cluster counts swept for every distance.
MIN_CLUSTERS = 2
MAX_CLUSTERS = 100

# Distance names are the part of each matrix file name before "_BioGRID.txt".
# Only such files are considered, and the result is de-duplicated and sorted so
# it does not depend on the arbitrary order of os.listdir.
all_distances = sorted({filename[:-len("_BioGRID.txt")]
                        for filename in os.listdir(f"{MATRIX_DIRECTORY}/GCV-DG")
                        if filename.endswith("_BioGRID.txt")})

In [None]:
method = 'kmedoid'

# Repeat the full sweep over cluster counts; each repetition uses fresh random
# initial medoids and is stored under its own run index.
for run in range(50):
    # Explicit subset of distances for this feature set.
    for distance in ['normalized1-linf',
                     'normalized1-l2',
                     'normalized1-l1']:
        print(distance)

        CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/GCV-DG/{distance}/{method}"
        # exist_ok replaces the separate exists() check (no check-then-create gap).
        os.makedirs(CLUSTER_DIRECTORY, exist_ok=True)

        # The column labels of the matrix file are what gets written out as
        # cluster members below.
        df = pd.read_csv(f"{MATRIX_DIRECTORY}/GCV-DG/{distance}_BioGRID.txt", delimiter=' ')
        D  = df.values.astype(float)

        # Only a file for MAX_CLUSTERS marks a run as finished, and that file is
        # the last one written in the sweep below, so the run index cannot change
        # while the sweep is in progress: look it up once instead of listing the
        # directory for every cluster count. An interrupted run is therefore
        # overwritten by the next one.
        nr = get_number_of_pre_runs(CLUSTER_DIRECTORY, distance, MAX_CLUSTERS)

        t1 = time.time()
        for n_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS+1):
            # Random, distinct row indices serve as the starting medoids.
            initial_medoids = sample(range(len(D)), n_clusters)
            kmedoids_instance = kmedoids(D, initial_medoids, data_type='distance_matrix')
            kmedoids_instance.process()

            # One line per cluster, members separated by single spaces.
            with open(f"{CLUSTER_DIRECTORY}/{nr+1}_{n_clusters}_BioGRID.txt", 'w') as f:
                for cluster in kmedoids_instance.get_clusters():
                    f.write(' '.join(df.columns[cluster]) + '\n')
            t2 = time.time()
            # Elapsed time since the sweep started, overwritten in place.
            print(f'{n_clusters}: {t2-t1:.2f}sec', end='\r')
        print()