In [1]:
from pyclustering.cluster.kmedoids import kmedoids
from functools import partial
from random import sample

import os
import time
import numpy as np
import pandas as pd
import networkx as nx

In [2]:
# Root of the data tree.  The default is the original author's checkout; set
# the SUPPLEMENTS_DATA_DIRECTORY environment variable to run the notebook on
# another machine without editing this cell.
DATA_DIRECTORY = os.environ.get(
    "SUPPLEMENTS_DATA_DIRECTORY",
    "/home/clusterduck123/Desktop/git/supplements/data",
)

# Everything below is derived from DATA_DIRECTORY.
YEAST_DIRECTORY = f"{DATA_DIRECTORY}/processed-data/organisms/yeast"
NETWORK_DIRECTORY = f"{YEAST_DIRECTORY}/networks"
MATRIX_DIRECTORY  = f"{YEAST_DIRECTORY}/distance-matrices"
ANNOTATION_DIRECTORY = f"{YEAST_DIRECTORY}/annotations"

# Clustering

In [3]:
def get_number_of_pre_runs(CLUSTER_DIRECTORY, distance, n_clusters = 99):
    """Return the highest run index already stored for ``n_clusters`` clusters.

    File names in ``CLUSTER_DIRECTORY`` are split on ``'_'`` and interpreted as
    ``<run>_<n_clusters>_<rest>`` (e.g. ``3_100_BioGRID.txt``).  Entries that do
    not have exactly three parts, or whose run part is not an integer, are
    ignored instead of aborting the whole scan (stray files such as
    ``.DS_Store`` used to raise ``ValueError`` here).

    Parameters
    ----------
    CLUSTER_DIRECTORY : str
        Directory whose entries are inspected (not recursive).
    distance : str
        Unused; kept only so existing call sites keep working.
    n_clusters : int, default 99
        Only files whose middle part equals ``str(n_clusters)`` are counted.

    Returns
    -------
    int
        The largest run index found, or ``-1`` when no file matches.

    Raises
    ------
    OSError
        If ``CLUSTER_DIRECTORY`` cannot be listed (e.g. it does not exist).
    """
    wanted = str(n_clusters)
    pre_runs = []
    for name in os.listdir(CLUSTER_DIRECTORY):
        parts = name.split('_')
        # Anything that is not "<run>_<n_clusters>_<rest>" is not a clustering file.
        if len(parts) != 3 or parts[1] != wanted:
            continue
        try:
            pre_runs.append(int(parts[0]))
        except ValueError:
            # Non-numeric run prefix: not produced by this notebook, skip it.
            continue
    return max(pre_runs, default=-1)

## GDV

In [4]:
# Inclusive range of cluster counts produced for every distance.
MIN_CLUSTERS = 2
MAX_CLUSTERS = 100

# The clustering cell reads "<distance>_BioGRID.txt" from this directory, so
# the distance name is everything before that suffix.  Cutting at the *last*
# underscore (rather than the first) keeps names that contain underscores
# intact; unrelated directory entries are skipped, and sorting makes the
# processing order independent of os.listdir's arbitrary order.
all_distances = sorted(
    filename.rsplit('_', 1)[0]
    for filename in os.listdir(f"{MATRIX_DIRECTORY}/GDV")
    if filename.endswith('_BioGRID.txt')
)

In [5]:
method = 'kmedoid'

# Every pass of the outer loop adds one more complete set of k-medoids
# clusterings (one file per cluster count) for each distance.  The loop
# variable only counts repetitions; the run index used in the file names is
# derived from what is already on disk.  Initial medoids are drawn without a
# fixed seed, so every pass yields a different random restart.
for run in range(49):
    for distance in all_distances:
        print(distance)

        CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/GDV/{distance}/{method}"
        # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
        os.makedirs(CLUSTER_DIRECTORY, exist_ok=True)

        # The header row supplies the column labels that are written to the
        # cluster files below; the remaining rows are used as the distance matrix.
        df = pd.read_csv(f"{MATRIX_DIRECTORY}/GDV/{distance}_BioGRID.txt", delimiter=' ')
        D  = df.values.astype(float)

        # get_number_of_pre_runs only counts runs that already have a
        # MAX_CLUSTERS file, and that file is written last, so the value cannot
        # change while this run is in progress: one directory scan per run is
        # enough.  A run that was interrupted earlier has no MAX_CLUSTERS file,
        # so its index is reused and its partial files are overwritten.
        nr = get_number_of_pre_runs(CLUSTER_DIRECTORY, distance, MAX_CLUSTERS)

        t1 = time.time()
        for n_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS+1):
            initial_medoids = sample(range(len(D)), n_clusters)
            kmedoids_instance = kmedoids(D, initial_medoids, data_type='distance_matrix')
            kmedoids_instance.process()

            # One line per cluster: the space-separated labels of its members.
            with open(f"{CLUSTER_DIRECTORY}/{nr+1}_{n_clusters}_BioGRID.txt", 'w') as f:
                for cluster in kmedoids_instance.get_clusters():
                    f.write(' '.join(df.columns[cluster]) + '\n')
            t2 = time.time()
            # Cumulative time for this distance, rewritten in place.
            print(f'{n_clusters}: {t2-t1:.2f}sec', end='\r')
        print()

mahalanobis


KeyboardInterrupt: 

## GCV-A

In [None]:
# Inclusive range of cluster counts produced for every distance.
MIN_CLUSTERS = 2
MAX_CLUSTERS = 100

# The clustering cell reads "<distance>_BioGRID.txt" from this directory, and
# the distance names themselves contain underscores (e.g. "all1_normalized1-l2").
# Cutting at the first underscore therefore truncated them to "all1"/"all2";
# cut at the last one instead, skip unrelated directory entries, and sort so
# the order does not depend on os.listdir.
all_distances = sorted(
    filename.rsplit('_', 1)[0]
    for filename in os.listdir(f"{MATRIX_DIRECTORY}/GCV-A")
    if filename.endswith('_BioGRID.txt')
)

In [None]:
method = 'kmedoid'

# Every pass of the outer loop adds one more complete set of k-medoids
# clusterings (one file per cluster count) for each listed distance.  The loop
# variable only counts repetitions; the run index used in the file names is
# derived from what is already on disk.  Initial medoids are drawn without a
# fixed seed, so every pass yields a different random restart.
for run in range(40):
    # Explicit list of distances to process (all_distances is not used here).
    for distance in ['all1_normalized1-linf',
                     'all1_normalized1-l2', 'all2_normalized1-l2',
                     'all1_normalized1-l1', 'all2_normalized1-l1']:
        print(distance)

        CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/GCV-A/{distance}/{method}"
        # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
        os.makedirs(CLUSTER_DIRECTORY, exist_ok=True)

        # The header row supplies the column labels that are written to the
        # cluster files below; the remaining rows are used as the distance matrix.
        df = pd.read_csv(f"{MATRIX_DIRECTORY}/GCV-A/{distance}_BioGRID.txt", delimiter=' ')
        D  = df.values.astype(float)

        # get_number_of_pre_runs only counts runs that already have a
        # MAX_CLUSTERS file, and that file is written last, so the value cannot
        # change while this run is in progress: one directory scan per run is
        # enough.  A run that was interrupted earlier has no MAX_CLUSTERS file,
        # so its index is reused and its partial files are overwritten.
        nr = get_number_of_pre_runs(CLUSTER_DIRECTORY, distance, MAX_CLUSTERS)

        t1 = time.time()
        for n_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS+1):
            initial_medoids = sample(range(len(D)), n_clusters)
            kmedoids_instance = kmedoids(D, initial_medoids, data_type='distance_matrix')
            kmedoids_instance.process()

            # One line per cluster: the space-separated labels of its members.
            with open(f"{CLUSTER_DIRECTORY}/{nr+1}_{n_clusters}_BioGRID.txt", 'w') as f:
                for cluster in kmedoids_instance.get_clusters():
                    f.write(' '.join(df.columns[cluster]) + '\n')
            t2 = time.time()
            # Cumulative time for this distance, rewritten in place.
            print(f'{n_clusters}: {t2-t1:.2f}sec', end='\r')
        print()

## GCV-G

In [4]:
# Inclusive range of cluster counts produced for every distance.
MIN_CLUSTERS = 2
MAX_CLUSTERS = 100

# The clustering cell reads "<distance>_BioGRID.txt" from this directory, and
# the distance names themselves contain underscores (e.g. "all1_normalized1-l2").
# Cutting at the first underscore therefore truncated them to "all1"/"all2";
# cut at the last one instead, skip unrelated directory entries, and sort so
# the order does not depend on os.listdir.
all_distances = sorted(
    filename.rsplit('_', 1)[0]
    for filename in os.listdir(f"{MATRIX_DIRECTORY}/GCV-G")
    if filename.endswith('_BioGRID.txt')
)

In [5]:
method = 'kmedoid'

# Every pass of the outer loop adds one more complete set of k-medoids
# clusterings (one file per cluster count) for each listed distance.  The loop
# variable only counts repetitions; the run index used in the file names is
# derived from what is already on disk.  Initial medoids are drawn without a
# fixed seed, so every pass yields a different random restart.
for run in range(50):
    # Explicit list of distances to process (all_distances is not used here).
    for distance in ['all1_normalized1-linf',
                     'all1_normalized1-l2', 'all2_normalized1-l2',
                     'all1_normalized1-l1', 'all2_normalized1-l1']:
        print(distance)

        CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/GCV-G/{distance}/{method}"
        # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
        os.makedirs(CLUSTER_DIRECTORY, exist_ok=True)

        # The header row supplies the column labels that are written to the
        # cluster files below; the remaining rows are used as the distance matrix.
        df = pd.read_csv(f"{MATRIX_DIRECTORY}/GCV-G/{distance}_BioGRID.txt", delimiter=' ')
        D  = df.values.astype(float)

        # get_number_of_pre_runs only counts runs that already have a
        # MAX_CLUSTERS file, and that file is written last, so the value cannot
        # change while this run is in progress: one directory scan per run is
        # enough.  A run that was interrupted earlier has no MAX_CLUSTERS file,
        # so its index is reused and its partial files are overwritten.
        nr = get_number_of_pre_runs(CLUSTER_DIRECTORY, distance, MAX_CLUSTERS)

        t1 = time.time()
        for n_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS+1):
            initial_medoids = sample(range(len(D)), n_clusters)
            kmedoids_instance = kmedoids(D, initial_medoids, data_type='distance_matrix')
            kmedoids_instance.process()

            # One line per cluster: the space-separated labels of its members.
            with open(f"{CLUSTER_DIRECTORY}/{nr+1}_{n_clusters}_BioGRID.txt", 'w') as f:
                for cluster in kmedoids_instance.get_clusters():
                    f.write(' '.join(df.columns[cluster]) + '\n')
            t2 = time.time()
            # Cumulative time for this distance, rewritten in place.
            print(f'{n_clusters}: {t2-t1:.2f}sec', end='\r')
        print()

all1_normalized1-linf
100: 602.35sec
all1_normalized1-l2
100: 580.59sec
all2_normalized1-l2
100: 575.04sec
all1_normalized1-l1
100: 568.49sec
all2_normalized1-l1
100: 564.82sec
all1_normalized1-linf
100: 565.43sec
all1_normalized1-l2
100: 577.16sec
all2_normalized1-l2
100: 567.29sec
all1_normalized1-l1
100: 572.05sec
all2_normalized1-l1
100: 570.58sec
all1_normalized1-linf
100: 559.27sec
all1_normalized1-l2
100: 600.90sec
all2_normalized1-l2
100: 567.37sec
all1_normalized1-l1
100: 560.18sec
all2_normalized1-l1
100: 579.34sec
all1_normalized1-linf
100: 575.23sec
all1_normalized1-l2
100: 563.00sec
all2_normalized1-l2
100: 581.14sec
all1_normalized1-l1
100: 575.97sec
all2_normalized1-l1
100: 562.78sec
all1_normalized1-linf
100: 575.99sec
all1_normalized1-l2
100: 559.14sec
all2_normalized1-l2
100: 564.16sec
all1_normalized1-l1
100: 561.74sec
all2_normalized1-l1
100: 574.72sec
all1_normalized1-linf
100: 568.96sec
all1_normalized1-l2
100: 577.88sec
all2_normalized1-l2
100: 573.14sec
all1_nor

OSError: [Errno 28] No space left on device

## GCV-AD

In [None]:
# Inclusive range of cluster counts produced for every distance.
MIN_CLUSTERS = 2
MAX_CLUSTERS = 100

# The clustering cell reads "<distance>_BioGRID.txt" from this directory, and
# the distance names themselves contain underscores (e.g. "all1_normalized1-l2").
# Cutting at the first underscore therefore truncated them to "all1"/"all2";
# cut at the last one instead, skip unrelated directory entries, and sort so
# the order does not depend on os.listdir.
all_distances = sorted(
    filename.rsplit('_', 1)[0]
    for filename in os.listdir(f"{MATRIX_DIRECTORY}/GCV-AD")
    if filename.endswith('_BioGRID.txt')
)

In [None]:
method = 'kmedoid'

# Every pass of the outer loop adds one more complete set of k-medoids
# clusterings (one file per cluster count) for each listed distance.  The loop
# variable only counts repetitions; the run index used in the file names is
# derived from what is already on disk.  Initial medoids are drawn without a
# fixed seed, so every pass yields a different random restart.
for run in range(50):
    # Explicit list of distances to process (all_distances is not used here).
    for distance in ['all1_normalized1-linf',
                     'all1_normalized1-l2', 'all2_normalized1-l2',
                     'all1_normalized1-l1', 'all2_normalized1-l1']:
        print(distance)

        CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/GCV-AD/{distance}/{method}"
        # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
        os.makedirs(CLUSTER_DIRECTORY, exist_ok=True)

        # The header row supplies the column labels that are written to the
        # cluster files below; the remaining rows are used as the distance matrix.
        df = pd.read_csv(f"{MATRIX_DIRECTORY}/GCV-AD/{distance}_BioGRID.txt", delimiter=' ')
        D  = df.values.astype(float)

        # get_number_of_pre_runs only counts runs that already have a
        # MAX_CLUSTERS file, and that file is written last, so the value cannot
        # change while this run is in progress: one directory scan per run is
        # enough.  A run that was interrupted earlier has no MAX_CLUSTERS file,
        # so its index is reused and its partial files are overwritten.
        nr = get_number_of_pre_runs(CLUSTER_DIRECTORY, distance, MAX_CLUSTERS)

        t1 = time.time()
        for n_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS+1):
            initial_medoids = sample(range(len(D)), n_clusters)
            kmedoids_instance = kmedoids(D, initial_medoids, data_type='distance_matrix')
            kmedoids_instance.process()

            # One line per cluster: the space-separated labels of its members.
            with open(f"{CLUSTER_DIRECTORY}/{nr+1}_{n_clusters}_BioGRID.txt", 'w') as f:
                for cluster in kmedoids_instance.get_clusters():
                    f.write(' '.join(df.columns[cluster]) + '\n')
            t2 = time.time()
            # Cumulative time for this distance, rewritten in place.
            print(f'{n_clusters}: {t2-t1:.2f}sec', end='\r')
        print()

## GCV-DG

In [None]:
# Inclusive range of cluster counts produced for every distance.
MIN_CLUSTERS = 2
MAX_CLUSTERS = 100

# The clustering cell reads "<distance>_BioGRID.txt" from this directory, and
# the distance names themselves contain underscores (e.g. "all1_normalized1-l2").
# Cutting at the first underscore therefore truncated them to "all1"/"all2";
# cut at the last one instead, skip unrelated directory entries, and sort so
# the order does not depend on os.listdir.
all_distances = sorted(
    filename.rsplit('_', 1)[0]
    for filename in os.listdir(f"{MATRIX_DIRECTORY}/GCV-DG")
    if filename.endswith('_BioGRID.txt')
)

In [None]:
method = 'kmedoid'

# Every pass of the outer loop adds one more complete set of k-medoids
# clusterings (one file per cluster count) for each listed distance.  The loop
# variable only counts repetitions; the run index used in the file names is
# derived from what is already on disk.  Initial medoids are drawn without a
# fixed seed, so every pass yields a different random restart.
for run in range(50):
    # Explicit list of distances to process (all_distances is not used here).
    for distance in ['all1_normalized1-linf',
                     'all1_normalized1-l2', 'all2_normalized1-l2',
                     'all1_normalized1-l1', 'all2_normalized1-l1']:
        print(distance)

        CLUSTER_DIRECTORY = f"{YEAST_DIRECTORY}/clusterings/GCV-DG/{distance}/{method}"
        # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
        os.makedirs(CLUSTER_DIRECTORY, exist_ok=True)

        # The header row supplies the column labels that are written to the
        # cluster files below; the remaining rows are used as the distance matrix.
        df = pd.read_csv(f"{MATRIX_DIRECTORY}/GCV-DG/{distance}_BioGRID.txt", delimiter=' ')
        D  = df.values.astype(float)

        # get_number_of_pre_runs only counts runs that already have a
        # MAX_CLUSTERS file, and that file is written last, so the value cannot
        # change while this run is in progress: one directory scan per run is
        # enough.  A run that was interrupted earlier has no MAX_CLUSTERS file,
        # so its index is reused and its partial files are overwritten.
        nr = get_number_of_pre_runs(CLUSTER_DIRECTORY, distance, MAX_CLUSTERS)

        t1 = time.time()
        for n_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS+1):
            initial_medoids = sample(range(len(D)), n_clusters)
            kmedoids_instance = kmedoids(D, initial_medoids, data_type='distance_matrix')
            kmedoids_instance.process()

            # One line per cluster: the space-separated labels of its members.
            with open(f"{CLUSTER_DIRECTORY}/{nr+1}_{n_clusters}_BioGRID.txt", 'w') as f:
                for cluster in kmedoids_instance.get_clusters():
                    f.write(' '.join(df.columns[cluster]) + '\n')
            t2 = time.time()
            # Cumulative time for this distance, rewritten in place.
            print(f'{n_clusters}: {t2-t1:.2f}sec', end='\r')
        print()

## gGCV

In [6]:
# Cluster-count bounds for the gGCV section.
MIN_CLUSTERS, MAX_CLUSTERS = 2, 100

# Distances to cluster for gGCV (a single, explicitly named one).
all_distances = ['normalizedlinf']

In [8]:
# Automated k-medoids clustering on the gGCV distance matrices: 49 repeated
# passes over all_distances, writing one cluster file per cluster count.
# NOTE(review): CLUSTERS_DIRECTORY and get_number_of_max_runs are not defined
# anywhere in the visible part of this notebook (the other sections use
# CLUSTER_DIRECTORY and get_number_of_pre_runs) -- presumably they come from
# earlier kernel state; confirm before running on a fresh kernel.
# NOTE(review): the matrix path and the output file naming used here differ
# from the other sections of this notebook.
for run in range(49):
    for distance in all_distances:
        print(distance)

        # Create the output directory for this distance on first use.
        if not os.path.exists(f"{CLUSTERS_DIRECTORY}/gGCV/{distance}"):
            os.makedirs(f"{CLUSTERS_DIRECTORY}/gGCV/{distance}")

        MATRIX_NAME = f"sc_BioGRID_gGCV_{distance}"
        # The first line of the matrix file is read separately as a header ...
        with open(f"{MATRIX_DIRECTORY}/{MATRIX_NAME}.txt", 'r') as f:
            line = f.readline()
        # ... and skipped when the numeric rows are loaded.
        D = np.loadtxt(f"{MATRIX_DIRECTORY}/{MATRIX_NAME}.txt", skiprows=1)

        # Position -> header token; used to turn cluster member indices into
        # names when writing the result files (presumably gene identifiers).
        int2gene = dict(enumerate(line.split()))
        
        t1 = time.time()
        # Cluster counts 2..99 (hard-coded; MIN_CLUSTERS/MAX_CLUSTERS are not used for the range).
        for n_clusters in range(2, 100):
            # CAREFUL: the run index is looked up with MAX_CLUSTERS-1, which equals
            # the largest cluster count produced by this loop (99) only while
            # MAX_CLUSTERS is 100.  NOTE(review): semantics of
            # get_number_of_max_runs are not visible here -- confirm.
            nr = get_number_of_max_runs('gGCV', distance, MAX_CLUSTERS-1) # CAREFUL !!!!!
            initial_medoids = sample(range(len(D)), n_clusters)
            kmedoids_instance = kmedoids(D, initial_medoids, data_type='distance_matrix')
            kmedoids_instance.process()

            # One line per cluster: the space-separated names of its members.
            with open(f"{CLUSTERS_DIRECTORY}/gGCV/{distance}/{nr+1}_sc_BioGRID_{n_clusters}.txt", 'w') as f:
                for cluster in kmedoids_instance.get_clusters():
                    f.write(' '.join(map(int2gene.get,cluster)) + '\n')
            t2 = time.time()
            # Cumulative time for this distance, rewritten in place.
            print(f'{n_clusters}: {t2-t1:.2f}sec', end='\r')
        print()

normalizedlinf
99: 569.67sec
normalizedlinf
99: 555.84sec
normalizedlinf
99: 554.14sec
normalizedlinf
99: 555.52sec
normalizedlinf
99: 554.21sec
normalizedlinf
99: 555.25sec
normalizedlinf
99: 554.59sec
normalizedlinf
99: 554.19sec
normalizedlinf
99: 554.66sec
normalizedlinf
99: 555.05sec
normalizedlinf
99: 554.81sec
normalizedlinf
99: 554.58sec
normalizedlinf
99: 554.33sec
normalizedlinf
99: 554.85sec
normalizedlinf
99: 555.31sec
normalizedlinf
99: 554.66sec
normalizedlinf
99: 554.98sec
normalizedlinf
99: 554.77sec
normalizedlinf
99: 555.27sec
normalizedlinf
99: 555.33sec
normalizedlinf
99: 554.48sec
normalizedlinf
99: 555.08sec
normalizedlinf
99: 555.41sec
normalizedlinf
99: 554.54sec
normalizedlinf
99: 554.56sec
normalizedlinf
99: 556.31sec
normalizedlinf
99: 554.94sec
normalizedlinf
99: 555.55sec
normalizedlinf
99: 554.78sec
normalizedlinf
99: 555.50sec
normalizedlinf
99: 555.09sec
normalizedlinf
99: 554.69sec
normalizedlinf
99: 554.88sec
normalizedlinf
99: 555.27sec
normalizedlinf