In [1]:
from pyclustering.cluster.kmedoids import kmedoids
from functools import partial
from random import sample

import os
import time
import numpy as np
import pandas as pd
import networkx as nx

In [2]:
# Root of the data tree. Set the SUPPLEMENTS_DATA_DIR environment variable to
# point the notebook at another checkout; when it is unset the original
# machine-specific location is used, so existing behaviour is unchanged.
DATA_DIRECTORY = os.environ.get(
    "SUPPLEMENTS_DATA_DIR",
    "/home/clusterduck123/Desktop/git/supplements/data",
)

# Every other location is derived from DATA_DIRECTORY.
HUMAN_DIRECTORY = f"{DATA_DIRECTORY}/processed-data/organisms/human"
NETWORK_DIRECTORY = f"{HUMAN_DIRECTORY}/networks"
MATRIX_DIRECTORY  = f"{HUMAN_DIRECTORY}/distance-matrices"
ANNOTATION_DIRECTORY = f"{HUMAN_DIRECTORY}/annotations"

# Clustering

In [3]:
def get_number_of_pre_runs(GV, distance, n_clusters = 99, cluster_directory = None):
    """Return the highest run index already stored for ``n_clusters``, or -1.

    The directory is scanned for entries named ``<run>_<n_clusters>_<suffix>``
    (exactly three '_'-separated parts). Entries that do not follow this
    pattern, or whose ``<run>`` part is not an integer, are ignored instead of
    raising, so stray files (e.g. ``.ipynb_checkpoints``) do not break the scan.

    Parameters
    ----------
    GV, distance
        Accepted for call-site compatibility; not used by the lookup itself.
    n_clusters : int
        Only files whose middle part equals ``str(n_clusters)`` are considered.
    cluster_directory : str, optional
        Directory to scan. Defaults to the module-level ``CLUSTER_DIRECTORY``,
        which must have been set by the caller beforehand.

    Returns
    -------
    int
        The largest run index found, or -1 when no matching file exists.

    Raises
    ------
    OSError
        If the directory cannot be listed (e.g. it does not exist).
    """
    if cluster_directory is None:
        cluster_directory = CLUSTER_DIRECTORY

    wanted = str(n_clusters)
    pre_runs = []
    for name in os.listdir(cluster_directory):
        parts = name.split('_')
        if len(parts) != 3:
            continue  # not a clustering result file
        run, ncluster, _suffix = parts
        if ncluster != wanted:
            continue
        try:
            pre_runs.append(int(run))
        except ValueError:
            continue  # run prefix is not a number; unrelated file
    return max(pre_runs, default=-1)

## GDV

In [4]:
# Smallest and largest number of clusters for the GDV section.
MIN_CLUSTERS = 2
MAX_CLUSTERS = 100

# A distance name is the part of a matrix filename before the first '_'.
# os.listdir returns entries in arbitrary order and several files may share a
# prefix, so de-duplicate and sort to get a stable, repeatable list.
all_distances = sorted({filename.split('_')[0] for filename in os.listdir(f"{MATRIX_DIRECTORY}/GDV")})

In [5]:
method = 'kmedoid'

# Repeated k-medoids clustering on the GDV distance matrices.
# Each pass of the outer loop produces one complete "run": one clustering per
# cluster count in [MIN_CLUSTERS, MAX_CLUSTERS] for every listed distance.
# Initial medoids come from the unseeded global `random` state, so every run
# (and every notebook session) draws different starting points.
for run in range(40):
    for distance in ['mahalanobis', 'GDV-similarity', 'normalized1-l2', 'normalized1-l1', 'normalized1-linf']:
        print(distance)

        # get_number_of_pre_runs reads this module-level name, so it has to be
        # set before the lookup below.
        CLUSTER_DIRECTORY = f"{HUMAN_DIRECTORY}/clusterings/GDV/{distance}/{method}"
        os.makedirs(CLUSTER_DIRECTORY, exist_ok=True)

        df = pd.read_csv(f"{MATRIX_DIRECTORY}/GDV/{distance}_BioGRID.txt", delimiter=' ')
        D  = df.values.astype(float)

        # The run index is keyed on the file for MAX_CLUSTERS, which is written
        # last, so it cannot change during the sweep: look it up once instead
        # of rescanning the directory on every iteration.
        nr = get_number_of_pre_runs('GDV', distance, MAX_CLUSTERS)

        t1 = time.time()
        for n_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS+1):
            initial_medoids = sample(range(len(D)), n_clusters)
            kmedoids_instance = kmedoids(D, initial_medoids, data_type='distance_matrix')
            kmedoids_instance.process()

            # One cluster per line, members written as column labels of the matrix.
            with open(f"{CLUSTER_DIRECTORY}/{nr+1}_{n_clusters}_BioGRID.txt", 'w') as f:
                for cluster in kmedoids_instance.get_clusters():
                    f.write(' '.join(df.columns[cluster]) + '\n')
            t2 = time.time()
            # Cumulative time since the sweep started, overwritten in place.
            print(f'{n_clusters}: {t2-t1:.2f}sec', end='\r')
        print()

mahalanobis
100: 5305.09sec
GDV-similarity
62: 3261.87sec

OSError: [Errno 28] No space left on device

## GCV

In [None]:
# Configuration for the GCV section: cluster-count bounds and the distance
# measures iterated by the clustering loop.
MIN_CLUSTERS, MAX_CLUSTERS = 2, 100

all_distances = [
    'canberra',
]

In [None]:
# Automated
# Repeated k-medoids clustering on the GCV distance matrices.
# NOTE(review): CLUSTERS_DIRECTORY and get_number_of_max_runs are not defined in
# any cell visible in this notebook -- they must be defined before this cell
# can run on a fresh kernel. TODO confirm where they are meant to come from.
for run in range(50):
    for distance in all_distances:
        print(distance)

        os.makedirs(f"{CLUSTERS_DIRECTORY}/GCV/{distance}", exist_ok=True)

        MATRIX_NAME = f"sc_BioGRID_GCV_{distance}"
        # The first line of the matrix file holds the labels; the numeric
        # matrix follows from the second line on.
        with open(f"{MATRIX_DIRECTORY}/{MATRIX_NAME}.txt", 'r') as f:
            line = f.readline()
        D = np.genfromtxt(f"{MATRIX_DIRECTORY}/{MATRIX_NAME}.txt", skip_header=1)

        # Row/column index -> label taken from the header line.
        int2gene = dict(enumerate(line.split()))

        t1 = time.time()
        # Same sweep as before (2..99), now driven by the section's constants;
        # MAX_CLUSTERS is exclusive here.
        for n_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS):
            # The run index is looked up separately for every cluster count.
            nr = get_number_of_max_runs('GCV', distance, n_clusters)

            initial_medoids = sample(range(len(D)), n_clusters)
            kmedoids_instance = kmedoids(D, initial_medoids, data_type='distance_matrix')
            kmedoids_instance.process()

            # One cluster per line, members written as labels.
            with open(f"{CLUSTERS_DIRECTORY}/GCV/{distance}/{nr+1}_sc_BioGRID_{n_clusters}.txt", 'w') as f:
                for cluster in kmedoids_instance.get_clusters():
                    f.write(' '.join(map(int2gene.get,cluster)) + '\n')
            t2 = time.time()
            print(f'{n_clusters}: {t2-t1:.2f}sec', end='\r')
        print()

### gGCV

In [6]:
# Configuration for the gGCV section: cluster-count bounds and the distance
# measures iterated by the clustering loop.
MIN_CLUSTERS, MAX_CLUSTERS = 2, 100

all_distances = [
    'normalizedlinf',
]

In [8]:
# Automated
# Repeated k-medoids clustering on the gGCV distance matrices.
# NOTE(review): CLUSTERS_DIRECTORY and get_number_of_max_runs are not defined in
# any cell visible in this notebook -- they must be defined before this cell
# can run on a fresh kernel. TODO confirm where they are meant to come from.
for run in range(49):
    for distance in all_distances:
        print(distance)

        os.makedirs(f"{CLUSTERS_DIRECTORY}/gGCV/{distance}", exist_ok=True)

        MATRIX_NAME = f"sc_BioGRID_gGCV_{distance}"
        # The first line of the matrix file holds the labels; the numeric
        # matrix follows from the second line on.
        with open(f"{MATRIX_DIRECTORY}/{MATRIX_NAME}.txt", 'r') as f:
            line = f.readline()
        D = np.loadtxt(f"{MATRIX_DIRECTORY}/{MATRIX_NAME}.txt", skiprows=1)

        # Row/column index -> label taken from the header line.
        int2gene = dict(enumerate(line.split()))

        # CAREFUL: the run index is keyed on the LAST cluster count of the sweep
        # (MAX_CLUSTERS-1). That file is written at the very end of the inner
        # loop, so the lookup is done once per run here.
        # Presumably get_number_of_max_runs only counts files for the given
        # cluster count -- verify against its definition.
        nr = get_number_of_max_runs('gGCV', distance, MAX_CLUSTERS-1)

        t1 = time.time()
        # Same sweep as before (2..99), now driven by the section's constants;
        # MAX_CLUSTERS is exclusive here.
        for n_clusters in range(MIN_CLUSTERS, MAX_CLUSTERS):
            initial_medoids = sample(range(len(D)), n_clusters)
            kmedoids_instance = kmedoids(D, initial_medoids, data_type='distance_matrix')
            kmedoids_instance.process()

            # One cluster per line, members written as labels.
            with open(f"{CLUSTERS_DIRECTORY}/gGCV/{distance}/{nr+1}_sc_BioGRID_{n_clusters}.txt", 'w') as f:
                for cluster in kmedoids_instance.get_clusters():
                    f.write(' '.join(map(int2gene.get,cluster)) + '\n')
            t2 = time.time()
            print(f'{n_clusters}: {t2-t1:.2f}sec', end='\r')
        print()

normalizedlinf
99: 569.67sec
normalizedlinf
99: 555.84sec
normalizedlinf
99: 554.14sec
normalizedlinf
99: 555.52sec
normalizedlinf
99: 554.21sec
normalizedlinf
99: 555.25sec
normalizedlinf
99: 554.59sec
normalizedlinf
99: 554.19sec
normalizedlinf
99: 554.66sec
normalizedlinf
99: 555.05sec
normalizedlinf
99: 554.81sec
normalizedlinf
99: 554.58sec
normalizedlinf
99: 554.33sec
normalizedlinf
99: 554.85sec
normalizedlinf
99: 555.31sec
normalizedlinf
99: 554.66sec
normalizedlinf
99: 554.98sec
normalizedlinf
99: 554.77sec
normalizedlinf
99: 555.27sec
normalizedlinf
99: 555.33sec
normalizedlinf
99: 554.48sec
normalizedlinf
99: 555.08sec
normalizedlinf
99: 555.41sec
normalizedlinf
99: 554.54sec
normalizedlinf
99: 554.56sec
normalizedlinf
99: 556.31sec
normalizedlinf
99: 554.94sec
normalizedlinf
99: 555.55sec
normalizedlinf
99: 554.78sec
normalizedlinf
99: 555.50sec
normalizedlinf
99: 555.09sec
normalizedlinf
99: 554.69sec
normalizedlinf
99: 554.88sec
normalizedlinf
99: 555.27sec
normalizedlinf