In [1]:
from sklearn.metrics.pairwise import cosine_distances
from scipy.spatial.distance import squareform, pdist, cdist
from itertools import islice, combinations, product
from pyclustering.cluster.kmedoids import kmedoids
from collections import defaultdict
from sklearn.cluster import KMeans
from scipy.stats import hypergeom
from goatools import obo_parser
from scipy.linalg import eigh
from zipfile import ZipFile

import os
import time
import gzip
import graco
import shutil
import requests
import numpy as np
import pandas as pd
import seaborn as sns
import networkx as nx
import Bio.UniProt.GOA as GOA
import matplotlib.pyplot as plt

In [4]:
%matplotlib inline
sns.set()
pd.set_option("display.max_columns", 50)

DATA_DIRECTORY = "/Users/markusyoussef/Desktop/git/supplements/data"
CPP_DIRECTORY = "/Users/markusyoussef/Desktop/git/graco/graco/cpp"

In [5]:
RAW_DATA_DIRECTORY = f"{DATA_DIRECTORY}/raw_data"
PPI_DIRECTORY = f"{DATA_DIRECTORY}/PPI"
ANNOTATIONS_DIRECTORY = f"{DATA_DIRECTORY}/annotations"
MATRIX_DIRECTORY = f"{DATA_DIRECTORY}/matrix"
CLUSTERS_DIRECTORY = f"{DATA_DIRECTORY}/clusters"

if not os.path.exists(DATA_DIRECTORY):
    os.makedirs(DATA_DIRECTORY)
        
if not os.path.exists(RAW_DATA_DIRECTORY):
    os.makedirs(RAW_DATA_DIRECTORY)     
    
if not os.path.exists(PPI_DIRECTORY):
    os.makedirs(PPI_DIRECTORY)
    
if not os.path.exists(ANNOTATIONS_DIRECTORY):
    os.makedirs(ANNOTATIONS_DIRECTORY)
    
if not os.path.exists(MATRIX_DIRECTORY):
    os.makedirs(MATRIX_DIRECTORY)
    
if not os.path.exists(CLUSTERS_DIRECTORY):
    os.makedirs(CLUSTERS_DIRECTORY)

# Downloads

### BioGRID

In [6]:
if not os.path.exists(RAW_DATA_DIRECTORY):
     os.makedirs(directory)

BioGRID_FILENAME = "BIOGRID-ORGANISM-3.5.177.tab2.zip"
BioGRID_URL = "https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive/BIOGRID-3.5.177"
BioGRID_FILEPATH = f"{RAW_DATA_DIRECTORY}/{BioGRID_FILENAME}" 

In [7]:
# Download
r = requests.get(f"{BioGRID_URL}/{BioGRID_FILENAME}", allow_redirects=True)
with open(BioGRID_FILEPATH, 'wb') as f:
    f.write(r.content)

In [8]:
# Unzip Saccharomyces cerevisiae file
with ZipFile(BioGRID_FILEPATH, 'r') as z:
    BioGRID_sc_FILENAME, = [name for name in z.namelist() if 'cerevisiae' in name.lower()]
    z.extract(BioGRID_sc_FILENAME, RAW_DATA_DIRECTORY)

### SGD annotations

In [9]:
SGD_FILENAME = "sgd.gaf.gz"
SGD_URL = "http://current.geneontology.org/annotations"
SGD_FILEPATH = f"{RAW_DATA_DIRECTORY}/{SGD_FILENAME}"

In [10]:
# Download
r = requests.get(f"{SGD_URL}/{SGD_FILENAME}", allow_redirects=True)
with open(SGD_FILEPATH, 'wb') as f:
    f.write(r.content)

### GO

In [11]:
GO_FILENAME = "go-basic.obo"
GO_URL = "http://purl.obolibrary.org/obo/go"
GO_FILEPATH = f"{RAW_DATA_DIRECTORY}/{GO_FILENAME}"

In [12]:
# Download
r = requests.get(f"{GO_URL}/{GO_FILENAME}", allow_redirects=True)
with open(GO_FILEPATH, 'wb') as f:
    f.write(r.content)

# BioGRID PPI (S. cerevisiae)

## Define interactor universes

In [13]:
# load BioGRID file as dataframe
organism_FILENAME = "BIOGRID-ORGANISM-Saccharomyces_cerevisiae_S288c-3.5.177.tab2.txt"
organism_FILEPATH = f"{RAW_DATA_DIRECTORY}/{organism_FILENAME}"

# remove entrez ids and scores because of mixed datatypes 
BioGRID_df = pd.read_csv(organism_FILEPATH, delimiter='\t',
                           usecols=[index for index in range(24) if index not in {1,2,18}])

In [14]:
# BioGRID universe is the collection of every gene known to BioGRID
universe = set(BioGRID_df['BioGRID ID Interactor A']) | \
           set(BioGRID_df['BioGRID ID Interactor B'])

systematic_universe = set(BioGRID_df['Systematic Name Interactor A']) | \
                      set(BioGRID_df['Systematic Name Interactor B'])
systematic_universe.remove('-')

In [15]:
# Filter for (reliable) physical interactions
EXPERIMENTAL_SYSTEM = {'Two-hybrid', 
                       'Affinity Capture-Luminescence',
                       'Affinity Capture-MS', 
                       'Affinity Capture-RNA', 
                       'Affinity Capture-Western'}
EXPERIMENTAL_SYSTEM_TYPE = {'physical'} # redundant because of experimental evidence filtering

physical_interaction_df = BioGRID_df[BioGRID_df['Experimental System'].isin(EXPERIMENTAL_SYSTEM)]
physical_interaction_df = physical_interaction_df[
    physical_interaction_df['Experimental System Type'].isin(EXPERIMENTAL_SYSTEM_TYPE)] # just in case...


physical_universe = set(physical_interaction_df['BioGRID ID Interactor A']) | \
                            set(physical_interaction_df['BioGRID ID Interactor B'])

systematic_physical_universe = set(physical_interaction_df['Systematic Name Interactor A']) | \
                                       set(physical_interaction_df['Systematic Name Interactor B'])
systematic_physical_universe.remove('-')

## Define PPI

In [16]:
# PPI in dataframe
PPI_df = physical_interaction_df[
    physical_interaction_df["Systematic Name Interactor A"].str.startswith('Y') & \
    physical_interaction_df["Systematic Name Interactor B"].str.startswith('Y')]

PPI_universe = set(PPI_df['Systematic Name Interactor A']) | \
               set(PPI_df['Systematic Name Interactor B'])

In [17]:
# Reduce PPI to simple network
PPI_nx = nx.from_pandas_edgelist(PPI_df,'Systematic Name Interactor A', 'Systematic Name Interactor B')
PPI_nx.remove_edges_from(nx.selfloop_edges(PPI_nx))

### Summary

In [18]:
print("BioGRID universe sizes:")
print("=========================")
print(f"    -         -    : {len(universe)}")
print(f"systematic    -    : {len(systematic_universe)}")
print(f"    -      physical: {len(physical_universe)}")
print(f"systematic physical: {len(systematic_physical_universe)}")
print("-------------------------")
print(f"PPI population size: {PPI_nx.number_of_nodes()}")

BioGRID universe sizes:
    -         -    : 7172
systematic    -    : 6535
    -      physical: 6551
systematic physical: 6121
-------------------------
PPI population size: 5726


### Save

In [19]:
nx.write_edgelist(PPI_nx, f"{PPI_DIRECTORY}/BioGRID_sc.txt", data=False)

# SGD annotations (S. cerevisiae)

In [20]:
SGD_FILENAME = "sgd.gaf.gz"
SGD_FILEPATH = f"{RAW_DATA_DIRECTORY}/{SGD_FILENAME}"

# load SGD gaf-file as dataframe 
with gzip.open(SGD_FILEPATH, 'rt') as gz:
    SGD_df = pd.DataFrame(annotation for annotation in GOA.gafiterator(gz))
    
# Define column with systematic gene names. In SGD this is always the first synonym.
SGD_df['Systematic_ID']  = SGD_df.Synonym.apply(lambda list_:list_[0])

In [21]:
# Filter for proteins
lc_protein_gaf_df = SGD_df[SGD_df['Systematic_ID'].str.startswith('Y')]

# Filter through evidence code
protein_gaf_df = lc_protein_gaf_df[lc_protein_gaf_df['Evidence'].isin(['EXP', 'IDA', 'IPI', 'IMP', 'IGI', 'IEP'])]

# Split into the three GOs
protein_BP_gaf_df = protein_gaf_df[protein_gaf_df['Aspect']=='P']
protein_MF_gaf_df = protein_gaf_df[protein_gaf_df['Aspect']=='F']
protein_CC_gaf_df = protein_gaf_df[protein_gaf_df['Aspect']=='C']

In [22]:
# Get rid of all unnecesarry columns in the GAFs
high_IC_annotations_df = protein_gaf_df[['Systematic_ID', 'GO_ID']].dropna().drop_duplicates()

# Split into the three GOs
high_IC_BP_annotations_df = protein_BP_gaf_df[['Systematic_ID', 'GO_ID']].dropna().drop_duplicates()
high_IC_MF_annotations_df = protein_MF_gaf_df[['Systematic_ID', 'GO_ID']].dropna().drop_duplicates()
high_IC_CC_annotations_df = protein_CC_gaf_df[['Systematic_ID', 'GO_ID']].dropna().drop_duplicates()

### GO DAG extention

In [23]:
# Load obo and gaf files
GO_FILENAME = "go-basic.obo"
GO_FILEPATH = f"{RAW_DATA_DIRECTORY}/{GO_FILENAME}"

# Create annotations with all upstream terms in dict
go_dag = obo_parser.GODag(GO_FILEPATH)
go2parents = {go_id:{*go_dag[go_id].get_all_parents(), go_id} for go_id in go_dag.keys()}

/Users/markusyoussef/Desktop/git/supplements/data/raw_data/go-basic.obo: fmt(1.2) rel(2019-10-07) 47,285 GO Terms


In [24]:
# Create full list/dataframe of annotations
all_annotations_list = [(gene_id, go_term, go_dag[go_term].level) 
                                for _, (gene_id, go_id) in high_IC_annotations_df.iterrows()
                                    for go_term in go2parents[go_id]]
all_annotations_df = pd.DataFrame(
                            all_annotations_list,  
                            columns = ['Systematic_ID', 'GO_ID', 'Level']).drop_duplicates()

# Split into the three GOs
all_BP_annotations_list = [entry for entry in all_annotations_list 
                               if go_dag[entry[1]].namespace == "biological_process"]
all_MF_annotations_list = [entry for entry in all_annotations_list 
                               if go_dag[entry[1]].namespace == "molecular_function"]
all_CC_annotations_list = [entry for entry in all_annotations_list 
                               if go_dag[entry[1]].namespace == "cellular_component"]

all_BP_annotations_df = pd.DataFrame(
                            all_BP_annotations_list,  
                            columns = ['Systematic_ID', 'GO_ID', 'Level']).drop_duplicates()
all_MF_annotations_df = pd.DataFrame(
                            all_MF_annotations_list,  
                            columns = ['Systematic_ID', 'GO_ID', 'Level']).drop_duplicates()
all_CC_annotations_df = pd.DataFrame(
                            all_CC_annotations_list,  
                            columns = ['Systematic_ID', 'GO_ID', 'Level']).drop_duplicates()

### Summary

In [25]:
print("SGD universe sizes:")
print("============================================")
print("Biological Process : " 
    f"{len(set(map(lambda x:x[0],all_BP_annotations_list)))} genes, "
    f"{len(set(map(lambda x:x[1],all_BP_annotations_list)))} GO-IDs ")
print("Molecular Functions: " 
    f"{len(set(map(lambda x:x[0],all_MF_annotations_list)))} genes, "
    f"{len(set(map(lambda x:x[1],all_MF_annotations_list)))} GO-IDs ")
print("Cellular Components: " 
    f"{len(set(map(lambda x:x[0],all_CC_annotations_list)))} genes, "
    f"{len(set(map(lambda x:x[1],all_CC_annotations_list)))}  GO-IDs ")
print('--------------------------------------------')
print("All annotations    : " 
    f"{len(set(map(lambda x:x[0],all_annotations_list)))} genes, "
    f"{len(set(map(lambda x:x[1],all_annotations_list)))} GO-IDs ")

SGD universe sizes:
Biological Process : 4532 genes, 4662 GO-IDs 
Molecular Functions: 3564 genes, 2105 GO-IDs 
Cellular Components: 4013 genes, 904  GO-IDs 
--------------------------------------------
All annotations    : 4967 genes, 7671 GO-IDs 


### Save

In [26]:
all_annotations_df.to_csv(f"{ANNOTATIONS_DIRECTORY}/SGD_sc.csv", index=False)

all_BP_annotations_df.to_csv(f"{ANNOTATIONS_DIRECTORY}/SGD_BP_sc.csv", index=False)
all_MF_annotations_df.to_csv(f"{ANNOTATIONS_DIRECTORY}/SGD_MF_sc.csv", index=False)
all_CC_annotations_df.to_csv(f"{ANNOTATIONS_DIRECTORY}/SGD_CC_sc.csv", index=False)

# BioGRID $\cap$ SGD

In [27]:
PPI_nx = nx.read_edgelist(f"{PPI_DIRECTORY}/BioGRID_sc.txt")

all_annotations_df = pd.read_csv(f"{ANNOTATIONS_DIRECTORY}/SGD_sc.csv")

all_BP_annotations_df = pd.read_csv(f"{ANNOTATIONS_DIRECTORY}/SGD_BP_sc.csv")
all_MF_annotations_df = pd.read_csv(f"{ANNOTATIONS_DIRECTORY}/SGD_MF_sc.csv")
all_CC_annotations_df = pd.read_csv(f"{ANNOTATIONS_DIRECTORY}/SGD_CC_sc.csv")

In [28]:
PPI_annotations_df = all_annotations_df[all_annotations_df.Systematic_ID.isin(PPI_nx.nodes)]

PPI_BP_annotations_df = all_BP_annotations_df[all_BP_annotations_df.Systematic_ID.isin(PPI_nx.nodes)]
PPI_MF_annotations_df = all_MF_annotations_df[all_MF_annotations_df.Systematic_ID.isin(PPI_nx.nodes)]
PPI_CC_annotations_df = all_CC_annotations_df[all_CC_annotations_df.Systematic_ID.isin(PPI_nx.nodes)]

### Summary

In [29]:
print(r"SGD ∩ BioGRID universe sizes:")
print("============================================")
print("Biological Process : " 
    f"{len(set(PPI_BP_annotations_df.Systematic_ID))} genes, "
    f"{len(set(PPI_BP_annotations_df.Systematic_ID))} GO-IDs ")
print("Molecular Functions: " 
    f"{len(set(PPI_MF_annotations_df.Systematic_ID))} genes, "
    f"{len(set(PPI_MF_annotations_df.Systematic_ID))} GO-IDs ")
print("Cellular Components: " 
    f"{len(set(PPI_CC_annotations_df.Systematic_ID))} genes, "
    f"{len(set(PPI_CC_annotations_df.Systematic_ID))}  GO-IDs ")
print('--------------------------------------------')
print("All annotations    : " 
    f"{len(set(PPI_annotations_df.Systematic_ID))} genes, "
    f"{len(set(PPI_annotations_df.Systematic_ID))} GO-IDs ")

SGD ∩ BioGRID universe sizes:
Biological Process : 4496 genes, 4496 GO-IDs 
Molecular Functions: 3537 genes, 3537 GO-IDs 
Cellular Components: 3993 genes, 3993  GO-IDs 
--------------------------------------------
All annotations    : 4923 genes, 4923 GO-IDs 


### Save

In [30]:
PPI_annotations_df.to_csv(f"{ANNOTATIONS_DIRECTORY}/BioGRID-SGD_sc.csv", index=False)

PPI_BP_annotations_df.to_csv(f"{ANNOTATIONS_DIRECTORY}/BioGRID-SGD_BP_sc.csv", index=False)
PPI_MF_annotations_df.to_csv(f"{ANNOTATIONS_DIRECTORY}/BioGRID-SGD_MF_sc.csv", index=False)
PPI_CC_annotations_df.to_csv(f"{ANNOTATIONS_DIRECTORY}/BioGRID-SGD_CC_sc.csv", index=False)

# Distance matrices

## GraCo distances

### Individuals

In [6]:
# Get orbit-specific graphlet coefficients
PPI_nx = nx.read_edgelist(f"{PPI_DIRECTORY}/BioGRID_sc.txt")
GDV = graco.orbits(PPI_nx)
GCV = graco.coefficients(GDV)

In [7]:
C0 = np.array(GCV[['c_0-2' , 'c_0-3']])
C1 = np.array(GCV[['c_1-5' , 'c_1-8' , 'c_1-10', 'c_1-12']])
C2 = np.array(GCV[['c_2-7' , 'c_2-11', 'c_2-13']])
C3 = np.array(GCV[['c_3-11', 'c_3-13', 'c_3-14']])

In [9]:
tvd = '0'
N = PPI_nx.number_of_nodes()

t1 = time.time()
sqD = [graco.functions.tvd(eval(f'C{tvd}[{i}]'),eval(f'C{tvd}[{j}]')) 
            for (i,j) in combinations(range(N), 2)]
t2 = time.time()
print(f'{N} Nodes: 100% - {t2-t1:.2f}sec', end='\r')
    
D_tvd = squareform(sqD)
np.savetxt(f"{MATRIX_DIRECTORY}/sc_BioGRID_tvd{tvd}.txt", D_tvd, 
           fmt='%.7f', header=' '.join(PPI_nx), comments='')

5726 Nodes: 100% - 776.28sec

### Combinations

In [None]:
MATRIX_NAME0 = "sc_BioGRID_tvd0"
D0_df = pd.read_csv(f"{MATRIX_DIRECTORY}/{MATRIX_NAME0}.txt", delimiter=' ')

MATRIX_NAME1 = "sc_BioGRID_tvd1"
D1_df = pd.read_csv(f"{MATRIX_DIRECTORY}/{MATRIX_NAME1}.txt", delimiter=' ')

MATRIX_NAME2 = "sc_BioGRID_tvd2"
D2_df = pd.read_csv(f"{MATRIX_DIRECTORY}/{MATRIX_NAME2}.txt", delimiter=' ')

MATRIX_NAME3 = "sc_BioGRID_tvd3"
D3_df = pd.read_csv(f"{MATRIX_DIRECTORY}/{MATRIX_NAME3}.txt", delimiter=' ')

MATRIX_NAMEt = "sc_BioGRID_tijana"
Dt_df = pd.read_csv(f"{MATRIX_DIRECTORY}/{MATRIX_NAMEt}.txt", delimiter=' ')

D_df = (D0_df + D1_df + D2_df + D3_df)/4
int2gene = dict(enumerate(D1_df.columns))

del D0_df, D1_df, D2_df, D3_df

In [None]:
MATRIX_NAME = "sc_BioGRID_tvd0123"

t1 = time.time()
for n_clusters in range(2,100):
    initial_medoids = range(n_clusters)
    kmedoids_instance = kmedoids(np.array(D_df), initial_medoids, data_type='distance_matrix')
    kmedoids_instance.process()

    with open(f"{CLUSTERS_DIRECTORY}/{METHOD}/{MATRIX_NAME}_{n_clusters}.txt", 'w') as f:
        for cluster in kmedoids_instance.get_clusters():
            f.write(' '.join(map(int2gene.get,cluster)) + '\n')
    t2 = time.time()
    print(f'{n_clusters}: {t2-t1:.2f}sec', end='\r')

### Signature similarity

In [114]:
def tijana_distance(GDV):
    LogGDV = np.log(np.array(GDV+1))

    orbit_dependencies = np.array((1,2,2,2,3,4,3,3,4,3,4,4,4,4,3))
    weights = 1 - np.log(orbit_dependencies) / np.log(len(orbit_dependencies))
    
    sqD = [np.sum(weights*np.abs(LogGDV[i,:]-LogGDV[j,:]) / np.log(np.max([GDV[i,:],GDV[j,:]], axis=0)+2))
              for (i,j) in combinations(range(len(GDV)), 2)]

    return squareform(sqD) / np.sum(weights)

In [6]:
# Calculate signature distance matrix
PPI_nx = nx.read_edgelist(f"{PPI_DIRECTORY}/BioGRID_sc.txt")
GDV = graco.orbits(PPI_nx)
np.savetxt(f"{CPP_DIRECTORY}/matrix.in", GDV, 
           header=f"{len(GDV)} 15", fmt='%d')

In [7]:
D_tijana = np.genfromtxt(f"{CPP_DIRECTORY}/matrix.out")
np.savetxt(f"{MATRIX_DIRECTORY}/sc_BioGRID_tijana.txt", D_tijana, 
           fmt='%.7f', header=' '.join(PPI_nx), comments='')

### pdist

#### GDV

In [4]:
PPI_nx = nx.read_edgelist(f"{PPI_DIRECTORY}/BioGRID_sc.txt")
GDV = graco.orbits(PPI_nx)

In [6]:
# Automated
for distance in ['euclidean', 'cityblock', 'seuclidean', 
                 'sqeuclidean', 'cosine', 'correlation', 
                 'chebyshev', 'canberra', 'braycurtis', 
                 'mahalanobis']:
    D = squareform(pdist(GDV, distance))
    np.savetxt(f"{MATRIX_DIRECTORY}/sc_BioGRID_GDV_{distance}.txt", D, 
               fmt='%.7f', header=' '.join(PPI_nx), comments='')

In [17]:
D_GDV_euclidean = squareform(pdist(GDV))
np.savetxt(f"{MATRIX_DIRECTORY}/sc_BioGRID_GDV_euclidean.txt", D_GDV_euclidean, 
           fmt='%.7f', header=' '.join(PPI_nx), comments='')

#### GCV

In [None]:
PPI_nx = nx.read_edgelist(f"{PPI_DIRECTORY}/BioGRID_sc.txt")
GDV = graco.orbits(PPI_nx)
GCV = graco.coefficients(GDV).fillna(0)

In [None]:
# Automated
for distance in ['euclidean', 'cityblock', 'seuclidean', 
                 'sqeuclidean', 'cosine', 'correlation', 
                 'chebyshev', 'canberra', 'braycurtis', 
                 'mahalanobis']:
    D = squareform(pdist(GDV, distance))
    np.savetxt(f"{MATRIX_DIRECTORY}/sc_BioGRID_GDV_{distance}.txt", D, 
               fmt='%.7f', header=' '.join(PPI_nx), comments='')

In [19]:
D_GCV_euclidean = squareform(pdist(GCV))
np.savetxt(f"{MATRIX_DIRECTORY}/sc_BioGRID_GCV_euclidean.txt", D_GCV_euclidean, 
           fmt='%.7f', header=' '.join(PPI_nx), comments='')

0.3039555549621582


### Z-score

In [4]:
PPI_nx = nx.read_edgelist(f"{PPI_DIRECTORY}/BioGRID_sc.txt")
GDV = graco.orbits(PPI_nx)
z_GDV = (GDV-GDV.mean())/GDV.std()

In [7]:
# Automated
for distance in ['euclidean', 'cityblock', 'seuclidean', 
                 'sqeuclidean', 'cosine', 'correlation', 
                 'chebyshev', 'canberra', 'braycurtis', 
                 'mahalanobis']:
    D = squareform(pdist(z_GDV, distance))
    np.savetxt(f"{MATRIX_DIRECTORY}/sc_BioGRID_GDV_zscore_{distance}.txt", D, 
               fmt='%.7f', header=' '.join(PPI_nx), comments='')

In [21]:
D_GDV_zscore1 = cdist(z_GDV, z_GDV, 'cityblock')
np.savetxt(f"{MATRIX_DIRECTORY}/sc_BioGRID_GDV_zscore1.txt", D_GDV_zscore1, 
           fmt='%.7f', header=' '.join(PPI_nx), comments='')

### Spectral stuff

In [18]:
def normalized_laplacian(D):
    # can be optimized more
    diag = ((len(D)-1) - np.sum(D, axis=0)).reshape(-1,1)
    L  = D-1
    np.fill_diagonal(L, diag)
    L0 = np.multiply(np.multiply(diag**(-1/2),L).T,diag**(-1/2))
    return L0

# Clustering 

## k-medoid

In [4]:
METHOD = "kmedoids"

if not os.path.exists(f"{CLUSTERS_DIRECTORY}/{METHOD}"):
    os.makedirs(f"{CLUSTERS_DIRECTORY}/{METHOD}")

### GDV

In [5]:
# Automated
all_distances = ['euclidean', 'cityblock', 'seuclidean', 
                 'sqeuclidean', 'cosine', 'correlation', 
                 'chebyshev', 'canberra', 'braycurtis', 
                 'mahalanobis']

for distance in ['sqeuclidean', 'chebyshev', 'canberra', 'braycurtis', 
                 'mahalanobis', 'euclidean', 'cityblock', 'seuclidean', 
                 'cosine', 'correlation']:
    print(distance)
    MATRIX_NAME = f"sc_BioGRID_GDV_{distance}"
    with open(f"{MATRIX_DIRECTORY}/{MATRIX_NAME}.txt", 'r') as f:
        line = f.readline()
    D = np.genfromtxt(f"{MATRIX_DIRECTORY}/{MATRIX_NAME}.txt", skip_header=1)

    int2gene = dict(enumerate(line.split()))

    t1 = time.time()
    for n_clusters in range(2, 100):
        initial_medoids = range(n_clusters)
        kmedoids_instance = kmedoids(D, initial_medoids, data_type='distance_matrix')
        kmedoids_instance.process()

        with open(f"{CLUSTERS_DIRECTORY}/{METHOD}/{MATRIX_NAME}_{n_clusters}.txt", 'w') as f:
            for cluster in kmedoids_instance.get_clusters():
                f.write(' '.join(map(int2gene.get,cluster)) + '\n')
        t2 = time.time()
        print(f'{n_clusters}: {t2-t1:.2f}sec', end='\r')
    print()

sqeuclidean
99: 533.32sec
chebyshev
99: 526.23sec
canberra
99: 521.68sec
braycurtis
99: 527.86sec
mahalanobis
99: 505.18sec
euclidean


KeyboardInterrupt: 

### z-GDV

In [6]:
# Automated
all_distances = ['euclidean', 'cityblock', 'seuclidean', 
                 'sqeuclidean', 'cosine', 'correlation', 
                 'chebyshev', 'canberra', 'braycurtis', 
                 'mahalanobis']

for distance in ['sqeuclidean', 'chebyshev', 'canberra', 'braycurtis', 
                 'mahalanobis', 'euclidean', 'cityblock', 'seuclidean', 
                 'cosine', 'correlation']:
    print(distance)
    MATRIX_NAME = f"sc_BioGRID_GDV_zscore_{distance}"
    with open(f"{MATRIX_DIRECTORY}/{MATRIX_NAME}.txt", 'r') as f:
        line = f.readline()
    D = np.genfromtxt(f"{MATRIX_DIRECTORY}/{MATRIX_NAME}.txt", skip_header=1)

    int2gene = dict(enumerate(line.split()))

    t1 = time.time()
    for n_clusters in range(2, 100):
        initial_medoids = range(n_clusters)
        kmedoids_instance = kmedoids(D, initial_medoids, data_type='distance_matrix')
        kmedoids_instance.process()

        with open(f"{CLUSTERS_DIRECTORY}/{METHOD}/{MATRIX_NAME}_{n_clusters}.txt", 'w') as f:
            for cluster in kmedoids_instance.get_clusters():
                f.write(' '.join(map(int2gene.get,cluster)) + '\n')
        t2 = time.time()
        print(f'{n_clusters}: {t2-t1:.2f}sec', end='\r')
    print()

sqeuclidean
99: 512.07sec
chebyshev
99: 506.98sec
canberra
99: 507.52sec
braycurtis
99: 496.15sec
mahalanobis
99: 502.34sec
euclidean
99: 515.79sec
cityblock
99: 513.60sec
seuclidean
99: 518.31sec
cosine
99: 523.45sec
correlation
99: 518.05sec


### GCV

In [None]:
# Automated
all_distances = ['euclidean', 'cityblock', 'seuclidean', 
                 'sqeuclidean', 'cosine', 'correlation', 
                 'chebyshev', 'canberra', 'braycurtis', 
                 'mahalanobis']

for distance in ['sqeuclidean', 'chebyshev', 'canberra', 'braycurtis', 
                 'mahalanobis', 'euclidean', 'cityblock', 'seuclidean', 
                 'cosine', 'correlation']:
    print(distance)
    MATRIX_NAME = f"sc_BioGRID_GCV_{distance}"
    with open(f"{MATRIX_DIRECTORY}/{MATRIX_NAME}.txt", 'r') as f:
        line = f.readline()
    D = np.genfromtxt(f"{MATRIX_DIRECTORY}/{MATRIX_NAME}.txt", skip_header=1)

    int2gene = dict(enumerate(line.split()))

    t1 = time.time()
    for n_clusters in range(2, 100):
        initial_medoids = range(n_clusters)
        kmedoids_instance = kmedoids(D, initial_medoids, data_type='distance_matrix')
        kmedoids_instance.process()

        with open(f"{CLUSTERS_DIRECTORY}/{METHOD}/{MATRIX_NAME}_{n_clusters}.txt", 'w') as f:
            for cluster in kmedoids_instance.get_clusters():
                f.write(' '.join(map(int2gene.get,cluster)) + '\n')
        t2 = time.time()
        print(f'{n_clusters}: {t2-t1:.2f}sec', end='\r')
    print()

### z-GCV

In [None]:
# Automated
all_distances = ['euclidean', 'cityblock', 'seuclidean', 
                 'sqeuclidean', 'cosine', 'correlation', 
                 'chebyshev', 'canberra', 'braycurtis', 
                 'mahalanobis']

for distance in ['sqeuclidean', 'chebyshev', 'canberra', 'braycurtis', 
                 'mahalanobis', 'euclidean', 'cityblock', 'seuclidean', 
                 'cosine', 'correlation']:
    print(distance)
    MATRIX_NAME = f"sc_BioGRID_GCV_zscore_{distance}"
    with open(f"{MATRIX_DIRECTORY}/{MATRIX_NAME}.txt", 'r') as f:
        line = f.readline()
    D = np.genfromtxt(f"{MATRIX_DIRECTORY}/{MATRIX_NAME}.txt", skip_header=1)

    int2gene = dict(enumerate(line.split()))

    t1 = time.time()
    for n_clusters in range(2, 100):
        initial_medoids = range(n_clusters)
        kmedoids_instance = kmedoids(D, initial_medoids, data_type='distance_matrix')
        kmedoids_instance.process()

        with open(f"{CLUSTERS_DIRECTORY}/{METHOD}/{MATRIX_NAME}_{n_clusters}.txt", 'w') as f:
            for cluster in kmedoids_instance.get_clusters():
                f.write(' '.join(map(int2gene.get,cluster)) + '\n')
        t2 = time.time()
        print(f'{n_clusters}: {t2-t1:.2f}sec', end='\r')
    print()

### TVDs

In [22]:
MATRIX_NAME = "sc_BioGRID_tvd3"
D_df = pd.read_csv(f"{MATRIX_DIRECTORY}/{MATRIX_NAME}.txt", delimiter=' ')

int2gene = dict(enumerate(D_df.columns))

t1 = time.time()
for n_clusters in range(2,50):
    initial_medoids = range(n_clusters)
    kmedoids_instance = kmedoids(np.array(D_df), initial_medoids, data_type='distance_matrix')
    kmedoids_instance.process()

    with open(f"{CLUSTERS_DIRECTORY}/{METHOD}/{MATRIX_NAME}_{n_clusters}.txt", 'w') as f:
        for cluster in kmedoids_instance.get_clusters():
            f.write(' '.join(map(int2gene.get,cluster)) + '\n')
    t2 = time.time()
    print(f'{n_clusters}: {t2-t1:.2f}sec', end='\r')

49: 273.25sec