In [22]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans, SpectralClustering

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.manifold import TSNE

from sklearn.decomposition import PCA, KernelPCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.metrics import normalized_mutual_info_score
from scipy.spatial.distance import pdist
import umap

In [23]:
df_gene = pd.read_csv('genedata.csv')

In [24]:
X_gene = df_gene.iloc[:,2:].to_numpy()
y_gene = df_gene['class'].to_numpy()

In [25]:
def cluster_gene_data(X):
    reducer = umap.UMAP(n_components=700)
    X_umap = reducer.fit_transform(X)
    kmeans_5 = KMeans(n_clusters=5)
    labels_umap_5 = kmeans_5.fit_predict(X_umap)
    return labels_umap_5

In [26]:
labels_gene = cluster_gene_data(X_gene)

In [27]:
normalized_mutual_info_score(y_gene, labels_gene, average_method="geometric")

0.980426301732183

In [28]:
pd.Series(labels_gene).to_csv('gene_labels.txt', index=False, header=None)

In [29]:
df_ms = pd.read_csv('msdata.csv')
X_ms = df_ms.iloc[:,2:].to_numpy()
y_ms = df_ms['class'].to_numpy()

In [30]:
def cluster_ms_data(X):
    pca = KernelPCA(n_components=140, kernel='cosine')
    X_red_pca = pca.fit_transform(X)
    spect = SpectralClustering(n_clusters=3, gamma=1, n_init=100, affinity='poly', degree=10)
    labels_spect = spect.fit_predict(X_red_pca)
    return labels_spect

In [31]:
labels_ms = cluster_ms_data(X_ms)

In [33]:
print('Highest achieved NMI score:', normalized_mutual_info_score(y_ms, labels_ms))

Highest achieved NMI score: 0.9748507486774602


In [35]:
pd.Series(labels_ms).to_csv('ms_labels.txt', index=False, header=None)