In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans, SpectralClustering

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.manifold import TSNE

from sklearn.decomposition import PCA, KernelPCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.metrics import normalized_mutual_info_score
from scipy.spatial.distance import pdist
import umap

In [2]:
# Load the gene dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('genedata.csv')

In [4]:
# Sanity check: (rows, columns) of the frame that was just loaded.
df.shape

(795, 7002)

In [5]:
# Ground-truth labels live in the 'class' column.
y = df['class'].to_numpy()
# Features are every column from the third one onward
# (assumes the first two columns are not features -- TODO confirm against the CSV).
X = df.iloc[:, 2:].to_numpy()

In [6]:
def cluster_gene_data(X, n_components=700, n_clusters=5, random_state=None):
    """Embed ``X`` with UMAP, then cluster the embedding with k-means.

    Parameters
    ----------
    X : array-like
        Data handed unchanged to ``umap.UMAP.fit_transform``.
    n_components : int, default 700
        Dimensionality of the UMAP embedding.
    n_clusters : int, default 5
        Number of k-means clusters to fit on the embedding.
    random_state : int or None, default None
        Seed forwarded to both UMAP and KMeans. ``None`` leaves both
        estimators unseeded, so labels may differ from run to run; pass an
        integer to make the result repeatable.

    Returns
    -------
    The cluster index of every row, as returned by ``KMeans.fit_predict``.
    """
    reducer = umap.UMAP(n_components=n_components, random_state=random_state)
    X_umap = reducer.fit_transform(X)
    clusterer = KMeans(n_clusters=n_clusters, random_state=random_state)
    return clusterer.fit_predict(X_umap)

In [7]:
# Cluster the gene feature matrix; `labels` holds one cluster index per row of X.
labels = cluster_gene_data(X)

In [8]:
# Score the clustering against the known classes (NMI, geometric-mean normalisation).
normalized_mutual_info_score(y, labels, average_method="geometric")

0.980426301732183

In [10]:
# Write one cluster label per line, with no index column and no header row.
# `header` is documented as bool or list of str, so pass an explicit False.
pd.Series(labels).to_csv('gene_labels.txt', index=False, header=False)

In [11]:
def nmi(y, labels):
    """Return the normalised mutual information between ``y`` and ``labels``.

    Thin wrapper around ``normalized_mutual_info_score`` that fixes
    ``average_method`` to ``'geometric'``.
    """
    return normalized_mutual_info_score(y, labels, average_method='geometric')

def distance_pair(arr, oth_arr, type='euclidean', p=2):
    """Pairwise distances between the rows of ``arr`` and the rows of ``oth_arr``.

    Parameters
    ----------
    arr : array-like, shape (n, d)
        Points to measure from.
    oth_arr : array-like, shape (k, d)
        Points to measure to (e.g. centroids).
    type : {'euclidean', 'manhattan', 'lp'}, default 'euclidean'
        Metric to use. ``'lp'`` is the Minkowski distance of order ``p``.
    p : float, default 2
        Order of the norm; only read when ``type == 'lp'``.

    Returns
    -------
    numpy.ndarray, shape (n, k)
        Entry ``[i, j]`` is the distance between ``arr[i]`` and ``oth_arr[j]``.

    Raises
    ------
    ValueError
        If ``type`` is not one of the supported metrics, or if ``p`` is not
        positive for the ``'lp'`` metric.
    """
    if type not in ('euclidean', 'manhattan', 'lp'):
        raise ValueError(
            "type must be 'euclidean', 'manhattan' or 'lp', got %r" % (type,)
        )
    if type == 'lp' and p <= 0:
        raise ValueError("p must be positive for the 'lp' metric, got %r" % (p,))

    # Work in floating point so integer (especially unsigned) inputs cannot
    # wrap around when subtracted.
    arr = np.asarray(arr, dtype=float)
    oth_arr = np.asarray(oth_arr, dtype=float)

    k = oth_arr.shape[0]
    n = arr.shape[0]
    dist = np.empty((k, n))
    # One pass per row of oth_arr keeps the temporary at (n, d); the distance
    # is a row-wise reduction, so no (n, n) product is required.
    for i in range(k):
        abs_diff = np.abs(arr - oth_arr[i])
        if type == 'euclidean':
            dist[i] = np.sqrt(np.sum(abs_diff * abs_diff, axis=1))
        elif type == 'manhattan':
            dist[i] = np.sum(abs_diff, axis=1)
        else:
            # Minkowski: (sum_j |x_j - c_j|^p)^(1/p)
            dist[i] = np.sum(abs_diff ** p, axis=1) ** (1.0 / p)

    return dist.T

def kmeans(X, k, random_state=123):
    """Cluster the rows of ``X`` into ``k`` groups with Lloyd-style k-means.

    Parameters
    ----------
    X : array-like, shape (n, d)
        Data to cluster; it is not modified.
    k : int
        Number of clusters, ``1 <= k <= n``.
    random_state : int, default 123
        Seed for choosing the initial centroids. A private generator is used,
        so NumPy's global random state is left untouched.

    Returns
    -------
    numpy.ndarray, shape (n,)
        Index of the nearest centroid for every row of ``X``.

    Raises
    ------
    ValueError
        If ``k`` is not between 1 and the number of rows of ``X``.
    """
    X = np.asarray(X)
    n = X.shape[0]
    if not 1 <= k <= n:
        raise ValueError("k must satisfy 1 <= k <= %d, got %r" % (n, k))

    rng = np.random.RandomState(random_state)
    # Sample distinct rows so that two centroids never start on the same row,
    # and store them as floats so integer data does not truncate the means.
    centroids = X[rng.choice(n, size=k, replace=False)].astype(float)

    num_iter = 300
    labels = np.argmin(distance_pair(X, centroids), axis=1)
    for _ in range(num_iter):
        for i in range(k):
            members = X[labels == i]
            # An empty cluster keeps its previous centroid; averaging zero
            # rows would give NaN, and argmin would then pick the NaN column.
            if len(members):
                centroids[i] = members.mean(axis=0)
        new_labels = np.argmin(distance_pair(X, centroids), axis=1)
        if np.array_equal(labels, new_labels):
            break
        labels = new_labels
    return labels

def plot_data_2d(X_2d, y, xlabel, ylabel, title):
    """Scatter the first two columns of ``X_2d`` on a new figure.

    Points are coloured by ``y``; ``xlabel``, ``ylabel`` and ``title`` are
    applied to the single axes that is created. Nothing is returned.
    """
    axes = plt.figure().add_subplot(1, 1, 1)
    first, second = X_2d[:, 0], X_2d[:, 1]
    axes.scatter(first, second, c=y)
    for apply_text, text in (
        (axes.set_xlabel, xlabel),
        (axes.set_ylabel, ylabel),
        (axes.set_title, title),
    ):
        apply_text(text)

def plot_data_3d(X_3d, y, xlabel, ylabel, zlabel, title):
    """Scatter the first three columns of ``X_3d`` on a new 3-D figure.

    Parameters
    ----------
    X_3d : array supporting ``[:, j]`` indexing
        Columns 0, 1 and 2 are used as the x, y and z coordinates.
    y : array-like
        Passed as ``c=`` to ``scatter`` to colour the points.
    xlabel, ylabel, zlabel : str
        Labels for the three axes.
    title : str
        Title of the axes.
    """
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1, projection='3d')
    ax.scatter(X_3d[:, 0], X_3d[:, 1], X_3d[:, 2], c=y)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    # The z label needs its own setter; calling set_ylabel again would
    # overwrite the y label and leave the z axis unlabelled.
    ax.set_zlabel(zlabel)
    ax.set_title(title)

def draw_class_cluster_subplots(X_2d, classes, clusters, xlabel, ylabel, title_1, title_2):
    """Draw two side-by-side scatter plots of the same 2-D points.

    The left panel is coloured by ``classes`` and titled ``title_1``; the
    right panel is coloured by ``clusters`` and titled ``title_2``. Both
    panels share the same axis labels. Nothing is returned.
    """
    _, panels = plt.subplots(1, 2, figsize=(10, 5))
    first, second = X_2d[:, 0], X_2d[:, 1]
    colourings = (classes, clusters)
    headings = (title_1, title_2)
    for panel, colouring, heading in zip(panels, colourings, headings):
        panel.scatter(first, second, c=colouring)
        panel.set_xlabel(xlabel)
        panel.set_ylabel(ylabel)
        panel.set_title(heading)

# Switch to the second dataset. This rebinds df, X and y, so the gene data
# loaded earlier is no longer reachable under those names.
df = pd.read_csv('msdata.csv')
# Labels come from the 'class' column; features are every column from the
# third one onward (assumes the first two columns are not features -- TODO confirm).
y = df['class'].to_numpy()
X = df.iloc[:, 2:].to_numpy()


In [12]:
# Pipeline behind the ~0.97 NMI printed below: cosine-kernel PCA down to 140
# components, followed by spectral clustering into 3 groups with a degree-10
# polynomial affinity.
# NOTE(review): SpectralClustering is given no random_state here, so the exact
# score may vary between runs -- confirm before quoting the number.
pca = KernelPCA(kernel='cosine', n_components=140)
X_red_pca = pca.fit_transform(X)
spect = SpectralClustering(
    n_clusters=3,
    affinity='poly',
    degree=10,
    gamma=1,
    n_init=100,
)
labels_spect = spect.fit_predict(X_red_pca)
print('Highest achieved NMI score:', nmi(y, labels_spect))

Highest achieved NMI score: 0.9748507760700308


In [14]:
# Write one cluster label per line, with no index column and no header row.
# `header` is documented as bool or list of str, so pass an explicit False.
pd.Series(labels_spect).to_csv('ms_labels.txt', index=False, header=False)