In [4]:
import os.path
import tarfile
from urllib.request import urlretrieve

import numpy as np
import pandas as pd
import scipy.cluster.hierarchy
from matplotlib import pyplot as plt, lines
from mpl_toolkits.mplot3d import Axes3D
from sklearn import decomposition, cluster, mixture, datasets

In [8]:
# Toy two-moons sample; make_moons returns a (points, labels) tuple.
# The seed is fixed so that a fresh "Restart & Run All" draws the same noisy points.
moons = datasets.make_moons(n_samples=400, noise=1, random_state=0)

In [2]:
# Download the TCGA PANCAN RNA-seq archive once and cache it under a local file name.
data_path = 'Cancer RNA-seq.tar.gz'
if os.path.isfile(data_path):
    print('Cancer RNA-seq already exists in the parent directory')
else:
    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00401/TCGA-PANCAN-HiSeq-801x20531.tar.gz'
    # Download under a temporary name first so that an interrupted transfer is
    # never mistaken for a complete archive on the next run.
    partial_path = data_path + '.part'
    urlretrieve(url, partial_path)
    os.replace(partial_path, data_path)

# The file is a gzip-compressed tar archive, not a CSV, so its members are read
# individually instead of parsing the raw tar stream with pd.read_csv.
# The archive is expected to hold 'data.csv' (expression values, one row per
# sample) and 'labels.csv' (sample id, tumor class) inside a single folder.
wanted_members = ('data.csv', 'labels.csv')
frames = {}
with tarfile.open(data_path, 'r:gz') as archive:
    # Single forward pass over the archive; each wanted member is parsed as it is met.
    for member in archive:
        member_name = os.path.basename(member.name)
        if member.isfile() and member_name in wanted_members:
            frames[member_name] = pd.read_csv(archive.extractfile(member))

missing_members = [name for name in wanted_members if name not in frames]
if missing_members:
    raise FileNotFoundError('%s does not contain %s' % (data_path, ', '.join(missing_members)))

# Raw expression table; its first column holds the sample identifiers.
dataframe = frames['data.csv']

# Remove the sample-id column so only numeric features are passed to PCA
df_features = dataframe.iloc[:, 1:]

# Keep the first two label columns (sample id, tumor class); later cells read
# the tumor class by position with df_labels.iloc[m, 1].
df_labels = frames['labels.csv'].iloc[:, :2]

# Every sample needs exactly one label for the per-point marker lookup below.
assert len(df_features) == len(df_labels), 'features and labels have different lengths'

KeyboardInterrupt: 

In [None]:
# Perform pca
# Project the samples onto their first three principal components.
# random_state is fixed because scikit-learn may choose a randomized SVD solver
# for wide inputs, which would otherwise make the projection (and every
# clustering built on it) vary between runs.
pca = decomposition.PCA(n_components=3, random_state=0)
pca.fit(df_features)
X_reduced = pca.transform(df_features)

# 3D scatter of the reduced data, using the explicit axes object throughout.
fig2 = plt.figure()
ax = fig2.add_subplot(111, projection='3d')
ax.scatter(X_reduced[:, 0], X_reduced[:, 1], X_reduced[:, 2])
ax.set_title('Distribution of Data after PCA')
ax.set_xlabel('PC 1')
ax.set_ylabel('PC 2')
ax.set_zlabel('PC 3')


In [None]:
# Legend handles for the cluster plots: one black, line-less marker per tumor type.
# Each (marker, tumor name) pair below becomes one Line2D proxy artist.
l_square, l_plus, l_circle, l_triangle, l_wiggle = (
    lines.Line2D([], [], color='black', marker=marker_symbol, linestyle='None',
                 markersize=10, label=tumor_name)
    for marker_symbol, tumor_name in (
        ('s', 'PRAD'),
        ('+', 'LUAD'),
        ('o', 'KIRC'),
        ('^', 'COAD'),
        ('3', 'BRCA'),
    )
)

In [None]:
# Compare agglomerative-clustering linkages on the PCA-reduced data.
# One 3D figure per linkage: colour = assigned cluster, marker = true tumor type.

# Marker per known tumor type; any other label falls back to '3', the marker
# used for the BRCA legend entry. Computed once because it does not depend on
# the linkage.
tumor_markers = {'PRAD': 's', 'LUAD': '+', 'KIRC': 'o', 'COAD': '^'}
point_markers = np.array([tumor_markers.get(tumor, '3') for tumor in df_labels.iloc[:, 1]])
# Only rows that have a label are drawn.
points = X_reduced[:len(point_markers)]

for link in ('ward', 'average', 'complete', 'single'):
    clustering = cluster.AgglomerativeClustering(linkage=link, n_clusters=5)
    clustering.fit(X_reduced)
    label = clustering.labels_
    # Map every cluster id into [0, 1) and look up its RGBA colour in one call.
    colors = plt.cm.nipy_spectral(label.astype(float) / np.max(label + 1))[:len(point_markers)]

    fig = plt.figure()
    # add_subplot attaches the 3D axes to the figure; a bare Axes3D(fig) is no
    # longer added automatically by current Matplotlib and leaves the figure empty.
    ax = fig.add_subplot(111, projection='3d')
    # One scatter call per marker shape instead of one call per sample.
    for marker in ('s', '+', 'o', '^', '3'):
        mask = point_markers == marker
        if not mask.any():
            continue
        # depthshade=False keeps the cluster colours exactly as assigned.
        ax.scatter(points[mask, 0], points[mask, 1], points[mask, 2],
                   color=colors[mask], marker=marker,
                   s=28, linewidths=1, edgecolor='k', depthshade=False)

    ax.legend(handles=[l_square, l_plus, l_circle, l_triangle, l_wiggle], loc=1, title='Tumor Names')
    ax.set_title('%s linkage' % link, fontdict={'fontweight': 'bold', 'fontsize': 18})


In [None]:
# Compare Gaussian-mixture covariance structures on the PCA-reduced data.
# One 3D figure per covariance type: colour = assigned component, marker = true tumor type.

# Marker per known tumor type; any other label falls back to '3', the marker
# used for the BRCA legend entry. Computed once because it does not depend on
# the covariance type.
tumor_markers = {'PRAD': 's', 'LUAD': '+', 'KIRC': 'o', 'COAD': '^'}
point_markers = np.array([tumor_markers.get(tumor, '3') for tumor in df_labels.iloc[:, 1]])
# Only rows that have a label are drawn.
points = X_reduced[:len(point_markers)]

for cov in ('full', 'tied', 'diag', 'spherical'):
    # random_state pins the mixture initialisation so the component assignment
    # is the same on every run.
    gmm = mixture.GaussianMixture(covariance_type=cov, n_components=5, random_state=0)
    gmm.fit(X_reduced)
    label = gmm.predict(X_reduced)
    # Map every component id into [0, 1) and look up its RGBA colour in one call.
    colors = plt.cm.jet(label.astype(float) / np.max(label + 1))[:len(point_markers)]

    fig = plt.figure()
    # add_subplot attaches the 3D axes to the figure; a bare Axes3D(fig) is no
    # longer added automatically by current Matplotlib and leaves the figure empty.
    ax = fig.add_subplot(111, projection='3d')
    # One scatter call per marker shape instead of one call per sample.
    for marker in ('s', '+', 'o', '^', '3'):
        mask = point_markers == marker
        if not mask.any():
            continue
        # depthshade=False keeps the component colours exactly as assigned.
        ax.scatter(points[mask, 0], points[mask, 1], points[mask, 2],
                   color=colors[mask], marker=marker,
                   s=28, linewidths=1, edgecolor='k', depthshade=False)

    ax.legend(handles=[l_square, l_plus, l_circle, l_triangle, l_wiggle], loc=1, title='Tumor Names')
    ax.set_title('%s covariance' % cov, fontdict={'fontweight': 'bold', 'fontsize': 16})