In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d
import matplotlib.patches as mpatches
%matplotlib qt

# Load the two expression tables. Both are ';'-separated text files in which
# a lone '.' marks a missing value.
# Forward slashes are used in the paths on purpose: inside a normal string
# literal '\T' is an invalid escape sequence (a warning today, an error in a
# future Python), and a backslash is only a path separator on Windows.
Cavalli = pd.read_csv(
    'Medulloblastoma Files/Tumor_Medulloblastoma_Cavalli_763_rma_sketch_hugene11t.txt',
    sep=';',
    na_values=".",
)
print("The shape of the Cavalli data is: ", Cavalli.shape)

Northcott = pd.read_csv(
    'Medulloblastoma Files/Tumor_Medulloblastoma_MAGIC_Northcott_285_rma_sketch_hugene11t.txt',
    sep=';',
    na_values=".",
)
print("The shape of the Northcott data is: ", Northcott.shape)

The shape of the Cavalli data is:  (18479, 763)
The shape of the Northcott data is:  (18473, 285)


In [2]:
# Swap rows and columns of both tables so that the 763 / 285 entries that were
# columns on load become the rows.
# NOTE: the result is bound back to the same names, so executing this cell a
# second time transposes the frames back to their loaded orientation.
Cavalli, Northcott = Cavalli.transpose(), Northcott.transpose()

In [3]:
# Two-column, space-separated table without a header row; the first column is
# named 'key' and becomes the index, the second is the 'Subgroup' label.
# The path uses a forward slash: '\G' inside a normal string literal is an
# invalid escape sequence, and a backslash separator only works on Windows.
subgroups = pd.read_csv(
    'Medulloblastoma Files/GSE85218_subgroups.csv',
    sep=' ',
    na_values=".",
    header=None,
)
subgroups.columns = ['key','Subgroup']
subgroups = subgroups.set_index('key')

In [4]:
def variance_threshold_selector(data, threshold=0.1):
    """Return the columns of ``data`` that ``VarianceThreshold`` retains.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose columns are the candidate features.
    threshold : float, default 0.1
        Variance cut-off handed to scikit-learn's ``VarianceThreshold``.

    Returns
    -------
    pandas.DataFrame
        The retained columns of ``data``, in their original order and with
        the original row index.
    """
    selector = VarianceThreshold(threshold=threshold)
    selector.fit(data)
    kept_positions = selector.get_support(indices=True)
    # Select by position, not by label: looking the kept labels up again
    # would pull in every column sharing a duplicated label, including ones
    # the selector rejected.
    return data.iloc[:, kept_positions]

In [5]:
from sklearn.feature_selection import VarianceThreshold

# Filter each table independently with the same 0.1 variance cut-off.
Cavalli_variance = variance_threshold_selector(Cavalli, threshold=0.1)
Northcott_variance = variance_threshold_selector(Northcott, threshold=0.1)

# Report how many columns survive in each table.
print(
    "With a threshold of 0.10, the Cavalli dataset is just ",
    Cavalli_variance.shape,
)
print(
    "With a threshold of 0.10, the Northcott dataset is just ",
    Northcott_variance.shape,
)

With a threshold of 0.10, the Cavalli dataset is just  (763, 13115)
With a threshold of 0.10, the Northcott dataset is just  (285, 12389)


In [6]:
Cavalli_columns = Cavalli_variance.columns
Northcott_columns = Northcott_variance.columns

# Columns that passed the variance filter in BOTH tables.
# The intersection is built by walking Cavalli's columns in order
# (dict.fromkeys drops repeats while keeping first-seen order) instead of
# iterating a set: for string labels, set iteration order changes from one
# interpreter run to the next, which would shuffle the column order of the
# frames below and of anything exported from them.
northcott_column_set = set(Northcott_columns)
Common_genes = list(
    dict.fromkeys(col for col in Cavalli_columns if col in northcott_column_set)
)

# Restrict both tables to the shared columns, in the same order.
Cavalli_end = Cavalli[Common_genes]
Northcott_end = Northcott[Common_genes]

In [7]:
import umap
n_neighbors = [5,15,50,75,100,200]

# UMAP is stochastic; a fixed seed makes each figure reproducible across runs.
UMAP_SEED = 42

# Subgroup label -> marker colour. The legend is generated from this mapping
# so the colours and the legend cannot drift apart.
cdict = {'Group4': 'red', 'SHH': 'blue', 'WNT': 'green', 'Group3': 'yellow'}
legend_handles = [
    mpatches.Patch(color=colour, label=label) for label, colour in cdict.items()
]

# NOTE(review): subgroup labels are attached to the embedding by position
# (`.values` discards the 'key' index), which assumes `subgroups` lists the
# samples in the same order as the rows of `Cavalli_end` -- confirm.
# The colours do not depend on n_neighbors, so they are computed once here;
# an unexpected label fails before any embedding is fitted.
c = [cdict[val] for val in subgroups['Subgroup'].values]

# One 3-D embedding and one scatter plot per neighbourhood size.
for i in n_neighbors:
    reducer = umap.UMAP(n_components=3, n_neighbors=i, random_state=UMAP_SEED)
    embedding = reducer.fit_transform(Cavalli_end)
    embedding_df = pd.DataFrame(embedding)
    embedding_df['Subgroups'] = subgroups['Subgroup'].values

    fig = plt.figure(figsize=(16,10))
    ax = fig.add_subplot(projection='3d')
    ax.scatter3D(embedding_df[0], embedding_df[1], embedding_df[2], c=c)
    ax.legend(handles=legend_handles)
    ax.set_title('UMAP with n_neighbors %i' % i)
    plt.show()

In [14]:
# Write both shared-column tables to CSV in the working directory. The comma
# separator and the header row are to_csv defaults, so they are not restated.
Cavalli_end.to_csv('Medulloblastoma_Cavalli_VAE_data.csv')
Northcott_end.to_csv('Medulloblastoma_Northcott_VAE_data.csv')