In [1]:
def find_outliers(data, percentage=0.10):
    
    candidates_final = []
    return_list = []
    
    for column in data.columns:
        Q1 = data[column].quantile(0.25)
        Q3 = data[column].quantile(0.75)
        IQR = Q3 - Q1
        candidates_final = candidates_final + (list(data.loc[data[column]<(Q1-1.5*IQR)].index))
        candidates_final = candidates_final + (list(data.loc[data[column]<(Q1-1.5*IQR)].index))

    for i in candidates_final:
        if candidates_final.count(i) > percentage*len(data.columns):
            return_list.append(i)
            
    return set(return_list)

In [2]:
from numpy import linalg as LA    

def max_min_distances(data):
    
    minor_list = []
    max_list = []    
    
    if type(data) == pd.core.frame.DataFrame:
        data = data.to_numpy()

    for i in range(0,len(data)):
        minor = 9999
        distance = 9999
        max = 0
        for j in range(0,len(data)):
            if i != j:
                distance=LA.norm(data[i]-data[j],ord=2)
            if distance < minor:
                minor = distance
                index_min = j
            if distance > max:
                max = distance
                index_max = j
        minor_list.append(minor)
        max_list.append(max)
    minor_average = sum(minor_list[1:])/len(data)
    max_average = sum(max_list[1:])/len(data)
    
    return minor_list, minor_average, max_list, max_average


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d
import matplotlib.patches as mpatches
%matplotlib qt
import warnings; warnings.simplefilter('ignore')

Cavalli = pd.read_csv('Medulloblastoma Files\Tumor_Medulloblastoma_Cavalli_763_rma_sketch_hugene11t.txt', sep=';', na_values=".")
print("The shape of the Cavalli data is: ", Cavalli.shape)

Northcott = pd.read_csv('Medulloblastoma Files\Tumor_Medulloblastoma_MAGIC_Northcott_285_rma_sketch_hugene11t.txt',sep=';', na_values=".")
print("The shape of the Northcott data is: ", Northcott.shape)

The shape of the Cavalli data is:  (18479, 763)
The shape of the Northcott data is:  (18473, 285)


In [4]:
Cavalli = Cavalli.T
Northcott = Northcott.T

In [5]:
subgroups = pd.read_csv('Medulloblastoma Files\GSE85218_subgroups.csv', sep=' ', na_values=".", header=None)
subgroups.columns = ['key','Subgroup']
subgroups = subgroups.set_index('key')

In [6]:
def variance_threshold_selector(data, threshold=0.1):
    selector = VarianceThreshold(threshold)
    selector.fit(data)
    return data[data.columns[selector.get_support(indices=True)]]

In [7]:
from sklearn.feature_selection import VarianceThreshold

Cavalli_variance = variance_threshold_selector(Cavalli,0.25)
Northcott_variance = variance_threshold_selector(Northcott,0.25)

print("With a threshold of 0.25, the Cavalli dataset is just ", Cavalli_variance.shape)
print("With a threshold of 0.25, the Northcott dataset is just ", Northcott_variance.shape)

With a threshold of 0.25, the Cavalli dataset is just  (763, 6498)
With a threshold of 0.25, the Northcott dataset is just  (285, 5880)


In [8]:
Cavalli_columns = Cavalli_variance.columns
Northcott_columns = Northcott_variance.columns

Common_genes = list((set(Cavalli_columns))&(set(Northcott_columns)))

Cavalli_end = Cavalli[Common_genes]
Northcott_end = Northcott[Common_genes]

print("Cavalli common genes: ", Cavalli_end.shape)
print("Northcott common genes: ", Northcott_end.shape)

Cavalli common genes:  (763, 5668)
Northcott common genes:  (285, 5668)


In [39]:
import umap

%matplotlib qt
n_neighbors = [5,15,50,75]
data = Northcott_end.drop(['Subgroups'],axis=1)


for i in n_neighbors:
    reducer = umap.UMAP(n_components=3,n_neighbors=i)
    embedding = reducer.fit_transform(data)
    embedding_df = pd.DataFrame(embedding)
    embedding_df['Subgroups']= subgroups_northcott[1].values

    X_data= embedding_df[0]
    Y_data = embedding_df[1]
    Z_data = embedding_df[2]
    Sbgrp = embedding_df['Subgroups']

    cdict = {'Group4': 'red', 'SHH': 'blue', 'WNT': 'green', 'Group3': 'yellow'}
    c = [cdict[val] for val in Sbgrp]

    plt.figure(figsize=(16,10))
    ax = plt.axes(projection='3d')
    ax.scatter3D(X_data, Y_data, Z_data, c=c);
    red_c = mpatches.Patch(color='red', label='Group4')
    blue_c = mpatches.Patch(color='blue', label='SHH')
    green_c = mpatches.Patch(color='green', label='WNT')
    yellow_c = mpatches.Patch(color='yellow', label='Group3')
    plt.legend(handles=[red_c,blue_c,green_c,yellow_c])
    plt.title('UMAP with n_neighbors %i'%(i))
    plt.show()

In [None]:
Cavalli_end.to_csv('Medulloblastoma Files\Medulloblastoma_Cavalli_VAE_data_Less.csv', sep=',', header=True)
Northcott_end.to_csv('Medulloblastoma Files\Medulloblastoma_Northcott_VAE_data_Less.csv', sep=',', header=True)

### Check if they are similar
#### Haussdorf distance


In [9]:
subgroups_cavalli = pd.read_csv('Medulloblastoma Files\GSE85218_subgroups.csv', sep=' ',header=None)
print("The shape of the subgroups is: ", subgroups_cavalli.shape)
subgroups_northcott = pd.read_csv('Medulloblastoma Files\GSE37382_subgroups.csv', sep=' ',header=None)
print("The shape of the subgroups is: ", subgroups_northcott.shape)

The shape of the subgroups is:  (763, 2)
The shape of the subgroups is:  (285, 2)


In [10]:
Cavalli_end['Subgroups'] = subgroups_cavalli[1].values
Northcott_end['Subgroups'] = subgroups_northcott[1].values

Cavalli_G3 = Cavalli_end.loc[Cavalli_end['Subgroups']=='Group3']
Cavalli_G4 = Cavalli_end.loc[Cavalli_end['Subgroups']=='Group4']

Northcott_G3 = Northcott_end.loc[Northcott_end['Subgroups'] == 'Group3']
Northcott_G4 = Northcott_end.loc[Northcott_end['Subgroups'] == 'Group4']

Cavalli_G3 = Cavalli_G3.drop(['Subgroups'],axis=1)
Cavalli_G4 = Cavalli_G4.drop(['Subgroups'],axis=1)

Northcott_G3 = Northcott_G3.drop(['Subgroups'],axis=1)
Northcott_G4 = Northcott_G4.drop(['Subgroups'],axis=1)

In [11]:
minor_g3_cavalli, average_g3_cavalli, max_g3_cavalli, average_max_g3_cavalli = max_min_distances(Cavalli_G3)
minor_g4_cavalli, average_g4_cavalli, max_g4_cavalli, average_max_g4_cavalli  = max_min_distances(Cavalli_G4)
minor_g3_northcott, average_g3_northcott, max_g3_northcott, average_max_g3_northcott  = max_min_distances(Northcott_G3)
minor_g4_northcott, average_g4_northcott, max_g4_northcott, average_max_g4_northcott  = max_min_distances(Northcott_G4)

In [30]:
from scipy.spatial.distance import directed_hausdorff

C_N_G3 = directed_hausdorff(Cavalli_G3.to_numpy(), Northcott_G3.to_numpy())[0]
C_N_G4 = directed_hausdorff(Cavalli_G4.to_numpy(), Northcott_G4.to_numpy())[0]

print("Haussdorf distances G3: ", round(C_N_G3,4))
print("Haussdorf distances G4: ", round(C_N_G4,4))

Haussdorf distances G3:  93.2322
Haussdorf distances G4:  79.651


In [31]:
print("Average Min Cavalli G3: ",round(average_g3_cavalli,4)," Average Max Cavalli G3: ", round(average_max_g3_cavalli,4))
print("Average Min Cavalli G4: ",round(average_g4_cavalli,4)," Average Max Cavalli G4: ", round(average_max_g4_cavalli,4))

print("Average Min Northcott G3: ",round(average_g3_northcott,4)," Average Max Northcott G4: ", round(average_max_g3_northcott,4))
print("Average Min Northcott G4: ",round(average_g4_northcott,4)," Average Max Northcott G4: ", round(average_max_g4_northcott,4))

Average Min Cavalli G3:  61.7802  Average Max Cavalli G3:  117.9298
Average Min Cavalli G4:  53.3281  Average Max Cavalli G4:  101.093
Average Min Northcott G3:  63.6665  Average Max Northcott G4:  102.7325
Average Min Northcott G4:  53.9956  Average Max Northcott G4:  113.1816


#### Haussdorf distance between the sets of Cavalli and Northcott is similar to the mean between the largest and closest distance. It means that they distance between both datasets is similar to the one that exists between two random points of a subset. Thus, they are really close.

In [14]:
outliers_cavalli_g3 = find_outliers(Cavalli_G3,0.10)
outliers_cavalli_g4 = find_outliers(Cavalli_G4,0.10)
outliers_northcott_g3 = find_outliers(Northcott_G3,0.10)
outliers_northcott_g4 = find_outliers(Northcott_G4,0.10)

print("Outliers Cavalli-G3: ", outliers_cavalli_g3)
print("Outliers Cavalli-G4: ", outliers_cavalli_g4)
print("Outliers Northcott-G3: ", outliers_northcott_g3)
print("Outliers Northcott-G4: ", outliers_northcott_g4)

Outliers Cavalli-G3:  set()
Outliers Cavalli-G4:  set()
Outliers Northcott-G3:  set()
Outliers Northcott-G4:  {'gsm917243', 'gsm917255', 'gsm917267'}


In [21]:
Northcott_end = Northcott_end.drop(Northcott_end.loc[Northcott_end.index.isin(outliers_northcott_g4)].index,axis=0)

In [32]:
subgroups_northcott = subgroups_northcott.drop(subgroups_northcott.loc[subgroups_northcott[0].isin(outliers_northcott_g4)].index,axis=0)