In [4]:
import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from tqdm import tqdm

### Load data

In [5]:
# Load the cohort that is later used for training, together with its subgroup labels.
# The paths use forward slashes: a backslash followed by a letter (e.g. '\M', '\G') is an
# invalid escape sequence in a regular Python string and is only a path separator on Windows.
data = pd.read_csv('Medulloblastoma Files/Medulloblastoma_Cavalli_VAE_data.csv', sep=',', na_values=".")
print("The shape of the data is: ", data.shape)
# The first CSV column has no header, so pandas names it 'Unnamed: 0'; give it a real name.
data = data.rename(columns={'Unnamed: 0': 'Patient'})

# Space-separated file without a header row, so the columns are named 0 and 1.
subgroups = pd.read_csv('Medulloblastoma Files/GSE85218_subgroups.csv', sep=' ',header=None)
print("The shape of the subgroups is: ", subgroups.shape)

The shape of the data is:  (763, 12088)
The shape of the subgroups is:  (763, 2)


In [6]:
# Load the cohort that is later used for testing, together with its subgroup labels.
# The paths use forward slashes: a backslash followed by a letter (e.g. '\M', '\G') is an
# invalid escape sequence in a regular Python string and is only a path separator on Windows.
data_test = pd.read_csv('Medulloblastoma Files/Medulloblastoma_Northcott_VAE_data.csv', sep=',', na_values=".")
print("The shape of the data is: ", data_test.shape)
# The first CSV column has no header, so pandas names it 'Unnamed: 0'; give it a real name.
data_test = data_test.rename(columns={'Unnamed: 0': 'Patient'})

# Space-separated file without a header row, so the columns are named 0 and 1.
subgroups_test = pd.read_csv('Medulloblastoma Files/GSE37382_subgroups.csv', sep=' ',header=None)
print("The shape of the subgroups is: ", subgroups_test.shape)

The shape of the data is:  (285, 12088)
The shape of the subgroups is:  (285, 2)


In [7]:
from sklearn.preprocessing import StandardScaler

# 'Patient' is an identifier, not a feature, so it is removed from both cohorts.
data = data.drop(columns=['Patient'])
data_test = data_test.drop(columns=['Patient'])

scaler = StandardScaler()

# Standardize each column as (x - mu) / s using the training cohort's own statistics.
# Standardized values are centred on 0 with unit variance; they are not bounded to [-1, 1].
data = scaler.fit_transform(data)

# The same scaler object is fitted again here, so the test cohort is standardized with
# its own mean/std, and `scaler` ends up holding the test-cohort statistics.
data_test = scaler.fit_transform(data_test)

In [8]:
# The scaler returned plain arrays; wrap them in DataFrames again.
data = pd.DataFrame(data)
data_test = pd.DataFrame(data_test)

# One row per sample, converted to float tensors for the network.
train_dataset = torch.tensor(data.to_numpy()).float()
test_dataset = torch.tensor(data_test.to_numpy()).float()

# Training batches are reshuffled on every pass over the loader.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=8, shuffle=True)

# Test batches keep the row order of `test_dataset`.
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=8, shuffle=False)

### Define the Autoencoder

In [9]:
class Autoencoder(nn.Module):
    """Fully connected autoencoder with one hidden layer on each side of the bottleneck.

    Required keyword arguments:
        input_shape: number of features of each input vector.
        embedding_dim: width of the hidden layer in both encoder and decoder.
        bottleneck: size of the latent code.

    A missing keyword raises ``KeyError``.
    """

    def __init__(self, **kwargs):
        super().__init__()
        n_features = kwargs["input_shape"]
        n_hidden = kwargs["embedding_dim"]
        n_code = kwargs["bottleneck"]

        # The attribute names are the state-dict keys, so they must stay stable for
        # saved weights to load. Layers are created in encoder -> decoder order.
        self.encoder_input_layer = nn.Linear(in_features=n_features, out_features=n_hidden)
        self.encoder_output_layer = nn.Linear(in_features=n_hidden, out_features=n_code)
        self.decoder_input_layer = nn.Linear(in_features=n_code, out_features=n_hidden)
        self.decoder_output_layer = nn.Linear(in_features=n_hidden, out_features=n_features)

    def forward(self, features):
        """Return ``(reconstructed, code)`` for a batch of input vectors."""
        # Encoder: two linear layers, each followed by ReLU (the code is therefore >= 0).
        hidden = torch.relu(self.encoder_input_layer(features))
        code = torch.relu(self.encoder_output_layer(hidden))

        # Decoder: ReLU hidden layer, then tanh, so every reconstructed value is in (-1, 1).
        hidden = torch.relu(self.decoder_input_layer(code))
        reconstructed = torch.tanh(self.decoder_output_layer(hidden))

        return reconstructed, code

# Network sized for the 12087 feature columns left after dropping 'Patient'.
model = Autoencoder(
    input_shape=12087,
    embedding_dim=2048,
    bottleneck=256,
)

# Reconstruction objective (mean squared error) and the optimizer used by the training cells.
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

### Train and test functions
#### Not needed if you only want to use the trained model: its parameters are saved to a state dictionary and loaded again a few cells below

In [None]:
def fit(model, dataloader):
    """Train `model` for one pass over `dataloader` and return the mean loss per sample.

    Uses the notebook-level `criterion` and `optimizer`, which must already exist.

    Args:
        model: network whose forward pass returns ``(reconstruction, code)``.
        dataloader: yields batches of input tensors (no labels).

    Returns:
        The reconstruction loss averaged over all samples seen, or NaN when the
        dataloader yields no batches.
    """
    model.train()
    running_loss = 0.0
    n_samples = 0
    # len(dataloader) is the exact number of batches, including a final partial one.
    for batch in tqdm(dataloader, total=len(dataloader)):
        batch = batch.view(batch.size(0), -1)  # flatten each sample to a vector
        optimizer.zero_grad()  # reset the gradients back to zero
        reconstruction, _ = model(batch)  # compute reconstructions
        loss = criterion(reconstruction, batch)  # reconstruction loss for this batch
        loss.backward()  # back-propagate
        optimizer.step()  # update the weights
        # The notebook's criterion (nn.MSELoss with default reduction) returns a batch mean,
        # so weight it by the batch size; dividing a sum of batch means by the number of
        # samples would under-report the loss by roughly a factor of batch_size.
        running_loss += loss.item() * batch.size(0)
        n_samples += batch.size(0)
    if n_samples == 0:
        return float("nan")
    return running_loss / n_samples

In [None]:
def validate(model, dataloader):
    """Evaluate `model` on `dataloader` without updating weights; return the mean loss per sample.

    Uses the notebook-level `criterion`, which must already exist.

    Args:
        model: network whose forward pass returns ``(reconstruction, code)``.
        dataloader: yields batches of input tensors (no labels).

    Returns:
        The reconstruction loss averaged over all samples seen, or NaN when the
        dataloader yields no batches.
    """
    model.eval()
    running_loss = 0.0
    n_samples = 0
    with torch.no_grad():  # no gradients are needed for evaluation
        # len(dataloader) is the exact number of batches, including a final partial one.
        for batch in tqdm(dataloader, total=len(dataloader)):
            batch = batch.view(batch.size(0), -1)  # flatten each sample to a vector
            reconstruction, _ = model(batch)
            loss = criterion(reconstruction, batch)
            # The notebook's criterion (nn.MSELoss with default reduction) returns a batch
            # mean, so weight it by the batch size to obtain a true per-sample average.
            running_loss += loss.item() * batch.size(0)
            n_samples += batch.size(0)

    if n_samples == 0:
        return float("nan")
    return running_loss / n_samples

### Train the Autoencoder

In [None]:
epochs = 20   # the loss stopped improving around this epoch
batch_size = 8  # not read in this cell; the loaders were already built with batch_size=8

# Per-epoch loss histories.
train_loss, test_loss = [], []

for epoch in range(epochs):
    print(f"\n Epoch {epoch+1} of {epochs}")

    # One optimisation pass over the training loader, then an evaluation pass on the test loader.
    train_epoch_loss = fit(model, train_loader)
    test_epoch_loss = validate(model, test_loader)

    train_loss.append(train_epoch_loss)
    test_loss.append(test_epoch_loss)

    print(f"\nTrain Loss: {train_epoch_loss:.4f}")
    print(f"Test Loss: {test_epoch_loss:.4f}")

### Save model

In [None]:
# Persist only the learned parameters (the state dict), not the module object itself.
PATH = './AE_MDLTBM.pth'
torch.save(obj=model.state_dict(), f=PATH)

### Load model
#### This chunk loads a trained AE

In [10]:
PATH = './AE_MDLTBM.pth'
# The architecture must match the one the weights were saved from.
model = Autoencoder(input_shape=12087, embedding_dim=2048, bottleneck=256)
# map_location='cpu' lets the checkpoint load on a CPU-only machine even if it was saved
# from a GPU; the notebook never moves the model off the CPU.
# NOTE: torch.load unpickles the file, so only load checkpoints from a trusted source.
model.load_state_dict(torch.load(PATH, map_location='cpu'))

<All keys matched successfully>

### Function to get the embeddings and the reconstructions

In [11]:
def get_embeddings(model,dataloader):
    """Run `model` over `dataloader` and collect its reconstructions and latent codes.

    Args:
        model: network whose forward pass returns ``(reconstruction, code)``.
        dataloader: yields batches of input tensors (no labels).

    Returns:
        ``(rec_model, embedding_model)``: two float64 NumPy arrays with one row per
        sample, in the order the dataloader yields them. Their widths follow the
        model's outputs instead of being hard-coded.
    """
    model.eval()
    rec_batches = []
    code_batches = []
    with torch.no_grad():  # inference only, no gradients needed
        # len(dataloader) is the exact number of batches, including a final partial one.
        for batch in tqdm(dataloader, total=len(dataloader)):
            batch = batch.view(batch.size(0), -1)
            reconstruction, coded = model(batch)
            # Convert explicitly instead of relying on NumPy coercing tensors implicitly.
            rec_batches.append(reconstruction.cpu().numpy())
            code_batches.append(coded.cpu().numpy())

    if not rec_batches:
        # Nothing was produced; return the empty shapes the notebook's model would give.
        return np.zeros(shape=(0,12087)), np.zeros(shape=(0,256))

    # Concatenate once at the end (growing an array inside the loop is quadratic) and
    # keep the float64 dtype callers received before.
    rec_model = np.concatenate(rec_batches, axis=0).astype(np.float64)
    embedding_model = np.concatenate(code_batches, axis=0).astype(np.float64)
    return rec_model, embedding_model

In [12]:
# Reconstructions and bottleneck codes of the test cohort, in test_loader (unshuffled) order.
reconstructed, coded = get_embeddings(model, test_loader)

36it [00:02, 13.99it/s]


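### UMAP to visualize the data. The intent is to visualize the embeddings (note: the cell below projects the scaled test data `data_test`, not the embeddings `coded`)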

In [10]:
from mpl_toolkits import mplot3d
import matplotlib.patches as mpatches
import umap

%matplotlib qt
n_neighbors = [5,15,50,75,100,200]

for i in n_neighbors:
    reducer = umap.UMAP(n_components=3,n_neighbors=i)
    embedding = reducer.fit_transform(data_test)
    embedding_df = pd.DataFrame(embedding)
    embedding_df['Subgroups']= subgroups_test[1].values

    X_data= embedding_df[0]
    Y_data = embedding_df[1]
    Z_data = embedding_df[2]
    Sbgrp = embedding_df['Subgroups']

    cdict = {'Group4': 'red', 'SHH': 'blue', 'WNT': 'green', 'Group3': 'yellow'}
    c = [cdict[val] for val in Sbgrp]

    plt.figure(figsize=(16,10))
    ax = plt.axes(projection='3d')
    ax.scatter3D(X_data, Y_data, Z_data, c=c);
    red_c = mpatches.Patch(color='red', label='Group4')
    blue_c = mpatches.Patch(color='blue', label='SHH')
    green_c = mpatches.Patch(color='green', label='WNT')
    yellow_c = mpatches.Patch(color='yellow', label='Group3')
    plt.legend(handles=[red_c,blue_c,green_c,yellow_c])
    plt.title('UMAP with n_neighbors %i'%(i))
    plt.show()

### Now, for every observation, we find the other observation with which it has the greatest cosine similarity

In [13]:
def comparison_cosine(coded, labels=None):
    """Label each observation with the subgroup of its most cosine-similar other observation.

    The neighbour's subgroup is looked up directly by position. Deriving it from index
    ranges only works when the rows are sorted SHH / Group3 / Group4, and it mislabels a
    neighbour that is the first Group3 row as 'Group4'.

    Args:
        coded: 2-D table (DataFrame or array) with one embedding per row.
        labels: subgroup of each row, in the same order as `coded`. Defaults to the
            notebook-level ``subgroups_test[1]``.

    Returns:
        A list with one subgroup label per row of `coded` (empty for empty input).
        With a single row there is no other observation, so its own label is returned.

    Raises:
        ValueError: if `labels` and `coded` have different lengths.
    """
    vectors = np.asarray(coded, dtype=float)
    n_obs = len(vectors)
    if n_obs == 0:
        return []

    if labels is None:
        labels = subgroups_test[1]
    labels = np.asarray(labels)
    if len(labels) != n_obs:
        raise ValueError("labels must have one entry per row of coded")

    # Cosine similarity of every pair at once: normalise the rows, then take dot products.
    # All-zero rows keep a norm of 1 so they yield similarity 0 instead of NaN.
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    unit = vectors / norms
    similarity = unit @ unit.T

    # An observation must not be matched with itself.
    np.fill_diagonal(similarity, -np.inf)

    # argmax returns the first maximum, i.e. the lowest-index neighbour on ties.
    nearest = similarity.argmax(axis=1)
    return labels[nearest].tolist()

In [14]:
from sklearn.metrics.pairwise import cosine_similarity

# Wrap the embeddings in a DataFrame; later cells select and average its rows with pandas.
coded = pd.DataFrame(coded)

# Subgroup of each observation's most similar neighbour, stored next to the true subgroup.
indexes = comparison_cosine(coded)
subgroups_test = subgroups_test.assign(Match=indexes)

### What percentage of observations have their greatest cosine similarity with an observation of their own subgroup?

In [24]:
# Compare each observation's true subgroup with the subgroup of its most similar observation.
true_labels = subgroups_test[1]
matched_labels = subgroups_test['Match']

# One [true subgroup, matched subgroup, row position] entry for every disagreement.
mismatch = [
    [truth, guess, position]
    for position, (truth, guess) in enumerate(zip(true_labels, matched_labels))
    if not guess == truth
]

# Every row that is not a mismatch is a match.
match = len(subgroups_test) - len(mismatch)
total = match/len(subgroups_test)
print("The % of observations that have the greatest cosine similarity with a observation of its own subgroups is: ", round((total*100),5))


The % of observations that have the greatest cosine similarity with a observation of its own subgroups is:  93.33333


### For the observations that do not match an observation of their own subgroup, which subgroup do they match?

In [25]:
# Each entry is [true subgroup, matched subgroup, row position].
mismatch

[['SHH', 'Group4', 6],
 ['SHH', 'Group4', 35],
 ['Group3', 'Group4', 54],
 ['Group3', 'Group4', 58],
 ['Group3', 'Group4', 71],
 ['Group3', 'SHH', 89],
 ['Group3', 'Group4', 91],
 ['Group3', 'Group4', 95],
 ['Group3', 'Group4', 96],
 ['Group4', 'Group3', 106],
 ['Group4', 'Group3', 111],
 ['Group4', 'Group3', 134],
 ['Group4', 'Group3', 172],
 ['Group4', 'Group3', 195],
 ['Group4', 'Group3', 241],
 ['Group4', 'SHH', 244],
 ['Group4', 'Group3', 256],
 ['Group4', 'Group3', 273],
 ['Group4', 'Group3', 278]]

### It can be seen that most of the mismatches occur between G3 and G4
#### Interestingly, some SHH and G4 observations are matched with each other, which can also be seen in the UMAP: there are some red points close to blue points.

### Now let's compute a standard of each subgroup with the average values and calculate the cosine similarity between these standards

In [17]:
# Select each subgroup's embeddings with a positional boolean mask. This leaves `coded`
# untouched (no temporary 'Subgroup' column to add and drop) and keeps the string labels
# out of the mean, which pandas >= 2.0 refuses to compute over non-numeric columns.
subgroup_labels = subgroups_test[1].to_numpy()

mean_shh = coded[subgroup_labels == 'SHH']
mean_g3 = coded[subgroup_labels == 'Group3']
mean_g4 = coded[subgroup_labels == 'Group4']

# The "standard" of a subgroup is the column-wise mean of its embeddings, kept as a
# 1 x n_dims row so it can be passed straight to cosine_similarity.
standard_shh = mean_shh.mean().values.reshape(1,-1)
standard_g3 = mean_g3.mean().values.reshape(1,-1)
standard_g4 = mean_g4.mean().values.reshape(1,-1)

cosine_shh_g3 = cosine_similarity(standard_shh,standard_g3)
cosine_shh_g4 = cosine_similarity(standard_shh,standard_g4)
cosine_g4_g3 = cosine_similarity(standard_g4,standard_g3)
print("Cosine similarity SHH-G3: ", cosine_shh_g3)
print("Cosine similarity SHH-G4: ", cosine_shh_g4)
print("Cosine similarity G4-G3: ", cosine_g4_g3)

Cosine similarity SHH-G3:  [[0.79669056]]
Cosine similarity SHH-G4:  [[0.78173266]]
Cosine similarity G4-G3:  [[0.876827]]


### The greatest cosine similarity is between G3 and G4

### Now we compute the cosine similarity between each observation and the standards

In [19]:
def comparison_cosine_standard(coded, standards=None):
    """Label each observation with the subgroup whose standard it is most cosine-similar to.

    Args:
        coded: 2-D table (DataFrame or array) with one embedding per row.
        standards: mapping from subgroup name to its standard vector (any shape that
            flattens to the embedding width). Defaults to the notebook-level
            ``standard_shh``, ``standard_g3`` and ``standard_g4``.

    Returns:
        A list with one subgroup name per row of `coded` (empty for empty input).
        On ties the subgroup listed first in `standards` wins; with the default
        standards that order is SHH, Group3, Group4.
    """
    vectors = np.asarray(coded, dtype=float)
    if len(vectors) == 0:
        return []

    if standards is None:
        standards = {'SHH': standard_shh, 'Group3': standard_g3, 'Group4': standard_g4}
    names = list(standards)
    centroids = np.vstack(
        [np.asarray(standards[name], dtype=float).reshape(1, -1) for name in names]
    )

    def _unit_rows(matrix):
        # Scale every row to unit length; all-zero rows are left as zeros.
        norms = np.linalg.norm(matrix, axis=1, keepdims=True)
        norms[norms == 0] = 1.0
        return matrix / norms

    # similarity[i, k] is the cosine similarity between observation i and standard k.
    similarity = _unit_rows(vectors) @ _unit_rows(centroids).T

    # argmax returns the first maximum, which gives the tie order documented above.
    best = similarity.argmax(axis=1)
    return [names[k] for k in best]

In [20]:
# Subgroup whose standard is most similar to each observation, stored next to the true subgroup.
indexes_stardard = comparison_cosine_standard(coded)
subgroups_test['Match_standard'] = indexes_stardard

In [30]:
# Compare each observation's true subgroup with the subgroup of its most similar standard.
true_labels = subgroups_test[1]
matched_labels = subgroups_test['Match_standard']

# One [true subgroup, matched subgroup, row position] entry for every disagreement.
mismatch = [
    [truth, guess, position]
    for position, (truth, guess) in enumerate(zip(true_labels, matched_labels))
    if not guess == truth
]

# Every row that is not a mismatch is a match.
match = len(subgroups_test) - len(mismatch)
total = match/len(subgroups_test)
print("The % of observations that have the greatest cosine similarity with the standard of its own subgroups is: ", round((total*100),5))

The % of observations that have the greatest cosine similarity with the standard of its own subgroups is:  97.89474


In [32]:
# Each entry is [true subgroup, subgroup of the most similar standard, row position].
mismatch

[['Group3', 'Group4', 71],
 ['Group3', 'Group4', 91],
 ['Group4', 'Group3', 106],
 ['Group4', 'SHH', 244],
 ['Group4', 'SHH', 256],
 ['Group4', 'Group3', 278]]