In [1]:
import pandas
import numpy as np

In [2]:
from sklearn.model_selection import GridSearchCV, train_test_split,cross_val_score,StratifiedKFold,KFold
from sklearn.metrics import confusion_matrix,accuracy_score,silhouette_score,calinski_harabasz_score
from sklearn.feature_selection import SelectKBest,f_classif,SelectFdr
from sklearn import svm
from sklearn import preprocessing
from matplotlib import pyplot as plt
from sklearn.preprocessing import normalize,RobustScaler
from sklearn.cluster import KMeans
from lifelines import CoxPHFitter
from tensorflow.keras.layers import Input, Dense, BatchNormalization, Dropout
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras import backend
from tensorflow.keras import regularizers
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import glorot_uniform,RandomUniform,Constant

In [3]:
# Load the three omics tables (methylation, mRNA, miRNA) and the clinical table from CSV.
methy, mrna, mirna = (
    pandas.read_csv(csv_path)
    for csv_path in (
        "input data/methylation2.csv",
        "input data/mrna.csv",
        "input data/mirna.csv",
    )
)
clinical_new = pandas.read_csv("clinical_data3.csv")

In [4]:
## clean up the training sets

# Omics tables: discard the leftover 'Unnamed: 0' column, key the two expression
# tables by their identifier column, then transpose each table.
methy = methy.drop(columns=['Unnamed: 0']).T
mrna = mrna.drop(columns=['Unnamed: 0']).set_index(['Group.1']).T
mirna = mirna.drop(columns=['Unnamed: 0']).set_index(['GeneSymbol']).T

# Clinical table: discard 'Unnamed: 0' and keep only the four columns listed below.
clinical_new = clinical_new.drop(columns=['Unnamed: 0'])
clinical_new = clinical_new[['bcr_patient_barcode', 'vital_status', 'survival', 'cause_of_death']]
# NOTE: reset_index() is called without drop=True, so the previous row index
# is also kept as an extra 'index' column.
clinical_new = clinical_new.reset_index()

# vital status has to be coded 0/1 rather than 1/2, so shift it down by one
clinical_new["vital_status"] = clinical_new["vital_status"] - 1

In [5]:
# log2(x + 1) transform of the mRNA and miRNA tables; the methylation table is not transformed here
mrna, mirna = (np.log2(table + 1) for table in (mrna, mirna))

In [6]:
# Place the three omics tables side by side (methylation, mRNA, miRNA, in that
# column order), then select the rows labelled by the clinical table's
# 'bcr_patient_barcode' column, in the clinical table's order.
data_all = pandas.concat(objs=[methy, mrna, mirna], axis="columns")
data_all2 = data_all.loc[clinical_new['bcr_patient_barcode'], :]

# train autoencoder for 146 samples

# functions

In [7]:
def normalize_ae(X_train, boundaries=(20980, 38168, 38597)):
    """L2-normalise each omics block of a concatenated feature table, row by row.

    The columns of ``X_train`` are split positionally into consecutive blocks
    and every block is normalised independently, so that each row has unit
    L2 norm *within* each block.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Table whose columns hold the omics blocks back to back.
    boundaries : sequence of int, optional
        Exclusive end position of each block, in increasing order. The
        default ``(20980, 38168, 38597)`` yields the column ranges
        [0, 20980), [20980, 38168) and [38168, 38597), i.e. the
        methylation, mRNA and miRNA blocks. Columns at or beyond the last
        boundary are dropped.

    Returns
    -------
    pandas.DataFrame
        The normalised blocks concatenated column-wise. Because each block is
        rebuilt from a plain array, the result has a default integer row index
        and the column labels restart at 0 for every block.

    Raises
    ------
    ValueError
        If ``boundaries`` is empty or ``X_train`` has fewer columns than the
        last boundary (positional slicing would otherwise silently produce
        truncated blocks).
    """
    boundaries = tuple(boundaries)
    if not boundaries:
        raise ValueError("boundaries must contain at least one block end position")

    n_columns = X_train.shape[1]
    if n_columns < boundaries[-1]:
        raise ValueError(
            "X_train has %d columns but the block boundaries require at least %d"
            % (n_columns, boundaries[-1])
        )

    # Each block starts where the previous one ended; the first starts at column 0.
    starts = (0,) + boundaries[:-1]

    normalised_blocks = [
        # l2 normalization, sample norm (axis=1 -> one norm per row)
        pandas.DataFrame(normalize(X_train.iloc[:, start:stop], norm='l2', axis=1))
        for start, stop in zip(starts, boundaries)
    ]

    return pandas.concat(normalised_blocks, axis=1)

In [13]:
def coxph_feature_selection(data, clinical_data, alpha=0.05):
    """Keep the columns of ``data`` that are significant in a univariate Cox model.

    For every column of ``data`` a separate Cox proportional-hazards model is
    fitted with that column as the only covariate, ``survival`` as the
    duration and ``vital_status`` as the event indicator. A column is kept
    when the p-value of its coefficient is below ``alpha``.

    Parameters
    ----------
    data : pandas.DataFrame
        Candidate features, one column per feature. Rows are matched to
        ``clinical_data`` by index label (column-wise ``pandas.concat``), so
        both frames must share the same index.
    clinical_data : pandas.DataFrame
        Must contain the columns ``"survival"`` and ``"vital_status"``.
    alpha : float, optional
        Significance threshold; defaults to 0.05.

    Returns
    -------
    pandas.DataFrame
        The selected columns of ``data``, in their original order (possibly
        with zero columns if nothing passes the threshold).
    """
    cph = CoxPHFitter()
    outcome = clinical_data[["survival", "vital_status"]]

    selected = []
    # Select columns by position so the function does not depend on the
    # column labels being the integers 0..n-1.
    for position in range(data.shape[1]):
        covariate = data.iloc[:, [position]]
        data_label = pandas.concat([covariate, outcome], axis=1)
        cph.fit(data_label, duration_col="survival", event_col="vital_status")
        # Read the p-value by column name: the number and order of the summary
        # columns is not the same across lifelines releases, so a fixed
        # position can point at a different statistic.
        if cph.summary["p"].iloc[0] < alpha:
            selected.append(position)

    return data.iloc[:, selected]

In [9]:
def kmeans_function(data, cluster, random_state=None):
    """Cluster the rows of ``data`` with k-means and return the cluster labels.

    Only the requested number of clusters is fitted. Comparing candidate
    cluster counts (for example with silhouette or Calinski-Harabasz scores)
    is not done here and should be run separately when needed.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Observations to cluster.
    cluster : int
        Number of clusters.
    random_state : int, RandomState instance or None, optional
        Forwarded to ``KMeans``. With the default ``None`` the centroid
        initialisation is not seeded, so labels may differ between runs;
        pass an integer for reproducible labels.

    Returns
    -------
    numpy.ndarray
        Cluster index of every row of ``data``.
    """
    kmeans = KMeans(n_clusters=cluster, n_init=10, random_state=random_state).fit(data)
    return kmeans.labels_

In [10]:
# Seed shared by every kernel initializer in paad_model
seed_num = 1000

def paad_model(activation = "tanh",hidden_layers = 500, bottleneck = 100,l2 = 0.001,l1=0.001,
               input_dim = 38597, dropout_rate = 0.5):
    """Build and compile the dense autoencoder.

    Layer order (the index matters to code that reads an intermediate layer):
    0 Dense(hidden_layers) -> 1 Dropout -> 2 Dense(bottleneck) -> 3 Dropout ->
    4 Dense(hidden_layers) -> 5 Dropout -> 6 Dense(input_dim).
    The bottleneck representation is therefore the output of ``model.layers[2]``.

    Parameters
    ----------
    activation : str
        Activation used by every Dense layer, including the output layer.
    hidden_layers : int
        Width of the two hidden Dense layers around the bottleneck.
    bottleneck : int
        Width of the bottleneck Dense layer.
    l2 : float
        L2 kernel-regularisation factor of the first Dense layer.
    l1 : float
        L1 activity-regularisation factor of the first Dense layer.
    input_dim : int, optional
        Number of input features, which is also the width of the output
        layer. Defaults to 38597.
    dropout_rate : float, optional
        Rate of the three Dropout layers. Defaults to 0.5.

    Returns
    -------
    tensorflow.keras.models.Sequential
        Model compiled with the 'mean_squared_logarithmic_error' loss and the
        'sgd' optimizer.
    """

    def seeded_glorot():
        # A fresh initializer per layer, all created with the same seed.
        return glorot_uniform(seed = seed_num)

    model = Sequential([
        # Encoder: only this first layer carries the kernel/activity regularizers.
        Dense(hidden_layers, activation=activation, input_shape=(input_dim,),
              kernel_regularizer=regularizers.l2(l2),
              activity_regularizer=regularizers.l1(l1),
              kernel_initializer=seeded_glorot()),
        Dropout(dropout_rate),
        # Bottleneck (layer index 2)
        Dense(bottleneck, activation=activation, kernel_initializer=seeded_glorot()),
        Dropout(dropout_rate),
        # Decoder
        Dense(hidden_layers, activation=activation, kernel_initializer=seeded_glorot()),
        Dropout(dropout_rate),
        # Reconstruction layer, same width as the input
        Dense(input_dim, activation=activation, kernel_initializer=seeded_glorot()),
    ])

    model.compile(loss='mean_squared_logarithmic_error',optimizer='sgd')

    return model

# run

In [14]:
# Per-block L2 normalisation of the selected samples
df = normalize_ae(X_train=data_all2)

# Build the autoencoder (tanh, 500 hidden units, 200 bottleneck units) and
# train it to reconstruct its own input
model = paad_model(activation='tanh', hidden_layers=500, bottleneck=200, l2=0.001, l1=0.0001)
autoencoder_train = model.fit(df, df, batch_size=1, epochs=10)  # ,validation_split=0.2

# Backend function from the network input to the output of layer 2 (the
# bottleneck Dense layer); evaluate it on the training data
layers = backend.function(inputs=[model.layers[0].input], outputs=[model.layers[2].output])
feature_new = pandas.DataFrame(data=layers([df])[0])

# Keep the bottleneck features with a significant univariate Cox p-value,
# then split the samples into two k-means clusters
df_new = coxph_feature_selection(data=feature_new, clinical_data=clinical_new)
label_all = kmeans_function(data=df_new, cluster=2)

Train on 146 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
