In [1]:
import pandas
import numpy as np

In [2]:
from sklearn.model_selection import GridSearchCV, train_test_split,cross_val_score,StratifiedKFold,KFold
from sklearn.metrics import confusion_matrix,accuracy_score,silhouette_score#,calinski_harabaz_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import SelectKBest,f_classif,SelectFdr
from sklearn import svm
from sklearn import preprocessing
from matplotlib import pyplot as plt
from sklearn.preprocessing import normalize,RobustScaler,StandardScaler,MinMaxScaler

In [3]:
# read data
# Each omics table is read, loses its 'Unnamed: 0' column, is indexed by its
# feature-name column and is transposed (later cells select rows of these
# tables by patient barcode).
methy = (
    pandas.read_csv("pancreatic_cancer/input data/methylation2.csv")
    .drop(columns=['Unnamed: 0'])
    .set_index(['Group.1'])
    .T
)

# mRNA and miRNA additionally receive the log2(x + 1) transformation.
mrna = np.log2(
    pandas.read_csv("pancreatic_cancer/input data/mrna.csv")
    .drop(columns=['Unnamed: 0'])
    .set_index(['Group.1'])
    .T
    + 1
)
mirna = np.log2(
    pandas.read_csv("pancreatic_cancer/input data/mirna.csv")
    .drop(columns=['Unnamed: 0'])
    .set_index(['GeneSymbol'])
    .T
    + 1
)

# Clinical table: only the barcode, vital status and survival columns are kept.
clinical_new = pandas.read_csv("pancreatic_cancer/clinical_data3.csv").drop(columns=['Unnamed: 0'])
clinical_new = clinical_new[['bcr_patient_barcode','vital_status','survival']]

In [4]:
# vital status has to be 0/1 not 1/2
# NOTE: the column is shifted by one every time this cell runs, so it must be
# executed exactly once after clinical_new is loaded.
clinical_new["vital_status"] = clinical_new["vital_status"] - 1

# Side-by-side concatenation of the three omics tables, then restriction of the
# rows to the barcodes listed in clinical_new.
data_all = pandas.concat([methy, mrna, mirna], axis="columns")
data_all2 = data_all.loc[clinical_new['bcr_patient_barcode']]

In [5]:
# class label
# NOTE(review): these four paths use "pancreatic cancer/" (with a space) whereas
# every other cell reads from "pancreatic_cancer/" — confirm the directory name.
label_all = pandas.read_csv("pancreatic cancer/class_label.txt")

# Later cells call `predictor_*.intersection(<columns>)`. A DataFrame has no
# `intersection` method, so each predictor list is turned into a pandas.Index here.
# Assumes the feature names are stored in the last column of each file — TODO confirm.
predictor_mrna = pandas.Index(pandas.read_csv("pancreatic cancer/predictor_mrna.txt").iloc[:, -1])
predictor_mirna = pandas.Index(pandas.read_csv("pancreatic cancer/predictor_mirna.txt").iloc[:, -1])
predictor_methy = pandas.Index(pandas.read_csv("pancreatic cancer/predictor_methy.txt").iloc[:, -1])

In [6]:
# Restrict every omics table to the rows named by the clinical barcodes.
mrna_train, mirna_train, methy_train = (
    omics.loc[clinical_new['bcr_patient_barcode'], :]
    for omics in (mrna, mirna, methy)
)

Define functions

In [22]:
# train the classifier for prediction
def train_svm(train,test,labels=None):
    """Fit a sigmoid-kernel SVM on ``train`` and predict a class for each row of ``test``.

    Pipeline:
      1. Scaling: MinMaxScaler is applied to the transposed frame (i.e. across
         each row), then RobustScaler to the columns.
      2. Feature selection: ``SelectFdr(f_classif)`` is fitted on the scaled
         training frame; the selected column positions are applied to both frames.
      3. Tuning: ``GridSearchCV`` (cv=3) over ``C``, ``coef0`` and ``gamma``.
      4. The ``cross_val_score`` (cv=3) of the whole search and the best
         parameters are printed, then ``test`` is predicted.

    Parameters
    ----------
    train : pandas.DataFrame
        Training matrix; its rows must be aligned with ``labels``.
    test : pandas.DataFrame
        Matrix to predict. Columns are matched to ``train`` by position, so
        they must be in the same order.
    labels : array-like, optional
        Class label of every training row. When omitted, the notebook-level
        ``label_all`` is used (the original behaviour).

    Returns
    -------
    Predicted class labels, one per row of ``test``.

    Raises
    ------
    ValueError
        If the FDR-based selection keeps no feature.
    """
    if labels is None:
        labels = label_all

    # A one-column table is flattened to the 1-D target the estimators expect;
    # any other shape is passed through untouched.
    y = np.asarray(labels)
    if y.ndim == 2 and y.shape[1] == 1:
        y = y.ravel()

    def _scale(frame):
        # normalize: min-max across each row, then robust scaling of each column
        per_sample = MinMaxScaler().fit_transform(frame.transpose()).transpose()
        per_feature = RobustScaler().fit_transform(per_sample)
        return pandas.DataFrame(per_feature,columns = frame.columns)

    # NOTE(review): train and test are each scaled with scalers fitted on
    # themselves (test does not reuse the training statistics) — presumably
    # deliberate for cohorts measured on different platforms; confirm.
    train1 = _scale(train)
    test1 = _scale(test)

    train_select = SelectFdr(f_classif).fit(train1,y).get_support(indices = True)
    if len(train_select) == 0:
        raise ValueError("SelectFdr(f_classif) kept no feature; cannot train the SVM")
    train2 = train1.iloc[:,train_select]
    test2 = test1.iloc[:,train_select]

    svm_parameters = {
        'C': [0.0001,0.0005,0.001,0.005,0.01,0.05,0.1,0.5],
        'coef0': [0.001,0.005,0.05,0.1,0.25,0.5,0.75,1],
        'gamma': [0.001,0.005,0.01,0.05,0.1,0.5,1,1.5,2,2.5,3]}

    svm_tune = GridSearchCV(estimator=svm.SVC(kernel = 'sigmoid'),param_grid=svm_parameters,cv=3)
    svm_tune.fit(train2,y)

    # Scoring the GridSearchCV object itself re-runs the whole search inside
    # every outer fold.
    print(cross_val_score(svm_tune, train2,y, cv=3))
    print(svm_tune.best_params_)

    # predict
    pre = svm_tune.predict(test2)

    return pre

ICGC mRNA-seq

In [11]:
# AU RNA-seq table, indexed by its 'Unnamed: 0' column.
# It is used as loaded: no column drop, transpose or log2 step is applied here.
au_rna = pandas.read_csv(
    "pancreatic_cancer/external dataset/PDAC_AU/rnaseq.tsv"
).set_index(['Unnamed: 0'])

In [13]:
# Features found both in the mRNA predictor list and in the au_rna columns.
common_gene = predictor_mrna.intersection(au_rna.columns)
# Cut both cohorts down to those shared features.
train_panel, test_panel = (
    cohort[list(common_gene)] for cohort in (mrna_train, au_rna)
)

In [14]:
# Features found both in the mRNA predictor list and in the au_rna columns.
common_gene = predictor_mrna.intersection(au_rna.columns)
# Cut both cohorts down to those shared features.
train_panel, test_panel = (
    cohort[list(common_gene)] for cohort in (mrna_train, au_rna)
)
# predict
pred1 = train_svm(train=train_panel, test=test_panel)

ICGC mRNA array

In [15]:
# AU mRNA array table: drop the 'Unnamed: 0' column, index by 'Group.1',
# then transpose.
au_mrna_array = (
    pandas.read_csv("pancreatic_cancer/external dataset/PDAC_AU/mrna_array.tsv")
    .drop(columns=['Unnamed: 0'])
    .set_index(["Group.1"])
    .T
)

In [18]:
# Features found both in the mRNA predictor list and in the au_mrna_array columns.
common_gene = predictor_mrna.intersection(au_mrna_array.columns)
# Cut both cohorts down to those shared features.
train_panel, test_panel = (
    cohort[list(common_gene)] for cohort in (mrna_train, au_mrna_array)
)

pred2 = train_svm(train=train_panel, test=test_panel)

GEO mRNA array

In [20]:
# GEO mRNA table: drop the 'Unnamed: 0' column, index by 'Group.1',
# apply log2(x + 1), then transpose.
geo_mrna = (
    pandas.read_csv("pancreatic_cancer/external dataset/PDAC_GEO_mrna/mrna.tsv")
    .drop(columns=['Unnamed: 0'])
    .set_index(["Group.1"])
    .add(1)
    .pipe(np.log2)
    .T
)

In [23]:
# Features found both in the mRNA predictor list and in the geo_mrna columns.
common_gene = predictor_mrna.intersection(geo_mrna.columns)
# Cut both cohorts down to those shared features.
train_panel, test_panel = (
    cohort[list(common_gene)] for cohort in (mrna_train, geo_mrna)
)

pred3 = train_svm(train=train_panel, test=test_panel)

GEO PDAC miRNA

In [31]:
# GEO miRNA table (tab-separated): index by the 'Unnamed: 0' column and apply
# log2(x + 1). No transpose is applied here.
geo_mirna = (
    pandas.read_csv("pancreatic_cancer/external dataset/PDAC_GEO_mirna/mirna.tsv", sep="\t")
    .set_index(['Unnamed: 0'])
    .add(1)
    .pipe(np.log2)
)

In [32]:
# Features found both in the miRNA predictor list and in the geo_mirna columns.
common_gene = predictor_mirna.intersection(geo_mirna.columns)
# Cut both cohorts down to those shared features.
train_panel, test_panel = (
    cohort[list(common_gene)] for cohort in (mirna_train, geo_mirna)
)

In [33]:
# Features found both in the miRNA predictor list and in the geo_mirna columns.
common_gene = predictor_mirna.intersection(geo_mirna.columns)
# Cut both cohorts down to those shared features.
train_panel, test_panel = (
    cohort[list(common_gene)] for cohort in (mirna_train, geo_mirna)
)

pred4 = train_svm(train=train_panel, test=test_panel)

Australia PDAC methylation

In [34]:
# AU methylation table: drop the 'Unnamed: 0' column, index by 'Group.1',
# then transpose.
au_methy = (
    pandas.read_csv("pancreatic_cancer/external dataset/PDAC_AU/methylation.tsv")
    .drop(columns=['Unnamed: 0'])
    .set_index(['Group.1'])
    .T
)

In [35]:
# Features found both in the methylation predictor list and in the au_methy columns.
common_gene = predictor_methy.intersection(au_methy.columns)
# Cut both cohorts down to those shared features.
train_panel, test_panel = (
    cohort[list(common_gene)] for cohort in (methy_train, au_methy)
)

pred5 = train_svm(train=train_panel, test=test_panel)