In [None]:
import pandas
import random
import numpy as np
import sklearn
from sklearn.model_selection import GridSearchCV, train_test_split,cross_val_score,StratifiedKFold,KFold
from sklearn.metrics import confusion_matrix,accuracy_score,silhouette_score#,calinski_harabaz_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import SelectKBest,f_classif,SelectFdr,RFECV, VarianceThreshold
print(sklearn.__version__)
print(np.__version__)
print(pandas.__version__)
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from matplotlib import pyplot as plt
from sklearn.preprocessing import normalize,RobustScaler,StandardScaler,MinMaxScaler
from mlxtend.evaluate import paired_ttest_5x2cv

1.2.2
1.22.4
1.5.3


In [None]:
# Load the three training omics matrices and the clinical table.
methy = pandas.read_csv("methylation2.csv")
mrna = pandas.read_csv("mrna.csv")
mirna = pandas.read_csv("mirna.csv")
clinical_new = pandas.read_csv("clinical_data3.csv")

# CHANGED get the gold class labels directly from the clinical data
# (taken before the column subset below, which no longer contains 'class')
label_all = clinical_new['class']

# Only the patient barcode and the two survival fields are kept.
clinical_new = clinical_new.drop(columns=['Unnamed: 0'])[
    ['bcr_patient_barcode', 'vital_status', 'survival']
]

# Drop the 'Unnamed: 0' column, index by feature name, then transpose so that
# patients end up in rows and features in columns.
methy = methy.drop(columns=['Unnamed: 0']).set_index(['Group.1']).transpose()
mrna = mrna.drop(columns=['Unnamed: 0']).set_index(['Group.1']).transpose()
mirna = mirna.drop(columns=['Unnamed: 0']).set_index(['GeneSymbol']).transpose()

# log2(x + 1) transformation of the two expression matrices
# (the methylation matrix is left untransformed)
mrna = np.log2(mrna.add(1))
mirna = np.log2(mirna.add(1))

In [None]:
# vital status has to be 0/1 not 1/2
# NOTE: the column is shifted in place, so running this cell twice shifts it twice.
clinical_new["vital_status"] = clinical_new["vital_status"] - 1

# Column-wise concatenation of the three omics layers, then restriction to
# (and ordering by) the patients listed in the clinical table.
data_all = pandas.concat([methy, mrna, mirna], axis="columns")
data_all2 = data_all.loc[clinical_new['bcr_patient_barcode']]

In [None]:
# MODEL-P read in of predictor data
# Previous text-file inputs, kept for reference:
#label_all = pandas.read_csv("pancreatic cancer/class_label.txt")
#predictor_mrna = pandas.read_csv("pancreatic cancer/predictor_mrna.txt")
#predictor_mirna = pandas.read_csv("pancreatic cancer/predictor_mirna.txt")
#predictor_methy = pandas.read_csv("pancreatic cancer/predictor_methy.txt")

# CHANGED to read in the predictors as csv file, resulting from the single_omics function
predictor_mrna, predictor_mirna, predictor_methy = (
    pandas.read_csv(csv_path)
    for csv_path in ("mrna_predictors.csv", "mirna_predictors.csv", "methy_predictors.csv")
)

In [None]:
# Restrict each omics matrix to the patients in the clinical table, in clinical-table order
# (the same order as the class labels in label_all).
mrna_train, mirna_train, methy_train = (
    omics.loc[clinical_new['bcr_patient_barcode'], :]
    for omics in (mrna, mirna, methy)
)

In [None]:
# train the classifier for prediction
def train_svm(train, test, labels=None):
    """Fit a sigmoid-kernel SVM on ``train`` and predict the classes of ``test``.

    Pipeline: per-sample min-max scaling, per-feature ``RobustScaler``,
    ``SelectFdr(f_classif)`` feature selection on the training panel, a grid
    search over ``C`` / ``coef0`` / ``gamma`` (3-fold CV) and finally
    prediction of the test panel. The number of selected features, the 3-fold
    cross-validation scores of the tuned search and the best parameters are
    printed along the way.

    Parameters
    ----------
    train : pandas.DataFrame
        Training panel, samples in rows and features in columns.
    test : pandas.DataFrame
        Test panel. Features are picked by column position, so the columns
        must be in the same order as in ``train``.
    labels : array-like, optional
        Class label for every row of ``train``. When omitted, the
        notebook-level ``label_all`` is used (the original behaviour).

    Returns
    -------
    pre : numpy.ndarray
        Predicted class label for every row of ``test``.
    train2 : pandas.DataFrame
        Scaled training panel restricted to the selected features.
    test2 : pandas.DataFrame
        Scaled test panel restricted to the same features.
        Both returned frames carry a fresh positional index, not the sample IDs.

    Raises
    ------
    ValueError
        If the feature selection keeps no feature at all.
    """
    if labels is None:
        # Fall back to the notebook-level training labels.
        labels = label_all

    # normalize: scale every sample (row) to [0, 1] over its own features.
    # MinMaxScaler works column-wise, hence the transpose before and after.
    train1 = MinMaxScaler().fit_transform(train.transpose()).transpose()
    test1 = MinMaxScaler().fit_transform(test.transpose()).transpose()

    # NOTE(review): the test panel is scaled with a RobustScaler fitted on the
    # test panel itself rather than with the scaler fitted on the training
    # panel. Presumably intentional because the test cohorts come from other
    # platforms -- confirm before changing.
    train1 = RobustScaler().fit_transform(train1)
    test1 = RobustScaler().fit_transform(test1)

    train1 = pandas.DataFrame(train1,columns = train.columns)
    test1 = pandas.DataFrame(test1,columns = test.columns)

    # selectfdr feature selection
    train_index = SelectFdr(f_classif).fit(train1,labels).get_support(indices = True)
    print(len(train_index))

    # CHANGED tests to get variance threshold to use in the Galaxy workflow
    # cv_list = []
    # for i in range(0, 10):
    #   i = i * 0.1
    #   train_select = VarianceThreshold(i)
    #   train_index = train_select.fit(train1).get_support(indices = True)
    #   print(len(train_index))
    #   train2 = train1.iloc[:,train_index]
    #   test2 = test1.iloc[:,train_index]

    #   svm_parameters = {
    #   #'kernel': ['rbf','sigmoid','poly','linear'],#
    #   'C': [0.0001,0.0005,0.001,0.005,0.01,0.05,0.1,0.5],#,1,1.5,2,2.5,3,3.5,4,4.5,,5.5,6,6.5,7,7.5,8,8.5,9,9.5,10
    #   #'coef0': [0.25],
    #   'coef0': [0.001,0.005,0.05,0.1,0.25,0.5,0.75,1],#,1.5,2,2.5
    #   #'degree' : [2,4,3],
    #   'gamma': [0.001,0.005,0.01,0.05,0.1,0.5,1,1.5,2,2.5,3]}#0.0001,0.0005,

    #   svm_tune = GridSearchCV(estimator=svm.SVC(kernel = 'sigmoid'),param_grid=svm_parameters,cv=3)
    #   svm_tune.fit(train2, label_all)
    #   cv_scores = cross_val_score(svm_tune, train2,label_all, cv=5)
    #   cvscore_mean = np.mean(cv_scores)
    #   print(i, cvscore_mean)
    # END tests for variance threshold

    # CHANGED tests for RFECV feature selection
    #selector = RFECV(estimator=svm.SVC(kernel = 'linear'), min_features_to_select=10, step=1)
    #selector = selector.fit(train1, label_all)
    #train_index = selector.get_support(indices = True)
    #print(len(train_index))
    # END tests for RFECV feature selection

    # Fail early with a readable message instead of an error from inside the
    # SVM fit when nothing survives the feature selection.
    if len(train_index) == 0:
        raise ValueError(
            "Feature selection kept no features; cannot train the SVM on an empty panel."
        )

    # Apply the selection by column position to both panels.
    train2 = train1.iloc[:,train_index]
    test2 = test1.iloc[:,train_index]

    # Hyper-parameter grid for the sigmoid-kernel SVC.
    svm_parameters = {
      #     #'kernel': ['rbf','sigmoid','poly','linear'],#
           'C': [0.0001,0.0005,0.001,0.005,0.01,0.05,0.1,0.5],#,1,1.5,2,2.5,3,3.5,4,4.5,,5.5,6,6.5,7,7.5,8,8.5,9,9.5,10
      #     #'coef0': [0.25],
           'coef0': [0.001,0.005,0.05,0.1,0.25,0.5,0.75,1],#,1.5,2,2.5
      #     #'degree' : [2,4,3],
           'gamma': [0.001,0.005,0.01,0.05,0.1,0.5,1,1.5,2,2.5,3]}#0.0001,0.0005,

    svm_tune = GridSearchCV(estimator=svm.SVC(kernel = 'sigmoid'),param_grid=svm_parameters,cv=3)
    svm_tune.fit(train2, labels)

    # Cross-validating the grid search itself re-runs the tuning inside every
    # outer fold (nested CV), so the printed scores include the tuning step.
    print(cross_val_score(svm_tune, train2,labels, cv=3))
    print(svm_tune.best_params_)

    # predict
    pre = svm_tune.predict(test2)
    #pre2 = svm_tune.predict_proba(test2)

    return pre, train2, test2

## rna seq ICGC

In [None]:
# read in the processed test set
# NOTE(review): the file has a .tsv extension but is read with the default
# comma separator; the recorded overlap of 109 genes suggests it parses as
# intended -- confirm.
au_rna = pandas.read_csv("rnaseq.tsv")
au_rna = au_rna.set_index(['Unnamed: 0'])

# CHANGED because predictor_mrna was in csv format instead of txt
# The overlap is sorted because a set of strings has no stable iteration order
# between interpreter sessions; sorting keeps the panel column order (and the
# CSV files written from the panels) reproducible from run to run.
common_gene = sorted(set(predictor_mrna['Group.1']).intersection(set(au_rna.columns)))
print(len(common_gene))
train_panel = mrna_train[common_gene]
test_panel = au_rna[common_gene]

# save train and test panel for input in the Galaxy workflows
#train_panel.to_csv("mrna_trainpanel.csv", index=False)
#test_panel.to_csv("mrna_testpanel.csv", index=False)

109


In [None]:
# Train on the mRNA training panel and predict the ICGC RNA-seq samples;
# train/test receive the scaled, feature-selected panels.
pred1, train, test = train_svm(train=train_panel, test=test_panel)

107
[0.81632653 0.89795918 0.79166667]
{'C': 0.5, 'coef0': 0.25, 'gamma': 0.005}


In [None]:
# save predictions in csv for input in Galaxy
# NOTE(review): the parameters quoted on the next line differ from the
# best_params_ printed by the train_svm call above; presumably they come from
# an earlier run -- confirm which run produced the saved predictions.
print(pred1) # made with setting the SVM to SVC(C=0.5, coef0=0.75, gamma=0.01)
# pred1 is the array returned by the classifier's predict call, so it has to be
# wrapped in a DataFrame before it can be written (as done for the other cohorts):
#pandas.DataFrame(pred1).to_csv('mrna_pred_MODELP.csv', index = False)

[0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1
 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 1]


In [None]:
# run for variance threshold
# The recorded output (threshold, mean CV score pairs) matches the print of the
# sweep that is commented out inside train_svm, so that sweep was presumably
# enabled for this run -- confirm. Re-running overwrites pred1/train/test.
pred1, train, test = train_svm(train=train_panel, test=test_panel)

109
0.0 0.8558620689655172
109
0.1 0.8558620689655172
109
0.2 0.8558620689655172
109
0.30000000000000004 0.8558620689655172
104
0.4 0.8558620689655172
84
0.5 0.8282758620689655
39
0.6000000000000001 0.8216091954022989
22
0.7000000000000001 0.7593103448275862
7
0.8 0.8006896551724138
2
0.9 0.6439080459770116


Extra code needed for gathering data for Kaplan-Meier plots

In [None]:
# Keep only the donor id, vital status and survival time columns.
au_rna_cli = pandas.read_csv("rnaseq_cli1 (1).tsv")
au_rna_cli_logrank = au_rna_cli.loc[
    :, ['icgc_donor_id', 'donor_vital_status', 'donor_survival_time']
]
au_rna_cli_logrank

In [None]:
# Show the clinical donor ids and how many there are.
print(au_rna_cli['icgc_donor_id'].tolist())
print(len(au_rna_cli['icgc_donor_id'].tolist()))

# The clinical rows must be in exactly the same order as the test panel rows.
print(
    "The lists are the same."
    if au_rna_cli['icgc_donor_id'].tolist() == test_panel.index.tolist()
    else "The lists are different."
)

['DO49193', 'DO49178', 'DO49185', 'DO49183', 'DO49168', 'DO49166', 'DO49170', 'DO49164', 'DO49138', 'DO49135', 'DO49133', 'DO49129', 'DO49127', 'DO49130', 'DO49113', 'DO49090', 'DO49079', 'DO49078', 'DO34504', 'DO33168', 'DO33128', 'DO34432', 'DO33472', 'DO33480', 'DO34785', 'DO34793', 'DO34720', 'DO33408', 'DO34736', 'DO33400', 'DO33392', 'DO33376', 'DO34680', 'DO34696', 'DO33336', 'DO33344', 'DO34640', 'DO34656', 'DO34608', 'DO34600', 'DO34336', 'DO34312', 'DO34264', 'DO34240', 'DO34288', 'DO33984', 'DO32900', 'DO32860', 'DO32863', 'DO32875', 'DO32878', 'DO32829', 'DO33600', 'DO34905', 'DO34801', 'DO34817', 'DO33544', 'DO34849', 'DO33512']
59
The lists are the same.


In [None]:
# Write the three log-rank columns to disk (without the row index) for further use.
au_rna_cli_logrank.to_csv(path_or_buf='rna_cli_logrank.csv', index=False)

## mrna array ICGC

In [None]:
# Load the array data, drop the 'Unnamed: 0' column, index by 'Group.1' and
# transpose so that the 'Group.1' values become the columns.
au_mrna_array = (
    pandas.read_csv("mrna_array.tsv")
    .drop(columns=['Unnamed: 0'])
    .set_index(["Group.1"])
    .transpose()
)

In [None]:
# CHANGED because predictor_mrna was in csv format instead of txt
# The overlap is sorted because a set of strings has no stable iteration order
# between interpreter sessions; sorting keeps the panel column order (and the
# CSV files written from the panels) reproducible from run to run.
common_gene = sorted(set(predictor_mrna['Group.1']).intersection(set(au_mrna_array.columns)))
print(len(common_gene))
train_panel = mrna_train[common_gene]
test_panel = au_mrna_array[common_gene]


101


In [None]:
# save train and test panel for input in the Galaxy workflows
# (written without the row index, i.e. without the sample ids)
train_panel.to_csv(path_or_buf='final_mrna_array_trainpanel.csv', index=False)
test_panel.to_csv(path_or_buf='final_mrna_array_testpanel.csv', index=False)

In [None]:
# Train on the mRNA training panel and predict the ICGC mRNA-array samples.
pred2, train2, test2 = train_svm(train=train_panel, test=test_panel)

99
[0.85714286 0.91836735 0.8125    ]
{'C': 0.5, 'coef0': 0.001, 'gamma': 0.01}


In [None]:
# Show the predicted labels, then store them as a single-column CSV.
print(pred2)

pred2 = pandas.DataFrame(data=pred2)
pred2.to_csv(path_or_buf='mrna_array_pred_MODELP.csv', index=False)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0]


In [None]:
# variance threshold run
# The recorded output (threshold, mean CV score pairs) matches the print of the
# sweep that is commented out inside train_svm, so that sweep was presumably
# enabled for this run -- confirm. Re-running overwrites pred2/train2/test2.
pred2, train2, test2 = train_svm(train=train_panel, test=test_panel)

101
0.0 0.8558620689655172
101
0.1 0.8558620689655172
101
0.2 0.8558620689655172
101
0.30000000000000004 0.8558620689655172
95
0.4 0.862758620689655
68
0.5 0.862758620689655
22
0.6000000000000001 0.7880459770114943
6
0.7000000000000001 0.7055172413793104
3
0.8 0.7055172413793104
1
0.9 0.7055172413793104
1
1.0 0.7055172413793104


Extra code needed for gathering data for Kaplan-Meier plots

In [None]:
# Clinical table of the array cohort, reduced to the three log-rank columns.
mrna_array_cli = pandas.read_csv("mrnaarray_cli.tsv")
logrank_mrna_array = mrna_array_cli.loc[
    :, ['icgc_donor_id', 'donor_vital_status', 'donor_survival_time']
]

# Inner-join on the donor id, starting from the test panel's row ids so the
# merged rows follow the test panel order.
id_df = pandas.DataFrame(test_panel.index, columns=['icgc_donor_id'])
merged_id_df = id_df.merge(logrank_mrna_array, on='icgc_donor_id')

# Donors missing from (or duplicated in) the clinical table would make the two
# id lists differ.
print(
    "The lists are the same."
    if merged_id_df['icgc_donor_id'].tolist() == test_panel.index.tolist()
    else "The lists are different."
)

#merged_id_df.to_csv('mrna_array_cli_logrank.csv', index=False)

The lists are the same.


## ICGC methylation

In [None]:
# Load the methylation data, drop the 'Unnamed: 0' column, index by 'Group.1'
# and transpose so that the 'Group.1' values become the columns.
au_methy = (
    pandas.read_csv("methylation.tsv")
    .drop(columns=['Unnamed: 0'])
    .set_index(['Group.1'])
    .transpose()
)

In [None]:
# CHANGED because predictor_methy was in csv format instead of txt
# The overlap is sorted because a set of strings has no stable iteration order
# between interpreter sessions; sorting keeps the panel column order (and the
# CSV files written from the panels) reproducible from run to run.
common_gene = sorted(set(predictor_methy['Group.1']).intersection(set(au_methy.columns)))
print(len(common_gene))
train_panel = methy_train[common_gene]
test_panel = au_methy[common_gene]

85


In [None]:
# Save the methylation train and test panels (without the row index).
train_panel.to_csv(path_or_buf='final_methy_trainpanel.csv', index=False)
test_panel.to_csv(path_or_buf='final_methy_testpanel.csv', index=False)

In [None]:
# Train on the methylation training panel and predict the ICGC methylation samples.
pred3, train3, test3 = train_svm(train=train_panel, test=test_panel)

81
[0.83673469 0.87755102 0.875     ]
{'C': 0.5, 'coef0': 0.001, 'gamma': 0.01}


In [None]:
# Show the predicted labels, then store them as a single-column CSV.
print(pred3)
pred3 = pandas.DataFrame(data=pred3)
pred3.to_csv(path_or_buf='methylation_pred_MODELP.csv', index=False)

[0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1]


In [None]:
# variance threshold run
# The recorded output (threshold, mean CV score pairs) matches the print of the
# sweep that is commented out inside train_svm, so that sweep was presumably
# enabled for this run -- confirm. Re-running overwrites pred3/train3/test3.
pred3, train3, test3 = train_svm(train=train_panel, test=test_panel)

85
0.0 0.8629885057471263
84
0.1 0.8629885057471263
84
0.2 0.8629885057471263
84
0.30000000000000004 0.8629885057471263
68
0.4 0.8698850574712644
24
0.5 0.7804597701149426
16
0.6000000000000001 0.7328735632183908
8
0.7000000000000001 0.7124137931034482
2
0.8 0.7055172413793104
1
0.9 0.7055172413793104


Extra code needed for gathering data for Kaplan-Meier plots

In [None]:
# Clinical table of the methylation cohort, reduced to the three log-rank columns.
methy_cli = pandas.read_csv("methy_clinic.tsv")
logrank_methy = methy_cli.loc[
    :, ['icgc_donor_id.x', 'donor_vital_status', 'donor_survival_time']
]

# Inner-join on the donor id, starting from the test panel's row ids, and keep
# only the first clinical row for every donor.
id_df = pandas.DataFrame(test_panel.index, columns=['icgc_donor_id.x'])

merged_id_df = (
    id_df
    .merge(logrank_methy, on='icgc_donor_id.x')
    .drop_duplicates(subset=['icgc_donor_id.x'])
)

# The merged ids must line up one-to-one with the test panel rows.
print(
    "The lists are the same."
    if merged_id_df['icgc_donor_id.x'].tolist() == test_panel.index.tolist()
    else "The lists are different."
)

merged_id_df.to_csv('methy_cli_logrank.csv', index=False)

The lists are the same.


## Nanostring experiment geo pdac mirna

In [None]:
# Tab-separated table; the 'Unnamed: 0' column becomes the row index.
geo_mirna = pandas.read_csv("mirna.tsv", sep="\t").set_index(['Unnamed: 0'])

# geo_mirna = geo_mirna.drop(['Unnamed: 0'],axis = 1)
# log2(x + 1) transformation, as applied to the training miRNA matrix.
geo_mirna = np.log2(geo_mirna.add(1))

In [None]:
# CHANGED because predictor_mirna was in csv format instead of txt
# The overlap is sorted because a set of strings has no stable iteration order
# between interpreter sessions; sorting keeps the panel column order (and the
# CSV files written from the panels) reproducible from run to run.
common_gene = sorted(set(predictor_mirna['GeneSymbol']).intersection(set(geo_mirna.columns)))
train_panel = mirna_train[common_gene]
test_panel = geo_mirna[common_gene]

In [None]:
# save train and test panel for input in the Galaxy workflows
# (written without the row index, i.e. without the sample ids)
train_panel.to_csv(path_or_buf='final_nanostring_trainpanel.csv', index=False)
test_panel.to_csv(path_or_buf='final_nanostring_testpanel.csv', index=False)

Extra code needed for gathering data for Kaplan-Meier plots

In [None]:
# Clinical table of the Nanostring cohort, reduced to sample id, vital status and time.
nano_cli = pandas.read_csv("geomirna_cli (1).tsv")
logrank_nano = nano_cli.loc[:, ['V1', 'vital_status', 'time']]

id_df = pandas.DataFrame(test_panel.index.tolist(), columns=['V1'])

# Inner-join on the 'V1' id column, starting from the test panel's row ids so
# the merged rows follow the test panel order.
merged_id_df = id_df.merge(logrank_nano, on='V1')

# The merged ids must line up one-to-one with the test panel rows.
print(
    "The lists are the same."
    if merged_id_df['V1'].tolist() == test_panel.index.tolist()
    else "The lists are different."
)

merged_id_df.to_csv('nano_cli_logrank.csv', index=False)

The lists are the same.


## COMPASS dataset preparation

In [None]:
# COMPASS inputs: the clinical table and the expression matrix.
clinical_data = pandas.read_csv(filepath_or_buffer='Clinical_compass.csv')
compass_full_dataset = pandas.read_csv(filepath_or_buffer='tpmRSEMCOMPASS.csv')

In [None]:
# remove patients where treatment response was not examined ('NE')
df = clinical_data.drop(
    index=clinical_data.loc[clinical_data['Best response 1st line'].eq('NE')].index
)

In [None]:
# Start 'response_label' as a copy of 'Best response 1st line' ...
df['response_label'] = df['Best response 1st line']


# ... then recode it: PR and SD become 0 (response to treatment) and PD becomes
# 1 (non-response to treatment). Any other value is left as it is.
df.loc[df['Best response 1st line'].isin(['PR', 'SD']), 'response_label'] = 0
df.loc[df['Best response 1st line'].eq('PD'), 'response_label'] = 1

In [None]:
# Reduce the clinical table to the patient ID (PCSI ID) and the response label.
selected_columns = ['PCSI ID', 'response_label']
response_df = df[selected_columns]

Preprocess the full dataset with gene information

In [None]:
# Preview the first ten rows of the raw expression table.
compass_full_dataset.head(n=10)

Unnamed: 0.1,Unnamed: 0,PCSI_1001,PCSI_1003,PCSI_1004,PCSI_1005,PCSI_1007,PCSI_1008,PCSI_1009,PCSI_1010,PCSI_1011,...,PCSI_0986,PCSI_0987,PCSI_0989,PCSI_0991,PCSI_0992,PCSI_0993,PCSI_0994,PCSI_0996,PCSI_0998,PCSI_0999
0,A1BG,0.0,0.0,0.0,0.0,0.0,1.74,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,A1CF,0.63,0.0,0.0,0.0,0.0,0.0,0.71,0.0,0.0,...,0.81,1.62,0.0,1.03,0.0,0.0,0.0,0.0,0.0,0.0
2,A2M,9.58,2.67,4.28,1.75,5.82,0.0,10.18,1.56,0.0,...,1.76,0.0,2.31,4.45,11.25,11.27,2.29,0.0,0.0,2.31
3,A2ML1,0.0,1.06,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.28,0.0
4,A4GALT,3.28,0.0,3.45,2.1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,A4GNT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,AAAS,3.92,12.92,2.22,5.39,2.77,7.34,8.8,13.65,4.37,...,5.4,3.7,0.0,6.85,0.0,10.49,0.0,5.35,0.0,2.62
7,AACS,2.09,0.0,1.09,1.34,0.0,0.0,2.28,0.0,0.0,...,0.0,0.0,0.0,0.0,5.74,0.0,3.49,0.0,0.0,1.28
8,AADAC,19.08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.97,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,AADAT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.04,0.0,0.0


In [None]:
# Use the 'Unnamed: 0' column (gene symbols in the preview above) as the row index.
# NOTE: not re-runnable as is -- the column no longer exists after the first run.
compass_full_dataset = compass_full_dataset.set_index(keys='Unnamed: 0')

In [None]:
# select only the patient columns where we have a treatment response label for
# The overlap is sorted because a set of strings has no stable iteration order
# between interpreter sessions. The expression matrix and the labels are later
# written without patient ids, so a reproducible patient order is what keeps
# files from different runs comparable.
common_patient = sorted(set(response_df['PCSI ID']).intersection(set(compass_full_dataset.columns)))
filtered_compass = compass_full_dataset[common_patient]


In [None]:
# Put patients in rows and genes in columns, then keep only the genes whose
# fraction of zero values is strictly below 40%.
# NOTE: the transpose reassigns filtered_compass, so re-running this cell flips it back.
filtered_compass = filtered_compass.T
small_filtered_compass = filtered_compass.loc[:, (filtered_compass == 0).mean() < .40]

In [None]:
# log2(x + 1) transformation of the filtered expression values
# (reassigns the same name, so re-running the cell applies it again)
small_filtered_compass = np.log2(small_filtered_compass.add(1))

In [None]:
# Display the filtered, log2-transformed matrix (patients in rows, genes in columns).
small_filtered_compass

Unnamed: 0,A2M,AAAS,AAK1,AAMP,AARS1,AATF,ABCA1,ABCA2,ABCB7,ABCC1,...,ZNF888,ZNF91,ZNF92,ZNF93,ZNFX1,ZPR1,ZRANB2,ZSWIM8,ZYX,ZZEF1
PCSI_0714,4.286142,2.792855,2.531069,3.937344,4.428276,3.670161,1.117695,2.010780,1.941106,1.827819,...,2.869871,3.673556,3.240314,2.104337,0.000000,0.704872,2.897240,1.618239,3.253989,0.765535
PCSI_0612,3.001802,2.972693,1.411426,3.549669,2.046142,0.000000,0.956057,1.500802,0.000000,0.000000,...,6.357376,8.650836,5.912410,1.594549,1.910733,0.839960,1.575312,1.427606,1.859970,0.000000
PCSI_0778,2.503349,2.592158,0.000000,0.000000,2.903038,2.313246,1.207893,0.000000,1.384050,1.650765,...,7.286049,1.875780,3.144046,0.000000,0.000000,2.097611,0.000000,1.117695,0.000000,0.000000
PCSI_1056,1.575312,2.851999,4.316870,2.811471,1.867896,2.560715,0.000000,0.604071,0.992768,1.207893,...,1.090853,6.480265,4.435629,0.000000,1.117695,0.739848,1.416840,0.782409,1.739848,0.000000
PCSI_0647,3.257011,3.214125,2.948601,2.754888,3.904966,3.005400,1.339137,0.584963,0.963474,1.531069,...,7.831054,8.227857,7.273236,1.400538,1.704872,0.704872,1.378512,1.238787,2.395063,0.432959
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PCSI_0719,4.050502,2.801159,2.746313,3.749534,1.891419,0.000000,1.389567,1.042644,0.000000,2.117695,...,2.761285,5.169925,2.090853,1.459432,2.198494,0.000000,2.147307,1.678072,3.344828,0.790772
PCSI_1026,3.990955,3.724650,3.580145,3.779260,3.092546,2.985500,1.941106,1.310340,1.257011,2.720278,...,2.073820,5.555509,4.919817,0.000000,2.587365,0.000000,2.521051,2.330558,2.931683,0.000000
PCSI_0695,1.778209,3.654206,1.438293,2.241840,2.558268,0.000000,2.292782,1.182692,0.000000,0.000000,...,5.590063,5.393004,4.723012,3.976364,1.655352,1.790772,0.000000,0.910733,1.963474,0.000000
PCSI_0712,3.122673,4.310340,2.939227,1.847997,2.976364,2.877744,1.250962,0.000000,0.887525,1.937344,...,2.788686,5.666473,4.130931,1.321928,2.601697,1.454176,3.040892,2.241840,1.545968,1.356144


In [None]:
# the rows correspond with the patients, columns correspond with the genes
# (index=False: the patient ids in the row index are not written to the file)
small_filtered_compass.to_csv(path_or_buf='preprocessed_tpmRSEMCOMPASS1.csv', index=False)

Create csv file with gold labels of COMPASS dataset

In [None]:
# Display the patient ID / response label table.
response_df

Unnamed: 0,PCSI ID,response_label
0,PCSI_0630,0
1,PCSI_0632,0
2,PCSI_0634,0
3,PCSI_0637,0
4,PCSI_0640,0
...,...,...
188,PCSI_1066,0
189,PCSI_1068,0
190,PCSI_1070,0
191,PCSI_1071,0


In [None]:
# Flip the table so that every patient becomes a column.
response_df = response_df.T

In [None]:
# Use the values of the 'PCSI ID' row as the column names.
response_df.columns = response_df.loc['PCSI ID'].to_numpy()

In [None]:
# The 'PCSI ID' row is now redundant with the column names, so remove it.
response_df = response_df.drop(index='PCSI ID')

In [None]:
# Keep only the patients shared with the expression matrix, in the order of
# common_patient (the same order used for the expression matrix).
response_df = response_df.loc[:, list(common_patient)]

In [None]:
# Flip back: one row per patient, a single 'response_label' column.
response_df = response_df.T

In [None]:
# Display the labels, now indexed by patient ID in the order of common_patient.
response_df

Unnamed: 0,response_label
PCSI_0714,1
PCSI_0612,0
PCSI_0778,0
PCSI_1056,0
PCSI_0647,0
...,...
PCSI_0719,0
PCSI_1026,1
PCSI_0695,0
PCSI_0712,0


In [None]:
# Replace the patient-ID index with a plain positional index (the IDs are
# discarded), then display the result.
response_df = response_df.reset_index(drop=True)
response_df

Unnamed: 0,response_label
0,1
1,0
2,0
3,0
4,0
...,...
152,0
153,1
154,0
155,0


In [None]:
# Cast the labels to integers; this fails if any label was not recoded to 0/1.
response_df = response_df.astype(int)

In [None]:
# Write the COMPASS gold labels as a single-column CSV (no row index).
response_df.to_csv(path_or_buf='response_labels.csv', index=False)

## Iknowit dataset preparation


In [None]:
# Load the IknowIT table from the Excel workbook.
full_dataset = pandas.read_excel(io='processed_Folfirinox_iknowit2.xlsx')

In [None]:
# Display the IknowIT table as loaded from the Excel file.
full_dataset

Unnamed: 0.1,Unnamed: 0,Sample.ID,Pair,Time,Response,IRF5.,ELK1.,C4BPA.,C7.,CASP10.,...,ANXA1.,POU2AF1.,SELL.,LAIR2.,CTSW.,C9.,IRGM.,CD74.,MAGEA4.,CD3E.
0,1,PA-180001,1,Baseline,Disease Control,8.175069,6.351946,4.154007,2.651507,7.304949,...,13.044361,5.154007,13.118603,7.043824,9.175069,4.154007,3.388472,12.846674,4.236469,10.238972
1,2,PA-180013,7,Baseline,Disease Control,7.89723,6.483671,3.253374,3.012365,6.619023,...,12.706644,6.970786,13.123501,7.974999,8.149124,3.722859,4.410915,13.001566,3.360289,10.659497
2,3,PA-180022,9,Baseline,Disease Control,7.959287,6.353566,2.315431,2.830004,6.7842,...,12.859843,6.959287,13.224038,7.975681,8.865628,3.508076,4.208515,13.334028,3.208515,9.541499
3,4,PA-180044,14,Baseline,Disease Control,8.094771,6.567792,7.465912,3.10836,6.705295,...,13.300776,6.161471,13.000296,8.317813,9.390058,3.486872,4.430288,13.177541,3.245864,10.238673
4,5,PA-180062,17,Baseline,Disease Control,7.712696,5.380679,2.869717,2.717714,6.85157,...,12.97798,6.262034,13.220104,8.230784,9.263751,3.717714,4.302676,12.241494,2.869717,10.68734
5,6,PA-180065,19,Baseline,Disease Control,7.9852,6.264817,2.550572,2.550572,6.740396,...,13.239368,5.408553,13.482509,7.427089,8.831342,3.638034,4.357927,13.175253,3.135534,10.722374
6,7,PA-180107,24,Baseline,Disease Control,7.830536,6.147271,1.446831,2.616756,6.685236,...,12.73757,6.00142,12.754317,7.962531,8.557967,3.254186,4.201719,12.477154,2.446831,10.672039
7,8,PA-190147,32,Baseline,Disease Control,7.141266,0.399799,3.207154,2.984762,6.859231,...,13.556515,6.422167,13.651134,8.661894,7.954388,4.207154,4.487262,11.592092,3.569724,9.628618
8,9,PA-190170,34,Baseline,Disease Control,7.937708,6.604973,3.134653,2.54969,6.825815,...,13.564844,5.88954,13.242088,7.572058,9.539794,3.719615,4.357045,13.291473,3.942008,10.893986
9,10,PA-190190,36,Baseline,Disease Control,7.873866,5.431644,3.367513,2.23001,6.580507,...,13.009729,5.580507,13.91686,8.355165,9.01399,3.814972,3.908082,12.636215,3.715437,10.408924


In [None]:
# Start 'response_label' as a copy of 'Response' ...
full_dataset['response_label'] = full_dataset['Response']

# ... then recode it: 0 refers to response to treatment (Disease Control),
# 1 refers to non-response to treatment (Progressive Disease).
# Any other value is left as it is.
full_dataset.loc[full_dataset['Response'].eq('Disease Control'), 'response_label'] = 0
full_dataset.loc[full_dataset['Response'].eq('Progressive Disease'), 'response_label'] = 1

In [None]:
# Single-column frame holding only the recoded labels, then display it.
response_df = full_dataset['response_label'].to_frame()
response_df

Unnamed: 0,response_label
0,0
1,0
2,0
3,0
4,0
5,0
6,0
7,0
8,0
9,0


In [None]:
# Write the IknowIT gold labels as a single-column CSV (no row index).
response_df.to_csv(path_or_buf='IknowIT_labels.csv', index=False)

In [None]:
# Remove the sample metadata and label columns so that only the feature columns remain.
# NOTE: not re-runnable as is -- the columns no longer exist after the first run.
full_dataset = full_dataset.drop(
    columns=['Unnamed: 0', 'Sample.ID', 'Pair', 'Time', 'Response', 'response_label']
)

In [None]:
# log2(x + 1) transformation of the feature values
# (reassigns the same name, so re-running the cell applies it again)
full_dataset = np.log2(full_dataset.add(1))

In [None]:
# Display the log2-transformed IknowIT feature matrix.
full_dataset

Unnamed: 0,IRF5.,ELK1.,C4BPA.,C7.,CASP10.,PBK.,CCL25.,CD1D.,TFRC.,FPR2.,...,ANXA1.,POU2AF1.,SELL.,LAIR2.,CTSW.,C9.,IRGM.,CD74.,MAGEA4.,CD3E.
0,3.197719,2.878126,2.365695,1.868492,3.053971,2.133719,1.04722,3.423353,3.630598,3.726651,...,3.811919,2.621526,3.819525,3.007882,3.346967,2.365695,2.133719,3.791468,2.388594,3.490438
1,3.153356,2.903746,2.088608,2.004453,2.929606,2.124424,2.124424,3.303431,3.601167,3.685025,...,3.776803,2.994722,3.820026,3.165912,3.193634,2.23966,2.435873,3.807516,2.124424,3.543434
2,3.163384,2.878444,1.729196,1.937346,2.960549,2.142402,1.988233,3.336359,3.536485,3.717252,...,3.792839,2.992639,3.830259,3.166021,3.302411,2.172512,2.380872,3.841372,2.073311,3.398008
3,3.185037,2.919872,3.081665,2.038563,2.94585,2.30928,2.199684,3.318855,3.565061,3.691737,...,3.838022,2.840256,3.807385,3.219991,3.377132,2.16571,2.441029,3.825535,2.086058,3.4904
4,3.123119,2.67371,1.952228,1.894416,2.972981,2.32401,2.086861,3.290748,3.573337,3.750619,...,3.805084,2.860374,3.82986,3.206453,3.359486,2.238088,2.406721,3.726994,1.952228,3.546875
5,3.167551,2.860926,1.828051,1.828051,2.952407,2.213514,1.350821,3.367428,3.498521,3.734736,...,3.831813,2.679999,3.85624,3.075034,3.297388,2.213514,2.421675,3.825303,2.048074,3.551193
6,3.142501,2.837393,1.290915,1.854696,2.94209,2.052162,1.91409,3.296317,3.593001,3.700251,...,3.780055,2.807648,3.781813,3.163906,3.256704,2.088883,2.378988,3.752444,1.785271,3.544985
7,3.025253,0.48522,2.072845,1.994494,2.974388,2.380495,1.895972,3.139009,3.659105,3.79518,...,3.863593,2.891841,3.87294,3.272306,3.162595,2.380495,2.456087,3.654446,2.192107,3.409882
8,3.159905,2.926943,2.047766,1.827693,2.968241,2.047766,2.213239,3.315705,3.50775,3.679333,...,3.864418,2.784408,3.832089,3.099642,3.397775,2.238669,2.421437,3.837083,2.305097,3.57216
9,3.149563,2.685187,2.126812,1.691539,2.922294,2.295159,1.893532,3.319361,3.552113,3.770179,...,3.808357,2.718199,3.898872,3.225763,3.323945,2.267527,2.295159,3.769371,2.237391,3.512091


In [None]:
# Reload the preprocessed COMPASS matrix written earlier; it serves as the training set.
compass_data1 = pandas.read_csv(filepath_or_buffer='preprocessed_tpmRSEMCOMPASS1.csv')

In [None]:
# Remove the dots from the column names (the feature names end with a '.').
# regex=False makes the literal replacement explicit: relying on the default
# triggers a pandas FutureWarning, and as a regular expression '.' would match
# every character.
full_dataset.columns = full_dataset.columns.str.replace('.', '', regex=False)

  full_dataset.columns = full_dataset.columns.str.replace('.', '')


In [None]:
# for the experiment compass is used as training set and IknowIT as test data so find the overlapping genes
# The overlap is sorted because a set of strings has no stable iteration order
# between interpreter sessions; sorting keeps the column order of both panels
# (and of the CSV files written from them) reproducible from run to run.
common_gene = sorted(set(full_dataset.columns).intersection(set(compass_data1.columns)))
print(len(common_gene))
train_compass = compass_data1[common_gene]
test_IknowIT = full_dataset[common_gene]

122


In [None]:
# Write the matched test (IknowIT) and training (COMPASS) panels without the row index.
test_IknowIT.to_csv(path_or_buf='test_IknowIT.csv', index=False)
train_compass.to_csv(path_or_buf='train_full_compass.csv', index=False)

In [None]:
# Write the complete log2-transformed IknowIT feature matrix without the row index.
full_dataset.to_csv(path_or_buf='IknowIT_fulldata.csv', index=False)