In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

In [2]:
print("NumPy version " + np.__version__)
print("Tensorflow version " + tf.__version__)

NumPy version 1.18.5
Tensorflow version 1.15.4


In [3]:
## Data Reading ##
#tf.data.experimental.CsvDataset("data2/train_drug.csv", "float32")
train_features = pd.read_csv("data/train_features.csv")
results = pd.read_csv("data/train_targets_scored.csv")
testSet = pd.read_csv("data/test_features.csv")

In [4]:
## Parameters ##
validationProportion = 0.2

N = len(train_features) #Nombre d'échantillons testés
I = train_features.shape[1]-1 #Nombre de input
M = results.shape[1]-1 #Nombre de pathologies obserables

In [5]:
## HYPER parameters ##
nbEpoch = 30 
learningRate = 0.01 #with the adam optimizer

# nombre de couches cachées
# nombre de neurones
# fonctions d'activation
# epsilon (adam)

# cf randomizedSearchCV

In [6]:
## Normalization ##
maxVal=np.max(np.max(np.abs(train_features.iloc[:,4:])))
train_features.iloc[:,4:] = train_features.iloc[:,4:]/maxVal
train_features

Unnamed: 0,sig_id,cp_type,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,...,c-90,c-91,c-92,c-93,c-94,c-95,c-96,c-97,c-98,c-99
0,id_000644bb2,trt_cp,24,D1,0.10620,0.05577,-0.02479,-0.06208,-0.01944,-0.10120,...,0.02862,0.02584,0.08076,0.05523,-0.01912,0.06584,-0.03981,0.02139,0.03801,0.04176
1,id_000779bfc,trt_cp,72,D1,0.00743,0.04087,0.02991,0.00604,0.10190,0.05207,...,-0.04265,0.07543,0.04708,0.00230,0.02957,0.04899,0.01522,0.01241,0.06077,0.07371
2,id_000a6266a,trt_cp,48,D1,0.06280,0.05817,0.15540,-0.00764,-0.00323,0.12390,...,-0.07250,-0.06297,0.06103,0.00223,-0.13240,-0.03174,-0.06417,-0.02187,-0.14080,0.06931
3,id_0015fd391,trt_cp,48,D1,-0.05138,-0.02491,-0.02656,0.05288,0.40620,-0.08095,...,-0.20990,-0.06441,-0.56300,-0.13780,-0.08632,-0.12880,-0.16210,-0.08784,-0.03876,-0.08154
4,id_001626bd3,trt_cp,72,D2,-0.03254,-0.04009,0.09700,0.06919,0.14180,-0.08244,...,0.00042,0.00048,0.06670,0.10690,0.05523,-0.03031,0.01094,0.02885,-0.03786,0.07125
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23809,id_fffb1ceed,trt_cp,24,D2,0.01394,-0.00636,-0.01112,-0.05080,-0.04713,0.07201,...,0.01969,0.00262,-0.08121,0.03434,0.05372,-0.03246,0.00631,0.09171,0.05258,0.04680
23810,id_fffb70c0c,trt_cp,24,D2,-0.13260,0.03478,-0.03743,0.09905,-0.07178,0.06621,...,0.04286,0.04426,0.00423,-0.03195,-0.08086,-0.09798,-0.02084,-0.01224,-0.02715,0.03689
23811,id_fffc1c3f4,ctl_vehicle,48,D2,0.03942,0.03756,0.03109,-0.07389,0.05505,-0.00159,...,0.05409,0.03755,0.07343,0.02807,0.04116,0.06422,0.02256,0.07592,0.06656,0.03808
23812,id_fffcb9e7c,trt_cp,24,D1,0.06660,0.02324,0.04392,0.02044,0.08531,-0.00343,...,-0.01105,0.04258,-0.02012,0.01506,0.15230,0.07101,0.01732,0.07015,-0.06290,0.00740


In [7]:
## replacing sig_id by drug_id ##
dataSet = train_features.join(results, lsuffix='sig_id', rsuffix='sig_id')
dataSet = dataSet.drop(columns=['sig_idsig_id'])

dataSet

Unnamed: 0,cp_type,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,g-6,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,trt_cp,24,D1,0.10620,0.05577,-0.02479,-0.06208,-0.01944,-0.10120,-0.10220,...,0,0,0,0,0,0,0,0,0,0
1,trt_cp,72,D1,0.00743,0.04087,0.02991,0.00604,0.10190,0.05207,0.02341,...,0,0,0,0,0,0,0,0,0,0
2,trt_cp,48,D1,0.06280,0.05817,0.15540,-0.00764,-0.00323,0.12390,0.01715,...,0,0,0,0,0,0,0,0,0,0
3,trt_cp,48,D1,-0.05138,-0.02491,-0.02656,0.05288,0.40620,-0.08095,-0.19590,...,0,0,0,0,0,0,0,0,0,0
4,trt_cp,72,D2,-0.03254,-0.04009,0.09700,0.06919,0.14180,-0.08244,-0.02800,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23809,trt_cp,24,D2,0.01394,-0.00636,-0.01112,-0.05080,-0.04713,0.07201,0.05773,...,0,0,0,0,0,0,0,0,0,0
23810,trt_cp,24,D2,-0.13260,0.03478,-0.03743,0.09905,-0.07178,0.06621,-0.02252,...,0,0,0,0,0,0,0,0,0,0
23811,ctl_vehicle,48,D2,0.03942,0.03756,0.03109,-0.07389,0.05505,-0.00159,-0.02541,...,0,0,0,0,0,0,0,0,0,0
23812,trt_cp,24,D1,0.06660,0.02324,0.04392,0.02044,0.08531,-0.00343,0.00323,...,0,0,0,0,0,0,0,0,0,0


In [8]:
## Shuffling ##
dataSet = dataSet.reindex(np.random.permutation(dataSet.index))
dataSet

Unnamed: 0,cp_type,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,g-6,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
20191,trt_cp,24,D1,-0.03300,-0.04524,-0.01393,-0.00857,-0.02157,-0.02869,-0.03888,...,0,0,0,0,0,0,0,0,0,0
4219,ctl_vehicle,72,D2,0.13310,-0.02277,0.03214,-0.15250,0.11680,-0.08135,0.12770,...,0,0,0,0,0,0,0,0,0,0
18502,ctl_vehicle,72,D1,0.09451,-0.10910,-0.06399,0.04455,0.00073,-0.01540,0.08679,...,0,0,0,0,0,0,0,0,0,0
5432,trt_cp,48,D2,-0.00580,-0.00806,0.15950,0.04327,-0.00947,-0.10810,0.04048,...,0,0,0,0,0,0,0,0,0,0
17614,trt_cp,72,D2,0.10190,0.00012,0.01214,0.05992,0.08868,-0.02127,0.07984,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9337,trt_cp,48,D2,-0.00176,0.02263,0.08410,-0.12800,-0.08908,-0.03525,-0.01847,...,0,0,0,0,0,0,0,0,0,0
6298,trt_cp,24,D2,-0.01964,0.05278,-0.07003,-0.04183,-0.02404,0.03663,-0.08447,...,0,0,0,0,0,0,0,0,0,0
15799,trt_cp,24,D1,0.01741,-0.00956,0.03237,0.00290,-0.02844,0.04726,-0.10600,...,0,0,0,0,0,0,0,0,0,0
10214,trt_cp,72,D1,0.02195,0.02605,0.01151,-0.09933,0.04873,-0.01893,0.00828,...,0,0,0,0,0,0,0,0,0,0


In [9]:
## Replacing categories values by numbers ##
features_to_convert=['cp_type','cp_time','cp_dose']
for feat in features_to_convert:
    dataSet[feat] = pd.Categorical(dataSet[feat])
    dataSet[feat] = dataSet[feat].cat.codes
    
dataSet

Unnamed: 0,cp_type,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,g-6,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
20191,1,0,0,-0.03300,-0.04524,-0.01393,-0.00857,-0.02157,-0.02869,-0.03888,...,0,0,0,0,0,0,0,0,0,0
4219,0,2,1,0.13310,-0.02277,0.03214,-0.15250,0.11680,-0.08135,0.12770,...,0,0,0,0,0,0,0,0,0,0
18502,0,2,0,0.09451,-0.10910,-0.06399,0.04455,0.00073,-0.01540,0.08679,...,0,0,0,0,0,0,0,0,0,0
5432,1,1,1,-0.00580,-0.00806,0.15950,0.04327,-0.00947,-0.10810,0.04048,...,0,0,0,0,0,0,0,0,0,0
17614,1,2,1,0.10190,0.00012,0.01214,0.05992,0.08868,-0.02127,0.07984,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9337,1,1,1,-0.00176,0.02263,0.08410,-0.12800,-0.08908,-0.03525,-0.01847,...,0,0,0,0,0,0,0,0,0,0
6298,1,0,1,-0.01964,0.05278,-0.07003,-0.04183,-0.02404,0.03663,-0.08447,...,0,0,0,0,0,0,0,0,0,0
15799,1,0,0,0.01741,-0.00956,0.03237,0.00290,-0.02844,0.04726,-0.10600,...,0,0,0,0,0,0,0,0,0,0
10214,1,2,0,0.02195,0.02605,0.01151,-0.09933,0.04873,-0.01893,0.00828,...,0,0,0,0,0,0,0,0,0,0


In [10]:
## Splitting into training and validation ##
trainingSize = int((1-validationProportion) * N)

trainingSet = dataSet.iloc[:trainingSize,:] 
validationSet = dataSet.iloc[trainingSize:,:]

print("Training size:", len(trainingSet))
print("Validation size:", len(validationSet))

Training size: 19051
Validation size: 4763


In [11]:
XTrain = trainingSet.iloc[:,:I]
YTrain = trainingSet.iloc[:,I:]
XTrain

Unnamed: 0,cp_type,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,g-6,...,c-90,c-91,c-92,c-93,c-94,c-95,c-96,c-97,c-98,c-99
20191,1,0,0,-0.03300,-0.04524,-0.01393,-0.00857,-0.02157,-0.02869,-0.03888,...,-0.00674,0.00301,-0.00205,-0.09122,0.05137,0.00946,0.06000,0.02353,0.00165,-0.00236
4219,0,2,1,0.13310,-0.02277,0.03214,-0.15250,0.11680,-0.08135,0.12770,...,-0.10860,-0.04509,-0.03263,-0.04086,-0.14400,0.03881,-0.02897,-0.04541,0.01149,0.02740
18502,0,2,0,0.09451,-0.10910,-0.06399,0.04455,0.00073,-0.01540,0.08679,...,0.02112,-0.12140,-0.02578,0.06216,0.06163,-0.00628,0.02705,-0.03471,0.09380,0.04275
5432,1,1,1,-0.00580,-0.00806,0.15950,0.04327,-0.00947,-0.10810,0.04048,...,-0.03404,0.06129,0.00424,0.11070,-0.02057,-0.01246,-0.01157,-0.08023,0.04899,0.02548
17614,1,2,1,0.10190,0.00012,0.01214,0.05992,0.08868,-0.02127,0.07984,...,-0.14130,0.02331,0.06722,-0.04869,-0.06210,0.03609,0.01283,-0.03014,0.06697,-0.05239
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3709,1,0,0,-0.04983,0.10530,-0.02151,-0.04895,-0.02884,-0.02713,-0.11250,...,0.02313,0.09845,0.08315,0.01313,0.07425,-0.00825,-0.01914,-0.02095,-0.03011,-0.09284
7771,1,0,0,0.06530,-0.05459,0.07917,-0.09864,-0.17440,-0.15910,0.03573,...,-0.08245,0.00622,0.00689,0.06354,0.05057,0.03424,0.02116,0.05319,0.00280,0.02187
19048,1,0,1,0.03914,-0.03385,0.06576,0.08742,0.01936,-0.01702,0.11820,...,-0.04355,-0.04229,0.01463,-0.01748,0.08390,0.00371,0.06234,0.03428,-0.11000,-0.04489
22,1,2,0,0.06111,-0.02907,-0.07853,0.01947,-0.09804,-0.04740,-0.04197,...,0.00947,0.05732,0.00289,0.01109,0.01208,0.05104,0.07287,-0.00168,-0.03555,-0.00509


In [12]:
## Batching ##
"""
# A utility method to create a tf.data dataset from a Pandas Dataframe
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    dataframe = dataframe.copy()
    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), dataframe))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    ds = ds.prefetch(batch_size)
    return ds

trainingds = df_to_dataset(trainingSet)
trainingds
"""

'\n# A utility method to create a tf.data dataset from a Pandas Dataframe\ndef df_to_dataset(dataframe, shuffle=True, batch_size=32):\n    dataframe = dataframe.copy()\n    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), dataframe))\n    if shuffle:\n        ds = ds.shuffle(buffer_size=len(dataframe))\n    ds = ds.batch(batch_size)\n    ds = ds.prefetch(batch_size)\n    return ds\n\ntrainingds = df_to_dataset(trainingSet)\ntrainingds\n'

In [13]:
## Training ##
model = tf.keras.models.Sequential([tf.keras.layers.Flatten(),
                                    tf.keras.layers.Dense(512, activation=tf.nn.relu),
                                    tf.keras.layers.Dense(M, activation=tf.nn.sigmoid)])

#sigmoid is more appropriate rather softmax because we are multiple factors of cancer 

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [14]:
model.fit(XTrain, YTrain, epochs=nbEpoch)

ValueError: Please provide as model inputs either a single array or a list of arrays. You passed: inputs=       cp_type  cp_time  cp_dose      g-0      g-1      g-2      g-3      g-4  \
20191        1        0        0 -0.03300 -0.04524 -0.01393 -0.00857 -0.02157   
4219         0        2        1  0.13310 -0.02277  0.03214 -0.15250  0.11680   
18502        0        2        0  0.09451 -0.10910 -0.06399  0.04455  0.00073   
5432         1        1        1 -0.00580 -0.00806  0.15950  0.04327 -0.00947   
17614        1        2        1  0.10190  0.00012  0.01214  0.05992  0.08868   
...        ...      ...      ...      ...      ...      ...      ...      ...   
3709         1        0        0 -0.04983  0.10530 -0.02151 -0.04895 -0.02884   
7771         1        0        0  0.06530 -0.05459  0.07917 -0.09864 -0.17440   
19048        1        0        1  0.03914 -0.03385  0.06576  0.08742  0.01936   
22           1        2        0  0.06111 -0.02907 -0.07853  0.01947 -0.09804   
18168        1        2        0  0.00882  0.07114 -0.10200  0.07222 -0.00790   

           g-5      g-6  ...     c-90     c-91     c-92     c-93     c-94  \
20191 -0.02869 -0.03888  ... -0.00674  0.00301 -0.00205 -0.09122  0.05137   
4219  -0.08135  0.12770  ... -0.10860 -0.04509 -0.03263 -0.04086 -0.14400   
18502 -0.01540  0.08679  ...  0.02112 -0.12140 -0.02578  0.06216  0.06163   
5432  -0.10810  0.04048  ... -0.03404  0.06129  0.00424  0.11070 -0.02057   
17614 -0.02127  0.07984  ... -0.14130  0.02331  0.06722 -0.04869 -0.06210   
...        ...      ...  ...      ...      ...      ...      ...      ...   
3709  -0.02713 -0.11250  ...  0.02313  0.09845  0.08315  0.01313  0.07425   
7771  -0.15910  0.03573  ... -0.08245  0.00622  0.00689  0.06354  0.05057   
19048 -0.01702  0.11820  ... -0.04355 -0.04229  0.01463 -0.01748  0.08390   
22    -0.04740 -0.04197  ...  0.00947  0.05732  0.00289  0.01109  0.01208   
18168 -0.01457  0.00171  ...  0.06480  0.09624  0.06488  0.06768  0.08838   

          c-95     c-96     c-97     c-98     c-99  
20191  0.00946  0.06000  0.02353  0.00165 -0.00236  
4219   0.03881 -0.02897 -0.04541  0.01149  0.02740  
18502 -0.00628  0.02705 -0.03471  0.09380  0.04275  
5432  -0.01246 -0.01157 -0.08023  0.04899  0.02548  
17614  0.03609  0.01283 -0.03014  0.06697 -0.05239  
...        ...      ...      ...      ...      ...  
3709  -0.00825 -0.01914 -0.02095 -0.03011 -0.09284  
7771   0.03424  0.02116  0.05319  0.00280  0.02187  
19048  0.00371  0.06234  0.03428 -0.11000 -0.04489  
22     0.05104  0.07287 -0.00168 -0.03555 -0.00509  
18168 -0.05963  0.13100  0.05394 -0.02572  0.04472  

[19051 rows x 875 columns]