In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

In [2]:
print("NumPy version " + np.__version__)
print("Tensorflow version " + tf.__version__)

NumPy version 1.18.5
Tensorflow version 2.3.1


In [3]:
## Data Reading ##
#tf.data.experimental.CsvDataset("data2/train_drug.csv", "float32")
train_features = pd.read_csv("data/train_features.csv")
results = pd.read_csv("data/train_targets_scored.csv")
testSet = pd.read_csv("data/test_features.csv")

In [4]:
## Parameters ##
validationProportion = 0.2

N = len(train_features) #Nombre d'échantillons testés
I = train_features.shape[1]-1 #Nombre de input
M = results.shape[1]-1 #Nombre de pathologies obserables

In [5]:
## HYPER parameters ##
nbEpoch = 30 
learningRate = 0.01 #with the adam optimizer

# nombre de couches cachées
# nombre de neurones
# fonctions d'activation
# epsilon (adam)

# cf randomizedSearchCV

In [6]:
## Normalization ##
maxVal=np.max(np.max(np.abs(train_features.iloc[:,4:])))
train_features.iloc[:,4:] = train_features.iloc[:,4:]/maxVal
train_features

Unnamed: 0,sig_id,cp_type,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,...,c-90,c-91,c-92,c-93,c-94,c-95,c-96,c-97,c-98,c-99
0,id_000644bb2,trt_cp,24,D1,0.10620,0.05577,-0.02479,-0.06208,-0.01944,-0.10120,...,0.02862,0.02584,0.08076,0.05523,-0.01912,0.06584,-0.03981,0.02139,0.03801,0.04176
1,id_000779bfc,trt_cp,72,D1,0.00743,0.04087,0.02991,0.00604,0.10190,0.05207,...,-0.04265,0.07543,0.04708,0.00230,0.02957,0.04899,0.01522,0.01241,0.06077,0.07371
2,id_000a6266a,trt_cp,48,D1,0.06280,0.05817,0.15540,-0.00764,-0.00323,0.12390,...,-0.07250,-0.06297,0.06103,0.00223,-0.13240,-0.03174,-0.06417,-0.02187,-0.14080,0.06931
3,id_0015fd391,trt_cp,48,D1,-0.05138,-0.02491,-0.02656,0.05288,0.40620,-0.08095,...,-0.20990,-0.06441,-0.56300,-0.13780,-0.08632,-0.12880,-0.16210,-0.08784,-0.03876,-0.08154
4,id_001626bd3,trt_cp,72,D2,-0.03254,-0.04009,0.09700,0.06919,0.14180,-0.08244,...,0.00042,0.00048,0.06670,0.10690,0.05523,-0.03031,0.01094,0.02885,-0.03786,0.07125
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23809,id_fffb1ceed,trt_cp,24,D2,0.01394,-0.00636,-0.01112,-0.05080,-0.04713,0.07201,...,0.01969,0.00262,-0.08121,0.03434,0.05372,-0.03246,0.00631,0.09171,0.05258,0.04680
23810,id_fffb70c0c,trt_cp,24,D2,-0.13260,0.03478,-0.03743,0.09905,-0.07178,0.06621,...,0.04286,0.04426,0.00423,-0.03195,-0.08086,-0.09798,-0.02084,-0.01224,-0.02715,0.03689
23811,id_fffc1c3f4,ctl_vehicle,48,D2,0.03942,0.03756,0.03109,-0.07389,0.05505,-0.00159,...,0.05409,0.03755,0.07343,0.02807,0.04116,0.06422,0.02256,0.07592,0.06656,0.03808
23812,id_fffcb9e7c,trt_cp,24,D1,0.06660,0.02324,0.04392,0.02044,0.08531,-0.00343,...,-0.01105,0.04258,-0.02012,0.01506,0.15230,0.07101,0.01732,0.07015,-0.06290,0.00740


In [7]:
## replacing sig_id by drug_id ##
dataSet = train_features.join(results, lsuffix='sig_id', rsuffix='sig_id')
dataSet = dataSet.drop(columns=['sig_idsig_id'])

dataSet

Unnamed: 0,cp_type,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,g-6,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,trt_cp,24,D1,0.10620,0.05577,-0.02479,-0.06208,-0.01944,-0.10120,-0.10220,...,0,0,0,0,0,0,0,0,0,0
1,trt_cp,72,D1,0.00743,0.04087,0.02991,0.00604,0.10190,0.05207,0.02341,...,0,0,0,0,0,0,0,0,0,0
2,trt_cp,48,D1,0.06280,0.05817,0.15540,-0.00764,-0.00323,0.12390,0.01715,...,0,0,0,0,0,0,0,0,0,0
3,trt_cp,48,D1,-0.05138,-0.02491,-0.02656,0.05288,0.40620,-0.08095,-0.19590,...,0,0,0,0,0,0,0,0,0,0
4,trt_cp,72,D2,-0.03254,-0.04009,0.09700,0.06919,0.14180,-0.08244,-0.02800,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23809,trt_cp,24,D2,0.01394,-0.00636,-0.01112,-0.05080,-0.04713,0.07201,0.05773,...,0,0,0,0,0,0,0,0,0,0
23810,trt_cp,24,D2,-0.13260,0.03478,-0.03743,0.09905,-0.07178,0.06621,-0.02252,...,0,0,0,0,0,0,0,0,0,0
23811,ctl_vehicle,48,D2,0.03942,0.03756,0.03109,-0.07389,0.05505,-0.00159,-0.02541,...,0,0,0,0,0,0,0,0,0,0
23812,trt_cp,24,D1,0.06660,0.02324,0.04392,0.02044,0.08531,-0.00343,0.00323,...,0,0,0,0,0,0,0,0,0,0


In [8]:
## Shuffling ##
dataSet = dataSet.reindex(np.random.permutation(dataSet.index))
dataSet

Unnamed: 0,cp_type,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,g-6,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
18157,trt_cp,72,D1,-0.03080,0.03801,0.00159,0.05902,0.03877,0.02964,-0.10840,...,0,0,0,0,0,0,0,0,0,0
9899,trt_cp,48,D1,0.21580,-0.01185,-0.05095,0.02782,-0.12140,-0.06538,0.21610,...,0,0,0,0,0,0,0,0,0,0
20105,trt_cp,48,D1,-0.02341,-0.00121,0.00641,0.03764,0.05195,0.13460,-0.02284,...,0,0,0,0,0,0,0,0,0,0
21527,trt_cp,48,D1,0.37190,-0.11150,0.05574,-0.09925,0.01588,0.00851,-0.15080,...,0,0,0,0,0,0,0,0,0,0
18387,trt_cp,72,D1,0.01331,0.05224,-0.09559,0.00892,-0.04694,0.00609,-0.04165,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12643,trt_cp,72,D1,-0.00491,0.00433,-0.07849,0.02448,0.03151,-0.09774,0.14960,...,0,0,0,0,0,0,0,0,0,0
10676,trt_cp,24,D2,-0.06113,-0.07959,0.13290,-0.00813,-0.02080,0.06860,0.08262,...,0,0,0,0,0,0,0,0,0,0
12568,trt_cp,72,D1,-0.07894,-0.10770,-0.00352,0.01882,-0.01705,0.18180,0.16560,...,0,0,0,0,0,0,0,0,0,0
10462,trt_cp,24,D2,0.05670,0.07501,0.04271,-0.16130,0.02720,-0.04297,0.00524,...,0,0,0,0,0,0,0,0,0,0


In [9]:
## Replacing categories values by numbers ##
features_to_convert=['cp_type','cp_time','cp_dose']
for feat in features_to_convert:
    dataSet[feat] = pd.Categorical(dataSet[feat])
    dataSet[feat] = dataSet[feat].cat.codes
    
dataSet

Unnamed: 0,cp_type,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,g-6,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
18157,1,2,0,-0.03080,0.03801,0.00159,0.05902,0.03877,0.02964,-0.10840,...,0,0,0,0,0,0,0,0,0,0
9899,1,1,0,0.21580,-0.01185,-0.05095,0.02782,-0.12140,-0.06538,0.21610,...,0,0,0,0,0,0,0,0,0,0
20105,1,1,0,-0.02341,-0.00121,0.00641,0.03764,0.05195,0.13460,-0.02284,...,0,0,0,0,0,0,0,0,0,0
21527,1,1,0,0.37190,-0.11150,0.05574,-0.09925,0.01588,0.00851,-0.15080,...,0,0,0,0,0,0,0,0,0,0
18387,1,2,0,0.01331,0.05224,-0.09559,0.00892,-0.04694,0.00609,-0.04165,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12643,1,2,0,-0.00491,0.00433,-0.07849,0.02448,0.03151,-0.09774,0.14960,...,0,0,0,0,0,0,0,0,0,0
10676,1,0,1,-0.06113,-0.07959,0.13290,-0.00813,-0.02080,0.06860,0.08262,...,0,0,0,0,0,0,0,0,0,0
12568,1,2,0,-0.07894,-0.10770,-0.00352,0.01882,-0.01705,0.18180,0.16560,...,0,0,0,0,0,0,0,0,0,0
10462,1,0,1,0.05670,0.07501,0.04271,-0.16130,0.02720,-0.04297,0.00524,...,0,0,0,0,0,0,0,0,0,0


In [10]:
## Splitting into training and validation ##
trainingSize = int((1-validationProportion) * N)

trainingSet = dataSet.iloc[:trainingSize,:] 
validationSet = dataSet.iloc[trainingSize:,:]

print("Training size:", len(trainingSet))
print("Validation size:", len(validationSet))

Training size: 19051
Validation size: 4763


In [11]:
XTrain = trainingSet.iloc[:,:I]
YTrain = trainingSet.iloc[:,I:]
XTrain

Unnamed: 0,cp_type,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,g-6,...,c-90,c-91,c-92,c-93,c-94,c-95,c-96,c-97,c-98,c-99
18157,1,2,0,-0.03080,0.03801,0.00159,0.05902,0.03877,0.02964,-0.10840,...,0.03613,0.11490,0.09304,0.07080,0.09541,-0.07239,0.10470,0.00846,-0.01343,0.03360
9899,1,1,0,0.21580,-0.01185,-0.05095,0.02782,-0.12140,-0.06538,0.21610,...,-0.13180,-0.05171,0.00101,0.10490,0.03435,-0.08129,0.00017,0.04164,-0.23060,-0.07537
20105,1,1,0,-0.02341,-0.00121,0.00641,0.03764,0.05195,0.13460,-0.02284,...,-0.09744,-0.10380,0.00370,-0.03622,-0.01321,-0.18180,-0.02573,0.02467,-0.02560,-0.11270
21527,1,1,0,0.37190,-0.11150,0.05574,-0.09925,0.01588,0.00851,-0.15080,...,-0.28760,-0.38810,-0.34720,-0.17040,-0.54270,-0.22320,-0.27770,-0.40150,-0.49320,-0.25200
18387,1,2,0,0.01331,0.05224,-0.09559,0.00892,-0.04694,0.00609,-0.04165,...,0.04393,0.02536,0.02345,-0.02220,0.05195,-0.09391,-0.06682,0.06108,0.09155,0.03976
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13695,1,0,0,-0.01895,-0.10490,-0.01547,-0.05738,0.13200,0.06913,0.11520,...,0.03483,-0.03073,0.02903,0.07710,0.13980,-0.03179,0.04711,0.13050,-0.01334,0.10900
5598,1,0,1,0.07368,0.08291,-0.08529,-0.11410,0.02894,-0.06615,0.05214,...,0.03011,0.01356,-0.04222,-0.02280,0.07929,0.02553,-0.01536,0.01318,-0.00242,0.07095
13323,1,2,1,-0.04026,-0.10730,0.00370,0.03302,-0.06096,0.06928,-0.03379,...,-0.03772,0.00161,-0.04642,-0.01270,0.05387,0.08008,-0.03053,-0.07242,0.05858,0.00234
10501,1,1,0,0.00900,0.03149,0.16050,-0.06442,-0.01156,0.00705,-0.00026,...,-0.08925,-0.05902,0.05215,-0.08506,-0.06957,-0.07087,-0.13010,-0.14590,-0.03782,-0.03695


In [12]:
## Batching ##
"""
# A utility method to create a tf.data dataset from a Pandas Dataframe
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    dataframe = dataframe.copy()
    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), dataframe))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    ds = ds.prefetch(batch_size)
    return ds

trainingds = df_to_dataset(trainingSet)
trainingds
"""

'\n# A utility method to create a tf.data dataset from a Pandas Dataframe\ndef df_to_dataset(dataframe, shuffle=True, batch_size=32):\n    dataframe = dataframe.copy()\n    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), dataframe))\n    if shuffle:\n        ds = ds.shuffle(buffer_size=len(dataframe))\n    ds = ds.batch(batch_size)\n    ds = ds.prefetch(batch_size)\n    return ds\n\ntrainingds = df_to_dataset(trainingSet)\ntrainingds\n'

In [13]:
## Training ##
model = tf.keras.models.Sequential([tf.keras.layers.Flatten(),
                                    tf.keras.layers.Dense(512, activation=tf.nn.relu),
                                    tf.keras.layers.Dense(M, activation=tf.nn.sigmoid)])

#sigmoid is more appropriate rather softmax because we are multiple factors of cancer 

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [14]:
model.fit(XTrain, YTrain, epochs=nbEpoch)

Epoch 1/30


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x1f18a003788>