## Classification des poches protéiques en fonction du type de druggabilité, par un CNN

### 1) Préparation des données

In [1]:
import keras
import numpy as np
from random import shuffle
from random import sample
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from keras.layers import Dense, Flatten, Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from keras.callbacks.callbacks import EarlyStopping, Callback, ModelCheckpoint
from keras.layers import add, Activation
from keras.layers import Conv3D, MaxPool3D
from keras.models import Sequential, load_model
from os import listdir

Using TensorFlow backend.


In [2]:
PATH_DATA = "/media/anthony/POULOP/deepdrug3d_voxel_data/"

In [None]:
listdir(PATH_DATA)

In [24]:
def equilibrator_samplor(path, nucleotid, heme, control, steroid, k):
    all_pocket = listdir(path)
    ech = sample(nucleotid, k) + sample(heme, k) + sample(control, k)
    shuffle(ech)
    return ech

def remove_list(chosen_pocket, nucleotid, heme, control, steroid):
    for pocket in chosen_pocket:
        if pocket in nucleotid:
            nucleotid.remove(pocket)
        elif pocket in heme:
            heme.remove(pocket)
        elif pocket in control:
            control.remove(pocket)
        elif pocket in steroid:
            steroid.remove(pocket)

def load_x(path, chosen_pocket): #!
    X = np.zeros((len(chosen_pocket),14,32,32,32))
    for i in range(0,len(chosen_pocket)):
        X[i,:,:,:,:] = np.load("{}{}".format(path, chosen_pocket[i]))
        np.squeeze(X[i,:,:,:,:])
    return X

def load_y(chosen_pocket, nucleotid, heme, control, steroid):
    Y = []
    for pocket in chosen_pocket:
        if pocket in nucleotid:
            Y.append(1)
        elif pocket in heme:
            Y.append(2)
        elif pocket in steroid:
            Y.append(4)
        elif pocket in control:
            Y.append(3)
    Y  = np.array(Y)
    return Y

def one_hot_encoding(y):
    classes = LabelEncoder()
    integer_encoded = classes.fit_transform(y)
    one_hot = keras.utils.to_categorical(integer_encoded, num_classes= 3) 
    return one_hot

def list_generator(file):
    with open(file, "r") as filin:
        liste = ["{}.npy".format(line[:-1]) for line in filin]
    return liste

In [25]:
nucleotid = list_generator("nucleotide.list.txt")
heme = list_generator("heme.list.txt")
steroid = list_generator("steroid.list.txt")
control = list_generator("control.list.txt")

In [26]:
print(len(nucleotid))
print(len(heme))
print(len(control))
print(len(steroid))
print(len(nucleotid)+len(heme)+len(control)+len(steroid))
print(len(listdir(PATH_DATA)))

1553
596
1946
69
4164
4164


In [27]:
train_pocket = equilibrator_samplor(PATH_DATA, nucleotid, heme, control, steroid, 75)
X_train = load_x(PATH_DATA, train_pocket)
Y_train = load_y(train_pocket, nucleotid, heme, control, steroid)
one_hot_Y_train = one_hot_encoding(Y_train)
remove_list(train_pocket, nucleotid, heme, control, steroid)

print(len(nucleotid))
print(len(heme))
print(len(control))
print(len(steroid))
print(len(train_pocket))

1478
521
1871
69
225


In [28]:
test_pocket = equilibrator_samplor(PATH_DATA, nucleotid, heme, control, steroid, 25)
X_test = load_x(PATH_DATA, test_pocket)
Y_test = load_y(test_pocket, nucleotid, heme, control, steroid)
one_hot_Y_test = one_hot_encoding(Y_test)
print(X_train.shape)
print(type(X_train))
print(X_test.shape)
print(type(X_test))
print(len(Y_train))
print(type(Y_train))
print(len(Y_test))
print(type(Y_test))
print(one_hot_Y_train.shape)
print(type(one_hot_Y_train))
print(one_hot_Y_test.shape)
print(type(one_hot_Y_test))

(225, 14, 32, 32, 32)
<class 'numpy.ndarray'>
(75, 14, 32, 32, 32)
<class 'numpy.ndarray'>
225
<class 'numpy.ndarray'>
75
<class 'numpy.ndarray'>
(225, 3)
<class 'numpy.ndarray'>
(75, 3)
<class 'numpy.ndarray'>


In [31]:
for pocket in train_pocket:
    if pocket in test_pocket:
        print("putain")

In [32]:
nt= 0
hem = 0
ste = 0
ctr = 0

for i in range(0, one_hot_Y_train.shape[0]):
    if one_hot_Y_train[i,0]:
        ctr += 1
    elif one_hot_Y_train[i,1]:
        nt += 1
    elif one_hot_Y_train[i,2]:
        hem += 1
    else:
        ste += 1

In [33]:
print(nt)
print(hem)
print(ste)
print(ctr)
print("{}+{}+{}+{} = {}".format(nt,hem,ste,ctr, nt+hem+ste+ctr))
print(len(one_hot_Y_train))

75
75
0
75
75+75+0+75 = 225
225


### 2) Construction du modèle

In [35]:
def model_nique():
    model = Sequential()
    model.add(Conv3D(filters = 14, kernel_size = 5, data_format="channels_first",
                     strides=1, padding= "valid", activation = "relu", kernel_initializer="he_normal",
                     input_shape = (14,32,32,32)))
    model.add(Conv3D(filters = 14, kernel_size = 3, data_format="channels_first", strides=1, padding= "valid", activation = "relu"))
    model.add(Dropout(rate = 0.3))
    model.add(MaxPool3D(pool_size = 4, strides = 1, padding = "valid"))
    model.add(Dropout(rate = 0.3))
    model.add(Flatten(data_format = "channels_first"))
    model.add(Dense(units = 100, activation = "relu"))
    model.add(Dropout(rate = 0.3))
    model.add(Dense(units = 3, activation = "softmax"))
    model.compile(optimizer="adam",loss="categorical_crossentropy",metrics=['accuracy'])
    return model

In [37]:
np.random.seed(2000)
critor = EarlyStopping(monitor = "val_loss", patience = 3, mode = "min")
model_lol = model_nique()

#best_model_path = "../results/my_model"+".h5"
#best_model = ModelCheckpoint(best_model_path, monitor = "val_loss", verbose = 2, save_best_only = True)
#my_best_model = load_model("../results/my_model.h5")

model_lol.fit(X_train, one_hot_Y_train, epochs = 10, batch_size = 20,
             validation_split = 0.1, callbacks = [critor])

Train on 202 samples, validate on 23 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x7f784eaf5c90>

In [40]:
model_lol.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv3d_7 (Conv3D)            (None, 14, 28, 28, 28)    24514     
_________________________________________________________________
conv3d_8 (Conv3D)            (None, 14, 26, 26, 26)    5306      
_________________________________________________________________
dropout_10 (Dropout)         (None, 14, 26, 26, 26)    0         
_________________________________________________________________
max_pooling3d_4 (MaxPooling3 (None, 11, 23, 23, 26)    0         
_________________________________________________________________
dropout_11 (Dropout)         (None, 11, 23, 23, 26)    0         
_________________________________________________________________
flatten_4 (Flatten)          (None, 151294)            0         
_________________________________________________________________
dense_7 (Dense)              (None, 100)              

# 3) Evaluation du modèle

In [42]:
evaluation = model_lol.evaluate(X_test, one_hot_Y_test, batch_size = 10, callbacks = [critor])
print(evaluation)

[1.0986279169718425, 0.3333333432674408]


In [None]:
training = KerasClassifier(build_fn = model_one, epochs = 5, batch_size=20, verbose=0)
kfold = KFold(n_splits = 5, shuffle=True)
cv_result = cross_val_score(training, X_train, one_hot_Y_train, cv = kfold)
print(cv_result)
print("%.2f%%(%2d%%)"%(cv_result.mean()*100, cv_result.std()*100))

In [21]:
predictions = model_lol.predict(X_train, batch_size = 10)

In [22]:
for p in predictions:
    print(p)

[0.34978378 0.30903995 0.34117636]
[0.34978378 0.30903995 0.34117636]
[0.34978378 0.30903995 0.34117636]
[0.34978378 0.30903995 0.34117636]
[0.34978378 0.30903995 0.34117636]
[0.34978378 0.30903995 0.34117636]
[0.34978378 0.30903995 0.34117636]
[0.34978378 0.30903995 0.34117636]
[0.34978378 0.30903995 0.34117636]
[0.34978378 0.30903995 0.34117636]
[0.34978378 0.30903995 0.34117636]
[0.34978378 0.30903995 0.34117636]
[0.34978378 0.30903995 0.34117636]
[0.34978378 0.30903995 0.34117636]
[0.34978378 0.30903995 0.34117636]
[0.34978378 0.30903995 0.34117636]
[0.34978378 0.30903995 0.34117636]
[0.34978378 0.30903995 0.34117636]
[0.34978378 0.30903995 0.34117636]
[0.34978378 0.30903995 0.34117636]
[0.34978378 0.30903995 0.34117636]
[0.34978378 0.30903995 0.34117636]
[0.34978378 0.30903995 0.34117636]
[0.34978378 0.30903995 0.34117636]
[0.34978378 0.30903995 0.34117636]
[0.34978378 0.30903995 0.34117636]
[0.34978378 0.30903995 0.34117636]
[0.34978378 0.30903995 0.34117636]
[0.34978378 0.309039

In [None]:



tp = 0
fp = 0
tn = 0
fn = 0

for i in range(predictions.shape[0]):
    maxi = max(predictions[i,:])
    if maxi == predictions[i, 0]:
        classe = 0
    elif maxi == predictions[i,1]:
        classe = 1
    elif maxi == predictions[i,2]:
        classe = 2
        
    if (one_hot_Y_test[i, 0] == 1.0) and (classe == 0):
        tp += 1
    elif (one_hot_Y_test[i, 1] == 1.0) and (classe == 1):
        tp += 1
    elif (one_hot_Y_test[i, 2] == 1.0) and (classe == 0):
        fp += 1
    elif (one_hot_Y_test[i, 2] == 1.0) and (classe == 1):
        fp += 1
    elif (one_hot_Y_test[i, 2] == 1.0) and (classe == 2):
        tn += 1
    elif (one_hot_Y_test[i, 2] == 0.0) and (classe == 2):
        fn += 1
        
from math import sqrt

print("TP:{:.2f}%".format(tp*100/len(predictions)))
print("FP:{:.2f}%".format(fp*100/len(predictions)))
print("TN:{:.2f}".format(tn*100/len(predictions)))
print("FN:{:.2f}".format(fn*100/len(predictions)))
print("ACC = {:.2f}%".format((tp+tn)*100/(tp+tn+fp+fn)))
print("PPV = {:.2f}%".format(tp*100/(tp+fp)))
print("TNR = {:.2f}%".format(tn*100/(tn+fp)))
print("TPR = {:.2f}%".format(tp*100/(tp+fn)))
print("FPR = {:.2f}%".format(fp*100/(fp+tn)))
print("MCC = {:.2f}".format(((tn*tp)-(fp*fn))/sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn))))