## Classification des poches protéiques en fonction du type de druggabilité, par un CNN

### 1) Préparation des données

In [1]:
import keras
from keras import Input, Model
import numpy as np
from random import shuffle, sample, randint
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score, KFold
from keras.wrappers.scikit_learn import KerasClassifier
from keras.callbacks.callbacks import EarlyStopping, Callback, ModelCheckpoint
from keras.layers import add, Activation, Conv3D, MaxPooling3D, Dense, Flatten, Dropout
from keras.models import Sequential, load_model
from os import listdir

Using TensorFlow backend.


In [2]:
PATH_DATA = "/media/anthony/POULOP/deepdrug3d_voxel_data/"

In [43]:
def equilibrator_samplor(path, nucleotid, heme, control, steroid, k):
    all_pocket = listdir(path)
    ech = sample(nucleotid, randint(k-15, k+15)) + sample(heme, randint(k-15, k+15)) + sample(control, randint(k-15, k+15))
    shuffle(ech)
    return ech

def remove_list(chosen_pocket, nucleotid, heme, control, steroid):
    for pocket in chosen_pocket:
        if pocket in nucleotid:
            nucleotid.remove(pocket)
        elif pocket in heme:
            heme.remove(pocket)
        elif pocket in control:
            control.remove(pocket)
        elif pocket in steroid:
            steroid.remove(pocket)

def load_x(path, chosen_pocket):
    X = np.zeros((len(chosen_pocket),14,32,32,32))
    for i in range(0, len(chosen_pocket)):
        X[i,:,:,:,:] = np.load("{}{}".format(path, chosen_pocket[i]))
        if X[i,:,:,:,:].shape == (1,14,32,32,32):
            np.squeeze(X[i,:,:,:,:])
    return X

def load_y(chosen_pocket, nucleotid, heme, control, steroid):
    Y = np.zeros((len(chosen_pocket),3))
    for i in range(0, len(chosen_pocket)):
        if chosen_pocket[i] in nucleotid:
            Y[i,0] = 1
        elif chosen_pocket[i] in heme:
            Y[i,1] = 1
        elif chosen_pocket[i] in control:
            Y[i,2] = 1
    return Y

def one_hot_encoding(y):
    classes = LabelEncoder()
    integer_encoded = classes.fit_transform(y)
    one_hot = keras.utils.to_categorical(integer_encoded, num_classes= 3) 
    return one_hot

def list_generator(file):
    with open(file, "r") as filin:
        liste = ["{}.npy".format(line[:-1]) for line in filin]
    return liste

In [197]:
nucleotid = list_generator("nucleotide.list.txt")
heme = list_generator("heme.list.txt")
steroid = list_generator("steroid.list.txt")
control = list_generator("control.list.txt")

In [198]:
print(len(nucleotid))
print(len(heme))
print(len(control))
print(len(steroid))
print(len(nucleotid)+len(heme)+len(control)+len(steroid))
print(len(listdir(PATH_DATA)))

1553
596
1946
69
4164
4164


In [199]:
train_pocket = equilibrator_samplor(PATH_DATA, nucleotid, heme, control, steroid, 100)
X_train = load_x(PATH_DATA, train_pocket)
one_hot_Y_train = load_y(train_pocket, nucleotid, heme, control, steroid)
#one_hot_Y_train = one_hot_encoding(Y_train)
remove_list(train_pocket, nucleotid, heme, control, steroid)

print(len(nucleotid))
print(len(heme))
print(len(control))
print(len(steroid))
print(len(train_pocket))

1453
497
1848
69
297


In [200]:
test_pocket = equilibrator_samplor(PATH_DATA, nucleotid, heme, control, steroid, 100)
X_test = load_x(PATH_DATA, test_pocket)
one_hot_Y_test = load_y(test_pocket, nucleotid, heme, control, steroid)
#one_hot_Y_test = one_hot_encoding(Y_test)
print(X_train.shape)
print(type(X_train))
print(X_test.shape)
print(type(X_test))
#print(len(Y_train))
#print(type(Y_train))
#print(len(Y_test))
#print(type(Y_test))
print(one_hot_Y_train.shape)
print(type(one_hot_Y_train))
print(one_hot_Y_test.shape)
print(type(one_hot_Y_test))

(297, 14, 32, 32, 32)
<class 'numpy.ndarray'>
(319, 14, 32, 32, 32)
<class 'numpy.ndarray'>
(297, 3)
<class 'numpy.ndarray'>
(319, 3)
<class 'numpy.ndarray'>


In [201]:
for pocket in train_pocket:
    if pocket in test_pocket:
        print("putain")

In [202]:
nt= 0
hem = 0
ste = 0
ctr = 0

for i in range(0, one_hot_Y_train.shape[0]):
    if one_hot_Y_train[i,0]:
        ctr += 1
    elif one_hot_Y_train[i,1]:
        nt += 1
    elif one_hot_Y_train[i,2]:
        hem += 1
    else:
        ste += 1

In [203]:
print(nt)
print(hem)
print(ste)
print(ctr)
print("{}+{}+{}+{} = {}".format(nt,hem,ste,ctr, nt+hem+ste+ctr))
print(len(one_hot_Y_train))

99
98
0
100
99+98+0+100 = 297
297


In [204]:
from numpy import isnan

print(True in isnan(X_test))
print(True in isnan(X_train))

False
False


### 2) Construction du modèle

In [None]:
def seq_model():
    model = Sequential()
    model.add(Conv3D(filters = 14, kernel_size = 5, data_format="channels_first",
                     strides=1, padding= "same", activation = "relu", kernel_initializer="he_normal",
                     input_shape = (14,32,32,32)))
    model.add(Conv3D(filters = 14, kernel_size = 3, data_format="channels_first",
                     strides=1, padding= "same", activation = "relu"))
    model.add(Dropout(rate = 0.5))
    model.add(MaxPool3D(pool_size = 4, strides = 1, padding = "valid"))
    model.add(Dropout(rate = 0.5))
    model.add(Flatten(data_format = "channels_first"))
    model.add(Dense(100))
    model.add(Dense(units = 3, activation = "softmax"))
    model.compile(optimizer="adam",loss="categorical_crossentropy",metrics=['accuracy'])
    return model

In [205]:
def model_one():
    input_layer = keras.Input(shape=(14, 32, 32, 32))
    conv_1 = keras.layers.Conv3D(
        filters = 32,
        kernel_size = 5,
        activation = "relu", 
        data_format = "channels_first",
        padding = "valid"
        )(input_layer)
    dropout_1 = keras.layers.Dropout(rate=0.2)(conv_1)
    conv_2 = keras.layers.Conv3D(
        filters = 32,
        kernel_size = 3,
        activation = "relu", 
        data_format="channels_first",
        padding="valid"
        )(dropout_1)
    max_pooling_1 = keras.layers.MaxPooling3D(
        pool_size=(2,2,2),
        strides=None,
        padding="valid",
        data_format="channels_first"
        )(conv_2)
    dropout_2 = keras.layers.Dropout(rate=0.4)(max_pooling_1)
    flatten_1 = keras.layers.Flatten()(dropout_2)
    dense_1 = keras.layers.Dense(units=100, activation="relu")(flatten_1)
    output_layer = keras.layers.Dense(units=3, activation="softmax")(dense_1)
    notdeepdrug_model = keras.Model(inputs=input_layer,outputs=output_layer)
    notdeepdrug_model.compile(
            optimizer="adam", 
            loss="categorical_crossentropy",
            metrics=["accuracy"]
            )
    return notdeepdrug_model

In [206]:
fct_model  = model_one()
fct_model.fit(X_train, one_hot_Y_train, epochs = 10, batch_size = 20,
              validation_split = 0.05, shuffle = True)

Train on 282 samples, validate on 15 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.callbacks.History at 0x7fb09d95f7d0>

In [207]:
fct_model.summary()

Model: "model_30"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_41 (InputLayer)        (None, 14, 32, 32, 32)    0         
_________________________________________________________________
conv3d_79 (Conv3D)           (None, 32, 28, 28, 28)    56032     
_________________________________________________________________
dropout_72 (Dropout)         (None, 32, 28, 28, 28)    0         
_________________________________________________________________
conv3d_80 (Conv3D)           (None, 32, 26, 26, 26)    27680     
_________________________________________________________________
max_pooling3d_31 (MaxPooling (None, 32, 13, 13, 13)    0         
_________________________________________________________________
dropout_73 (Dropout)         (None, 32, 13, 13, 13)    0         
_________________________________________________________________
flatten_31 (Flatten)         (None, 70304)             0  

# 3) Evaluation du modèle

In [None]:
evaluation = fct_model.evaluate(X_test, one_hot_Y_test, batch_size = 32)
print(evaluation)

 32/319 [==>...........................] - ETA: 21s

In [None]:
training = KerasClassifier(build_fn = model_one, epochs = 5, batch_size=20, verbose=0)
kfold = KFold(n_splits = 5, shuffle=True)
cv_result = cross_val_score(training, X_train, one_hot_Y_train, cv = kfold)
print(cv_result)
print("%.2f%%(%2d%%)"%(cv_result.mean()*100, cv_result.std()*100))

In [194]:
predictions = fct_model.predict(X_test, batch_size = 20)

In [196]:
for p in predictions:
    print(p)

[0.32951343 0.3376421  0.33284447]
[0.32951343 0.3376421  0.33284447]
[0.32951343 0.3376421  0.33284447]
[0.32951343 0.3376421  0.33284447]
[0.32951343 0.3376421  0.33284447]
[0.32951343 0.3376421  0.33284447]
[0.32951343 0.3376421  0.33284447]
[0.32951343 0.3376421  0.33284447]
[0.32951343 0.3376421  0.33284447]
[0.32951343 0.3376421  0.33284447]
[0.32951343 0.3376421  0.33284447]
[0.32951343 0.3376421  0.33284447]
[0.32951343 0.3376421  0.33284447]
[0.32951343 0.3376421  0.33284447]
[0.32951343 0.3376421  0.33284447]
[0.32951343 0.3376421  0.33284447]
[0.32951343 0.3376421  0.33284447]
[0.32951343 0.3376421  0.33284447]
[0.32951343 0.3376421  0.33284447]
[0.32951343 0.3376421  0.33284447]
[0.32951343 0.3376421  0.33284447]
[0.32951343 0.3376421  0.33284447]
[0.32951343 0.3376421  0.33284447]
[0.32951343 0.3376421  0.33284447]
[0.32951343 0.3376421  0.33284447]
[0.32951343 0.3376421  0.33284447]
[0.32951343 0.3376421  0.33284447]
[0.32951343 0.3376421  0.33284447]
[0.32951343 0.337642

In [None]:
tp = 0
fp = 0
tn = 0
fn = 0

for i in range(predictions.shape[0]):
    maxi = max(predictions[i,:])
    if maxi == predictions[i, 0]:
        classe = 0
    elif maxi == predictions[i,1]:
        classe = 1
    elif maxi == predictions[i,2]:
        classe = 2
        
    if (one_hot_Y_test[i, 0] == 1.0) and (classe == 0):
        tp += 1
    elif (one_hot_Y_test[i, 1] == 1.0) and (classe == 1):
        tp += 1
    elif (one_hot_Y_test[i, 2] == 1.0) and (classe == 0):
        fp += 1
    elif (one_hot_Y_test[i, 2] == 1.0) and (classe == 1):
        fp += 1
    elif (one_hot_Y_test[i, 2] == 1.0) and (classe == 2):
        tn += 1
    elif (one_hot_Y_test[i, 2] == 0.0) and (classe == 2):
        fn += 1
        
from math import sqrt

print("TP:{:.2f}%".format(tp*100/len(predictions)))
print("FP:{:.2f}%".format(fp*100/len(predictions)))
print("TN:{:.2f}".format(tn*100/len(predictions)))
print("FN:{:.2f}".format(fn*100/len(predictions)))
print("ACC = {:.2f}%".format((tp+tn)*100/(tp+tn+fp+fn)))
print("PPV = {:.2f}%".format(tp*100/(tp+fp)))
print("TNR = {:.2f}%".format(tn*100/(tn+fp)))
print("TPR = {:.2f}%".format(tp*100/(tp+fn)))
print("FPR = {:.2f}%".format(fp*100/(fp+tn)))
print("MCC = {:.2f}".format(((tn*tp)-(fp*fn))/sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn))))