## Classification des poches protéiques en fonction du type de druggabilité, par un CNN

### 1) Préparation des données

In [1]:
import os
from math import floor
from os import listdir
from random import sample
from random import shuffle

import keras
import numpy as np
from keras.callbacks.callbacks import EarlyStopping, Callback, ModelCheckpoint
from keras.layers import Conv3D, MaxPool3D
from keras.layers import Dense, Flatten, Dropout
from keras.layers import add, Activation
from keras.models import Sequential, load_model
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder

Using TensorFlow backend.


In [2]:
# Directory holding one voxelised pocket per ".npy" file.
# The default is the original author's local mount point; set the
# DEEPDRUG3D_VOXEL_DATA environment variable to run the notebook on another
# machine without editing this cell.
PATH_DATA = os.environ.get("DEEPDRUG3D_VOXEL_DATA",
                           "/media/anthony/POULOP/deepdrug3d_voxel_data")

In [3]:
def equilibrator_samplor(path, nucleotid, heme, control, steroid, k):
    """Draw a class-balanced, randomly ordered sample of pocket files.

    `k` names are sampled without replacement from each of `nucleotid`,
    `heme` and `control`; the combined list is shuffled and then restricted
    to the files that actually exist in `path`.

    Parameters
    ----------
    path : str
        Directory containing the pocket files.
    nucleotid, heme, control : list of str
        File names belonging to each class.
    steroid : list of str
        Accepted so the signature matches the other helpers; this class is
        not sampled.
    k : int
        Number of pockets drawn per class.

    Returns
    -------
    list of str
        Up to ``3 * k`` distinct file names, in shuffled order. Names that
        are absent from `path` are dropped.

    Raises
    ------
    ValueError
        Propagated from ``random.sample`` when `k` exceeds a class size.
    """
    available = set(listdir(path))
    ech = sample(nucleotid, k) + sample(heme, k) + sample(control, k)
    shuffle(ech)
    # Filter the *shuffled* list rather than the directory listing: iterating
    # over listdir() would discard the shuffle and hand the samples back in
    # directory order. dict.fromkeys removes duplicates (a name listed in two
    # classes) while keeping the shuffled order.
    chosen_pocket = [pocket for pocket in dict.fromkeys(ech)
                     if pocket in available]
    return chosen_pocket

def remove_list(chosen_pocket, nucleotid, heme, control, steroid):
    """Remove every chosen pocket from the class list that holds it.

    The four lists are modified in place. They are searched in the order
    nucleotid, heme, control, steroid, and a pocket is removed only from the
    first list in which it is found (one occurrence). Pockets found in no
    list are ignored. Returns None.
    """
    search_order = (nucleotid, heme, control, steroid)
    for name in chosen_pocket:
        owner = next((group for group in search_order if name in group), None)
        if owner is not None:
            owner.remove(name)

def load_x(path, chosen_pocket):
    """Load the selected pocket files into one channels-last array.

    Each file ``path/pocket`` is read with ``np.load``, singleton dimensions
    are squeezed out, the arrays are stacked along a new first axis and axis 1
    is moved to the last position.

    Parameters
    ----------
    path : str
        Directory containing the ".npy" files.
    chosen_pocket : list of str
        File names to load, in the order wanted for the output.

    Returns
    -------
    numpy.ndarray
        Stacked array; presumably (n, x, y, z, channels) for files stored as
        (1, channels, x, y, z) -- TODO confirm against the data set.

    Raises
    ------
    ValueError
        If ``np.load`` rejects a file; the message names the offending file.
    """
    arrays = []
    for pocket in chosen_pocket:
        try:
            voxels = np.load("{}/{}".format(path, pocket))
        except ValueError as err:
            # Report which file is broken and stop: the previous handler
            # referenced a comprehension variable that is out of scope and
            # then carried on with nothing loaded.
            raise ValueError(
                "Could not load pocket file '{}': {}".format(pocket, err)
            ) from err
        arrays.append(np.squeeze(voxels))
    X = np.array(arrays)
    X = np.moveaxis(X, 1, -1)
    return X

def load_y(chosen_pocket, nucleotid, heme, control, steroid):
    """Return the integer class label of every chosen pocket.

    Labels: nucleotid -> 1, heme -> 2, control -> 3, steroid -> 4. When a
    name appears in several lists the precedence is nucleotid, heme, steroid,
    control.

    Parameters
    ----------
    chosen_pocket : list of str
        Pocket file names, in the same order as the samples they label.
    nucleotid, heme, control, steroid : list of str
        File names belonging to each class.

    Returns
    -------
    numpy.ndarray
        One label per entry of `chosen_pocket`.

    Raises
    ------
    ValueError
        If a pocket belongs to none of the four lists. Skipping it silently
        would leave the labels shorter than, and misaligned with, the samples.
    """
    # Build the lookup from lowest to highest precedence so that later
    # assignments overwrite earlier ones; one dict lookup per pocket replaces
    # up to four linear list scans.
    label_of = {}
    for names, label in ((control, 3), (steroid, 4), (heme, 2), (nucleotid, 1)):
        for name in names:
            label_of[name] = label

    Y = []
    for pocket in chosen_pocket:
        if pocket not in label_of:
            raise ValueError(
                "Pocket '{}' does not belong to any class list".format(pocket))
        Y.append(label_of[pocket])
    Y = np.array(Y)
    return Y

def one_hot_encoding(y):
    """One-hot encode a vector of class labels.

    The labels are first re-indexed with a freshly fitted ``LabelEncoder`` and
    the resulting integers are expanded with ``keras.utils.to_categorical``.

    NOTE(review): the encoder is fitted on `y` alone, so the meaning of each
    output column depends on which labels occur in this particular `y` --
    confirm that train and test vectors always contain the same classes.
    """
    integer_encoding = LabelEncoder().fit_transform(y)
    return keras.utils.to_categorical(integer_encoding)

def list_generator(file):
    """Read pocket identifiers from a text file and return their file names.

    Parameters
    ----------
    file : str
        Path to a text file with one identifier per line.

    Returns
    -------
    list of str
        ``"<identifier>.npy"`` for every non-blank line, in file order.
    """
    with open(file, "r") as filin:
        # strip() instead of line[:-1]: slicing cuts off a real character when
        # the last line has no trailing newline and leaves "\r" on CRLF files.
        identifiers = (line.strip() for line in filin)
        # Blank lines would otherwise yield a bogus ".npy" entry.
        liste = ["{}.npy".format(name) for name in identifiers if name]
    return liste

In [44]:
# One list of ".npy" file names per class, read from the text files that sit
# next to the notebook.
nucleotid, heme, steroid, control = (
    list_generator(list_file)
    for list_file in ("nucleotide.list.txt", "heme.list.txt",
                      "steroid.list.txt", "control.list.txt")
)

In [5]:
# Class sizes, one per line: nucleotide, heme, control, steroid.
print(len(nucleotid), len(heme), len(control), len(steroid), sep="\n")

1553
596
1946
69


In [79]:
# Training set: 75 pockets sampled from each of the nucleotide, heme and
# control lists (the steroid list is passed along but never sampled).
train_pocket = equilibrator_samplor(PATH_DATA, nucleotid, heme, control, steroid, 75)
X_train = load_x(PATH_DATA, train_pocket)
# load_y labels a pocket by looking it up in the class lists, so it has to run
# BEFORE remove_list strips the chosen pockets out of those lists.
Y_train = load_y(train_pocket, nucleotid, heme, control, steroid)
one_hot_Y_train = one_hot_encoding(Y_train)

# Remove the training pockets from the class lists (in place) so that the test
# sample drawn later cannot contain them.
# NOTE: because the lists are mutated, this cell is not idempotent; reload the
# lists before running it again.
remove_list(train_pocket, nucleotid, heme, control, steroid)
# Remaining pockets per class: nucleotide, heme, control, steroid.
print(len(nucleotid))
print(len(heme))
print(len(control))
print(len(steroid))

1478
521
1871
69


In [80]:
# Test set: 25 pockets per sampled class, drawn from the class lists that the
# training cell has already pruned.
test_pocket = equilibrator_samplor(PATH_DATA, nucleotid, heme, control, steroid, k=25)
X_test = load_x(PATH_DATA, test_pocket)
Y_test = load_y(test_pocket, nucleotid, heme, control, steroid)
one_hot_Y_test = one_hot_encoding(Y_test)

# Shapes and sizes of both splits, one value per line.
print(
    X_train.shape,
    X_test.shape,
    len(Y_train),
    len(Y_test),
    one_hot_Y_train.shape,
    one_hot_Y_test.shape,
    sep="\n",
)

(225, 32, 32, 32, 14)
(75, 32, 32, 32, 14)
225
75
(225, 3)
(75, 3)


In [81]:
# Sanity check: no pocket may be shared between the training and test sets.
# A set intersection replaces the nested list scan, and a failed check now
# stops the notebook and names the offending pockets instead of printing a
# word that carries no information.
leaked_pockets = sorted(set(train_pocket) & set(test_pocket))
assert not leaked_pockets, (
    "{} pocket(s) appear in both the training and test sets, e.g. {}".format(
        len(leaked_pockets), leaked_pockets[:5]))

In [82]:
# Per-class sample counts of the training set, taken from the one-hot matrix.
#
# Column meaning: load_y labels nucleotide/heme/control as 1/2/3 and
# one_hot_encoding re-indexes the labels in sorted order, so
#   column 0 -> nucleotide, column 1 -> heme, column 2 -> control.
# (The previous loop attributed column 0 to control, column 1 to nucleotide
# and column 2 to heme; the slip was invisible only because the classes are
# balanced.)
# NOTE(review): this mapping holds when all three classes are present in
# Y_train -- confirm if the sampling ever changes.
class_totals = one_hot_Y_train.sum(axis=0)
nt, hem, ctr = (int(total) for total in class_totals[:3])

# Rows not accounted for by the first three columns (a possible fourth,
# steroid, column); 0 with the current three-class sampling.
ste = int(one_hot_Y_train.shape[0]) - (nt + hem + ctr)

In [83]:
# Counts per class (nucleotide, heme, steroid, control), then a check that
# they add up to the number of training samples.
print(nt, hem, ste, ctr, sep="\n")
print("{}+{}+{}+{} = {}".format(nt, hem, ste, ctr, sum((nt, hem, ste, ctr))))
print(one_hot_Y_train.shape[0])

75
75
0
75
75+75+0+75 = 225
225


### 2) Construction du modèle

In [84]:
def model_one(input_shape=(32, 32, 32, 14)):
    """Build and compile the 3D-CNN pocket classifier.

    Architecture: two Conv3D layers (64 filters each, ReLU, "same" padding),
    dropout, 3D max pooling, dropout, flatten, a 75-unit ReLU dense layer,
    dropout and a 3-way softmax output. Compiled with Adam, categorical
    cross-entropy and accuracy.

    Parameters
    ----------
    input_shape : tuple of int, optional
        Shape of one sample, channels last. Defaults to the shape of the
        voxel grids loaded by this notebook. Declaring it on the first layer
        builds the model immediately, so ``summary()`` can be called before
        ``fit()``.

    Returns
    -------
    keras.models.Sequential
        The compiled, untrained model.
    """
    # NOTE(review): kernels of 28 and 26 voxels on a 32-voxel grid are
    # extremely large. The numbers look like the output widths of unpadded
    # 5- and 3-wide convolutions rather than kernel sizes -- confirm the
    # intended architecture.
    model = Sequential()
    model.add(Conv3D(filters=64, kernel_size=(28, 28, 28),
                     data_format="channels_last", strides=1, padding="same",
                     activation="relu", input_shape=input_shape))
    model.add(Conv3D(filters=64, kernel_size=(26, 26, 26),
                     data_format="channels_last", strides=1, padding="same",
                     activation="relu"))
    model.add(Dropout(0.2))
    # Stride 1 with "same" padding: pooling does not shrink the grid.
    model.add(MaxPool3D(pool_size=2, strides=1, padding="same"))
    model.add(Dropout(0.4))
    model.add(Flatten())
    model.add(Dense(units=75, activation="relu"))
    model.add(Dropout(0.4))
    model.add(Dense(units=3, activation="softmax"))
    model.compile(optimizer="adam", loss="categorical_crossentropy",
                  metrics=['accuracy'])
    return model

In [None]:
# Print the layer-by-layer summary of the model.
# NOTE(review): `my_model` is only assigned in the next cell, so on a fresh
# kernel this cell fails with a NameError; it should run after the model has
# been created.
my_model.summary()

In [None]:
# Seed NumPy's global generator before the model is created.
# NOTE(review): this does not seed Python's `random` module or the backend, so
# runs are presumably not fully reproducible -- confirm.
np.random.seed(2000)

my_model = model_one()

# Stop training once the validation loss has not improved for 4 epochs.
critor = EarlyStopping(monitor="val_loss", mode="min", patience=4)

# Disabled experiment: checkpoint the best model and reload it afterwards.
#best_model_path = "../results/my_model"+".h5"
#best_model = ModelCheckpoint(best_model_path, monitor = "val_loss", verbose = 2, save_best_only = True)
#my_best_model = load_model("../results/my_model.h5")

# 10 % of the training samples are held out for validation.
my_model.fit(
    x=X_train,
    y=one_hot_Y_train,
    batch_size=50,
    epochs=15,
    validation_split=0.1,
    callbacks=[critor],
)

Train on 202 samples, validate on 23 samples
Epoch 1/15


### 3) Évaluation du modèle

In [13]:
# Score the trained model on the held-out set. The model is compiled with a
# single metric, so the result is [loss, accuracy].
# A loss close to ln(3) ~= 1.0986 with accuracy 1/3 is what a model emitting
# uniform probabilities over three classes would score.
evaluation = my_model.evaluate(x=X_test, y=one_hot_Y_test)
print(evaluation)

[1.0986277310535162, 0.3333333432674408]


In [None]:
# 5-fold cross-validation of the architecture on the training set.
training = KerasClassifier(build_fn=model_one, epochs=5, batch_size=20, verbose=0)
# Fixed random_state so the shuffled folds are the same on every run.
kfold = KFold(n_splits=5, shuffle=True, random_state=2000)
cv_result = cross_val_score(training, X_train, one_hot_Y_train, cv=kfold)
print(cv_result)
# Mean accuracy with its standard deviation in parentheses, both in percent.
# The deviation is printed with two decimals; "%2d" truncated it to an integer.
print("%.2f%%(%.2f%%)" % (cv_result.mean() * 100, cv_result.std() * 100))

In [None]:
# Model outputs for the test samples; the final layer of model_one is a
# 3-unit softmax, so each row holds three class scores.
predictions = my_model.predict(X_test)

In [None]:
from math import sqrt


def _ratio(numerator, denominator, scale=100):
    """Return scale * numerator / denominator, or NaN for a zero denominator."""
    if denominator == 0:
        return float("nan")
    return numerator * scale / denominator


# Binary confusion counts derived from the 3-class predictions: column 2 is
# the negative class, columns 0 and 1 are the positive classes.
# NOTE(review): with load_y's labels and the sorted label encoding, column 2
# should be the control class -- confirm.
tp = 0
fp = 0
tn = 0
fn = 0

# argmax picks the first maximum on ties, like the original if/elif chain,
# and always yields a class (the chain could leave `classe` undefined).
predicted_classes = np.argmax(predictions, axis=1)
true_classes = np.argmax(one_hot_Y_test, axis=1)

for true_class, classe in zip(true_classes, predicted_classes):
    if true_class == 2:
        if classe == 2:
            tn += 1
        else:
            fp += 1
    elif classe == 2:
        fn += 1
    elif classe == true_class:
        tp += 1
    # NOTE(review): a positive sample predicted as the *other* positive class
    # (0 <-> 1) is counted nowhere, as in the original code; it is missing
    # from every count, yet the TP/FP/TN/FN percentages still divide by all
    # predictions. Decide whether such cases are hits or errors.

n_predictions = len(predictions)

# _ratio prints "nan" instead of raising ZeroDivisionError when a denominator
# is empty, e.g. when the model predicts a single class for every sample.
print("TP:{:.2f}%".format(_ratio(tp, n_predictions)))
print("FP:{:.2f}%".format(_ratio(fp, n_predictions)))
print("TN:{:.2f}%".format(_ratio(tn, n_predictions)))
print("FN:{:.2f}%".format(_ratio(fn, n_predictions)))
print("ACC = {:.2f}%".format(_ratio(tp + tn, tp + tn + fp + fn)))
print("PPV = {:.2f}%".format(_ratio(tp, tp + fp)))
print("TNR = {:.2f}%".format(_ratio(tn, tn + fp)))
print("TPR = {:.2f}%".format(_ratio(tp, tp + fn)))
print("FPR = {:.2f}%".format(_ratio(fp, fp + tn)))
print("MCC = {:.2f}".format(
    _ratio((tn * tp) - (fp * fn),
           sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)),
           scale=1)))