In [1]:
import functools
import time

import keras
import keras.backend as K
import matplotlib.pyplot as plt
# import scikitplot as skplt
import numpy as np
import pandas as pd
import scikitplot as skplt
import seaborn as sns
import tensorflow as tf
import xgboost as xgb
from keras.layers import Dense, Dropout, Input, Lambda
from keras.models import Model, Sequential, load_model
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import accuracy_score, accuracy_score, auc, auc, average_precision_score, average_precision_score, \
    confusion_matrix, confusion_matrix, pairwise_distances, precision_score, precision_score, recall_score, \
    recall_score, roc_auc_score, roc_auc_score, roc_curve, roc_curve
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVC, SVR
from tqdm import tqdm
from rdkit import Chem
from rdkit.Chem import Draw, Descriptors
from rdkit.Chem import AllChem
from data_analysis import calculate_metrics, get_rdkit_features
import dill

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [9]:
def get_ECFP4(df):
    df = df.reset_index(drop=True)
    mols = [Chem.MolFromSmiles(rdk) for rdk in df.rdkit]
    ECFP = [AllChem.GetMorganFingerprintAsBitVect(x,2,nBits=2048) for x in mols]
    a = np.array(ECFP)
    ECFP_mols = a.astype(np.float32)
    Y = np.array(df.Binary)
    return(ECFP_mols, Y)

In [3]:
def get_keras_simple_nn(num_units=35, activation='relu', drop_rate=0.2):
    model = Sequential()
    model.add(Dense(num_units, activation=activation))
    model.add(Dropout(drop_rate))
    model.add(Dense(num_units, activation=activation))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='nadam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [4]:
baselines_map = {
    'knn_clf': KNeighborsClassifier(n_neighbors=1, weights='distance'),
    'random_forest': RandomForestClassifier(),
    'logistic_regression': LogisticRegression(),
    'svc': SVC(gamma='auto'),
    'xgboost':
        xgb.XGBClassifier(
            objective='binary:logistic',
            booster='gbtree',
            learning_rate=0.1,
            max_depth=6,
            min_child_weight=12,
            n_estimators=100,
            subsample=0.95
        ),
    'simple-NN': get_keras_simple_nn()
}

In [5]:
#change inputs with ECFP fingerprints and not panos features.
def get_baselines_performance(df_train, df_val, use_only=None):
    X_train, Y_train = get_ECFP4(df_train)
    X_cold, Y_cold =  get_ECFP4(df_val)

    if use_only is None:
        use_only = baselines_map.keys()
    metrics = {}
    for name, model in baselines_map.items():
        if name in use_only:
            if name == 'simple-NN':
                model.fit(X_train, Y_train, epochs=30, batch_size=32)
            else:
                model.fit(X_train, Y_train)

            y_pred = model.predict(X_cold).squeeze()
            y_true = Y_cold.squeeze()
            metrics[name] = calculate_metrics(y_true, y_pred)

    return pd.DataFrame(metrics).T

In [6]:
target_1 = 'p38'
base_path_1 = f'C:/Users/tomas/Documents/GitHub/kinase_binding'

data_fpath_1 = base_path_1+f'/data/{target_1}/data.csv'
df_p38=pd.read_csv(data_fpath_1).set_index('biolab_index')

with open(base_path_1+f'/data/{target_1}/train_val_folds.pkl', "rb") as in_f:
    train_val_folds_p38 = dill.load(in_f)

with open(base_path_1+f'/data/{target_1}/train_test_folds.pkl', "rb") as in_f:
    train_test_folds_p38 = dill.load(in_f)
    
target_2 = 'akt1'
base_path_2 = f'C:/Users/tomas/Documents/GitHub/kinase_binding'

data_fpath_2 = base_path_2+f'/data/{target_2}/data.csv'
df_akt1 = pd.read_csv(data_fpath_2).set_index('biolab_index')

with open(base_path_2+f'/data/{target_2}/train_val_folds.pkl', "rb") as in_f:
    train_val_folds_akt1 = dill.load(in_f)
with open(base_path_2+f'/data/{target_2}/train_test_folds.pkl', "rb") as in_f:
    train_test_folds_akt1 = dill.load(in_f)
    
target_3 = 'pi3k'
base_path_3 = f'C:/Users/tomas/Documents/GitHub/kinase_binding'

data_fpath_3 = base_path_3+f'/data/{target_3}/data.csv'
df_pi3k = pd.read_csv(data_fpath_3).set_index('biolab_index')

with open(base_path_3+f'/data/{target_3}/train_val_folds.pkl', "rb") as in_f:
    train_val_folds_pi3k = dill.load(in_f)
with open(base_path_3+f'/data/{target_3}/train_test_folds.pkl', "rb") as in_f:
    train_test_folds_pi3k = dill.load(in_f)

In [7]:
training_p38 = [df_p38.loc[train_val_folds_p38[0][0]],
                 df_p38.loc[train_val_folds_p38[1][0]],
                 df_p38.loc[train_val_folds_p38[2][0]],
                 df_p38.loc[train_val_folds_p38[3][0]],
                 df_p38.loc[train_val_folds_p38[4][0]],
                 df_p38.loc[train_val_folds_p38[5][0]],
                 df_p38.loc[train_test_folds_p38[0]]
                 ]
validation_p38 = [df_p38.loc[train_val_folds_p38[0][1]],
                   df_p38.loc[train_val_folds_p38[1][1]],
                   df_p38.loc[train_val_folds_p38[2][1]],
                   df_p38.loc[train_val_folds_p38[3][1]],
                   df_p38.loc[train_val_folds_p38[4][1]],
                   df_p38.loc[train_val_folds_p38[5][1]],
                   df_p38.loc[train_test_folds_p38[1]]
                   ]

training_akt1 = [df_akt1.loc[train_val_folds_akt1[0][0]],
                 df_akt1.loc[train_val_folds_akt1[1][0]],
                 df_akt1.loc[train_val_folds_akt1[2][0]],
                 df_akt1.loc[train_val_folds_akt1[3][0]],
                 df_akt1.loc[train_val_folds_akt1[4][0]],
                 df_akt1.loc[train_val_folds_akt1[5][0]],
                 df_akt1.loc[train_test_folds_akt1[0]]
                 ]
validation_akt1 = [df_akt1.loc[train_val_folds_akt1[0][1]],
                   df_akt1.loc[train_val_folds_akt1[1][1]],
                   df_akt1.loc[train_val_folds_akt1[2][1]],
                   df_akt1.loc[train_val_folds_akt1[3][1]],
                   df_akt1.loc[train_val_folds_akt1[4][1]],
                   df_akt1.loc[train_val_folds_akt1[5][1]],
                   df_akt1.loc[train_test_folds_akt1[1]]
                   ]

training_pi3k = [df_pi3k.loc[train_val_folds_pi3k[0][0]],
                 df_pi3k.loc[train_val_folds_pi3k[1][0]],
                 df_pi3k.loc[train_val_folds_pi3k[2][0]],
                 df_pi3k.loc[train_val_folds_pi3k[3][0]],
                 df_pi3k.loc[train_val_folds_pi3k[4][0]],
                 df_pi3k.loc[train_val_folds_pi3k[5][0]],
                 df_pi3k.loc[train_test_folds_pi3k[0]]
                 ]
validation_pi3k = [df_pi3k.loc[train_val_folds_pi3k[0][1]],
                   df_pi3k.loc[train_val_folds_pi3k[1][1]],
                   df_pi3k.loc[train_val_folds_pi3k[2][1]],
                   df_pi3k.loc[train_val_folds_pi3k[3][1]],
                   df_pi3k.loc[train_val_folds_pi3k[4][1]],
                   df_pi3k.loc[train_val_folds_pi3k[5][1]],
                   df_pi3k.loc[train_test_folds_pi3k[1]]
                   ]

In [21]:
metrics_all = list()
for i in range(len(training_pi3k)):
    metrics_all.append(get_baselines_performance(training_pi3k[i],validation_pi3k[i]))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [22]:
for i in range(len(metrics_all)):
    print(metrics_all[i])

                      roc_auc     tn    fp     fn     tp       map  precision  \
knn_clf              0.695398  282.0  47.0   97.0  111.0  0.555542   0.702532   
random_forest        0.742372  305.0  24.0   92.0  116.0  0.633410   0.828571   
logistic_regression  0.702361  285.0  44.0   96.0  112.0  0.565359   0.717949   
svc                  0.541114  326.0   3.0  189.0   19.0  0.430845   0.863636   
xgboost              0.692885  293.0  36.0  105.0  103.0  0.562472   0.741007   
simple-NN            0.709076  268.0  61.0  109.0   99.0  0.625345   0.618750   

                       recall  accuracy  
knn_clf              0.533654  0.731844  
random_forest        0.557692  0.783985  
logistic_regression  0.538462  0.739292  
svc                  0.091346  0.642458  
xgboost              0.495192  0.737430  
simple-NN            0.475962  0.683426  
                      roc_auc     tn    fp     fn     tp       map  precision  \
knn_clf              0.734324  308.0  59.0   63.0  107.0 

In [23]:
pd.DataFrame(metrics_all[6]).to_csv("../../../../Desktop/binding/thesis english/Results/baselines_test_pi3k.csv")