In [1]:
import functools
import time

import keras
import keras.backend as K
import matplotlib.pyplot as plt
# import scikitplot as skplt
import numpy as np
import pandas as pd
import scikitplot as skplt
import seaborn as sns
import tensorflow as tf
import xgboost as xgb
from keras.layers import Dense, Dropout, Input, Lambda
from keras.models import Model, Sequential, load_model
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import accuracy_score, accuracy_score, auc, auc, average_precision_score, average_precision_score, \
    confusion_matrix, confusion_matrix, pairwise_distances, precision_score, precision_score, recall_score, \
    recall_score, roc_auc_score, roc_auc_score, roc_curve, roc_curve
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVC, SVR
from tqdm import tqdm
from rdkit import Chem
from rdkit.Chem import Draw, Descriptors
from rdkit.Chem import AllChem
from data_analysis import calculate_metrics, get_rdkit_features
import dill
from sklearn.model_selection import train_test_split

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
def get_keras_simple_nn(num_units=35, activation='relu', drop_rate=0.2):
    """Build and compile a small feed-forward binary classifier.

    The network is two hidden ``Dense`` layers of ``num_units`` units each,
    with a ``Dropout`` layer between them, followed by a single sigmoid
    output unit. No input shape is declared here.

    Args:
        num_units: Width of each of the two hidden layers.
        activation: Activation function used by both hidden layers.
        drop_rate: Dropout rate applied after the first hidden layer.

    Returns:
        A ``Sequential`` model compiled with the ``nadam`` optimizer,
        ``binary_crossentropy`` loss and the ``accuracy`` metric.
    """
    layer_stack = [
        Dense(num_units, activation=activation),
        Dropout(drop_rate),
        Dense(num_units, activation=activation),
        Dense(1, activation='sigmoid'),
    ]
    network = Sequential(layer_stack)
    network.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['accuracy'])
    return network


# Gradient-boosted trees baseline, configured up front so the registry below
# stays a flat name -> estimator listing.
_xgb_baseline = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=6,
    min_child_weight=12,
    learning_rate=0.1,
    subsample=0.95,
    booster='gbtree',
    objective='binary:logistic',
)

# Registry of baseline classifiers, keyed by the name that labels each row of
# the metrics table. The estimators are created once, when this cell runs.
baselines_map = {
    'knn_clf': KNeighborsClassifier(n_neighbors=1, weights='distance'),
    'random_forest': RandomForestClassifier(),
    'logistic_regression': LogisticRegression(),
    'svc': SVC(gamma='auto'),
    'xgboost': _xgb_baseline,
    'simple-NN': get_keras_simple_nn(),
}


def get_baselines_performance(df_train, df_val, label_col='Binary', use_only=None):
    """Fit the selected baselines on ``df_train`` and score them on ``df_val``.

    Both frames are passed through ``get_rdkit_features``; only the ten
    descriptor columns listed in ``input_cols`` are used as model inputs.

    Args:
        df_train: Training frame. Must contain ``label_col`` and whatever
            ``get_rdkit_features`` needs to produce the descriptor columns.
        df_val: Evaluation frame with the same layout as ``df_train``.
        label_col: Name of the target column.
        use_only: Optional collection of ``baselines_map`` keys to evaluate.
            ``None`` (default) evaluates every registered baseline.

    Returns:
        ``pd.DataFrame`` with one row per evaluated baseline: the transpose
        of the ``{name: calculate_metrics(y_true, y_pred)}`` mapping.
    """
    df_train = get_rdkit_features(df_train)
    df_val = get_rdkit_features(df_val)
    input_cols = [
        'BalabanJ', 'BertzCT', 'MaxAbsPartialCharge', 'MolLogP', 'MolWt', 'NumAliphaticCarbocycles',
        'NumRotatableBonds', 'RingCount', 'SlogP_VSA10', 'TPSA'
    ]
    if use_only is None:
        use_only = baselines_map.keys()

    # The model inputs and labels do not depend on the baseline, so extract
    # them once instead of on every loop iteration.
    x_train = df_train[input_cols].values
    y_train = df_train[label_col].values
    x_val = df_val[input_cols].values
    y_true = df_val[label_col].values.squeeze()

    metrics = {}
    # Iterate over a snapshot because the 'simple-NN' entry is replaced below.
    for name, model in list(baselines_map.items()):
        if name not in use_only:
            continue

        if name == 'simple-NN':
            # A Keras model keeps its weights between ``fit`` calls, so reusing
            # the registry instance would continue training a network that was
            # already fitted on a previous dataset/split and contaminate this
            # evaluation. Start from a freshly initialised network instead.
            # NOTE: this rebuilds the network with get_keras_simple_nn()'s
            # default hyper-parameters.
            model = get_keras_simple_nn()
            # Keep the registry pointing at the most recently fitted network,
            # as it did when the shared instance was fitted in place.
            baselines_map[name] = model
            model.fit(x_train, y_train, epochs=30, batch_size=32)
        else:
            model.fit(x_train, y_train)

        # NOTE(review): the network ends in a sigmoid unit, so its predictions
        # are continuous scores, unlike the other baselines' ``predict``
        # output; presumably calculate_metrics handles both - confirm.
        y_pred = model.predict(x_val).squeeze()
        metrics[name] = calculate_metrics(y_true, y_pred)

    return pd.DataFrame(metrics).T

In [3]:
def load_target_data(target, base_path):
    """Load one target's data table and its two pickled fold definitions.

    Args:
        target: Name of the sub-directory under ``{base_path}/data``.
        base_path: Root directory that contains the ``data`` folder.

    Returns:
        Tuple ``(data_fpath, df, train_val_folds, train_test_folds)`` where
        ``data_fpath`` is the CSV path that was read, ``df`` is that CSV
        indexed by its ``biolab_index`` column, and the two fold objects are
        whatever was stored in ``train_val_folds.pkl`` and
        ``train_test_folds.pkl``.
    """
    data_dir = base_path + f'/data/{target}'
    data_fpath = data_dir + '/data.csv'
    df = pd.read_csv(data_fpath).set_index('biolab_index')

    loaded_folds = []
    for fold_file in ('train_val_folds.pkl', 'train_test_folds.pkl'):
        # NOTE: dill.load can execute arbitrary code while unpickling; only
        # point this at fold files you created or otherwise trust.
        with open(data_dir + '/' + fold_file, "rb") as in_f:
            loaded_folds.append(dill.load(in_f))
    train_val_folds, train_test_folds = loaded_folds
    return data_fpath, df, train_val_folds, train_test_folds


# Single place to change when the repository lives somewhere else.
BASE_PATH = 'C:/Users/tomas/Documents/GitHub/kinase_binding'

target_1 = 'p38'
base_path_1 = BASE_PATH
data_fpath_1, df_p38, train_val_folds_p38, train_test_folds_p38 = load_target_data(target_1, base_path_1)

target_2 = 'akt1'
base_path_2 = BASE_PATH
data_fpath_2, df_akt1, train_val_folds_akt1, train_test_folds_akt1 = load_target_data(target_2, base_path_2)

target_3 = 'pi3k'
base_path_3 = BASE_PATH
data_fpath_3, df_pi3k, train_val_folds_pi3k, train_test_folds_pi3k = load_target_data(target_3, base_path_3)

In [4]:
# Fold-based split: for each target, entry 0 of its train/test fold definition
# is used to select the training rows and entry 1 the held-out rows (kept
# under the ``val_*`` names).
train_p38, val_p38 = (df_p38.loc[train_test_folds_p38[part]] for part in (0, 1))

train_akt1, val_akt1 = (df_akt1.loc[train_test_folds_akt1[part]] for part in (0, 1))

train_pi3k, val_pi3k = (df_pi3k.loc[train_test_folds_pi3k[part]] for part in (0, 1))

In [5]:
# Random 85/15 splits with sklearn, evaluated alongside the fold-based split.
# The seed is fixed so the "random" baseline numbers are reproducible from one
# run of the notebook to the next.
RANDOM_SPLIT_SEED = 42


def make_random_split(df, seed=RANDOM_SPLIT_SEED):
    """Randomly split ``df`` into an 85% training and a 15% validation frame.

    Args:
        df: Frame providing the ``rdkit`` and ``Binary`` columns.
        seed: Value passed to ``train_test_split`` as ``random_state``.

    Returns:
        Tuple ``(train_frame, val_frame)``. Each frame holds the ``rdkit``
        column of its partition plus the matching ``Binary`` labels.
    """
    x_train, x_val, y_train, y_val = train_test_split(
        df.rdkit,
        df.Binary,
        test_size=0.15,
        train_size=0.85,
        shuffle=True,
        random_state=seed,
    )
    train_frame = pd.DataFrame(x_train)
    train_frame['Binary'] = y_train
    val_frame = pd.DataFrame(x_val)
    val_frame['Binary'] = y_val
    return train_frame, val_frame


# NOTE: the df_* frames are rebound with a fresh 0..n-1 index here. The
# fold-based selection uses .loc on the original biolab_index labels, so it
# has to run before this cell (re-run the loading cell first if needed).
df_p38 = df_p38.reset_index(drop=True)
random_train_p38, random_val_p38 = make_random_split(df_p38)

df_akt1 = df_akt1.reset_index(drop=True)
random_train_akt1, random_val_akt1 = make_random_split(df_akt1)

df_pi3k = df_pi3k.reset_index(drop=True)
random_train_pi3k, random_val_pi3k = make_random_split(df_pi3k)

# Position 0: fold-based split, position 1: random split.
training_p38 = [train_p38, random_train_p38]
validation_p38 = [val_p38, random_val_p38]

training_akt1 = [train_akt1, random_train_akt1]
validation_akt1 = [val_akt1, random_val_akt1]

training_pi3k = [train_pi3k, random_train_pi3k]
validation_pi3k = [val_pi3k, random_val_pi3k]

In [6]:
# One result list per target. Entries follow the order of the
# training_*/validation_* lists (fold-based split first, random split second).
metrics_p38, metrics_akt1, metrics_pi3k = [], [], []
_runs_per_target = (
    (metrics_p38, training_p38, validation_p38),
    (metrics_akt1, training_akt1, validation_akt1),
    (metrics_pi3k, training_pi3k, validation_pi3k),
)
for split_idx in range(len(training_pi3k)):
    # Within each split the targets are evaluated in p38 -> akt1 -> pi3k order.
    for collected, train_sets, val_sets in _runs_per_target:
        collected.append(get_baselines_performance(train_sets[split_idx], val_sets[split_idx]))

  0%|                                                                                          | 0/200 [00:00<?, ?it/s]
100%|██████████████████████████████████████████████████████████████████████████| 3050/3050 [00:00<00:00, 127425.49it/s][A

  0%|                                                                                         | 0/3050 [00:00<?, ?it/s][A
 11%|████████▌                                                                    | 338/3050 [00:00<00:00, 3355.88it/s][A
 23%|█████████████████▉                                                           | 710/3050 [00:00<00:00, 3450.03it/s][A
 36%|███████████████████████████▏                                                | 1090/3050 [00:00<00:00, 3542.33it/s][A
 48%|████████████████████████████████████▋                                       | 1471/3050 [00:00<00:00, 3617.94it/s][A
 61%|██████████████████████████████████████████████                              | 1850/3050 [00:00<00:00, 3660.10it/s][A
 74%|█████████████

 43%|████████████████████████████████▍                                           | 1304/3050 [00:00<00:00, 6391.04it/s][A
 64%|████████████████████████████████████████████████▊                           | 1957/3050 [00:00<00:00, 6426.45it/s][A
100%|████████████████████████████████████████████████████████████████████████████| 3050/3050 [00:00<00:00, 6506.27it/s][A
 55%|███████████████████████████████████████████▌                                    | 109/200 [00:05<00:38,  2.34it/s]
100%|██████████████████████████████████████████████████████████████████████████| 3050/3050 [00:00<00:00, 758576.09it/s][A

  0%|                                                                                         | 0/3050 [00:00<?, ?it/s][A
  6%|████▌                                                                        | 180/3050 [00:00<00:01, 1786.36it/s][A
 12%|█████████▏                                                                   | 364/3050 [00:00<00:01, 1798.24it/s][A
 18%|█████████████

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


  _warn_prf(average, modifier, msg_start, len(result))
  0%|                                                                                          | 0/200 [00:00<?, ?it/s]
100%|██████████████████████████████████████████████████████████████████████████| 1834/1834 [00:00<00:00, 306480.48it/s][A

  0%|                                                                                         | 0/1834 [00:00<?, ?it/s][A
 19%|██████████████▍                                                              | 343/1834 [00:00<00:00, 3405.14it/s][A
 38%|█████████████████████████████                                                | 692/1834 [00:00<00:00, 3422.79it/s][A
 56%|██████████████████████████████████████████▊                                 | 1034/1834 [00:00<00:00, 3414.46it/s][A
 75%|████████████████████████████████████████████████████████▉                   | 1375/1834 [00:00<00:00, 3405.65it/s][A
100%|████████████████████████████████████████████████████████████████████████████| 183

100%|████████████████████████████████████████████████████████████████████████████████| 200/200 [00:05<00:00, 38.42it/s]
  0%|                                                                                          | 0/200 [00:00<?, ?it/s]
100%|████████████████████████████████████████████████████████████████████████████| 306/306 [00:00<00:00, 153395.13it/s][A

100%|██████████████████████████████████████████████████████████████████████████████| 306/306 [00:00<00:00, 4092.97it/s][A

  0%|                                                                                          | 0/306 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████████| 306/306 [00:00<00:00, 2855.55it/s][A
  9%|███████▎                                                                         | 18/200 [00:00<00:01, 91.29it/s]
  0%|                                                                                          | 0/306 [00:00<?, ?it/s][A
 38%|██████████████████

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


  _warn_prf(average, modifier, msg_start, len(result))
  0%|                                                                                          | 0/200 [00:00<?, ?it/s]
100%|██████████████████████████████████████████████████████████████████████████| 3217/3217 [00:00<00:00, 248121.14it/s][A

  0%|                                                                                         | 0/3217 [00:00<?, ?it/s][A
 11%|████████▎                                                                    | 349/3217 [00:00<00:00, 3464.71it/s][A
 21%|████████████████▏                                                            | 678/3217 [00:00<00:00, 3402.64it/s][A
 33%|████████████████████████▊                                                   | 1052/3217 [00:00<00:00, 3457.78it/s][A
 45%|█████████████████████████████████▉                                          | 1436/3217 [00:00<00:00, 3435.69it/s][A
 57%|███████████████████████████████████████████▋                                | 184

 45%|██████████████████████████████████▏                                         | 1448/3217 [00:00<00:00, 2871.35it/s][A
 54%|█████████████████████████████████████████                                   | 1739/3217 [00:00<00:00, 2876.59it/s][A
 63%|███████████████████████████████████████████████▉                            | 2027/3217 [00:00<00:00, 2871.32it/s][A
 72%|██████████████████████████████████████████████████████▋                     | 2316/3217 [00:00<00:00, 2870.63it/s][A
 81%|█████████████████████████████████████████████████████████████▌              | 2607/3217 [00:00<00:00, 2879.66it/s][A
100%|████████████████████████████████████████████████████████████████████████████| 3217/3217 [00:01<00:00, 2883.61it/s][A
 32%|█████████████████████████▌                                                       | 63/200 [00:06<01:40,  1.36it/s]
100%|███████████████████████████████████████████████████████████████████████████| 3217/3217 [00:00<00:00, 84025.03it/s][A

100%|█████████████

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


  _warn_prf(average, modifier, msg_start, len(result))
  0%|                                                                                          | 0/200 [00:00<?, ?it/s]
100%|██████████████████████████████████████████████████████████████████████████| 3025/3025 [00:00<00:00, 252759.52it/s][A

  0%|                                                                                         | 0/3025 [00:00<?, ?it/s][A
 12%|█████████                                                                    | 355/3025 [00:00<00:00, 3521.37it/s][A
 24%|██████████████████▎                                                          | 718/3025 [00:00<00:00, 3545.66it/s][A
 35%|██████████████████████████▌                                                 | 1059/3025 [00:00<00:00, 3495.97it/s][A
 46%|███████████████████████████████████                                         | 1394/3025 [00:00<00:00, 3448.38it/s][A
 57%|███████████████████████████████████████████▎                                | 172

 12%|█████████                                                                    | 355/3025 [00:00<00:00, 3087.37it/s][A
 22%|████████████████▋                                                            | 654/3025 [00:00<00:00, 3052.60it/s][A
 31%|████████████████████████▏                                                    | 948/3025 [00:00<00:00, 3015.28it/s][A
 41%|███████████████████████████████▍                                            | 1249/3025 [00:00<00:00, 3007.10it/s][A
 51%|██████████████████████████████████████▉                                     | 1551/3025 [00:00<00:00, 3004.39it/s][A
 61%|██████████████████████████████████████████████▎                             | 1844/3025 [00:00<00:00, 2979.30it/s][A
 71%|█████████████████████████████████████████████████████▊                      | 2143/3025 [00:00<00:00, 2976.00it/s][A
 81%|█████████████████████████████████████████████████████████████▎              | 2440/3025 [00:00<00:00, 2967.68it/s][A
100%|███████████

100%|███████████████████████████████████████████████████████████████████████████████| 200/200 [00:01<00:00, 130.74it/s]
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


  _warn_prf(average, modifier, msg_start, len(result))
  0%|                                                                                          | 0/200 [00:00<?, ?it/s]
100%|██████████████████████████████████████████████████████████████████████████| 1819/1819 [00:00<00:00, 303985.93it/s][A

  0%|                                                                                         | 0/1819 [00:00<?, ?it/s][A
 18%|█████████████▋                                                               | 322/1819 [00:00<00:00, 3196.62it/s][A
 35%|███████████████████████████▎                                                 | 644/1819 [00:00<00:00, 3196.63it/s][A
 55%|█████████████████████████████████████████▉                                  | 1003/1819 [00:00<00:00, 3219.10it/s][A
 73%|███████████████████████████████████████████████████████▍                    | 1326/1819 [00:00<00:00, 3218.65it/s][A
100%|████████████████████████████████████████████████████████████████████████████| 181

 80%|████████████████████████████████████████████████████████████▋               | 1453/1819 [00:00<00:00, 1596.28it/s][A
 89%|███████████████████████████████████████████████████████████████████▎        | 1610/1819 [00:01<00:00, 1586.88it/s][A
100%|████████████████████████████████████████████████████████████████████████████| 1819/1819 [00:01<00:00, 1599.99it/s][A
100%|████████████████████████████████████████████████████████████████████████████████| 200/200 [00:05<00:00, 35.56it/s]
  0%|                                                                                          | 0/200 [00:00<?, ?it/s]
100%|████████████████████████████████████████████████████████████████████████████| 321/321 [00:00<00:00, 321867.46it/s][A

  0%|                                                                                          | 0/321 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████████| 321/321 [00:00<00:00, 1618.26it/s][A
  6%|█████▎          

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


  _warn_prf(average, modifier, msg_start, len(result))
  0%|                                                                                          | 0/200 [00:00<?, ?it/s]
100%|██████████████████████████████████████████████████████████████████████████| 3190/3190 [00:00<00:00, 266562.33it/s][A

  0%|                                                                                         | 0/3190 [00:00<?, ?it/s][A
 13%|█████████▋                                                                   | 399/3190 [00:00<00:00, 3582.39it/s][A
 25%|███████████████████▌                                                         | 810/3190 [00:00<00:00, 3574.91it/s][A
 36%|███████████████████████████▎                                                | 1144/3190 [00:00<00:00, 3494.66it/s][A
 46%|███████████████████████████████████▎                                        | 1481/3190 [00:00<00:00, 3448.55it/s][A
 57%|███████████████████████████████████████████▌                                | 182

 56%|██████████████████████████████████████████▊                                 | 1796/3190 [00:00<00:00, 2912.53it/s][A
 67%|██████████████████████████████████████████████████▉                         | 2136/3190 [00:00<00:00, 2929.40it/s][A
 76%|██████████████████████████████████████████████████████████                  | 2438/3190 [00:00<00:00, 2944.61it/s][A
 85%|████████████████████████████████████████████████████████████████▉           | 2724/3190 [00:00<00:00, 2910.76it/s][A
100%|████████████████████████████████████████████████████████████████████████████| 3190/3190 [00:01<00:00, 2926.88it/s][A
 32%|█████████████████████████▌                                                       | 63/200 [00:06<01:41,  1.35it/s]
100%|███████████████████████████████████████████████████████████████████████████| 3190/3190 [00:00<00:00, 84171.58it/s][A

100%|██████████████████████████████████████████████████████████████████████████| 3190/3190 [00:00<00:00, 213254.97it/s][A

  0%|            

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


  _warn_prf(average, modifier, msg_start, len(result))


In [8]:
# Write the p38 baseline metrics to disk: entry 0 goes to the "test" file and
# entry 1 to the "random" file.
for csv_path, split_idx in (
    ("../../../../Desktop/binding/thesis english/Results/5-Baselines/test_p38.csv", 0),
    ("../../../../Desktop/binding/thesis english/Results/5-Baselines/random_p38.csv", 1),
):
    pd.DataFrame(metrics_p38[split_idx]).to_csv(csv_path)

In [9]:
# Write the akt1 baseline metrics to disk: entry 0 goes to the "test" file and
# entry 1 to the "random" file.
for csv_path, split_idx in (
    ("../../../../Desktop/binding/thesis english/Results/5-Baselines/test_akt1.csv", 0),
    ("../../../../Desktop/binding/thesis english/Results/5-Baselines/random_akt1.csv", 1),
):
    pd.DataFrame(metrics_akt1[split_idx]).to_csv(csv_path)

In [10]:
# Write the pi3k baseline metrics to disk: entry 0 goes to the "test" file and
# entry 1 to the "random" file.
for csv_path, split_idx in (
    ("../../../../Desktop/binding/thesis english/Results/5-Baselines/test_pi3k.csv", 0),
    ("../../../../Desktop/binding/thesis english/Results/5-Baselines/random_pi3k.csv", 1),
):
    pd.DataFrame(metrics_pi3k[split_idx]).to_csv(csv_path)