In [None]:
# Index files from the PDBbind database -- set these to the paths of your local copies.
indexCoreSet = "CoreSet.dat"
indexRefinedSet = "INDEX_refined_data.2020"
indexFullSet = "INDEX_general_PL_data.2020"

# Tab-separated PDB info file that is scanned for kinase entries.
pdbInfoPath = "PDBinfo.txt"

# Folder containing the PLEC fingerprints in ".npy" format, each file named after its pdbID.
dataDir = "PLECS_65536"

# Change to True for training without kinases in the training data.
kinaseFilter = False

In [None]:
def parseIndexFile(indexFilePath):
    """Parse a whitespace-separated index file into IDs and their numeric values.

    Comment lines (first token starting with '#') and lines with fewer than
    five fields (blank or truncated lines) are skipped. A record is kept only
    when its fifth field starts with 'Ki=' or 'Kd='.

    Args:
        indexFilePath: Path to the index file to read.

    Returns:
        A tuple ``(pdbIDs, logKvalues)``: ``pdbIDs`` is the list of first
        fields of the kept records in file order, and ``logKvalues`` maps each
        of those IDs to its fourth field converted to ``float``.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the fourth field of a kept record is not a number.
    """
    pdbIDs = []
    logKvalues = {}
    with open(indexFilePath, "r") as index_file:
        for line in index_file:
            # Split once and reuse the fields instead of re-splitting per access.
            fields = line.split()
            # Blank/short lines used to raise IndexError on fields[4]; skip them.
            if len(fields) < 5 or fields[0].startswith('#'):
                continue
            if fields[4].startswith(('Ki=','Kd=')): #remove if training on full set with IC50 data
                pdbID = fields[0]
                pdbIDs.append(pdbID)
                logKvalues[pdbID] = float(fields[3])
    return pdbIDs, logKvalues

def filterKinases(pdbInfo):
  """Collect the IDs whose description mentions "kinase".

  The file is read as tab-separated text: the first column is the ID and the
  second column is searched case-insensitively for the substring "kinase".
  Lines without a second column (e.g. blank lines) are skipped.

  Args:
    pdbInfo: Path to the tab-separated info file.

  Returns:
    List of matching IDs in order of first appearance, without duplicates.

  Raises:
    OSError: If the file cannot be opened.
  """
  kinases = []
  seen = set()  # constant-time duplicate check; the list keeps first-seen order
  with open(pdbInfo) as file:
    for line in file:
      columns = line.split("\t")
      # A line with no tab has no description column; indexing it used to raise IndexError.
      if len(columns) < 2:
        continue
      pdbID, description = columns[0], columns[1]
      if "kinase" in description.lower() and pdbID not in seen:
        seen.add(pdbID)
        kinases.append(pdbID)
  return kinases

# Parse the three index files: each call yields the list of IDs plus an ID -> value mapping.
refinedIndex, logK = parseIndexFile(indexFilePath=indexRefinedSet)
coreIndex, logKcore = parseIndexFile(indexFilePath=indexCoreSet)
fullIndex, logKFull = parseIndexFile(indexFilePath=indexFullSet)

In [None]:
import random
import os
import numpy as np
import tensorflow as tf
from keras import callbacks
seed = 46
plec_size = 65536 #change to plec size 16384 for small plecs
model_path = f"mlp_model_{plec_size}" # name of the saved model file

# Seed every random number generator in play. Seeding only `random` fixes the
# shuffle below but leaves NumPy/TensorFlow randomness (e.g. weight
# initialisation) different from run to run.
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)

trainData = []
testData = []
trainValues = []
testValues = []
testIDs = []
trainIDs = []

# Sets give constant-time membership tests inside the loading loop.
coreSet = set(coreIndex)
trainingPool = set(refinedIndex) # change to fullIndex for training on the filtered general set


def _collectSamples(pdbIDs, isTest, isTrain):
    """Append each usable complex in pdbIDs to the test or the train lists.

    A complex goes to the test lists when isTest(pdbID) is true, otherwise to
    the train lists when isTrain(pdbID) is true; anything else is ignored.
    The fingerprint file is only touched for complexes that are actually used,
    and complexes without a fingerprint file are skipped.
    """
    for pdbID in pdbIDs:
        if isTest(pdbID):
            dataList, valueList, idList = testData, testValues, testIDs
        elif isTrain(pdbID):
            dataList, valueList, idList = trainData, trainValues, trainIDs
        else:
            continue
        plecPath = os.path.join(dataDir, f"{pdbID}.npy")
        if not os.path.isfile(plecPath):
            continue
        dataList.append(np.load(plecPath))
        valueList.append(logKFull[pdbID])
        idList.append(pdbID)


if not kinaseFilter:
    # Default split: core set is the test set, the training pool minus the core set is used for training.
    _collectSamples(
        fullIndex,
        isTest=lambda pdbID: pdbID in coreSet,
        isTrain=lambda pdbID: pdbID in trainingPool,
    )
else:
    # Kinase hold-out: kinases of the refined set are the test set; the remaining
    # refined-set complexes that are not in the core set are used for training.
    kinases = filterKinases(pdbInfoPath)
    kinaseSet = set(kinases)
    _collectSamples(
        refinedIndex,
        isTest=lambda pdbID: pdbID in kinaseSet,
        isTrain=lambda pdbID: pdbID not in coreSet,
    )

# Shuffle fingerprints, values and IDs together so the three lists stay aligned.
# random.shuffle only depends on the list length, so the resulting order of
# trainData/trainValues is the same as when shuffling (plec, value) pairs.
data = list(zip(trainData, trainValues, trainIDs))
random.shuffle(data)
trainData = [d[0] for d in data]
trainValues = [d[1] for d in data]
trainIDs = [d[2] for d in data]

# The first tenth of the shuffled training samples becomes the validation set.
splitSize = len(trainData) // 10

validData = np.array(trainData[:splitSize])
validValues = np.array(trainValues[:splitSize])
validIDs = trainIDs[:splitSize]

trainData = np.array(trainData[splitSize:])
trainValues = np.array(trainValues[splitSize:])
trainIDs = trainIDs[splitSize:]  # now row-aligned with trainData / trainValues
testData = np.array(testData)
testValues = np.array(testValues)


In [None]:
size = 200  # number of units in each hidden layer


def neuralNet():
  """Build and compile the regression network.

  The network is a Sequential stack of three ReLU Dense layers with `size`
  units each, followed by a single-unit Dense output layer. It is compiled
  with the Adam optimizer and a mean-squared-error loss.

  Returns:
    The compiled tf.keras.Sequential model.
  """
  hidden_layers = [tf.keras.layers.Dense(size, activation='relu') for _ in range(3)]
  output_layer = tf.keras.layers.Dense(1)

  network = tf.keras.Sequential(hidden_layers + [output_layer])
  network.compile(optimizer = "adam", loss = "mean_squared_error")
  return network

# Build a fresh network for this training run.
model = neuralNet()

# Take the callback from tf.keras, the same namespace the model is built from,
# instead of the standalone `keras` import, so model and callback always come
# from the same Keras implementation.
# Training stops once val_loss has not improved for 50 epochs, and the weights
# of the best epoch are restored.
earlystopping = tf.keras.callbacks.EarlyStopping(monitor="val_loss", mode = "min", patience = 50, restore_best_weights=True)

# Train for at most 400 epochs in batches of 50, validating on the held-out split after each epoch.
model.fit(trainData,trainValues,epochs = 400, batch_size = 50, validation_data = (validData,validValues), shuffle=True, callbacks = [earlystopping])

# NOTE(review): model_path has no file extension; newer Keras releases may
# require a ".keras" or ".h5" suffix here -- confirm against the installed version.
model.save(model_path)


def statistics(data,values,estimator=None):
  """Evaluate predictions against reference values.

  Args:
    data: Input samples passed unchanged to the estimator's ``predict``.
    values: Reference values, one per sample.
    estimator: Optional object with a ``predict(data)`` method. When omitted,
      the module-level ``model`` is used.

  Returns:
    A tuple ``(rmse, pearson, predictions)``: the root-mean-square error, the
    Pearson correlation coefficient between predictions and ``values``, and
    the predictions as a flat list of Python floats.
  """
  predictor = model if estimator is None else estimator
  # Flatten the prediction array in one step; calling float() on each
  # one-element row is deprecated in recent NumPy versions.
  predicted = np.asarray(predictor.predict(data), dtype=float).reshape(-1)
  observed = np.asarray(values, dtype=float).reshape(-1)
  # Plain NumPy RMSE: no dependence on the (y_true, y_pred) argument order of a loss object.
  rmse = np.sqrt(np.mean((predicted - observed) ** 2))
  pearson = np.corrcoef(predicted, observed)[0][1]
  predictions = predicted.tolist()
  return rmse, pearson, predictions


# Score the trained model on the held-out test set and report the results.
rmse, pearson, predictions = statistics(data=testData, values=testValues)

print(f"RMSE: {rmse}")
print(f"Pearson's r: {pearson}")
print(predictions)