In [None]:
# Extract the fingerprint archive into ./data.
# NOTE(review): the source path looks like a mounted Google Drive (Colab) — confirm
# the drive is mounted before this cell runs.
# NOTE(review): files land under data/, while the loading cell reads from the
# relative folder "PLECS_16384" (dataDir) — confirm the two locations line up.
# -p: do not fail when the directory already exists, so the cell can be re-run.
!mkdir -p data
# -n: never overwrite files that were already extracted, so a re-run does not
# stop at unzip's "replace?" prompt.
!unzip -n drive/MyDrive/PLECS_16384.zip -d data/

In [None]:
import random
import os
import numpy as np
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.feature_selection import r_regression


# Paths of the index files that are read with parseIndexFile further down.
# IDs from the refined-set file select training samples and IDs from the
# core-set file select test samples; affinity values come from the full-set file.
# NOTE(review): the names look like PDBbind v2020 index files plus a core-set
# list — confirm the source and version of these files.
indexFullSet = "INDEX_general_PL_data.2020" 
indexRefinedSet = "INDEX_refined_data.2020"
indexCoreSet = "CoreSet.dat"
# NOTE(review): the first cell unzips the archive into data/, but this is a bare
# relative folder name — confirm the fingerprints are really reachable here,
# because missing files are skipped silently when the data is loaded.
dataDir = "PLECS_16384"  # path to folder containing plec fingerprints in ".npy" format with the pdbID as the name


def parseIndexFile(indexFilePath):
    """Collect the Ki/Kd entries of a whitespace-separated index file.

    Lines starting with ``#`` are treated as comments. Every other line is
    split on whitespace: field 0 is the entry ID, field 3 is the numeric
    value stored for that ID and field 4 is the measurement label. Only
    entries whose label starts with ``Ki=`` or ``Kd=`` are kept.
    (Presumably field 3 is the -log Kd/Ki value — confirm against the header
    of the index file.)

    Blank lines and lines with fewer than five fields are skipped instead of
    raising ``IndexError``.

    Parameters
    ----------
    indexFilePath : str
        Path of the index file to read.

    Returns
    -------
    pdbIDs : list of str
        IDs of the kept entries, in file order.
    logKvalues : dict
        Maps each kept ID to its field-3 value as ``float``.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If field 3 of a kept entry is not a valid float.
    """
    pdbIDs = []
    logKvalues = {}
    with open(indexFilePath, "r") as index_file:
        for line in index_file:
            if line.startswith('#'):
                continue
            # Split once and reuse the fields for every lookup below.
            fields = line.split()
            # Blank or truncated lines carry no measurement label to inspect.
            if len(fields) < 5:
                continue
            if fields[4].startswith(('Ki=', 'Kd=')):
                pdbID = fields[0]
                pdbIDs.append(pdbID)
                logKvalues[pdbID] = float(fields[3])
    return pdbIDs, logKvalues

# Parse the three index files. Only the ID lists of the refined and core sets
# are used below; the values are always looked up in the full-set mapping.
refinedIndex, logK = parseIndexFile(indexRefinedSet)
coreIndex, logKcore = parseIndexFile(indexCoreSet)
fullIndex, logKFull = parseIndexFile(indexFullSet)

# Sets give constant-time membership tests inside the loop over the full index.
coreIDs = set(coreIndex)
refinedIDs = set(refinedIndex)

trainData = []
testData = []
trainValues = []
testValues = []
testIDs = []
trainIDs = []

for pdbID in fullIndex:
    inCore = pdbID in coreIDs
    # Entries that belong to neither set are never used, so do not load them.
    if not inCore and pdbID not in refinedIDs:
        continue
    plecPath = f"{dataDir}/{pdbID}.npy"
    # Entries without a fingerprint file are skipped silently.
    if not os.path.isfile(plecPath):
        continue
    plec = np.load(plecPath)
    if inCore:
        # Core-set entries form the test set and are kept out of training.
        testData.append(plec)
        testValues.append(logKFull[pdbID])
        testIDs.append(pdbID)
    else:
        trainData.append(plec)
        trainValues.append(logKFull[pdbID])
        trainIDs.append(pdbID)

# Fail here with a clear message rather than later inside the model code.
if not trainData or not testData:
    raise ValueError(
        f"No fingerprints loaded for the training and/or test set "
        f"(train={len(trainData)}, test={len(testData)}); "
        f"check that '{dataDir}' contains the <pdbID>.npy files."
    )

# Shuffle the training set with one shared permutation so that trainData,
# trainValues and trainIDs stay aligned entry by entry. A dedicated seeded
# generator makes the order reproducible without touching the global
# `random` state.
shuffleSeed = 0
order = list(range(len(trainData)))
random.Random(shuffleSeed).shuffle(order)
trainData = [trainData[i] for i in order]
trainValues = [trainValues[i] for i in order]
trainIDs = [trainIDs[i] for i in order]
# (fingerprint, value) pairs in shuffled order, kept under the existing name.
data = list(zip(trainData, trainValues))

testData = np.array(testData)
testValues = np.array(testValues)


In [None]:
# Fit a support vector regressor with scikit-learn's default settings on the
# training fingerprints and predict the values of the test set.
regr = SVR()
regr.fit(trainData, trainValues)
predictions = regr.predict(testData)

# scikit-learn metrics take their arguments as (y_true, y_pred).
rmse = np.sqrt(mean_squared_error(testValues, predictions))
r2 = r2_score(testValues, predictions)
# r_regression returns one Pearson coefficient per column of its first
# argument; a single column is passed here, so take the only element.
pearsonR = r_regression(testValues.reshape(-1, 1), predictions)[0]

print("RMSE:", rmse)
print("R^2:", r2)
print("Pearson r:", pearsonR)
print(predictions)