# Test Script

In [1]:
import os
import torch
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
from joblib import dump,load
from utils import StepwiseDecay as SWD

from sklearn.gaussian_process import GaussianProcessRegressor as GPR
from sklearn.gaussian_process.kernels import ConstantKernel, DotProduct, ExpSineSquared, Matern, RBF
from sklearn.model_selection import GridSearchCV
from torch import nn

from modAL.models import BayesianOptimizer, ActiveLearner
from modAL.acquisition import max_EI, max_PI, max_UCB, optimizer_EI,optimizer_PI, optimizer_UCB

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

In [2]:
# parameters
fingerprint = 'morgan'
model = 'GPR'
file_path = 'data/data_temp/default_{}'.format(fingerprint)
config_file = 'config/'
use_unified_file = True
decay_list = [1.0,0.75,0.5,0.25,0.125,0.1,0.05]

## Helper Functions

In [3]:
try:
    if use_unified_file == True:
        df = pd.read_feather(file_path + "/complete_file_{}.feather".format(fingerprint))
        # df.head()
        # df_smiles = df.iloc[:,9:]
        # df_smiles.head()
        df_nan = pd.read_parquet(file_path + "/assay_id/assay_id_null_file.parquet")
        df_assays = pd.read_parquet(file_path + "/assay_id/assay_id_file.parquet")

    elif use_unified_file == False:
        df_fingerprint = pd.read_parquet(file_path + "/fingerprint/{}_fingerprint_file.parquet".format(fingerprint))
        df = pd.read_parquet(file_path + "/preprocessed/preprocessed_file.parquet")
        df_nan = pd.read_parquet(file_path + "/assay_id/assay_id_null_file.parquet")
        df_assays = pd.read_parquet(file_path + "/assay_id/assay_id_file.parquet")
    else:
        print("Incorrect value for 'use_unified_file' parameter passed. Please recheck.")
        pass
except:
    print("Data File not found")

In [4]:
df_nan.loc[df_nan['squared_pearson_trn'].isnull()]

Unnamed: 0,assay_id,squared_pearson_trn,squared_pearson_tst,assay_length_trn,assay_length_tst,assay_length_total
1,303216,,,45,15,60
2,303260,,,45,15,60
4,737235,,,45,15,60


In [5]:
# df = df.drop(nan_assays)
for i in df_nan.loc[df_nan['squared_pearson_trn'].isnull()]['assay_id']:
    df = df.drop(labels = df.loc[df['assay_id']==i].index)
df.loc[df['assay_id']==303216].head()
print('x-----x-----x-----x')
df.loc[df['assay_id']==303260].head()
print('x-----x-----x-----x')
df.loc[df['assay_id']==737235].head()
print('x-----x-----x-----x')

x-----x-----x-----x
x-----x-----x-----x
x-----x-----x-----x


In [6]:
df.head()

Unnamed: 0,index,CompoundID,assay_id,expt_pIC50,max2‐pQSAR_pIC50,Clustering,smiles,#cmpd,assay_type,target_family,...,smiles_morgan_2038,smiles_morgan_2039,smiles_morgan_2040,smiles_morgan_2041,smiles_morgan_2042,smiles_morgan_2043,smiles_morgan_2044,smiles_morgan_2045,smiles_morgan_2046,smiles_morgan_2047
0,0,CHEMBL305106,157,11.0,10.7912,TRN,O=c1c(Br)ccc2n1CC1CNCC2C1,65,B,Ion channel,...,0,0,0,0,0,0,0,0,0,0
1,61,CHEMBL65324,157,8.6,8.6646,TRN,O=c1c(Cl)cc(Cl)c2n1CC1CNCC2C1,65,B,Ion channel,...,0,0,0,0,0,0,0,0,0,0
2,59,CHEMBL303358,157,7.91,9.0524,TST,C1=C(c2cncnc2)C2CCN1CC2,65,B,Ion channel,...,0,0,0,0,0,0,0,0,0,0
3,58,CHEMBL62858,157,10.77,10.6356,TRN,O=c1c(I)ccc2n1CC1CNCC2C1,65,B,Ion channel,...,0,0,0,0,0,0,0,0,0,0
4,57,CHEMBL305325,157,8.1,7.3667,TRN,C1=C(c2ccnnc2)C2CCC(C1)N2,65,B,Ion channel,...,0,0,0,0,0,0,0,0,0,0


In [7]:
if use_unified_file==True:
    df.iloc[:,9:]
else:
    df_fingerprint.head()

In [8]:
# Loading JSON config file
try:
    with open('config/{}.json'.format(model)) as f:
        params_config = json.load(f)
        print('JSON config file for {} successfully loaded'.format(model))
except FileNotFoundError:
    print('Config file for model {} is missing.Resorting to default params'.format(model))
    with open('config/{}_defaul.json'.format(model)) as f:
        params_config = json.load(f)

# Creating List of Classifiers
clfs = []
acquisition_list = []
clf_list = []
count = 0

for kernel in params_config["kernel"]:
    if kernel == "Matern()":
        clf = GPR(kernel=Matern(length_scale=params_config["matern_params"]["length_scale"],
                                length_scale_bounds=(params_config["matern_params"]["length_scale_bounds"]["low"],params_config["matern_params"]["length_scale_bounds"]["high"]),
                                nu=params_config["matern_params"]["nu"]),
                    alpha=(params_config["alpha"]))
    elif kernel == "RBF()":
        clf = GPR(kernel=RBF(length_scale=params_config["rbf_params"]["length_scale"],
                                length_scale_bounds=(params_config["rbf_params"]["length_scale_bounds"]["low"],params_config["rbf_params"]["length_scale_bounds"]["high"])),
                    alpha=(params_config["alpha"]))
    else:
        clf = GPR(alpha=(params_config["alpha"]))
    clfs.append(clf)

for function in params_config["acquisition"]:
    if function=="max_EI":
        acquisition = max_EI
    elif function=="max_PI":
        acquisition = max_PI
    elif function=="max_UCB":
        acquisition = max_UCB
    else:
        acquisition = max_EI
    acquisition_list.append(acquisition)

for model_selected in clfs:
    for ac_func in acquisition_list:
        clf = {
            "model":model_selected,
            "acquisition_function":ac_func
        }
        clf_list.append(clf)
        clf_list.append("model_{}".format(count))
        count+=1

JSON config file for GPR successfully loaded


In [9]:
clfs

[GaussianProcessRegressor(alpha=0.0001, kernel=Matern(length_scale=1, nu=1.5)),
 GaussianProcessRegressor(alpha=0.0001, kernel=RBF(length_scale=1))]

In [10]:
len(df["assay_id"].unique())

4273

In [11]:
pearson_trn = [[0 for i in range(len(clf_list)+3)] for j in range(len(df["assay_id"].unique()))]
pearson_tst = [[0 for i in range(len(clf_list)+3)] for j in range(len(df["assay_id"].unique()))]

In [12]:
pd.DataFrame(pearson_trn).head()

Unnamed: 0,0,1,2,3,4
0,0,0,0,0,0
1,0,0,0,0,0
2,0,0,0,0,0
3,0,0,0,0,0
4,0,0,0,0,0


In [13]:
df.iloc[:,10:].iloc[[0,1,2]]

Unnamed: 0,smiles_morgan_0,smiles_morgan_1,smiles_morgan_2,smiles_morgan_3,smiles_morgan_4,smiles_morgan_5,smiles_morgan_6,smiles_morgan_7,smiles_morgan_8,smiles_morgan_9,...,smiles_morgan_2038,smiles_morgan_2039,smiles_morgan_2040,smiles_morgan_2041,smiles_morgan_2042,smiles_morgan_2043,smiles_morgan_2044,smiles_morgan_2045,smiles_morgan_2046,smiles_morgan_2047
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
df.iloc[:,3].head()

0    11.00
1     8.60
2     7.91
3    10.77
4     8.10
Name: expt_pIC50, dtype: float32

In [15]:
list_assay = np.sort(df["assay_id"].unique())

In [16]:
list_assay.max()

1642543

In [17]:
if os.path.isdir('models/{0}_{1}/'.format(model,fingerprint)) == False:
    os.mkdir('models/{0}_{1}/'.format(model,fingerprint))

if os.path.isdir('data/data_results/{0}_{1}/'.format(model,fingerprint)) == False:
    os.mkdir('data/data_results/{0}_{1}/'.format(model,fingerprint))

In [18]:
num_iterations=params_config["iterations"]

In [19]:
list_subset_sizes = []
list_total_sizes = []
row = 0
column = 0
flag_first_acc = True
pearson_values_graph = []

try:
    decay_tracker = SWD(decay_list)
    count = 0
    for assay_id in df['assay_id'].unique():
        df_current = df.loc[df['assay_id']==assay_id]
        df_train = df_current.loc[df['Clustering']=='TRN']
        df_tst = df_current.loc[df['Clustering']=='TST']
        column=0
        # if df_train.shape[0] < df_train.shape[1]:
        #     continue
            
        subset_size = int(decay_tracker.calculate(df_train.shape[0])*df_train.shape[0])
        list_subset_sizes.append(subset_size)
        list_total_sizes.append(df_train.shape[0])

        print("\nThe current assay id is {} and initialisation size is {}\n".format(assay_id, subset_size))
        pearson_trn[row][column] = assay_id
        pearson_tst[row][column] = assay_id
        column+=1

        X = np.array(df_train.iloc[:,10:])[:subset_size]
        y = np.array(df_train.iloc[:,3])[:subset_size]

        # X = df_train.iloc[:,10:]
        # y = df_train.iloc[:,3]
        x_initial = X[-(int(df_train.shape[0]*0.1)):-1]
        y_initial = y[-(int(df_train.shape[0]*0.1)):-1]

        # X = np.delete(x_initial, axis=0)
        # y = np.delete(y_initial, axis=0)
        if use_unified_file==True:
            for gpr_model in clf_list:
                print("Data is for model {} with acc_func {}".format(gpr_model["model"], gpr_model["acquisition_function"]))
                clf = gpr_model["model"]
                learner = BayesianOptimizer(
                    estimator=clf,
                    query_strategy=gpr_model["acquisition_function"],
                    X_training=x_initial, y_training=y_initial
                )

                for n_query in range(num_iterations):
                    if flag_first_assay ==True:
                        predictions_first_assay = learner.predict(np.array(df_train.iloc[:,10:]))
                        pearson_values_graph.append(np.corrcoef(np.array(df_train.iloc[:,3]), predictions_first_assay)[0,1]**2)
                    query_idx,query_inst = learner.query(X, n_instances=20)
                    # print("Iteration num = {} || Query index = {} || Y val = {}".format(n_query,query_idx, y[query_idx]))

                    learner.teach(X=query_inst,y=y[query_idx],only_new=True)
                
                predictions = learner.predict(np.array(df_train.iloc[:,10:]))
                result_trn = np.corrcoef(np.array(df_train.iloc[:,3]), predictions)[0,1]**2

                predictions = learner.predict(np.array(df_tst.iloc[:,10:]))
                result_tst = np.corrcoef(np.array(df_tst.iloc[:,3]), predictions)[0,1]**2

                pearson_trn[row][column] = subset_size
                pearson_test[row][column] = subset_size
                column+=1

                pearson_trn[row][column] = df_train.shape[0]
                pearson_test[row][column] = df_test.shape[0]
                column+=1

                pearson_trn[row][column] = result_trn
                pearson_tst[row][column] = result_tst
                column+=1
                flag_first_assay=False
        else:
            print("Feature is in the works.")   
        row+=1
        
        # print(df_train.shape)
        # print('The initialisation size is set as {}'.format(init_size))
        count+=1
        if count >25:
            break
except NameError:
    print('Key Error')


The current assay id is 157 and initialisation size is 48

Data is for model GaussianProcessRegressor(alpha=0.0001, kernel=Matern(length_scale=1, nu=1.5)) with acc_func <function max_EI at 0x0000028177EFF280>
Iteration num = 0 || Query index = [18 23  2  3  4 43 41  7 33  9 10 25 12 13 14 24 19 47 31 30] || Y val = [10.57  6.68 10.77  8.1  10.35  9.59  9.77 10.29  9.15 11.1  10.6   8.7
 10.55  9.08  8.43  8.8   9.82  6.4   7.51  5.38]
Iteration num = 1 || Query index = [42 37 35 46 21 20 39 44  8  9  0 15  1 40 45 29 38 22  5  6] || Y val = [ 9.91 10.44  9.64 10.65  9.28  9.06  8.6  10.59  8.81 11.1  11.   10.55
  8.6   7.97  9.51  5.78  8.24  9.26  8.05  8.02]
Iteration num = 2 || Query index = [ 9 10  2  3  4 18 13 12 11 26 31 32 34 36 30 47 19 25 14 33] || Y val = [11.1  10.6  10.77  8.1  10.35 10.57  9.08 10.55  7.72  8.    7.51  9.85
  9.18  7.92  5.38  6.4   9.82  8.7   8.43  9.15]
Iteration num = 3 || Query index = [ 9 21 46 37 44 15  0 35  1 39 20 42  8 38  5 41 40 45 16 29] |

In [20]:
for 
pd.DataFrame(pearson_trn).to_csv('data/data_results/{0}_{1}/pearsons_training_set_{2}.csv'.format(model,fingerprint,num_iterations),index=False)
pd.DataFrame(pearson_tst).to_csv('data/data_results/{0}_{1}/pearsons_test_set_{2}.csv'.format(model,fingerprint,num_iterations),index=False)
# with open('data/data_predictions/{0}_{1}/model_list.json'.format(model,fingerprint), 'w') as outfile:
#     json.dump(clf_list,outfile)