# Test Script

In [None]:
import os
import torch
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
from joblib import dump,load
from utils import StepwiseDecay as SWD
import pickle
import random
import time

# from sklearn.gaussian_process import GaussianProcessRegressor as GPR
# from sklearn.gaussian_process.kernels import ConstantKernel, DotProduct, ExpSineSquared, Matern, RBF
from sklearn.decomposition import TruncatedSVD
# from sklearn.model_selection import GridSearchCV
from torch import nn
import torch
from nnr_custom import Torch_Model_BasicRegressor as NNR

from modAL.models import BayesianOptimizer, ActiveLearner
from modAL.acquisition import max_EI, max_PI, max_UCB, optimizer_EI,optimizer_PI, optimizer_UCB

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

In [None]:
# parameters
fingerprint = 'morgan'
model = 'NNR'
file_name = "complete_file_morgan.feather"
config_file_name = 'NNR.json'
use_unified_file = True
decay_list = [1.0,0.75,0.5,0.25,0.125,0.1,0.05]
use_decomposition = False

## Helper Functions

In [None]:
random.seed(42)
file_path = '/data_temp/default_{}/'.format(fingerprint)

try:
    if use_unified_file == True:
        try:
            df = pd.read_feather("../data" + file_path + file_name)
        except:
            df = pd.read_parquet("../data" + file_path + file_name)
        df_nan = pd.read_parquet("../data" + file_path + "assay_id/assay_id_null_file.parquet")
        df_assays = pd.read_parquet("../data" + file_path + "assay_id/assay_id_file.parquet")

    elif use_unified_file == False:
        df_fingerprint = pd.read_parquet("../" + file_path + "/fingerprint/{}_fingerprint_file.parquet".format(fingerprint))
        df = pd.read_parquet("../" + file_path + "/preprocessed/preprocessed_file.parquet")
        df_nan = pd.read_parquet("../" + file_path + "/assay_id/assay_id_null_file.parquet")
        df_assays = pd.read_parquet("../" + file_path + "/assay_id/assay_id_file.parquet")
    else:
        print("Incorrect value for 'use_unified_file' parameter passed. Please recheck.")
        pass
except:
    print("Data File not found")

In [None]:
df_nan.loc[df_nan['squared_pearson_trn'].isnull()]

Unnamed: 0,assay_id,squared_pearson_trn,squared_pearson_tst,assay_length_trn,assay_length_tst,assay_length_total
1,303216,,,45,15,60
2,303260,,,45,15,60
4,737235,,,45,15,60


In [None]:
# df = df.drop(nan_assays)
for i in df_nan.loc[df_nan['squared_pearson_trn'].isnull()]['assay_id']:
    df = df.drop(labels = df.loc[df['assay_id']==i].index)
df.loc[df['assay_id']==303216].head()
print('x-----x-----x-----x')
df.loc[df['assay_id']==303260].head()
print('x-----x-----x-----x')
df.loc[df['assay_id']==737235].head()
print('x-----x-----x-----x')

x-----x-----x-----x
x-----x-----x-----x
x-----x-----x-----x


In [None]:
# Loading JSON config file
try:
    with open('../config/' + config_file_name) as f:
        params_config = json.load(f)
        print('JSON config file for {} successfully loaded'.format(model))
except FileNotFoundError:
    print('Config file for model {} is missing.Resorting to default params'.format(model))
    with open('../config/{}_default.json'.format(model)) as f:
        params_config = json.load(f)

if os.path.isdir('../models/{0}_{1}/'.format(model,fingerprint)) == False:
    os.mkdir('../models/{0}_{1}/'.format(model,fingerprint))

if os.path.isdir('../data/data_results/{0}_{1}/'.format(model,fingerprint)) == False:
    os.mkdir('../data/data_results/{0}_{1}/'.format(model,fingerprint))

if os.path.isdir('../data/data_results/{0}_{1}/plots'.format(model,fingerprint)) == False:
    os.mkdir('../data/data_results/{0}_{1}/plots'.format(model,fingerprint))

# Creating List of Classifiers
clfs = []
acquisition_list = []
clf_list = []
count = 0
clf_list_names = []

for neural_net in params_config["kernel"]:
    if neural_net == "BasicNNR()":
        clf_type = NNR()
    else:
        clf_type = NNR()
    clf = {
        "type" : clf_type,
        "name" : neural_net
    }
    clfs.append(clf)

for function in params_config["acquisition"]:
    if function=="max_EI":
        acquisition_type = max_EI
    elif function=="max_PI":
        acquisition_type = max_PI
    elif function=="max_UCB":
        acquisition_type = max_UCB
    else:
        acquisition_type = max_EI
    acquisition = {
        "type" : acquisition_type,
        "name": function
    }
    acquisition_list.append(acquisition)

if params_config["optimizer"] == "Adam":
    default_optimizer = torch.optim.Adam(lr = params_config)
for model_selected in clfs:
    for ac_func in acquisition_list:
        clf = {
            "model" :model_selected["type"],
            "model_name" : model_selected["name"],
            "acquisition_function" :ac_func["type"],
            "acquisition_function_name" : ac_func["name"]
        }
        clf_list.append(clf)
        clf_list_names.append("model_{}_{}".format(clf["model_name"], clf["acquisition_function_name"]))
        count+=1

clf_list_names = ['assay_id','subset_size_trn','total_length'] + clf_list_names

num_iterations=params_config["iterations"]

JSON config file for GPR successfully loaded


In [None]:
pearson_trn = [[0 for i in range(len(clf_list)+3)] for j in range(len(df["assay_id"].unique()+1))]
pearson_tst = [[0 for i in range(len(clf_list)+3)] for j in range(len(df["assay_id"].unique()+1))]

pearson_values_graph = [[0 for i in range(len(acquisition_list))] for j in range(num_iterations)]


In [None]:
import warnings
warnings.simplefilter('ignore')

In [None]:
list_subset_sizes = []
list_total_sizes = []
row = 0
column = 0
pvg_column = 0
pvg_row = 0

flag_first_assay = 688239
# flag_first_assay = 517
# flag_first_assay = 70695

first_start = time.time()
# try:
decay_tracker = SWD(decay_list)
count = 0
device = "cuda" if torch.cuda.is_available() else "cpu"

for assay_id in df['assay_id'].unique():
    if assay_id == np.int64(flag_first_assay):
        print('Reached selected assay at time = {}'.format(time.time() - first_start))
    
    df_current = df.loc[df['assay_id']==assay_id]
    df_train = df_current.loc[df['Clustering']=='TRN']
    df_tst = df_current.loc[df['Clustering']=='TST']
    column=0
    pvg_column = 0
    # if df_train.shape[0] < 100:
    #     continue
    # if assay_id != np.int64(flag_first_assay):
    #     continue
    start = time.time()

    subset_size = int(decay_tracker.calculate(df_train.shape[0])*df_train.shape[0])
    list_subset_sizes.append(subset_size)
    list_total_sizes.append(df_train.shape[0])
    if assay_id == np.int64(flag_first_assay):
        print("\nThe current assay id is {} and initialisation size is {}\n".format(assay_id, subset_size))
    pearson_trn[row][column] = assay_id
    pearson_tst[row][column] = assay_id
    column+=1

    X = torch.tensor(df_train.iloc[:,10:]).to(device)[:subset_size]
    y = torch.tensor(df_train.iloc[:,3]).to(device)[:subset_size]

    start_2 = time.time()

    # X = torch.tensor(df_train.iloc[:,10:]).to(device)[:subset_size]
    # y = torch.tensor(df_train.iloc[:,3]).to(device)[:subset_size]
    
    if use_decomposition == True:
        transformer = TruncatedSVD(n_components=10, random_state=42).fit(X)
        X = transformer.transform(X)
        with open('../models/{0}_{1}/transformer_{2}.pickle'.format(model,fingerprint,flag_first_assay),'wb') as f:
                pickle.dump(transformer,f)

    print("Subset selected in {}".format(time.time() - start_2))
    # # train_idx = np.random.choice(range(X.shape[0]), size=int(X.shape[0]*0.1), replace=False)
    # train_idx = X.multinomial(num_samples=int(X.shape[0]*0.01), replacement=False)
    # x_initial = X[train_idx]
    # y_initial = y[train_idx]
    # print("Selecte initial training subset {}".format(time.time() - start_2))
    # # x_initial = X[-(int(df_train.shape[0]*0.1)):-1]
    # # y_initial = y[-(int(df_train.shape[0]*0.1)):-1]

    # X = np.delete(X,train_idx, axis=0)
    # y = np.delete(y,train_idx)

    # X = np.array(pd.DataFrame(X).reset_index(drop=True))
    # y = np.array(pd.DataFrame(y).reset_index(drop=True))

    pearson_trn[row][column] = subset_size
    pearson_tst[row][column] = subset_size
    column+=1

    pearson_trn[row][column] = df_train.shape[0]
    pearson_tst[row][column] = df_tst.shape[0]
    column+=1

    # if flag_first_assay==True:
    #     [[0 for i in range(len(acquisition_list))] for j in range(num_iterations)]
    
    if use_unified_file==True:
        for gpr_model in clf_list:
            if assay_id==np.int64(flag_first_assay):
                print("Data is for model {} with acc_func {}".format(gpr_model["model_name"], gpr_model["acquisition_function_name"]))
            clf = gpr_model["model"]
            print("Before model creation : {}".format(time.time() - start_2))
            learner = BayesianOptimizer(
                estimator=clf,
                query_strategy=gpr_model["acquisition_function"],
                X_training=x_initial, y_training=y_initial
            )
            print("After model creation and training {}".format(time.time() - start_2))
            pvg_row=0
            for n_query in range(num_iterations):
                if assay_id==np.int64(flag_first_assay):
                    # first_assay = assay_id
                    if use_decomposition == True:
                        with open('../models/{0}_{1}/transformer_{2}.pickle'.format(model,fingerprint,flag_first_assay),'rb') as f:
                            transformer = pickle.load(f)
                        predictions_first_assay = learner.predict(transformer.transform(np.array(df_train.iloc[:,10:])))
                    else:
                        predictions_first_assay = learner.predict(np.array(df_train.iloc[:,10:]))
                    pearson_values_graph[pvg_row][pvg_column] = round(np.corrcoef(np.array(df_train.iloc[:,3]), predictions_first_assay)[0,1]**2,5)
                    pvg_row+=1
                try:
                    query_idx,query_inst = learner.query(X, n_instances=20)
                except AssertionError:
                    print("Encountered a case where the number of intances is lower than utility")
                    # continue
                # print("Iteration num = {} || Query index = {} || Y val = {}".format(n_query,query_idx, y[query_idx]))
                print("Time taken to query {}".format(time.time() - start_2))
                learner.teach(X=query_inst,y=y[query_idx])
                # print(query_idx)
                # X = np.delete(X,query_idx, axis=0)
                # y = np.delete(y,query_idx.astype(int))
            print('Training Process for acc func {} has been completed in '.format(gpr_model["acquisition_function_name"],time.time()-start))    
            # specific_model_count = 1
            
            with open('../models/{0}_{1}/{2}_{3}_{4}.pickle'.format(model,fingerprint,gpr_model["model_name"],gpr_model["acquisition_function_name"],flag_first_assay),'wb') as f:
                pickle.dump(learner,f)
            
            # with open('../models/{0}_{1}/{2}_{3}_{4}.pickle'.format(model,fingerprint,gpr_model["model_name"],gpr_model["acquisition_function_name"],assay_id),'wb') as f:
            #      learner = pickle.load(f)
            if use_decomposition == True:
                with open('../models/{0}_{1}/transformer_{2}.pickle'.format(model,fingerprint,flag_first_assay),'rb') as f:
                    transformer = pickle.load(f)    
                predictions = learner.predict(transformer.transform(np.array(df_train.iloc[:,10:])))
            else:
                predictions = learner.predict(np.array(df_train.iloc[:,10:]))
            result_trn = round(np.corrcoef(np.array(df_train.iloc[:,3]), predictions)[0,1]**2,5)

            if use_decomposition == True:
                with open('../models/{0}_{1}/transformer_{2}.pickle'.format(model,fingerprint,flag_first_assay),'rb') as f:
                    transformer = pickle.load(f)     
                predictions = learner.predict(transformer.transform(np.array(df_tst.iloc[:,10:])))
            else:
                predictions = learner.predict(np.array(df_tst.iloc[:,10:]))
            result_tst = round(np.corrcoef(np.array(df_tst.iloc[:,3]), predictions)[0,1]**2,5)

            pearson_trn[row][column] = result_trn
            pearson_tst[row][column] = result_tst
            
            column+=1
            pvg_column+=1
    else:
        print("Feature is in the works.")   
    row+=1
    count+=1
    if count >= 3:
            break
print('Reached selected assay at time = {}'.format(time.time() - start))
# except NameError:
#     print('Key Error')

Reached selected assay at time = 251.78277468681335

The current assay id is 688239 and initialisation size is 3299

Subset selected in 0.9444200992584229
Selecte initial training subset 0.9454185962677002
Data is for model Matern() with acc_func max_UCB
Before model creation : 0.946418046951294
After model creation and training 1.0214197635650635
Time taken to query 3.726145029067993
Time taken to query 6.578185319900513
Time taken to query 9.352575302124023
Time taken to query 12.163638591766357
Time taken to query 15.3292555809021
Time taken to query 18.474580764770508
Time taken to query 21.58100438117981
Time taken to query 24.700552225112915
Time taken to query 27.998998641967773
Time taken to query 31.433273792266846
Time taken to query 34.93605351448059
Time taken to query 38.69040560722351
Time taken to query 42.30279755592346
Time taken to query 46.196889877319336
Time taken to query 50.230305194854736
Time taken to query 54.25097441673279
Time taken to query 58.3738377094268

In [None]:
print('Reached selected assay at time = {}'.format(time.time() - start))

Reached selected assay at time = 785.049289226532


# row+=1
for i in range(3,column):
    # value_trn = np.mean(np.array(pearson_trn)[:,i])
    # value_tst = np.mean(np.array(pearson_trn)[:,i])
    
    column_trn = np.array(pearson_trn)[:,i]
    column_trn = column_trn[:row]

    column_tst = np.array(pearson_tst)[:,i]
    column_tst = column_tst[:row]

    pearson_trn[row][i] = np.mean(column_trn)
    pearson_tst[row][i] = np.mean(column_tst)

In [None]:
pearson_values_graph

[[3e-05, 3e-05, 3e-05],
 [6e-05, 5e-05, 4e-05],
 [4e-05, 6e-05, 6e-05],
 [8e-05, 7e-05, 0.0001],
 [0.0001, 9e-05, 9e-05],
 [9e-05, 0.00012, 0.00014],
 [0.00012, 0.00017, 0.00015],
 [0.00011, 0.00015, 0.00015],
 [0.00011, 0.00014, 0.00015],
 [0.00011, 0.00019, 0.00014],
 [0.00011, 0.00014, 0.00012],
 [0.00012, 0.00016, 0.00012],
 [0.00011, 0.00016, 0.00015],
 [0.00011, 0.00017, 0.00018],
 [0.00012, 0.0002, 0.00024],
 [0.00012, 0.00018, 0.00023],
 [0.00011, 0.0002, 0.00029],
 [0.00011, 0.0003, 0.00034],
 [0.00011, 0.00034, 0.00033],
 [0.00011, 0.00037, 0.00038],
 [0.0001, 0.00035, 0.00037],
 [0.00011, 0.00037, 0.00035],
 [0.00011, 0.00042, 0.00035],
 [0.00011, 0.00042, 0.00039],
 [0.00011, 0.00045, 0.0004]]

In [1]:
pd.DataFrame(pearson_trn, columns=clf_list_names).to_csv('../data/data_results/{0}_{1}/pearsons_training_set_{2}.csv'.format(model,fingerprint,num_iterations),index=False)
pd.DataFrame(pearson_tst, columns=clf_list_names).to_csv('../data/data_results/{0}_{1}/pearsons_test_set_{2}.csv'.format(model,fingerprint,num_iterations),index=False)

pearson_values_graph = pd.DataFrame(pearson_values_graph, columns=params_config["acquisition"])
pearson_values_graph.to_csv('../data/data_results/{0}_{1}/squared_pearson_{2}.csv'.format(model,fingerprint,flag_first_assay),index=False)

colours = ['-r','--b',':g','^y','+p']
colour_count = 0
font_custom = {
    "family" : "sans-serif",
    "color" : "darkblue",
    "size" : "10"
    }

plt.title("Training results for assay {}".format(flag_first_assay), fontdict=font_custom, loc='center')
plt.xlabel("Iteration number", fontdict=font_custom)
plt.ylabel("Pearson's coefficient values", fontdict=font_custom)


for (column_name,column_contents) in pearson_values_graph.iteritems():
    plt.plot([x for x in range(len(column_contents))],column_contents, colours[colour_count], label='{}'.format(column_name))
    colour_count +=1
plt.grid(color = 'lightgreen', linestyle = '--', linewidth =0.25)
plt.legend()
plt.savefig('../data/data_results/{0}_{1}/plots/training_cycles_{2}.jpg'.format(model,fingerprint,flag_first_assay),
                format='jpg',
               )
plt.show()

NameError: name 'pd' is not defined