In [28]:
import pandas as pd

from acquirer.acquirer import *
from acquirer.acquirer import *
from model.model import *
from molecule_pool.molecule_pool import MoleculePool


BATCH_SIZE = 500
MAX_ITERATIONS = 6
NB_EXPERIMENTS = 5
k = 500

## Q1 & Q2 Implement the first brick, run it and output results

In [2]:
df = pd.read_csv('../data/Enamine50k_rdkit2d.csv').to_numpy()
df = df[:, 1:]

molecule_pool = MoleculePool(df)


models = [RandomForest({'n_estimators': 100,
                      'max_depth': 8,
                      }), 
          NN({'input_shape': 200, 'hidden_shape': 100, 'output_shape': 1}, epoch=100)]

acquisition_functions = [RandomSearch(batch_size=BATCH_SIZE),
                         Greedy(batch_size=BATCH_SIZE),
                        UBC(batch_size=BATCH_SIZE)]


In [None]:
verbose = True
results = np.empty((2,7,NB_EXPERIMENTS+1,MAX_ITERATIONS+1))


for i, model in enumerate(models):
    for j, acquisition_function in enumerate(acquisition_functions):
        
        print('Model used  :', model.name)
        print('Acquisition function used  :', acquisition_function.name)
        print('Batch Size  :', BATCH_SIZE)
        
        for experiment in range(NB_EXPERIMENTS):
            print('Initialize training set...')
            train_set, test_set = molecule_pool.initialize_batch(batch_size=BATCH_SIZE)
            
            # Get the name of top k molecule according to the docking score
            idx_best = molecule_pool.sort_idx_by_true_score()[:k]
            top_k_mol = set(molecule_pool.df[idx_best, 0])
            top_k_found = train_set.get_top_k(k, top_k_mol)
            
            if verbose:
                print("% of top {} molecules found :".format(k), (len(top_k_found)/k)*100, '%')
                print('='*50)

            if model.name == 'NN' and acquisition_function.name == 'UBC':
                print('Variance not yet implemented for NN')
                continue

            iteration = 0
            results[i, j, iteration] =  (len(top_k_found)/k)*100

            while iteration < MAX_ITERATIONS:
                if verbose:
                    print('ITERATION', iteration)
                    print('Train set shape : ', train_set.data.shape)
                    print('Training the model...')
                    
                # Save time by not training the model when using a randomsearch acquisiton
                if not acquisition_function.name 'RandomSearch':
                    model.train(train_set)


                score = model.predict(test_set, acquisition_function.require_var)

                most_promising_mol = acquisition_function.select_train_set(test_set)

                new_train_mol = np.concatenate((train_set.df[:, 0], most_promising_mol.df[:, 0]))

                train_set, test_set = molecule_pool.create_batch(new_train_mol)

                iteration += 1


                top_k_found = train_set.get_top_k(k, top_k_mol)

                if verbose:
                    print('Get top k molecules...')
                    print("% of top {} molecules found :".format(k), (len(top_k_found)/k)*100, '%')
                    print('='*50)

                results[i, j, experiment, iteration] =  (len(top_k_found)/k)*100

Model used  : RandomForestRegressor
Acquisition function used  : RandomSearch
Batch Size  : 500
Initialize training set...
% of top 500 molecules found : 1.0 %
ITERATION 0
Train set shape :  (500, 202)
Training the model...
R2 score on train:  0.9247520332467901
Get top k molecules...
% of top 500 molecules found : 2.4 %
ITERATION 1
Train set shape :  (1000, 202)
Training the model...
R2 score on train:  0.8959629225558706
Get top k molecules...
% of top 500 molecules found : 3.5999999999999996 %
ITERATION 2
Train set shape :  (1500, 202)
Training the model...
R2 score on train:  0.880557134645713
Get top k molecules...
% of top 500 molecules found : 4.3999999999999995 %
ITERATION 3
Train set shape :  (2000, 202)
Training the model...
R2 score on train:  0.8629641947842547
Get top k molecules...
% of top 500 molecules found : 5.0 %
ITERATION 4
Train set shape :  (2500, 202)
Training the model...
R2 score on train:  0.8467226965689159
Get top k molecules...
% of top 500 molecules found 

In [19]:
df_result = pd.DataFrame()

df_result["RandomForest"] = [results[0][i] for i in range(len(acquisition_functions))]
df_result["NN"] = [results[1][i] for i in range(len(acquisition_functions))]

In [20]:
df_result

Unnamed: 0,RandomForest,NN
0,"[0.0, 1.2, 2.4, 3.8, 5.6000000000000005, 6.600...","[3.3007754224645115e+180, 2.0, 3.2, 3.2, 4.6, ..."
1,"[4.672378692210559e+164, 26.6, 37.0, 50.2, 58....","[2.6606464780332684e-260, 27.0, 42.6, 55.40000..."
2,"[9.936735247647633e+247, 22.400000000000002, 3...","[5.219694034256432e+180, 1.7145746322070626e+2..."
