In [None]:
%pip install PyTDC

In [None]:
%pip install rdkit

In [None]:
%pip install DeepPurpose

In [None]:
%pip install git+https://github.com/bp-kelley/descriptastorus

In [None]:
%pip install pandas-flavor

In [None]:
%pip install dgl

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.auto import tqdm #progress bar
import rdkit
from rdkit import Chem #Chemistry
from rdkit.Chem import rdMolDescriptors #molecular descriptors
from rdkit.Chem import PandasTools
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
rdkit.__version__

import xgboost as xgb
import sklearn
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import fbeta_score, make_scorer
from xgboost.sklearn import XGBRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from keras.models import Sequential
from keras.layers import Dense, Conv1D, Flatten
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split #ML training
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error #ML stats
from yellowbrick.regressor import prediction_error, ResidualsPlot
from tdc.single_pred import ADME
from tdc.benchmark_group import admet_group

import DeepPurpose
from DeepPurpose import CompoundPred as models
from DeepPurpose.utils import *
from tdc import BenchmarkGroup
import dgl
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Benchmark five regressors (XGBoost, random forest, SVR, AdaBoost, 1-D CNN) on the TDC
# 'Caco2_Wang' task using Morgan bit-vector fingerprints as features.
# For every seed a train/valid split is drawn, each model is fitted on the training fold
# and asked to predict the benchmark test set. Per-seed prediction dicts ({name: array})
# are collected in the predictions_list_* lists and scored with group.evaluate_many.
group = admet_group(path = 'data/')
predictions_list_xgb = []
best_params_list_xgb = []
predictions_list_rf = []
predictions_list_svm = []
best_params_list_svm = []
predictions_list_adb = []
predictions_list_cnn = []

# Morgan fingerprint settings shared by the train and test featurisation.
# NOTE(review): the variables are called ECFP6 but radius=2 is used -- confirm the intended radius.
radius=2
nBits=1024
ecfp6_name = [f'Bit_{i}' for i in range(nBits)]

# NOTE(review): `seed` is only passed to the train/valid split; none of the estimators
# below receives a random_state, so model fitting itself is not seeded.
for seed in [1, 2, 3, 4, 5]:
    predictions_xgb = {}
    predictions_rf = {}
    predictions_svm = {}
    predictions_adb = {}
    predictions_cnn = {}

    benchmark = group.get('Caco2_Wang')
    name = benchmark['name']
    train_val, test = benchmark['train_val'], benchmark['test']
    # `valid` is not used below: models are fitted on `train` only.
    train, valid = group.get_train_valid_split(benchmark = name, split_type = 'default', seed = seed)

    # Featurise the training fold: SMILES -> RDKit mol -> Morgan bit vector -> one column per bit.
    PandasTools.AddMoleculeColumnToFrame(train, smilesCol='Drug')
    ECFP6 = [AllChem.GetMorganFingerprintAsBitVect(x,radius=radius, nBits=nBits) for x in train['ROMol']]
    ecfp6_bits = [list(l) for l in ECFP6]
    Y = train['Y']
    train = pd.DataFrame(ecfp6_bits, index = train.Drug, columns=ecfp6_name).reset_index(drop = False)
    train['Y'] = Y

    # Same featurisation for the test set; benchmark['test'] is replaced by the fingerprint frame.
    PandasTools.AddMoleculeColumnToFrame(benchmark['test'], smilesCol='Drug')
    ECFP6 = [AllChem.GetMorganFingerprintAsBitVect(x,radius=radius, nBits=nBits) for x in benchmark['test']['ROMol']]
    ecfp6_bits = [list(l) for l in ECFP6]
    Y = benchmark['test']['Y']
    benchmark['test'] = pd.DataFrame(ecfp6_bits, index = benchmark['test'].Drug, columns=ecfp6_name).reset_index(drop = False)
    benchmark['test']['Y'] = Y

    train_X = train.drop(columns = ["Drug","Y"])
    train_y = train.Y
    test_X = benchmark['test'].drop(columns = ["Drug","Y"])
    test_y = benchmark['test'].Y


    #XGBoost + Morgan
    # Randomised search (15 draws, 5-fold CV, MAE scoring) over the grid below.
    xgb_parameters = {'objective':['reg:squarederror'],
              'booster':['gbtree','gblinear'],
              'learning_rate': [0.1],
              'max_depth': [7,10,15,20],
              'min_child_weight': [10,15,20,25],
              'colsample_bytree': [0.8, 0.9, 1],
              'n_estimators': [300,400,500,600],
              "reg_alpha"   : [0.5,0.2,1],
              "reg_lambda"  : [2,3,5],
              "gamma"       : [1,2,3]}

    xgb_model = XGBRegressor()
    grid_obj_xgb = RandomizedSearchCV(xgb_model, xgb_parameters, cv=5, n_iter=15, scoring='neg_mean_absolute_error', verbose=5, n_jobs=1)
    grid_obj_xgb.fit(train_X, train_y, verbose = 1)
    y_pred_test_xgb = grid_obj_xgb.predict(test_X)
    bp_xgb = grid_obj_xgb.best_params_
    predictions_xgb[name] = y_pred_test_xgb
    predictions_list_xgb.append(predictions_xgb)
    best_params_list_xgb.append(bp_xgb)

    #Random Forest + Morgan
    rf_model = RandomForestRegressor()
    rf_model.fit(train_X, train_y)
    y_pred_test_rf = rf_model.predict(test_X)
    predictions_rf[name] = y_pred_test_rf
    predictions_list_rf.append(predictions_rf)

    #SVM + Morgan
    svm_parameters = {
        'C': [0.1, 1, 10, 100, 1000],
        'epsilon': [0.0001, 0.001, 0.01, 0.1, 0.5],
        'gamma': [0.0001, 0.001, 0.01, 0.1, 1]
    }
    svm_model = SVR(kernel="rbf")
    grid_obj_svm = RandomizedSearchCV(svm_model, svm_parameters, cv=5, n_iter=15, scoring='neg_mean_absolute_error', verbose=5, n_jobs=1)
    grid_obj_svm.fit(train_X, train_y)
    y_pred_test_svm = grid_obj_svm.predict(test_X)
    bp_svm = grid_obj_svm.best_params_
    predictions_svm[name] = y_pred_test_svm
    predictions_list_svm.append(predictions_svm)
    best_params_list_svm.append(bp_svm)

    # AdaBoost + Morgan
    DTR = DecisionTreeRegressor(max_depth=4)
    adb_model = AdaBoostRegressor(n_estimators=300, base_estimator=DTR, learning_rate=1)
    adb_model.fit(train_X, train_y)
    y_pred_test_adb = adb_model.predict(test_X)
    predictions_adb[name] = y_pred_test_adb
    predictions_list_adb.append(predictions_adb)

    '''
    #for GCN, refer to https://github.com/kexinhuang12345/DeepPurpose
    #GCN
    drug_encoding = 'DGL_GCN'
    benchmark = group.get('Half_Life_Obach')
    train_gcn, valid_gcn = group.get_train_valid_split(benchmark = benchmark['name'], split_type = 'default', seed = seed)
    train_gcn = data_process(X_drug = train_gcn.Drug.values, y = train_gcn.Y.values, drug_encoding = drug_encoding, split_method='no_split')
    val_gcn = data_process(X_drug = valid_gcn.Drug.values, y = valid_gcn.Y.values, drug_encoding = drug_encoding, split_method='no_split')
    test_gcn = data_process(X_drug = benchmark['test'].Drug.values, y = benchmark['test'].Y.values, drug_encoding = drug_encoding, split_method='no_split')
    config = generate_config(drug_encoding = drug_encoding,
                            cls_hidden_dims = [512],
                            train_epoch = 10,
                            LR = 0.001,
                            batch_size = 128,
                            )

    gcn_model = models.model_initialize(**config)
    gcn_model.train(train_gcn, val_gcn, test_gcn, verbose = True)
    y_pred_test_gcn = gcn_model.predict(test_gcn)
    predictions_gcn[name] = y_pred_test_gcn
    predictions_list_gcn.append(predictions_gcn)
    '''

    #CNN + Morgan
    # Conv1D expects (samples, steps, channels): add a trailing channel axis of size 1.
    train_X = train_X.to_numpy()
    test_X = test_X.to_numpy()
    train_X = train_X.reshape(train_X.shape[0], train_X.shape[1], 1)
    test_X = test_X.reshape(test_X.shape[0], test_X.shape[1], 1)
    cnn_model = Sequential()
    # Input length follows nBits so the network stays in sync with the fingerprint size.
    cnn_model.add(Conv1D(32, 2, activation="relu", input_shape=(nBits,1)))
    cnn_model.add(Flatten())
    cnn_model.add(Dense(64, activation="relu"))
    cnn_model.add(Dense(1))
    cnn_model.compile(loss="mse", optimizer="adam")
    cnn_model.fit(train_X, train_y, batch_size=12, epochs=10, verbose=0)
    # The Dense(1) head yields one column per sample; flatten to 1-D so the predictions have
    # the same layout as the other models' outputs and as the stored predictions_list_cnn arrays.
    y_pred_test_cnn = cnn_model.predict(test_X).ravel()

    predictions_cnn[name] = y_pred_test_cnn
    predictions_list_cnn.append(predictions_cnn)

    # The DeepPurpose calls that used to follow here (model_initialize(**config), train/predict
    # on train_cnn/val_cnn/test_cnn) were removed: none of those names is assigned in this
    # cell, so they raised NameError, and they appended the CNN dict a second time per seed.


xgb_results = group.evaluate_many(predictions_list_xgb)
rf_results = group.evaluate_many(predictions_list_rf)
svm_results = group.evaluate_many(predictions_list_svm)
adb_results = group.evaluate_many(predictions_list_adb)
cnn_results = group.evaluate_many(predictions_list_cnn)


In [None]:
xgb_results

In [None]:
rf_results

In [None]:
svm_results

In [None]:
adb_results

In [None]:
cnn_results

In [None]:
predictions_list_cnn = [{'caco2_wang': np.array([-5.156512 ,
         -4.7750106,
         -4.69257  ,
         -5.5110726,
         -5.5380106,
         -5.452009 ,
         -4.875872 ,
         -4.78503  ,
         -4.7569757,
         -4.701529 ,
         -5.588609 ,
         -4.8879743,
         -5.3090467,
         -4.731854 ,
         -5.3081083,
         -4.7972393,
         -6.207682 ,
         -7.0020924,
         -6.8926005,
         -6.8926005,
         -5.62865  ,
         -5.62865  ,
         -6.1759152,
         -5.3982663,
         -4.6318417,
         -4.0530353,
         -5.6614165,
         -5.400821 ,
         -5.5267406,
         -7.9806247,
         -7.9958735,
         -7.690019 ,
         -4.604361 ,
         -7.020907 ,
         -5.704796 ,
         -5.6502604,
         -5.704796 ,
         -5.3595395,
         -5.996555 ,
         -5.6095815,
         -5.706033 ,
         -5.563432 ,
         -5.2755585,
         -5.049094 ,
         -4.378165 ,
         -4.378165 ,
         -4.378165 ,
         -4.378165 ,
         -4.38465  ,
         -4.378165 ,
         -4.378165 ,
         -4.38465  ,
         -4.378165 ,
         -4.378165 ,
         -4.378165 ,
         -4.378165 ,
         -4.38465  ,
         -4.38465  ,
         -4.38465  ,
         -4.38465  ,
         -4.38465  ,
         -4.38465  ,
         -4.38465  ,
         -4.38465  ,
         -4.38465  ,
         -4.38465  ,
         -4.38465  ,
         -4.38465  ,
         -4.38465  ,
         -4.38465  ,
         -4.1433287,
         -4.1433287,
         -4.1433287,
         -4.1433287,
         -4.38465  ,
         -4.1433287,
         -4.1433287,
         -4.1433287,
         -4.1433287,
         -4.1433287,
         -4.1433287,
         -4.1433287,
         -4.378165 ,
         -4.378165 ,
         -4.38465  ,
         -4.38465  ,
         -4.38465  ,
         -4.1433287,
         -3.836563 ,
         -6.8437963,
         -4.4472365,
         -4.525126 ,
         -6.159801 ,
         -6.338123 ,
         -6.24768  ,
         -5.270736 ,
         -4.731039 ,
         -5.584938 ,
         -6.0498705,
         -5.5461135,
         -5.648233 ,
         -4.8976216,
         -4.5603814,
         -4.4116983,
         -4.965141 ,
         -4.31502  ,
         -5.360982 ,
         -5.3335295,
         -6.3460097,
         -4.6385922,
         -5.324503 ,
         -4.5954885,
         -4.7058063,
         -4.281633 ,
         -5.8021674,
         -3.8713632,
         -6.234755 ,
         -5.562221 ,
         -5.578177 ,
         -5.8309894,
         -5.976305 ,
         -5.976305 ,
         -6.3623223,
         -4.323093 ,
         -4.151713 ,
         -5.0390477,
         -7.002945 ,
         -4.9884543,
         -4.9435425,
         -4.3634677,
         -4.1874557,
         -5.665638 ,
         -5.187721 ,
         -5.2357063,
         -5.2039466,
         -5.5282874,
         -5.443805 ,
         -5.1158123,
         -4.8482757,
         -4.8608565,
         -5.827716 ,
         -6.020002 ,
         -6.824297 ,
         -4.269425 ,
         -5.8726707,
         -6.0283885,
         -5.710245 ,
         -5.932599 ,
         -4.98694  ,
         -4.98694  ,
         -5.9395947,
         -6.2105417,
         -6.886005 ,
         -5.803617 ,
         -5.2684236,
         -5.563751 ,
         -4.4634166,
         -5.563751 ,
         -5.7749887,
         -4.4173617,
         -5.12423  ,
         -5.218717 ,
         -4.938667 ,
         -4.9567394,
         -5.38038  ,
         -5.498334 ,
         -5.554815 ,
         -6.0184207,
         -4.0673866,
         -5.301716 ,
         -5.301716 ,
         -5.257684 ,
         -6.208109 ,
         -3.912148 ,
         -3.912148 ,
         -5.0019994,
         -6.78348  ,
         -5.205254 ,
         -4.7311397,
         -6.0583844,
         -6.0583844,
         -5.1077456])},
 {'caco2_wang': np.array([-5.249175 ,
         -4.8677325,
         -4.415555 ,
         -5.2308397,
         -5.401132 ,
         -4.851208 ,
         -4.567685 ,
         -4.419233 ,
         -4.43552  ,
         -4.483015 ,
         -5.003654 ,
         -5.060755 ,
         -4.8397565,
         -4.6616993,
         -5.573474 ,
         -4.812648 ,
         -6.1635666,
         -6.3132267,
         -6.3926754,
         -6.3926754,
         -4.7173586,
         -4.7173586,
         -6.424727 ,
         -5.0785675,
         -4.369035 ,
         -3.7447395,
         -5.416135 ,
         -5.242184 ,
         -5.370327 ,
         -7.581908 ,
         -7.8030777,
         -7.472358 ,
         -4.8690577,
         -6.9468517,
         -5.923766 ,
         -5.710638 ,
         -5.923766 ,
         -4.6255493,
         -5.108897 ,
         -4.6405363,
         -4.607298 ,
         -4.3481827,
         -4.1416683,
         -4.083835 ,
         -3.8864133,
         -3.8864133,
         -3.8864133,
         -3.8864133,
         -3.9781466,
         -3.8864133,
         -3.8864133,
         -3.9781466,
         -3.8864133,
         -3.8864133,
         -3.8864133,
         -3.8864133,
         -3.9781466,
         -3.9781466,
         -3.9781466,
         -3.9781466,
         -3.9781466,
         -3.9781466,
         -3.9781463,
         -3.9781463,
         -3.9781466,
         -3.9781466,
         -3.9781466,
         -3.9781466,
         -3.9781466,
         -3.9781466,
         -3.8163002,
         -3.8163002,
         -3.8163002,
         -3.8163002,
         -3.9781466,
         -3.8163002,
         -3.8163002,
         -3.8163002,
         -3.8163002,
         -3.8163002,
         -3.8163002,
         -3.8163002,
         -3.8864133,
         -3.8864133,
         -3.9781466,
         -3.9781466,
         -3.9781466,
         -3.8163002,
         -3.687337 ,
         -6.134657 ,
         -4.3704057,
         -4.276641 ,
         -6.0622625,
         -6.4337716,
         -6.1867294,
         -5.4659586,
         -4.4862175,
         -5.4366674,
         -5.788502 ,
         -5.809925 ,
         -5.438352 ,
         -4.7236214,
         -4.5177617,
         -4.253625 ,
         -4.609004 ,
         -4.188422 ,
         -5.641949 ,
         -4.245143 ,
         -6.0691085,
         -4.6081104,
         -5.3133855,
         -4.4441886,
         -4.5215135,
         -4.0105324,
         -5.111226 ,
         -3.8675582,
         -6.036304 ,
         -5.32047  ,
         -5.249956 ,
         -5.7401123,
         -5.5159864,
         -5.5159864,
         -5.8134317,
         -4.0351133,
         -4.326476 ,
         -4.5721874,
         -6.9909363,
         -5.069372 ,
         -4.3482904,
         -4.280182 ,
         -3.8951159,
         -5.052601 ,
         -4.743198 ,
         -5.0753555,
         -5.345169 ,
         -5.371518 ,
         -5.6271844,
         -4.9923725,
         -4.672247 ,
         -4.606264 ,
         -6.088415 ,
         -6.2370377,
         -5.9594374,
         -3.9292777,
         -5.699339 ,
         -5.946335 ,
         -5.6204557,
         -5.623432 ,
         -4.5151105,
         -4.5151105,
         -5.8725004,
         -6.0846725,
         -5.8708777,
         -5.2459235,
         -4.952188 ,
         -5.200848 ,
         -3.8926227,
         -5.200848 ,
         -5.3873415,
         -4.205205 ,
         -4.788114 ,
         -4.8129168,
         -4.7455854,
         -4.0482874,
         -4.3085895,
         -4.4005356,
         -4.29613  ,
         -6.117156 ,
         -3.7391293,
         -5.8857484,
         -5.8857484,
         -5.168621 ,
         -6.044229 ,
         -3.7668762,
         -3.7668762,
         -4.867215 ,
         -6.749174 ,
         -4.633834 ,
         -4.587734 ,
         -5.8701687,
         -5.8701687,
         -4.685318])},
 {'caco2_wang': np.array([-5.223132 ,
         -5.1707563,
         -4.8357286,
         -5.1439123,
         -5.4174414,
         -5.335557 ,
         -4.8911295,
         -4.616206 ,
         -4.3739233,
         -4.531184 ,
         -5.1839495,
         -5.322136 ,
         -5.4268656,
         -4.53076  ,
         -5.2942076,
         -4.739489 ,
         -6.3356466,
         -6.938785 ,
         -6.8493824,
         -6.8493824,
         -5.125392 ,
         -5.125392 ,
         -6.23295  ,
         -5.297561 ,
         -4.6625156,
         -3.897419 ,
         -5.7397523,
         -5.588104 ,
         -5.677499 ,
         -7.7583213,
         -7.9000854,
         -7.559433 ,
         -4.370484 ,
         -6.561119 ,
         -5.2837443,
         -5.614047 ,
         -5.2837443,
         -4.9229794,
         -5.648877 ,
         -5.2362366,
         -4.8304453,
         -4.61506  ,
         -4.418337 ,
         -4.525177 ,
         -4.1982465,
         -4.1982465,
         -4.1982465,
         -4.1982465,
         -4.330162 ,
         -4.1982465,
         -4.1982465,
         -4.330162 ,
         -4.1982465,
         -4.1982465,
         -4.1982465,
         -4.1982465,
         -4.330162 ,
         -4.330162 ,
         -4.330162 ,
         -4.330162 ,
         -4.330162 ,
         -4.330162 ,
         -4.330162 ,
         -4.330162 ,
         -4.330162 ,
         -4.330162 ,
         -4.330162 ,
         -4.330162 ,
         -4.330162 ,
         -4.330162 ,
         -3.9453902,
         -3.9453902,
         -3.9453902,
         -3.9453902,
         -4.330162 ,
         -3.9453902,
         -3.9453902,
         -3.9453902,
         -3.9453902,
         -3.9453902,
         -3.9453902,
         -3.9453902,
         -4.1982465,
         -4.1982465,
         -4.330162 ,
         -4.330162 ,
         -4.330162 ,
         -3.9453902,
         -3.7596853,
         -6.49586  ,
         -4.9567175,
         -4.4351482,
         -6.0140085,
         -6.001668 ,
         -5.879879 ,
         -5.1100416,
         -4.461899 ,
         -4.9971957,
         -5.1438203,
         -5.5435066,
         -5.014742 ,
         -4.681775 ,
         -4.4556932,
         -4.2252483,
         -4.6251025,
         -4.1970167,
         -5.3046346,
         -5.470389 ,
         -5.901613 ,
         -4.5020533,
         -4.6604214,
         -4.399575 ,
         -4.301469 ,
         -4.067607 ,
         -5.116886 ,
         -3.7551847,
         -6.2772875,
         -5.180294 ,
         -5.245137 ,
         -5.486878 ,
         -5.996171 ,
         -5.996171 ,
         -5.9198537,
         -4.203861 ,
         -4.9564543,
         -4.79978  ,
         -6.3369675,
         -4.9809294,
         -4.591828 ,
         -4.2761493,
         -4.315629 ,
         -5.152665 ,
         -4.715898 ,
         -5.5089245,
         -5.6345215,
         -5.5471454,
         -5.619231 ,
         -5.245266 ,
         -5.205987 ,
         -4.54623  ,
         -5.6065598,
         -5.3327436,
         -6.329245 ,
         -3.9287634,
         -5.789406 ,
         -5.8847694,
         -5.7153354,
         -5.6818647,
         -4.471728 ,
         -4.471728 ,
         -5.9730434,
         -5.9579277,
         -6.4134517,
         -5.1650224,
         -4.9640827,
         -5.35833  ,
         -4.4504824,
         -5.35833  ,
         -5.6037984,
         -4.4153943,
         -4.777815 ,
         -4.6249485,
         -4.330177 ,
         -5.301612 ,
         -5.5386467,
         -5.7651186,
         -5.4712725,
         -5.613489 ,
         -4.399495 ,
         -5.316235 ,
         -5.316235 ,
         -5.087865 ,
         -5.554225 ,
         -3.8565621,
         -3.8565621,
         -4.736654 ,
         -7.029964 ,
         -4.7766237,
         -4.1584897,
         -6.17981  ,
         -6.17981  ,
         -4.470648])},
 {'caco2_wang': np.array([-5.3148093,
         -4.8013988,
         -4.6427083,
         -4.9119935,
         -4.985024 ,
         -4.7280855,
         -4.9692435,
         -4.6733212,
         -4.594565 ,
         -4.9847293,
         -5.454364 ,
         -5.4709244,
         -5.606468 ,
         -4.89794  ,
         -4.7117248,
         -4.388366 ,
         -6.509015 ,
         -6.4898024,
         -6.4834924,
         -6.4834924,
         -4.8869953,
         -4.8869953,
         -5.9922843,
         -5.4473853,
         -4.430287 ,
         -3.4917953,
         -5.313174 ,
         -5.168526 ,
         -5.5287824,
         -7.8300424,
         -7.8018317,
         -7.6935554,
         -4.482273 ,
         -6.977776 ,
         -5.595795 ,
         -5.6939063,
         -5.595795 ,
         -5.613915 ,
         -6.1973205,
         -5.830271 ,
         -5.292072 ,
         -5.165783 ,
         -4.9398947,
         -4.873306 ,
         -3.9959223,
         -3.9959223,
         -3.9959223,
         -3.9959223,
         -4.056785 ,
         -3.9959223,
         -3.9959223,
         -4.056785 ,
         -3.9959223,
         -3.9959223,
         -3.9959223,
         -3.9959223,
         -4.056785 ,
         -4.056785 ,
         -4.056785 ,
         -4.056785 ,
         -4.056785 ,
         -4.056785 ,
         -4.0567856,
         -4.0567856,
         -4.056785 ,
         -4.056785 ,
         -4.056785 ,
         -4.056785 ,
         -4.056785 ,
         -4.056785 ,
         -3.7942362,
         -3.7942362,
         -3.7942362,
         -3.7942362,
         -4.056785 ,
         -3.7942362,
         -3.7942362,
         -3.7942362,
         -3.7942362,
         -3.7942362,
         -3.7942362,
         -3.7942362,
         -3.9959223,
         -3.9959223,
         -4.056785 ,
         -4.056785 ,
         -4.056785 ,
         -3.7942362,
         -4.082118 ,
         -6.724182 ,
         -4.213937 ,
         -4.3413243,
         -5.897125 ,
         -6.338936 ,
         -6.31081  ,
         -5.3237166,
         -4.4619   ,
         -5.167372 ,
         -5.6884866,
         -5.296679 ,
         -5.1787114,
         -4.6725535,
         -4.501495 ,
         -4.31805  ,
         -4.719334 ,
         -4.2866488,
         -5.4988446,
         -5.1330814,
         -5.7064724,
         -4.484163 ,
         -4.6132665,
         -4.4867473,
         -4.8969707,
         -4.2784543,
         -5.1332517,
         -4.120525 ,
         -5.5537357,
         -5.092604 ,
         -4.7737775,
         -5.4639015,
         -6.13738  ,
         -6.13738  ,
         -6.901995 ,
         -3.9876924,
         -5.0606155,
         -4.9533734,
         -6.9088264,
         -4.7330117,
         -4.365721 ,
         -4.1891007,
         -4.0862465,
         -4.8319054,
         -4.67438  ,
         -5.5299363,
         -5.6648283,
         -5.4910755,
         -5.657256 ,
         -5.1066184,
         -5.092929 ,
         -4.841181 ,
         -5.356753 ,
         -5.557392 ,
         -6.54144  ,
         -4.032084 ,
         -5.7422853,
         -6.181167 ,
         -5.5731688,
         -5.7444983,
         -4.4059277,
         -4.4059277,
         -5.7499633,
         -6.2945166,
         -6.6509495,
         -5.259339 ,
         -4.9118333,
         -5.0610785,
         -4.3671   ,
         -5.0610785,
         -5.234315 ,
         -4.3855968,
         -5.1033072,
         -5.531768 ,
         -5.363855 ,
         -4.8229814,
         -5.538952 ,
         -5.642186 ,
         -5.8940396,
         -5.569441 ,
         -4.3721023,
         -5.065129 ,
         -5.065129 ,
         -4.9627743,
         -6.0779204,
         -3.6870995,
         -3.6870995,
         -4.5854774,
         -6.7441926,
         -4.7042847,
         -4.0345883,
         -6.0179057,
         -6.0179057,
         -5.000649])},
 {'caco2_wang': np.array([-5.918768 ,
         -5.0597897,
         -4.725922 ,
         -5.2309217,
         -5.4545603,
         -5.641961 ,
         -4.5264425,
         -4.423626 ,
         -4.399615 ,
         -4.443289 ,
         -5.442047 ,
         -5.551286 ,
         -5.017702 ,
         -4.6105146,
         -5.1308966,
         -4.4723673,
         -6.7503967,
         -6.7035446,
         -6.661804 ,
         -6.661804 ,
         -5.2565517,
         -5.2565517,
         -6.3356285,
         -5.1783924,
         -4.5967526,
         -4.0543613,
         -5.223436 ,
         -5.09758  ,
         -5.344864 ,
         -7.8110285,
         -7.711076 ,
         -7.6659117,
         -4.8945084,
         -6.730221 ,
         -5.326463 ,
         -5.407279 ,
         -5.326463 ,
         -5.4605064,
         -5.9773884,
         -5.5537157,
         -5.4946003,
         -5.228772 ,
         -5.038639 ,
         -4.906605 ,
         -4.3652773,
         -4.3652773,
         -4.3652773,
         -4.3652773,
         -4.4428835,
         -4.3652773,
         -4.3652773,
         -4.4428835,
         -4.3652773,
         -4.3652773,
         -4.3652773,
         -4.3652773,
         -4.4428835,
         -4.4428835,
         -4.4428835,
         -4.4428835,
         -4.4428835,
         -4.4428835,
         -4.442883 ,
         -4.442883 ,
         -4.4428835,
         -4.4428835,
         -4.4428835,
         -4.4428835,
         -4.4428835,
         -4.4428835,
         -4.2224207,
         -4.2224207,
         -4.2224207,
         -4.2224207,
         -4.4428835,
         -4.2224207,
         -4.2224207,
         -4.2224207,
         -4.2224207,
         -4.2224207,
         -4.2224207,
         -4.2224207,
         -4.3652773,
         -4.3652773,
         -4.4428835,
         -4.4428835,
         -4.4428835,
         -4.2224207,
         -3.8727572,
         -6.7984157,
         -4.6342854,
         -4.7114835,
         -6.003698 ,
         -6.3461213,
         -6.076661 ,
         -5.3013554,
         -4.653742 ,
         -5.382902 ,
         -5.579344 ,
         -5.282798 ,
         -5.2433205,
         -4.9008074,
         -4.724396 ,
         -4.457726 ,
         -4.8968906,
         -4.4149437,
         -5.398672 ,
         -4.260214 ,
         -6.137517 ,
         -4.5511193,
         -5.5089607,
         -4.71244  ,
         -4.908088 ,
         -4.5451775,
         -5.7240205,
         -4.436619 ,
         -6.318935 ,
         -5.921505 ,
         -5.9444513,
         -6.309045 ,
         -6.1124644,
         -6.1124644,
         -6.349851 ,
         -4.2127   ,
         -5.0883427,
         -4.7193184,
         -6.639204 ,
         -5.0634937,
         -4.4036574,
         -4.122469 ,
         -4.4482875,
         -5.6442223,
         -5.1678925,
         -5.1937227,
         -5.6725   ,
         -5.3000736,
         -5.712038 ,
         -4.9832897,
         -4.883137 ,
         -4.58675  ,
         -5.8095284,
         -5.686861 ,
         -6.460298 ,
         -3.6515007,
         -5.7627716,
         -5.9499445,
         -5.5782275,
         -5.6534233,
         -4.615272 ,
         -4.615272 ,
         -6.0609703,
         -6.215336 ,
         -6.8883   ,
         -5.6139574,
         -5.3518715,
         -5.3928814,
         -4.4466777,
         -5.3928814,
         -5.6837025,
         -4.4443583,
         -4.9117503,
         -5.2013783,
         -4.915949 ,
         -4.2319803,
         -4.5052643,
         -4.6790886,
         -4.4655647,
         -6.2490153,
         -4.599625 ,
         -5.9687576,
         -5.9687576,
         -5.4077377,
         -5.94464  ,
         -3.8981864,
         -3.8981864,
         -5.153685 ,
         -6.5687895,
         -4.826912 ,
         -4.6389875,
         -5.836522 ,
         -5.836522 ,
         -5.2039204])}]

In [None]:
# Ground-truth targets of the benchmark test set as a NumPy array
# (read from the `benchmark` dict left behind by the training cell).
y_test = np.array(benchmark['test'].Y)


In [None]:
def rank_object(array): # convert scores into ranks
    """Return 0-based descending ranks for ``array``.

    The largest value receives rank 0 and the smallest receives
    ``len(array) - 1``; the result has the integer dtype of ``np.argsort``.
    """
    order = np.argsort(array)                      # positions from smallest to largest value
    descending = np.flip(np.arange(len(order)))    # n-1, n-2, ..., 0
    ranks = np.zeros_like(order)
    # Walk the ascending order and hand out ranks from the highest number downwards.
    for target, value in zip(order, descending):
        ranks[target] = value
    return ranks

In [None]:
def score_to_rank(array):
  """Convert scores to 1-based ranks where the highest score gets rank 1."""
  ascending_positions = np.argsort(array)
  descending_positions = np.flip(ascending_positions)
  # argsort of a permutation inverts it: element -> its place in the descending order
  zero_based_rank = np.argsort(descending_positions)
  return zero_based_rank + 1

In [None]:
def normalize(array): # define function for normalization of scores
    """Min-max scale ``array`` so its smallest value maps to 0 and its largest to 1.

    Parameters
    ----------
    array : array-like of numbers
        Scores to rescale; lists and NumPy arrays are both accepted.

    Returns
    -------
    numpy.ndarray
        ``(array - min) / (max - min)`` with the same shape as the input.
        If every value is identical the range is zero, and an all-zero array
        is returned instead of the NaNs a 0/0 division would produce.

    Raises
    ------
    ValueError
        If ``array`` is empty (raised by ``np.min``).
    """
    values = np.asarray(array)
    minimum = np.min(values)
    span = np.max(values) - minimum
    if span == 0:
        # Constant input: there is no spread to scale by.
        return np.zeros(values.shape)
    # Vectorised form of the former per-element Python loop.
    return (values - minimum) / span

In [None]:
# Flatten the per-seed prediction dicts into plain per-model lists:
# predictions_<model>[seed_index] is the prediction array of that seed.
predictions_xgb, predictions_rf, predictions_svm = [], [], []
predictions_adb, predictions_cnn = [], []
scoreSys = ['xgb', 'rf', 'svm', 'adb', 'cnn']

# The loop variable keeps the name `sys` because it stays bound after the loop
# and later cells may read it.
for sys in scoreSys:
  per_seed_dicts = globals()['predictions_list_%s' % sys]
  flat_predictions = globals()['predictions_%s' % sys]
  for seed in range(len(per_seed_dicts)):
    # each dict holds a single {benchmark name: predictions} entry; take its value
    flat_predictions.append(list(per_seed_dicts[seed].values())[0])

In [None]:
# RSC graphs
# One figure per seed: for every scoring system, plot its min-max normalised predictions
# (sorted ascending) against descending 0-based rank values, so the highest score sits at rank 0.
colors = ['r--', 'm--', 'b--', 'g--', 'y--']
ranks = np.flip(np.arange(len(predictions_xgb[0])))

# The number of seeds is taken from predictions_xgb directly rather than through the
# loop variable `sys` left over from an earlier cell, so this cell no longer depends
# on which loop happened to run last.
for seed in range(len(predictions_xgb)):
  for z in range(len(scoreSys)):
    # line1..line5 are still published as globals, as before
    globals()['line%s' % (z+1)] = plt.plot(ranks, np.sort(normalize(globals()['predictions_%s' % scoreSys[z]][seed])), colors[z], label = scoreSys[z])
  # Figure decorations only need to be set once per figure, after all curves are drawn.
  plt.legend(loc = 'upper right')
  # `seed` is a 0-based index; the split seeds and the *_seedN frames are numbered 1..5.
  plt.title('RSC of Scoring Systems_Predictions_seed%s' % (seed + 1))
  plt.xlabel('Rank Values')
  plt.ylabel('Normalized Scores')
  plt.show()


In [None]:
# One (initially empty) list of diversity-strength values per seed.
ds_score = [list() for _ in range(5)]

In [None]:
# Diversity strength: for every scoring system and seed, the mean (over the other systems)
# of the summed squared difference between the two normalised, sorted prediction curves.
# ds_score[seed] ends up with one value per system, in scoreSys order.

# Start from empty per-seed lists (same length as before) so that re-running this cell
# does not append a second set of values.
ds_score = [[] for _ in ds_score]

for sys_name in scoreSys:
  # Build the comparison set without removing/re-inserting items of the list being iterated.
  others = [other for other in scoreSys if other != sys_name]
  for seed in range(len(ds_score)):
    own_curve = normalize(np.sort(globals()['predictions_%s' % sys_name][seed]))
    ds = 0
    for other in others:
      other_curve = normalize(np.sort(globals()['predictions_%s' % other][seed]))
      ds += np.sum(np.square(own_curve - other_curve))
    ds = ds/len(others)
    ds_score[seed].append(ds)

In [None]:
ds_score

In [None]:
# Element-wise 1/x of the diversity-strength table (despite the name, no ranking is applied here).
ds_rank = np.reciprocal(np.asarray(ds_score))

In [None]:
ds_rank

In [None]:
def mean_absolute_error(y_pred, y_test):
  """Mean absolute difference between predictions and targets.

  Both inputs are converted to arrays and flattened before subtracting, so a
  column vector of shape (n, 1) compared with a 1-D target of shape (n,) gives
  the element-wise error instead of silently broadcasting to an (n, n) matrix.
  Plain Python lists are accepted as well.

  Note: this definition shadows ``sklearn.metrics.mean_absolute_error`` imported
  at the top of the notebook.

  Parameters
  ----------
  y_pred, y_test : array-like of numbers

  Returns
  -------
  float
      ``mean(|y_pred - y_test|)``.
  """
  predicted = np.asarray(y_pred).ravel()
  actual = np.asarray(y_test).ravel()
  res = np.mean(np.abs(predicted - actual))
  return res

In [None]:
# One (initially empty) list of performance-strength values per seed.
ps_score = [list() for _ in range(5)]

In [None]:
def spearman_corr(y_pred_rank, y_test_rank):
  """Spearman rank correlation via the closed form 1 - 6*sum(d^2) / (n*(n^2 - 1)).

  ``y_pred_rank`` and ``y_test_rank`` are the two rank arrays being compared;
  ``d`` is their element-wise difference and ``n`` the length of ``y_pred_rank``.
  """
  n = len(y_pred_rank)
  rank_gap = y_pred_rank - y_test_rank
  numerator = 6 * np.sum(np.square(rank_gap))
  denominator = n * (n**2 - 1)
  return 1 - numerator / denominator

In [None]:
# Performance strength: mean absolute error of every scoring system against y_test, per seed.
# ps_score[seed] ends up with one value per system, in scoreSys order.

# Start from empty per-seed lists (same length as before) so that re-running this cell
# does not append a second set of values.
ps_score = [[] for _ in ps_score]

for sys_name in scoreSys:
  per_seed_predictions = globals()['predictions_%s' % sys_name]
  for seed in range(len(ps_score)):
    ps = mean_absolute_error(per_seed_predictions[seed], y_test)
    ps_score[seed].append(ps)

In [None]:
ps_score

In [None]:
def powerset(s):
    """Return every non-empty subset of ``s`` as a list of lists.

    Subsets are enumerated by bitmask 1 .. 2**len(s) - 1: bit ``j`` of the mask
    decides whether ``s[j]`` is included, so members keep their order in ``s``.
    """
    size = len(s)
    return [
        [s[pos] for pos in range(size) if mask & (1 << pos)]
        for mask in range(1, 1 << size)   # mask 0 (the empty subset) is skipped
    ]

# All non-empty combinations of scoring systems.
# NOTE: this rebinds `models`, which the import cell bound to DeepPurpose's CompoundPred module.
models = powerset(scoreSys)

def myFunc(e):
  """Sort key: the number of scoring systems in combination ``e``."""
  return len(e)

# list.sort is stable, so combinations keep powerset's order within each size.
models.sort(key=myFunc)

# Label every combination by joining its members with '&' (a single model keeps its bare
# name). Using join covers combinations of any size, not only up to five members.
models_list = ['&'.join(combo) for combo in models]

In [None]:
models_list

# Perform average score combinations

In [None]:
# One DataFrame per seed with a column of raw predictions for each scoring system.
_predictions_by_model = {
    'xgb': predictions_xgb,
    'rf': predictions_rf,
    'svm': predictions_svm,
    'adb': predictions_adb,
    'cnn': predictions_cnn,
}
(avg_score_combine_seed1,
 avg_score_combine_seed2,
 avg_score_combine_seed3,
 avg_score_combine_seed4,
 avg_score_combine_seed5) = [
    pd.DataFrame({label: per_seed[idx] for label, per_seed in _predictions_by_model.items()})
    for idx in range(5)
]

In [None]:
def avg_score_combine(models_list, single_score):
  """Add one equally weighted average column per model combination, in place.

  Args:
    models_list: Labels of all systems and combinations. A combination label
      joins its member system names with '&' (e.g. 'xgb&rf'). The first
      len(scoreSys) entries are skipped as the single-system labels.
    single_score: DataFrame with one column per single system. It is mutated:
      a column named exactly like the combination label is added for each
      combination, holding the mean of the member columns.

  Returns:
    None.

  Combinations of any size are supported (not only 2 to 5 members).
  """
  for j in models_list[len(scoreSys):]:
    members = j.split('&')
    if len(members) < 2:
      # A label without '&' is a single system: nothing to average.
      continue
    # Accumulate left to right so the arithmetic matches (a + b + ...) / n.
    total = single_score[members[0]]
    for member in members[1:]:
      total = total + single_score[member]
    single_score[j] = total / len(members)

In [None]:
# Add the equally weighted score average of every model combination to each seed's frame.
for _seed_frame in (
    avg_score_combine_seed1,
    avg_score_combine_seed2,
    avg_score_combine_seed3,
    avg_score_combine_seed4,
    avg_score_combine_seed5,
):
  avg_score_combine(models_list, _seed_frame)

# Perform average rank combinations

In [None]:
def score_to_rank(array):
  """Convert scores to 1-based ranks, where rank 1 goes to the largest score.

  The result has one rank per input position. Ties are broken by whatever
  order np.argsort produces.
  """
  ascending_positions = np.argsort(array)
  descending_positions = np.flip(ascending_positions)
  # Position of each original index within the descending order, shifted to start at 1.
  return np.argsort(descending_positions) + 1

In [None]:
# One frame per seed holding the rank of every single-model prediction
# (score_to_rank gives rank 1 to the largest predicted value), with columns in the
# order xgb, rf, svm, adb, cnn. Index k of each predictions_* object feeds seed k + 1.
(avg_rank_combine_seed1,
 avg_rank_combine_seed2,
 avg_rank_combine_seed3,
 avg_rank_combine_seed4,
 avg_rank_combine_seed5) = (
    pd.DataFrame({
        'xgb': score_to_rank(predictions_xgb[k]),
        'rf': score_to_rank(predictions_rf[k]),
        'svm': score_to_rank(predictions_svm[k]),
        'adb': score_to_rank(predictions_adb[k]),
        'cnn': score_to_rank(predictions_cnn[k]),
    })
    for k in range(5)
)

In [None]:
def avg_rank_combine(models_list, single_rank):
  """Add one equally weighted average-rank column per model combination, in place.

  Args:
    models_list: Labels of all systems and combinations. A combination label
      joins its member system names with '&' (e.g. 'xgb&rf'). The first
      len(scoreSys) entries are skipped as the single-system labels.
    single_rank: DataFrame with one rank column per single system. It is
      mutated: for each combination a column named '<label>_r' is added,
      holding the mean of the member rank columns.

  Returns:
    None.

  Combinations of any size are supported (not only 2 to 5 members).
  """
  for j in models_list[len(scoreSys):]:
    members = j.split('&')
    if len(members) < 2:
      # A label without '&' is a single system: nothing to average.
      continue
    # Accumulate left to right so the arithmetic matches (a + b + ...) / n.
    total = single_rank[members[0]]
    for member in members[1:]:
      total = total + single_rank[member]
    single_rank[j+'_r'] = total / len(members)

In [None]:
# Add the equally weighted rank average ('<combo>_r') of every model combination
# to each seed's rank frame.
for _seed_frame in (
    avg_rank_combine_seed1,
    avg_rank_combine_seed2,
    avg_rank_combine_seed3,
    avg_rank_combine_seed4,
    avg_rank_combine_seed5,
):
  avg_rank_combine(models_list, _seed_frame)

# Perform weighted score combination by diversity strength

In [None]:
# Five independent, initially empty frames (one per seed) that will receive the
# diversity-strength weighted score combinations.
(ds_score_combine_seed1,
 ds_score_combine_seed2,
 ds_score_combine_seed3,
 ds_score_combine_seed4,
 ds_score_combine_seed5) = (pd.DataFrame() for _ in range(5))

In [None]:
def ds_score_combine(models_list, single_score, ds_score_combine, ds_score):
  """Write diversity-strength weighted score combinations into ``ds_score_combine``.

  For every combination label the result column '<label>_ds' is
  sum(score_i * w_i) / sum(w_i) over the member systems, where
  w_i = ds_score[scoreSys.index(member_i)].

  Args:
    models_list: Labels of all systems and combinations ('&'-joined member
      names). The first len(scoreSys) entries are skipped as single systems.
    single_score: DataFrame with one score column per single system (read only).
    ds_score_combine: DataFrame that receives the new '<label>_ds' columns;
      mutated in place. (The parameter shadows the function name inside the body.)
    ds_score: Per-system weights, indexed in scoreSys order.

  Returns:
    None.

  Combinations of any size are supported (not only 2 to 5 members).
  """
  for j in models_list[len(scoreSys):]:
    members = j.split('&')
    if len(members) < 2:
      # A label without '&' is a single system: nothing to combine.
      continue
    weights = [ds_score[scoreSys.index(member)] for member in members]
    # Accumulate numerator and denominator left to right, member by member.
    weighted_sum = single_score[members[0]] * weights[0]
    weight_total = weights[0]
    for member, weight in zip(members[1:], weights[1:]):
      weighted_sum = weighted_sum + single_score[member] * weight
      weight_total = weight_total + weight
    ds_score_combine[j+'_ds'] = weighted_sum / weight_total

In [None]:
# Diversity-strength weighted score combinations, one output frame per seed.
# ds_score[k] is passed as the per-system weights for seed k + 1.
_seed_score_frames = (
    avg_score_combine_seed1,
    avg_score_combine_seed2,
    avg_score_combine_seed3,
    avg_score_combine_seed4,
    avg_score_combine_seed5,
)
_seed_ds_score_frames = (
    ds_score_combine_seed1,
    ds_score_combine_seed2,
    ds_score_combine_seed3,
    ds_score_combine_seed4,
    ds_score_combine_seed5,
)
for _k, (_scores, _combined) in enumerate(zip(_seed_score_frames, _seed_ds_score_frames)):
  ds_score_combine(models_list, _scores, _combined, ds_score[_k])

# Perform weighted rank combination by diversity strength

In [None]:
# Five independent, initially empty frames (one per seed) that will receive the
# diversity-strength weighted rank combinations.
(ds_rank_combine_seed1,
 ds_rank_combine_seed2,
 ds_rank_combine_seed3,
 ds_rank_combine_seed4,
 ds_rank_combine_seed5) = (pd.DataFrame() for _ in range(5))

In [None]:
def ds_rank_combine(models_list, single_rank, ds_rank_combine, ds_rank):
  """Write diversity-strength weighted rank combinations into ``ds_rank_combine``.

  For every combination label the result column '<label>_ds_r' is
  sum(rank_i * w_i) / sum(w_i) over the member systems, where
  w_i = ds_rank[scoreSys.index(member_i)].

  Args:
    models_list: Labels of all systems and combinations ('&'-joined member
      names). The first len(scoreSys) entries are skipped as single systems.
    single_rank: DataFrame with one rank column per single system (read only).
    ds_rank_combine: DataFrame that receives the new '<label>_ds_r' columns;
      mutated in place. (The parameter shadows the function name inside the body.)
    ds_rank: Per-system weights, indexed in scoreSys order.

  Returns:
    None.

  Combinations of any size are supported (not only 2 to 5 members).
  """
  for j in models_list[len(scoreSys):]:
    members = j.split('&')
    if len(members) < 2:
      # A label without '&' is a single system: nothing to combine.
      continue
    weights = [ds_rank[scoreSys.index(member)] for member in members]
    # Accumulate numerator and denominator left to right, member by member.
    weighted_sum = single_rank[members[0]] * weights[0]
    weight_total = weights[0]
    for member, weight in zip(members[1:], weights[1:]):
      weighted_sum = weighted_sum + single_rank[member] * weight
      weight_total = weight_total + weight
    ds_rank_combine[j+'_ds_r'] = weighted_sum / weight_total

In [None]:
# Diversity-strength weighted rank combinations, one output frame per seed.
# ds_rank[k] is passed as the per-system weights for seed k + 1.
_seed_rank_frames = (
    avg_rank_combine_seed1,
    avg_rank_combine_seed2,
    avg_rank_combine_seed3,
    avg_rank_combine_seed4,
    avg_rank_combine_seed5,
)
_seed_ds_rank_frames = (
    ds_rank_combine_seed1,
    ds_rank_combine_seed2,
    ds_rank_combine_seed3,
    ds_rank_combine_seed4,
    ds_rank_combine_seed5,
)
for _k, (_ranks, _combined) in enumerate(zip(_seed_rank_frames, _seed_ds_rank_frames)):
  ds_rank_combine(models_list, _ranks, _combined, ds_rank[_k])

# Perform weighted score combination by performance strength (MAE)

In [None]:
# Five independent, initially empty frames (one per seed) that will receive the
# performance-strength weighted score combinations.
(ps_score_combine_seed1,
 ps_score_combine_seed2,
 ps_score_combine_seed3,
 ps_score_combine_seed4,
 ps_score_combine_seed5) = (pd.DataFrame() for _ in range(5))

In [None]:
def ps_score_combine(models_list, single_score, ps_score_combine, ps_score):
  """Write performance-strength weighted score combinations into ``ps_score_combine``.

  For every combination label the result column '<label>_ps' is
  sum(score_i * w_i) / sum(w_i) over the member systems, where
  w_i = 1 / ps_score[scoreSys.index(member_i)]. A smaller ps_score value
  therefore gives a system a larger weight.

  Args:
    models_list: Labels of all systems and combinations ('&'-joined member
      names). The first len(scoreSys) entries are skipped as single systems.
    single_score: DataFrame with one score column per single system (read only).
    ps_score_combine: DataFrame that receives the new '<label>_ps' columns;
      mutated in place. (The parameter shadows the function name inside the body.)
    ps_score: Per-system performance values, indexed in scoreSys order. Each
      value is inverted, so a value of 0 divides by zero.

  Returns:
    None.

  The same inverse weight is used in the numerator and in the normalising sum
  for every member, so the weights of a combination always sum to 1. This also
  holds for four-member combinations, whose normaliser previously mixed in a
  non-inverted ps_score value. Combinations of any size are supported.
  """
  for j in models_list[len(scoreSys):]:
    members = j.split('&')
    if len(members) < 2:
      # A label without '&' is a single system: nothing to combine.
      continue
    inverse_errors = [1 / ps_score[scoreSys.index(member)] for member in members]
    # Accumulate numerator and denominator left to right, member by member.
    weighted_sum = single_score[members[0]] * inverse_errors[0]
    weight_total = inverse_errors[0]
    for member, weight in zip(members[1:], inverse_errors[1:]):
      weighted_sum = weighted_sum + single_score[member] * weight
      weight_total = weight_total + weight
    ps_score_combine[j+'_ps'] = weighted_sum / weight_total

In [None]:
# Performance-strength (inverse MAE) weighted score combinations, one output frame
# per seed. ps_score[k] is passed as the per-system MAE values for seed k + 1.
_seed_score_frames = (
    avg_score_combine_seed1,
    avg_score_combine_seed2,
    avg_score_combine_seed3,
    avg_score_combine_seed4,
    avg_score_combine_seed5,
)
_seed_ps_score_frames = (
    ps_score_combine_seed1,
    ps_score_combine_seed2,
    ps_score_combine_seed3,
    ps_score_combine_seed4,
    ps_score_combine_seed5,
)
for _k, (_scores, _combined) in enumerate(zip(_seed_score_frames, _seed_ps_score_frames)):
  ps_score_combine(models_list, _scores, _combined, ps_score[_k])

# Perform weighted rank combination by performance strength (MAE)

In [None]:
# Five independent, initially empty frames (one per seed) that will receive the
# performance-strength weighted rank combinations.
(ps_rank_combine_seed1,
 ps_rank_combine_seed2,
 ps_rank_combine_seed3,
 ps_rank_combine_seed4,
 ps_rank_combine_seed5) = (pd.DataFrame() for _ in range(5))

In [None]:
def ps_rank_combine(models_list, single_rank, ps_rank_combine, ps_score):
  """Write performance-strength weighted rank combinations into ``ps_rank_combine``.

  For every combination label the result column '<label>_ps_r' is
  sum(rank_i * w_i) / sum(w_i) over the member systems, where
  w_i = ps_score[scoreSys.index(member_i)].

  Args:
    models_list: Labels of all systems and combinations ('&'-joined member
      names). The first len(scoreSys) entries are skipped as single systems.
    single_rank: DataFrame with one rank column per single system (read only).
    ps_rank_combine: DataFrame that receives the new '<label>_ps_r' columns;
      mutated in place. (The parameter shadows the function name inside the body.)
    ps_score: Per-system performance values, indexed in scoreSys order.

  Returns:
    None.

  The weights are the raw ps_score values, not their reciprocals as in
  ps_score_combine.
  NOTE(review): presumably deliberate for rank combination — confirm.
  Combinations of any size are supported (not only 2 to 5 members).
  """
  for j in models_list[len(scoreSys):]:
    members = j.split('&')
    if len(members) < 2:
      # A label without '&' is a single system: nothing to combine.
      continue
    weights = [ps_score[scoreSys.index(member)] for member in members]
    # Accumulate numerator and denominator left to right, member by member.
    weighted_sum = single_rank[members[0]] * weights[0]
    weight_total = weights[0]
    for member, weight in zip(members[1:], weights[1:]):
      weighted_sum = weighted_sum + single_rank[member] * weight
      weight_total = weight_total + weight
    ps_rank_combine[j+'_ps_r'] = weighted_sum / weight_total

In [None]:
# Performance-strength weighted rank combinations, one output frame per seed.
# ps_score[k] is passed as the per-system MAE values for seed k + 1.
_seed_rank_frames = (
    avg_rank_combine_seed1,
    avg_rank_combine_seed2,
    avg_rank_combine_seed3,
    avg_rank_combine_seed4,
    avg_rank_combine_seed5,
)
_seed_ps_rank_frames = (
    ps_rank_combine_seed1,
    ps_rank_combine_seed2,
    ps_rank_combine_seed3,
    ps_rank_combine_seed4,
    ps_rank_combine_seed5,
)
for _k, (_ranks, _combined) in enumerate(zip(_seed_rank_frames, _seed_ps_rank_frames)):
  ps_rank_combine(models_list, _ranks, _combined, ps_score[_k])

In [None]:
# Give the single-system rank columns the same '_r' suffix as the combined rank columns.
# NOTE(review): the rename is in place. Afterwards the rank frames no longer have
# 'xgb' ... 'cnn' columns, so avg_rank_combine / ds_rank_combine / ps_rank_combine
# (which read those names) cannot be re-run on them without rebuilding the frames.
_rank_column_names = {'xgb': 'xgb_r', 'rf': 'rf_r', 'svm': 'svm_r', 'adb': 'adb_r', 'cnn': 'cnn_r'}
for _rank_frame in (
    avg_rank_combine_seed1,
    avg_rank_combine_seed2,
    avg_rank_combine_seed3,
    avg_rank_combine_seed4,
    avg_rank_combine_seed5,
):
  _rank_frame.rename(columns=_rank_column_names, inplace=True)

# Calculate MAE

In [None]:
# Row labels of the MAE table: every score-combination column name, taken from the
# seed-1 frames in the order average, diversity-weighted, performance-weighted.
score_combine_list = np.hstack([
    np.array(_frame.columns)
    for _frame in (avg_score_combine_seed1, ds_score_combine_seed1, ps_score_combine_seed1)
])
MAE = pd.DataFrame(index = score_combine_list)

# One column per seed: MAE of every combination column against y_test, stacked in
# the same avg / ds / ps order as the row labels.
for i in range(1, 6):
  _mae_by_kind = []
  for _kind in ('avg', 'ds', 'ps'):
    # The per-seed frames are looked up by name, e.g. 'avg_score_combine_seed1'.
    _frame = globals()['%s_score_combine_seed%s' % (_kind, i)]
    _mae_by_kind.append(
        [mean_absolute_error(np.array(_frame[col]), y_test) for col in _frame.columns]
    )
  mae_avg, mae_ds, mae_ps = _mae_by_kind
  MAE['seed'+str(i)] = np.hstack((mae_avg, mae_ds, mae_ps))

In [None]:
# Lift the pandas row-display limit so the full MAE table is shown.
# This is a global option and stays in effect for all later cells.
pd.set_option('display.max_rows', None)

In [None]:
# Row-wise mean over all columns currently in MAE (the per-seed MAE columns),
# stored as a new 'avg_MAE' column.
MAE['avg_MAE'] = MAE.mean(axis=1)

In [None]:
# Display the table ordered by mean MAE, lowest (best) first; MAE itself is not modified.
MAE.sort_values(by='avg_MAE')