In [1]:
# Load Tox21 Data
import numpy as np

# Names of the 12 toxicity tasks; column i of the label matrices is scored and
# printed under tasks[i] by the cells below.
tasks = ['NR-AR', 'NR-AR-LBD', 'NR-AhR', 'NR-Aromatase', 'NR-ER', 'NR-ER-LBD','NR-PPAR-gamma', 
         'SR-ARE', 'SR-ATAD5', 'SR-HSE', 'SR-MMP', 'SR-p53']

# Directory holding the pre-computed .npz archives (one archive per kind of data).
DATA_DIR = './tox21_embed_data/'


def _load_npz(name):
    """Read ``DATA_DIR + name + '.npz'`` fully into memory as a plain dict.

    ``np.load`` on an .npz file returns a lazy archive object that keeps the
    file open and reads an array from disk every time a key is looked up.
    The cells below index the same keys repeatedly, so the arrays are
    materialised once here and the file handle is closed right away.

    Parameters
    ----------
    name : str
        Archive base name without directory or extension (e.g. ``'ecfp'``).

    Returns
    -------
    dict
        Maps each key stored in the archive to its numpy array.
    """
    # NOTE(security): allow_pickle=True lets numpy unpickle object arrays, which
    # can execute arbitrary code. Only load archives from a trusted source.
    with np.load(DATA_DIR + name + '.npz', allow_pickle=True) as archive:
        return {key: archive[key] for key in archive.files}


ids = _load_npz('id')
label = _load_npz('label')
ecfp = _load_npz('ecfp')
multi = _load_npz('multi')

In [2]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

def training_random_forest(x, y, n_estimators=500, n_jobs=64, random_state=None):
    """Fit a class-balanced random forest on ``x`` / ``y`` and return it.

    Parameters
    ----------
    x : array-like
        Feature matrix, passed unchanged to ``RandomForestClassifier.fit``.
    y : array-like
        Targets, passed unchanged to ``fit``: a single label vector, or a
        label matrix with one column per task (multi-output).
        Labels equal to ``-1`` are NOT removed here, so the forest is fitted
        on them like any other class value.
    n_estimators : int, default 500
        Number of trees (the value that used to be hard-coded).
    n_jobs : int, default 64
        Number of parallel workers (the value that used to be hard-coded).
        Pass ``-1`` to use every available core on smaller machines.
    random_state : int or None, default None
        Seed for the forest. ``None`` keeps the original unseeded behaviour;
        pass an integer to make repeated runs reproducible.

    Returns
    -------
    RandomForestClassifier
        The fitted model.
    """
    model = RandomForestClassifier(
        class_weight="balanced",
        n_estimators=n_estimators,
        n_jobs=n_jobs,
        random_state=random_state,
    )
    model.fit(x, y)
    return model

def get_clear_data(preds, labels):
    """Drop every (prediction, label) pair whose label equals ``-1``.

    ``preds`` and ``labels`` are walked in lockstep; pairs with any other
    label are kept in their original order.

    Returns
    -------
    tuple of np.ndarray
        ``(kept_predictions, kept_labels)``.
    """
    kept_pairs = [(pred, lab) for pred, lab in zip(preds, labels) if lab != -1]
    kept_preds = [pred for pred, _ in kept_pairs]
    kept_labels = [lab for _, lab in kept_pairs]
    return np.array(kept_preds), np.array(kept_labels)

def get_score(model, x, y):
    """Return the ROC-AUC of a single-task ``model`` on ``x`` / ``y``.

    Rows whose label is ``-1`` are removed (via ``get_clear_data``) before
    the score is computed.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba`` and ``classes_``.
    x : feature matrix to score.
    y : label vector aligned with the rows of ``x``.

    Returns
    -------
    float
        ROC-AUC of the predicted probability of class ``1``.

    Raises
    ------
    ValueError
        If the model never saw class ``1`` during training.
    """
    pred = model.predict_proba(x)
    # predict_proba columns follow model.classes_. A fixed column index of 2 is
    # only the positive class when the training labels were exactly
    # {-1, 0, 1}; looking the column up also works when a task has no -1
    # labels in the training split.
    classes = list(model.classes_)
    if 1 not in classes:
        raise ValueError('model was not trained with positive class 1; classes_=%r' % (classes,))
    positive_col = classes.index(1)
    p, l = get_clear_data(pred[:, positive_col], y)
    score = roc_auc_score(l, p)
    return score

def get_scores(model, x, y):
    """Return one ROC-AUC per task for a multi-output ``model``.

    For a multi-output forest ``predict_proba`` returns one probability
    matrix per output column. Each is scored against the matching column of
    ``y`` after rows labelled ``-1`` are removed (via ``get_clear_data``).

    Parameters
    ----------
    model : fitted multi-output classifier exposing ``predict_proba`` and a
        per-output ``classes_`` list.
    x : feature matrix to score.
    y : 2-D label array; column ``i`` holds the labels for output ``i``.

    Returns
    -------
    list of float
        ROC-AUC values, in output order.

    Raises
    ------
    ValueError
        If some output never saw class ``1`` during training.
    """
    pred = model.predict_proba(x)
    scores = []
    for i, (task_proba, task_classes) in enumerate(zip(pred, model.classes_)):
        # Columns of each probability matrix follow that output's classes_.
        # A fixed column index of 2 is only the positive class when the output
        # was trained on exactly {-1, 0, 1}, so look the column up instead.
        classes = list(task_classes)
        if 1 not in classes:
            raise ValueError('output %d was not trained with positive class 1; classes_=%r' % (i, classes))
        positive_col = classes.index(1)
        p, l = get_clear_data(task_proba[:, positive_col], y[:, i])
        scores.append(roc_auc_score(l, p))
    return scores

def print_scores(scores, tasks):
    """Print ``<task> <score>`` for every task, then the mean score.

    ``scores[i]`` is reported under ``tasks[i]``; the mean is taken over
    ``len(tasks)`` entries.
    """
    running_total = 0
    for position, task_name in enumerate(tasks):
        task_score = scores[position]
        running_total = running_total + task_score
        print(task_name, task_score)
    print('mean', running_total / len(tasks))

In [3]:
# ECFP single
# One independent forest per task, fitted on ecfp['train'] and scored on
# every split with the same label column.
train_scores, valid_scores, test_scores = [], [], []
scores_by_split = (
    ('train', train_scores),
    ('valid', valid_scores),
    ('test', test_scores),
)
for task_idx in range(len(tasks)):
    model = training_random_forest(ecfp['train'], label['train'][:, task_idx])
    for split, bucket in scores_by_split:
        bucket.append(get_score(model, ecfp[split], label[split][:, task_idx]))

# Report train, then valid, then test.
for _, bucket in scores_by_split:
    print_scores(bucket, tasks)

NR-AR 0.9998508336188192
NR-AR-LBD 0.9999824118270692
NR-AhR 0.9999932033090583
NR-Aromatase 0.9997313601208402
NR-ER 0.9981555959121228
NR-ER-LBD 0.9992716834313912
NR-PPAR-gamma 0.9999604245817503
SR-ARE 0.9998767214262138
SR-ATAD5 0.999979993130975
SR-HSE 0.9991726669855345
SR-MMP 0.9997584463782954
SR-p53 0.9999933099851817
mean 0.9996438875589377
NR-AR 0.6575431034482759
NR-AR-LBD 0.7717125382262997
NR-AhR 0.8566856967307545
NR-Aromatase 0.77350921111753
NR-ER 0.6835322483423749
NR-ER-LBD 0.8222523744911805
NR-PPAR-gamma 0.7853719762665449
SR-ARE 0.7716013824884793
SR-ATAD5 0.9319733050204763
SR-HSE 0.7652088589851416
SR-MMP 0.8457722568971477
SR-p53 0.8128580341444047
mean 0.7898350821798842
NR-AR 0.8606878306878306
NR-AR-LBD 0.9111235119047619
NR-AhR 0.8781826360773729
NR-Aromatase 0.7826697892271663
NR-ER 0.7215184266477674
NR-ER-LBD 0.8482063144225307
NR-PPAR-gamma 0.8015623831950363
SR-ARE 0.8070589539007094
SR-ATAD5 0.8588739067055393
SR-HSE 0.7597311956608438
SR-MMP 0.91312

In [4]:
# ECFP multi
# A single multi-output forest fitted on the full label matrix.
model = training_random_forest(ecfp['train'], label['train'])

# Score every split first, then print them in train / valid / test order.
scores_by_split = {
    split: get_scores(model, ecfp[split], label[split])
    for split in ('train', 'valid', 'test')
}
train_scores = scores_by_split['train']
valid_scores = scores_by_split['valid']
test_scores = scores_by_split['test']

for split_scores in (train_scores, valid_scores, test_scores):
    print_scores(split_scores, tasks)

NR-AR 0.9998008735868448
NR-AR-LBD 0.9999501668433627
NR-AhR 0.9999585044131987
NR-Aromatase 0.9995525855393351
NR-ER 0.9968928167221092
NR-ER-LBD 0.9987782311634086
NR-PPAR-gamma 0.9999340409695838
SR-ARE 0.9995410192884572
SR-ATAD5 0.9999783258918895
SR-HSE 0.9989617514966495
SR-MMP 0.9997168543639622
SR-p53 0.9999620899160291
mean 0.9994189383495691
NR-AR 0.7062260536398468
NR-AR-LBD 0.8119648318042814
NR-AhR 0.858388162105425
NR-Aromatase 0.7818858720071892
NR-ER 0.6844002411091017
NR-ER-LBD 0.8409769335142471
NR-PPAR-gamma 0.808839190628328
SR-ARE 0.776248952660243
SR-ATAD5 0.9598437736993781
SR-HSE 0.7629941126997477
SR-MMP 0.8354544900075224
SR-p53 0.8044353695195962
mean 0.802638165282909
NR-AR 0.8526719576719577
NR-AR-LBD 0.9159970238095237
NR-AhR 0.8826360773729195
NR-Aromatase 0.7859744990892532
NR-ER 0.7096806343019134
NR-ER-LBD 0.8606647187728268
NR-PPAR-gamma 0.8236899155266502
SR-ARE 0.7940159574468086
SR-ATAD5 0.8625485908649174
SR-HSE 0.7379556512722341
SR-MMP 0.910890

In [5]:
# Embed single
# One independent forest per task, fitted on multi['train'] and scored on
# every split with the same label column.
train_scores, valid_scores, test_scores = [], [], []
scores_by_split = (
    ('train', train_scores),
    ('valid', valid_scores),
    ('test', test_scores),
)
for task_idx in range(len(tasks)):
    model = training_random_forest(multi['train'], label['train'][:, task_idx])
    for split, bucket in scores_by_split:
        bucket.append(get_score(model, multi[split], label[split][:, task_idx]))

# Report train, then valid, then test.
for _, bucket in scores_by_split:
    print_scores(bucket, tasks)

NR-AR 0.9999889374214914
NR-AR-LBD 0.999961403731624
NR-AhR 0.9999991056985603
NR-Aromatase 1.0
NR-ER 0.9999467590923943
NR-ER-LBD 0.9999708537808747
NR-PPAR-gamma 0.9999701448599169
SR-ARE 0.9999995029089767
SR-ATAD5 0.9999870788970879
SR-HSE 0.9999707439805741
SR-MMP 0.9999959118960271
SR-p53 1.0
mean 0.9999825368556273
NR-AR 0.7082854406130269
NR-AR-LBD 0.8581804281345565
NR-AhR 0.8638155705179079
NR-Aromatase 0.8152962321073238
NR-ER 0.6602772754671488
NR-ER-LBD 0.8231795567616463
NR-PPAR-gamma 0.7341016278715959
SR-ARE 0.805705383326351
SR-ATAD5 0.9173744880934324
SR-HSE 0.8059153350154191
SR-MMP 0.8943420009352065
SR-p53 0.8303556236174919
mean 0.809735746871759
NR-AR 0.8805291005291005
NR-AR-LBD 0.9383184523809525
NR-AhR 0.8963202878992352
NR-Aromatase 0.8442622950819673
NR-ER 0.7573861622962439
NR-ER-LBD 0.8284027270513756
NR-PPAR-gamma 0.9082006428945205
SR-ARE 0.8312610815602838
SR-ATAD5 0.878188775510204
SR-HSE 0.8154462790141181
SR-MMP 0.9496025338467271
SR-p53 0.8992906195

In [6]:
# Embed multi
# A single multi-output forest fitted on the full label matrix.
model = training_random_forest(multi['train'], label['train'])

# Score every split first, then print them in train / valid / test order.
scores_by_split = {
    split: get_scores(model, multi[split], label[split])
    for split in ('train', 'valid', 'test')
}
train_scores = scores_by_split['train']
valid_scores = scores_by_split['valid']
test_scores = scores_by_split['test']

for split_scores in (train_scores, valid_scores, test_scores):
    print_scores(split_scores, tasks)

NR-AR 0.999963957405504
NR-AR-LBD 0.9999721520595262
NR-AhR 0.9999658376850041
NR-Aromatase 1.0
NR-ER 0.999860402706175
NR-ER-LBD 0.9999457744760458
NR-PPAR-gamma 0.9999770879157501
SR-ARE 0.9999607298091568
SR-ATAD5 0.9999837444189171
SR-HSE 0.9999625795100365
SR-MMP 0.9999998222563491
SR-p53 1.0
mean 0.9999660073535388
NR-AR 0.7336206896551724
NR-AR-LBD 0.8701070336391438
NR-AhR 0.861037863853972
NR-Aromatase 0.7975158867706528
NR-ER 0.6586618444846293
NR-ER-LBD 0.839981908638625
NR-PPAR-gamma 0.7704244637151986
SR-ARE 0.7897596355257646
SR-ATAD5 0.9173365690884271
SR-HSE 0.8223156714325763
SR-MMP 0.8964868766137394
SR-p53 0.8343542623787646
mean 0.8159668921497222
NR-AR 0.8947354497354496
NR-AR-LBD 0.927938988095238
NR-AhR 0.8891677912730545
NR-Aromatase 0.8475670049440541
NR-ER 0.750775159461375
NR-ER-LBD 0.853441279116955
NR-PPAR-gamma 0.922740524781341
SR-ARE 0.8313497340425533
SR-ATAD5 0.9012998056365403
SR-HSE 0.8103015075376884
SR-MMP 0.9488365834471908
SR-p53 0.91651573466153

In [7]:
# Ends the interpreter/kernel session once all experiments above have run.
# NOTE(review): presumably meant to release the memory and worker processes
# held by the session - confirm this is intentional, since nothing placed
# after this cell will execute.
exit()