In [2]:
from skfp.datasets.moleculenet import load_bace, load_bbbp, load_clintox, load_muv, load_pcba, load_tox21, load_toxcast
from skfp.model_selection import scaffold_train_test_split, butina_train_test_split#, maxmin_train_test_split, randomized_scaffold_train_test_split
from skfp.fingerprints import ECFPFingerprint, MACCSFingerprint, RDKitFingerprint
from skfp.preprocessing import MolFromSmilesTransformer
from skfp.filters import BeyondRo5Filter, BMSFilter, BrenkFilter, FAF4DruglikeFilter, FAF4LeadlikeFilter, GhoseFilter, GlaxoFilter, GSKFilter
from skfp.filters import HaoFilter, InpharmaticaFilter, LINTFilter, LipinskiFilter, MLSMRFilter, MolecularWeightFilter, NIBRFilter, NIHFilter
from skfp.filters import OpreaFilter, PAINSFilter, PfizerFilter, REOSFilter, RuleOfFourFilter, RuleOfThreeFilter, RuleOfTwoFilter, RuleOfVeberFilter
from skfp.filters import RuleOfXuFilter, SureChEMBLFilter, TiceHerbicidesFilter, TiceInsecticidesFilter, ValenceDiscoveryFilter, ZINCBasicFilter, ZINCDruglikeFilter

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline, make_union
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, confusion_matrix, r2_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn import tree
from sklearn.neural_network import MLPClassifier

import numpy as np

from deepchem.feat.smiles_tokenizer import SmilesTokenizer

  from .autonotebook import tqdm as notebook_tqdm
2024-12-09 11:34:14.202643: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-12-09 11:34:14.309667: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-12-09 11:34:14.375081: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1733740454.548801    5637 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1733740454.611885    5637 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-09 11:34:14.917756: I tensorflow/core/platform/cpu_feature_guard.cc:210] This Tenso

Instructions for updating:
experimental_relax_shapes is deprecated, use reduce_retracing instead


In [3]:
# SMILEs tokenizer
#tokenizer = SmilesTokenizer("./vocab.txt")
#print(tokenizer.encode("CC(=O)OC1=CC=CC=C1C(=O)O"))

In [4]:

# model for classification
models = [
    LogisticRegression(),
    KNeighborsClassifier(),
    GaussianNB(),
    tree.DecisionTreeClassifier(),
    RandomForestClassifier(),
    GradientBoostingClassifier(),
    MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1),
    svm.SVC(probability=True),
    # svm.SVR(kernel='linear'),
    # svm.SVR(kernel='poly'),
    # svm.SVR(kernel='rbf'),
    # svm.SVR(kernel='sigmoid'),
]

In [5]:
fingerprints = [
    ECFPFingerprint(),
    MACCSFingerprint(),
    RDKitFingerprint(),
    BeyondRo5Filter(),
]

filters = [
    BeyondRo5Filter, BMSFilter, BrenkFilter, FAF4DruglikeFilter, FAF4LeadlikeFilter, GhoseFilter, GlaxoFilter, GSKFilter,
    HaoFilter, InpharmaticaFilter, LINTFilter, LipinskiFilter, MLSMRFilter, MolecularWeightFilter, NIBRFilter, NIHFilter,
    OpreaFilter, PAINSFilter, PfizerFilter, REOSFilter, RuleOfFourFilter, RuleOfThreeFilter, RuleOfTwoFilter, RuleOfVeberFilter,
    RuleOfXuFilter, SureChEMBLFilter, TiceHerbicidesFilter, TiceInsecticidesFilter, ValenceDiscoveryFilter, ZINCBasicFilter, ZINCDruglikeFilter
]

filter_names = []

for filter in filters:
    filter_names.append(filter.__name__)

In [6]:
pipelines = {
    "baseline_pipeline": make_pipeline(MolFromSmilesTransformer(), make_union(ECFPFingerprint(count=True), MACCSFingerprint()), LogisticRegression(max_iter=1000)),
    "rf_pipeline": make_pipeline(MolFromSmilesTransformer(), make_union(ECFPFingerprint(count=True), MACCSFingerprint(), RDKitFingerprint()), RandomForestClassifier(random_state=41, class_weight='balanced')),
    "gb_pipeline": make_pipeline(MolFromSmilesTransformer(), make_union(ECFPFingerprint(count=True), MACCSFingerprint(), RDKitFingerprint()), GradientBoostingClassifier()),
    "knc_pipeline": make_pipeline(MolFromSmilesTransformer(), make_union(ECFPFingerprint(count=True), MACCSFingerprint(), RDKitFingerprint()), models[1]),
    "svm_pipeline": make_pipeline(MolFromSmilesTransformer(), make_union(ECFPFingerprint(count=True), MACCSFingerprint(), RDKitFingerprint()), models[7]),
    "GNB_pipeline": make_pipeline(MolFromSmilesTransformer(), make_union(ECFPFingerprint(count=True), MACCSFingerprint(), RDKitFingerprint()), models[2]),
    "mlp_pipeline": make_pipeline(MolFromSmilesTransformer(), make_union(ECFPFingerprint(count=True), MACCSFingerprint(), RDKitFingerprint()), models[6]),
}

In [None]:
def calculate_scores(y_test, y_pred, y_proba):
    r2score = r2_score(y_test, y_pred)
    print(f"R2 score for Linear regression model: {r2score:.4f}")

    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy for Linear regression model: {accuracy:.4f}")

    roc_auc = roc_auc_score(y_test, y_proba[:, 1])
    print(f"ROC-AUC score for Linear regression model: {roc_auc:.4f}")
    return roc_auc

## Applied filters

In [19]:
smiles_bace, labels_bace = load_bace()
smiles_bace_train, smiles_bace_test, y_train, y_test = scaffold_train_test_split(
    smiles_bace, labels_bace, test_size=0.2
)

pipelines['baseline_pipeline'].fit(smiles_bace_train, y_train)
y_pred = pipelines['baseline_pipeline'].predict(smiles_bace_test)
y_proba = pipelines['baseline_pipeline'].predict_proba(smiles_bace_test)
print(len(smiles_bace_test))
bace_mols_mapper = dict() # smile -> label
for i in range(len(smiles_bace)):
    bace_mols_mapper[smiles_bace[i]] = labels_bace[i]

for idx, filter in enumerate(filters):
    if idx == 1:
        break

    filter_f = filter()
    filter_f.fit(smiles_bace, labels_bace)
    filtered_mols = filter_f.transform(smiles_bace)
    filtered_labels = [ bace_mols_mapper[mol] for mol in filtered_mols ]
    rest_mols = [ mol for mol in smiles_bace if mol not in filtered_mols ]
    rest_labels = [ bace_mols_mapper[mol] for mol in rest_mols ]

    filtered_mols_bace_train, filtered_mols_bace_test, y_train_filtered, y_test_filtered = scaffold_train_test_split(
        filtered_mols, filtered_labels, test_size=0.2
    )
    print(len(filtered_mols_bace_train) , len(filtered_mols_bace_test), len(y_train_filtered), len(y_test_filtered))

    pipelines['baseline_pipeline'].fit(filtered_mols_bace_train, y_train_filtered)
    y_pred_filtered = pipelines['baseline_pipeline'].predict(filtered_mols_bace_test)
    y_proba_filtered = pipelines['baseline_pipeline'].predict_proba(filtered_mols_bace_test)

    print("Baseline: ")
    print(len(y_test), len(y_pred), len(y_proba))
    roc_auc_baseline = calculate_scores(y_test, y_pred, y_proba)
    print("Filtered: ")
    print(len(y_test_filtered), len(y_pred_filtered), len(y_proba_filtered))
    roc_auc_filtered = calculate_scores(y_test_filtered, y_pred_filtered, y_proba_filtered)

    pipelines['baseline_pipeline'].fit(filtered_mols_bace_train, y_train_filtered)
    y_pred_mixed= pipelines['baseline_pipeline'].predict(smiles_bace_test)
    y_proba_mixed = pipelines['baseline_pipeline'].predict_proba(smiles_bace_test)

    print("Mixed: ")
    roc_auc_mixed = calculate_scores(y_test, y_pred_mixed, y_proba_mixed)

    pipelines['baseline_pipeline'].fit(filtered_mols_bace_train, y_train_filtered)
    y_pred_rest = pipelines['baseline_pipeline'].predict(rest_mols)
    y_proba_rest = pipelines['baseline_pipeline'].predict_proba(rest_mols)

    print("Rest: ")
    roc_auc_mixed = calculate_scores(y_test, y_pred_rest, y_proba_rest)
    

303
1198 300 1198 300
Baseline: 
303 303 303
R2 score for Linear regression model: -0.1929
Accuracy for Linear regression model: 0.7096
ROC-AUC score for Linear regression model: 0.7980
Filtered: 
300 300 300
R2 score for Linear regression model: -0.1364
Accuracy for Linear regression model: 0.7200
ROC-AUC score for Linear regression model: 0.8042
Mixed: 
R2 score for Linear regression model: -0.1929
Accuracy for Linear regression model: 0.7096
ROC-AUC score for Linear regression model: 0.7951
Rest: 


ValueError: Found input variables with inconsistent numbers of samples: [303, 15]

## Classification with the Logistic regression and RFC based on the clintox dataset; wihout filtering molecules

### Bace dataset split

In [7]:
smiles_bace, labels_bace = load_bace()

In [8]:
print(len(smiles_bace))
print(smiles_bace[0], labels_bace[0])
print(smiles_bace[1], labels_bace[1])

1513
O1CC[C@@H](NC(=O)[C@@H](Cc2cc3cc(ccc3nc2N)-c2ccccc2C)C)CC1(C)C 1
Fc1cc(cc(F)c1)C[C@H](NC(=O)[C@@H](N1CC[C@](NC(=O)C)(CC(C)C)C1=O)CCc1ccccc1)[C@H](O)[C@@H]1[NH2+]C[C@H](OCCC)C1 1


In [14]:
bace_mols_mapper = dict() # smile -> label
for i in range(len(smiles_bace)):
    bace_mols_mapper[smiles_bace[i]] = labels_bace[i]

In [None]:
filter = MLSMRFilter()

filter.fit(smiles_bace, labels_bace)
# filtered_mols = filter.transform(smiles_bace)
filtered_mols = filter.fit_transform(smiles_bace, labels_bace)
print(len(filtered_mols), len(labels_bace))
print(filtered_mols[0], labels_bace[0])

395 1513
O1CC[C@@H](NC(=O)[C@@H](Cc2cc3cc(ccc3nc2N)-c2ccccc2C)C)CC1(C)C 1


In [16]:
filtered_labels = [ bace_mols_mapper[mol] for mol in filtered_mols ]
print(len(filtered_labels))

395


In [17]:
smiles_bace_train, smiles_bace_test, y_train, y_test = scaffold_train_test_split(
    filtered_mols, filtered_labels, test_size=0.2
)

### Checking the class distibution of the dataset 

In [11]:
print(f"Class distribution: {np.bincount(labels_bace)}")

Class distribution: [822 691]


## Baseline with Logistic regression model

In [18]:
pipelines['baseline_pipeline'].fit(smiles_bace_train, y_train)
y_pred = pipelines['baseline_pipeline'].predict(smiles_bace_test)
y_proba = pipelines['baseline_pipeline'].predict_proba(smiles_bace_test) 

In [20]:
calculate_scores(y_test, y_pred, y_proba)

R2 score for Linear regression model: -0.3657
Accuracy for Linear regression model: 0.6709
ROC-AUC score for Linear regression model: 0.7912


In [22]:
from skfp.datasets.moleculenet import load_bace
from skfp.fingerprints import ECFPFingerprint
from skfp.model_selection import FingerprintEstimatorGridSearch
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

fp = ECFPFingerprint(n_jobs=-1)
fp_params = {"radius": [2, 3]}
clf = RandomForestClassifier(n_jobs=-1)
clf_params = {"min_samples_split": [2, 3, 4]}
clf_cv = GridSearchCV(clf, clf_params)
fp_cv = FingerprintEstimatorGridSearch(fp, fp_params, clf_cv)

In [23]:
fp_cv.fit(smiles_bace_train, y_train)
y_pred = fp_cv.predict(smiles_bace_test)
y_proba = fp_cv.predict_proba(smiles_bace_test) 

In [24]:
calculate_scores(y_test, y_pred, y_proba)

R2 score for Linear regression model: -1.4807
Accuracy for Linear regression model: 0.3960
ROC-AUC score for Linear regression model: 0.4112


## RF classifier

In [25]:
pipelines['rf_pipeline'].fit(smiles_bace_train, y_train)
y_pred = pipelines['rf_pipeline'].predict(smiles_bace_test)
y_proba = pipelines['rf_pipeline'].predict_proba(smiles_bace_test)  

In [26]:
calculate_scores(y_test, y_pred, y_proba)

R2 score for Linear regression model: -1.4943
Accuracy for Linear regression model: 0.3927
ROC-AUC score for Linear regression model: 0.3990


In [66]:
pipelines['gb_pipeline'].fit(smiles_bace_train, y_train)
y_pred = pipelines['gb_pipeline'].predict(smiles_bace_test)
y_proba = pipelines['gb_pipeline'].predict_proba(smiles_bace_test)
calculate_scores(y_test, y_pred, y_proba)

R2 score for Linear regression model: -1.3316
Accuracy for Linear regression model: 0.4323
ROC-AUC score for Linear regression model: 0.4265


In [67]:
pipelines['knc_pipeline'].fit(smiles_bace_train, y_train)
y_pred = pipelines['knc_pipeline'].predict(smiles_bace_test)
y_proba = pipelines['knc_pipeline'].predict_proba(smiles_bace_test)
calculate_scores(y_test, y_pred, y_proba)

R2 score for Linear regression model: -1.4265
Accuracy for Linear regression model: 0.4092
ROC-AUC score for Linear regression model: 0.4389


In [68]:
pipelines['mlp_pipeline'].fit(smiles_bace_train, y_train)
y_pred = pipelines['mlp_pipeline'].predict(smiles_bace_test)
y_proba = pipelines['mlp_pipeline'].predict_proba(smiles_bace_test)
calculate_scores(y_test, y_pred, y_proba)

R2 score for Linear regression model: -1.3858
Accuracy for Linear regression model: 0.4191
ROC-AUC score for Linear regression model: 0.5000


In [69]:
pipelines['GNB_pipeline'].fit(smiles_bace_train, y_train)
y_pred = pipelines['GNB_pipeline'].predict(smiles_bace_test)
y_proba = pipelines['GNB_pipeline'].predict_proba(smiles_bace_test)
calculate_scores(y_test, y_pred, y_proba)

R2 score for Linear regression model: -1.0334
Accuracy for Linear regression model: 0.5050
ROC-AUC score for Linear regression model: 0.4974


In [70]:
pipelines['svm_pipeline'].fit(smiles_bace_train, y_train)
y_pred = pipelines['svm_pipeline'].predict(smiles_bace_test)
y_proba = pipelines['svm_pipeline'].predict_proba(smiles_bace_test)
calculate_scores(y_test, y_pred, y_proba)

R2 score for Linear regression model: -1.4265
Accuracy for Linear regression model: 0.4092
ROC-AUC score for Linear regression model: 0.4565


In [45]:
# filtering molecules from the test dataset 

from skfp.filters import PfizerFilter

filt = PfizerFilter(allow_one_violation=True)

In [46]:
smiles_to_label = dict(zip(smiles_train, y_train))

In [47]:
filtered_smiles = filt.transform(smiles_train)
filtered_labels = [smiles_to_label[smi] for smi in filtered_smiles]

In [None]:
pipelines['rf_pipeline'].fit(filtered_smiles, filtered_labels)

In [None]:
y_pred = pipelines['rf_pipeline'].predict(smiles_test)
y_proba = pipelines['rf_pipeline'].predict_proba(smiles_test)

In [None]:
calculate_scores(y_test, y_pred, y_proba)

R2 score for RF classifier: -0.4234
Accuracy for RF classifier: 0.6535
ROC-AUC score for RF classifier: 0.7345


## CLINTOX

In [78]:
smiles, labels = load_clintox()

In [79]:
print(len(smiles))
print(smiles[0], labels[0])
print(smiles[1], labels[1])

1477
[C@@H]1([C@@H]([C@@H]([C@H]([C@@H]([C@@H]1Cl)Cl)Cl)Cl)Cl)Cl [1 0]
[C@H]([C@@H]([C@@H](C(=O)[O-])O)O)([C@H](C(=O)[O-])O)O [1 0]


In [None]:
pipelines['baseline_pipeline'].fit(smiles_train, y_train)
y_pred = pipelines['baseline_pipeline'].predict(smiles_test)
y_proba = pipelines['baseline_pipeline'].predict_proba(smiles_test) 

## BBBP

In [28]:
smiles_bbbp, labels_bbbp = load_bbbp()

In [29]:
print(len(smiles_bbbp))
print(smiles_bbbp[0], labels_bbbp[0])
print(smiles_bbbp[1], labels_bbbp[1])

2039
[Cl].CC(C)NCC(O)COc1cccc2ccccc12 1
C(=O)(OC(C)(C)C)CCCc1ccc(cc1)N(CCCl)CCCl 1


In [30]:
smiles_bbbp_train, smiles_bbbp_test, y_train, y_test = scaffold_train_test_split(
    smiles_bbbp, labels_bbbp, test_size=0.2
)



In [31]:
print(f"Class distribution: {np.bincount(labels_bbbp)}")

Class distribution: [ 479 1560]


In [32]:
pipelines['baseline_pipeline'].fit(smiles_bbbp_train, y_train)
y_pred = pipelines['baseline_pipeline'].predict(smiles_bbbp_test)
y_proba = pipelines['baseline_pipeline'].predict_proba(smiles_bbbp_test) 



In [None]:
calculate_scores(y_test, y_pred, y_proba)

R2 score for Linear regression model: -0.9813
Accuracy for Linear regression model: 0.5074
ROC-AUC score for Linear regression model: 0.3868


In [36]:
pipelines['rf_pipeline'].fit(smiles_bbbp_train, y_train)
y_pred = pipelines['rf_pipeline'].predict(smiles_bbbp_test)
y_proba = pipelines['rf_pipeline'].predict_proba(smiles_bbbp_test) 



In [37]:
calculate_scores(y_test, y_pred, y_proba)

R2 score for Linear regression model: -0.8630
Accuracy for Linear regression model: 0.5368
ROC-AUC score for Linear regression model: 0.4356


In [38]:
pipelines['gb_pipeline'].fit(smiles_bbbp_train, y_train)
y_pred = pipelines['gb_pipeline'].predict(smiles_bbbp_test)
y_proba = pipelines['gb_pipeline'].predict_proba(smiles_bbbp_test) 



In [39]:
calculate_scores(y_test, y_pred, y_proba)

R2 score for Linear regression model: -0.8630
Accuracy for Linear regression model: 0.5368
ROC-AUC score for Linear regression model: 0.3970


In [43]:
pipelines['knc_pipeline'].fit(smiles_bbbp_train, y_train)
y_pred = pipelines['knc_pipeline'].predict(smiles_bbbp_test)
y_proba = pipelines['knc_pipeline'].predict_proba(smiles_bbbp_test) 



In [44]:
calculate_scores(y_test, y_pred, y_proba)

R2 score for Linear regression model: -0.8433
Accuracy for Linear regression model: 0.5417
ROC-AUC score for Linear regression model: 0.4331


In [61]:
pipelines['svm_pipeline'].fit(smiles_bbbp_train, y_train)
y_pred = pipelines['svm_pipeline'].predict(smiles_bbbp_test)
y_proba = pipelines['svm_pipeline'].predict_proba(smiles_bbbp_test) 



In [62]:
calculate_scores(y_test, y_pred, y_proba)

R2 score for Linear regression model: -0.8630
Accuracy for Linear regression model: 0.5368
ROC-AUC score for Linear regression model: 0.4393


In [52]:
pipelines['GNB_pipeline'].fit(smiles_bbbp_train, y_train)
y_pred = pipelines['GNB_pipeline'].predict(smiles_bbbp_test)
y_proba = pipelines['GNB_pipeline'].predict_proba(smiles_bbbp_test) 



In [53]:
calculate_scores(y_test, y_pred, y_proba)

R2 score for Linear regression model: -1.0799
Accuracy for Linear regression model: 0.4828
ROC-AUC score for Linear regression model: 0.4579


In [56]:
pipelines['mlp_pipeline'].fit(smiles_bbbp_train, y_train)
y_pred = pipelines['mlp_pipeline'].predict(smiles_bbbp_test)
y_proba = pipelines['mlp_pipeline'].predict_proba(smiles_bbbp_test) 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [57]:
calculate_scores(y_test, y_pred, y_proba)

R2 score for Linear regression model: -1.0306
Accuracy for Linear regression model: 0.4951
ROC-AUC score for Linear regression model: 0.4528


## ESOL

In [66]:
smiles, labels = load_esol()

In [67]:
print(len(smiles))
print(smiles[0], labels[0])
print(smiles[1], labels[1])

1128
OCC3OC(OCC2OC(OC(C#N)c1ccccc1)C(O)C(O)C2O)C(O)C(O)C3O  -0.77
Cc1occc1C(=O)Nc2ccccc2 -3.3


## LIPOP

In [68]:
smiles, labels = load_lipophilicity()

In [69]:
print(len(smiles))
print(smiles[0], labels[0])
print(smiles[1], labels[1])

4200
Cn1c(CN2CCN(CC2)c3ccc(Cl)cc3)nc4ccccc14 3.54
COc1cc(OC)c(cc1NC(=O)CSCC(=O)O)S(=O)(=O)N2C(C)CCc3ccccc23 -1.18


## MUV

In [70]:
smiles, labels = load_muv()

In [71]:
print(len(smiles))
print(smiles[0], labels[0])
print(smiles[1], labels[1])

93087
Cc1cccc(N2CCN(C(=O)C34CC5CC(CC(C5)C3)C4)CC2)c1C [nan nan nan nan nan nan nan  0. nan nan nan  0. nan nan nan nan nan]
Cn1ccnc1SCC(=O)Nc1ccc(Oc2ccccc2)cc1 [ 0.  0. nan nan  0.  0.  0. nan nan nan  0. nan  0. nan nan  0.  0.]


## PCBA

In [72]:
smiles, labels = load_pcba()

In [73]:
print(len(smiles))
print(smiles[0], labels[0])
print(smiles[1], labels[1])

437929
CC(=O)N1CCC2(CC1)NC(=O)N(c1ccccc1)N2 [ 0.  0. nan  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. nan  0.
  0.  0. nan  0.  0.  0.  0.  0.  0.  0.  0.  0. nan  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0. nan  0.  0.  0. nan  0. nan  0.  0. nan  0.  0.  0.
  0.  0.  0.  0. nan  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. nan  0.  0.  0. nan  0.  0.
  0.  0.  1. nan  0.  0. nan  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan]
N#Cc1nnn(-c2ccc(Cl)cc2)c1N [ 0.  0. nan  0.  0. nan  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. nan  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0. nan  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0. nan  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0. nan  0.  0.  0. na

## TOX21

In [74]:
smiles, labels = load_tox21()

In [75]:
print(len(smiles))
print(smiles[0], labels[0])
print(smiles[1], labels[1])

7831
CCOc1ccc2nc(S(N)(=O)=O)sc2c1 [ 0.  0.  1. nan nan  0.  0.  1.  0.  0.  0.  0.]
CCN1C(=O)NC(c2ccccc2)C1=O [ 0.  0.  0.  0.  0.  0.  0. nan  0. nan  0.  0.]


## TOXCAST

In [76]:
smiles, labels = load_toxcast()

In [77]:
print(len(smiles))
print(smiles[0], labels[0])
print(smiles[1], labels[1])

8576
[O-][N+](=O)C1=CC=C(Cl)C=C1 [ 0.  0. nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  