In [1]:
import numpy as np
import pandas as pd
from rdkit import Chem as chem
import deepchem as dc
from deepchem.models import GraphConvModel, WeaveModel, MPNNModel
from deepchem.data import DiskDataset
from matplotlib import pyplot as plt
from nl_03_filter_model_score import confuse
import deepchem.molnet 
import time
from sklearn.metrics import confusion_matrix

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
def np_2d_array(arr):
    # DC expects array (n x len(tasks)
    return np.reshape(arr,(arr.size, 1))

In [3]:
def dc_featurize(dc_feature, dfs):
    # Input from rest of script should be: [test, train, val]
    # Each should consist of a df with columns: [y, w, X]
    
    train_df = dfs[0]
    train_y = np_2d_array(np.array(train_df.y))
    train_w = np_2d_array(np.array(train_df.w))

    # List of rdkit objects?
    train_X = np.array(train_df.X)
    
    test_df = dfs[1]
    test_y = np_2d_array(np.array(test_df.y))
    test_w = np_2d_array(np.array(test_df.w))
    
    # List of rdkit objects?
    test_X = np.array(test_df.X)
    
    if dc_feature is 'GraphConv':
        feat = dc.feat.graph_features.ConvMolFeaturizer()
          
    elif dc_feature is 'Weave':
        feat = dc.feat.graph_features.WeaveFeaturizer(graph_distance=True, 
                                                      explicit_H=False)
    
    else:
        print('error: bad featurizer')
    
    train_X = feat.featurize(train_X, verbose=True, log_every_n=1000)
    test_X = feat.featurize(test_X, verbose=True, log_every_n=1000)

    # from_numpy(X, y, w=None, ids=None, tasks=None, data_dir=None, verbose=True)
    # Creates a DiskDataset object from specified Numpy arrays.
    
    train_ds = DiskDataset.from_numpy(train_X, train_y, w=train_w, verbose=True)
    test_ds = DiskDataset.from_numpy(test_X, test_y, w=test_w, verbose=True) 
    
    # https://deepchem.io/docs/notebooks/graph_convolutional_networks_for_tox21.html
    # Tasks?
    # https://deepchem.io/_modules/deepchem/trans/transformers.html
    # NormalizationTransformer(Transformer)
    #transformers = [deepchem.trans.BalancingTransformer(transform_w=True, 
    #                                                   dataset=dataset)]
   
    return [train_ds, test_ds]

In [4]:
def confuse(obs_y, theo_y):
    # Copy from confuse ipynb
    # Issue with confuse for nn
    # Classification metrics can't 
    # handle a mix of binary and unknown targets
    print(obs_y)
    print(theo_y)
    
    print(type(obs_y))
    print(type(theo_y))
    
    print(len(obs_y))
    print(len(theo_y))
    
    
    con = confusion_matrix(list(obs_y), list(theo_y))
    if con.shape == (1, 1):
        print('error!')

    elif con.shape == (2, 2):
        tn, fp, fn, tp = con.ravel()
        sens = tpr = tp / (tp + fn)
        spec = tnr = tn / (tn + fp)
        f1 = (2 * tp) / (2 * tp + fp + fn)
        acc = (tp + tn) / (tp + tn + fp + fn)
        # prec = tp / (tp + fp)

        return [acc, {'sens': sens, 'spec': spec, 'f1': f1,
                      'test_n': tn + fp + fn + tp, 'test_true': tp + fp,
                      'tp': tp, 'tn': tn, 'fp': fp, 'fn': fn
                      }]

    else:
        print('error!')

In [46]:
def dc_model_build(dc_model, dss):
    train_ds = dss[0]
    test_ds = dss[1]
    print(dc_model)
    
    if dc_model is 'GraphConvModel':
        #model = GraphConvModel(per_task_metrics=True, n_tasks=1, mode='classification', dropout=0.2)
        model = GraphConvModel(1, batch_size=50, mode='classification')
        model.fit(train_ds, nb_epoch=100)
   
    elif dc_model is 'WeaveModel':
        model = WeaveModel(n_tasks=1, mode='classification', dropout=0.2)
        model.fit(train_ds, nb_epoch=100)
        
    elif dc_model is 'MPNNModel':
        pass
    
    else:
        print('dc_model selection error: invalid choice')
        return None

    # https://deepchem.io/docs/_modules/deepchem/trans/transformers.html
    
    metric = dc.metrics.Metric(dc.metrics.accuracy_score)
    acc_train = model.evaluate(train_ds, [metric]) # transformers=[])
    
    print('acc_train')
    print(acc_train)    
    
    acc_test = model.evaluate(test_ds, [metric]) # transformers)
        
    print('acc_train')
    print(acc_train)

    # Predict_y output is weird, array of two numbers...
    predict_ds = model.predict_proba(test_ds) #, transformers=[])
    
    print('predict')
    print(predict_ds.shape)
    print(type(predict_ds))
    
    return predict_ds
    
    #result_dict = confuse(test_ds.y, predict_ds)[1]
    #result_dict['acc_train'] = acc_train
    #result_dict['acc_test'] = acc_test

    #return result_dict

In [28]:
# Load X's if running machine learning
join_df_path = 'all_public_output_02.pickle' 
join_df = pd.read_pickle(join_df_path)  # 0.5 Gb

xyw_df = join_df[['n_loss_wparent_H2O', 'weight', 'Molecule']].copy(deep=True)
xyw_df = xyw_df.rename(columns={'Molecule': 'X', 'n_loss_wparent_H2O': 'y', 'weight': 'w'}, inplace=False)

# Trim to small set, 300 rows, to check functionality.
# . Do not use this split for real evaluations.
test_df = xyw_df.iloc[0:10,:].copy(deep=True)
train_df = xyw_df.iloc[11:20,:].copy(deep=True)
val_df = train_df = xyw_df.iloc[21:30,:].copy(deep=True)
dfs = [test_df, train_df, val_df]

In [29]:
# Parameters
dc_features_models = [('GraphConvModel', 'GraphConv'), 
                   ('WeaveModel', 'Weave'), 
                   ('MPNNModel','Weave') ]
dc_feature_model = dc_features_models[0]
dc_model = dc_feature_model[0]
dc_feature = dc_feature_model[1]

In [30]:
# Featurization:
dc_datasets = dc_featurize(dc_feature, dfs)

TIMING: dataset construction took 0.011 s
Loading dataset from disk.
TIMING: dataset construction took 0.009 s
Loading dataset from disk.


In [47]:
# Model building:
# It seems that most dc models only support regression???
dc_results = dc_model_build(dc_model, dc_datasets)

GraphConvModel




  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


computed_metrics: [1.0]
acc_train
{'accuracy_score': 1.0}
computed_metrics: [0.4444444444444444]
acc_train
{'accuracy_score': 1.0}


AttributeError: 'GraphConvModel' object has no attribute 'predict_proba'

In [38]:
np.squeeze(dc_results)

array([[0.00355048, 0.9964495 ],
       [0.08991361, 0.91008633],
       [0.4734411 , 0.52655894],
       [0.9513122 , 0.04868777],
       [0.4421962 , 0.55780375],
       [0.41332096, 0.58667904],
       [0.46275672, 0.5372433 ],
       [0.46619338, 0.53380656],
       [0.429655  , 0.5703451 ]], dtype=float32)

In [40]:
test_ds = dc_datasets[1]

In [45]:
test_ds.y

array([[False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False]])

"test_ds" is a (300, 3) DataFrame.  Fitting dc mdoels wants this?  But predict doesn't
--> should be test_ds.X or ds.y?
predict_y is a (300, 1, 2) arr --> should be t or f?

In [14]:
dc_results

NameError: name 'dc_results' is not defined

In [5]:
# Example dataset load
tox21 = deepchem.molnet.load_tox21()

Loading raw samples now.
shard_size: 8192
About to start loading CSV from /var/folders/7c/88zzqp7j4c36cg81sxc4x_yc0000gn/T/tox21.csv.gz
Loading shard 1 of size 8192.
Featurizing sample 0
Featurizing sample 1000
Featurizing sample 2000
Featurizing sample 3000
Featurizing sample 4000
Featurizing sample 5000
Featurizing sample 6000
Featurizing sample 7000
TIMING: featurizing shard 0 took 7.318 s
TIMING: dataset construction took 7.502 s
Loading dataset from disk.
TIMING: dataset construction took 0.262 s
Loading dataset from disk.
TIMING: dataset construction took 0.140 s
Loading dataset from disk.
TIMING: dataset construction took 0.143 s
Loading dataset from disk.
TIMING: dataset construction took 0.208 s
Loading dataset from disk.
TIMING: dataset construction took 0.033 s
Loading dataset from disk.
TIMING: dataset construction took 0.030 s
Loading dataset from disk.


In [6]:
# Example dataset:
tox21

(['NR-AR',
  'NR-AR-LBD',
  'NR-AhR',
  'NR-Aromatase',
  'NR-ER',
  'NR-ER-LBD',
  'NR-PPAR-gamma',
  'SR-ARE',
  'SR-ATAD5',
  'SR-HSE',
  'SR-MMP',
  'SR-p53'],
 (<deepchem.data.datasets.DiskDataset at 0x1a448aefd0>,
  <deepchem.data.datasets.DiskDataset at 0x1a448aeb00>,
  <deepchem.data.datasets.DiskDataset at 0x1a42b91b00>),
 [<deepchem.trans.transformers.BalancingTransformer at 0x1a42b91f28>])

In [67]:
x, w, z = dc.molnet.load_tox21(featurizer='GraphConv')

Loading dataset from disk.
Loading dataset from disk.
Loading dataset from disk.


In [None]:
# Overall object is a tuple of length 3: tasks, datasets, transformers

    # tasks is a list? ['Task']
    # datasets is a tuple of deepchem.data.datasets.DiskDataset
    # transformer is a list with: <deepchem.trans.transformers.BalancingTransformer



In [65]:
type(t21_datasets)

tuple