In [2]:
import deepchem as dc
from openpom.feat.graph_featurizer import GraphFeaturizer, GraphConvConstants
from openpom.utils.data_utils import get_class_imbalance_ratio, IterativeStratifiedSplitter
from openpom.models.mpnn_pom import MPNNPOMModel
from datetime import datetime
from tqdm import tqdm
import torch
import numpy as np
from sklearn.metrics import roc_auc_score

Skipped loading some Tensorflow models, missing a dependency. No module named 'tensorflow'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'torch_geometric'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. cannot import name 'DMPNN' from 'deepchem.models.torch_models' (/home/ary2260/miniconda3/envs/testenv/lib/python3.9/site-packages/deepchem/models/torch_models/__init__.py)
Skipped loading modules with pytorch-lightning dependency, missing a dependency. No module named 'pytorch_lightning'
Skipped loading some Jax models, missing a dependency. No module named 'jax'


In [3]:
# Descriptor labels (e.g. 'fruity', 'odorless'); each entry is one prediction
# task, and `n_tasks` is what sizes the model output in the cells below.
TASKS = [
    'alcoholic', 'aldehydic', 'alliaceous', 'almond', 'amber', 'animal',
    'anisic', 'apple', 'apricot', 'aromatic', 'balsamic', 'banana', 'beefy',
    'bergamot', 'berry', 'bitter', 'black currant', 'brandy', 'burnt',
    'buttery', 'cabbage', 'camphoreous', 'caramellic', 'cedar', 'celery',
    'chamomile', 'cheesy', 'cherry', 'chocolate', 'cinnamon', 'citrus', 'clean',
    'clove', 'cocoa', 'coconut', 'coffee', 'cognac', 'cooked', 'cooling',
    'cortex', 'coumarinic', 'creamy', 'cucumber', 'dairy', 'dry', 'earthy',
    'ethereal', 'fatty', 'fermented', 'fishy', 'floral', 'fresh', 'fruit skin',
    'fruity', 'garlic', 'gassy', 'geranium', 'grape', 'grapefruit', 'grassy',
    'green', 'hawthorn', 'hay', 'hazelnut', 'herbal', 'honey', 'hyacinth',
    'jasmin', 'juicy', 'ketonic', 'lactonic', 'lavender', 'leafy', 'leathery',
    'lemon', 'lily', 'malty', 'meaty', 'medicinal', 'melon', 'metallic',
    'milky', 'mint', 'muguet', 'mushroom', 'musk', 'musty', 'natural', 'nutty',
    'odorless', 'oily', 'onion', 'orange', 'orangeflower', 'orris', 'ozone',
    'peach', 'pear', 'phenolic', 'pine', 'pineapple', 'plum', 'popcorn',
    'potato', 'powdery', 'pungent', 'radish', 'raspberry', 'ripe', 'roasted',
    'rose', 'rummy', 'sandalwood', 'savory', 'sharp', 'smoky', 'soapy',
    'solvent', 'sour', 'spicy', 'strawberry', 'sulfurous', 'sweaty', 'sweet',
    'tea', 'terpenic', 'tobacco', 'tomato', 'tropical', 'vanilla', 'vegetable',
    'vetiver', 'violet', 'warm', 'waxy', 'weedy', 'winey', 'woody',
]

# Count once, then report the same number that later cells rely on.
n_tasks = len(TASKS)
print("No of tasks: ", n_tasks)

No of tasks:  138


save train and test splits

In [1]:
# # Uncomment and run this cell only if no splits have been saved under ./splits yet.

# # download curated dataset
# !wget https://raw.githubusercontent.com/ARY2260/openpom/main/openpom/data/curated_datasets/curated_GS_LF_merged_4983.csv

# # The curated dataset can also be found at `openpom/data/curated_datasets/curated_GS_LF_merged_4983.csv` in the repo.

# input_file = 'curated_GS_LF_merged_4983.csv' # or new downloaded file path

# # featurize the SMILES column and build the dataset with one label column per task

# featurizer = GraphFeaturizer()
# smiles_field = 'nonStereoSMILES'
# loader = dc.data.CSVLoader(tasks=TASKS,
#                    feature_field=smiles_field,
#                    featurizer=featurizer)
# dataset = loader.create_dataset(inputs=[input_file])
# n_tasks = len(dataset.tasks)

# # get train/test splits (frac_train=0.8) and write them to ./splits/train_data and ./splits/test_data
# splitter = IterativeStratifiedSplitter(order=2)
# train_dataset, test_dataset = splitter.train_test_split(dataset, frac_train=0.8, train_dir='./splits/train_data', test_dir='./splits/test_data')

# print("train_dataset: ", len(train_dataset))
# print("test_dataset: ", len(test_dataset))

load splits

In [4]:
# Re-open the two on-disk splits (train first, then test) from ./splits.
train_dataset, test_dataset = (
    dc.data.DiskDataset(split_dir)
    for split_dir in ('./splits/train_data', './splits/test_data')
)

# Report split sizes as a quick sanity check.
print("train_dataset: ", len(train_dataset))
print("test_dataset: ", len(test_dataset))

train_dataset:  3989
test_dataset:  994


set parameters

In [5]:
# Class-imbalance ratios computed from the training split; there must be
# exactly one entry per task because the model receives them alongside n_tasks.
train_ratios = get_class_imbalance_ratio(train_dataset)
assert len(train_ratios) == n_tasks

# Learning-rate schedule: exponential decay starting at 1e-3 with
# decay_rate=0.5 applied every `lr_decay_steps` steps (staircase=True).
# NOTE(review): 32 presumably is the number of batches per epoch
# (3989 samples / batch_size 128, rounded up) — confirm.
# The constant-rate alternative is kept for reference:
# learning_rate = 0.001
lr_decay_steps = 32*20
learning_rate = dc.models.optimizers.ExponentialDecay(
    initial_rate=0.001,
    decay_rate=0.5,
    decay_steps=lr_decay_steps,
    staircase=True,
)

# Metric object passed to `model.evaluate`; results are read back under the
# 'roc_auc_score' key.
metric = dc.metrics.Metric(dc.metrics.roc_auc_score)

In [7]:
# Optional: uncomment the lines below and run this cell (before training) if a detailed log is needed.
# It sets the root logging level to INFO.

# import logging

# logger = logging.getLogger(__name__)
# logging.basicConfig(level=logging.INFO)

In [6]:
# Training budget: number of epochs every ensemble member is fitted for.
nb_epoch = 62

# Ensemble size: number of models that are trained and later averaged.
n_models = 10

In [7]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU instead
# of failing on a hard-coded 'cuda' device.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'


def _new_ensemble_member(model_idx):
    """Build a fresh, untrained MPNNPOMModel for one ensemble member.

    Parameters
    ----------
    model_idx: int
        0-based index of the ensemble member. The model directory is
        `./ensemble_models/experiments_{model_idx+1}` (1-based on disk).

    Returns
    -------
    MPNNPOMModel
        A new model using the notebook-level `n_tasks`, `learning_rate`,
        `train_ratios` and `device_name`. Every member shares the same
        hyperparameters; only `model_dir` differs.
    """
    return MPNNPOMModel(n_tasks = n_tasks,
                        batch_size=128,
                        learning_rate=learning_rate,
                        class_imbalance_ratio = train_ratios,
                        loss_aggr_type = 'sum',
                        node_out_feats = 100,
                        edge_hidden_feats = 75,
                        edge_out_feats = 100,
                        num_step_message_passing = 5,
                        mpnn_residual = True,
                        message_aggregator_type = 'sum',
                        mode = 'classification',
                        number_atom_features = GraphConvConstants.ATOM_FDIM,
                        number_bond_features = GraphConvConstants.BOND_FDIM,
                        n_classes = 1,
                        readout_type = 'set2set',
                        num_step_set2set = 3,
                        num_layer_set2set = 2,
                        ffn_hidden_list= [392, 392],
                        ffn_embeddings = 256,
                        ffn_activation = 'relu',
                        ffn_dropout_p = 0.12,
                        ffn_dropout_at_input_no_act = False,
                        weight_decay = 1e-5,
                        self_loop = False,
                        optimizer_name = 'adam',
                        log_frequency = 32,
                        model_dir = f'./ensemble_models/experiments_{model_idx+1}',
                        device_name=device_name)


for i in tqdm(range(n_models)):
    model = _new_ensemble_member(i)

    # Only the fit itself is timed; evaluation below is excluded.
    start_time = datetime.now()

    # fit model from scratch (restore=False) keeping a single rolling checkpoint
    loss = model.fit(
          train_dataset,
          nb_epoch=nb_epoch,
          max_checkpoints_to_keep=1,
          deterministic=False,
          restore=False)
    end_time = datetime.now()

    train_scores = model.evaluate(train_dataset, [metric])['roc_auc_score']
    test_scores = model.evaluate(test_dataset, [metric])['roc_auc_score']
    print(f"loss = {loss}; train_scores = {train_scores}; test_scores = {test_scores}; time_taken = {str(end_time-start_time)}")

    # Extra explicit save: the ensemble-scoring cell restores `checkpoint2.pt`
    # from each model_dir, so this call must stay.
    # NOTE(review): presumably DeepChem writes the newest state to
    # `checkpoint1.pt` and shifts the checkpoint left by `fit` to
    # `checkpoint2.pt`, so both hold the final weights — confirm against
    # TorchModel.save_checkpoint.
    model.save_checkpoint()

    # Release this member before building the next one so allocations do not
    # accumulate across the ensemble.
    del model
    if device_name == 'cuda':
        torch.cuda.empty_cache()

 10%|█         | 1/10 [03:28<31:19, 208.81s/it]

loss = 1.5725961923599243; train_scores = 0.9557426673798433; test_scores = 0.9223144806860779; time_taken = 0:03:23.894442


 20%|██        | 2/10 [06:55<27:39, 207.47s/it]

loss = 1.6186336278915405; train_scores = 0.953585491753308; test_scores = 0.922631544997691; time_taken = 0:03:23.784795


 30%|███       | 3/10 [10:25<24:21, 208.73s/it]

loss = 1.6698408126831055; train_scores = 0.9534920954830443; test_scores = 0.9197296031399491; time_taken = 0:03:27.694327


 40%|████      | 4/10 [14:08<21:26, 214.39s/it]

loss = 1.7009578943252563; train_scores = 0.9519955043773276; test_scores = 0.9214299408602377; time_taken = 0:03:40.575379


 50%|█████     | 5/10 [17:37<17:41, 212.29s/it]

loss = 1.613566279411316; train_scores = 0.9548896388455931; test_scores = 0.9226817863385266; time_taken = 0:03:25.998128


 60%|██████    | 6/10 [21:06<14:04, 211.22s/it]

loss = 1.6102882623672485; train_scores = 0.9557371496403166; test_scores = 0.92470571327698; time_taken = 0:03:26.483416


 70%|███████   | 7/10 [24:34<10:30, 210.17s/it]

loss = 1.7500712871551514; train_scores = 0.9504602598858963; test_scores = 0.9198826298090355; time_taken = 0:03:25.481960


 80%|████████  | 8/10 [28:02<06:59, 209.57s/it]

loss = 1.5922919511795044; train_scores = 0.9564375769714096; test_scores = 0.923833304244804; time_taken = 0:03:25.575397


 90%|█████████ | 9/10 [31:29<03:28, 208.75s/it]

loss = 1.694993019104004; train_scores = 0.9518378526137781; test_scores = 0.9237147952865264; time_taken = 0:03:24.401467


100%|██████████| 10/10 [34:59<00:00, 209.96s/it]

loss = 1.69662344455719; train_scores = 0.95221293215269; test_scores = 0.9227818672399795; time_taken = 0:03:27.712148





Get test score from the ensemble

In [9]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU instead
# of failing on a hard-coded 'cuda' device.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'


def _load_ensemble_member(model_idx):
    """Rebuild one ensemble member and restore its saved weights.

    Parameters
    ----------
    model_idx: int
        0-based index of the ensemble member; weights are read from
        `./ensemble_models/experiments_{model_idx+1}/checkpoint2.pt`.

    Returns
    -------
    MPNNPOMModel
        A model constructed with the same hyperparameters as in the training
        cell (required so the checkpoint matches the architecture), with the
        checkpoint restored.
    """
    member = MPNNPOMModel(n_tasks = n_tasks,
                          batch_size=128,
                          learning_rate=learning_rate,
                          class_imbalance_ratio = train_ratios,
                          loss_aggr_type = 'sum',
                          node_out_feats = 100,
                          edge_hidden_feats = 75,
                          edge_out_feats = 100,
                          num_step_message_passing = 5,
                          mpnn_residual = True,
                          message_aggregator_type = 'sum',
                          mode = 'classification',
                          number_atom_features = GraphConvConstants.ATOM_FDIM,
                          number_bond_features = GraphConvConstants.BOND_FDIM,
                          n_classes = 1,
                          readout_type = 'set2set',
                          num_step_set2set = 3,
                          num_layer_set2set = 2,
                          ffn_hidden_list= [392, 392],
                          ffn_embeddings = 256,
                          ffn_activation = 'relu',
                          ffn_dropout_p = 0.12,
                          ffn_dropout_at_input_no_act = False,
                          weight_decay = 1e-5,
                          self_loop = False,
                          optimizer_name = 'adam',
                          log_frequency = 32,
                          model_dir = f'./ensemble_models/experiments_{model_idx+1}',
                          device_name=device_name)
    member.restore(f"./ensemble_models/experiments_{model_idx+1}/checkpoint2.pt")
    return member


# Collect one prediction array per ensemble member on the test split.
list_preds = []
for i in range(n_models):
    model = _load_ensemble_member(i)
    # test_scores = model.evaluate(test_dataset, [metric])['roc_auc_score']
    # print("test_score: ", test_scores)
    preds = model.predict(test_dataset)
    list_preds.append(preds)

    # Release this member before loading the next one (mirrors the training
    # cell) so allocations do not accumulate across the ensemble.
    del model
    if device_name == 'cuda':
        torch.cuda.empty_cache()

# Stack the member predictions on a new leading axis and average over it, so
# the ensemble prediction has the same shape as a single model's output.
preds_arr = np.asarray(list_preds)
ensemble_preds = np.mean(preds_arr, axis=0)
print("average ensemble score: ", roc_auc_score(test_dataset.y, ensemble_preds, average="macro"))

average ensemble score:  0.9332425000551015
