In [1]:
import os
import argparse
import pandas as pd
import numpy as np
import torch.multiprocessing as mp
import gc

import wandb
# Keep wandb's own console output out of the notebook.
os.environ["WANDB_SILENT"] = "True"
# Credentials must never be committed with the notebook: the API key is taken
# from the WANDB_API_KEY environment variable. When the variable is unset,
# ``key=None`` leaves credential lookup to wandb itself.
# NOTE(review): the key that was previously hard-coded on this line has been
# exposed in the notebook history and should be revoked / rotated.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

import hjson  # Use HJSON instead of json
import torch
import torch.multiprocessing as mp
import deepchem as dc

import sys
sys.path.insert(1, '/scratch/work/masooda1/active_learning')
from pytorch_lightning import seed_everything
from utils.data_utils import (read_train_test_files, scafoldsplit_train_test, convert_to_dataframe, 
                               drop_unwanted_tasks, get_initial_set_with_main_and_aux_samples)

from utils.data_utils import convert_dataframe_to_dataloader
from utils.model_utils import get_pred_with_uncertainities
from utils.utils import wandb_init_model, compute_binary_classification_metrics_MT, active_learning_loop

Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'torch_geometric'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. cannot import name 'DMPNN' from 'deepchem.models.torch_models' (/scratch/work/masooda1/.conda_envs/env_arslan/lib/python3.9/site-packages/deepchem/models/torch_models/__init__.py)
Skipped loading some Jax models, missing a dependency. No module named 'jax'


In [None]:
def compute_pos_neg(data):
    """Count the positive and negative labels stored in ``data.y``.

    Args:
        data: Any object exposing a ``y`` attribute that supports element-wise
            comparison and ``.sum()`` (for example a NumPy array of labels).

    Returns:
        A tuple ``(pos, negative, ratio)``: the number of entries equal to 1,
        the number of entries equal to 0, and ``pos / negative``.
    """
    labels = data.y
    n_pos = (labels == 1).sum()
    n_neg = (labels == 0).sum()
    return n_pos, n_neg, n_pos / n_neg

In [None]:
# Dataset identifiers, processed in this order. Each name is interpolated into
# the per-dataset file paths and project name of the config built below.
DATASET_ARRAY = [
    "hia_hou",
    "pgp_broccatelli",
    "bioavailability_ma",
    "cyp2c19_veith",
    "cyp2d6_veith",
    "cyp3a4_veith",
    "cyp1a2_veith",
    "cyp2c9_veith",
    "cyp2c9_substrate_carbonmangels",
    "cyp2d6_substrate_carbonmangels",
    "cyp3a4_substrate_carbonmangels",
]

#DATASET_ARRAY= [
#    "hia_hou"]

# For every dataset: build its configuration, split the data into
# initial / pool / validation / test sets and print the positive / negative
# label counts of each split (plain text plus one `&`-separated table row).
# NOTE(review): `config`, `initial_set`, `pool_set`, `val_set` and `test_set`
# are rebound on every pass, so the cells further down only see the values of
# the LAST dataset in DATASET_ARRAY.
for dataset in DATASET_ARRAY:
    config = {
        "project_name": f"BALD_{dataset}",

        "_comment_input_files": "Configuration for input data files",
        "target_file": f"/scratch/work/masooda1/datasets/datasets_for_active_learning/filtered_data/TDC_ADME/{dataset}_filtered.csv",
        "BERT_features_file": f"/scratch/work/masooda1/datasets/datasets_for_active_learning/MolBERT_features/MolBERT_{dataset}.csv",
        "ECFP_features_file": f"/scratch/work/masooda1/datasets/datasets_for_active_learning/MF/MF_r2_1024_{dataset}.csv",
        "pos_weights": f"/scratch/work/masooda1/datasets/datasets_for_active_learning/filtered_data/TDC_ADME/{dataset}_pos_ratio.csv",
        "class_weights": "/path/to/class_weights.json",

        "_comment_output_dir": "Configuration for output directories",
        "metadata_dir": f"/scratch/cs/pml/AI_drug/trained_model_pred/active_learning/ADME/{dataset}/",
        "wandb_dir": f"/scratch/cs/pml/AI_drug/trained_model_pred/active_learning/ADME/{dataset}/wandb/",
        "wandb_offline": False,
        "wandb_mode": "online",

        "_comment_data_split": "Configuration for data splitting",
        "train_test_split_exists": False,
        "Compound_col": "Drug",
        "train_frac": 0.8,
        "use_all_tasks_to_split": True,
        # Previously written as "RandomSplitter" here and overridden right
        # after the dict; the effective value is now stated once.
        "splitter": "RandomStratifiedSplitter",

        "_comment_tasks": "Configuration for task selection",
        "all_tasks": "Y",
        "main_task": "Y",
        "selected_tasks": "Y",
        "main_task_index": 0,
        "aux_task": None,
        "aux_task_index": None,

        "_comment_features": "Configuration for input features",
        # NOTE(review): features_type/fp_size describe 1024-bit fingerprints
        # while input_dim is 768 — confirm which feature file the helpers load.
        "features_type": "FP",
        "fp_size": 1024,

        "_comment_bnn_architecture": "Configuration for BNN architecture",
        "input_dim": 768,
        "hidden_dim": 128,
        "depth": 1,
        "dropout_p": 0.5,
        "BatchNorm1d": True,
        "use_skip_connection": True,
        "l2_lambda": 1e-4,
        "optm_l2_lambda": 1e-4,

        "_comment_optimization": "Configuration for optimization parameters",
        "optim": "Adam",
        "lr": 0.001,
        "lr_schedulers": "CosineAnnealingLR",

        "_comment_losses": "Configuration for weighted losses",
        "loss_type": "BCE",
        "missing": "nan",
        "alpha": 0.0,
        "beta": 0.0,
        "gamma": 0.0,

        "_comment_training": "Configuration for training parameters",
        "epochs": 1000,
        "min_epochs": 50,
        "num_workers": 1,
        "compute_metric_after_n_epochs": 1,
        "batch_size": 32,
        "pretrained_model": False,
        "return_trainer": True,
        "_comment_gpu": "Use [0] for GPU 0, None for CPU",
        "gpu": None,
        "_comment_accelerator": "Use gpu or cpu",
        "accelerator": "cpu",

        "_comment_early_stopping": "Configuration for early stopping",
        "check_val_every_n_epoch": 1,
        "EarlyStopping": True,
        "metric_to_monitor": "val_BCE_non_weighted",
        "metric_direction": "min",
        "_comment_patience": "Will stop after compute_metric_after_n_epochs * patience",
        "patience": 20,

        "_comment_active_learning": "Configuration for active learning parameters",
        "main_task_initial_set_samples": 50,
        "num_forward_passes": 20,
        "num_iterations": 1000,
        "sampling_strategy": "BALD",
        "n_query": 1,
        "dataset": dataset,
        # Previously written as 0 here and overridden right after the dict.
        "seed": 88,
    }

    # Output folders derived from <metadata_dir>/<sampling_strategy>/<main_task>.
    t_names = os.path.join(config["metadata_dir"], config["sampling_strategy"], config["main_task"])
    config["query_set_dir"] = os.path.join(t_names, "query_set")
    config["result_dir"] = os.path.join(t_names, "Results")
    config["model_weights_dir"] = os.path.join(t_names, "model_weights")

    # `selected_tasks` may be one task name (str) or a collection of names.
    # len() of a bare string counts characters, which is only right by
    # accident for a one-character name such as "Y".
    task_selection = config["selected_tasks"]
    if isinstance(task_selection, str):
        config["num_of_tasks"] = 1
    else:
        config["num_of_tasks"] = len(task_selection)
    config["device"] = "cpu"

    # Train/test split through the project helper.
    train_set, test_set = scafoldsplit_train_test(config, all_tasks = True)

    # Label statistics of the full train and test sets, taken before the
    # training data is carved up any further.
    train_pos, train_neg, train_ratio = compute_pos_neg(train_set)
    test_pos, test_neg, test_ratio = compute_pos_neg(test_set)

    total_pos = train_pos + test_pos
    total_neg = train_neg + test_neg
    total_ratio = total_pos / total_neg

    # `train_set` is rebound to the second value returned by the helper
    # (presumably the training data left after removing the initial set —
    # TODO confirm against utils.data_utils).
    initial_set, train_set = get_initial_set_with_main_and_aux_samples(train_set, config)

    # 80/20 stratified split of the remaining training data into pool / validation.
    random_stratified_splitter = dc.splits.RandomStratifiedSplitter()
    pool_set, val_set = random_stratified_splitter.train_test_split(train_set, frac_train=0.80, seed=config["seed"])

    # `test_set` has not changed since its statistics were computed above, so
    # they are not recomputed here.
    initial_pos, initial_neg, initial_ratio = compute_pos_neg(initial_set)
    pool_pos, pool_neg, pool_ratio = compute_pos_neg(pool_set)
    val_pos, val_neg, val_ratio = compute_pos_neg(val_set)

    # Print in the requested format
    print(f"total_pos, total_neg, pos_to_neg_ratio\n{total_pos}, {total_neg}, {total_ratio:.4f}")
    print(f"initial_pos, initial_neg, pos_to_neg_ratio\n{initial_pos}, {initial_neg}, {initial_ratio:.4f}")
    print(f"pool_pos, pool_neg, pos_to_neg_ratio\n{pool_pos}, {pool_neg}, {pool_ratio:.4f}")
    print(f"val_pos, val_neg, pos_to_neg_ratio\n{val_pos}, {val_neg}, {val_ratio:.4f}")
    print(f"test_pos, test_neg, pos_to_neg_ratio\n{test_pos}, {test_neg}, {test_ratio:.4f}")

    # One `&`-separated row per dataset.
    print(f"#################{dataset}#############################")
    print(f"{dataset} & {total_pos} & {total_neg} & {initial_pos} & {initial_neg} & {pool_pos} & {pool_neg} & {val_pos} & {val_neg} & {test_pos} & {test_neg}")
    print("##############################################")

In [50]:
# NOTE(review): ad-hoc arithmetic with hand-typed operands (not derived from any
# variable in this notebook) — presumably a manual sample-count check; confirm or delete.
25+298+75+99

497

train_test_features (965, 768) (241, 768)
train_test_targets (965, 1) (241, 1)
##############################################
total_pos, total_neg, pos_to_neg_ratio 491, 424, 1.16
initial_pos, initial_neg, pos_to_neg_ratio 25, 25, 1.00
pool_pos, pool_neg, pos_to_neg_ratio 393, 339, 1.16
val_pos, val_neg, pos_to_neg_ratio 98, 85, 1.15
test_pos, test_neg, pos_to_neg_ratio 129, 112, 1.15
pgp_broccatelli & 491 & 424 & 25 & 25 & 393 & 339 & 98 & 85 & 129 & 112


(373, 30, 12.433333333333334)

In [6]:
# Replace every split by the result of `convert_to_dataframe`; the deepchem
# dataset objects are not kept. `selected_tasks` is wrapped in a list when the
# config holds a single task name.
selected_tasks = config["selected_tasks"]
if not isinstance(selected_tasks, list):
    selected_tasks = [selected_tasks]

initial_set, val_set, pool_set, test_set = (
    convert_to_dataframe(split, selected_tasks)
    for split in (initial_set, val_set, pool_set, test_set)
)

In [7]:
# Active-learning loop. Each pass trains a fresh model on the current labelled
# set, evaluates it on the test set, writes the metrics and the queried samples
# to disk, and continues with the updated labelled / pool sets returned by
# `active_learning_loop`.
from utils.models import Vanilla_MLP_classifier  # hoisted: was re-imported on every pass

# `selected_tasks` may be one task name or a collection of names. list("...")
# on a string splits it into characters, which only works by accident for "Y".
task_names = config["selected_tasks"]
task_names = [task_names] if isinstance(task_names, str) else list(task_names)

# Output locations do not change between passes, so they are prepared once.
# NOTE(review): config["query_set_dir"] and config["result_dir"] are computed
# when the config is built but are not used here; the paths are kept exactly as
# before — confirm which location is intended.
query_set_dir = config["metadata_dir"] + "query_set/"
result_dir = config["metadata_dir"]
os.makedirs(query_set_dir, exist_ok = True)
os.makedirs(result_dir, exist_ok = True)

for itteration in range(config["num_iterations"]):

    # Nothing is left to query once the pool is exhausted; without this guard
    # the loop would keep going for `num_iterations` passes regardless.
    if len(pool_set) == 0:
        print(f"Pool set is empty, stopping before iteration {itteration}")
        break

    seed_everything(seed = config["seed"])
    config["itteration"] = itteration
    config["model_name"] = rf'itteration_{config["itteration"]}_s{config["seed"]}_alpha_{config["alpha"]}_gamma_{config["gamma"]}_loss_type_{config["loss_type"]}_λ{config["optm_l2_lambda"]}'
    # Doubled braces stay literal so the checkpoint callback can fill them in.
    config["checkpoint_name"] = f'model-iter{itteration}-{config["main_task"]}-{{epoch:02d}}-{{val_BCE_non_weighted:.4f}}'

    # get dataloaders (only the training data is shuffled)
    train_dataloader = convert_dataframe_to_dataloader(dataframe= initial_set, config = config, shuffle= True)
    val_dataloader = convert_dataframe_to_dataloader(dataframe= val_set, config = config, shuffle= False)
    test_dataloader = convert_dataframe_to_dataloader(dataframe= test_set, config = config, shuffle= False)
    pool_dataloader = convert_dataframe_to_dataloader(dataframe= pool_set, config = config, shuffle= False)

    # Train model
    config["training_steps"] = len(train_dataloader)
    trained_model, run, trainer = wandb_init_model(model = Vanilla_MLP_classifier,
                                                   train_dataloader = train_dataloader,
                                                   val_dataloader = val_dataloader,
                                                   config = config,
                                                   model_type = 'MLP')

    wandb.finish()

    ###### Model Evaluation #############
    trained_model = trained_model.eval()
    targets, pred_mean, pred_var, all_pred = get_pred_with_uncertainities(
        test_dataloader, trained_model,
        n_classes=config["num_of_tasks"],
        cal_uncert=False,
        num_forward_passes=1
    )
    metrics = compute_binary_classification_metrics_MT(targets, pred_mean, missing='nan')

    # DataFrame.append was removed in pandas 2.0; pd.concat with a one-row
    # frame appends the same "mean" row.
    metrics = pd.concat([metrics, metrics.mean().to_frame().T], ignore_index=True)
    metrics.insert(0, 'Tasks', task_names + ['mean'])
    metrics.to_csv(result_dir + f'metrics_{config["model_name"]}.csv', index=False)
    # 'Tasks' is a string column, so restrict the summary to numeric columns.
    print(metrics.mean(numeric_only=True))

    # NOTE(review): the saved output of this cell ends with
    # "IndexError: index 342 is out of bounds for axis 1 with size 1" and no
    # traceback, so its origin cannot be determined from this cell — check the
    # indexing inside the helper below against the (n, 1) prediction arrays.
    query_set, updated_training_set, updated_poolset = active_learning_loop(trained_model,
                                                                            pool_dataloader,
                                                                            initial_set,
                                                                            pool_set,
                                                                            config)
    query_set.to_csv(query_set_dir + f'query_set_{config["model_name"]}.csv', index=False)

    # Drop every reference to this pass's model and loaders before collecting;
    # `trainer` and `run` would otherwise keep them alive.
    del train_dataloader, val_dataloader, test_dataloader, pool_dataloader
    del trained_model, trainer, run

    initial_set = updated_training_set.copy()
    pool_set = updated_poolset.copy()

    # Collect garbage first so that freed tensors can be returned by empty_cache().
    gc.collect()
    cuda_available = torch.cuda.is_available()
    if cuda_available:
        torch.cuda.empty_cache()

    print('++++++++++++++++++++++++++++++++++++++++++++++++++++')
    # Allocated GPU memory in GiB; reported as 0.0 on a CPU-only run.
    gpu_memory_status = torch.cuda.memory_allocated() / (1024 ** 3) if cuda_available else 0.0
    print("GPU Memory Status (after clearing):", gpu_memory_status)
    print('++++++++++++++++++++++++++++++++++++++++++++++++++++')

[rank: 0] Global seed set to 42


After merging (50, 768)
After merging (60, 768)
After merging (113, 768)
After merging (343, 768)


  rank_zero_warn(
  rank_zero_warn(
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(


Loading best model from /scratch/cs/pml/AI_drug/trained_model_pred/active_learning/ADME/hia_hou/BALD/Y/model_weights/model-iter0-Y-epoch=38-val_BCE_non_weighted=8.1021-v3.ckpt
Best validation score: 8.10205364227295


balanced_acc         0.878427
f1_score             0.950980
specificity          0.928571
sensitivity          0.828283
roc_auc              0.912698
AUPR                 0.986843
average_precision    0.986916
ECE                  0.171768
ACE                  0.171675
dtype: float64


  metrics = metrics.append(metrics.mean(), ignore_index=True)
  print(metrics.mean())


0.06900143801342873


IndexError: index 342 is out of bounds for axis 1 with size 1

In [None]:
# NOTE(review): stray copy of a statement that already runs inside the
# training-loop cell. If `metrics` already has a 'Tasks' column, pandas'
# DataFrame.insert rejects the duplicate (ValueError unless allow_duplicates=True)
# — presumably a debugging leftover; confirm or delete.
metrics.insert(0, 'Tasks', list(config["selected_tasks"]) + ['mean'])

In [24]:
# Show `targets` as left behind by the most recent evaluation in the training-loop cell.
targets

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [9]:
def compute_ece(y_true, y_prob, n_bins=10, equal_intervals = True):
    """Binned calibration error between labels and predicted probabilities.

    Samples are assigned to ``n_bins`` bins according to ``y_prob``. The result
    is the sample-weighted mean, over the non-empty bins, of
    ``|mean(y_true in bin) - mean(y_prob in bin)|``.

    Args:
        y_true: Array-like of labels. Any shape is accepted; the values are
            flattened, so ``(n,)`` and ``(n, 1)`` inputs can be mixed.
        y_prob: Array-like of predicted probabilities with the same number of
            elements as ``y_true``; flattened in the same way.
        n_bins: Number of bins, at least 1.
        equal_intervals: If true, use equal-width bins on [0, 1] (ECE).
            Otherwise place the bin edges at evenly spaced percentiles of
            ``y_prob`` (ACE).

    Returns:
        float: The calibration error; ``0.0`` when the input is empty.

    Raises:
        ValueError: If ``n_bins`` is smaller than 1 or the two inputs do not
            contain the same number of elements.
    """
    if n_bins < 1:
        raise ValueError(f"n_bins must be >= 1, got {n_bins}")

    # Flatten both inputs: a boolean mask built from an (n, 1) array cannot
    # index an (n,) array, and plain lists do not support mask indexing at all.
    y_true = np.asarray(y_true, dtype=float).reshape(-1)
    y_prob = np.asarray(y_prob, dtype=float).reshape(-1)
    if y_true.size != y_prob.size:
        raise ValueError(
            f"y_true and y_prob must have the same number of elements, "
            f"got {y_true.size} and {y_prob.size}"
        )

    total_samples = y_true.size
    if total_samples == 0:
        # No samples, no bins to evaluate (also avoids percentiles of an empty array).
        return 0.0

    # Calculate bin boundaries
    if equal_intervals:         # ECE
        bin_boundaries = np.linspace(0, 1, n_bins + 1)
    else:                       # ACE
        bin_boundaries = np.percentile(y_prob, np.linspace(0, 100, n_bins + 1))

    # Only the interior edges are passed to digitize, so the indices fall in
    # [0, n_bins - 1] and values at or beyond the outer edges land in the
    # first / last bin.
    bin_indices = np.digitize(y_prob, bin_boundaries[1:-1])

    ece = 0.0
    for bin_idx in range(n_bins):
        bin_mask = bin_indices == bin_idx
        bin_samples = np.sum(bin_mask)

        if bin_samples > 0:
            # Observed positive rate vs. mean predicted probability in the bin.
            bin_accuracy = np.mean(y_true[bin_mask])
            bin_confidence = np.mean(y_prob[bin_mask])

            # Weight the gap by the share of samples that fell into the bin.
            ece += (bin_samples / total_samples) * np.abs(bin_accuracy - bin_confidence)

    return float(ece)

In [14]:
# Calibration error with equal-width bins, computed on the flattened
# targets and mean predictions.
ECE = compute_ece(
    y_true=targets.ravel(),
    y_prob=pred_mean.ravel(),
    n_bins=10,
    equal_intervals=True,
)
ECE

0.17176794644502524

In [12]:
# Compare the shapes of the labels and the mean predictions.
targets.shape, pred_mean.shape

((113,), (113, 1))