# Compare the performance of the models

In [2]:
import yaml
import fsspec

import pandas as pd
import numpy as np
import datamol as dm

from typing import Optional
from datetime import datetime
from loguru import logger
from sklearn.model_selection import train_test_split
from tqdm import tqdm

from mood.constants import RESULTS_DIR
from mood.dataset import load_data_from_tdc, MOOD_REGR_DATASETS
from mood.metrics import Metric, compute_bootstrapped_metric
from mood.representations import featurize
from mood.baselines import predict_baseline_uncertainty
from mood.train import train_baseline_model
from mood.experiment import basic_tuning_loop
from mood.utils import bin_with_overlap, load_distances_for_downstream_application
from mood.distance import compute_knn_distance
from mood.preprocessing import DEFAULT_PREPROCESSING

from mood.representations import compute_jointformer

%load_ext autoreload
%autoreload 2

  from .autonotebook import tqdm as notebook_tqdm
Skipped loading some Tensorflow models, missing a dependency. No module named 'tensorflow'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'torch_geometric'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. cannot import name 'DMPNN' from 'deepchem.models.torch_models' (/home/adamizdebski/miniconda3/envs/mood-experiments/lib/python3.10/site-packages/deepchem/models/torch_models/__init__.py)
Skipped loading modules with pytorch-lightning dependency, missing a dependency. No module named 'lightning'
Skipped loading some Jax models, missing a dependency. No module named 'jax'
Skipped loading some PyTorch models, missing a dependency. No module named 'tensorflow'


In [3]:
# Run configuration: every value below is read by the cells that follow.
# Model family; passed as `name` to `basic_tuning_loop` and on to `train_baseline_model`
baseline_algorithm: str = "RF"
# Representation used for featurization and to select the downstream-application distances
representation: str = "Jointformer"
# Dataset name handed to `load_data_from_tdc` and to the default-metric lookups
dataset: str = "Lipophilicity"
# Number of random splits; each seed in range(n_seeds) is also used as the split's random_state
n_seeds: int = 1
# Hyper-parameter search budget, forwarded unchanged to `basic_tuning_loop`
n_trials: int = 2
n_startup_trials: int = 1
# Results are written under <base_save_dir>/dataframes/compare_performance/<sub_save_dir>
base_save_dir: str = RESULTS_DIR
# Sub-folder for this run; None means "use today's date formatted as YYYYMMDD"
sub_save_dir: Optional[str] = None
# When False, saving the IID/OOD gap CSV raises a RuntimeError if the file already exists
overwrite: bool = False

In [4]:
# Resolve the output directory for this run and make sure it exists.
# Without an explicit sub-folder, results are grouped by the current date (YYYYMMDD),
# so hyper-parameter files cached on a different day are not found by later cells.
if sub_save_dir is None:
    sub_save_dir = datetime.now().strftime("%Y%m%d")
out_dir = dm.fs.join(base_save_dir, "dataframes", "compare_performance", sub_save_dir)
# exist_ok=True keeps re-runs from failing when the directory is already there
dm.fs.mkdir(out_dir, exist_ok=True)

In [5]:

# Load the dataset: `smiles` is later passed to `featurize`, `y` holds the matching targets.
# NOTE(review): presumably fetched from TDC, going by the helper's name; confirm in mood.dataset.
smiles, y = load_data_from_tdc(dataset)



Found local copy...
Loading...
Done!
Failed to find the pandas get_adjustment() function to patch
Failed to patch pandas - PandasTools will have limited functionality


In [6]:

# The task type is decided purely by membership in the list of regression datasets
is_regression = dataset in MOOD_REGR_DATASETS

# Featurize the molecules with the preprocessing registered for this representation.
# NOTE(review): `mask` presumably marks the molecules that were featurized successfully;
# confirm against mood.representations.featurize.
preprocessing_fn = DEFAULT_PREPROCESSING[representation]
X, mask = featurize(
    smiles,
    representation,
    standardize_fn=preprocessing_fn,
    disable_logs=True,
)

# Apply the same selection to the targets so they stay aligned with X
y = y[mask]


Preprocess Jointformer:   0%|          | 0/4200 [00:00<?, ?it/s]Failed to find the pandas get_adjustment() function to patch
Failed to patch pandas - PandasTools will have limited functionality
Failed to find the pandas get_adjustment() function to patch
Failed to patch pandas - PandasTools will have limited functionality
Failed to find the pandas get_adjustment() function to patch
Failed to patch pandas - PandasTools will have limited functionality
Preprocess Jointformer:   0%|          | 1/4200 [00:01<1:53:24,  1.62s/it]Failed to find the pandas get_adjustment() function to patch
Failed to patch pandas - PandasTools will have limited functionality
Failed to find the pandas get_adjustment() function to patch
Failed to patch pandas - PandasTools will have limited functionality
Failed to find the pandas get_adjustment() function to patch
Failed to patch pandas - PandasTools will have limited functionality
Failed to find the pandas get_adjustment() function to patch
Failed to patch panda

In [7]:

# Get the default metrics for this dataset.
# Both objects are callables with a `.name` attribute: the performance metric is called as
# perf_metric(y_true, y_pred), the calibration metric as cali_metric(y_true, y_pred, y_uncertainty).
perf_metric = Metric.get_default_performance_metric(dataset)
cali_metric = Metric.get_default_calibration_metric(dataset)



In [8]:

# Generate all data needed for these plots.
# Every seed contributes one chunk to each accumulator below; the chunks are
# flattened into single arrays once all seeds have been processed.
dist_train, dist_test = [], []
y_pred, y_true, y_uncertainty = [], [], []

for seed in range(n_seeds):
    # Randomly split the dataset (train / validation / test).
    # This ensures that the distribution of distances from val to train is relatively uniform
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=seed)

    file_name = f"best_hparams_{dataset}_{baseline_algorithm}_{representation}_{seed}.yaml"
    out_path = dm.fs.join(out_dir, file_name)

    if not dm.fs.exists(out_path):
        # No cached result for this seed: run a hyper-parameter search on the validation split
        study = basic_tuning_loop(
            X_train=X_train,
            X_test=X_val,
            y_train=y_train,
            y_test=y_val,
            name=baseline_algorithm,
            is_regression=is_regression,
            metric=perf_metric,
            global_seed=seed,
            n_trials=n_trials,
            n_startup_trials=n_startup_trials,
        )

        # Store the random state next to the best hyper-params so one file is enough to retrain
        params = study.best_params
        params["random_state"] = seed + study.best_trial.number

        logger.info(f"Saving the best hyper-params to {out_path}")
        with fsspec.open(out_path, "w") as fd:
            yaml.dump(params, fd)

        # Keep the full trial history alongside the best hyper-params
        file_name = f"trials_{dataset}_{baseline_algorithm}_{representation}_{seed}.csv"
        out_path = dm.fs.join(out_dir, file_name)

        logger.info(f"Saving the trials dataframe to {out_path}")
        study.trials_dataframe().to_csv(out_path)

    else:
        # Reuse the outcome of a previously completed hyper-param study
        logger.info(f"Loading the best hyper-params from {out_path}")
        with fsspec.open(out_path) as fd:
            params = yaml.safe_load(fd)

    # The random state is passed separately from the hyper-params, so take it out of the dict
    random_state = params.pop("random_state")
    model = train_baseline_model(
        X_train,
        y_train,
        baseline_algorithm,
        is_regression,
        params,
        random_state,
        for_uncertainty_estimation=True,
        ensemble_size=10,
    )

    # Test-set predictions and uncertainties, stored together with the ground truth
    y_pred.append(model.predict(X_test))
    y_uncertainty.append(predict_baseline_uncertainty(model, X_test))
    y_true.append(y_test)

    # kNN distances to the training set, for the train set itself and for the test set
    seed_dist_train, seed_dist_test = compute_knn_distance(X_train, [X_train, X_test])
    dist_train.append(seed_dist_train)
    dist_test.append(seed_dist_test)

# Flatten the per-seed chunks
dist_test = np.concatenate(dist_test)
dist_train = np.concatenate(dist_train)
y_pred = np.concatenate(y_pred)
y_true = np.concatenate(y_true)
y_uncertainty = np.concatenate(y_uncertainty)



[32m2024-08-16 18:37:35.781[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m19[0m - [1mLoading the best hyper-params from /home/adamizdebski/files/mood-experiments/results/dataframes/compare_performance/20240816/best_hparams_Lipophilicity_RF_Jointformer_0.yaml[0m


In [9]:
# Collect the distances of the downstream applications.
# Virtual screening is loaded first, then optimization; update_cache=True is passed for both.
dist_scr, dist_opt = (
    load_distances_for_downstream_application(application, representation, dataset, update_cache=True)
    for application in ("virtual_screening", "optimization")
)

# Pool both applications into one reference distribution (optimization first)
dist_app = np.concatenate((dist_opt, dist_scr))


[32m2024-08-16 18:37:50.064[0m | [34m[1mDEBUG   [0m | [36mmood.utils[0m:[36mload_distances_for_downstream_application[0m:[36m57[0m - [34m[1mDownloading /home/adamizdebski/files/mood-experiments/downstream_applications/distances/virtual_screening/Lipophilicity/Jointformer.parquet to /home/adamizdebski/.cache/MOOD/downstream_applications/distances/virtual_screening/Lipophilicity/Jointformer.parquet[0m
[32m2024-08-16 18:37:50.449[0m | [34m[1mDEBUG   [0m | [36mmood.utils[0m:[36mload_distances_for_downstream_application[0m:[36m57[0m - [34m[1mDownloading /home/adamizdebski/files/mood-experiments/downstream_applications/distances/optimization/Lipophilicity/Jointformer.parquet to /home/adamizdebski/.cache/MOOD/downstream_applications/distances/optimization/Lipophilicity/Jointformer.parquet[0m


In [10]:

# Compute the difference in IID and OOD performance and calibration
def _scores_within_central_range(reference_distances):
    """Score the test samples whose distance lies in the central 95% of `reference_distances`.

    Returns a `(performance, calibration)` tuple computed with `perf_metric` and
    `cali_metric` on the selected subset of the test predictions.
    """
    lower = np.quantile(reference_distances, 0.025)
    upper = np.quantile(reference_distances, 0.975)
    in_range = (dist_test >= lower) & (dist_test <= upper)
    performance = perf_metric(y_true[in_range], y_pred[in_range])
    calibration = cali_metric(y_true[in_range], y_pred[in_range], y_uncertainty[in_range])
    return performance, calibration


# IID: test samples that are as far from the training set as the training samples themselves
score_iid, calibration_iid = _scores_within_central_range(dist_train)
logger.info(f"Found an IID {perf_metric.name} score of {score_iid:.3f}")
logger.info(f"Found an IID {cali_metric.name} calibration score of {calibration_iid:.3f}")

# OOD: test samples at the distances observed in the downstream applications
score_ood, calibration_ood = _scores_within_central_range(dist_app)
logger.info(f"Found an OOD {perf_metric.name} score of {score_ood:.3f}")
logger.info(f"Found an OOD {cali_metric.name} calibration score of {calibration_ood:.3f}")

# Refuse to clobber an earlier result unless overwriting was requested
file_name = f"gap_{dataset}_{baseline_algorithm}_{representation}.csv"
out_path = dm.fs.join(out_dir, file_name)
if dm.fs.exists(out_path) and not overwrite:
    raise RuntimeError(f"{out_path} already exists!")

# Saving this as a CSV might be a bit wasteful,
# but it's convenient
logger.info(f"Saving the IID/OOD gap data to {out_path}")

# One row per score type; the scalar columns are broadcast over both rows
gap_df = pd.DataFrame(
    {
        "dataset": dataset,
        "algorithm": baseline_algorithm,
        "representation": representation,
        "iid_score": [score_iid, calibration_iid],
        "ood_score": [score_ood, calibration_ood],
        "metric": [perf_metric.name, cali_metric.name],
        "type": ["performance", "calibration"],
    }
)
gap_df.to_csv(out_path, index=False)



[32m2024-08-16 18:37:50.703[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mFound an IID MAE score of 0.720[0m
[32m2024-08-16 18:37:50.704[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [1mFound an IID Pearson calibration score of 0.257[0m
[32m2024-08-16 18:37:50.708[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m13[0m - [1mFound an OOD MAE score of 0.775[0m
[32m2024-08-16 18:37:50.708[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m14[0m - [1mFound an OOD Pearson calibration score of 0.218[0m


RuntimeError: /home/adamizdebski/files/mood-experiments/results/dataframes/compare_performance/20240816/gap_Lipophilicity_RF_Jointformer.csv already exists!

In [None]:

# Compute the performance over distance.
# Every distance bin that passes the size check contributes one two-row frame
# (performance + calibration). The frames are collected in a list and concatenated
# once after the loop instead of growing a DataFrame inside it.
frames = []

for distance, mask in tqdm(bin_with_overlap(dist_test)):

    target = y_true[mask]
    preds = y_pred[mask]
    uncertainty = y_uncertainty[mask]

    # Count the samples that actually fall in this bin. `len(mask)` measures the mask
    # itself, which for a boolean mask is the full test-set size whatever the bin holds.
    n_samples = len(target)
    # Skip bins with fewer than 25 samples or with a single unique target value
    if n_samples < 25 or len(np.unique(target)) == 1:
        continue

    # NOTE(review): the original author reported that this call crashed the kernel
    # (with pandas as the suspect) and left a debugging `break`/`continue` in the loop.
    # If the crash reproduces, retry with n_jobs=1 to rule out the parallel execution.
    perf_mu, perf_std = compute_bootstrapped_metric(
        targets=target, predictions=preds, metric=perf_metric, n_jobs=4
    )

    cali_mu, cali_std = compute_bootstrapped_metric(
        targets=target, predictions=preds, uncertainties=uncertainty, metric=cali_metric, n_jobs=4
    )

    frames.append(
        pd.DataFrame(
            {
                "dataset": dataset,
                "algorithm": baseline_algorithm,
                "representation": representation,
                "distance": distance,
                "score_mu": [perf_mu, cali_mu],
                "score_std": [perf_std, cali_std],
                "type": ["performance", "calibration"],
                "metric": [perf_metric.name, cali_metric.name],
                "n_samples": n_samples,
            }
        )
    )

# pd.concat raises on an empty list, so fall back to an empty frame when no bin qualified
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# Refuse to clobber an earlier result unless overwriting was requested
file_name = f"perf_over_distance_{dataset}_{baseline_algorithm}_{representation}.csv"
out_path = dm.fs.join(out_dir, file_name)
if dm.fs.exists(out_path) and not overwrite:
    raise RuntimeError(f"{out_path} already exists!")

logger.info(f"Saving the performance over distance data to {out_path}")
df.to_csv(out_path, index=False)

0it [00:00, ?it/s]

: 