# Using biomed.sm.mv-te-84m Models Via SmallMoleculeMultiViewModel API

In [1]:
# Necessary Imports
from bmfm_sm.api.smmv_api import SmallMoleculeMultiViewModel, PredictionIterator
from bmfm_sm.core.data_modules.namespace import LateFusionStrategy
from bmfm_sm.api.dataset_registry import DatasetRegistry

from dataclasses import asdict
from itertools import islice
import pandas as pd
import os

2025-06-03 14:25:29,653 - rdkit - INFO - ccc-login3.pok.ibm.com:140229646791360:0:0 - Enabling RDKit 2024.03.6 jupyter extensions


## Explore the data

The following datasets are available for evaluation, finetuning, and inference.

In [2]:
# Tabulate every dataset registered under the MoleculeNet collection.
moleculenet_collection = DatasetRegistry.get_instance().get_collection('MoleculeNet')
pd.DataFrame(moleculenet_collection)

Unnamed: 0,dataset_name,num_tasks,task_type,description,preferred_metric,path,example,collection,num_classes
0,BACE,1,TaskType.CLASSIFICATION,MoleculeNet: Inhibition of human beta secretase 1,Metrics.ROCAUC,datasets/raw_data/MoleculeNet/bace.csv,"CC(C)CC1=CC=C(C=C1)C(C)C(=O)O,0",DatasetCollection.MOLECULENET,2.0
1,BBBP,1,TaskType.CLASSIFICATION,MoleculeNet: Blood brain barrier penetration,Metrics.ROCAUC,datasets/raw_data/MoleculeNet/bbbp.csv,"CC(C)CC1=CC=C(C=C1)C(C)C(=O)O,0",DatasetCollection.MOLECULENET,2.0
2,CLINTOX,2,TaskType.CLASSIFICATION,MoleculeNet: Toxicity data of FDA-approved dru...,Metrics.ROCAUC,datasets/raw_data/MoleculeNet/clintox.csv,"[N+](=O)([O-])[O-],1 0",DatasetCollection.MOLECULENET,2.0
3,ESOL,1,TaskType.REGRESSION,MoleculeNet: Water solubility data for organics,Metrics.RMSE,datasets/raw_data/MoleculeNet/esol.csv,OCC3OC(OCC2OC(OC(C#N)c1ccccc1)C(O)C(O)C2O)C(O)...,DatasetCollection.MOLECULENET,
4,FREESOLV,1,TaskType.REGRESSION,MoleculeNet: Hydration free energy,Metrics.RMSE,datasets/raw_data/MoleculeNet/freesolv.csv,"CN(C)C(=O)c1ccc(cc1)OC,-11.01",DatasetCollection.MOLECULENET,
5,HIV,1,TaskType.CLASSIFICATION,MoleculeNet: Inhibition of HIV viral replication,Metrics.ROCAUC,datasets/raw_data/MoleculeNet/hiv.csv,"CC(C)CC1=CC=C(C=C1)C(C)C(=O)O,0",DatasetCollection.MOLECULENET,2.0
6,LIPOPHILICITY,1,TaskType.REGRESSION,MoleculeNet: Octonol/water distribution coefff...,Metrics.RMSE,datasets/raw_data/MoleculeNet/lipophilicity.csv,"Cn1c(CN2CCN(CC2)c3ccc(Cl)cc3)nc4ccccc14,3.54",DatasetCollection.MOLECULENET,
7,MUV,17,TaskType.CLASSIFICATION,MoleculeNet: PubChem derived target-activity t...,Metrics.ROCAUC,datasets/raw_data/MoleculeNet/muv.csv,Cc1cccc(N2CCN(C(=O)C34CC5CC(CC(C5)C3)C4)CC2)c1...,DatasetCollection.MOLECULENET,2.0
8,PCBA,128,TaskType.CLASSIFICATION,MoleculeNet: PubChem derived target-activity (...,Metrics.ROCAUC,datasets/raw_data/MoleculeNet/pcba.csv,"CC(=O)N1CCC2(CC1)NC(=O)N(c1ccccc1)N2,0 0 -1 0 ...",DatasetCollection.MOLECULENET,2.0
9,QM7,1,TaskType.REGRESSION,MoleculeNet: Electronic properties derived fro...,Metrics.MAE,datasets/raw_data/MoleculeNet/qm7.csv,"C([H])([H])([H])[H],-417.96",DatasetCollection.MOLECULENET,


Get more information about a particular dataset

In [3]:
# Look up the registry entry for TOX21 and render its dataclass fields
# as a one-row table.
dataset = DatasetRegistry.get_instance().get_dataset_info('TOX21')
dataset_fields = asdict(dataset)
pd.DataFrame([dataset_fields])

Unnamed: 0,dataset_name,num_tasks,task_type,description,preferred_metric,path,example,collection,num_classes
0,TOX21,12,TaskType.CLASSIFICATION,MoleculeNet: Toxicity against set of targets,Metrics.ROCAUC,datasets/raw_data/MoleculeNet/tox21.csv,"CCOc1ccc2nc(S(N)(=O)=O)sc2c1,0 0 1 -1 -1 0 0 1...",DatasetCollection.MOLECULENET,2


The ComputationalADME dataset is supported, but the raw data is not provided in the downloadable data root.

In [4]:
# Tabulate every dataset registered under the ComputationalADME collection.
adme_collection = DatasetRegistry.get_instance().get_collection('ComputationalADME')
pd.DataFrame(adme_collection)

Unnamed: 0,dataset_name,num_tasks,task_type,description,preferred_metric,path,example,collection,num_classes
0,HLM,1,TaskType.REGRESSION,ComputationalADME: LOG HLM_CLint (mL/min/kg),Metrics.RMSE,datasets/raw_data/ComputationalADME/hlm.csv,CNc1cc(Nc2cccn(-c3ccccn3)c2=O)nn2c(C(=O)N[C@@H...,DatasetCollection.COMPUTATIONALADME,
1,HPPB,1,TaskType.REGRESSION,ComputationalADME: LOG PLASMA PROTEIN BINDING ...,Metrics.RMSE,datasets/raw_data/ComputationalADME/hppb.csv,CNc1cc(Nc2cccn(-c3ccccn3)c2=O)nn2c(C(=O)N[C@@H...,DatasetCollection.COMPUTATIONALADME,
2,MDR1-MDCK-ER,1,TaskType.REGRESSION,ComputationalADME: LOG MDR1-MDCK ER (B-A/A-B),Metrics.RMSE,datasets/raw_data/ComputationalADME/mdr1-mdck-...,CNc1cc(Nc2cccn(-c3ccccn3)c2=O)nn2c(C(=O)N[C@@H...,DatasetCollection.COMPUTATIONALADME,
3,RLM,1,TaskType.REGRESSION,ComputationalADME: LOG RLM_CLint (mL/min/kg),Metrics.RMSE,datasets/raw_data/ComputationalADME/rlm.csv,CNc1cc(Nc2cccn(-c3ccccn3)c2=O)nn2c(C(=O)N[C@@H...,DatasetCollection.COMPUTATIONALADME,
4,RPPB,1,TaskType.REGRESSION,ComputationalADME: LOG PLASMA PROTEIN BINDING ...,Metrics.RMSE,datasets/raw_data/ComputationalADME/rppb.csv,CNc1cc(Nc2cccn(-c3ccccn3)c2=O)nn2c(C(=O)N[C@@H...,DatasetCollection.COMPUTATIONALADME,
5,SOLUBILITY,1,TaskType.REGRESSION,ComputationalADME: LOG SOLUBILITY PH 6.8 (ug/mL),Metrics.RMSE,datasets/raw_data/ComputationalADME/solubility...,CNc1cc(Nc2cccn(-c3ccccn3)c2=O)nn2c(C(=O)N[C@@H...,DatasetCollection.COMPUTATIONALADME,


## Using Models from HuggingFace

We have made pretrained and finetuned models available on [HuggingFace](https://huggingface.co/ibm/biomed.sm.mv-te-84m).

### Loading Pretrained Model from HuggingFace

Load a pretrained model from HuggingFace by setting `model_path` to a HuggingFace repo and setting the `huggingface` argument to `True`.

In [5]:
# Pull the pretrained checkpoint from the HuggingFace repo named by `model_path`.
model = SmallMoleculeMultiViewModel.from_pretrained(
    LateFusionStrategy.ATTENTIONAL,
    model_path='ibm/biomed.sm.mv-te-84m',
    huggingface=True,
)

2025-06-03 14:25:48,400 - root - INFO - ccc-login3.pok.ibm.com:140229646791360:0:0 - Loading checkpoint via HuggingFace Hub from provided path ibm/biomed.sm.mv-te-84m
2025-06-03 14:25:49,363 - root - INFO - ccc-login3.pok.ibm.com:140229646791360:0:0 - Using coeff_mlp architecture for aggregator
2025-06-03 14:25:49,364 - root - INFO - ccc-login3.pok.ibm.com:140229646791360:0:0 - dim_list [512, 512, 768] for aggregator
2025-06-03 14:25:51,169 - root - INFO - ccc-login3.pok.ibm.com:140229646791360:0:0 - in train False setting deterministic_eval = True


### Get Embeddings from a Pretrained Model

In [6]:
# Embed a single SMILES string using the pretrained HuggingFace checkpoint.
example_smiles = "CC(C)CC1=CC=C(C=C1)C(C)C(=O)O"
embedding_kwargs = {"model_path": "ibm/biomed.sm.mv-te-84m", "huggingface": True}
example_emb = SmallMoleculeMultiViewModel.get_embeddings(
    smiles=example_smiles, **embedding_kwargs
)
print(example_emb.shape)

2025-06-03 14:25:51,424 - root - INFO - ccc-login3.pok.ibm.com:140229646791360:0:0 - Loading checkpoint via HuggingFace Hub from provided path ibm/biomed.sm.mv-te-84m
2025-06-03 14:25:52,478 - root - INFO - ccc-login3.pok.ibm.com:140229646791360:0:0 - Using coeff_mlp architecture for aggregator
2025-06-03 14:25:52,479 - root - INFO - ccc-login3.pok.ibm.com:140229646791360:0:0 - dim_list [512, 512, 768] for aggregator
2025-06-03 14:25:52,644 - root - INFO - ccc-login3.pok.ibm.com:140229646791360:0:0 - in train False setting deterministic_eval = True


torch.Size([512])


### Load Finetuned Model from HuggingFace

In [7]:
# Obtain the registry through get_instance(), as the dataset-exploration
# cells do, instead of constructing a new DatasetRegistry.
dataset_registry = DatasetRegistry.get_instance()
# Registry entry for BACE; handed to from_finetuned / get_predictions below.
ds = dataset_registry.get_dataset_info('BACE')

In [8]:
# HuggingFace repo that hosts the BACE finetuned checkpoint.
bace_repo = "ibm/biomed.sm.mv-te-84m-MoleculeNet-ligand_scaffold-BACE-101"
finetuned_model_ds = SmallMoleculeMultiViewModel.from_finetuned(
    ds, model_path=bace_repo, inference_mode=True, huggingface=True
)

2025-06-03 14:25:55,228 - root - INFO - ccc-login3.pok.ibm.com:140229646791360:0:0 - Loading checkpoint via HuggingFace Hub from provided path ibm/biomed.sm.mv-te-84m-MoleculeNet-ligand_scaffold-BACE-101
2025-06-03 14:25:56,019 - root - INFO - ccc-login3.pok.ibm.com:140229646791360:0:0 - Using coeff_mlp architecture for aggregator
2025-06-03 14:25:56,019 - root - INFO - ccc-login3.pok.ibm.com:140229646791360:0:0 - dim_list [512, 512, 768] for aggregator
2025-06-03 14:25:57,907 - root - INFO - ccc-login3.pok.ibm.com:140229646791360:0:0 - in train False setting deterministic_eval = True


### Get Predictions from Finetuned Model

In [9]:
# Run the finetuned model loaded above on the example SMILES and print the result.
prediction = SmallMoleculeMultiViewModel.get_predictions(
    example_smiles,
    ds,
    finetuned_model=finetuned_model_ds,
)
print(prediction)

tensor(0, dtype=torch.int32)


NOTE: The ComputationalADME finetuned checkpoints are only available on HuggingFace. They are not included among the downloadable files in the data_root folder (specified in the README).

In [10]:
# Use SOLUBILITY-specific names so the BACE `ds`, `finetuned_model_ds` and
# `prediction` objects from the earlier cells are not overwritten; re-running
# the BACE prediction cell after this one then still uses the BACE model.
solubility_ds = dataset_registry.get_dataset_info('SOLUBILITY')

# Load the SOLUBILITY finetuned checkpoint from its HuggingFace repo.
finetuned_model_solubility = SmallMoleculeMultiViewModel.from_finetuned(
    solubility_ds,
    model_path="ibm-research/biomed.sm.mv-te-84m-ComputationalADME-random-SOLUBILITY-101",
    inference_mode=True,
    huggingface=True,
)

# SOLUBILITY is registered as a regression task, so a single float is printed.
solubility_prediction = SmallMoleculeMultiViewModel.get_predictions(
    example_smiles, solubility_ds, finetuned_model=finetuned_model_solubility
)
print(solubility_prediction)

2025-06-03 14:25:58,736 - root - INFO - ccc-login3.pok.ibm.com:140229646791360:0:0 - Loading checkpoint via HuggingFace Hub from provided path ibm-research/biomed.sm.mv-te-84m-ComputationalADME-random-SOLUBILITY-101
2025-06-03 14:25:59,504 - root - INFO - ccc-login3.pok.ibm.com:140229646791360:0:0 - Using coeff_mlp architecture for aggregator
2025-06-03 14:25:59,506 - root - INFO - ccc-login3.pok.ibm.com:140229646791360:0:0 - dim_list [512, 512, 768] for aggregator
2025-06-03 14:26:01,171 - root - INFO - ccc-login3.pok.ibm.com:140229646791360:0:0 - in train False setting deterministic_eval = True


tensor(1.4094)


## Using Local Data

You can also download the checkpoints and data to your system and use the API locally. Follow the procedure in the README to set up the data, and make sure that the `BMFM_HOME` environment variable points to the data_root.

In [11]:
# Show the data root that the local-checkpoint examples below read from.
bmfm_home = os.environ['BMFM_HOME']
print(bmfm_home)

/dccstor/fmm/data_root


### Loading Pretrained Model (Local)

Load our provided pretrained checkpoint, which uses an Attentional Late Fusion strategy (default). The model is loaded in inference mode.

In [12]:
# No model_path is given, so the default local checkpoint is loaded.
model_local = SmallMoleculeMultiViewModel.from_pretrained(
    LateFusionStrategy.ATTENTIONAL,
    inference_mode=True,
)

2025-06-03 14:26:02,757 - root - INFO - ccc-login3.pok.ibm.com:140229646791360:0:0 - Using coeff_mlp architecture for aggregator
2025-06-03 14:26:02,758 - root - INFO - ccc-login3.pok.ibm.com:140229646791360:0:0 - dim_list [512, 512, 768] for aggregator
2025-06-03 14:26:02,777 - root - INFO - ccc-login3.pok.ibm.com:140229646791360:0:0 - Loading checkpoint from default path /dccstor/fmm/data_root/bmfm_model_dir/pretrained/MULTIVIEW_MODEL/biomed-smmv-with-coeff-agg.pth
2025-06-03 14:26:04,609 - root - INFO - ccc-login3.pok.ibm.com:140229646791360:0:0 - Loading pretrain checkpoint for SmallMoleculeMultiView Model - <All keys matched successfully>
2025-06-03 14:26:04,634 - root - INFO - ccc-login3.pok.ibm.com:140229646791360:0:0 - in train False setting deterministic_eval = True


We can also load with a different fusion strategy. Note that the aggregator will not be pretrained, but the BaseModel will be.

In [13]:
# Swap in a different late-fusion strategy; the default checkpoint is still loaded.
model_local = SmallMoleculeMultiViewModel.from_pretrained(
    LateFusionStrategy.MOE_WEIGHTED_CONCAT_PROJECTED,
    inference_mode=True,
)

2025-06-03 14:26:05,598 - root - INFO - ccc-login3.pok.ibm.com:140229646791360:0:0 - Using moe_weighted_concat architecture for aggregator
2025-06-03 14:26:05,599 - root - INFO - ccc-login3.pok.ibm.com:140229646791360:0:0 - dim_list [512, 512, 768] for aggregator
2025-06-03 14:26:05,605 - root - INFO - ccc-login3.pok.ibm.com:140229646791360:0:0 - Loading checkpoint from default path /dccstor/fmm/data_root/bmfm_model_dir/pretrained/MULTIVIEW_MODEL/biomed-smmv-with-coeff-agg.pth
2025-06-03 14:26:05,853 - root - INFO - ccc-login3.pok.ibm.com:140229646791360:0:0 - Loading pretrain checkpoint for SmallMoleculeMultiView Model - _IncompatibleKeys(missing_keys=['aggregator.weighted_concat_network.gating_network.fc.weight', 'aggregator.weighted_concat_network.gating_network.fc.bias'], unexpected_keys=['aggregator.w_before_mean.0.weight', 'aggregator.w_before_mean.0.bias', 'aggregator.w_before_mean.2.weight', 'aggregator.down_project.weight', 'aggregator.down_project.bias', 'aggregator.shared_ta

We can also set `model_path` to load from a specific checkpoint.

In [14]:
# Build the checkpoint path with os.path.join so that a trailing slash in
# BMFM_HOME does not produce a doubled path separator.
ckpt_path = os.path.join(
    os.environ['BMFM_HOME'],
    "bmfm_model_dir",
    "pretrained",
    "MULTIVIEW_MODEL",
    "biomed-smmv-base.pth",
)
# Load that specific checkpoint with the CONCAT fusion strategy.
model_local = SmallMoleculeMultiViewModel.from_pretrained(
    LateFusionStrategy.CONCAT,
    model_path=ckpt_path,
    inference_mode=True,
)

2025-06-03 14:26:06,730 - root - INFO - ccc-login3.pok.ibm.com:140229646791360:0:0 - Using concat architecture for aggregator
2025-06-03 14:26:06,731 - root - INFO - ccc-login3.pok.ibm.com:140229646791360:0:0 - dim_list [512, 512, 768] for aggregator
2025-06-03 14:26:06,732 - root - INFO - ccc-login3.pok.ibm.com:140229646791360:0:0 - Loading checkpoint from provided path /dccstor/fmm/data_root/bmfm_model_dir/pretrained/MULTIVIEW_MODEL/biomed-smmv-base.pth
2025-06-03 14:26:08,098 - root - INFO - ccc-login3.pok.ibm.com:140229646791360:0:0 - Loading pretrain checkpoint for SmallMoleculeMultiView Model - <All keys matched successfully>
2025-06-03 14:26:08,118 - root - INFO - ccc-login3.pok.ibm.com:140229646791360:0:0 - in train False setting deterministic_eval = True


We can set `model_path` to `False` to load a non-pretrained model.

In [15]:
# model_path=False means no checkpoint is used to initialize the model.
model_local = SmallMoleculeMultiViewModel.from_pretrained(
    LateFusionStrategy.MOE_NOISED_WEIGHTED_CONCAT_BOTH_PROJECTED,
    model_path=False,
    inference_mode=False,
)

2025-06-03 14:26:09,072 - root - INFO - ccc-login3.pok.ibm.com:140229646791360:0:0 - Using moe_noised_weighted_concat architecture for aggregator
2025-06-03 14:26:09,073 - root - INFO - ccc-login3.pok.ibm.com:140229646791360:0:0 - dim_list [512, 512, 768] for aggregator
2025-06-03 14:26:09,078 - root - INFO - ccc-login3.pok.ibm.com:140229646791360:0:0 - Not using checkpoint for model initialization


### Get Embeddings from a Pretrained Model (Local)

In [16]:
# With no model_path, get_embeddings loads the default local checkpoint.
example_smiles = "CC(C)CC1=CC=C(C=C1)C(C)C(=O)O"
example_emb = SmallMoleculeMultiViewModel.get_embeddings(smiles=example_smiles)
example_emb.shape

2025-06-03 14:26:10,031 - root - INFO - ccc-login3.pok.ibm.com:140229646791360:0:0 - Using coeff_mlp architecture for aggregator
2025-06-03 14:26:10,032 - root - INFO - ccc-login3.pok.ibm.com:140229646791360:0:0 - dim_list [512, 512, 768] for aggregator
2025-06-03 14:26:10,050 - root - INFO - ccc-login3.pok.ibm.com:140229646791360:0:0 - Loading checkpoint from default path /dccstor/fmm/data_root/bmfm_model_dir/pretrained/MULTIVIEW_MODEL/biomed-smmv-with-coeff-agg.pth
2025-06-03 14:26:10,307 - root - INFO - ccc-login3.pok.ibm.com:140229646791360:0:0 - Loading pretrain checkpoint for SmallMoleculeMultiView Model - <All keys matched successfully>
2025-06-03 14:26:10,330 - root - INFO - ccc-login3.pok.ibm.com:140229646791360:0:0 - in train False setting deterministic_eval = True


torch.Size([512])

### Load Finetuned Model (Local)

Choose a supported dataset for which a finetuned checkpoint is available.

In [17]:
# Example of a classification prediction.
# Obtain the registry through get_instance(), as the dataset-exploration
# cells do, instead of constructing a new DatasetRegistry.
dataset_registry = DatasetRegistry.get_instance()
bace_ds = dataset_registry.get_dataset_info('BACE')

In [18]:
# No model_path is given, so the finetuned BACE checkpoint comes from the local data root.
finetuned_model_bace_local = SmallMoleculeMultiViewModel.from_finetuned(
    bace_ds,
    inference_mode=True,
)

2025-06-03 14:26:15,088 - root - INFO - ccc-login3.pok.ibm.com:140229646791360:0:0 - Using coeff_mlp architecture for aggregator
2025-06-03 14:26:15,089 - root - INFO - ccc-login3.pok.ibm.com:140229646791360:0:0 - dim_list [512, 512, 768] for aggregator
2025-06-03 14:26:15,843 - root - INFO - ccc-login3.pok.ibm.com:140229646791360:0:0 - Loading finetune checkpoint for SmallMoleculeMultiView Model - <All keys matched successfully>
2025-06-03 14:26:16,522 - root - INFO - ccc-login3.pok.ibm.com:140229646791360:0:0 - Loading finetune checkpoint for Prediction Head: /dccstor/fmm/data_root/bmfm_model_dir/finetuned/MULTIVIEW_MODEL/MoleculeNet/ligand_scaffold/BACE/best-101.ckpt
2025-06-03 14:26:16,586 - root - INFO - ccc-login3.pok.ibm.com:140229646791360:0:0 - in train False setting deterministic_eval = True


In [19]:
# Predict the BACE class label for the example SMILES with the local finetuned model.
example_smiles = "CC(C)CC1=CC=C(C=C1)C(C)C(=O)O"
bace_prediction = SmallMoleculeMultiViewModel.get_predictions(
    example_smiles,
    bace_ds,
    finetuned_model=finetuned_model_bace_local,
)
bace_prediction

tensor(0, dtype=torch.int32)

In [20]:
# Example of a regression prediction: load the local finetuned ESOL model
# and predict for the same example SMILES.
esol_ds = dataset_registry.get_dataset_info('ESOL')
finetuned_model_esol_local = SmallMoleculeMultiViewModel.from_finetuned(
    esol_ds,
    inference_mode=True,
)
esol_prediction = SmallMoleculeMultiViewModel.get_predictions(
    example_smiles,
    esol_ds,
    finetuned_model=finetuned_model_esol_local,
)
esol_prediction

2025-06-03 14:26:21,670 - root - INFO - ccc-login3.pok.ibm.com:140229646791360:0:0 - Using coeff_mlp architecture for aggregator
2025-06-03 14:26:21,671 - root - INFO - ccc-login3.pok.ibm.com:140229646791360:0:0 - dim_list [512, 512, 768] for aggregator
2025-06-03 14:26:22,601 - root - INFO - ccc-login3.pok.ibm.com:140229646791360:0:0 - Loading finetune checkpoint for SmallMoleculeMultiView Model - <All keys matched successfully>
2025-06-03 14:26:23,299 - root - INFO - ccc-login3.pok.ibm.com:140229646791360:0:0 - Loading finetune checkpoint for Prediction Head: /dccstor/fmm/data_root/bmfm_model_dir/finetuned/MULTIVIEW_MODEL/MoleculeNet/ligand_scaffold/ESOL/best-101.ckpt
2025-06-03 14:26:23,364 - root - INFO - ccc-login3.pok.ibm.com:140229646791360:0:0 - in train False setting deterministic_eval = True


tensor(-3.5532)