### Example script for extracting embeddings from a pretrained MPNN-POM model

In [5]:
import sys
import warnings
warnings.filterwarnings('ignore')

In [6]:
# Make the project checkout and the mambaforge environment's lib/python3.8
# directory importable, appended to sys.path in that order.
for parent_dir in (
    "/Midgard/home/farzantn/phd/Olfaction/MoLFormer_N2024",
    "/Midgard/home/farzantn/mambaforge/envs/MolTran_CUDA11_cuda/lib/python3.8",
):
    sys.path.append(parent_dir)

In [7]:
import deepchem as dc
import os
os.environ['TF_ENABLE_MLIR_OPTIMIZATIONS'] = '1'
from openpom.feat.graph_featurizer import GraphFeaturizer, GraphConvConstants
from openpom.utils.data_utils import get_class_imbalance_ratio
from openpom.models.mpnn_pom import MPNNPOMModel
import numpy as np
import random
import torch
import pandas as pd
from constants import *

In [8]:
# Seed every random number generator used in this notebook with one value.
seed = 2024
for _seed_fn in (
    torch.manual_seed,
    torch.cuda.manual_seed,
    np.random.seed,
    random.seed,
    torch.cuda.manual_seed_all,
):
    _seed_fn(seed)

In [9]:
# Root directory of the olfaction alignment datasets; later cells build the
# Snitz and Dravnieks file paths (inputs and outputs) from it.
base_path = '/local_storage/datasets/farzaneh/alignment_olfaction_datasets'

In [10]:
def convert_todf_openpom(embeddings_dataset,cids,subjects=None,y=None):
    """Pack an embedding matrix (and optional labels) into one DataFrame.

    Parameters
    ----------
    embeddings_dataset : array-like
        2-D data accepted by ``pd.DataFrame``; one row per sample, one column
        per embedding dimension.
    cids : sequence
        One identifier per row; stored in the ``'CID'`` column.
    subjects : sequence, optional
        One subject id per row; stored in the ``'subject'`` column when given.
    y : array-like, optional
        2-D label matrix with one row per sample.

    Returns
    -------
    pd.DataFrame
        The per-dimension embedding columns, an ``'embeddings'`` column holding
        each row's full embedding as a list, ``'CID'`` and optionally
        ``'subject'``. When ``y`` is given, the per-label columns and a ``'y'``
        column (each row's labels as a list) are concatenated column-wise.

    Notes
    -----
    The per-label columns of ``y`` reuse the same integer labels as the
    embedding columns, so the result then contains duplicate column labels;
    use the ``'embeddings'`` and ``'y'`` list columns to tell them apart.
    """
    frame = pd.DataFrame(embeddings_dataset)
    # Take every column instead of a fixed ``0:768`` label slice, so the
    # embedding is never silently truncated, whatever its width.
    frame['embeddings'] = frame.values.tolist()
    frame['CID'] = cids
    if subjects is not None:
        frame['subject'] = subjects

    if y is None:
        return frame

    labels = pd.DataFrame(y)
    # Same reasoning as above: do not cap the label width at 257 columns.
    labels['y'] = labels.values.tolist()
    return pd.concat([frame, labels], axis=1)

In [11]:

def embed_mols(input_file):
    """Featurize the molecules listed in a CSV file and embed them.

    SMILES strings are read from the ``nonStereoSMILES`` column and no task
    columns are loaded. The embeddings are produced by the notebook-level
    ``model``, which must exist before this function is called.

    Returns
    -------
    tuple
        ``(embeddings, dataset)``: the output of ``model.predict_embedding``
        and the dataset created by the loader.
    """
    csv_loader = dc.data.CSVLoader(
        tasks=[],
        feature_field='nonStereoSMILES',
        featurizer=GraphFeaturizer(),
    )
    mol_dataset = csv_loader.create_dataset(inputs=[input_file])
    return model.predict_embedding(mol_dataset), mol_dataset

In [12]:
def postproce_molembeddings(embeddings,index):
    """Wrap embeddings in a DataFrame and add a per-row list column.

    Parameters
    ----------
    embeddings : array-like or pd.DataFrame
        2-D embedding data, one row per molecule.
    index : sequence
        Row index passed to ``pd.DataFrame``.

    Returns
    -------
    pd.DataFrame
        The embedding columns plus a ``'Combined'`` column holding each row's
        embedding as a list, with the index moved into a regular column by
        ``reset_index()``.
    """
    df_molecules_embeddings = pd.DataFrame(embeddings, index=index)
    # Select the embedding columns by their digit labels. This covers integer
    # labels (frames built from arrays) and string labels such as '0'..'767'
    # (frames read back from CSV), and it is not tied to a width of 768.
    embedding_columns = [
        column for column in df_molecules_embeddings.columns
        if str(column).isdigit()
    ]
    df_molecules_embeddings['Combined'] = (
        df_molecules_embeddings[embedding_columns].values.tolist()
    )
    return df_molecules_embeddings.reset_index()

In [13]:
def prepare_mols_helper(input_file,tasks,mol_type="nonStereoSMILES",index="cid"):
    """Load a CSV with task columns, featurize its molecules and embed them.

    Parameters
    ----------
    input_file : str
        Path of the CSV file to load.
    tasks : list
        Column names handed to the loader as tasks.
    mol_type : str
        Name of the column the featurizer reads molecules from.
    index : str
        Accepted for the caller's convenience; the body does not use it.

    Returns
    -------
    tuple
        ``(embeddings, dataset)``: the output of ``model.predict_embedding``
        (using the notebook-level ``model``) and the loaded dataset.
    """
    csv_loader = dc.data.CSVLoader(
        tasks=tasks,
        feature_field=mol_type,
        featurizer=GraphFeaturizer(),
    )
    mol_dataset = csv_loader.create_dataset(inputs=[input_file])

    # Echo the column names of the raw CSV for a quick visual check.
    print(pd.read_csv(input_file).columns)

    return model.predict_embedding(mol_dataset), mol_dataset

In [14]:
# download curated dataset
# !wget https://raw.githubusercontent.com/ARY2260/openpom/main/openpom/data/curated_datasets/curated_GS_LF_merged_4983.csv

# The curated dataset can also be found at `openpom/data/curated_datasets/curated_GS_LF_merged_4983.csv` in the repo.

# Path of the curated GS-LF CSV; it is read below both with pandas and with
# the DeepChem CSVLoader.
input_file = '/local_storage/datasets/farzaneh/openpom/data/curated_datasets/curated_GS_LF_merged_4983.csv' # or new downloaded file path

In [15]:
# Raw table kept next to the DeepChem dataset: later cells use it to
# cross-check the split labels and take its row index as identifiers.
df_gslf = pd.read_csv(input_file)

In [16]:
# Build the DeepChem dataset: molecules come from the SMILES column and the
# columns named in `gs_lf_tasks` are loaded as tasks.
print(os.getcwd())
smiles_field = 'nonStereoSMILES'
featurizer = GraphFeaturizer()
loader = dc.data.CSVLoader(
    tasks=gs_lf_tasks,
    feature_field=smiles_field,
    featurizer=featurizer,
)
dataset = loader.create_dataset(inputs=[input_file])
n_tasks = len(dataset.tasks)

In [17]:
# Stratified 80/10/10 train/valid/test split, seeded for repeatability.
randomstratifiedsplitter = dc.splits.RandomStratifiedSplitter()
train_dataset, valid_dataset, test_dataset = (
    randomstratifiedsplitter.train_valid_test_split(
        dataset,
        frac_train=0.8,
        frac_valid=0.1,
        frac_test=0.1,
        seed=seed,
    )
)

In [18]:

# Same splitter, fractions and seed as the dataset split, but returning the
# row indices of each partition. Presumably this reproduces the partition of
# train_valid_test_split; the label comparisons in a later cell check that.
train,valid,test=randomstratifiedsplitter.split(dataset, frac_train = 0.8, frac_valid = 0.1, frac_test = 0.1, seed = seed)

In [19]:
# Long-format table mapping every original row index to its split name,
# ordered train, then valid, then test.
_split_members = {'train': train, 'valid': valid, 'test': test}
df_train_valid_test = pd.DataFrame({
    'main_idx': [
        row_idx
        for members in _split_members.values()
        for row_idx in members
    ],
    'split': [
        split_name
        for split_name, members in _split_members.items()
        for _ in members
    ],
})


In [20]:
# Sanity-check that the index lists line up with the materialised splits.
# The row position is printed for every mismatch; no output means agreement.
split_pairs = (
    (train_dataset, train),
    (valid_dataset, valid),
    (test_dataset, test),
)

# 1) Labels of each split vs. labels of the full dataset at the same indices.
for split_data, split_indices in split_pairs:
    for i in range(len(split_indices)):
        expected = dataset.y[split_indices[i]]
        if not np.array_equal(split_data.y[i], expected):
            print(i)

# 2) Labels of each split vs. the raw CSV row with its first two columns
#    dropped.
for split_data, split_indices in split_pairs:
    for i in range(len(split_indices)):
        expected = df_gslf.iloc[split_indices[i]].values[2:].tolist()
        if not np.array_equal(split_data.y[i], expected):
            print(i)
        

In [21]:
# Class imbalance ratios computed on the training split; passed to the model
# as `class_imbalance_ratio`.
train_ratios = get_class_imbalance_ratio(train_dataset)
# One ratio is expected per task.
assert len(train_ratios) == n_tasks

In [22]:
# Display the y array of the training split.
train_dataset.y

In [23]:
# learning_rate = ExponentialDecay(initial_rate=0.001, decay_rate=0.5, decay_steps=32*15, staircase=True)
# Constant learning rate passed to MPNNPOMModel; the decay schedule above is
# left disabled.
learning_rate = 0.001

In [31]:
# initialize model
# Use the GPU when one is available and fall back to the CPU otherwise, so
# this cell also runs on machines without CUDA.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
model = MPNNPOMModel(n_tasks = n_tasks,
                            batch_size=128,
                            learning_rate=learning_rate,
                            class_imbalance_ratio = train_ratios,
                            loss_aggr_type = 'sum',
                            # message-passing network settings
                            node_out_feats = 100,
                            edge_hidden_feats = 75,
                            edge_out_feats = 100,
                            num_step_message_passing = 5,
                            mpnn_residual = True,
                            message_aggregator_type = 'sum',
                            mode = 'classification',
                            number_atom_features = GraphConvConstants.ATOM_FDIM,
                            number_bond_features = GraphConvConstants.BOND_FDIM,
                            n_classes = 1,
                            # readout settings
                            readout_type = 'set2set',
                            num_step_set2set = 3,
                            num_layer_set2set = 2,
                            # feed-forward head settings
                            ffn_hidden_list= [392, 392],
                            ffn_embeddings = 256,
                            ffn_activation = 'relu',
                            ffn_dropout_p = 0.12,
                            ffn_dropout_at_input_no_act = False,
                            weight_decay = 1e-5,
                            self_loop = False,
                            optimizer_name = 'adam',
                            log_frequency = 32,
                            # directory whose checkpoint is loaded later
                            model_dir = '../examples/experiments',
                            device_name=device_name)

In [32]:
# Epoch count for training; no cell shown in this notebook reads it.
nb_epoch = 150

In [33]:
# ROC-AUC metric handed to model.evaluate when scoring the test split.
metric = dc.metrics.Metric(dc.metrics.roc_auc_score)


In [34]:
# Display the directory the model was configured with (`model_dir`).
model.model_dir

In [35]:
# Reload the trained weights from the checkpoint stored in model.model_dir.
# This replaces model.load_from_pretrained(model), which restored the model
# from its own directory and then copied the weights onto itself.
model.restore()

In [36]:
# Score the loaded model on the held-out test split with the ROC-AUC metric.
test_scores = model.evaluate(test_dataset, [metric])['roc_auc_score']
# print("time_taken: ", str(end_time-start_time))
print("test_score: ", test_scores)

## Extracting embeddings

## GS-LF

In [32]:
# Embed every molecule of the full GS-LF dataset (all splits together).
embeddings_dataset=model.predict_embedding(dataset)

In [36]:
# The row index of the raw GS-LF table serves as the identifier that ends up
# in the 'CID' column of the exported embeddings.
cids_gslf= df_gslf.index.values.tolist()

In [44]:
# Combine embeddings, row identifiers and labels (no subject column) and
# write them to a CSV in the current working directory.
df_embeddings = convert_todf_openpom(embeddings_dataset,cids_gslf,None,dataset.y)
df_embeddings.to_csv('gslf_pom_embeddings_Apr17.csv', index=False)

In [45]:
# df_embeddings17.head(5)

### Sagar

In [32]:

# Sagar data: collect molecule ids, subject ids and the task columns, then
# featurize and embed the molecules with the shared helper.
input_file_sagar = '/local_storage/datasets/farzaneh/openpom/data/curated_datasets/curated_sagar_subjects_nonaminus.csv'
df_sagar_temp = pd.read_csv(input_file_sagar)

cids_sagar = df_sagar_temp['cid'].values.tolist()
subjects_sagar = df_sagar_temp['subject'].values.tolist()
# The 15 columns at positions 1..15 are used as task names.
sagar_tasks = list(df_sagar_temp.columns[1:16])

df_mols_embeddings_original, sagar_dataset = prepare_mols_helper(
    input_file_sagar,
    sagar_tasks,
)

In [33]:
# Display the raw Sagar table for inspection.
df_sagar_temp

In [34]:
# Combine the Sagar embeddings with ids, subjects and labels, then write them
# to a CSV in the current working directory.
df_embeddings_sagar = convert_todf_openpom(df_mols_embeddings_original,cids_sagar,subjects_sagar,sagar_dataset.y)
df_embeddings_sagar.to_csv('sagar_pom_embeddings_Apr17.csv', index=False)

### Keller

In [37]:
# Keller data: every column from position 5 onwards is used as a task; ids
# and subjects come from the 'CID' and 'Subject' columns.
input_file_keller = '/local_storage/datasets/farzaneh/openpom/data/curated_datasets/curated_keller2016_nona.csv'
df_keller_temp = pd.read_csv(input_file_keller)

keller_tasks = list(df_keller_temp.columns[5:])
cids_keller = df_keller_temp['CID'].values.tolist()
subjects_keller = df_keller_temp['Subject'].values.tolist()

df_mols_embeddings_original_keller, keller_dataset = prepare_mols_helper(
    input_file_keller,
    keller_tasks,
    index="CID",
)

In [38]:
# Combine the Keller embeddings with ids, subjects and labels, then write
# them to a CSV in the current working directory.
df_embeddings_keller = convert_todf_openpom(df_mols_embeddings_original_keller,cids_keller,subjects_keller,keller_dataset.y)
df_embeddings_keller.to_csv('keller_pom_embeddings_Apr17.csv', index=False)

### Ravia

In [52]:
# Ravia molecules: embed without task columns and export with their CIDs.
input_file_ravia = '/local_storage/datasets/farzaneh/openpom/data/curated_datasets/ravia_molecules.csv'
df_ravia_temp=pd.read_csv(input_file_ravia)
# Bind the featurized data to its own name so the notebook-level `dataset`
# (the GS-LF dataset) is not overwritten when this cell runs.
embeddings_ravia,ravia_dataset=embed_mols(input_file_ravia)
cids_ravia= df_ravia_temp['CID'].values.tolist()
df_embeddings_ravia = convert_todf_openpom(embeddings_ravia,cids_ravia)
df_embeddings_ravia.to_csv('ravia_pom_embeddings_Apr17.csv', index=False)

### Snitz

In [54]:
# Snitz molecules: embed without task columns and export with their CIDs.
input_file_snitz = '/local_storage/datasets/farzaneh/openpom/data/curated_datasets/snitz_molecules.csv'
df_snitz_temp=pd.read_csv(input_file_snitz)
# Bind the featurized data to its own name so the notebook-level `dataset`
# (the GS-LF dataset) is not overwritten when this cell runs.
embeddings_snitz,snitz_dataset=embed_mols(input_file_snitz)
cids_snitz= df_snitz_temp['CID'].values.tolist()
df_embeddings_snitz = convert_todf_openpom(embeddings_snitz,cids_snitz)
df_embeddings_snitz.to_csv('snitz_pom_embeddings_Apr17.csv', index=False)

In [40]:
# Inspection cell: re-read the Snitz molecule list from the location under
# `base_path` and list its column names.
# NOTE(review): this rebinds `input_file_snitz` and `df_snitz_temp`, which the
# Snitz embedding cell set from a different directory.
input_file_snitz = base_path + '/curated_datasets/mols_datasets/snitz_molecules.csv'
df_snitz_temp=pd.read_csv(input_file_snitz)
df_snitz_temp.columns.values.tolist()


In [50]:
# Build the Dravnieks molecule list: keep one row per CID and save it for the
# embedding cell that follows. Both row counts are printed to show how many
# rows were dropped.
input_file_dra = base_path + '/curated_datasets/tasks/dravnieks1985_applicability_1.csv'
# Read the file once; the de-duplicated frame is derived from this table
# instead of re-reading the same CSV.
df_dra_temp=pd.read_csv(input_file_dra)
print(len(df_dra_temp))
df_draviensk_temp = df_dra_temp.drop_duplicates(subset=['CID'])
print(len(df_draviensk_temp))
df_draviensk_temp.to_csv(base_path+'/curated_datasets/mols_datasets/dravnieks1985_applicability_1_molecules.csv', index=False)

#### Dravnieks

In [39]:
# Dravnieks molecules: embed the de-duplicated molecule list and export the
# embeddings with their CIDs under `base_path`.
input_file_draviensk = base_path+'/curated_datasets/mols_datasets/dravnieks1985_applicability_1_molecules.csv'
df_draviensk_temp=pd.read_csv(input_file_draviensk)
# Bind the featurized data to its own name so the notebook-level `dataset`
# (the GS-LF dataset) is not overwritten when this cell runs.
embeddings_draviensk,draviensk_dataset=embed_mols(input_file_draviensk)
cids_draviensk= df_draviensk_temp['CID'].values.tolist()
df_embeddings_draviensk = convert_todf_openpom(embeddings_draviensk,cids_draviensk)
df_embeddings_draviensk.to_csv(base_path+'/curated_datasets/embeddings/pom/dravnieks1985_applicability_1_pom_embeddings_Apr17.csv', index=False)