In [1]:
import torch
from transformers import AutoModel, AutoTokenizer,AutoModelForMaskedLM, AutoModelForCausalLM,AutoModelForSeq2SeqLM
import pubchempy as pcp
from scipy.io import loadmat
import pandas as pd
import numpy as np
from NoteBooks.utils import *

In [2]:
def extract_representations(tokenizer, model,model_name):

    for subject_id in range(1, 4):
        CIDs, smiles_subject = read_CIDs(subject_id)
        inputs = tokenizer(smiles_subject, padding=True, return_tensors="pt")
        with torch.no_grad():
            outputs = model(**inputs,output_hidden_states=True)
        
        #save the embeddings as npy together with CID and subject ID and behavior ratings
            
            for i,output in enumerate(outputs.hidden_states):
                np.save(f'results/embeddings_{model_name}_{subject_id}_{i}.npy', output[:,0,:].cpu().numpy())

In [3]:
base_dir = '../../../T5 EVO'

In [4]:
#get smiles from CID
def get_smiles_from_cid(cid):
    
    compound =  pcp.Compound.from_cid(cid)
    return compound.canonical_smiles

In [5]:
def read_CIDs(subject_id):
    mat1 = loadmat(f'{base_dir}/fmri/Fahime/behavior/behav_ratings_NEMO0{subject_id}.mat')
    CIDs = mat1['behav'][0][0]['cid']
    CIDs = CIDs.squeeze(1)
    CIDs = CIDs.tolist()
    smiles_subject = []
    for cid in CIDs:
        smiles = get_smiles_from_cid(cid)
        smiles_subject.append(smiles)
    return CIDs, smiles_subject

In [7]:
for subject_id in range(1, 4):
    CIDs, smiles_subject = read_CIDs(subject_id)
    #save the CIDs and smiles
    pd.DataFrame({'CIDs':CIDs, 'smiles':smiles_subject}).to_csv(f'results/CIDs_smiles_{subject_id}.csv')

KeyboardInterrupt: 

In [21]:
for subject_id in range(1, 4):
    mat1 = loadmat(f'{base_dir}/fmri/Fahime/behavior/behav_ratings_NEMO0{subject_id}.mat')
    ratings = mat1['behav'][0][0]['ratings']
    print(ratings.shape)
    #save the ratings
    np.save(f'results/ratings_{subject_id}.npy', ratings)

(160, 18)
(160, 18)
(160, 18)


# OpenPOM

In [7]:
# model = AutoModel.from_pretrained("ibm/MoLFormer-XL-both-10pct", deterministic_eval=True, trust_remote_code=True)
# tokenizer = AutoTokenizer.from_pretrained("ibm/MoLFormer-XL-both-10pct", trust_remote_code=True)
all_outputs = []
layers = []
all_subjects = []
for subject_id in range(1, 4):
    CIDs, smiles_subject = read_CIDs(subject_id)
    with torch.no_grad():
        outputs = read_pom(base_dir, CIDs)
        print(outputs.shape)
            #save to npy
        np.save(f'results/openpom_{subject_id}_1000.npy', outputs)
        # save the output and layers


(160, 256)
(160, 256)
(160, 256)


# MoLFormer-XL-both-10pct

In [13]:
model = AutoModel.from_pretrained("ibm/MoLFormer-XL-both-10pct", deterministic_eval=True, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("ibm/MoLFormer-XL-both-10pct", trust_remote_code=True)
all_outputs = []
layers = []
all_subjects = []
for subject_id in range(1, 4):
    CIDs, smiles_subject = read_CIDs(subject_id)
    inputs = tokenizer(smiles_subject, padding=True, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs,output_hidden_states=True)
        for i,output in enumerate(outputs.hidden_states):
            #save to npy
            np.save(f'results/embeddings_MoLFormer-XL-both-10pct_{subject_id}_{i}.npy', output[:,0,:].cpu().numpy())
        # save the output and layers
   

In [14]:
np.load('results/embeddings_MoLFormer-XL-both-10pct_1_0.npy').shape

(160, 768)

# ChemBERTa-zinc-base-v1

In [16]:
# Load model directly
tokenizer = AutoTokenizer.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")
model = AutoModelForMaskedLM.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")
extract_representations(tokenizer, model,'embeddings_ChemBERTa-zinc-base-v1')

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Some weights of the model checkpoint at seyonec/ChemBERTa-zinc-base-v1 were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# MoLGPT

In [14]:
tokenizer = AutoTokenizer.from_pretrained("msb-roshan/molgpt")
model = AutoModelForCausalLM.from_pretrained("msb-roshan/molgpt")
extract_representations(tokenizer, model,'molgpt')

# MoLGen

In [11]:
tokenizer = AutoTokenizer.from_pretrained("zjunlp/MolGen-large")
model = AutoModelForSeq2SeqLM.from_pretrained("zjunlp/MolGen-large")
extract_representations(tokenizer, model,'MolGen-large')

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


AttributeError: 'Seq2SeqLMOutput' object has no attribute 'hidden_states'

# BASRTSmiles

In [13]:
tokenizer = AutoTokenizer.from_pretrained("gayane/BARTSmiles", add_prefix_space=True)
model = AutoModel.from_pretrained('gayane/BARTSmiles')
extract_representations(tokenizer, model,'BARTSmiles')

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.

#ChemGPT

In [9]:
tokenizer = AutoTokenizer.from_pretrained("ncfrey/ChemGPT-4.7M")
model = AutoModelForCausalLM.from_pretrained("ncfrey/ChemGPT-4.7M")
extract_representations(tokenizer, model,'ChemGPT-4_7M')

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.

# Selformer

In [8]:
tokenizer = AutoTokenizer.from_pretrained("HUBioDataLab/SELFormer")
model = AutoModelForMaskedLM.from_pretrained("HUBioDataLab/SELFormer")
extract_representations(tokenizer, model,'SELFormer')

# IBM SmallMoleculeMultiViewModel

In [None]:
# Necessary imports
from bmfm_sm.api.smmv_api import SmallMoleculeMultiViewModel
from bmfm_sm.core.data_modules.namespace import LateFusionStrategy

# Load Model
model = SmallMoleculeMultiViewModel.from_pretrained(
    LateFusionStrategy.ATTENTIONAL,
    model_path="ibm/biomed.sm.mv-te-84m",
    huggingface=True
)

# Load Model and get embeddings for a molecule
example_smiles = "CC(C)CC1=CC=C(C=C1)C(C)C(=O)O"
example_emb = SmallMoleculeMultiViewModel.get_embeddings(
    smiles=example_smiles,
    model_path="ibm/biomed.sm.mv-te-84m",
    huggingface=True,
)
print(example_emb.shape)

In [None]:
# Necessary imports
from bmfm_sm.api.smmv_api import SmallMoleculeMultiViewModel
from bmfm_sm.core.data_modules.namespace import LateFusionStrategy

# Load Model
model = SmallMoleculeMultiViewModel.from_pretrained(
    LateFusionStrategy.ATTENTIONAL,
    model_path="ibm/biomed.sm.mv-te-84m",
    huggingface=True
)

# Load Model and get embeddings for a molecule
example_smiles = "CC(C)CC1=CC=C(C=C1)C(C)C(=O)O"
example_emb = SmallMoleculeMultiViewModel.get_embeddings(
    smiles=example_smiles,
    model_path="ibm/biomed.sm.mv-te-84m",
    huggingface=True,
)
print(example_emb.shape)

# ChemBERT_ChEMBL_pretrained

In [18]:
tokenizer = AutoTokenizer.from_pretrained("jonghyunlee/ChemBERT_ChEMBL_pretrained")
model = AutoModel.from_pretrained("jonghyunlee/ChemBERT_ChEMBL_pretrained")
all_outputs = []
layers = []
all_subjects = []
for subject_id in range(1, 4):
    CIDs, smiles_subject = read_CIDs(subject_id)
    inputs = tokenizer(smiles_subject, padding=True, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs,output_hidden_states=True)
    
    #save the embeddings as npy together with CID and subject ID and behavior ratings
        
        for i,output in enumerate(outputs.hidden_states):
            np.save(f'results/embeddings_ChemBERT_ChEMBL_pretrained_{subject_id}_{i}.npy', output[:,0,:].cpu().numpy())

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Some weights of BertModel were not initialized from the model checkpoint at jonghyunlee/ChemBERT_ChEMBL_pretrained and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# GTMGC

In [None]:
model = MoleBERTTokenizer.from_pretrained("RichXuOvO/MoleBERT-Tokenizer")
model = GTMGCForGraphRegression.from_pretrained("RichXuOvO/GTMGC_Small-Molecule3D-Random-Gap")  
extract_representations(tokenizer, model,'GTMGC_Small-Molecule3D-Random-Gap')