In [5]:
import torch
from transformers import AutoModel, AutoTokenizer,AutoModelForMaskedLM, AutoModelForCausalLM,AutoModelForSeq2SeqLM,GraphormerForGraphClassification
import pubchempy as pcp
from scipy.io import loadmat
import pandas as pd
import numpy as np 
from rdkit import Chem
import os

In [None]:
base_dir = '../../../../T5 EVO'

In [None]:
set_seeds(seed=2024)

In [29]:
def extract_representations(
    tokenizer,
    model,
    model_name: str,   
    ds: str,
    input_type: str = "smiles",   # "smiles" or "selfies"
    token: int = 0,               # which token to read (e.g., CLS/first)
    save_path: str | None = None,
) -> pd.DataFrame:
    """
    Create embeddings from UNIQUE CIDs using either SMILES or SELFIES.
    The output DF has:
        ['cid', 'isomeric_text', 'canonical_text', 'input_type', 'model', 'layer', 'e0', ..., 'e{d-1}'].

    Tokenization uses the chosen input_type; for each row it prefers the isomeric text if present,
    otherwise falls back to the canonical text. Both text variants are kept as metadata columns.
    """
    assert input_type.lower() in {"smiles", "selfies"}, "input_type must be 'smiles' or 'selfies'"
    model.eval()

    # ---- Load & pick columns ----
    df = pd.read_csv(f"datasets/{ds}/{ds}_data.csv")
    if "cid" not in df.columns:
        raise ValueError("Dataset must contain a 'cid' column.")

    if input_type.lower() == "smiles":
        iso_col = "isomericsmiles"
        can_col = "canonicalsmiles"
    else:  # selfies
        iso_col = "isomericselfies"
        can_col = "canonicalselfies"

    if iso_col is None and can_col is None:
        raise ValueError(f"No {input_type} columns found.")

    use_cols = ["cid"]
    if iso_col: use_cols.append(iso_col)
    if can_col: use_cols.append(can_col)

    work = df[use_cols].copy()
    work = work.drop_duplicates(subset=["cid"], keep="first").sort_values("cid").reset_index(drop=True)

   
    # ---- Device ----
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    rows = []
    emb_dim = None

    for cid, iso_txt, can_txt in zip(work["cid"], work[iso_col], work[can_col]):
        # --- normalize texts ---
        iso_txt = iso_txt if isinstance(iso_txt, str) and iso_txt.strip() else ""
        can_txt = can_txt if isinstance(can_txt, str) and can_txt.strip() else ""
        if not iso_txt and not can_txt:
            continue  # nothing to encode for this cid

        # --- run model for isomeric (if present) ---
        iso_hiddens = None
        if iso_txt:
            iso_inputs = tokenizer([iso_txt], padding=True, truncation=True, return_tensors="pt")
            iso_inputs = {k: v.to(device) for k, v in iso_inputs.items()}
            with torch.no_grad():
                iso_out = model(**iso_inputs, output_hidden_states=True)
            iso_hiddens = iso_out.hidden_states  # tuple of [B,T,D] tensors

        # --- run model for canonical (if present) ---
        can_hiddens = None
        if can_txt:
            can_inputs = tokenizer([can_txt], padding=True, truncation=True, return_tensors="pt")
            can_inputs = {k: v.to(device) for k, v in can_inputs.items()}
            with torch.no_grad():
                can_out = model(**can_inputs, output_hidden_states=True)
            can_hiddens = can_out.hidden_states

        # number of layers to emit = max of the two (they should match)
        n_layers = max(
            len(iso_hiddens) if iso_hiddens is not None else 0,
            len(can_hiddens) if can_hiddens is not None else 0,
        )

        for layer_idx in range(n_layers):
            # get vectors (or None) for this layer
           
            iso_vec = iso_hiddens[layer_idx][0, token, :].detach().cpu().numpy()

            
            can_vec = can_hiddens[layer_idx][0, token, :].detach().cpu().numpy()

            # set / check embedding dim
            if emb_dim is None:
                
                emb_dim = iso_vec.shape[0]
                emb_dim = can_vec.shape[0]
            # sanity: if both exist, ensure same D
            if (iso_vec is not None) and (can_vec is not None):
                assert iso_vec.shape[0] == can_vec.shape[0], "Iso/Can dims differ!"

            row = {
                "cid": cid,
                "isomeric_text": iso_txt,
                "canonical_text": can_txt,
                "input_type": input_type.lower(),   # "smiles" or "selfies"
                "model": model_name,
                "layer": layer_idx,
            }

            # add iso_* columns (fill with NaN if missing)
            if emb_dim is None:
                continue  # defensive; should not happen if any vec exists
            for i in range(emb_dim):
                row[f"iso_e{i}"] = float(iso_vec[i]) 
                row[f"can_e{i}"] = float(can_vec[i])

            rows.append(row)


        out_df = pd.DataFrame(rows)

        
    os.makedirs(save_path, exist_ok=True)

    out_df.to_csv(f"{save_path}/{ds}_{model_name.split('/')[1]}_embeddings.csv", index=False)



In [None]:
# def extract_representations(tokenizer, model,model_name,input_type='smiles',token=0):
#     model.eval()  
#     for subject_id in range(s_start,s_end+1):
#         input_molecules = pd.read_csv(f'{base_dir}/datasets/{ds}/{ds}_data.csv')[input_type].values.tolist()
#         inputs = tokenizer(input_molecules, padding=True, return_tensors="pt")
#         with torch.no_grad():
#             outputs = model(**inputs,output_hidden_states=True)
#             for i,output in enumerate(outputs.hidden_states):
#                 np.save(f'{base_dir}/fmri/embeddings{ds}/embeddings_{model_name}_{subject_id}_{i}{ds}.npy', output[:,token,:].cpu().numpy())

In [None]:
# def extract_representations_by_molfeat(model_name,transformer,input_type='smiles',token=0):
    
#     for subject_id in range(s_start,s_end+1):
#         input_molecules = pd.read_csv(f'{base_dir}/fmri/embeddings{ds}/CIDs_smiles_selfies_{subject_id}{ds}.csv')[input_type].values.tolist()
        
#         outputs = transformer(input_molecules)
#         np.save(f'{base_dir}/fmri/embeddings{ds}/embeddings_{model_name}_{subject_id}_{-1}{ds}.npy',outputs)

# Encoder-Only

## MoLFormer-XL-both-10pct

In [30]:
Input_types = {
    'ibm/MoLFormer-XL-both-10pct':'smiles',
    'seyonec/ChemBERTa-zinc-base-v1':'smiles',
    'jonghyunlee/ChemBERT_ChEMBL_pretrained':'smiles',
    "HUBioDataLab/SELFormer":'selfies'
    }

for model_name in ['ibm/MoLFormer-XL-both-10pct','seyonec/ChemBERTa-zinc-base-v1',"jonghyunlee/ChemBERT_ChEMBL_pretrained","HUBioDataLab/SELFormer"]:
    for ds in ['sagar2023','keller2016','bierling2025']:
    
        model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
        extract_representations(tokenizer, model,model_name,save_path='embeddings',ds=ds,input_type=Input_types[model_name])



Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
  df = pd.read_csv(f"datasets/{ds}/{ds}_data.csv")
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variabl

## ChemBERTa-zinc-base-v1

In [None]:
# Load model directly
tokenizer = AutoTokenizer.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")
model = AutoModelForMaskedLM.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")
extract_representations(tokenizer, model,'ChemBERTa-zinc-base-v1')

## SELFormer

In [None]:
tokenizer = AutoTokenizer.from_pretrained("HUBioDataLab/SELFormer")
model = AutoModelForMaskedLM.from_pretrained("HUBioDataLab/SELFormer")
extract_representations(tokenizer, model,'SELFormer',input_type='selfies')

## ChemBERT_ChEMBL_pretrained

In [None]:
tokenizer = AutoTokenizer.from_pretrained("jonghyunlee/ChemBERT_ChEMBL_pretrained")
model = AutoModel.from_pretrained("jonghyunlee/ChemBERT_ChEMBL_pretrained")
extract_representations(tokenizer, model,'ChemBERT_ChEMBL_pretrained')


# Decoder-Only

## BARTSmiles

In [None]:
model_path = "gayane/"
model_name = "BARTSmiles"  # Replace with actual model name if different
model = AutoModel.from_pretrained(model_path+model_name)
tokenizer = AutoTokenizer.from_pretrained(model_path+model_name,add_prefix_space=True)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

model.eval()
for subject_id in range(1, 4):
        
        
        input_molecules = pd.read_csv(f'{base_dir}/fmri/embeddings/CIDs_smiles_selfies_{subject_id}.csv')['smiles'].values.tolist()
        inputs = tokenizer(input_molecules, return_tensors="pt",return_token_type_ids=False, add_special_tokens=True,padding=True)
        
        with torch.no_grad():
            outputs = model(**inputs,output_hidden_states=True)
            for i,output in enumerate(outputs.decoder_hidden_states):
                np.save(f'{base_dir}/fmri/embeddings/embeddings_decoder_{model_name}_{subject_id}_{i}.npy', output[:,-1,:].cpu().numpy())
                print(i,output.shape)
                # np.save(f'{base_dir}/fmri/results/decoder_{model_name}_{subject_id}_{i}_avg.npy', output[:,:].cpu().numpy())


            for i,output in enumerate(outputs.encoder_hidden_states):
                np.save(f'{base_dir}/fmri/embeddings/embeddings_encoder_{model_name}_{subject_id}_{i}.npy', output[:,0,:].cpu().numpy())
                print(i,output.shape)


## SMILES-GPT

In [None]:
from transformers import GPT2Config, GPT2LMHeadModel, PreTrainedTokenizerFast
model_name= 'smiles-gpt'
model_dir = f'{base_dir}/fmri/models/smiles-gpt/'
checkpoint = "checkpoints/benchmark-5m"

config = GPT2Config.from_pretrained(model_dir+checkpoint, output_hidden_states=True)
model = GPT2LMHeadModel.from_pretrained(model_dir+checkpoint, config=config)
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_dir+checkpoint,add_prefix_space=True)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.resize_token_embeddings(len(tokenizer))
model.eval()
for subject_id in range(1, 4):
        input_molecules = pd.read_csv(f'{base_dir}/fmri/embeddings/CIDs_smiles_selfies_{subject_id}.csv')['smiles'].values.tolist()
        inputs = tokenizer(input_molecules, return_tensors="pt", add_special_tokens=True,return_token_type_ids=False,padding=True)

        with torch.no_grad():
            outputs = model(**inputs,return_dict=True)
            for i,output in enumerate(outputs.hidden_states):
                np.save(f'{base_dir}/fmri/embeddings/embeddings_{model_name}_{subject_id}_{i}.npy', output[:,-1,:].cpu().numpy())
                

# MoLGen

In [None]:
# from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# model_path = "zjunlp/"
# model_name = "MolGen-large"
# tokenizer = AutoTokenizer.from_pretrained(model_path+model_name)
# model = AutoModelForSeq2SeqLM.from_pretrained(model_path+model_name)
# model.eval()
# for subject_id in range(1, 4):
#         CIDs, smiles_subject = read_CIDs(base_dir,subject_id)
#         # smiles_subject2=['CC(C)CC1=CC=C(C=C1)C(C)C(=O)O','CC(C)CC1=CC=C(C=C1)C(C)C(=O']
#         inputs = tokenizer(smiles_subject, return_tensors="pt",return_token_type_ids=False, add_special_tokens=True,padding=True)
#         # inputs.pop("token_type_ids", None)
#         # print(type(smiles_subject), smiles_subject)
#         print(tokenizer.vocab_size)
#         print(tokenizer.tokenize(smiles_subject[0]))


#         with torch.no_grad():
#             outputs = model(**inputs,output_hidden_states=True)
#             for i,output in enumerate(outputs.decoder_hidden_states):
#                 print(i,output.shape)
#                 np.save(f'results/embeddings_decoder_{model_name}_{subject_id}_{i}.npy', output[:,-1,:].cpu().numpy())
#                 output = torch.mean(output, dim=1)
#                 np.save(f'results/embeddings_decoder_{model_name}_{subject_id}_{i}_avg.npy', output[:,:].cpu().numpy())


#             for i,output in enumerate(outputs.encoder_hidden_states):
#                 print(i,output.shape)
#                 np.save(f'results/embeddings_encoder_{model_name}_{subject_id}_{i}.npy', output[:,-1,:].cpu().numpy())
#                 output = torch.mean(output, dim=1)
#                 np.save(f'results/embeddings_encoder_{model_name}_{subject_id}_{i}_avg.npy', output[:,:].cpu().numpy())


## ChemGPT

In [None]:
tokenizer = AutoTokenizer.from_pretrained("ncfrey/ChemGPT-4.7M")
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model = AutoModelForCausalLM.from_pretrained("ncfrey/ChemGPT-4.7M")
extract_representations(tokenizer, model,'ChemGPT-4.7M',token=-1)


In [None]:
tokenizer = AutoTokenizer.from_pretrained("ncfrey/ChemGPT-19M")
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model = AutoModelForCausalLM.from_pretrained("ncfrey/ChemGPT-19M")
extract_representations(tokenizer, model,'ChemGPT-19M',token=-1)


In [None]:

tokenizer = AutoTokenizer.from_pretrained("ncfrey/ChemGPT-1.2B")
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model = AutoModelForCausalLM.from_pretrained("ncfrey/ChemGPT-1.2B")
extract_representations(tokenizer, model,'ChemGPT-1.2B',token=-1)


# MoLGPT

In [None]:
tokenizer = AutoTokenizer.from_pretrained("msb-roshan/molgpt")
model = AutoModelForCausalLM.from_pretrained("msb-roshan/molgpt")
extract_representations(tokenizer, model,'molgpt',token=-1,input_type='selfies')

# IBM SmallMoleculeMultiViewModel

# GTMGC

# Molecular Representations

In [None]:
descriptors =pd.read_csv(f'{base_dir}/fmri/molecular_descriptors_data.txt', sep='\t')
descriptors.set_index('CID', inplace=True)
descriptors.sort_values(by='CID',inplace=True)
descriptors.fillna(value=0,inplace=True)
for subject_id in range(1, 4):
    CIDs - pd.read_csv(f'{base_dir}/fmri/embeddings/CIDs_smiles_selfies_{subject_id}.csv')['CIDs'].values
    descriptors_cid = descriptors.loc[CIDs]
    descriptors_numpy = descriptors_cid.to_numpy()
    np.save(f'{base_dir}/fmri/embeddings/embeddings_molecular_descriptors_{subject_id}_1.npy', descriptors_numpy)

    #

In [None]:

#convert dataframe to numpy array


In [None]:
descriptors_cid

In [None]:


import numpy as np
# np.float = float 
import sys
# sys.path.append("/Volumes/work/phd/2025/MoLFormer_fMRI/Graphormer")
# import graphormer

from molfeat.trans.pretrained import GraphormerTransformer
# import datamol as dm
transformer = GraphormerTransformer(s='pcqm4mv2_graphormer_base', dtype=float)
# smiles = dm.freesolv().iloc[:100].smiles

# extract_representations_by_molfeat('pcqm4mv2_graphormer_base',transformer,input_type='smiles',token=-1)

ValueError: `graphormer` is required to use this featurizer.

In [None]:
from molfeat.utils import requires
print(requires.check("graphormer_pretrained"))


2025-07-30 13:52:13 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX


In [2]:
import numpy as np
print(np.__version__)
print(np.float)

1.24.4


AttributeError: module 'numpy' has no attribute 'float'.
`np.float` was a deprecated alias for the builtin `float`. To avoid this error in existing code, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.
The aliases was originally deprecated in NumPy 1.20; for more details and guidance see the original release note at:
    https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations

In [1]:
# ! conda activate molfeat_graphormer
! which python
! python --version

/Users/farzaneh/opt/anaconda3/envs/molfeat_graphormer/bin/python
Python 3.11.13
