Finetuned - ChemBERTa + SVR (EmSVR-BACE): pIC50 Prediction Pipeline  
----------------------------------------- 


- Generates ChemBERTa embeddings from SMILES
- Selects key features based on a LASSO feature index list 
- Predicts pIC50 using the EmSVR-BACE model



In [2]:
# Imports

import torch
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import joblib
from tqdm import tqdm
import random
import numpy as np
import os


# Configuration
#
# The run documented here scored 500 SMILES taken from MoleculeNet with the
# EmSVR-BACE model, predicting their pIC50 values. According to the authors,
# none of these SMILES are part of the model's training or test set.

random_state = 42

# --- Inputs ---
# CSV with the molecules to score; it must have a column headed "SMILES".
input_smiles_csv = "SMILES.csv"
# CSV listing the 115 top embedding features chosen by Lasso regression.
Lasso_features = "Feature selected by lasso_115.csv"
# Path to the fine-tuned ChemBERTa model.
finetuned_model_path = "./Finetuned_ChemBERTa_Model"
# EmSVR-BACE model file.
svr_model_file = "EmSVR-BACE.joblib"

# --- Outputs (all written inside the "outputs" folder) ---
output_csv = "outputs/final_predicted_pIC50.csv"
filtered_features_csv = "outputs/EmSVR-BACE-Features.csv"
embeddings_csv = "outputs/generated_embeddings.csv"

# Make sure the output folder exists before anything is written into it.
os.makedirs("outputs", exist_ok=True)


# Reproducibility
#
# Seed PyTorch, NumPy and the standard-library RNG with the same value, then
# pick the compute device (GPU when CUDA is available, CPU otherwise).

cuda_available = torch.cuda.is_available()

for seed_rng in (torch.manual_seed, np.random.seed, random.seed):
    seed_rng(random_state)
if cuda_available:
    # Seed the CUDA generators as well when a GPU is present.
    torch.cuda.manual_seed_all(random_state)

device = torch.device("cuda") if cuda_available else torch.device("cpu")
print(f"\n Using device: {device}")
print(f" Random state set to {random_state}\n")


# Load ChemBERTa
#
# The tokenizer and the encoder are both read from the fine-tuned model path.
# The encoder is moved to the selected device and switched to evaluation mode.
# NOTE: transformers may report the pooler weights as newly initialized; the
# embedding step reads `last_hidden_state` only, so those weights are not used.

print(" 1. Loading fine-tuned ChemBERTa model...")
tokenizer = AutoTokenizer.from_pretrained(finetuned_model_path)
model = AutoModel.from_pretrained(finetuned_model_path).to(device).eval()
print(" Model loaded successfully!\n")


# Load SMILES
#
# Read the input table and pull the "SMILES" column out as a plain list.

print(f"2. Reading SMILES from: {input_smiles_csv}")
df_smiles = pd.read_csv(input_smiles_csv)

# Fail early with a clear message when the expected column is absent.
smiles_column_present = "SMILES" in df_smiles.columns
if not smiles_column_present:
    raise ValueError("Input CSV must contain a column named 'SMILES'.")

smiles_list = df_smiles["SMILES"].to_list()
print(f" Loaded {len(smiles_list)} SMILES molecules.\n")


# Generate embeddings
#
# Each SMILES is tokenized on its own (padded/truncated to 512 tokens) and
# represented by the final-layer hidden state of its first token (position 0).
# A molecule that cannot be embedded gets an all-zero placeholder row so the
# embedding table stays aligned with the input rows; every such failure is
# reported below instead of being silently ignored.

embeddings = []
failed_smiles = []  # (row index, SMILES value, error) for each failed input
print("3. Generating ChemBERTa embeddings...")
with torch.no_grad():
    for row_idx, smile in enumerate(tqdm(smiles_list, desc="Extracting embeddings")):
        try:
            inputs = tokenizer(
                smile,
                return_tensors="pt",
                padding="max_length",
                truncation=True,
                max_length=512,
            ).to(device)
            outputs = model(**inputs)
            cls_embedding = outputs.last_hidden_state[:, 0, :]
            embeddings.append(cls_embedding.cpu().squeeze().numpy())
        except Exception as exc:
            # `Exception` (not a bare `except`) so that Ctrl-C / SystemExit
            # still stop the run instead of being turned into zero vectors.
            failed_smiles.append((row_idx, smile, exc))
            embeddings.append(np.zeros(model.config.hidden_size))

if failed_smiles:
    # Printed after the loop so the messages do not interleave with the
    # progress bar.
    print(
        f"WARNING: {len(failed_smiles)} of {len(smiles_list)} SMILES could not be "
        "embedded and were replaced by zero vectors; their predictions are not meaningful."
    )
    for row_idx, smile, exc in failed_smiles:
        print(f"  row {row_idx}: {smile!r} -> {type(exc).__name__}: {exc}")

embeddings_df = pd.DataFrame(embeddings)
embeddings_df.to_csv(embeddings_csv, index=False)
print(f"Embeddings saved to: {embeddings_csv}\n")


# Feature selection using LASSO indices
#
# The first column of the LASSO file is read as integer column positions into
# the embedding table. Only positions inside [0, number of embedding columns)
# are kept: a negative value would otherwise make `iloc` count from the end
# and silently pick an unrelated column.

print(f"4. Selecting features from LASSO indices in: {Lasso_features}")
columns_df = pd.read_csv(Lasso_features)  # header exists
columns_to_keep = columns_df.iloc[:, 0].astype(int).tolist()
max_index = embeddings_df.shape[1]
valid_indices = [i for i in columns_to_keep if 0 <= i < max_index]

if not valid_indices:
    raise ValueError("No matching columns found! Check your LASSO indices.")

dropped_indices = [i for i in columns_to_keep if not 0 <= i < max_index]
if dropped_indices:
    # Keep the original best-effort behaviour (continue with what is valid),
    # but make the mismatch visible: the feature table will have fewer
    # columns than the LASSO list asks for.
    print(
        f"WARNING: ignoring {len(dropped_indices)} LASSO indices outside "
        f"0..{max_index - 1}: {dropped_indices}"
    )

filtered_df = embeddings_df.iloc[:, valid_indices]
filtered_df.to_csv(filtered_features_csv, index=False)
print(f"Filtered features saved to: {filtered_features_csv}")
print(f"Saved {len(valid_indices)} columns out of {len(columns_to_keep)} successfully!\n")


# Load SVR model and predict
#
# The regressor is restored with joblib, applied to the selected features,
# and its predictions are written next to the original input columns.
# NOTE: joblib.load unpickles the file, so only load model files you trust.

print(f"5. Loading SVR model from: {svr_model_file}")
svr_model = joblib.load(svr_model_file)

# Cast the features to float before handing them to the model.
filtered_df = filtered_df.astype(float)
print("6. Predicting pIC50 values...")
predictions = svr_model.predict(filtered_df)

# `assign` returns a copy, so the input table itself is left unchanged.
output_df = df_smiles.assign(predicted_pIC50=predictions)
output_df.to_csv(output_csv, index=False)
print(f"7. Prediction completed! Results saved to: {output_csv}\n")

print("8. Sample predictions:")
preview_columns = ["SMILES", "predicted_pIC50"]
print(output_df[preview_columns].head())


Some weights of RobertaModel were not initialized from the model checkpoint at ./Finetuned_ChemBERTa_Model and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



 Using device: cpu
 Random state set to 42

 1. Loading fine-tuned ChemBERTa model...
 Model loaded successfully!

2. Reading SMILES from: SMILES.csv
 Loaded 500 SMILES molecules.

3. Generating ChemBERTa embeddings...


Extracting embeddings: 100%|██████████| 500/500 [00:18<00:00, 27.39it/s]


Embeddings saved to: outputs/generated_embeddings.csv

4. Selecting features from LASSO indices in: Feature selected by lasso_115.csv
Filtered features saved to: outputs/EmSVR-BACE-Features.csv
Saved 115 columns out of 115 successfully!

5. Loading SVR model from: EmSVR-BACE.joblib
6. Predicting pIC50 values...
7. Prediction completed! Results saved to: outputs/final_predicted_pIC50.csv

8. Sample predictions:
                                           SMILES  predicted_pIC50
0                          Oc1ccc(cc1)CC([NH3+])C         3.316924
1                           Oc1ccc(cc1CC)CC[NH3+]         3.597867
2                       Fc1ccc(cc1)CC1CC[NH2+]CC1         3.722697
3  Clc1cc2CC(N=C(NC(Cc3ccccc3)c3ncccn3)c2cc1)(C)C         6.057023
4   O1[C@@H]2COCC[C@@]2(N=C1N)c1cc(ccc1)-c1cncnc1         6.233910


