# The aim is to train our selected transfer learning model on all of our samples, we will use the CharmBERTa

In [1]:
%pip install transformers torch tqdm


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


## ChemBERTa Embedding + Dataset Loader

In [2]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
import os

# Load ChemBERTa model/tokenizer
tokenizer = AutoTokenizer.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")
model = AutoModel.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def extract_embeddings(smiles_list, batch_size=32):
    embeddings = []
    for i in tqdm(range(0, len(smiles_list), batch_size)):
        batch = smiles_list[i:i+batch_size]
        encodings = tokenizer(batch, padding=True, truncation=True, return_tensors="pt", max_length=128)
        input_ids = encodings['input_ids'].to(device)
        attention_mask = encodings['attention_mask'].to(device)
        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            cls_embeddings = outputs.last_hidden_state[:, 0, :]  # CLS token
            embeddings.append(cls_embeddings.cpu().numpy())
    return np.vstack(embeddings)


  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


## Load Data + Embed + Save

In [3]:
# === Load original CSVs with SMILES ===
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

datasets = {
    "train": "Data/CT-ADE-SOC/train.csv",
    "val": "Data/CT-ADE-SOC/val.csv",
    "test": "Data/CT-ADE-SOC/test.csv"
}

target_cols = [col for col in pd.read_csv(datasets["train"]).columns if col.startswith("label_")]
chemberta_data = {}

for split, path in datasets.items():
    df = pd.read_csv(path).dropna(subset=["smiles"])  # remove invalid SMILES
    smiles_list = df["smiles"].tolist()
    print(f"🔬 Extracting embeddings for {split} set ({len(smiles_list)} molecules)...")
    
    # Run ChemBERTa embedding
    X = extract_embeddings(smiles_list)
    y = df[target_cols].values
    
    chemberta_data[split] = (X, y)


🔬 Extracting embeddings for train set (12419 molecules)...


100%|██████████| 389/389 [00:08<00:00, 45.21it/s]


🔬 Extracting embeddings for val set (1518 molecules)...


100%|██████████| 48/48 [00:00<00:00, 57.79it/s]


🔬 Extracting embeddings for test set (1260 molecules)...


100%|██████████| 40/40 [00:00<00:00, 51.37it/s]


## Train MLP classifier 

In [14]:
from torch.utils.data import Dataset, DataLoader

class ChemDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

batch_size = 64
train_loader = DataLoader(ChemDataset(*chemberta_data['train']), batch_size=batch_size, shuffle=True)
val_loader = DataLoader(ChemDataset(*chemberta_data['val']), batch_size=batch_size)

import torch.nn as nn
class ChemBERTaMLP(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, output_dim),
            nn.Sigmoid()
        )
    def forward(self, x):
        return self.net(x)

model = ChemBERTaMLP(768, len(target_cols)).to(device)
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Train loop
for epoch in range(20):
    model.train()
    total_loss = 0
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        preds = model(xb)
        loss = loss_fn(preds, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1} | Loss: {total_loss:.4f}")


Epoch 1 | Loss: 53.7769
Epoch 2 | Loss: 50.1690
Epoch 3 | Loss: 49.4614
Epoch 4 | Loss: 48.9339
Epoch 5 | Loss: 48.2256
Epoch 6 | Loss: 47.7185
Epoch 7 | Loss: 47.6535
Epoch 8 | Loss: 47.5768
Epoch 9 | Loss: 46.8161
Epoch 10 | Loss: 46.4474
Epoch 11 | Loss: 46.3456
Epoch 12 | Loss: 45.9062
Epoch 13 | Loss: 45.7065
Epoch 14 | Loss: 45.4276
Epoch 15 | Loss: 45.3747
Epoch 16 | Loss: 45.2678
Epoch 17 | Loss: 44.8617
Epoch 18 | Loss: 44.7204
Epoch 19 | Loss: 44.3440
Epoch 20 | Loss: 44.3042


## save inference

In [15]:
# Save model weights only (safe for future use)
os.makedirs("Model/ChemBERta", exist_ok=True)
torch.save(model.state_dict(), "Model/ChemBERta/chemberta_weights.pt")
print("✅ Model weights saved to Model/ChemBERta/chemberta_weights.pt")


✅ Model weights saved to Model/ChemBERta/chemberta_weights.pt


## Eval

In [18]:
from sklearn.metrics import classification_report, roc_auc_score, average_precision_score
import numpy as np

# Load test data
X_test, y_test = chemberta_data["test"]
test_loader = DataLoader(ChemDataset(X_test, y_test), batch_size=batch_size)

# Reload model from weights (optional if already in memory)
model = ChemBERTaMLP(768, len(target_cols))
model.load_state_dict(torch.load("Model/ChemBERta/chemberta_weights.pt"))
model.to(device)
model.eval()

# Evaluation Function
def evaluate(model, dataloader, y_true):
    y_probs = []
    with torch.no_grad():
        for xb, _ in dataloader:
            xb = xb.to(device)
            probs = model(xb).cpu().numpy()
            y_probs.append(probs)
    
    y_probs = np.vstack(y_probs)
    y_pred = (y_probs > 0.5).astype(int)

    print("🔍 Classification Report (micro average):")
    print(classification_report(y_true, y_pred, zero_division=0))

    try:
        roc_auc_macro = roc_auc_score(y_true, y_probs, average="macro")
        roc_auc_micro = roc_auc_score(y_true, y_probs, average="micro")
        print(f"✅ ROC-AUC (Macro): {roc_auc_macro:.4f}")
        print(f"✅ ROC-AUC (Micro): {roc_auc_micro:.4f}")
    except ValueError:
        print("⚠️ ROC-AUC could not be calculated.")

    avg_prec = average_precision_score(y_true, y_probs, average="micro")
    print(f"✅ Average Precision (Micro): {avg_prec:.4f}")

# Run evaluation
print("🧪 Evaluating on Test Set...")
evaluate(model, test_loader, y_test)


🧪 Evaluating on Test Set...
🔍 Classification Report (micro average):
              precision    recall  f1-score   support

           0       0.71      0.04      0.07       140
           1       0.00      0.00      0.00       106
           2       0.00      0.00      0.00         3
           3       0.00      0.00      0.00        40
           4       0.00      0.00      0.00        23
           5       0.00      0.00      0.00       111
           6       0.39      0.25      0.30       513
           7       0.21      0.08      0.12       288
           8       0.00      0.00      0.00        26
           9       0.00      0.00      0.00        18
          10       0.09      0.03      0.05       367
          11       0.00      0.00      0.00       106
          12       0.62      0.02      0.04       235
          13       0.50      0.06      0.11       159
          14       0.07      0.03      0.04       231
          15       0.00      0.00      0.00        29
          16

## SHAP

In [23]:
# Ensure SHAP values and feature input align
print("SHAP values shape:", shap_values[0].shape)
print("Input features shape:", X_explain.cpu().numpy().shape)

SHAP values shape: (768, 27)
Input features shape: (10, 768)


In [27]:
import shap
import torch
import numpy as np
import matplotlib.pyplot as plt

def explain_with_shap(model, X_tensor, y_labels, feature_names, device, frame_index=20):
    model.eval()

    # Select background and test sample
    background = X_tensor[:200].to(device)  # (200, 768)
    test_sample = X_tensor[frame_index:frame_index+1].to(device)  # (1, 768)

    # Use GradientExplainer (DeepExplainer fails)
    explainer = shap.GradientExplainer(model, background)

    # Get predicted class with highest confidence (sigmoid output)
    with torch.no_grad():
        output = model(test_sample).cpu().numpy()[0]  # shape: [num_classes]
        pred_class = np.argmax(output)
        pred_label = y_labels[pred_class]
        confidence = output[pred_class]

    # Compute SHAP values
    shap_values = explainer.shap_values(test_sample)
    shap_vector = shap_values[pred_class][0]  # shape: (768,)
    base_value = explainer.expected_value[pred_class]

    # Create SHAP Explanation object
    explanation = shap.Explanation(
        values=shap_vector,
        base_values=base_value,
        data=X_tensor[frame_index].cpu().numpy(),
        feature_names=feature_names
    )

    # Plot waterfall
    fig = plt.figure(figsize=(10, 6))
    shap.plots.waterfall(explanation, max_display=15, show=False)
    plt.tight_layout()

    return fig, pred_label, confidence


In [28]:
X_tensor = torch.tensor(chemberta_data["test"][0], dtype=torch.float32)
y_labels = target_cols  # List of class labels (column names)
feature_names = [f"dim_{i}" for i in range(768)]

fig, pred_label, confidence = explain_with_shap(
    model, X_tensor, y_labels, feature_names, device, frame_index=20
)
print(f"🔍 Predicted class: {pred_label} (confidence = {confidence:.2f})")
fig.show()


IndexError: index 6 is out of bounds for axis 0 with size 1