In [1]:
# Cell 1: Imports and setup
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
import torch.optim as optim
from sklearn.cluster import Birch
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import pairwise_distances
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm
import esm
from descriptastorus.descriptors import rdNormalizedDescriptors
import warnings
# import faiss
warnings.filterwarnings('ignore')

# Fixed seed shared by NumPy and PyTorch so stochastic steps are repeatable
RANDOM_STATE = 5
torch.manual_seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

# Hook tqdm into pandas (enables the progress_* DataFrame/Series methods)
tqdm.pandas()

# Run on the GPU when CUDA is available, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

print("Setup complete!")

  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda
Setup complete!


In [2]:
# Cell 2: Load datasets and get target sample size
print("Loading datasets...")
df_complete = pd.read_csv("./datasets/cysdb_complete_with_sequences.csv")
# Rows without a SMILES string or a protein sequence are discarded up front
df_complete = df_complete.dropna(subset=['SMILES', 'Sequence'])

# Count each activity class once and reuse the totals in the report below
n_positives = int((df_complete['Activity'] == 1).sum())
n_negatives = int((df_complete['Activity'] == 0).sum())

# Set target sample size to match positives
NUM_SAMPLES = n_positives
print(f"Target sample size: {NUM_SAMPLES:,}")
print(f"Available negatives: {n_negatives:,}")
print(f"Sampling ratio: {NUM_SAMPLES / n_negatives:.3f}")

Loading datasets...
Target sample size: 49,511
Available negatives: 12,201,748
Sampling ratio: 0.004


In [6]:
import torch
from esm.models.esmc import ESMC
from esm.sdk.api import (
    ESM3InferenceClient,
    ESMProtein,
    LogitsConfig,
    LogitsOutput,
    ProteinType,
)
# Request the embeddings from the logits call; hidden states are not needed
EMBEDDING_CONFIG = LogitsConfig(
    sequence=True, return_embeddings=True, return_hidden_states=False
)
# Cell 3: ESM-C protein embeddings
print("Setting up ESM-C model...")
# Use the `device` selected in the setup cell rather than a hard-coded "cuda",
# so this cell follows the same CPU/GPU choice as the rest of the notebook.
client = ESMC.from_pretrained("esmc_600m").to(device)
def embed_sequence(model: ESM3InferenceClient, sequence: str) -> np.ndarray:
    """Embed one protein sequence as a single fixed-length vector.

    The sequence is encoded with ``model.encode``, passed through
    ``model.logits`` using ``EMBEDDING_CONFIG``, and the returned embeddings
    are averaged over the sequence-length axis.

    Args:
        model: Object exposing ``encode`` and ``logits`` (the ESM-C client).
        sequence: Amino-acid sequence to embed.

    Returns:
        1-D float32 NumPy array on the CPU (1152 values for the model loaded
        in this notebook, per the shape note below).
    """
    protein = ESMProtein(sequence=sequence)
    # Pure inference: make sure no autograd graph is kept alive per sequence
    with torch.no_grad():
        protein_tensor = model.encode(protein)
        output = model.logits(protein_tensor, EMBEDDING_CONFIG)
    # output.embeddings: shape [1, seq_len, 1152]
    mean_embedding = output.embeddings.mean(dim=1).squeeze(0)  # shape [1152]
    # Cast to float32 before leaving torch: NumPy has no bfloat16 dtype, so a
    # reduced-precision output would otherwise fail in .numpy().
    return mean_embedding.detach().float().cpu().numpy()

# Get unique proteins
proteins_unique = df_complete['Sequence'].unique()
print(f"Unique proteins to embed: {len(proteins_unique):,}")

# Announce the work before it starts (this message used to print afterwards)
print("Generating protein embeddings...")

protein_embed_dict = {}   # sequence -> mean ESM-C embedding
failed_sequences = []     # sequences whose embedding raised

# proteins_unique is already de-duplicated, so no membership check is needed
for sequence in tqdm(proteins_unique, total=len(proteins_unique), desc="Embedding proteins"):
    try:
        protein_embed_dict[sequence] = embed_sequence(client, sequence)
    except Exception as e:
        # Best effort: keep going, but remember what failed so the gap is
        # visible instead of surfacing later as a KeyError.
        failed_sequences.append(sequence)
        print(f"Error embedding {sequence}: {e}")

print(f"Embedded {len(protein_embed_dict):,} proteins ({len(failed_sequences):,} failed)")



Setting up ESM-C model...


Fetching 4 files: 100%|██████████| 4/4 [00:00<00:00, 22857.24it/s]


Unique proteins to embed: 9,446


Embedding proteins: 100%|██████████| 9446/9446 [06:08<00:00, 25.64it/s]

Generating protein embeddings...





In [7]:
import gc
# Clear memory: force a garbage-collection pass now. The integer shown as the
# cell output is the value returned by gc.collect().
gc.collect()

20

In [8]:
# Cell 4: RDKit molecular descriptors
print("Computing molecular descriptors...")
gen = rdNormalizedDescriptors.RDKit2DNormalized()

# Get unique molecules
unique_molecules = df_complete['SMILES'].unique()
print(f"Unique molecules: {len(unique_molecules):,}")

# Storage for molecular features
molecule_features = {}   # SMILES -> float32 descriptor vector
failed_smiles = []       # SMILES that could not be featurised

# Process molecules
for smiles in tqdm(unique_molecules, desc="RDKit descriptors"):
    try:
        desc = gen.process(smiles=smiles)
        # Element 0 of the result is descriptastorus' "processed" flag; when it
        # is falsy the remaining values are type defaults, not real descriptors,
        # so the molecule is treated as failed instead of being stored.
        if desc is None or len(desc) <= 1 or not desc[0]:
            failed_smiles.append(smiles)
            print(f"Error processing {smiles}: descriptor generation failed")
            continue
        molecule_features[smiles] = np.array(desc[1:], dtype=np.float32)
    except Exception as e:
        failed_smiles.append(smiles)
        print(f"Error processing {smiles}: {e}")

print(f"Generated features for {len(molecule_features)} molecules")
if failed_smiles:
    print(f"Skipped {len(failed_smiles)} molecules without valid descriptors")


Computing molecular descriptors...
Unique molecules: 352


RDKit descriptors: 100%|██████████| 352/352 [00:04<00:00, 71.89it/s]

Generated features for 352 molecules





In [9]:
# Build one feature vector per row: [protein embedding | molecule descriptors].
# NOTE: this stores a separate NumPy array per row in an object column, which
# is convenient for the np.stack() that follows but is not compact.

# Fail early with a readable message if featurisation skipped anything;
# otherwise the lookup below dies with a bare KeyError on a long string.
missing_proteins = set(df_complete['Sequence'].unique()).difference(protein_embed_dict)
missing_molecules = set(df_complete['SMILES'].unique()).difference(molecule_features)
if missing_proteins or missing_molecules:
    raise KeyError(
        f"Missing features for {len(missing_proteins)} protein(s) and "
        f"{len(missing_molecules)} molecule(s); drop those rows or re-run featurisation."
    )

# Zipping the two columns avoids DataFrame.apply(axis=1), which materialises a
# Series per row and is very slow at this row count. dtype=object keeps one
# array per cell, exactly as the row-wise apply produced.
df_complete['concatenated_features'] = pd.Series(
    [
        np.concatenate((protein_embed_dict[seq], molecule_features[smi]))
        for seq, smi in zip(df_complete['Sequence'], df_complete['SMILES'])
    ],
    index=df_complete.index,
    dtype=object,
)

In [10]:

# Prepare data for faiss (float32, contiguous)
# copy=False: when the stacked array is already float32 this skips a second
# full-size copy of the matrix; other dtypes are still converted.
features = np.stack(df_complete['concatenated_features'].values).astype('float32', copy=False)
features = np.ascontiguousarray(features)
shape = features.shape
print(f"Features shape: {shape}")
# tofile() writes the raw values only (no .npy header) despite the file
# extension, so the file must be read back with np.memmap / np.fromfile using
# this dtype and shape; np.load() cannot open it.
features.tofile("./datasets/features.npy")


Features shape: (12251259, 1352)


In [3]:
# Re-open the raw float32 dump written by the export cell as a read-only
# memory map, so the matrix does not have to be held in RAM.
import os

FEATURES_PATH = "./datasets/features.npy"
FEATURE_DIM = 1352  # columns per row, as reported by "Features shape" at export time

# Derive the row count from the file size instead of hard-coding it, and
# refuse a file whose size is not a whole number of rows.
_row_bytes = FEATURE_DIM * np.dtype('float32').itemsize
_file_bytes = os.path.getsize(FEATURES_PATH)
if _file_bytes == 0 or _file_bytes % _row_bytes != 0:
    raise ValueError(
        f"{FEATURES_PATH} holds {_file_bytes} bytes, which is not a whole number "
        f"of {FEATURE_DIM}-column float32 rows"
    )

features = np.memmap(
    FEATURES_PATH, dtype='float32', mode='r',
    shape=(_file_bytes // _row_bytes, FEATURE_DIM),
)

In [12]:
import gc
# Clear memory: force a garbage-collection pass now. The integer shown as the
# cell output is the value returned by gc.collect().
gc.collect()

0

In [7]:
import faiss 
# Report how many GPUs faiss detects (shown as the cell output)
faiss.get_num_gpus()

2

In [8]:
# Cell: Clustering with faiss-gpu
# Trains k-means on the memory-mapped feature matrix, assigns every row to its
# nearest centroid in batches, and stores the result in df_complete['faiss_cluster'].
import faiss
import numpy as np
import gc

batch_size = 244034  # rows sent to the GPU index per assignment batch
n_points, d = features.shape
print(f"Number of points: {n_points}, Dimension: {d}")
print("Clustering with faiss-gpu...")
# Set number of clusters (e.g., same logic as before)
n_clusters = min(max(NUM_SAMPLES // 10, 10), 300)  # Between 10-300 clusters
print(f"Using {n_clusters} clusters")

# Labels are written back to df_complete by position, so the frame and the
# feature matrix must describe the same rows. Check before the long training run.
if len(df_complete) != n_points:
    raise ValueError(
        f"df_complete has {len(df_complete)} rows but features has {n_points}; "
        "cluster labels would be misaligned"
    )

# Initialize faiss KMeans (GPU). gpu=True lets faiss manage its own GPU
# resources, so no StandardGpuResources object is created here.
# max_points_per_centroid is set high so the training set is not subsampled.
kmeans = faiss.Kmeans(
    d=d,
    k=n_clusters,
    gpu=True,
    niter=50,
    verbose=True,
    seed=RANDOM_STATE,  # same seed as the rest of the notebook
    max_points_per_centroid=2000000,
)

# Train KMeans
# NOTE(review): the recorded run logged imbalance=299.999 with k=300, which
# looks like nearly all points falling into one cluster. Check the features
# for NaN/inf values and scale before relying on these labels.
kmeans.train(features)

# Exact L2 index over the learned centroids, replicated to every visible GPU
cpu_centroids = faiss.IndexFlatL2(d)
cpu_centroids.add(kmeans.centroids)
gpu_centroids = faiss.index_cpu_to_all_gpus(cpu_centroids)

# Assign clusters batch by batch so only one slice of the memmap is resident
labels = np.empty(n_points, dtype=np.int32)
for start in range(0, n_points, batch_size):
    end = min(start + batch_size, n_points)
    batch = np.ascontiguousarray(features[start:end])
    _, nearest = gpu_centroids.search(batch, 1)  # index of the closest centroid
    labels[start:end] = nearest.ravel()

df_complete['faiss_cluster'] = labels
print(f"Cluster assignment complete. Cluster counts:")
unique, counts = np.unique(labels, return_counts=True)
for u, c in zip(unique, counts):
    print(f"  Cluster {u}: {c} samples")

# Clean up: drop the memmap, the trained model and both centroid indexes.
# After this, `features` must be re-created before the cell can run again.
del features, kmeans, cpu_centroids, gpu_centroids
gc.collect()

Number of points: 12251259, Dimension: 1352
Clustering with faiss-gpu...
Using 300 clusters
Clustering 12251259 points in 1352D to 300 clusters, redo 1 times, 50 iterations
  Preprocessing in 7.11 s
  Iteration 49 (246.65 s, search 126.31 s): objective=6.74866e+08 imbalance=299.999 nsplit=298       
Cluster assignment complete. Cluster counts:
  Cluster 0: 286 samples
  Cluster 1: 23 samples
  Cluster 2: 139 samples
  Cluster 3: 214 samples
  Cluster 4: 332 samples
  Cluster 5: 70 samples
  Cluster 7: 262 samples
  Cluster 8: 400 samples
  Cluster 9: 118 samples
  Cluster 10: 70 samples
  Cluster 11: 283 samples
  Cluster 12: 286 samples
  Cluster 13: 46 samples
  Cluster 14: 115 samples
  Cluster 15: 216 samples
  Cluster 16: 192 samples
  Cluster 17: 94 samples
  Cluster 18: 115 samples
  Cluster 19: 191 samples
  Cluster 20: 238 samples
  Cluster 21: 139 samples
  Cluster 22: 323 samples
  Cluster 23: 286 samples
  Cluster 24: 213 samples
  Cluster 25: 47 samples
  Cluster 26: 92 sa

142

In [9]:
# Persist the rows together with their cluster labels.
# If this kernel still carries the per-row feature arrays, leave them out:
# CSV would store only their (truncated) string repr and bloat the file.
# errors='ignore' makes this a no-op when the column is absent.
df_complete.drop(columns=['concatenated_features'], errors='ignore').to_csv(
    "./datasets/cysdb_complete_with_clusters.csv", index=False
)