In [None]:
import esm
import numpy as np
import pandas as pd
import torch
from descriptastorus.descriptors import rdNormalizedDescriptors
from tqdm.auto import tqdm

# Load the two input tables. Later cells read a 'Sequence' column from both
# and a 'SMILES' column from the negatives table.
positive = pd.read_csv(
    filepath_or_buffer='datasets/cysdb_complete_with_sequences.csv',
)
sample_negatives_stratified_faiss = pd.read_csv(
    filepath_or_buffer='datasets/sample_negatives_stratified_faiss.csv',
)

In [None]:
import torch
from esm.models.esmc import ESMC
from esm.sdk.api import (
    ESM3InferenceClient,
    ESMProtein,
    LogitsConfig,
    LogitsOutput,
    ProteinType,
)
# Request the sequence track and the embeddings; per-layer hidden states are
# switched off because nothing in this notebook reads them.
EMBEDDING_CONFIG = LogitsConfig(
    sequence=True, return_embeddings=True, return_hidden_states=False
)

# Run on the GPU when PyTorch can see one and fall back to the CPU otherwise,
# instead of failing outright on a machine without CUDA.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

print("Setting up ESM-C model...")
client = ESMC.from_pretrained("esmc_600m").to(DEVICE)
def embed_sequence(model: ESM3InferenceClient, sequence: str) -> np.ndarray:
    """Embed one protein sequence as a single mean-pooled vector.

    Args:
        model: Client exposing ``encode`` and ``logits``. The notebook passes
            the ESM-C model stored in ``client``.
            NOTE(review): the annotation names ``ESM3InferenceClient`` while an
            ``ESMC`` instance is passed; confirm which client type is intended.
        sequence: Protein sequence to embed.

    Returns:
        A 1-D NumPy array (on the CPU, detached from autograd) holding the mean
        of ``output.embeddings`` over dimension 1.
    """
    protein = ESMProtein(sequence=sequence)
    protein_tensor = model.encode(protein)
    output = model.logits(protein_tensor, EMBEDDING_CONFIG)
    # output.embeddings: shape [1, seq_len, 1152] per the original note
    # (TODO confirm whether seq_len counts special start/end tokens).
    # Average over dimension 1, drop the batch dimension, then hand the result
    # back as a NumPy array so it can be stored and concatenated downstream.
    mean_embedding = output.embeddings.mean(dim=1).squeeze(0).detach().cpu().numpy()  # shape [1152]
    return mean_embedding


In [None]:
# Gather every distinct sequence across both tables. The per-table unique
# arrays can overlap, so de-duplicate again after concatenating; otherwise the
# printed count and the progress-bar total overstate the work to be done.
pos_prot_unique = positive['Sequence'].unique()
neg_prot_unique = sample_negatives_stratified_faiss['Sequence'].unique()
proteins_unique = pd.unique(np.concatenate([pos_prot_unique, neg_prot_unique]))
print(f"Unique proteins to embed: {len(proteins_unique):,}")

# Maps sequence -> mean-pooled embedding returned by embed_sequence.
protein_embed_dict = {}

print("Generating protein embeddings...")
for sequence in tqdm(proteins_unique, total=len(proteins_unique), desc="Embedding proteins"):
    try:
        embedding = embed_sequence(client, sequence)
        protein_embed_dict[sequence] = embedding
    except Exception as e:
        # Best effort: report the failure and continue with the remaining
        # sequences. A failed sequence simply has no entry in the dict.
        print(f"Error embedding {sequence}: {e}")

# Summary so that skipped sequences are visible without scrolling the log.
print(f"Generated embeddings for {len(protein_embed_dict):,} of {len(proteins_unique):,} proteins")




In [None]:
print("Computing molecular descriptors...")
gen = rdNormalizedDescriptors.RDKit2DNormalized()

# Each distinct SMILES string is featurised exactly once.
unique_molecules = sample_negatives_stratified_faiss['SMILES'].unique()
print(f"Unique molecules: {len(unique_molecules):,}")

# Maps SMILES -> float32 descriptor vector.
molecule_features = {}

for smi in tqdm(unique_molecules, desc="RDKit descriptors"):
    try:
        result = gen.process(smiles=smi)
        # Skip molecules for which the generator returned nothing usable.
        if result is None or len(result) <= 1:
            continue
        # The first element is left out of the feature vector
        # (presumably a status flag; TODO confirm against descriptastorus).
        molecule_features[smi] = np.array(result[1:], dtype=np.float32)
    except Exception as err:
        print(f"Error processing {smi}: {err}")

print(f"Generated features for {len(molecule_features)} molecules")

In [None]:
def _concat_features(row):
    """Join a row's protein embedding and molecule descriptors.

    Returns None when either lookup has no entry, which happens for any
    sequence or molecule that the earlier best-effort loops skipped; indexing
    the dicts directly would raise KeyError on the first such row.
    """
    protein_vec = protein_embed_dict.get(row['Sequence'])
    molecule_vec = molecule_features.get(row['SMILES'])
    if protein_vec is None or molecule_vec is None:
        return None
    return np.concatenate((protein_vec, molecule_vec))


sample_negatives_stratified_faiss['concatenated_features'] = sample_negatives_stratified_faiss.apply(
    _concat_features, axis=1
)

# Make incomplete rows visible instead of failing or hiding them.
n_missing_features = int(sample_negatives_stratified_faiss['concatenated_features'].isna().sum())
if n_missing_features:
    print(f"Rows without concatenated features: {n_missing_features:,}")

ModuleNotFoundError: No module named 'numpy._core.numeric'

In [None]:
from cuml.manifold.umap import UMAP
import plotly.express as px

# for now only negatives
print("Fitting UMAP...")

# The two reducers share every setting except the output dimensionality.
umap_settings = dict(
    metric='euclidean',
    random_state=5,
    verbose=True,
    n_epochs=500,
    n_neighbors=55,
)
reducer_2d = UMAP(n_components=2, **umap_settings)
reducer_3d = UMAP(n_components=3, **umap_settings)
# Keep the name this cell previously bound for the 2-D reducer.
reducer = reducer_2d

# Stack the per-row feature vectors into one (n_rows, n_features) matrix,
# ignoring rows that carry no feature vector.
feature_rows = sample_negatives_stratified_faiss['concatenated_features'].dropna()
features = np.stack(feature_rows.to_numpy())
print(f"Features shape: {features.shape}")

proj_2d = reducer_2d.fit_transform(features)
proj_3d = reducer_3d.fit_transform(features)

# NOTE(review): the points are drawn uncoloured. Colouring needs a per-row
# label aligned with `feature_rows`; pass it as `color=` once one is chosen.
fig_2d = px.scatter(
    proj_2d, x=0, y=1,
    labels={'0': 'UMAP 1', '1': 'UMAP 2'},
    title='UMAP projection of negative samples (2D)',
)
fig_3d = px.scatter_3d(
    proj_3d, x=0, y=1, z=2,
    labels={'0': 'UMAP 1', '1': 'UMAP 2', '2': 'UMAP 3'},
    title='UMAP projection of negative samples (3D)',
)
fig_3d.update_traces(marker_size=5)

fig_2d.show()
fig_3d.show()