In [74]:

## Remove is you use an installed version of esm3
import sys
sys.path.append("../esm3")
##

import numpy as np
import torch
import huggingface_hub

from esm.utils.structure.protein_chain import ProteinChain
from esm.models.esm3 import ESM3
from esm.sdk import client
from esm.sdk.api import (
    ESMProtein
)
from esm.utils import encoding

import pandas as pd

import os
from tqdm import tqdm

import argparse
from math import ceil


In [75]:
folder_for_embeddings = "../data"
sequences_to_embed = folder_for_embeddings + "/" + "df_Desai_15loci_sequence.npy"
sequences = np.load(sequences_to_embed, allow_pickle=True)

In [76]:
files = [f for f in os.listdir(".") if "noisy_" in f]

In [77]:
files

['6xf5_noisy_0.01.pdb',
 '6xf5_noisy_0.1.pdb',
 '6xf5_noisy_0.2.pdb',
 '6xf5_noisy_0.05.pdb',
 '6xf5_noisy_0.5.pdb']

In [78]:
model =  ESM3.from_pretrained("esm3_sm_open_v1", device=torch.device("cuda"))

  state_dict = torch.load(


In [79]:
batch = [ESMProtein.from_protein_chain(ProteinChain.from_pdb(seq)) for seq in files]
batch = [model.encode(seq) for seq in batch]
embeddings, _ = model.get_embeddings_batched(batch)

  state_dict = torch.load(
  with torch.no_grad(), torch.cuda.amp.autocast(enabled=False):  # type: ignore


In [80]:
embeddings = embeddings.mean(dim=1).cpu().detach().numpy()

In [81]:
embeddings.shape

(5, 1536)

In [82]:
wt_file = ["mutant-evoEf-6xf5-rbd-only/1/1_correct_mutant_no_relax.pdb"]

In [83]:
batch_wt = [ESMProtein.from_protein_chain(ProteinChain.from_pdb(seq)) for seq in wt_file]
batch_wt = [model.encode(seq) for seq in batch_wt]
embeddings_wt, _ = model.get_embeddings_batched(batch_wt)
embeddings_wt = embeddings_wt.mean(dim=1).cpu().detach().numpy()[0]

  with torch.no_grad(), torch.cuda.amp.autocast(enabled=False):  # type: ignore


In [84]:
# cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

cosine_similarity(embeddings, embeddings_wt.reshape(1, -1))

array([[0.9999953 ],
       [0.9996752 ],
       [0.99675167],
       [0.99992645],
       [0.9590763 ]], dtype=float32)

In [85]:
# Distance L2
from sklearn.metrics.pairwise import euclidean_distances

euclidean_distances(embeddings, embeddings_wt.reshape(1, -1))


array([[  28.934711],
       [ 242.98268 ],
       [ 822.0334  ],
       [ 116.01453 ],
       [2714.2312  ]], dtype=float32)

In [38]:
weights_random = np.random.rand(embeddings.shape[1])
weights_random = weights_random / weights_random.sum()

In [39]:
# scalar product
np.dot(embeddings, weights_random) - np.dot(embeddings_wt, weights_random)

array([-0.00551926,  0.0309702 ,  0.57443128,  0.02507479,  2.33023471])

# Compare for datasets

In [104]:
def difference_cos_dist_from_own_wt(embeddings, index_wt=2**14):
    cosine = cosine_similarity(embeddings, embeddings[index_wt].reshape(1, -1))
    euclidean = euclidean_distances(embeddings, embeddings[index_wt].reshape(1, -1))
    print(f"Mean cosine similarity: {cosine.mean()} +/- {cosine.std()} in logscale {-np.log(1 - cosine.mean())}")
    print(f"Mean euclidean distance: {euclidean.mean()} +/- {euclidean.std()}")

def difference_cos_dist_from(embeddings, embeddings_wt):
    cosine = cosine_similarity(embeddings, embeddings_wt.reshape(1, -1))
    euclidean = euclidean_distances(embeddings, embeddings_wt.reshape(1, -1))
    print(f"Mean cosine similarity: {cosine.mean()} +/- {cosine.std()} in logscale {-np.log(1 - cosine.mean())}")
    print(f"Mean euclidean distance: {euclidean.mean()} +/- {euclidean.std()}")

# Load Noisy dataset

In [122]:
noisy = torch.load('../data/embeddings_withCoordinates_noisy_WT_6xf5_01.pt')
noisy_not_working = torch.load('../data/embeddings_withCoordinates_noisy_WT_6xf5.pt')
wt_struct_same_for_all = torch.load('../data/embeddings_withCoordinates_Isolated_WT_down_state_6xf5.pt')
no_relax = torch.load('../data/embeddings_withCoordinates_no_relax_mutated_WT_6xf5.pt')
md = torch.load('../data/embeddings_withCoordinates__md_frame_WT_6xf5.pt')
minim_gromacs = torch.load('../data/embeddings_withCoordinates_minim_gromacs_mutated_6xf5.pt')

  noisy = torch.load('../data/embeddings_withCoordinates_noisy_WT_6xf5_01.pt')
  wt_struct_same_for_all = torch.load('../data/embeddings_withCoordinates_Isolated_WT_down_state_6xf5.pt')
  no_relax = torch.load('../data/embeddings_withCoordinates_no_relax_mutated_WT_6xf5.pt')
  md = torch.load('../data/embeddings_withCoordinates__md_frame_WT_6xf5.pt')
  minim_gromacs = torch.load('../data/embeddings_withCoordinates_minim_gromacs_mutated_6xf5.pt')


In [123]:
difference_cos_dist_from_own_wt(noisy)

Mean cosine similarity: 0.9997641444206238 +/- 9.51098627410829e-05 in logscale 8.352290892053263
Mean euclidean distance: 213.0749053955078 +/- 44.1492805480957


In [106]:
difference_cos_dist_from_own_wt(wt_struct_same_for_all)

Mean cosine similarity: 0.9999495148658752 +/- 2.177390160795767e-05 in logscale 9.893831638786633
Mean euclidean distance: 96.72863006591797 +/- 21.2951602935791


In [107]:
difference_cos_dist_from_own_wt(no_relax)

Mean cosine similarity: 0.9999279975891113 +/- 2.5892122721415944e-05 in logscale 9.538810954943926
Mean euclidean distance: 113.27227783203125 +/- 21.040546417236328


In [108]:
difference_cos_dist_from_own_wt(md)

Mean cosine similarity: 0.9993987083435059 +/- 0.0001352711842628196 in logscale 7.416430455793932
Mean euclidean distance: 333.9731750488281 +/- 38.50557327270508


In [109]:
difference_cos_dist_from_own_wt(minim_gromacs)

Mean cosine similarity: 0.9996281862258911 +/- 0.00011530718620633706 in logscale 7.897117436270938
Mean euclidean distance: 260.2164611816406 +/- 40.65966796875


# Compare to reference

In [71]:
difference_cos_dist_from(noisy, embeddings_wt)

Mean cosine similarity: 0.9994924068450928 +/- 0.00012710451846942306
Mean euclidean distance: 317.1007385253906 +/- 41.62765121459961


In [72]:
difference_cos_dist_from(wt_struct_same_for_all, embeddings_wt)

Mean cosine similarity: 0.9953235387802124 +/- 7.669304613955319e-05
Mean euclidean distance: 977.3038330078125 +/- 9.230360984802246


In [73]:
difference_cos_dist_from(no_relax, embeddings_wt)

Mean cosine similarity: 0.999830961227417 +/- 2.5893703423207626e-05
Mean euclidean distance: 176.771484375 +/- 13.31243896484375


In [89]:
difference_cos_dist_from(md, embeddings_wt)

Mean cosine similarity: 0.9987190961837769 +/- 0.00021908928465563804
Mean euclidean distance: 487.70452880859375 +/- 44.632999420166016


In [92]:
difference_cos_dist_from(minim_gromacs, embeddings_wt)

Mean cosine similarity: 0.9995127320289612 +/- 0.00010684359585866332
Mean euclidean distance: 305.6966247558594 +/- 36.022396087646484


# Check RMSD

In [119]:
indexes = range(1, 100)
index_wt = 2**14 + 1
suffix = "minim_gromacs"
structure_wt = ProteinChain.from_pdb(f"mutant-evoEf-6xf5-rbd-only/{index_wt}/{index_wt}_{suffix}.pdb")

structures = [ProteinChain.from_pdb(f"mutant-evoEf-6xf5-rbd-only/{i}/{i}_{suffix}.pdb") for i in indexes]



In [120]:
rmsd = [structure_wt.rmsd(structure) for structure in structures]

In [121]:
np.mean(rmsd), np.std(rmsd)

(0.3987810756462255, 0.008843511187527542)