In [6]:
from itertools import permutations
# ! pip install --upgrade transformers py3Dmol accelerate
# ! jupyter labextension install jupyterlab_3dmol
import py3Dmol

In [7]:
from transformers import AutoTokenizer, EsmForSequenceClassification
import torch

In [14]:
from transformers import AutoTokenizer, EsmForProteinFolding

# The folding model and its tokenizer are loaded from the same checkpoint id.
esmfold_checkpoint = "facebook/esmfold_v1"

model = EsmForProteinFolding.from_pretrained(esmfold_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(esmfold_checkpoint)

Some weights of EsmForProteinFolding were not initialized from the model checkpoint at facebook/esmfold_v1 and are newly initialized: ['esm.contact_head.regression.weight', 'esm.contact_head.regression.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
from transformers.models.esm.openfold_utils.protein import to_pdb, Protein as OFProtein
from transformers.models.esm.openfold_utils.feats import atom14_to_atom37

def convert_outputs_to_pdb(outputs):
    """Turn a batch of folding-model outputs into PDB-format strings.

    Args:
        outputs: mapping of tensors that provides the keys "positions",
            "aatype", "atom37_atom_exists", "residue_index" and "plddt",
            and optionally "chain_index". Every value must support
            ``.detach().to("cpu").numpy()``.

    Returns:
        A list with one PDB string per entry along the first axis of
        ``outputs["aatype"]``.
    """
    # atom14_to_atom37 is given the original (tensor) outputs, so it has to
    # be called before anything is converted to numpy.
    atom37_positions = atom14_to_atom37(outputs["positions"][-1], outputs)
    atom37_positions = atom37_positions.detach().cpu().numpy()

    arrays = {key: tensor.detach().to("cpu").numpy() for key, tensor in outputs.items()}
    atom37_mask = arrays["atom37_atom_exists"]
    has_chain_index = "chain_index" in arrays

    return [
        to_pdb(
            OFProtein(
                aatype=arrays["aatype"][idx],
                atom_positions=atom37_positions[idx],
                atom_mask=atom37_mask[idx],
                # Shift the residue index by one before writing it out.
                residue_index=arrays["residue_index"][idx] + 1,
                # The per-residue pLDDT values go into the B-factor field.
                b_factors=arrays["plddt"][idx],
                chain_index=arrays["chain_index"][idx] if has_chain_index else None,
            )
        )
        for idx in range(arrays["aatype"].shape[0])
    ]

In [23]:
# Four variants of the same three segments, differing only in linker length:
#   1: GSG linkers at the ends and between segments
#   2: GSGTGSG linkers at the ends and between segments
#   3: GSG at the ends, GSGTGSG between segments
#   4: GSGTGSG at the ends, GSG between segments
sequences = [
    "GSGLDSFKEELDKYFGSGIYQAGSTPCNGVEGFNCYFPLQSYGSGPSKPSKRSFIEDLLFNKVTLADAGFGSG",
    "GSGTGSGLDSFKEELDKYFGSGTGSGIYQAGSTPCNGVEGFNCYFPLQSYGSGTGSGPSKPSKRSFIEDLLFNKVTLADAGFGSGTGSG",
    "GSGLDSFKEELDKYFGSGTGSGIYQAGSTPCNGVEGFNCYFPLQSYGSGTGSGPSKPSKRSFIEDLLFNKVTLADAGFGSG",
    "GSGTGSGLDSFKEELDKYFGSGIYQAGSTPCNGVEGFNCYFPLQSYGSGPSKPSKRSFIEDLLFNKVTLADAGFGSGTGSG",
]

# One batch-of-one tensor of token ids per sequence.
inputs1, inputs2, inputs3, inputs4 = [
    tokenizer([sequence], return_tensors="pt", add_special_tokens=False)['input_ids']
    for sequence in sequences
]

# Inference only: without no_grad() every forward pass records an autograd
# graph and keeps its intermediate activations alive, which wastes a lot of
# memory for a model of this size.
with torch.no_grad():
    outputs1 = model(inputs1)
    outputs2 = model(inputs2)
    outputs3 = model(inputs3)
    outputs4 = model(inputs4)

In [25]:
# Convert each prediction into its list of PDB strings, in the same order.
pdb1, pdb2, pdb3, pdb4 = [
    convert_outputs_to_pdb(prediction)
    for prediction in (outputs1, outputs2, outputs3, outputs4)
]

In [26]:
# Each prediction is saved under a file named after its own sequence.
pdb_by_filename = {
    "GSGLDSFKEELDKYFGSGIYQAGSTPCNGVEGFNCYFPLQSYGSGPSKPSKRSFIEDLLFNKVTLADAGFGSG.pdb": pdb1,
    "GSGTGSGLDSFKEELDKYFGSGTGSGIYQAGSTPCNGVEGFNCYFPLQSYGSGTGSGPSKPSKRSFIEDLLFNKVTLADAGFGSGTGSG.pdb": pdb2,
    "GSGLDSFKEELDKYFGSGTGSGIYQAGSTPCNGVEGFNCYFPLQSYGSGTGSGPSKPSKRSFIEDLLFNKVTLADAGFGSG.pdb": pdb3,
    "GSGTGSGLDSFKEELDKYFGSGIYQAGSTPCNGVEGFNCYFPLQSYGSGPSKPSKRSFIEDLLFNKVTLADAGFGSGTGSG.pdb": pdb4,
}

for pdb_filename, pdb_entries in pdb_by_filename.items():
    with open(pdb_filename, "w") as pdb_out:
        pdb_out.write("".join(pdb_entries))


In [15]:
from Bio.PDB import PDBParser
import numpy as np

def calc_distance_and_return_coordinates(pdb_file):
    """Measure the end-to-end distance of chain 'A' in a PDB file.

    The file is parsed with warnings silenced. Only the first model and its
    chain 'A' are considered. The terminal residues are the ones with the
    smallest and largest residue id after sorting.

    Args:
        pdb_file: PDB file handed to ``PDBParser.get_structure``.

    Returns:
        A tuple ``(distance, n_term_xyz, c_term_xyz)``: the Euclidean
        distance between the 'N' atom of the first residue and the 'C' atom
        of the last residue, followed by the two coordinates as plain lists.
    """
    structure = PDBParser(QUIET=True).get_structure('my_protein', pdb_file)
    chain_a = structure[0]['A']  # first model, chain 'A'

    # Sorted residue ids: first entry is the N-terminus, last the C-terminus.
    ordered_ids = sorted(chain_a.child_dict.keys())
    first_id, last_id = ordered_ids[0], ordered_ids[-1]

    n_term_xyz = chain_a[first_id]['N'].get_coord()
    c_term_xyz = chain_a[last_id]['C'].get_coord()

    separation = np.linalg.norm(c_term_xyz - n_term_xyz)
    return separation, n_term_xyz.tolist(), c_term_xyz.tolist()

In [16]:
from Bio.PDB import PDBParser
import numpy as np

def visualizeDistanceInPDB(pdb_F):
    """Print the terminus-to-terminus distance of a PDB file and render it.

    The structure is parsed with Biopython and measured with
    ``calc_distance``. The distance is printed, then the file is shown in a
    py3Dmol viewer as a spectrum-coloured cartoon with a red sphere at the
    N-terminal coordinate and a blue sphere at the C-terminal coordinate.

    Args:
        pdb_F: path to the PDB file; it is both parsed and read as text.

    Returns:
        None. The distance goes to stdout and the view is displayed.
    """
    parser = PDBParser()
    structure = parser.get_structure('shortest', pdb_F)

    distance, n_term_coord, c_term_coord = calc_distance(structure)
    print(distance)

    # Plotting with 3Dmol
    view = py3Dmol.view(js='https://3dmol.org/build/3Dmol.js', width=800, height=400)
    # Read inside a context manager so the file handle is closed right away
    # instead of being left for the garbage collector.
    with open(pdb_F, 'r') as pdb_handle:
        view.addModel(pdb_handle.read(), 'pdb')

    # Mark the N terminus in red and the C terminus in blue.
    for terminus_coord, sphere_color in ((n_term_coord, 'red'), (c_term_coord, 'blue')):
        view.addSphere({
            'center': {
                'x': terminus_coord[0].tolist(),
                'y': terminus_coord[1].tolist(),
                'z': terminus_coord[2].tolist(),
            },
            'radius': 1.0,
            'color': sphere_color,
        })

    view.setStyle({'model': -1}, {"cartoon": {'color': 'spectrum'}})
    view.zoomTo()
    view.show()

In [17]:
from Bio.PDB import PDBParser
import numpy as np

def calc_distance(structure):
    """Measure the end-to-end distance of chain 'A' in a parsed structure.

    Only the first model (index 0) and its chain 'A' are considered. The
    terminal residues are the ones with the smallest and largest residue id
    after sorting.

    Args:
        structure: parsed structure, indexable as ``structure[0]['A']``.

    Returns:
        A tuple ``(distance, n_term_xyz, c_term_xyz)``: the Euclidean
        distance between the 'N' atom of the first residue and the 'C' atom
        of the last residue, followed by the two coordinates exactly as
        returned by ``get_coord()``.
    """
    chain_a = structure[0]['A']  # first model, chain 'A'

    # Sorted residue ids: first entry is the N-terminus, last the C-terminus.
    ordered_ids = sorted(chain_a.child_dict.keys())
    first_id, last_id = ordered_ids[0], ordered_ids[-1]

    n_term_xyz = chain_a[first_id]['N'].get_coord()
    c_term_xyz = chain_a[last_id]['C'].get_coord()

    separation = np.linalg.norm(c_term_xyz - n_term_xyz)
    return separation, n_term_xyz, c_term_xyz


In [27]:
# Show every saved prediction in turn, in the order the files were written.
for predicted_pdb in (
    'GSGLDSFKEELDKYFGSGIYQAGSTPCNGVEGFNCYFPLQSYGSGPSKPSKRSFIEDLLFNKVTLADAGFGSG.pdb',
    'GSGTGSGLDSFKEELDKYFGSGTGSGIYQAGSTPCNGVEGFNCYFPLQSYGSGTGSGPSKPSKRSFIEDLLFNKVTLADAGFGSGTGSG.pdb',
    'GSGLDSFKEELDKYFGSGTGSGIYQAGSTPCNGVEGFNCYFPLQSYGSGTGSGPSKPSKRSFIEDLLFNKVTLADAGFGSG.pdb',
    'GSGTGSGLDSFKEELDKYFGSGIYQAGSTPCNGVEGFNCYFPLQSYGSGPSKPSKRSFIEDLLFNKVTLADAGFGSGTGSG.pdb',
):
    visualizeDistanceInPDB(predicted_pdb)


25.426283




13.0366535




21.767725




2.8981013




In [34]:
# Show the short-linker and long-linker structures stored under sameEnds/.
for same_ends_pdb in (
    'sameEnds/shortLinkers/GSGLDSFKEELDKYFGSGIYQAGSTPCNGVEGFNCYFPLQSYGSGPSKPSKRSFIEDLLFNKVTLADAGFGSG.pdb',
    'sameEnds/longLinkers//GSGTGSGLDSFKEELDKYFGSGTGSGIYQAGSTPCNGVEGFNCYFPLQSYGSGTGSGPSKPSKRSFIEDLLFNKVTLADAGFGSGTGSG.pdb',
):
    visualizeDistanceInPDB(same_ends_pdb)




25.426283




25.426283


