In [None]:
from itertools import permutations


In [1]:
from transformers import AutoTokenizer, EsmForSequenceClassification
import torch

In [2]:
from transformers import AutoTokenizer, EsmForProteinFolding

model = EsmForProteinFolding.from_pretrained("facebook/esmfold_v1")

tokenizer = AutoTokenizer.from_pretrained("facebook/esmfold_v1")

Some weights of EsmForProteinFolding were not initialized from the model checkpoint at facebook/esmfold_v1 and are newly initialized: ['esm.contact_head.regression.weight', 'esm.contact_head.regression.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
import pandas as pd
import py3Dmol
import os


# Function to visualize a PDB file
def visualize_pdb(pdb, n_term_coord, c_term_coord):
    view = py3Dmol.view(js='https://3dmol.org/build/3Dmol.js', width=800, height=400)
    view.addModel(open(pdb, 'r').read(), 'pdb')
    
    # Set N and C terminal spheres
    view.addSphere({'center': {'x': n_term_coord[0], 'y': n_term_coord[1], 'z': n_term_coord[2]}, 'radius': 1.0, 'color':'red'})
    view.addSphere({'center': {'x': c_term_coord[0], 'y': c_term_coord[1], 'z': c_term_coord[2]}, 'radius': 1.0, 'color':'blue'})

    view.setStyle({'model': -1}, {"cartoon": {'color': 'spectrum'}})
    view.zoomTo()
    view.show()


In [9]:
# Use the PDB parser to load the PDB file

def  visualizeDistanceInPDB():
    parser = PDBParser()
    structure = parser.get_structure('shortest', 'longLinkers/GSGTGSGPLQPELDSFKEELDKYFKNHTSPDVGSGTEIYQAGSTPCNGVEGFGSGPSKPSKRSFIEDLLFGSGTGSG.pdb')

    distance, n_term_coord, c_term_coord = calc_distance(structure)
    print(distance)

    # Plotting with 3Dmol
    view = py3Dmol.view(js='https://3dmol.org/build/3Dmol.js', width=800, height=400)
    view.addModel(open('longLinkers/GSGTGSGPLQPELDSFKEELDKYFKNHTSPDVGSGTEIYQAGSTPCNGVEGFGSGPSKPSKRSFIEDLLFGSGTGSG.pdb', 'r').read(), 'pdb')

    # Set N and C terminal spheres
    view.addSphere({'center': {'x': n_term_coord[0].tolist(), 'y': n_term_coord[1].tolist(), 'z': n_term_coord[2].tolist()}, 'radius': 1.0, 'color':'red'})
    view.addSphere({'center': {'x': c_term_coord[0].tolist(), 'y': c_term_coord[1].tolist(), 'z': c_term_coord[2].tolist()}, 'radius': 1.0, 'color':'blue'})

    view.setStyle({'model': -1}, {"cartoon": {'color': 'spectrum'}})
    view.zoomTo()
    view.show()

In [3]:
from transformers.models.esm.openfold_utils.protein import to_pdb, Protein as OFProtein
from transformers.models.esm.openfold_utils.feats import atom14_to_atom37

def convert_outputs_to_pdb(outputs):
    final_atom_positions = atom14_to_atom37(outputs["positions"][-1], outputs)
    outputs = {k: v.detach().to("cpu").numpy() for k, v in outputs.items()}
    final_atom_positions = final_atom_positions.detach().cpu().numpy()
    final_atom_mask = outputs["atom37_atom_exists"]
    pdbs = []
    for i in range(outputs["aatype"].shape[0]):
        aa = outputs["aatype"][i]
        pred_pos = final_atom_positions[i]
        mask = final_atom_mask[i]
        resid = outputs["residue_index"][i] + 1
        pred = OFProtein(
            aatype=aa,
            atom_positions=pred_pos,
            atom_mask=mask,
            residue_index=resid,
            b_factors=outputs["plddt"][i],
            chain_index=outputs["chain_index"][i] if "chain_index" in outputs else None,
        )
        pdbs.append(to_pdb(pred))
    return pdbs

In [43]:
from Bio.PDB import PDBParser
import numpy as np

def calc_distance_and_return_coordinates(pdb_file):
    parser = PDBParser(QUIET=True)
    structure = parser.get_structure('my_protein', pdb_file)
    model = pdb_file[0]  # assuming only one model in the PDB file
    chain = pdb_file['A']  # replace with your chain ID

    # Get the residue numbers for the first (N-term) and last (C-term) residues
    res_keys = sorted(list(chain.child_dict.keys()))
    n_term_residue = chain[res_keys[0]]
    c_term_residue = chain[res_keys[-1]]

    # Get the coordinates for the N and C atoms
    n_term_coord = n_term_residue['N'].get_coord()
    c_term_coord = c_term_residue['C'].get_coord()

    # Calculate the distance
    distance = np.linalg.norm(c_term_coord - n_term_coord)
    return distance, n_term_coord.tolist(), c_term_coord.tolist()


In [5]:
inputs1 = tokenizer(["GSGTGSGDKYFKNHTSPDVDLGSGTGSGLFRKSNLKPFERDISTEGSGTGSGPSKRSFIEDLLFNKVGSGTGSG"], return_tensors="pt", add_special_tokens=False)['input_ids']  # A tiny random peptide
inputs2 = tokenizer(["GSGTGSGLFRKSNLKPFERDISTEGSGTGSGPSKRSFIEDLLFNKVGSGTGSGDKYFKNHTSPDVDLGSGTGSG"], return_tensors="pt", add_special_tokens=False)['input_ids']  # A tiny random peptide
inputs3 = tokenizer(["GSGTGSGPSKRSFIEDLLFNKVGSGTGSGDKYFKNHTSPDVDLGSGTGSGLFRKSNLKPFERDISTEGSGTGSG"], return_tensors="pt", add_special_tokens=False)['input_ids']  # A tiny random peptide

outputs1 = model(inputs1)
outputs2 = model(inputs2)
outputs3 = model(inputs3)

folded_positions1 = outputs1.positions
folded_positions2 = outputs2.positions
folded_positions3 = outputs3.positions

In [44]:
pdb1 = convert_outputs_to_pdb(outputs1)
pdb2 = convert_outputs_to_pdb(outputs2)
pdb3 = convert_outputs_to_pdb(outputs3)

In [45]:
with open("output_structure.pdb", "w") as f:
    f.write("".join(pdb1))

In [46]:
distance, n_term_coord, c_term_coor = calc_distance_and_return_coordinates("output_structure.pdb")
n_term_coord = eval(row['N-Term Coords'])
c_term_coord = eval(row['C-Term Coords'])
visualize_pdb(pdb1, n_term_coord, c_term_coor)

TypeError: string indices must be integers

In [None]:
def  visualizeDistanceInPDB()
    parser = PDBParser()
    structure = parser.get_structure('shortest', 'longLinkers/GSGTGSGPLQPELDSFKEELDKYFKNHTSPDVGSGTEIYQAGSTPCNGVEGFGSGPSKPSKRSFIEDLLFGSGTGSG.pdb')

    distance, n_term_coord, c_term_coord = calc_distance(structure)
    print(distance)

    # Plotting with 3Dmol
    view = py3Dmol.view(js='https://3dmol.org/build/3Dmol.js', width=800, height=400)
    view.addModel(open('longLinkers/GSGTGSGPLQPELDSFKEELDKYFKNHTSPDVGSGTEIYQAGSTPCNGVEGFGSGPSKPSKRSFIEDLLFGSGTGSG.pdb', 'r').read(), 'pdb')

    # Set N and C terminal spheres
    view.addSphere({'center': {'x': n_term_coord[0].tolist(), 'y': n_term_coord[1].tolist(), 'z': n_term_coord[2].tolist()}, 'radius': 1.0, 'color':'red'})
    view.addSphere({'center': {'x': c_term_coord[0].tolist(), 'y': c_term_coord[1].tolist(), 'z': c_term_coord[2].tolist()}, 'radius': 1.0, 'color':'blue'})

    view.setStyle({'model': -1}, {"cartoon": {'color': 'spectrum'}})
    view.zoomTo()
    view.show()

In [None]:
inputs1 = tokenizer(["GSGTGSGDKYFKNHTSPDVDLGSGTGSGLFRKSNLKPFERDISTEGSGTGSGPSKRSFIEDLLFNKVGSGTGSG"], return_tensors="pt", add_special_tokens=False)['input_ids']  # A tiny random peptide
inputs2 = tokenizer(["GSGTGSGLFRKSNLKPFERDISTEGSGTGSGPSKRSFIEDLLFNKVGSGTGSGDKYFKNHTSPDVDLGSGTGSG"], return_tensors="pt", add_special_tokens=False)['input_ids']  # A tiny random peptide
inputs3 = tokenizer(["GSGTGSGPSKRSFIEDLLFNKVGSGTGSGDKYFKNHTSPDVDLGSGTGSGLFRKSNLKPFERDISTEGSGTGSG"], return_tensors="pt", add_special_tokens=False)['input_ids']  # A tiny random peptide

outputs1 = model(inputs1)
outputs2 = model(inputs2)
outputs3 = model(inputs3)

folded_positions1 = outputs1.positions
folded_positions2 = outputs2.positions
folded_positions3 = outputs3.positions