In [2]:
from itertools import permutations
# ! pip install --upgrade transformers py3Dmol accelerate
# ! jupyter labextension install jupyterlab_3dmol
import py3Dmol

In [3]:
from transformers import AutoTokenizer, EsmForSequenceClassification
import torch

In [4]:
from transformers import AutoTokenizer, EsmForProteinFolding

model = EsmForProteinFolding.from_pretrained("facebook/esmfold_v1")

tokenizer = AutoTokenizer.from_pretrained("facebook/esmfold_v1")

Some weights of EsmForProteinFolding were not initialized from the model checkpoint at facebook/esmfold_v1 and are newly initialized: ['esm.contact_head.regression.weight', 'esm.contact_head.regression.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
from transformers.models.esm.openfold_utils.protein import to_pdb, Protein as OFProtein
from transformers.models.esm.openfold_utils.feats import atom14_to_atom37

def convert_outputs_to_pdb(outputs):
    final_atom_positions = atom14_to_atom37(outputs["positions"][-1], outputs)
    outputs = {k: v.detach().to("cpu").numpy() for k, v in outputs.items()}
    final_atom_positions = final_atom_positions.detach().cpu().numpy()
    final_atom_mask = outputs["atom37_atom_exists"]
    pdbs = []
    for i in range(outputs["aatype"].shape[0]):
        aa = outputs["aatype"][i]
        pred_pos = final_atom_positions[i]
        mask = final_atom_mask[i]
        resid = outputs["residue_index"][i] + 1
        pred = OFProtein(
            aatype=aa,
            atom_positions=pred_pos,
            atom_mask=mask,
            residue_index=resid,
            b_factors=outputs["plddt"][i],
            chain_index=outputs["chain_index"][i] if "chain_index" in outputs else None,
        )
        pdbs.append(to_pdb(pred))
    return pdbs

In [6]:
seq1 = "GSGPSKPSKRSFIEDLLFNKVTLADAGFGSGTGSGLFRKSNLKPFERDISTEGSGTGSGPLQPELDSFKEELDKYFKNHTSPDVGSG"
seq2 = "GSGPSKPSKRSFIEDLLFNKVTLADAGFGSGTGSGPLQPELDSFKEELDKYFKNHTSPDVGSGTGSGLFRKSNLKPFERDISTEGSG"
seq3 = "GSGLFRKSNLKPFERDISTEGSGTGSGPSKPSKRSFIEDLLFNKVTLADAGFGSGTGSGPLQPELDSFKEELDKYFKNHTSPDVGSG"
seq4 = "GSGLFRKSNLKPFERDISTEGSGTGSGPLQPELDSFKEELDKYFKNHTSPDVGSGTGSGPSKPSKRSFIEDLLFNKVTLADAGFGSG"
seq5 = "GSGPLQPELDSFKEELDKYFKNHTSPDVGSGTGSGLFRKSNLKPFERDISTEGSGTGSGPSKPSKRSFIEDLLFNKVTLADAGFGSG"
seq6 = "GSGPLQPELDSFKEELDKYFKNHTSPDVGSGTGSGPSKPSKRSFIEDLLFNKVTLADAGFGSGTGSGLFRKSNLKPFERDISTEGSG"

In [7]:
# Add extra sequences "GSG" at the ends of each sequence seq1-6 and create a list of sequences. 
# Add 1 to 5 extra GSG linkers in a loop and create separate seq for every added gsg.
seq_list = []

for i in range(1,7):
    seq_list.append("GS" * i + seq1)
    seq_list.append("GS" * i + seq2)
    seq_list.append("GS" * i + seq3)
    seq_list.append("GS" * i + seq4)
    seq_list.append("GS" * i + seq5)
    seq_list.append("GS" * i + seq6)
    seq_list.append(seq1 + "GS" * i)
    seq_list.append(seq2 + "GS" * i)
    seq_list.append(seq3 + "GS" * i)
    seq_list.append(seq4 + "GS" * i)
    seq_list.append(seq5 + "GS" * i)
    seq_list.append(seq6 + "GS" * i)
    



In [42]:
token = tokenizer(seq6, return_tensors="pt", add_special_tokens=False)['input_ids']
outputs1 = model(token)

In [43]:
pdb1 = convert_outputs_to_pdb(outputs1)
with open("variableLinkerSize/"+seq1+".pdb", "w") as f:
        f.write("".join(pdb1)) 

In [44]:
visualizeDistanceInPDB("variableLinkerSize/"+seq1+".pdb")
print(seq1)

22.606508
GSGPSKPSKRSFIEDLLFNKVTLADAGFGSGTGSGLFRKSNLKPFERDISTEGSGTGSGPLQPELDSFKEELDKYFKNHTSPDVGSG




In [10]:
from tqdm import tqdm

# tokenList = []

# for sequence in tqdm(seq_list):
#     token = tokenizer(sequence, return_tensors="pt", add_special_tokens=False)['input_ids']
#     tokenList.append(model(token))

# outputs1 = model(inputs1)
# outputs2 = model(inputs2)
# outputs3 = model(inputs3)
# outputs4 = model(inputs4)

epitopes_tokenized = tokenizer(seq_list, padding=False, add_special_tokens=False)['input_ids']

from tqdm import tqdm

outputs = []
                                                  
with torch.no_grad():
    for input_ids in tqdm(epitopes_tokenized):
        input_ids = torch.tensor(input_ids, device='cpu').unsqueeze(0)
        output = model(input_ids)
        outputs.append({key: val.cpu() for key, val in output.items()})



  0%|          | 0/72 [00:00<?, ?it/s]

100%|██████████| 72/72 [1:05:06<00:00, 54.25s/it]


In [11]:
outputs
i = 0

for output in outputs:
    with open("variableLinkerSize/"+seq_list[i]+".pdb", "w") as f:
        f.write("".join(convert_outputs_to_pdb(output)))    
    i+=1

    

# pdb2 = convert_outputs_to_pdb(outputs2)
# pdb3 = convert_outputs_to_pdb(outputs3)
# pdb4 = convert_outputs_to_pdb(outputs4)

In [12]:
from Bio.PDB import PDBParser
import numpy as np

def calc_distance_and_return_coordinates(pdb_file):
    parser = PDBParser(QUIET=True)
    structure = parser.get_structure('my_protein', pdb_file)
    model = structure[0]  # assuming only one model in the PDB file
    chain = model['A']  # replace with your chain ID

    # Get the residue numbers for the first (N-term) and last (C-term) residues
    res_keys = sorted(list(chain.child_dict.keys()))
    n_term_residue = chain[res_keys[0]]
    c_term_residue = chain[res_keys[-1]]

    # Get the coordinates for the N and C atoms
    n_term_coord = n_term_residue['N'].get_coord()
    c_term_coord = c_term_residue['C'].get_coord()

    # Calculate the distance
    distance = np.linalg.norm(c_term_coord - n_term_coord)
    return distance, n_term_coord.tolist(), c_term_coord.tolist()

In [13]:
from Bio.PDB import PDBParser
import numpy as np

def  visualizeDistanceInPDB(pdb_F):
    parser = PDBParser()
    structure = parser.get_structure('shortest', pdb_F)

    distance, n_term_coord, c_term_coord = calc_distance(structure)
    print(distance)

    # # Plotting with 3Dmol
    # view = py3Dmol.view(js='https://3dmol.org/build/3Dmol.js', width=800, height=400)
    # view.addModel(open(pdb_F, 'r').read(), 'pdb')

    # # Set N and C terminal spheres
    # view.addSphere({'center': {'x': n_term_coord[0].tolist(), 'y': n_term_coord[1].tolist(), 'z': n_term_coord[2].tolist()}, 'radius': 1.0, 'color':'red'})
    # view.addSphere({'center': {'x': c_term_coord[0].tolist(), 'y': c_term_coord[1].tolist(), 'z': c_term_coord[2].tolist()}, 'radius': 1.0, 'color':'blue'})

    # view.setStyle({'model': -1}, {"cartoon": {'color': 'spectrum'}})
    # view.zoomTo()
    # view.show()

In [14]:
from Bio.PDB import PDBParser
import numpy as np

def calc_distance(structure):
    model = structure[0]  # assuming only one model in the PDB file
    chain = model['A']  # replace with your chain ID

    # Get the residue numbers for the first (N-term) and last (C-term) residues
    res_keys = sorted(list(chain.child_dict.keys()))
    n_term_residue = chain[res_keys[0]]
    c_term_residue = chain[res_keys[-1]]

    # Get the coordinates for the N and C atoms
    n_term_coord = n_term_residue['N'].get_coord()
    c_term_coord = c_term_residue['C'].get_coord()

    # Calculate the distance
    distance = np.linalg.norm(c_term_coord - n_term_coord)
    return distance, n_term_coord, c_term_coord


In [15]:
# visualizeDistanceInPDB('GSGLDSFKEELDKYFGSGIYQAGSTPCNGVEGFNCYFPLQSYGSGPSKPSKRSFIEDLLFNKVTLADAGFGSG.pdb')
# visualizeDistanceInPDB('GSGTGSGLDSFKEELDKYFGSGTGSGIYQAGSTPCNGVEGFNCYFPLQSYGSGTGSGPSKPSKRSFIEDLLFNKVTLADAGFGSGTGSG.pdb')
# visualizeDistanceInPDB('GSGLDSFKEELDKYFGSGTGSGIYQAGSTPCNGVEGFNCYFPLQSYGSGTGSGPSKPSKRSFIEDLLFNKVTLADAGFGSG.pdb')
# visualizeDistanceInPDB('GSGTGSGLDSFKEELDKYFGSGIYQAGSTPCNGVEGFNCYFPLQSYGSGPSKPSKRSFIEDLLFNKVTLADAGFGSGTGSG.pdb')

for seq in seq_list:
    visualizeDistanceInPDB("variableLinkerSize/"+seq+".pdb")
    print(seq)




42.500946
GSGSGPSKPSKRSFIEDLLFNKVTLADAGFGSGTGSGLFRKSNLKPFERDISTEGSGTGSGPLQPELDSFKEELDKYFKNHTSPDVGSG
47.723385
GSGSGPSKPSKRSFIEDLLFNKVTLADAGFGSGTGSGPLQPELDSFKEELDKYFKNHTSPDVGSGTGSGLFRKSNLKPFERDISTEGSG
48.643745
GSGSGLFRKSNLKPFERDISTEGSGTGSGPSKPSKRSFIEDLLFNKVTLADAGFGSGTGSGPLQPELDSFKEELDKYFKNHTSPDVGSG
40.618233
GSGSGLFRKSNLKPFERDISTEGSGTGSGPLQPELDSFKEELDKYFKNHTSPDVGSGTGSGPSKPSKRSFIEDLLFNKVTLADAGFGSG
20.360767
GSGSGPLQPELDSFKEELDKYFKNHTSPDVGSGTGSGLFRKSNLKPFERDISTEGSGTGSGPSKPSKRSFIEDLLFNKVTLADAGFGSG
12.55597
GSGSGPLQPELDSFKEELDKYFKNHTSPDVGSGTGSGPSKPSKRSFIEDLLFNKVTLADAGFGSGTGSGLFRKSNLKPFERDISTEGSG
37.73284
GSGPSKPSKRSFIEDLLFNKVTLADAGFGSGTGSGLFRKSNLKPFERDISTEGSGTGSGPLQPELDSFKEELDKYFKNHTSPDVGSGGS
50.21553
GSGPSKPSKRSFIEDLLFNKVTLADAGFGSGTGSGPLQPELDSFKEELDKYFKNHTSPDVGSGTGSGLFRKSNLKPFERDISTEGSGGS
56.19434
GSGLFRKSNLKPFERDISTEGSGTGSGPSKPSKRSFIEDLLFNKVTLADAGFGSGTGSGPLQPELDSFKEELDKYFKNHTSPDVGSGGS
38.62101
GSGLFRKSNLKPFERDISTEGSGTGSGPLQPELDSFKEELDKYFKNHTSPDVGSGTGSGPSKPSKRSFIEDLLFNKVTLADAGFGSGGS
36.98



6.362552
GSGSGSGSGPLQPELDSFKEELDKYFKNHTSPDVGSGTGSGPSKPSKRSFIEDLLFNKVTLADAGFGSGTGSGLFRKSNLKPFERDISTEGSG
33.130276
GSGPSKPSKRSFIEDLLFNKVTLADAGFGSGTGSGLFRKSNLKPFERDISTEGSGTGSGPLQPELDSFKEELDKYFKNHTSPDVGSGGSGSGS
46.986717
GSGPSKPSKRSFIEDLLFNKVTLADAGFGSGTGSGPLQPELDSFKEELDKYFKNHTSPDVGSGTGSGLFRKSNLKPFERDISTEGSGGSGSGS
42.274662
GSGLFRKSNLKPFERDISTEGSGTGSGPSKPSKRSFIEDLLFNKVTLADAGFGSGTGSGPLQPELDSFKEELDKYFKNHTSPDVGSGGSGSGS
37.619926
GSGLFRKSNLKPFERDISTEGSGTGSGPLQPELDSFKEELDKYFKNHTSPDVGSGTGSGPSKPSKRSFIEDLLFNKVTLADAGFGSGGSGSGS
43.374508
GSGPLQPELDSFKEELDKYFKNHTSPDVGSGTGSGLFRKSNLKPFERDISTEGSGTGSGPSKPSKRSFIEDLLFNKVTLADAGFGSGGSGSGS
15.986128
GSGPLQPELDSFKEELDKYFKNHTSPDVGSGTGSGPSKPSKRSFIEDLLFNKVTLADAGFGSGTGSGLFRKSNLKPFERDISTEGSGGSGSGS
51.577488
GSGSGSGSGSGPSKPSKRSFIEDLLFNKVTLADAGFGSGTGSGLFRKSNLKPFERDISTEGSGTGSGPLQPELDSFKEELDKYFKNHTSPDVGSG
29.439503
GSGSGSGSGSGPSKPSKRSFIEDLLFNKVTLADAGFGSGTGSGPLQPELDSFKEELDKYFKNHTSPDVGSGTGSGLFRKSNLKPFERDISTEGSG
39.707928
GSGSGSGSGSGLFRKSNLKPFERDISTEGSGTGSGPSKPSKRSFIEDLLFN



56.964207
GSGSGSGSGSGSGSGPSKPSKRSFIEDLLFNKVTLADAGFGSGTGSGLFRKSNLKPFERDISTEGSGTGSGPLQPELDSFKEELDKYFKNHTSPDVGSG
25.509344
GSGSGSGSGSGSGSGPSKPSKRSFIEDLLFNKVTLADAGFGSGTGSGPLQPELDSFKEELDKYFKNHTSPDVGSGTGSGLFRKSNLKPFERDISTEGSG
46.01534
GSGSGSGSGSGSGSGLFRKSNLKPFERDISTEGSGTGSGPSKPSKRSFIEDLLFNKVTLADAGFGSGTGSGPLQPELDSFKEELDKYFKNHTSPDVGSG
37.507217
GSGSGSGSGSGSGSGLFRKSNLKPFERDISTEGSGTGSGPLQPELDSFKEELDKYFKNHTSPDVGSGTGSGPSKPSKRSFIEDLLFNKVTLADAGFGSG
16.809
GSGSGSGSGSGSGSGPLQPELDSFKEELDKYFKNHTSPDVGSGTGSGLFRKSNLKPFERDISTEGSGTGSGPSKPSKRSFIEDLLFNKVTLADAGFGSG
16.053616
GSGSGSGSGSGSGSGPLQPELDSFKEELDKYFKNHTSPDVGSGTGSGPSKPSKRSFIEDLLFNKVTLADAGFGSGTGSGLFRKSNLKPFERDISTEGSG
42.958954
GSGPSKPSKRSFIEDLLFNKVTLADAGFGSGTGSGLFRKSNLKPFERDISTEGSGTGSGPLQPELDSFKEELDKYFKNHTSPDVGSGGSGSGSGSGSGS
45.948353
GSGPSKPSKRSFIEDLLFNKVTLADAGFGSGTGSGPLQPELDSFKEELDKYFKNHTSPDVGSGTGSGLFRKSNLKPFERDISTEGSGGSGSGSGSGSGS
60.577663
GSGLFRKSNLKPFERDISTEGSGTGSGPSKPSKRSFIEDLLFNKVTLADAGFGSGTGSGPLQPELDSFKEELDKYFKNHTSPDVGSGGSGSGSGSGSGS
21.996084
GSGL

