In [1]:
# Install dependencies if needed
# ! pip install --upgrade transformers py3Dmol accelerate
#! pip install biopython
# ! jupyter labextension install jupyterlab_3dmol

In [1]:
from itertools import product, permutations

In [2]:
from transformers import AutoTokenizer, EsmForSequenceClassification
import torch

In [3]:
S2 = [[11, 'PLQPELDSFKEELDKYFKNHTSPDV'], [24, 'QPELDSFKEELDKYFKNHTSP'], [25, "ELDSFKEELDKYFKNHTSPD"], [15, "LDSFKEELDKYF"], [10, "DSFKEELDKYFKNHTS"], [16, "SFKEELDKYF"], [23, "FKEELDKYFKNHT"], [12, "KEELDKYFKNHTSPDVD"], [17, "DKYFKNHTSPDVDL"]]
RBD_RBM = [[21, "LFRKSNLKPFERDISTE"], [8, "DISTEIYQAGSTPCNGVEGFNCYFPLQSYGFQPTNGVGYQPYRVVVL"], [13, "TEIYQAGSTPCNGVEGF"], [27, "TEIYQAGSTPCNGVEGFNCYF"], [9, "IYQAGSTPCNGVEGFNCYFPLQSY"]]
S1_S2  = [[6, "PSKPSKRSFIEDLLF"], [4, "PSKPSKRSFIEDLLFNKV"], [19, "PSKPSKRSFIEDLLFNKVTLADAGF"], [3, "KPSKRSFIEDLLFNK"], [20, "PSKRSFIEDLLFNKV"], [26, "PSKRSFIEDLLFNKVTLADA"], [14, "KRSFIEDLLFNK"]]

group1 = [item[1].upper() for item in S2]
group2 = [item[1].upper() for item in RBD_RBM]
group3 = [item[1].upper() for item in S1_S2]

In [4]:
groups = [group1, group2, group3]

all_permutations = []

# Iterate over all permutations of the groups
for perm in permutations(groups):
    # For each permutation of the groups, calculate the Cartesian product
    combinations = list(product(*perm))
    # Add the combinations to the overall list
    all_permutations.extend(combinations)


In [5]:
for combo in all_permutations:
    print(combo)


('PLQPELDSFKEELDKYFKNHTSPDV', 'LFRKSNLKPFERDISTE', 'PSKPSKRSFIEDLLF')
('PLQPELDSFKEELDKYFKNHTSPDV', 'LFRKSNLKPFERDISTE', 'PSKPSKRSFIEDLLFNKV')
('PLQPELDSFKEELDKYFKNHTSPDV', 'LFRKSNLKPFERDISTE', 'PSKPSKRSFIEDLLFNKVTLADAGF')
('PLQPELDSFKEELDKYFKNHTSPDV', 'LFRKSNLKPFERDISTE', 'KPSKRSFIEDLLFNK')
('PLQPELDSFKEELDKYFKNHTSPDV', 'LFRKSNLKPFERDISTE', 'PSKRSFIEDLLFNKV')
('PLQPELDSFKEELDKYFKNHTSPDV', 'LFRKSNLKPFERDISTE', 'PSKRSFIEDLLFNKVTLADA')
('PLQPELDSFKEELDKYFKNHTSPDV', 'LFRKSNLKPFERDISTE', 'KRSFIEDLLFNK')
('PLQPELDSFKEELDKYFKNHTSPDV', 'DISTEIYQAGSTPCNGVEGFNCYFPLQSYGFQPTNGVGYQPYRVVVL', 'PSKPSKRSFIEDLLF')
('PLQPELDSFKEELDKYFKNHTSPDV', 'DISTEIYQAGSTPCNGVEGFNCYFPLQSYGFQPTNGVGYQPYRVVVL', 'PSKPSKRSFIEDLLFNKV')
('PLQPELDSFKEELDKYFKNHTSPDV', 'DISTEIYQAGSTPCNGVEGFNCYFPLQSYGFQPTNGVGYQPYRVVVL', 'PSKPSKRSFIEDLLFNKVTLADAGF')
('PLQPELDSFKEELDKYFKNHTSPDV', 'DISTEIYQAGSTPCNGVEGFNCYFPLQSYGFQPTNGVGYQPYRVVVL', 'KPSKRSFIEDLLFNK')
('PLQPELDSFKEELDKYFKNHTSPDV', 'DISTEIYQAGSTPCNGVEGFNCYFPLQSYGFQPTNGVGYQPYRVVVL', '

In [42]:
import pandas as pd

# Initializing DataFrame
df = pd.DataFrame(columns=["sequence_id", "variant_short", "variant_long","variant_short_diff","variant_long_diff" ])

# Generating sequences for each combination and adding them to DataFrame
for i, combo in enumerate(all_permutations):
    variant_long = 'GSGTGSG' + 'GSGTGSG'.join(combo) + 'GSGTGSG'
    variant_short = 'GSG' + 'GSG'.join(combo) + 'GSG'
    variant_short_diff = 'GSGTGSG' + 'GSG'.join(combo) + 'GSGTGSG'
    variant_long_diff = 'GSG' + 'GSGTGSG'.join(combo) + 'GSG'
    df.loc[i] = ['sequence_' + str(i+1), variant_short, variant_long, variant_short_diff, variant_long_diff]


In [43]:
df
# save df in a file
df.to_csv('all_combinations.csv', index=False)

## Set up ESM machine learning model for 3D structure generation.

In [8]:
from transformers import AutoTokenizer, EsmForProteinFolding

model = EsmForProteinFolding.from_pretrained("facebook/esmfold_v1")

tokenizer = AutoTokenizer.from_pretrained("facebook/esmfold_v1")



Some weights of EsmForProteinFolding were not initialized from the model checkpoint at facebook/esmfold_v1 and are newly initialized: ['esm.contact_head.regression.bias', 'esm.contact_head.regression.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
from transformers.models.esm.openfold_utils.protein import to_pdb, Protein as OFProtein
from transformers.models.esm.openfold_utils.feats import atom14_to_atom37

def convert_outputs_to_pdb(outputs):
    final_atom_positions = atom14_to_atom37(outputs["positions"][-1], outputs)
    outputs = {k: v.detach().to("cpu").numpy() for k, v in outputs.items()}
    final_atom_positions = final_atom_positions.detach().cpu().numpy()
    final_atom_mask = outputs["atom37_atom_exists"]
    pdbs = []
    for i in range(outputs["aatype"].shape[0]):
        aa = outputs["aatype"][i]
        pred_pos = final_atom_positions[i]
        mask = final_atom_mask[i]
        resid = outputs["residue_index"][i] + 1
        pred = OFProtein(
            aatype=aa,
            atom_positions=pred_pos,
            atom_mask=mask,
            residue_index=resid,
            b_factors=outputs["plddt"][i],
            chain_index=outputs["chain_index"][i] if "chain_index" in outputs else None,
        )
        pdbs.append(to_pdb(pred))
    return pdbs

In [10]:
from Bio.PDB import PDBParser
import numpy as np

def calc_distance(structure):
    model = structure[0]  # assuming only one model in the PDB file
    chain = model['A']  # replace with your chain ID

    # Get the residue numbers for the first (N-term) and last (C-term) residues
    res_keys = sorted(list(chain.child_dict.keys()))
    n_term_residue = chain[res_keys[0]]
    c_term_residue = chain[res_keys[-1]]

    # Get the coordinates for the N and C atoms
    n_term_coord = n_term_residue['N'].get_coord()
    c_term_coord = c_term_residue['C'].get_coord()

    # Calculate the distance
    distance = np.linalg.norm(c_term_coord - n_term_coord)
    return distance, n_term_coord, c_term_coord


## Single sequence prediction

In [16]:
inputs = tokenizer(["GSGTGSGPLQPELDSFKEELDKYFKNHTSPDVGSGLFRKSNLKPFERDISTEGSGPSKPSKRSFIEDLLFGSGTG"], return_tensors="pt", add_special_tokens=False)['input_ids']  # A tiny random peptide

outputs = model(inputs)

folded_positions = outputs.positions

In [19]:
pdb = convert_outputs_to_pdb(outputs)

In [20]:
import py3Dmol

view = py3Dmol.view(js='https://3dmol.org/build/3Dmol.js', width=800, height=400)
view.addModel("".join(pdb), 'pdb')
view.setStyle({'model': -1}, {"cartoon": {'color': 'spectrum'}})

<py3Dmol.view at 0x7f1a15ea6190>

In [21]:
# The plddt field is scaled from 0-1 on earlier versions of ESMFold but will be updated
# to match AlphaFold's scale of 0-100 in future versions.
# We check here so that this code will work on either:

if torch.max(outputs['plddt']) <= 1.0:
    vmin = 0.5
    vmax = 0.95
else:
    vmin = 50
    vmax = 95

view.setStyle({'cartoon': {'colorscheme': {'prop':'b','gradient': 'roygb','min': vmin,'max': vmax}}})

<py3Dmol.view at 0x7f1a15ea6190>

In [41]:
with open("output_structure.pdb", "w") as f:
    f.write("".join(pdb))

In [68]:
# Use the PDB parser to load the PDB file

def  visualizeDistanceInPDB()
    parser = PDBParser()
    structure = parser.get_structure('shortest', 'longLinkers/GSGTGSGPLQPELDSFKEELDKYFKNHTSPDVGSGTEIYQAGSTPCNGVEGFGSGPSKPSKRSFIEDLLFGSGTGSG.pdb')

    distance, n_term_coord, c_term_coord = calc_distance(structure)
    print(distance)

    # Plotting with 3Dmol
    view = py3Dmol.view(js='https://3dmol.org/build/3Dmol.js', width=800, height=400)
    view.addModel(open('longLinkers/GSGTGSGPLQPELDSFKEELDKYFKNHTSPDVGSGTEIYQAGSTPCNGVEGFGSGPSKPSKRSFIEDLLFGSGTGSG.pdb', 'r').read(), 'pdb')

    # Set N and C terminal spheres
    view.addSphere({'center': {'x': n_term_coord[0].tolist(), 'y': n_term_coord[1].tolist(), 'z': n_term_coord[2].tolist()}, 'radius': 1.0, 'color':'red'})
    view.addSphere({'center': {'x': c_term_coord[0].tolist(), 'y': c_term_coord[1].tolist(), 'z': c_term_coord[2].tolist()}, 'radius': 1.0, 'color':'blue'})

    view.setStyle({'model': -1}, {"cartoon": {'color': 'spectrum'}})
    view.zoomTo()
    view.show()

SyntaxError: invalid syntax (2923939451.py, line 3)

## Batch prediction

In [16]:
df.variant_short.tolist()

['GSGTGSGPLQPELDSFKEELDKYFKNHTSPDVGSGTGSGLFRKSNLKPFERDISTEGSGTGSGPSKPSKRSFIEDLLFGSGTGSG',
 'GSGTGSGPLQPELDSFKEELDKYFKNHTSPDVGSGTGSGLFRKSNLKPFERDISTEGSGTGSGPSKPSKRSFIEDLLFNKVGSGTGSG',
 'GSGTGSGPLQPELDSFKEELDKYFKNHTSPDVGSGTGSGLFRKSNLKPFERDISTEGSGTGSGPSKPSKRSFIEDLLFNKVTLADAGFGSGTGSG',
 'GSGTGSGPLQPELDSFKEELDKYFKNHTSPDVGSGTGSGLFRKSNLKPFERDISTEGSGTGSGKPSKRSFIEDLLFNKGSGTGSG',
 'GSGTGSGPLQPELDSFKEELDKYFKNHTSPDVGSGTGSGLFRKSNLKPFERDISTEGSGTGSGPSKRSFIEDLLFNKVGSGTGSG',
 'GSGTGSGPLQPELDSFKEELDKYFKNHTSPDVGSGTGSGLFRKSNLKPFERDISTEGSGTGSGPSKRSFIEDLLFNKVTLADAGSGTGSG',
 'GSGTGSGPLQPELDSFKEELDKYFKNHTSPDVGSGTGSGLFRKSNLKPFERDISTEGSGTGSGKRSFIEDLLFNKGSGTGSG',
 'GSGTGSGPLQPELDSFKEELDKYFKNHTSPDVGSGTGSGDISTEIYQAGSTPCNGVEGFNCYFPLQSYGFQPTNGVGYQPYRVVVLGSGTGSGPSKPSKRSFIEDLLFGSGTGSG',
 'GSGTGSGPLQPELDSFKEELDKYFKNHTSPDVGSGTGSGDISTEIYQAGSTPCNGVEGFNCYFPLQSYGFQPTNGVGYQPYRVVVLGSGTGSGPSKPSKRSFIEDLLFNKVGSGTGSG',
 'GSGTGSGPLQPELDSFKEELDKYFKNHTSPDVGSGTGSGDISTEIYQAGSTPCNGVEGFNCYFPLQSYGFQPTNGVGYQPYRVVVLGSGTGSGPSKPSKRSFIEDLLFNK

In [53]:
df.variant_short_diff.tolist()

['GSGTGSGPLQPELDSFKEELDKYFKNHTSPDVGSGLFRKSNLKPFERDISTEGSGPSKPSKRSFIEDLLFGSGTGSG',
 'GSGTGSGPLQPELDSFKEELDKYFKNHTSPDVGSGLFRKSNLKPFERDISTEGSGPSKPSKRSFIEDLLFNKVGSGTGSG',
 'GSGTGSGPLQPELDSFKEELDKYFKNHTSPDVGSGLFRKSNLKPFERDISTEGSGPSKPSKRSFIEDLLFNKVTLADAGFGSGTGSG',
 'GSGTGSGPLQPELDSFKEELDKYFKNHTSPDVGSGLFRKSNLKPFERDISTEGSGKPSKRSFIEDLLFNKGSGTGSG',
 'GSGTGSGPLQPELDSFKEELDKYFKNHTSPDVGSGLFRKSNLKPFERDISTEGSGPSKRSFIEDLLFNKVGSGTGSG',
 'GSGTGSGPLQPELDSFKEELDKYFKNHTSPDVGSGLFRKSNLKPFERDISTEGSGPSKRSFIEDLLFNKVTLADAGSGTGSG',
 'GSGTGSGPLQPELDSFKEELDKYFKNHTSPDVGSGLFRKSNLKPFERDISTEGSGKRSFIEDLLFNKGSGTGSG',
 'GSGTGSGPLQPELDSFKEELDKYFKNHTSPDVGSGDISTEIYQAGSTPCNGVEGFNCYFPLQSYGFQPTNGVGYQPYRVVVLGSGPSKPSKRSFIEDLLFGSGTGSG',
 'GSGTGSGPLQPELDSFKEELDKYFKNHTSPDVGSGDISTEIYQAGSTPCNGVEGFNCYFPLQSYGFQPTNGVGYQPYRVVVLGSGPSKPSKRSFIEDLLFNKVGSGTGSG',
 'GSGTGSGPLQPELDSFKEELDKYFKNHTSPDVGSGDISTEIYQAGSTPCNGVEGFNCYFPLQSYGFQPTNGVGYQPYRVVVLGSGPSKPSKRSFIEDLLFNKVTLADAGFGSGTGSG',
 'GSGTGSGPLQPELDSFKEELDKYFKNHTSPDVGSGDISTEIYQAGSTPCNGVEGFNCYFP

In [46]:
df.variant_long.tolist()

['GSGTGSGPLQPELDSFKEELDKYFKNHTSPDVGSGTGSGLFRKSNLKPFERDISTEGSGTGSGPSKPSKRSFIEDLLFGSGTGSG',
 'GSGTGSGPLQPELDSFKEELDKYFKNHTSPDVGSGTGSGLFRKSNLKPFERDISTEGSGTGSGPSKPSKRSFIEDLLFNKVGSGTGSG',
 'GSGTGSGPLQPELDSFKEELDKYFKNHTSPDVGSGTGSGLFRKSNLKPFERDISTEGSGTGSGPSKPSKRSFIEDLLFNKVTLADAGFGSGTGSG',
 'GSGTGSGPLQPELDSFKEELDKYFKNHTSPDVGSGTGSGLFRKSNLKPFERDISTEGSGTGSGKPSKRSFIEDLLFNKGSGTGSG',
 'GSGTGSGPLQPELDSFKEELDKYFKNHTSPDVGSGTGSGLFRKSNLKPFERDISTEGSGTGSGPSKRSFIEDLLFNKVGSGTGSG',
 'GSGTGSGPLQPELDSFKEELDKYFKNHTSPDVGSGTGSGLFRKSNLKPFERDISTEGSGTGSGPSKRSFIEDLLFNKVTLADAGSGTGSG',
 'GSGTGSGPLQPELDSFKEELDKYFKNHTSPDVGSGTGSGLFRKSNLKPFERDISTEGSGTGSGKRSFIEDLLFNKGSGTGSG',
 'GSGTGSGPLQPELDSFKEELDKYFKNHTSPDVGSGTGSGDISTEIYQAGSTPCNGVEGFNCYFPLQSYGFQPTNGVGYQPYRVVVLGSGTGSGPSKPSKRSFIEDLLFGSGTGSG',
 'GSGTGSGPLQPELDSFKEELDKYFKNHTSPDVGSGTGSGDISTEIYQAGSTPCNGVEGFNCYFPLQSYGFQPTNGVGYQPYRVVVLGSGTGSGPSKPSKRSFIEDLLFNKVGSGTGSG',
 'GSGTGSGPLQPELDSFKEELDKYFKNHTSPDVGSGTGSGDISTEIYQAGSTPCNGVEGFNCYFPLQSYGFQPTNGVGYQPYRVVVLGSGTGSGPSKPSKRSFIEDLLFNK

In [30]:
df.variant_long_diff.tolist()

['GSGPLQPELDSFKEELDKYFKNHTSPDVGSGTGSGLFRKSNLKPFERDISTEGSGTGSGPSKPSKRSFIEDLLFGSG',
 'GSGPLQPELDSFKEELDKYFKNHTSPDVGSGTGSGLFRKSNLKPFERDISTEGSGTGSGPSKPSKRSFIEDLLFNKVGSG',
 'GSGPLQPELDSFKEELDKYFKNHTSPDVGSGTGSGLFRKSNLKPFERDISTEGSGTGSGPSKPSKRSFIEDLLFNKVTLADAGFGSG',
 'GSGPLQPELDSFKEELDKYFKNHTSPDVGSGTGSGLFRKSNLKPFERDISTEGSGTGSGKPSKRSFIEDLLFNKGSG',
 'GSGPLQPELDSFKEELDKYFKNHTSPDVGSGTGSGLFRKSNLKPFERDISTEGSGTGSGPSKRSFIEDLLFNKVGSG',
 'GSGPLQPELDSFKEELDKYFKNHTSPDVGSGTGSGLFRKSNLKPFERDISTEGSGTGSGPSKRSFIEDLLFNKVTLADAGSG',
 'GSGPLQPELDSFKEELDKYFKNHTSPDVGSGTGSGLFRKSNLKPFERDISTEGSGTGSGKRSFIEDLLFNKGSG',
 'GSGPLQPELDSFKEELDKYFKNHTSPDVGSGTGSGDISTEIYQAGSTPCNGVEGFNCYFPLQSYGFQPTNGVGYQPYRVVVLGSGTGSGPSKPSKRSFIEDLLFGSG',
 'GSGPLQPELDSFKEELDKYFKNHTSPDVGSGTGSGDISTEIYQAGSTPCNGVEGFNCYFPLQSYGFQPTNGVGYQPYRVVVLGSGTGSGPSKPSKRSFIEDLLFNKVGSG',
 'GSGPLQPELDSFKEELDKYFKNHTSPDVGSGTGSGDISTEIYQAGSTPCNGVEGFNCYFPLQSYGFQPTNGVGYQPYRVVVLGSGTGSGPSKPSKRSFIEDLLFNKVTLADAGFGSG',
 'GSGPLQPELDSFKEELDKYFKNHTSPDVGSGTGSGDISTEIYQAGSTPCNGVEGFNCYFP

In [14]:
epitopes_tokenized1 = tokenizer(df.variant_short.tolist(), padding=False, add_special_tokens=False)['input_ids']

In [47]:
epitopes_tokenized2 = tokenizer(df.variant_long.tolist(), padding=False, add_special_tokens=False)['input_ids']

In [31]:
epitopes_tokenized3 = tokenizer(df.variant_short_diff.tolist(), padding=False, add_special_tokens=False)['input_ids']

In [48]:
epitopes_tokenized4 = tokenizer(df.variant_long_diff.tolist(), padding=False, add_special_tokens=False)['input_ids']

Now we loop over our tokenized data, passing each sequence into our model:


In [33]:
from tqdm import tqdm

outputs = []
                                                  
with torch.no_grad():
    for input_ids in tqdm(epitopes_tokenized1):
        input_ids = torch.tensor(input_ids, device='cpu').unsqueeze(0)
        output = model(input_ids)
        outputs.append({key: val.cpu() for key, val in output.items()})

100%|██████████| 1890/1890 [5:14:54<00:00, 10.00s/it]  


In [34]:
pdb_list1 = [convert_outputs_to_pdb(output) for output in outputs]

In [35]:
protein_identifiers = df.variant_short.tolist()
for identifier, pdb in zip(protein_identifiers, pdb_list1):
    with open(f"sameEnds/shortLinkers/{identifier}.pdb", "w") as f:
        f.write("".join(pdb))

In [18]:
from tqdm import tqdm

outputs = []

with torch.no_grad():
    for input_ids in tqdm(epitopes_tokenized2):
        input_ids = torch.tensor(input_ids, device='cpu').unsqueeze(0)
        output = model(input_ids)
        outputs.append({key: val.cpu() for key, val in output.items()})

100%|██████████| 1890/1890 [3:35:53<00:00,  6.85s/it]  


In [19]:
pdb_list2 = [convert_outputs_to_pdb(output) for output in outputs]

In [25]:
protein_identifiers = df.variant_long.tolist()
for identifier, pdb in zip(protein_identifiers, pdb_list2):
    with open(f"sameEnds/longLinkers/{identifier}.pdb", "w") as f:
        f.write("".join(pdb))

In [49]:
from tqdm import tqdm

outputs = []

with torch.no_grad():
    for input_ids in tqdm(epitopes_tokenized3):
        input_ids = torch.tensor(input_ids, device='cpu').unsqueeze(0)
        output = model(input_ids)
        outputs.append({key: val.cpu() for key, val in output.items()})

pdb_list3 = [convert_outputs_to_pdb(output) for output in outputs]

protein_identifiers = df.variant_short_diff.tolist()
for identifier, pdb in zip(protein_identifiers, pdb_list3):
    with open(f"differentEnds/shortLinkers/{identifier}.pdb", "w") as f:
        f.write("".join(pdb))

100%|██████████| 1890/1890 [4:43:11<00:00,  8.99s/it]  


In [55]:
protein_identifiers = df.variant_short_diff.tolist()
for identifier, pdb in zip(protein_identifiers, pdb_list3):
    with open(f"differentEnds/shortLinkers/{identifier}.pdb", "w") as f:
        f.write("".join(pdb))

In [50]:
from tqdm import tqdm

outputs = []

with torch.no_grad():
    for input_ids in tqdm(epitopes_tokenized4):
        input_ids = torch.tensor(input_ids, device='cpu').unsqueeze(0)
        output = model(input_ids)
        outputs.append({key: val.cpu() for key, val in output.items()})

pdb_list4 = [convert_outputs_to_pdb(output) for output in outputs]

protein_identifiers = df.variant_long_diff.tolist()
for identifier, pdb in zip(protein_identifiers, pdb_list4):
    with open(f"differentEnds/longLinkers/{identifier}.pdb", "w") as f:
        f.write("".join(pdb))

100%|██████████| 1890/1890 [4:36:35<00:00,  8.78s/it]  


In [56]:
protein_identifiers = df.variant_long_diff.tolist()
for identifier, pdb in zip(protein_identifiers, pdb_list4):
    with open(f"differentEnds/longLinkers/{identifier}.pdb", "w") as f:
        f.write("".join(pdb))

## Calculate distances between N and C 

In [4]:
from Bio.PDB import PDBParser
import numpy as np

def calc_distance_and_return_coordinates(pdb_file):
    parser = PDBParser(QUIET=True)
    structure = parser.get_structure('my_protein', pdb_file)
    model = structure[0]  # assuming only one model in the PDB file
    chain = model['A']  # replace with your chain ID

    # Get the residue numbers for the first (N-term) and last (C-term) residues
    res_keys = sorted(list(chain.child_dict.keys()))
    n_term_residue = chain[res_keys[0]]
    c_term_residue = chain[res_keys[-1]]

    # Get the coordinates for the N and C atoms
    n_term_coord = n_term_residue['N'].get_coord()
    c_term_coord = c_term_residue['C'].get_coord()

    # Calculate the distance
    distance = np.linalg.norm(c_term_coord - n_term_coord)
    return distance, n_term_coord.tolist(), c_term_coord.tolist()

In [5]:
import os
import pandas as pd
import numpy as np
import re

def process_pdb_directory(directory_path, output_csv_file):
    result_dict = {"File Name": [], "Distance": [], "N-Term Coords": [], "C-Term Coords": [], "E1": [], "E2": [], "E3": []}

    for filename in os.listdir(directory_path):
        if filename.endswith(".pdb"):
            pdb_file = os.path.join(directory_path, filename)
            distance, n_term_coords, c_term_coords = calc_distance_and_return_coordinates(pdb_file)

            # Extract the sequence from the filename
            sequence = filename.rstrip('.pdb')
            

            fragments = re.split('GSG', sequence)

            result_dict["File Name"].append(filename)
            result_dict["Distance"].append(distance)
            result_dict["N-Term Coords"].append(n_term_coords)
            result_dict["C-Term Coords"].append(c_term_coords)
            # Assumes there are 3 fragments
            for i in range(1, 4):
                result_dict[f"E{i}"].append(fragments[i] if i < len(fragments) else None)

    results_df = pd.DataFrame(result_dict)
    results_df.to_csv(output_csv_file, index=False)



In [6]:
process_pdb_directory("sameEnds/longLinkers", "sameEnds/long_Linker_distances.csv")
process_pdb_directory("sameEnds/shortLinkers", "sameEnds/short_Linker_distances.csv")
process_pdb_directory("differentEnds/longLinkers", "differentEnds/long_Linker_distances.csv")
process_pdb_directory("differentEnds/shortLinkers", "differentEnds/short_Linker_distances.csv")