In [1]:
import pandas as pd
import ast

# Import necessary modules from PyRosetta
from pyrosetta import *
from pyrosetta.teaching import *
from pyrosetta.rosetta.core.conformation import ResidueFactory
from pyrosetta.rosetta.core.chemical import aa_from_oneletter_code
from Bio.SeqUtils import seq3

# Initialize PyRosetta once at the beginning of your notebook session
init()

┌──────────────────────────────────────────────────────────────────────────────┐
│                                 PyRosetta-4                                  │
│              Created in JHU by Sergey Lyskov and PyRosetta Team              │
│              (C) Copyright Rosetta Commons Member Institutions               │
│                                                                              │
│ NOTE: USE OF PyRosetta FOR COMMERCIAL PURPOSES REQUIRE PURCHASE OF A LICENSE │
│         See LICENSE.PyRosetta.md or email license@uw.edu for details         │
└──────────────────────────────────────────────────────────────────────────────┘
PyRosetta-4 2024 [Rosetta PyRosetta4.conda.linux.cxx11thread.serialization.CentOS.python311.Release 2024.24+release.ca096dac4f43ee5ee195f87f3703a520fcf60cf9 2024-06-14T16:57:57] retrieved from: http://www.pyrosetta.org
core.init: Checking for fconfig files in pwd and ./rosetta/flags
core.init: Rosetta version: PyRosetta4.conda.linux.cxx11thread.seria

In [2]:
import numpy as np

In [3]:
# Load the cleaned dataset; the gap-lookup cell below reads its 'cath_id' column.
df = pd.read_csv('cleaned_data.csv')

In [4]:
def find_gaps_and_create_dict(df, filepath):
    """Collect the logged gap record for every ``cath_id`` found in ``df``.

    The text file at ``filepath`` is scanned for a marker line containing
    ``[GAP DETECTED for cath_id <cath_id>]``. The line directly after the
    first matching marker is expected to look like ``<label>: <literal>``;
    the literal part is parsed with ``ast.literal_eval`` and stored under
    that ``cath_id``.

    Args:
        df: DataFrame whose rows provide a ``cath_id`` value.
        filepath: Path of the log file to scan.

    Returns:
        dict mapping each ``cath_id`` that has a marker (followed by a
        payload line) to the parsed literal. IDs without a marker, or whose
        marker is the very last line of the file, are left out.

    Raises:
        ValueError: If the payload line has no ``': '`` separator, or the
            literal cannot be parsed.
        SyntaxError: If the literal is not valid Python literal syntax.
    """
    gap_dict = {}

    with open(filepath, 'r') as file:
        lines = file.readlines()

    for _, row in df.iterrows():
        cath_id = row['cath_id']
        search_str = f"[GAP DETECTED for cath_id {cath_id}]"
        for i, line in enumerate(lines):
            if search_str not in line:
                continue

            # A marker on the last line has no payload line after it; treat
            # that as "no gap data" instead of failing with IndexError.
            if i + 1 >= len(lines):
                break

            next_line = lines[i + 1].strip()
            # Split on the FIRST ': ' only, so payloads that themselves
            # contain ': ' (e.g. dict literals) are kept intact.
            _label, separator, value = next_line.partition(': ')
            if not separator:
                raise ValueError(
                    f"Expected '<label>: <value>' after gap marker for "
                    f"cath_id {cath_id}, got: {next_line!r}"
                )
            gap_dict[cath_id] = ast.literal_eval(value)
            # Only the first marker for a given cath_id is used.
            break

    return gap_dict


# Load data
# Build the cath_id -> gap record mapping from the main-loop processing log.
# `result_dict` is read again by the remodelling cell further down.
result_dict = find_gaps_and_create_dict(df, 'data_processing_main_loop_output.txt')
# NOTE(review): this prints the whole mapping, which makes for a very long cell output.
print(result_dict)

{'3be3A00': [50], '3zq4C03': [493, 494, 495, 496, 497, 498, 499, 500], '1aqcA00': [354, 355, 366, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 408, 409, 429, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 458], '2gnxA01': [122, 137, 146, 173, 174, 175, 176, 177, 178, 179, 180, 181, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 259, 264, 286], '3i9v600': [58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73], '3rylA01': [246], '2i8dA01': [27, 44, 53, 71], '3m92A01': [67], '3vkgA02': [1620, 1621, 1622, 1623, 1624], '4fgmA01': [142], '3natA01': [54, 55, 56, 57, 58, 59, 60, 143], '2xqoA00': [92, 131, 139, 158, 185, 220], '4pkwA02': [348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367], '2pofA00': [79, 111, 194, 195, 207, 214], '3tdqA00': [58, 92, 100], '1xl3C00': [18, 41, 49], '3n6rA03': [549, 585, 621, 622],

In [None]:
# for _, row in df.iterrows():
#     # dataframe accession
#     cath_id = row['cath_id']
#     new_seq = row['sequences']

#     # custom data accession
#     template_pdb = f"../data/{cath_id}/pdb/{cath_id}"
#     original_pose = pose_from_pdb(template_pdb)
#     original_sequence = pose.sequence()

#     # Modified Pose
#     modified_pose = Pose()

#     for i, residue in enumerate(original_sequence):
#         if original_pose.residue(1).name().split(':')[0] == 'pdb_UNK': # Handle UNK residues
#             # Use resolved residue from new_seq
#             residue_from_new_seq = new_seq[i] # 1 letter AA code
#             aa_code = seq3(residue_from_new_seq).upper() # Convert to 3 letter code, all uppercase
#             new_res_type = residue_set.name_map(aa_code)
#             new_res = ResidueFactory.create_residue(new_res_type)
#             modified_pose.append_residue_by_jump(new_res, i)
#         else:
#             # Add original residue information
#             modified_pose.append_residue_by_jump(original_pose.residue(i+1), i) # pyrosetta numbers residues starting from 1, not 0
    

#     # Add in resolved GAP residues
#     if cath_id in result_dict:
#         gaps = result_dict[cath_id]
#         for idx in gaps:
#             # Create residue object
#             insertion_1_letter_code = new_seq[idx]
#             aa_code = seq3(residue_from_new_seq).upper() # Convert to 3 letter code, all uppercase
#             new_res_type = residue_set.name_map(aa_code)
#             new_res = ResidueFactory.create_residue(new_res_type)

#             # Determine insertion index
#             insertion_idx = idx - original_pose.pdb_info().number(1)

#             # Insertion
#             modified_pose.append_residue_by_jump(new_res, insertion_idx)

In [5]:
# dataframe accession
cath_id = '2gnxA01'
# NOTE(review): new_seq contains a blank character between '...TAKANP' and
# 'KNPKVFASLK'; a blank is not a 1-letter amino-acid code, so confirm that
# position is never looked up (or replace it with the resolved residue).
new_seq = 'LKKVVAGRGAPGGTAPHVEELLPHLSEQLCFFVQARMEIADFYEKMYALSTQKFINTEELVSTLDTILRKYSSRFHHPILSPLESSFQLEVGVLSHLLKAQAQISEWKFLPSLVTLHNAHTKLQSWGQTFEKQRETKKHLFGGQSQKAVQPPHLFLWLMKLKTMLLAKFSFYFHEALSRQTTASEMKALTAKANP KNPKVFASLK'

# custom data accession
template_pdb = f"../data/{cath_id}/pdb/{cath_id}"
original_pose = pose_from_pdb(template_pdb)
original_sequence = original_pose.sequence()
residue_set = original_pose.residue_type_set_for_pose()

# new_seq is indexed relative to the PDB number of the first residue in the
# pose: index = PDB residue number - first_resnum (same convention the gap
# handling below uses). Looked up once instead of on every iteration.
first_resnum = original_pose.pdb_info().number(1)

# Modified Pose
modified_pose = Pose()

for i, _ in enumerate(original_sequence):
    seqpos = i + 1  # pyrosetta numbers residues starting from 1, not 0
    current_res = original_pose.residue(seqpos)

    # Check the residue at THIS position. (The check used to look at residue 1
    # on every iteration, so either all residues or none were treated as UNK.)
    if current_res.name().split(':')[0] == 'pdb_UNK': # Handle UNK residues
        # Use resolved residue from new_seq. The lookup goes through the PDB
        # number rather than the pose index, because the pose index falls
        # behind the new_seq index after the first unresolved gap.
        unk_sequence_index = original_pose.pdb_info().number(seqpos) - first_resnum
        residue_from_new_seq = new_seq[unk_sequence_index] # 1 letter AA code
        aa_code = seq3(residue_from_new_seq).upper() # Convert to 3 letter code, all uppercase
        new_res_type = residue_set.name_map(aa_code)
        new_res = ResidueFactory.create_residue(new_res_type)
        modified_pose.append_residue_by_jump(new_res, i)
    else:
        # Add original residue information
        modified_pose.append_residue_by_jump(current_res, i)


# Add in resolved GAP residues
if cath_id in result_dict:
    gaps = result_dict[cath_id]
    # Process gaps in ascending residue-number order: the insertion position
    # below equals the new_seq index, which only lines up with the pose
    # numbering once every lower-numbered gap has already been filled.
    for resnum in sorted(gaps):

        # Create residue object
        sequence_index = resnum - first_resnum
        insertion_1_letter_code = new_seq[sequence_index]
        aa_code = seq3(insertion_1_letter_code).upper() # Convert to 3 letter code, all uppercase
        new_res_type = residue_set.name_map(aa_code)
        new_res = ResidueFactory.create_residue(new_res_type)

        # Determine insertion index: the 0-based new_seq index is used as the
        # 1-based pose position to insert after.
        insertion_idx = sequence_index

        print(f'Adding {aa_code} at position {insertion_idx}')

        # Insertion (third argument True = build ideal geometry for the new residue)
        modified_pose.append_polymer_residue_after_seqpos(new_res, insertion_idx, True)
        # modified_pose.insert_residue_by_jump(new_res, insertion_idx, modified_pose.size(), "", "", False)

        # modified_pose.append_residue_by_jump(new_res, insertion_idx)

core.chemical.GlobalResidueTypeSet: Finished initializing fa_standard residue type set.  Created 985 residue types
core.chemical.GlobalResidueTypeSet: Total time to initialize 0.458517 seconds.
core.import_pose.import_pose: File '../data/2gnxA01/pdb/2gnxA01' automatically determined to be of type PDB
core.chemical.GlobalResidueTypeSet: Loading (but possibly not actually using) 'UNK' from the PDB components dictionary for residue type 'pdb_UNK'
core.pack.pack_missing_sidechains: packing residue number 1 because of missing atom number 6 atom name CG
core.pack.pack_missing_sidechains: packing residue number 2 because of missing atom number 6 atom name CG
core.pack.pack_missing_sidechains: packing residue number 3 because of missing atom number 6 atom name CG
core.pack.pack_missing_sidechains: packing residue number 4 because of missing atom number 6 atom name CG
core.pack.pack_missing_sidechains: packing residue number 5 because of missing atom number 6 atom name CG
core.pack.pack_missing

In [None]:
# Report every residue of the original pose (pose numbering starts at 1)
residue_count = original_pose.total_residue()
for seqpos in range(1, residue_count + 1):
    current_residue = original_pose.residue(seqpos)

    # Residue type name, then the phi / psi / omega angles at this position
    print(f"Residue {seqpos} name: {current_residue.name()}")
    print(f"Residue {seqpos} phi angle: {original_pose.phi(seqpos)}")
    print(f"Residue {seqpos} psi angle: {original_pose.psi(seqpos)}")
    print(f"Residue {seqpos} omega angle: {original_pose.omega(seqpos)}\n")

In [None]:
# Report every residue of the rebuilt pose (pose numbering starts at 1)
residue_count = modified_pose.total_residue()
for seqpos in range(1, residue_count + 1):
    current_residue = modified_pose.residue(seqpos)

    # Residue type name, then the phi / psi / omega angles at this position
    print(f"Residue {seqpos} name: {current_residue.name()}")
    print(f"Residue {seqpos} phi angle: {modified_pose.phi(seqpos)}")
    print(f"Residue {seqpos} psi angle: {modified_pose.psi(seqpos)}")
    print(f"Residue {seqpos} omega angle: {modified_pose.omega(seqpos)}\n")

In [7]:


# # Example: Use LoopModeler to model missing loops or regions
# # Replace this with your specific modeling logic
# loop_modeler = LoopModeler()
# loop_modeler.apply(modified_pose)


# Example: Perform structure refinement (if needed)
# Replace this with your specific modeling logic
# For example, energy minimization
# relax = pyrosetta.rosetta.protocols.relax.FastRelax()
# relax.apply(modified_pose)

# Output file name is relative (no directory part), i.e. "<cath_id>_remodeled.pdb"
# in the current working directory.
output_pdb = f"{cath_id}_remodeled.pdb"
# Save the modeled structure
# dump_pdb's return value is the cell result (True in the recorded output).
modified_pose.dump_pdb(output_pdb)

True

In [None]:
# idx = 50 - pose.pdb_info().number(1)
# new_sequence[idx]

In [None]:
# for i, residue in enumerate(original_sequence):
#     if residue == new_sequence[i]:
#         print(f'[   MATCH] Original: {residue} == New: {new_sequence[i]}')
#     else:
#         print(f'[MISMATCH] Original: {residue} == New: {new_sequence[i]}')

In [None]:
# def perform_homology_modeling(cath_id, new_sequence, template_pdb, output_pdb):
#     """
#     Perform homology modeling with larger sequences using PyRosetta.

#     Args:
#     - cath_id (str): Identifier for the structure.
#     - new_sequence (str): New sequence to model.
#     - template_pdb (str): Path to the template structure in PDB format.
#     - output_pdb (str): Path to save the modeled structure in PDB format.
#     """
    
#     # Create a Pose object from the template PDB
#     pose = pose_from_pdb(template_pdb)

#     # Extend the pose to match the length of the new sequence
#     extended_pose = extend_pose_to_match_sequence(pose, new_sequence)

#     # Example: Use LoopModeler to model missing loops or regions
#     # Replace this with your specific modeling logic
#     loop_modeler = LoopModeler()
#     loop_modeler.apply(extended_pose)
   

#     # Example: Perform structure refinement (if needed)
#     # Replace this with your specific modeling logic
#     # For example, energy minimization
#     relax = pyrosetta.rosetta.protocols.relax.FastRelax()
#     relax.apply(pose)

#     # Save the modeled structure
#     pose.dump_pdb(output_pdb)

In [None]:
 # # Create a Pose object from the template PDB
 #    pose = pose_from_pdb(template_pdb)

 #    # Create a Pose object from the new sequence
 #    extended_pose = pose_from_sequence(new_sequence)


 #    # Set the new sequence in the Pose object
 #    for i, residue in enumerate(new_sequence):
 #        if i < pose.total_residue():
 #            # Replace existing residues with new sequence residues
 #            pose.set_residue(i + 1, residue)
 #        else:
 #            # Insert new residues for the extended part of the sequence
 #            pose.append_residue_by_bond(residue)

 #    # Update PDBInfo to reflect new sequence length
 #    pdb_info = PDBInfo(pose)
 #    pdb_info.set_number(pose.total_residue())  # Update total residue count in PDBInfo

 #    # Example: Use LoopModeler to model missing loops or regions
 #    # Replace this with your specific modeling logic
 #    loop_modeler = LoopModeler()
 #    loop_modeler.set_loop_file("path/to/loop_file.txt")  # Provide loop definition file if needed
 #    loop_modeler.apply(pose)

In [None]:
# Extract necessary data
# NOTE(review): this cell rebinds cath_id, template_pdb and output_pdb, which the
# earlier remodelling / export cells also assign — re-running those cells after
# this one will pick up the values set here.
cath_id = '3be3A00'
new_sequence = 'QDFRPGVYRHYKGDHYLALGLARADETDEVVVVYTRLYARAGLPMSTRLLRIWNETVDTGAGPQPRFAYVGHVTPE'


new_fasta_seq_file = f"../data/{cath_id}/seqs/modified.fasta"
# Write sequences to modified.fasta
# NOTE(review): only the bare sequence is written (no '>' header line and no
# trailing newline) — confirm that whatever reads modified.fasta accepts that.
with open(new_fasta_seq_file, 'w') as fasta_file:
    fasta_file.write(new_sequence)

# Template structure to read and destination for the remodelled structure.
template_pdb = f"../data/{cath_id}/pdb/{cath_id}"
output_pdb = f"../data/{cath_id}/pdb/{cath_id}_remodeled.pdb"


# NOTE(review): the only definition of perform_homology_modeling visible in this
# notebook is commented out, so on a fresh kernel this call is expected to fail
# with NameError unless the function is defined or imported elsewhere — confirm.
perform_homology_modeling(cath_id, new_sequence, template_pdb, output_pdb)