In [1]:
from utils import *
import os
import shutil

import warnings

warnings.filterwarnings("ignore", message="Used element '.' for Atom")

# Data Loading

In [2]:
# Load the CATH domain table; the first CSV column is the row index.
data = pd.read_csv('../data/cath_w_seqs_share.csv', index_col=0)
data['cath_indices'] = data['cath_indices'].apply(safe_eval) # Convert `cath_indices` from str to list of tuples

# Map architecture IDs, keyed by (class, architecture), to names and integer labels
architecture_names = {
    (1, 10): {"name": "Mainly Alpha: Orthogonal Bundle", "label": 0},
    (1, 20): {"name": "Mainly Alpha: Up-down Bundle", "label": 1},
    (2, 30): {"name": "Mainly Beta: Roll", "label": 2},
    (2, 40): {"name": "Mainly Beta: Beta Barrel", "label": 3},
    (2, 60): {"name": "Mainly Beta: Sandwich", "label": 4},
    (3, 10): {"name": "Alpha Beta: Roll", "label": 5},
    (3, 20): {"name": "Alpha Beta: Alpha-Beta Barrel", "label": 6},
    (3, 30): {"name": "Alpha Beta: 2-Layer Sandwich", "label": 7},
    (3, 40): {"name": "Alpha Beta: 3-Layer(aba) Sandwich", "label": 8},
    (3, 90): {"name": "Alpha Beta: Alpha-Beta Complex", "label": 9}
}

# Build the (class, architecture) key once, column-wise, instead of running a
# row-wise DataFrame.apply twice. Wrapping the tuples in an object Series keeps
# each cell a single tuple and keeps the rows aligned on the existing index.
data['architecture_id'] = pd.Series(
    list(zip(data['class'], data['architecture'])),
    index=data.index,
    dtype=object,
)

# Derive the label from the key just built. A plain dict lookup is kept on
# purpose: an architecture missing from `architecture_names` still raises
# KeyError rather than silently producing NaN.
data['label'] = data['architecture_id'].map(lambda arch_id: architecture_names[arch_id]['label'])


data.head(10)

Unnamed: 0,cath_id,class,architecture,topology,superfamily,resolution_in_angstroms,pdb_id,sequences,cath_indices,architecture_id,label
0,2w3sB01,3,90,1170,50,2.6,2w3s,SVGKPLPHDSARAHVTGQARYLDDLPCPANTLHLAFGLSTEASAAI...,"[(2, 124)]","(3, 90)",9
1,3be3A00,2,30,30,320,2.04,3be3,QDFRPGVYRHYKGDHYLALGLARADETDEVVVVYTRLYARAGLPST...,"[(6, 81)]","(2, 30)",2
2,3zq4C03,3,10,20,580,3.0,3zq4,DIGNIVLRDRRILSEEGLVIVVVSIDMDDFKISAGPDLISRGFVIN...,"[(449, 555)]","(3, 10)",5
3,1peqA03,1,10,1650,20,2.8,1peq,DITFRLAKENAQMALFSPYDIQRRYGKPFGDIAISERYDELIADPH...,"[(294, 346)]","(1, 10)",0
4,1bdoA00,2,40,50,100,1.8,1bdo,EISGHIVRSPMVGTFYRTPSPDAKAFIEVGQKVNVGDTLCIVEAMK...,"[(77, 156)]","(2, 40)",3
5,3r0hG01,2,30,42,10,2.6,3r0h,TAEIKPNKKILIELKVEKKPMGVIVCGGKNNHVTTGCVITHVYPEG...,"[(479, 579)]","(2, 30)",2
6,1aqcA00,2,30,29,30,2.3,1aqc,EDLIDGIIFAANYLGSTQLLSDKTPSKNVRQAQEAVSRIKAQKLTE...,"[(324, 488)]","(2, 30)",2
7,2gnxA01,1,10,3450,30,2.45,2gnx,,"[(101, 295), (431, 440)]","(1, 10)",0
8,6hjfA02,3,10,310,10,1.7,6hjf,VPSFLYQQDVVVVLPKPYGEVRVDIAFGGNFFAIVPAEQLGIDISV...,"[(189, 367)]","(3, 10)",5
9,1o1zA00,3,20,20,190,1.6,1o1z,HHHHVIVLGHRGYSAKYLENTLEAFMKAIEAGANGVELDVRLSKDG...,"[(-3, 222)]","(3, 20)",6


# File Hierarchy Construction

In [3]:
# Build the per-domain working tree ../data/{cath_id}/{pdb,networkx,seqs} and
# copy each domain's PDB file from the shared folder into its own pdb/ directory.
# Only the `cath_id` column is needed, so iterate that column directly.
for cath_id in data['cath_id']:
    domain_root = f"../data/{cath_id}"

    # Create the pdb, networkx and seqs subdirectories if they don't exist
    for subdir in ("pdb", "networkx", "seqs"):
        os.makedirs(f"{domain_root}/{subdir}", exist_ok=True)

    # Copy file from ../data/pdb_share/{cath_id} to ../data/{cath_id}/pdb
    shutil.copyfile(f"../data/pdb_share/{cath_id}", f"{domain_root}/pdb/{cath_id}")

In [4]:
# def safe_len(x):
#     if isinstance(x, (list, tuple)):
#         return len(x)
#     return 0  # or you might want to return a different value for non-list/tuple entries

# subset_df = data[data['cath_indices'].apply(safe_len) > 1]
# subset_df

In [5]:
# # PDB: 4adi
# # cath_id: 4adiA01
# # iloc: 33
# # cath_indices: [(2, 30), (164, 198), (319, 331)]
# uniprot_seq = 'MASTTPITMEDLQKALEAQSRALRAGLAAGASQSRRPRPPRQRDSSTSGDDSGRDSGGPRRRRGNRGRGQRKDWSRAPPPPEERQESRSQTPAPKPSRAPPQQPQPPRMQTGRGGSAPRPELGPPTNPFQAAVARGLRPPLHDPDTEAPTEACVTSWLWSEGEGAVFYRVDLHFTNLGTPPLDEDGRWDPALMYNPCGPEPPAHVVRAYNQPAGDVRGVWGKGERTYAEQDFRVGGTRWHRLLRMPVRGLDGDTAPLPPHTTERIETRSARHPWRIRFGAPQAFLAGLLLAAVAVGTARAGLQPRADMAAPPMPPQPPRAHGQHYGHHHHQLPFLGHDGHHGGTLRVGQHHRNASDVLPGHWLQGGWGCYNLSDWHQGTHVCHTKHMDFWCVEHDRPPPATPTSLTTAANSTTAATPATAPPPCHAGLNDSCGGFLSGCGPMRLRHGADTRCGRLICGLSTTAQYPPTRFGCAMRWGLPPWELVVLTARPEDGWTCRGVPAHPGTRCPELVSPMGRATCSPASALWLATANALSLDHAFAAFVLLVPWVLIFMVCRRACRRRGAAAALTAVVLQGYNPPAYGEEAFTYLCTAPGCATQTPVPVRLAGVRFESKIVDGGCFAPWDLEATGACICEIPTDVSCEGLGAWVPTAPCARIWNGTQRACTFWAVNAYSSGGYAQLASYFNPGGSYYKQYHPTACEVEPAFGHSDAACWGFPTDTVMSVFALASYVQHPHKTVRVKFHTETRTVWQLSVAGVSCNVTTEHPFCNTPHGQLEVQVPPDPGDLVEYIMNYTGNQQSRWGLGSPNCHGPDWASPVCQRHSPDCSRLVGATPERPRLRLVDADDPLLRTAPGPGEVWVTPVIGSQARKCGLHIRAGPYGHATVEMPEWIHAHTTSDPWHPPGPLGLKFKTVRPVALPRALAPPRNVRVTGCYQCGTPALVEGLAPGGGNCHLTVNGEDVGAFPPGKFVTAALLNTPPPYQVSCGGESDRASARVIDPAAQSFTGVVYGTHTTAVSETRQTWAEWAAAHWWQLTLGAICALLLAGLLACCAKCLYYLRGAIAPR'
# print(uniprot_seq[1:29])
# print(uniprot_seq[163:197])
# print(uniprot_seq[318:330])

In [6]:
# data.iloc[33]['sequences']

In [7]:
# # PDB: 2gnx
# # cath_id: 2gnxA01
# # iloc: 7
# # cath_indices: [(101, 295), (431, 440)]
# seq = 'MGESIPLAAPVPVEQAVLETFFSHLGIFSYDKAKDNVEKEREANKSAGGSWLSLLAALAHLAAAEKVYHSLTYLGQKLGGQSFFSRKDSIRTIYTSLHNELKKVVAGRGAPGGTAPHVEELLPHLSEQLCFFVQARMEIADFYEKMYALSTQKFINTEELVSTLDTILRKYSSRFHHPILSPLESSFQLEVGVLSHLLKAQAQISEWKFLPSLVTLHNAHTKLQSWGQTFEKQRETKKHLFGGQSQKAVQPPHLFLWLMKLKTMLLAKFSFYFHEALSRQTTASEMKALTAKANPDLFGKISSFIRKYDAANVSLIFDNRGSESFQGHGYHHPHSYREAPKGVDQYPAVVSLPSDRPVMHWPNVIMIMTDRASDLNSLEKVVHFYDDKVQSTYFLTRPEPHFTIVVIFESKKSERDSHFISFLNELSLALKNPKVFASLKPGSKG'
# a = seq[100:294]
# b = seq[430:439]
# print(a + b)
# print(len(a+b))

In [8]:
# cath_indices = data.iloc[7]['cath_indices']
# print(cath_indices)

# Handling Missing Values

In [9]:
# Identify rows with any NaN values
rows_with_nan = data[data.isna().any(axis=1)]
rows_with_nan

Unnamed: 0,cath_id,class,architecture,topology,superfamily,resolution_in_angstroms,pdb_id,sequences,cath_indices,architecture_id,label
7,2gnxA01,1,10,3450,30,2.45,2gnx,,"[(101, 295), (431, 440)]","(1, 10)",0
178,19hcA01,3,90,10,10,1.8,19hc,CHQSETKERRECAGCHAITTPKDDEAWCATTVAPVSPMLAPYKVVI...,,"(3, 90)",9
2208,2fd4A00,3,30,40,110,1.8,2fd4,KLAALDPIASQFSQLRTISKALGFKDAADDVTHCLFGGELSLSNPD...,,"(3, 30)",7
2827,2xskA00,2,60,40,2420,1.7,2xsk,,"[(3, 97)]","(2, 60)",4
3138,5inwA02,2,30,39,10,2.7,5inw,WGNKFEPDLTKNVRFWVNSSYSMMVPTMHQRAKLSYTQDRKLRSTV...,,"(2, 30)",2
3838,1dkiC01,3,90,70,50,1.6,1dki,FARNEKEAKDSAITFIQKIKLDKVNLGGELSGSNMYVYNISTGGFV...,,"(3, 90)",9
4330,1kskA01,3,10,290,10,2.0,1ksk,GSHRLDKFIAQQLGVSRAIAGREIRGNRVTVDGEIVRNAAFKLLPE...,,"(3, 10)",5
4668,4dgwC00,2,60,40,2690,3.11,4dgw,,"[(101, 253)]","(2, 60)",4
5776,1nthA00,3,20,20,460,1.55,1nth,,"[(2, 458)]","(3, 20)",6
6010,3pieA02,3,30,1370,250,2.9,3pie,,"[(379, 487)]","(3, 30)",7


### CASE 1: Rows that have `sequences` but NaN `cath_indices` can be automatically corrected with residue indices present in the provided pdb files using a call to helper function `extract_resid_ranges()` which considers contiguous ranges

In [10]:
# Case 1: the sequence is present but the `cath_indices` entry is NaN
sequence_present = data['sequences'].notna()
indices_missing = data['cath_indices'].isna()
case1 = data.loc[sequence_present & indices_missing]
case1

Unnamed: 0,cath_id,class,architecture,topology,superfamily,resolution_in_angstroms,pdb_id,sequences,cath_indices,architecture_id,label
178,19hcA01,3,90,10,10,1.8,19hc,CHQSETKERRECAGCHAITTPKDDEAWCATTVAPVSPMLAPYKVVI...,,"(3, 90)",9
2208,2fd4A00,3,30,40,110,1.8,2fd4,KLAALDPIASQFSQLRTISKALGFKDAADDVTHCLFGGELSLSNPD...,,"(3, 30)",7
3138,5inwA02,2,30,39,10,2.7,5inw,WGNKFEPDLTKNVRFWVNSSYSMMVPTMHQRAKLSYTQDRKLRSTV...,,"(2, 30)",2
3838,1dkiC01,3,90,70,50,1.6,1dki,FARNEKEAKDSAITFIQKIKLDKVNLGGELSGSNMYVYNISTGGFV...,,"(3, 90)",9
4330,1kskA01,3,10,290,10,2.0,1ksk,GSHRLDKFIAQQLGVSRAIAGREIRGNRVTVDGEIVRNAAFKLLPE...,,"(3, 10)",5


In [11]:
# Threshold value for grouping contiguous residue ranges.
# Residue ranges are considered contiguous if the difference between consecutive
# residue IDs does not exceed this value. It directly influences the
# identification of gaps in the next section, so choose it carefully.
gap_threshold = 20

# Walk the case-1 rows, rebuild their residue ranges from the shared PDB file
# with the helper `extract_resid_ranges`, and write the result back into `data`.
for index, cath_id in case1['cath_id'].items():
    pdb_filename = f"../data/pdb_share/{cath_id}"
    cath_indices = extract_resid_ranges(pdb_filename, threshold=gap_threshold)

    # Replace the case-1 NaN in the main frame with the recovered ranges
    data.at[index, 'cath_indices'] = cath_indices
    print(f"Updated `cath_indices` for cath_id {cath_id}:\n{cath_indices}\n")

Updated `cath_indices` for cath_id 19hcA01:
[(100, 129), (168, 292)]

Updated `cath_indices` for cath_id 2fd4A00:
[(435, 553)]

Updated `cath_indices` for cath_id 5inwA02:
[(251, 348), (408, 416)]

Updated `cath_indices` for cath_id 1dkiC01:
[(1, 112), (154, 253)]

Updated `cath_indices` for cath_id 1kskA01:
[(1, 63)]



### CASE 2: Rows that have `cath_indices` but NaN `sequences` can be automatically corrected with the sequence present in UniProtKB
- Note: Helper function `get_sequence_from_pdb()` does not suffice here because of the presence of `UNK` resnames in case2 pdb files

In [12]:
# Case 2: `cath_indices` is present but the sequence is NaN
sequence_missing = data['sequences'].isna()
indices_present = data['cath_indices'].notna()
case2 = data.loc[sequence_missing & indices_present]
case2

Unnamed: 0,cath_id,class,architecture,topology,superfamily,resolution_in_angstroms,pdb_id,sequences,cath_indices,architecture_id,label
7,2gnxA01,1,10,3450,30,2.45,2gnx,,"[(101, 295), (431, 440)]","(1, 10)",0
2827,2xskA00,2,60,40,2420,1.7,2xsk,,"[(3, 97)]","(2, 60)",4
4668,4dgwC00,2,60,40,2690,3.11,4dgw,,"[(101, 253)]","(2, 60)",4
5776,1nthA00,3,20,20,460,1.55,1nth,,"[(2, 458)]","(3, 20)",6
6010,3pieA02,3,30,1370,250,2.9,3pie,,"[(379, 487)]","(3, 30)",7


In [13]:
# cath_indices = [(101, 295), (431, 440)]

# for idx_range in cath_indices:
#     start_idx, end_idx = idx_range
#     idx_range_set = set(range(start_idx, end_idx + 1))  # Create a set of the range for quick lookup
#     print(idx_range_set)


In [14]:
# from Bio.Align import PairwiseAligner

# def extract_aligned_segment(given_sequence, ground_truth_sequence):
#     """
#     Extracts the segment from the ground truth sequence that most aligns with the given sequence
#     and replaces the 'U' amino acids with the corresponding amino acids from the ground truth sequence.
    
#     Args:
#     given_sequence (str): The sequence to be aligned.
#     ground_truth_sequence (str): The reference sequence to align against.
    
#     Returns:
#     str: The given sequence with 'U' amino acids replaced by aligned ground truth amino acids.
#     """
    
#     # Initialize the aligner
#     aligner = PairwiseAligner()
#     aligner.mode = 'global'
    
#     # Perform the alignment
#     alignments = aligner.align(given_sequence.replace('U', ''), ground_truth_sequence)
    
#     # Get the best alignment
#     best_alignment = alignments[0]
    
#     # Extract the aligned sequences
#     aligned_given = best_alignment.aligned[0]
#     aligned_truth = best_alignment.aligned[1]
    
#     # Construct the aligned segment from the ground truth sequence
#     aligned_segment = ""
#     for i in range(len(aligned_given)):
#         start_given, end_given = aligned_given[i]
#         start_truth, end_truth = aligned_truth[i]
        
#         if given_sequence[start_given] != 'U':
#             aligned_segment += ground_truth_sequence[start_truth:end_truth]
#         else:
#             aligned_segment += given_sequence[start_given:end_given]

#     # Print the aligned segment
#     print(f"Aligned segment from the ground truth sequence: {aligned_segment}")
    
#     # Print the alignment for reference
#     print(best_alignment)

#     # Print detailed alignment positions and segments
#     print("Aligned positions in given sequence:")
#     for start, end in aligned_given:
#         print(f"Start: {start}, End: {end}, Segment: {given_sequence[start:end]}")
    
#     print("Aligned positions in ground truth sequence:")
#     for start, end in aligned_truth:
#         print(f"Start: {start}, End: {end}, Segment: {ground_truth_sequence[start:end]}")

#     return aligned_segment

# # Example usage
# ground_truth_sequence = "PHLSEQLCFFVQAREIADFYEKYALSTQKFINTEELVSTLDTILRKYSPLESSFQLEVGVLSHLLKAQAQISEWKFLPSLVTLHNAHTKLQSWGQTFEKQRPPHLFLWLKLKTLLAKFSFYFHEALSRQTTASEKALTAKANP"
#                         "PHLSEQLCFFVQAREIADFYEKYALSTQKFINTEELVSTLDTILRKYSPLESSFQLEVGVLSHLLKAQAQISEWKFLPSLVTLHNAHTKLQSWGQTFEKQRPPHLFLWLKLKTLLAKFSFYFHEALSRQTTASEKALTAKANP"
# given_sequence = "MGESIPLAAPVPVEQAVLETFFSHLGIFSYDKAKDNVEKEREANKSAGGSWLSLLAALAHLAAAEKVYHSLTYLGQKLGGQSFFSRKDSIRTIYTSLHNELKKVVAGRGAPGGTAPHVEELLPHLSEQLCFFVQARMEIADFYEKMYALSTQKFINTEELVSTLDTILRKYSSRFHHPILSPLESSFQLEVGVLSHLLKAQAQISEWKFLPSLVTLHNAHTKLQSWGQTFEKQRETKKHLFGGQSQKAVQPPHLFLWLMKLKTMLLAKFSFYFHEALSRQTTASEMKALTAKANPDLFGKISSFIRKYDAANVSLIFDNRGSESFQGHGYHHPHSYREAPKGVDQYPAVVSLPSDRPVMHWPNVIMIMTDRASDLNSLEKVVHFYDDKVQSTYFLTRPEPHFTIVVIFESKKSERDSHFISFLNELSLALKNPKVFASLKPGSKG"

# aligned_segment = extract_aligned_segment(given_sequence, ground_truth_sequence)


In [15]:
# Worked example for a single case-2 domain (2gnxA01): build a query sequence per
# `cath_indices` range from the PDB file, align each against the UniProt subject
# sequence, and store the concatenated aligned segments as the row's `sequences`.
try:
    result = []

    # Data accession
    pdb_code = '2gnx'
    cath_id = '2gnxA01'
    cath_indices = [(101, 295), (431, 440)]

    # Select the target row by its own `cath_id`. The update at the end of this
    # cell reuses this mask rather than an `index` variable left over from an
    # earlier loop, which would point at an unrelated row.
    target_row = data['cath_id'] == cath_id
    given_seq = data.loc[target_row, 'sequences'].item()  # ValueError unless exactly one row matches
    print(f'[CATH_ID] {cath_id}\n')

    # Web scraping fxn calls
    uniprot_accession = get_uniprot_accession_from_pdb(pdb_code)
    subject_sequence = get_fasta_sequence_from_uniprot(uniprot_accession)
    print(f'Subject:  (UniProt accession {uniprot_accession})\n{subject_sequence}\n')
    subject_file = f'../data/{cath_id}/seqs/subject.fasta'
    with open(subject_file, 'w') as t:
        t.write(subject_sequence)

    if pd.isna(given_seq):
        print(f'Given Seq:\nCase 2 - {given_seq}\nBuilding sequence from pdb_share...\n')
    else:
        print(f'Given Seq:\n{given_seq}\n')

    pdb_filename = f"../data/pdb_share/{cath_id}"
    for i, idx_range in enumerate(cath_indices):
        # The query sequence corresponds to the protein domain specific to the idx_range tuple
        query_sequence = get_domain_sequence_from_pdb(pdb_filename, idx_range, cath_id)
        print(f'\nQuery Sequence {i+1}:  {idx_range}\n{query_sequence}\n')
        query_file = f'../data/{cath_id}/seqs/query_{i+1}.fasta'
        with open(query_file, 'w') as q:
            q.write(query_sequence)

        # Perform sequence alignment for this domain segment using NCBI Blast+
        output_file = f'../data/{cath_id}/seqs/output_{i+1}.txt'
        aligned_segment = extract_aligned_segment(query_sequence, subject_sequence, query_file, subject_file, output_file)
        print(f'Alignment Segment (from subject):\n{aligned_segment}\n')

        # Error Handling - Sequence alignment w/ BLAST+ sometimes leaves out amino acids. If not handled, this will corrupt remodeling.
        # The range is inclusive on both ends, hence the +1.
        expected_num_AAs = idx_range[1] - idx_range[0] + 1
        if len(aligned_segment) != expected_num_AAs:
            print(f"WARNING: Length of aligned segment [{len(aligned_segment)}] does not match the expected number of amino acids [{expected_num_AAs}]\n")
            # Manual sequence alignment
            aligned_segment = manual_alignment(query_sequence, aligned_segment, subject_sequence)

        result.append(aligned_segment)

    # Concatenate the indexed fasta sequences if len(cath_indices) > 1
    updated_sequence = ''.join(result)

    # Update the data frame to remove the case2 NaN value for this cath_id only
    data.loc[target_row, 'sequences'] = updated_sequence
    print('[REPAIRED]')
    print(f"Updated FASTA sequence for CATH ID {cath_id} :\n{updated_sequence}\n")

except Exception as e:
    # Best-effort cell: report the failure and let the rest of the notebook run.
    print(f"Error: {str(e)}")

[CATH_ID] 2gnxA01

Subject:  (UniProt accession Q6P1I3)
MGESIPLAAPVPVEQAVLETFFSHLGIFSYDKAKDNVEKEREANKSAGGSWLSLLAALAHLAAAEKVYHSLTYLGQKLGGQSFFSRKDSIRTIYTSLHNELKKVVAGRGAPGGTAPHVEELLPHLSEQLCFFVQARMEIADFYEKMYALSTQKFINTEELVSTLDTILRKYSSRFHHPILSPLESSFQLEVGVLSHLLKAQAQISEWKFLPSLVTLHNAHTKLQSWGQTFEKQRETKKHLFGGQSQKAVQPPHLFLWLMKLKTMLLAKFSFYFHEALSRQTTASEMKALTAKANPDLFGKISSFIRKYDAANVSLIFDNRGSESFQGHGYHHPHSYREAPKGVDQYPAVVSLPSDRPVMHWPNVIMIMTDRASDLNSLEKVVHFYDDKVQSTYFLTRPEPHFTIVVIFESKKSERDSHFISFLNELSLALKNPKVFASLKPGSKG

Given Seq:
Case 2 - nan
Building sequence from pdb_share...

[GAP DETECTED for cath_id 2gnxA01]
Missing residues in range 101 - 295: [122, 137, 146, 173, 174, 175, 176, 177, 178, 179, 180, 181, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 259, 264, 286]

Repairing...

Query Sequence 1:  (101, 295)
UUUUUUUUUUUUUUUUUUUUUUPHLSEQLCFFVQARUEIADFYEKUYALSTQKFINTEELVSTLDTILRKYSUUUUUUUUUPLESSFQLEVGVLSHLLKAQAQISEWKFLPSLVTLHNAHTKLQSWGQTFEKQRUUUUUUUUUUUUUUUUPPHLFLWLUKLKT

In [None]:
'''
Iterate over case2, apply get_fasta_sequence_from_uniprot, and update data

The indexing approach maps the provided `cath_indices` onto the UniProtKB sequence to extract the relevant segment
'''
# For each case-2 row: look up the UniProt accession for its PDB code, fetch the
# UniProt sequence, build the sequence available in the shared PDB file, and
# print the segment returned by `extract_aligned_segment`.
for index, row in case2.iterrows():
    try:
        # Data accession
        pdb_code = row['pdb_id']
        # cath_indices = row['cath_indices']
        cath_id = row['cath_id']


        # Web scraping fxn calls
        uniprot_accession = get_uniprot_accession_from_pdb(pdb_code)
        ground_truth_fasta_sequence = get_fasta_sequence_from_uniprot(uniprot_accession)

        pdb_filename = f"../data/pdb_share/{cath_id}"
        available_fasta_sequence = get_sequence_from_pdb(pdb_filename)

        # NOTE(review): `extract_aligned_segment` is called with two arguments here,
        # but with five (query, subject, query_file, subject_file, output_file) in
        # the 2gnxA01 worked-example cell - confirm which signature `utils` exposes.
        aligned_segment = extract_aligned_segment(available_fasta_sequence, ground_truth_fasta_sequence)
        print(aligned_segment)

        # NOTE(review): the header says "update data", but nothing is written back
        # to `data` in this loop yet; the aligned segment is only printed.

    except Exception as e:
        # Catches any failure in the body (not only the fetch), and reports it
        # against the current PDB code without stopping the loop.
        print(f"Failed to fetch sequence for PDB code {pdb_code}: {str(e)}")

# Handling Sequence Gaps
- Phase 1: Detect Gaps
- Phase 2: Replace Gaps in `sequences` col w/ AAs known by UniProtKB
- Phase 3: Protein domain remodeling w/ SCWRL4
  #### Note: Case 2 data entries already have gaps corrected by the previous UniProtKB look up and do not need Phase 2 processing. They still require remodeling, however.

In [None]:
# example_gaps = {'3zq4C03': [('C', 493, 501)],
#                 '3i9v600': [('6', 58, 74)]}

In [None]:
# Add a column to track whether or not a given sequence entry has a gap
for index, row in data.iterrows():
    cath_indices = row['cath_indices'] # grab the indices
    cath_id = row['cath_id']
    
    # Open the PDB file
    pdb_filename = f"../data/pdb_share/{cath_id}"
    gap = detect_seq_gaps(pdb_filename, cath_indices, cath_id)

    data.at[index, 'gap'] = int(gap) # Will be 1 or 0

### Case study 1

In [None]:
# Provided sequence for 3be3A00 and its length
is_target = data['cath_id'] == '3be3A00'
given_seq = data.loc[is_target, 'sequences'].item()
print(given_seq)
print(len(given_seq))

In [None]:
# Case study 1: this accession originally has both `cath_indices` and `sequences`.
#
#   PDB:               3be3
#   cath_id:           3be3A00
#   iloc:              1
#   cath_indices:      [(6, 81)]
#   sequences (given): QDFRPGVYRHYKGDHYLALGLARADETDEVVVVYTRLYARAGLPSTRLLRIWNETVDTGAGPQPRFAYVGHVTPE
#
# The PDB file is missing resid 50, so the entry has a gap:
#   [GAP DETECTED for cath_id 3be3A00] Missing residues in range 6 - 81: [50]

uniprot_seq = 'MAMQDFRPGVYRHYKGDHYLALGLARADETDEVVVVYTRLYARAGLPMSTRLLRIWNETVDTGAGPQPRFAYVGHVTPEQG'

# Direct mapping of (6, 81) onto the UniProtKB string (1-based inclusive -> 0-based slice)
direct_mapping = uniprot_seq[5:81]
print(direct_mapping)
print(len(uniprot_seq))

# Window that starts at the same residues as the provided sequence ("QDFRPG...")
actual = uniprot_seq[3:79]
print(f"{actual}\n{len(actual)}")

In [None]:
# provided (Missing residue 50)
# QDFRPGVYRHYKGDHYLALGLARADETDEVVVVYTRLYARAGLPSTRLLRIWNETVDTGAGPQPRFAYVGHVTPE

# UniProtKB GT
# MAMQDFRPGVYRHYKGDHYLALGLARADETDEVVVVYTRLYARAGLPMSTRLLRIWNETVDTGAGPQPRFAYVGHVTPEQG

# extracted
# QDFRPGVYRHYKGDHYLALGLARADETDEVVVVYTRLYARAGLPMSTRLLRIWNETVDTGAGPQPRFAYVGHVTPE

### Case study 2 - Indexing Worked

In [None]:
# Provided sequence for 1vkwA02
data.loc[data['cath_id'] == '1vkwA02', 'sequences'].item()

In [None]:
# Case study 2: this accession originally has both `cath_indices` and `sequences`.
#
#   PDB:               1vkw
#   cath_id:           1vkwA02
#   iloc:              6222
#   cath_indices:      [(111, 206)]
#   sequences (given): 'FTRKRRPITSFLENDLEELPPEIVKIVETILAPSALNRQPWKIKYTGGELCISSERPVDLGIALSHAYLTAREIFKREPVIQKRGEDTYCLILNP'
#
# The PDB file is missing resid 139, so the entry has a gap:
#   [GAP DETECTED for cath_id 1vkwA02] Missing residues in range 111 - 206: [139]

uniprot_seq = 'MNIFEAIENRHSVRDFLERKMPERVKDDIENLLVKFITKKLDWKINLSSFPSYIYAKAEKHFDELVEYGFQGEQIVLFLTAQGFGTCWMARSPHPDVPYIIVFGYPRTRNFTRKRRPITSFLENDLEELPPEIVKIVEMTILAPSALNRQPWKIKYTGGELCISSERPVDLGIALSHAYLTAREIFKREPVIQKRGEDTYCLILNP'

# 1-based inclusive CATH range -> 0-based half-open Python slice
domain_start, domain_end = 111, 206
corrected = uniprot_seq[domain_start - 1:domain_end]
print(corrected)
print(len(corrected))

In [None]:
# Length of the provided 1vkwA02 sequence, for comparison with the UniProtKB-derived segment
line = 'FTRKRRPITSFLENDLEELPPEIVKIVETILAPSALNRQPWKIKYTGGELCISSERPVDLGIALSHAYLTAREIFKREPVIQKRGEDTYCLILNP'
print(len(line))

### Case study 3 - Indexing Worked

In [None]:
# Provided sequence for 3ltiA01 and its length
is_target = data['cath_id'] == '3ltiA01'
given_seq = data.loc[is_target, 'sequences'].item()
print(given_seq)
print(len(given_seq))

In [None]:
# This accession originally has both `cath_indices` and `sequences`

# PDB: 3lti
# cath_id: 3ltiA01
# iloc: 6127
# cath_indices: [(152, 224), (343, 443)]
# sequences (given): 
# SPGVFFDSDVLYNARIIPYRGSWLDFEFDPKDNLFVRIDRRRKLPATIILRALNYTTEQILDLFHGPYISETLRVDPTNDRLSALVEIYRRPGEPPTREAAESLFENLFFSEDRYDLSAVGRFNRSLLREEIEGSGILSDDIIDVLIDIRNGGEVD

# It is missing many residues, and thus has a gap

# [GAP DETECTED for cath_id 3ltiA01] Missing residues in range 152 - 224: [161, 162, 163, 164, 165, 166, 167, 168, 169]
# [GAP DETECTED for cath_id 3ltiA01] Missing residues in range 343 - 443: [369, 370, 403, 404, 422, 429, 430, 431, 439]

uniprot_seq = 'MVYSYTEKKRIRKDFGKRPQVLDVPYLLSIQLDSFQKFIEQDPEGQYGLEAAFRSVFPIQSYSGNSELQYVSYRLGEPVFDVQECQIRGVTYSAPLRVKLRLVIYEREAPEGTVKDIKEQEVYMGEIPLMTDNGTFVINGTERVIVSQLHRSPGVFFDSDKGKTHSSGKVLYNARIIPYRGSWLDFEFDPKDNLFVRIDRRRKLPATIILRALNYTTEQILDLFFEKVIFEIRDNKLQMELVPERLRGETASFDIEANGKVYVEKGRRITARHIRQLEKDDVKLIEVPVEYIAGKVVAKDYIDESTGELICAANMELSLDLLAKLSQSGHKRIETLFTNDLDHGPYISETLRVDPTNDRLSALVEIYRMMRPGEPPTREAAESLFENLFFSEDRYDLSAVGRMKFNRSLLREEIEGSGILSKDDIIDVMKKLIDIRNGKGEVDDIDHLGNRRIRSVGEMAENQFRVGLVRVERAVKERLSLGDLDTLMPQDMINAKPISAAVKEFFGSSQLSQFMDQNNPLSEITHKRRISALGPGGLTRERAGFEVRDVHPTHYGRVCPIETPEGPNIGLINSLSVYAQTNEYGFLETPYRKVTDGVVTDEIHYLSAIEEGNYVIAQANSNLDEEGHFVEDLVTCRSKGESSLFSRDQVDYMDVSTQQVVSVGASLIPFLEHDDANRALMGANMQRQAVPTLRADKPLVGTGMERAVAVDSGVTAVAKRGGVVQYVDASRIVIKVNEDEMYPGEAGIDIYNLTKYTRSNQNTCINQMPCVSLGEPVERGDVLADGPSTDLGELALGQNMRVAFMPWNGYNFEDSILVSERVVQEDRFTTIHIQELACVSRDTKLGPEEITADIPNVGEAALSKLDESGIVYIGAEVTGGDILVGKVTPKGETQLTPEEKLLRAIFGEKASDVKDSSLRVPNGVSGTVIDVQVFTRDGVEKDKRALEIEEMQLKQAKKDLSEELQILEAGLFSRIRAVLVAGGVEAEKLDKLPRDRWLELGLTDEEKQNQLEQLAEQYDELKHEFEKKLEAKRRKITQGDDLAPGVLKIVKVYLAVKRRIQPGDKMAGRHGNKGVISKINPIEDMPYDENGTPVDIVLNPLGVPSRMNIGQILETHLGMAAKGIGDKINAMLKQQQEVAKLREFIQRAYDLGADVRQKVDLSTFSDEEVMRLAENLRKGMPIATPVFDGAKEAEIKELLKLGDLPTSGQIRLYDGRTGEQFERPVTVGYMYMLKLNHLVDDKMHARSTGSYSLVTQQPLGGKAQFGGQRFGEMEVWALEAYGAAYTLQEMLTVKSDDVNGRTKMYKNIVDGNHQMEPGMPESFNVLLKEIRSLGINIELEDE'
# Stitch the two segments together: each 1-based inclusive range (first, last)
# becomes the 0-based half-open slice [first - 1:last].
segment_bounds = [(152, 224), (343, 443)]
corrected = ''.join(uniprot_seq[first - 1:last] for first, last in segment_bounds)
print(corrected)
print(len(corrected))

### Case Study 4 - Case 2 Entry (NaN `sequences`)

In [None]:
# 4dgwC00 is a case-2 entry, so its `sequences` value can still be NaN here.
given_seq = data.loc[data['cath_id'] == '4dgwC00', 'sequences'].item()
print(given_seq)

# len() raises TypeError on a NaN float, so only measure a real sequence.
if pd.isna(given_seq):
    print('No sequence available (NaN)')
else:
    print(len(given_seq))

In [None]:
# This accession is CASE 2: Rows that have cath_indices but NaN sequences

# PDB: 4dgw
# cath_id: 4dgwC00
# iloc: 4668
# cath_indices: [(101, 253)]		
# sequences (given): NaN

# It is missing many residues, and thus has a gap
# [GAP DETECTED for cath_id 4dgwC00] Missing residues in range 101 - 253: [137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 153, 226]

uniprot_seq = 'MNLLETRRSLLEEMEIIENAIAERIQRNPELYYHYIQESSKVFPDTKLPRSSLIAENKIYKFKKVKRKRKQIILQQHEINIFLRDYQEKQQTFNKINRPEETQEDDKDLPNFERKLQQLEKELKNEDENFELDINSKKDKYALFSSSSDPSRRTNILSDRARDLDLNEIFTRDEQYGEYMELEQFHSLWLNVIKRGDCSLLQFLDILELFLDDEKYLLTPPMDRKNDRYMAFLLKLSKYVETFFFKSYALLDAAAVENLIKSDFEHSYCRGSLRSEAKGIYCPFCSRWFKTSSVFESHLVGKIHKKNESKRRNFVYSEYKLHRYLKYLNDEFSRTRSFVERKLAFTANERMAEMDILTQKYEAPAYDSTEKEGAEQVDGEQRDGQLQEEHLSGKSFDMPLGPDGLPMPYWLYKLHGLDREYRCEICSNKVYNGRRTFERHFNEERHIYHLRCLGIEPSSVFKGITKIKEAQELWKNMQGQSQLTSIAAVPPKPNPSQLKVPTELELEEEDEEGNVMSKKVYDELKKQGLV'

# Map the 1-based inclusive range (101, 253) onto the 0-based UniProtKB string
range_start, range_end = 101, 253
actual = uniprot_seq[range_start - 1:range_end]
print(actual)
print(len(actual))

In [None]:
# Fetch the UniProt sequence for one PDB code and cut out the segments named by
# `cath_indices` (1-based inclusive ranges -> 0-based half-open slices).

# Data accession. Both values are bound before the `try` so the handler below
# can always name the PDB code, and so the cell does not depend on a
# `cath_indices` value left in the kernel by an earlier cell.
pdb_code = '2gnx'
# Ranges of the 2gnxA01 row in `data`.
# NOTE(review): this cell sits under Case Study 4 (4dgwC00, ranges [(101, 253)])
# but queries PDB code 2gnx - confirm which entry is intended.
cath_indices = [(101, 295), (431, 440)]

try:
    # Web scraping fxn calls
    uniprot_accession = get_uniprot_accession_from_pdb(pdb_code)
    fasta_sequence = get_fasta_sequence_from_uniprot(uniprot_accession)
    print(fasta_sequence)

    # Concatenate the indexed fasta sequences if len(cath_indices) > 1
    indexed_sequence = ''.join([fasta_sequence[start-1:end] for start, end in cath_indices])

    print(f"Updated FASTA sequence for PDB code {pdb_code} (UniProt accession {uniprot_accession}):\n{indexed_sequence}\n")

except Exception as e:
    # Best-effort cell: report the failure and let the rest of the notebook run.
    print(f"Failed to fetch sequence for PDB code {pdb_code}: {str(e)}")

In [None]:
import Bio

from Bio import SeqIO
from io import StringIO
from Bio.PDB.Polypeptide import protein_letters_3to1

def get_sequence_from_pdb(pdb_filename):
    """
    Build a one-letter amino-acid string from the residues of a PDB file.

    Args:
        pdb_filename: Path of the PDB file to parse; also used as the structure id.

    Returns:
        str: One character per residue whose id has a blank first field, in file
        order. Residue names without an entry in `protein_letters_3to1` are
        written as 'U'.

    Raises:
        AssertionError: If the parsed structure does not contain exactly one model.

    Residues whose id has a non-blank first field are not added to the sequence;
    their id is printed with the prefix 'nonstandard' instead.
    """
    structure = Bio.PDB.PDBParser().get_structure(pdb_filename, pdb_filename)
    assert len(structure) == 1

    every_residue = (
        residue
        for model in structure
        for chain in model
        for residue in chain
    )

    letters = []
    for residue in every_residue:
        residue_id = residue.get_id()
        if residue_id[0] != " ":
            # Non-blank first id field: report it and leave it out of the sequence
            print('nonstandard', residue.get_id())
            continue
        # Unknown three-letter names fall back to the placeholder 'U'
        letters.append(protein_letters_3to1.get(residue.get_resname(), 'U'))

    return ''.join(letters)


In [None]:
# Build the sequence for the 4dgwC00 domain from its shared PDB file
cath_id = '4dgwC00'

# Path of the PDB file for this domain
pdb_filename = f"../data/pdb_share/{cath_id}"
res = get_sequence_from_pdb(pdb_filename)


In [None]:
# Display the sequence returned by `get_sequence_from_pdb` for 4dgwC00
res

In [None]:
# Isolate data entries with sequence gaps
gapped_data = data[data['gap'] == 1]
gapped_data

In [None]:
# Load complete sequences from UniProt

In [None]:
# Edit gap sequences in the main data

In [None]:
# Confirm that all sequence gaps have been removed

In [None]:
# Add sequence motifs

# Structure Data

In [None]:
# Obtain and Store PDB files for non-gap sequences

In [None]:
# Obtain PDB files for gap sequences

In [None]:
# Remodel using SCWRL4 and Store

# Graph Database

In [None]:
# Create NetworkX graphical representations of stored PDB files

In [None]:
# Split data into train/test splits based on homological similarity