In [1]:
from utils import *
import os
import shutil

import warnings

warnings.filterwarnings("ignore", message="Used element '.' for Atom")

# Data Loading

In [2]:
data = pd.read_csv('../data/cath_w_seqs_share.csv', index_col=0)
data['cath_indices'] = data['cath_indices'].apply(safe_eval) # Convert `cath_indices` from str to list of tuples

# Map architecture IDs to names and labels
architecture_names = {
    (1, 10): {"name": "Mainly Alpha: Orthogonal Bundle", "label": 0},
    (1, 20): {"name": "Mainly Alpha: Up-down Bundle", "label": 1},
    (2, 30): {"name": "Mainly Beta: Roll", "label": 2},
    (2, 40): {"name": "Mainly Beta: Beta Barrel", "label": 3},
    (2, 60): {"name": "Mainly Beta: Sandwich", "label": 4},
    (3, 10): {"name": "Alpha Beta: Roll", "label": 5},
    (3, 20): {"name": "Alpha Beta: Alpha-Beta Barrel", "label": 6},
    (3, 30): {"name": "Alpha Beta: 2-Layer Sandwich", "label": 7},
    (3, 40): {"name": "Alpha Beta: 3-Layer(aba) Sandwich", "label": 8},
    (3, 90): {"name": "Alpha Beta: Alpha-Beta Complex", "label": 9}
}

# Create 'architecture_id' and 'label' columns directly from architecture_names
data['architecture_id'] = data.apply(lambda row: (row['class'], row['architecture']), axis=1)
data['label'] = data.apply(lambda row: architecture_names[(row['class'], row['architecture'])]['label'], axis=1)


data.head(10)

Unnamed: 0,cath_id,class,architecture,topology,superfamily,resolution_in_angstroms,pdb_id,sequences,cath_indices,architecture_id,label
0,2w3sB01,3,90,1170,50,2.6,2w3s,SVGKPLPHDSARAHVTGQARYLDDLPCPANTLHLAFGLSTEASAAI...,"[(2, 124)]","(3, 90)",9
1,3be3A00,2,30,30,320,2.04,3be3,QDFRPGVYRHYKGDHYLALGLARADETDEVVVVYTRLYARAGLPST...,"[(6, 81)]","(2, 30)",2
2,3zq4C03,3,10,20,580,3.0,3zq4,DIGNIVLRDRRILSEEGLVIVVVSIDMDDFKISAGPDLISRGFVIN...,"[(449, 555)]","(3, 10)",5
3,1peqA03,1,10,1650,20,2.8,1peq,DITFRLAKENAQMALFSPYDIQRRYGKPFGDIAISERYDELIADPH...,"[(294, 346)]","(1, 10)",0
4,1bdoA00,2,40,50,100,1.8,1bdo,EISGHIVRSPMVGTFYRTPSPDAKAFIEVGQKVNVGDTLCIVEAMK...,"[(77, 156)]","(2, 40)",3
5,3r0hG01,2,30,42,10,2.6,3r0h,TAEIKPNKKILIELKVEKKPMGVIVCGGKNNHVTTGCVITHVYPEG...,"[(479, 579)]","(2, 30)",2
6,1aqcA00,2,30,29,30,2.3,1aqc,EDLIDGIIFAANYLGSTQLLSDKTPSKNVRQAQEAVSRIKAQKLTE...,"[(324, 488)]","(2, 30)",2
7,2gnxA01,1,10,3450,30,2.45,2gnx,,"[(101, 295), (431, 440)]","(1, 10)",0
8,6hjfA02,3,10,310,10,1.7,6hjf,VPSFLYQQDVVVVLPKPYGEVRVDIAFGGNFFAIVPAEQLGIDISV...,"[(189, 367)]","(3, 10)",5
9,1o1zA00,3,20,20,190,1.6,1o1z,HHHHVIVLGHRGYSAKYLENTLEAFMKAIEAGANGVELDVRLSKDG...,"[(-3, 222)]","(3, 20)",6


# File Hierarchy Construction

In [3]:
for index, row in data.iterrows():
    cath_id = row['cath_id']
    
    # Create ../data/{cath_id}/pdb directory if it doesn't exist
    pdb_dir = f"../data/{cath_id}/pdb"
    os.makedirs(pdb_dir, exist_ok=True)
    
    # Create ../data/{cath_id}/networkx directory if it doesn't exist
    networkx_dir = f"../data/{cath_id}/networkx"
    os.makedirs(networkx_dir, exist_ok=True)

    # Create ../data/{cath_id}/networkx directory if it doesn't exist
    seq_dir = f"../data/{cath_id}/seqs"
    os.makedirs(seq_dir, exist_ok=True)
    
    # Copy file from ../data/pdb_share/{cath_id} to ../data/{cath_id}/pdb
    source_file = f"../data/pdb_share/{cath_id}"
    dest_file = f"../data/{cath_id}/pdb/{cath_id}"
    shutil.copyfile(source_file, dest_file)

In [4]:
cath_id = '1qouB00'
dest_file = f"../data/{cath_id}/pdb/{cath_id}"

# Remove amino acid ALT LOCs, should they occur in any destination pdb files
with open(dest_file, 'r') as file:
    lines = file.readlines()

# Extract all residue identifiers
resids = []
for line in lines:
    if line.startswith("ATOM"):
        resid = line[22:30].strip()
        resids.append(resid)

# Determine unique values with any alt locs
unique_resids = sorted(set(resids), key=lambda x: int(x[:-1]) if x[-1].isalpha() else int(x))

# Create a list of elements that end with a letter
alt_locs = [resid for resid in unique_resids if resid[-1].isalpha()]

# Check if any element ends with a letter
contains_alt_loc = len(alt_locs) > 0

if contains_alt_loc:
    print(f'Renumbering Alt Locs detected in PDB File {cath_id}:\n{alt_locs}')
    corrected_resids = correct_alt_locs(dest_file, unique_resids)
    renumber_pdb_file(corrected_resids, dest_file)

Renumbering Alt Locs detected in PDB File 1qouB00:
['32C', '32B', '94A', '129A']


# Handling Missing Values

In [5]:
# Identify rows with any NaN values
rows_with_nan = data[data.isna().any(axis=1)]
rows_with_nan

Unnamed: 0,cath_id,class,architecture,topology,superfamily,resolution_in_angstroms,pdb_id,sequences,cath_indices,architecture_id,label
7,2gnxA01,1,10,3450,30,2.45,2gnx,,"[(101, 295), (431, 440)]","(1, 10)",0
178,19hcA01,3,90,10,10,1.8,19hc,CHQSETKERRECAGCHAITTPKDDEAWCATTVAPVSPMLAPYKVVI...,,"(3, 90)",9
2208,2fd4A00,3,30,40,110,1.8,2fd4,KLAALDPIASQFSQLRTISKALGFKDAADDVTHCLFGGELSLSNPD...,,"(3, 30)",7
2827,2xskA00,2,60,40,2420,1.7,2xsk,,"[(3, 97)]","(2, 60)",4
3138,5inwA02,2,30,39,10,2.7,5inw,WGNKFEPDLTKNVRFWVNSSYSMMVPTMHQRAKLSYTQDRKLRSTV...,,"(2, 30)",2
3838,1dkiC01,3,90,70,50,1.6,1dki,FARNEKEAKDSAITFIQKIKLDKVNLGGELSGSNMYVYNISTGGFV...,,"(3, 90)",9
4330,1kskA01,3,10,290,10,2.0,1ksk,GSHRLDKFIAQQLGVSRAIAGREIRGNRVTVDGEIVRNAAFKLLPE...,,"(3, 10)",5
4668,4dgwC00,2,60,40,2690,3.11,4dgw,,"[(101, 253)]","(2, 60)",4
5776,1nthA00,3,20,20,460,1.55,1nth,,"[(2, 458)]","(3, 20)",6
6010,3pieA02,3,30,1370,250,2.9,3pie,,"[(379, 487)]","(3, 30)",7


### CASE 1: Rows that have `sequences` but NaN `cath_indices` can be automatically corrected with residue indices present in the provided pdb files using a call to helper function `extract_resid_ranges()` which considers contiguous ranges of residues when assigning index tuples to `cath_indices`

In [6]:
case1 = data[data['sequences'].notna() & data['cath_indices'].isna()]
case1

Unnamed: 0,cath_id,class,architecture,topology,superfamily,resolution_in_angstroms,pdb_id,sequences,cath_indices,architecture_id,label
178,19hcA01,3,90,10,10,1.8,19hc,CHQSETKERRECAGCHAITTPKDDEAWCATTVAPVSPMLAPYKVVI...,,"(3, 90)",9
2208,2fd4A00,3,30,40,110,1.8,2fd4,KLAALDPIASQFSQLRTISKALGFKDAADDVTHCLFGGELSLSNPD...,,"(3, 30)",7
3138,5inwA02,2,30,39,10,2.7,5inw,WGNKFEPDLTKNVRFWVNSSYSMMVPTMHQRAKLSYTQDRKLRSTV...,,"(2, 30)",2
3838,1dkiC01,3,90,70,50,1.6,1dki,FARNEKEAKDSAITFIQKIKLDKVNLGGELSGSNMYVYNISTGGFV...,,"(3, 90)",9
4330,1kskA01,3,10,290,10,2.0,1ksk,GSHRLDKFIAQQLGVSRAIAGREIRGNRVTVDGEIVRNAAFKLLPE...,,"(3, 10)",5


In [7]:
'''
Define the threshold value for grouping contiguous residue ranges.
Residue ranges are considered contiguous if the difference between consecutive residue IDs
does not exceed this threshold value.
This threshold value directly influences the identification of gaps in the next section, so it is important 
to choose this number carefully.
'''
gap_threshold = 20

# Iterate over case1, apply get_sequence_from_pdb, and update data
for index, row in case1.iterrows():
    # Data accession
    cath_id = row['cath_id']
    pdb_filename = f"../data/{cath_id}/pdb/{cath_id}"

    # Call provided helper fxn
    cath_indices = extract_resid_ranges(pdb_filename, threshold=gap_threshold)

    # Update the data frame to remove case2 NaN values
    data.at[index, 'cath_indices'] = cath_indices
    print(f"Updated `cath_indices` for cath_id {cath_id}:\n{cath_indices}\n")

Updated `cath_indices` for cath_id 19hcA01:
[(100, 129), (168, 292)]

Updated `cath_indices` for cath_id 2fd4A00:
[(435, 553)]

Updated `cath_indices` for cath_id 5inwA02:
[(251, 348), (408, 416)]

Updated `cath_indices` for cath_id 1dkiC01:
[(1, 112), (154, 253)]

Updated `cath_indices` for cath_id 1kskA01:
[(1, 63)]



### CASE 2: Rows that have `cath_indices` but NaN `sequences` can be automatically corrected with the sequence present in UniProtKB
- Note: Helper function `get_sequence_from_pdb()` does not suffice here because of the prescence of `UNK` resnames in case2 pdb files

In [8]:
case2 = data[data['sequences'].isna() & data['cath_indices'].notna()]
case2

Unnamed: 0,cath_id,class,architecture,topology,superfamily,resolution_in_angstroms,pdb_id,sequences,cath_indices,architecture_id,label
7,2gnxA01,1,10,3450,30,2.45,2gnx,,"[(101, 295), (431, 440)]","(1, 10)",0
2827,2xskA00,2,60,40,2420,1.7,2xsk,,"[(3, 97)]","(2, 60)",4
4668,4dgwC00,2,60,40,2690,3.11,4dgw,,"[(101, 253)]","(2, 60)",4
5776,1nthA00,3,20,20,460,1.55,1nth,,"[(2, 458)]","(3, 20)",6
6010,3pieA02,3,30,1370,250,2.9,3pie,,"[(379, 487)]","(3, 30)",7


# Handling Sequence Gaps
- Phase 1: Detect Gaps
- Phase 2: Replace Gaps in `sequences` col w/ AAs known by UniProtKB
- Phase 3: Protein domain remodeling w/ SCWRL4
  ##### Note: Case 2 data entries can be treated at the same time as sequence gaps because the NaN sequences for Case 2 entries, when filled from pdb_share, may still contain sequence gaps. The process for resolving Case 2 sequence gaps is the same for the larger data set. We have killed 2 birds with one stone!


In [9]:
edge_cases = ['1qouB00']

In [10]:
duplicate_pdb_ids = set(data.groupby('pdb_id').filter(lambda x: len(x) > 1)['pdb_id']) # There are 720 PDB IDs with duplicate entries

with open('data_processing_main_loop_output.txt', 'w') as output_file: # Open a file for all output to reduce I/O burden
    for index, row in data.iterrows():
        try:
            result = []
            
            # Data accession
            pdb_code = row['pdb_id']
            cath_id = row['cath_id']
            cath_indices = row['cath_indices']
            given_seq = row['sequences']
            print(f'[CATH_ID] {cath_id} ({index+1} / {len(data)})\n', file=output_file)
            
            # Web scraping fxn calls (UniProtKB and RCSB PDB)
            try:
                uniprot_accession = get_uniprot_accession_from_pdb(pdb_code, duplicate_pdb_ids, output_file)
                if uniprot_accession:
                    # `uniprot_accession` is returned as a list from `get_uniprot_accession_from_pdb` if 
                    # 1) Your `pdb_code` reoccurs throughout the data set (duplicates), and multiple UniProt IDs are associated with it. 
                    #    This is logical for well studied proteins.
                    # 2) [EDGE CASE] Your `pdb_code` is unique in the data set, but multiple UniProt IDs are associated with it.
                    if isinstance(uniprot_accession, list): 
                        print(f'Identifying the correct subject sequence for [CATH ID] {cath_id}.\n', file=output_file)
                        if len(uniprot_accession) == 1: # 1 Uniprot ID represents mutltiple data entries
                            uniprot_accession = uniprot_accession[0] # We pass a str to the function
                            subject_sequence, _ = get_fasta_sequence_from_uniprot(uniprot_accession, output_file) 
                        else: 
                            # PDB ID is member of duplicate_pdb_ids with more than 1 possible Uniprot ID. 
                            # Filter through sequences for all Uniprot IDs to select the best match.
                            # `uniprot_accession` is a list passed to the function. We return the seq and ID associated with the filtered match
                            subject_sequence, uniprot_accession = get_fasta_sequence_from_uniprot(uniprot_accession, output_file, cath_id=cath_id, duplicates=True)
                    else:
                        # Here, `uniprot_accession` is of type str because only one UniprotID is associated with the `pdb_code`
                        subject_sequence, _ = get_fasta_sequence_from_uniprot(uniprot_accession, output_file)
                else:
                    raise ValueError(f"[EDGE CASE] UniProtKB does not contain data for PDB code {pdb_code}\n")
    
                print(f'Subject:  (UniProt accession {uniprot_accession})\n{subject_sequence}\n', file=output_file)
                data.at[index, 'uniprot_id'] = uniprot_accession
    
                    
            # Plan B: RCSB Protein Data Bank
            except (requests.exceptions.HTTPError, ValueError):
                print(f"Trying to get FASTA sequence for PDB code {pdb_code} from RCSB PDB...", file=output_file)
                subject_sequence = get_fasta_sequence_from_rcsb(pdb_code, output_file)
                if subject_sequence:
                    print(f'Subject: (RCSB PDB)\n{subject_sequence}\n', file=output_file)
                    data.at[index, 'uniprot_id'] = 'RCSB Lookup'
                else:
                    print(f"Failed to retrieve both UniProt accession and FASTA sequence for PDB code {pdb_code}.", file=output_file)
    
            # Write the `subject_sequence` to a file for use by NCBI BLAST+
            subject_file = f'../data/{cath_id}/seqs/subject.fasta'
            with open(subject_file, 'w') as t:
                t.write(subject_sequence)
            
            if pd.isna(given_seq):    
                print(f'Given Seq:\nCase 2 - {given_seq}\nBuilding query sequence(s) from pdb_share...\n', file=output_file)
            else:
                print(f'Given Seq:\n{given_seq}\n', file=output_file)
                # The `given_seq`, when present in the data frame, may consist of multiple protein domains 
                # which are treated separately in the folllowing for loop by enumerating the `cath_indices`
            
            pdb_filename = f"../data/{cath_id}/pdb/{cath_id}"
            for i, idx_range in enumerate(cath_indices):
                # The query sequence corresponds to the protein domain specific to the idx_range tuple
                query_sequence = get_domain_sequence_from_pdb(pdb_filename, idx_range, cath_id, output_file) 
                print(f'\nQuery Sequence {i+1}:  {idx_range}\n{query_sequence}\n', file=output_file)
                query_file = f'../data/{cath_id}/seqs/query_{i+1}.fasta'
                with open(query_file, 'w') as q:
                    q.write(query_sequence)
            
                # Perform sequence alignment for this domain segment using NCBI Blast+
                response_text = f'../data/{cath_id}/seqs/output_{i+1}.txt'
                aligned_segment = extract_aligned_segment(query_sequence, subject_sequence, query_file, subject_file, response_text)
                print(f'Alignment Segment (from subject):\n{aligned_segment}\n', file=output_file)
    
                if cath_id not in edge_cases:
                    # Error Handling - Sequence alignment w/ BLAST+ sometimes leaves out amino acids. If not handled, this will corrupt remodeling.
                    # However, certain edge_cases do not need this step because the aligned segment is already the correct result
                    expected_num_AAs = idx_range[1] - idx_range[0] + 1
                    if len(aligned_segment) != expected_num_AAs:
                        print(f"WARNING: Length of aligned segment [{len(aligned_segment)}] does not match the expected number of amino acids [{expected_num_AAs}]\n", file=output_file)
                        # Manual sequence alignment
                        aligned_segment = manual_alignment(query_sequence, aligned_segment, subject_sequence, output_file)
            
                result.append(aligned_segment)
            
            
            # Concatenate the indexed fasta sequences if len(cath_indices) > 1
            updated_sequence = ''.join(result)
            
            # Update the data frame to 1) Remove Case2 NaN values and 2) Handle Sequence Gaps
            data.at[index, 'sequences'] = updated_sequence
            print('[REPAIRED FULL SEQUENCE]', file=output_file)
            print(f"Updated FASTA sequence for CATH ID {cath_id} :\n{updated_sequence}\n", file=output_file)
        
        except Exception as e:
            with open('error_log.txt', 'a') as log_file:
                log_file.write(f"Error: {str(e)} for CATH ID: {cath_id}\n")
            
            # Optionally print the error to the console as well
            print(f"Error: {str(e)} for CATH ID: {cath_id}")
            # raise

Error: string index out of range for CATH ID: 4dnnA00
Error: string index out of range for CATH ID: 3hw5B00
Error: string index out of range for CATH ID: 3uepA00
Error: string index out of range for CATH ID: 3lmlA01
Error: cannot access local variable 'modified' where it is not associated with a value for CATH ID: 2gzqA00
Error: string index out of range for CATH ID: 4lgdB02
Error: string index out of range for CATH ID: 3k6tB00
Error: write() argument must be str, not None for CATH ID: 3ayhA00
Error: string index out of range for CATH ID: 2vdfA00
Error: string index out of range for CATH ID: 1ukfA00
Error: cannot access local variable 'modified' where it is not associated with a value for CATH ID: 3g5tA00
Error: string index out of range for CATH ID: 1xmxA01
Error: string index out of range for CATH ID: 4iluA02
Error: string index out of range for CATH ID: 3efmA02
Error: string index out of range for CATH ID: 3bptA00
Error: string index out of range for CATH ID: 3gmiA01
Error: string i



Error: string index out of range for CATH ID: 2oogD00
Error: string index out of range for CATH ID: 3l7iA01
Error: string index out of range for CATH ID: 3bryA00
Error: string index out of range for CATH ID: 3t4rA00
Error: string index out of range for CATH ID: 2o6kA00




Error: no element found: line 1, column 0 for CATH ID: 2ox5B00
Error: write() argument must be str, not None for CATH ID: 4tpsD00
Error: string index out of range for CATH ID: 2gdqA02
Error: string index out of range for CATH ID: 5jheA01
Error: string index out of range for CATH ID: 2hjmA01
Error: string index out of range for CATH ID: 4fm3A00
Error: write() argument must be str, not None for CATH ID: 4iqjM02
Error: object of type 'NoneType' has no len() for CATH ID: 3l81A02
Error: string index out of range for CATH ID: 3cygA02
Error: string index out of range for CATH ID: 6fnuA00
Error: object of type 'NoneType' has no len() for CATH ID: 3lzdA01
Error: string index out of range for CATH ID: 4fprB00
Error: write() argument must be str, not None for CATH ID: 1h6wA03
Error: string index out of range for CATH ID: 4b9gA00
Error: string index out of range for CATH ID: 4p79A00
Error: object of type 'NoneType' has no len() for CATH ID: 1y4cA03
Error: string index out of range for CATH ID: 3s9



Error: no element found: line 1, column 0 for CATH ID: 4bpxD00
Error: string index out of range for CATH ID: 2cayB00
Error: write() argument must be str, not None for CATH ID: 2b39A10
Error: string index out of range for CATH ID: 1yleA02
Error: object of type 'NoneType' has no len() for CATH ID: 3vz9B00
Error: string index out of range for CATH ID: 3syyA00
Error: string index out of range for CATH ID: 2cxhA01
Error: string index out of range for CATH ID: 4gc3A00
Error: object of type 'NoneType' has no len() for CATH ID: 3isrA01
Error: string index out of range for CATH ID: 3lfjB00
Error: More than one record found in handle for CATH ID: 3uwsA01
Error: object of type 'NoneType' has no len() for CATH ID: 2qejD01
Error: string index out of range for CATH ID: 3c19A01
Error: string index out of range for CATH ID: 1dtoA01
Error: string index out of range for CATH ID: 4cbvA02
Error: cannot access local variable 'modified' where it is not associated with a value for CATH ID: 1zn6A00
Error: str




Error: More than one record found in handle for CATH ID: 6ijfC01
Error: string index out of range for CATH ID: 2gwnA01
Error: string index out of range for CATH ID: 3nkzA00
Error: More than one record found in handle for CATH ID: 3iylB03
Error: string index out of range for CATH ID: 1acxA00
Error: object of type 'NoneType' has no len() for CATH ID: 4heaA00
Error: cannot access local variable 'modified' where it is not associated with a value for CATH ID: 1s4cC00
Error: string index out of range for CATH ID: 4beuA01
Error: string index out of range for CATH ID: 2xxpA02
Error: string index out of range for CATH ID: 2d00A01
Error: string index out of range for CATH ID: 4ub9A01
Error: string index out of range for CATH ID: 3ipjA01
Error: write() argument must be str, not None for CATH ID: 3aqbA00
Error: cannot access local variable 'modified' where it is not associated with a value for CATH ID: 1ameA00
Error: string index out of range for CATH ID: 3am2A02
Error: More than one record found 

In [26]:
data.head(10)

Unnamed: 0,cath_id,class,architecture,topology,superfamily,resolution_in_angstroms,pdb_id,sequences,cath_indices,architecture_id,label,uniprot_id
0,2w3sB01,3,90,1170,50,2.6,2w3s,SVGKPLPHDSARAHVTGQARYLDDLPCPANTLHLAFGLSTEASAAI...,"[(2, 124)]","(3, 90)",9,O54051
1,3be3A00,2,30,30,320,2.04,3be3,QDFRPGVYRHYKGDHYLALGLARADETDEVVVVYTRLYARAGLPMS...,"[(6, 81)]","(2, 30)",2,A0A0H3LS10
2,3zq4C03,3,10,20,580,3.0,3zq4,DIGNIVLRDRRILSEEGLVIVVVSIDMDDFKISAGPDLISRGFVYM...,"[(449, 555)]","(3, 10)",5,Q45493
3,1peqA03,1,10,1650,20,2.8,1peq,DITFRLAKENAQMALFSPYDIQRRYGKPFGDIAISERYDELIADPH...,"[(294, 346)]","(1, 10)",0,Q08698
4,1bdoA00,2,40,50,100,1.8,1bdo,EISGHIVRSPMVGTFYRTPSPDAKAFIEVGQKVNVGDTLCIVEAMK...,"[(77, 156)]","(2, 40)",3,P0ABD8
5,3r0hG01,2,30,42,10,2.6,3r0h,TAEIKPNKKILIELKVEKKPMGVIVCGGKNNHVTTGCVITHVYPEG...,"[(479, 579)]","(2, 30)",2,Q24008
6,1aqcA00,2,30,29,30,2.3,1aqc,EDLIDGIIFAANYLGSTQLLSDKTPSKNVRMMQAQEAVSRIKMAQK...,"[(324, 488)]","(2, 30)",2,Q02410
7,2gnxA01,1,10,3450,30,2.45,2gnx,LKKVVAGRGAPGGTAPHVEELLPHLSEQLCFFVQARMEIADFYEKM...,"[(101, 295), (431, 440)]","(1, 10)",0,Q6P1I3
8,6hjfA02,3,10,310,10,1.7,6hjf,VPSFLYQQDVVVVLPKPYGEVRVDIAFGGNFFAIVPAEQLGIDISV...,"[(189, 367)]","(3, 10)",5,Q4DA80
9,1o1zA00,3,20,20,190,1.6,1o1z,HHHHMIVLGHRGYSAKYLENTLEAFMKAIEAGANGVELDVRLSKDG...,"[(-3, 222)]","(3, 20)",6,Q9X1V6


In [24]:
import pandas as pd

# Open the error_log.txt file and read its content
with open('error_log.txt', 'r') as file:
    lines = file.readlines()

# Extract the last 7 characters of each line
cath_id_error_list = [line[-8:].strip() for line in lines]

# Obtain the unique members from this list
unique = set(cath_id_error_list)

# Remove entries from the DataFrame if their 'cath_id' is in the set of unique members
filtered_data = data[~data['cath_id'].isin(unique)]

# Save the filtered DataFrame to a CSV file
filtered_data.to_csv('cleaned_data.csv', index=False)

# Checkpoint
print("[CHECKPOINT] Filtered data has been saved to 'cleaned_data.csv'.")


Filtered data has been saved to 'cleaned_data.csv'.


In [12]:
# duplicate_pdb_ids = set(data.groupby('pdb_id').filter(lambda x: len(x) > 1)['pdb_id']) # There are 720 PDB IDs with duplicate entries

# try:
#     result = []
    
#     # Data accession
#     pdb_code = '3uep'
#     cath_id = '3uepA00'
#     cath_indices = [(229, 313)]	
#     given_seq = 'PLTDLNQLPVQVSFEVGRQILDWHTLTSLEPGSLIDLTTPVDGEVRLLANGRLLGHGRLVEIQGRLGVRIERLTEVTISLEVLFQ'
#     print(f'[CATH_ID] {cath_id}\n')
    
#     # Web scraping fxn calls (UniProtKB and RCSB PDB)
#     try:
#         uniprot_accession = get_uniprot_accession_from_pdb(pdb_code, duplicate_pdb_ids)
#         if uniprot_accession:
#             # `uniprot_accession` is returned as a list from `get_uniprot_accession_from_pdb` if 
#             # 1) Your `pdb_code` reoccurs throughout the data set (duplicates), and multiple UniProt IDs are associated with it. 
#             #    This is logical for well studied proteins.
#             # 2) [EDGE CASE] Your `pdb_code` is unique in the data set, but multiple UniProt IDs are associated with it.
#             if isinstance(uniprot_accession, list): 
#                 print(f'Identifying the correct subject sequence for [CATH ID] {cath_id}.\n')
#                 if len(uniprot_accession) == 1: # 1 Uniprot ID represents mutltiple data entries
#                     uniprot_accession = uniprot_accession[0] # We pass a str to the function
#                     subject_sequence, _ = get_fasta_sequence_from_uniprot(uniprot_accession) 
#                 else: 
#                     # PDB ID is member of duplicate_pdb_ids with more than 1 possible Uniprot ID. 
#                     # Filter through sequences for all Uniprot IDs to select the best match.
#                     # `uniprot_accession` is a list passed to the function. We return the seq and ID associated with the filtered match
#                     subject_sequence, uniprot_accession = get_fasta_sequence_from_uniprot(uniprot_accession, cath_id=cath_id, duplicates=True)
#             else:
#                 # Here, `uniprot_accession` is of type str because only one UniprotID is associated with the `pdb_code`
#                 subject_sequence, _ = get_fasta_sequence_from_uniprot(uniprot_accession)
#         else:
#             raise ValueError(f"[EDGE CASE] UniProtKB does not contain data for PDB code {pdb_code}\n")

#         print(f'Subject:  (UniProt accession {uniprot_accession})\n{subject_sequence}\n')

            
#     # Plan B: RCSB Protein Data Bank
#     except (requests.exceptions.HTTPError, ValueError):
#         print(f"Trying to get FASTA sequence for PDB code {pdb_code} from RCSB PDB...")
#         subject_sequence = get_fasta_sequence_from_rcsb(pdb_code)
#         if subject_sequence:
#             print(f'Subject: (RCSB PDB)\n{subject_sequence}\n')
#         else:
#             print(f"Failed to retrieve both UniProt accession and FASTA sequence for PDB code {pdb_code}.")

#     # Write the `subject_sequence` to a file for use by NCBI BLAST+
#     subject_file = f'../data/{cath_id}/seqs/subject.fasta'
#     with open(subject_file, 'w') as t:
#         t.write(subject_sequence)
    
#     if pd.isna(given_seq):    
#         print(f'Given Seq:\nCase 2 - {given_seq}\nBuilding query sequence(s) from pdb_share...\n')
#     else:
#         print(f'Given Seq:\n{given_seq}\n')
#         # The `given_seq`, when present in the data frame, may consist of multiple protein domains 
#         # which are treated separately in the folllowing for loop by enumerating the `cath_indices`
    
#     pdb_filename = f"../data/{cath_id}/pdb/{cath_id}"
#     for i, idx_range in enumerate(cath_indices):
#         # The query sequence corresponds to the protein domain specific to the idx_range tuple
#         query_sequence = get_domain_sequence_from_pdb(pdb_filename, idx_range, cath_id) 
#         print(f'\nQuery Sequence {i+1}:  {idx_range}\n{query_sequence}\n')
#         query_file = f'../data/{cath_id}/seqs/query_{i+1}.fasta'
#         with open(query_file, 'w') as q:
#             q.write(query_sequence)
    
#         # Perform sequence alignment for this domain segment using NCBI Blast+
#         output_file = f'../data/{cath_id}/seqs/output_{i+1}.txt'
#         aligned_segment = extract_aligned_segment(query_sequence, subject_sequence, query_file, subject_file, output_file)
#         print(f'Alignment Segment (from subject):\n{aligned_segment}\n')

#         if cath_id not in edge_cases:
#             # Error Handling - Sequence alignment w/ BLAST+ sometimes leaves out amino acids. If not handled, this will corrupt remodeling.
#             # However, certain edge_cases do not need this step because the aligned segment is already the correct result
#             expected_num_AAs = idx_range[1] - idx_range[0] + 1
#             if len(aligned_segment) != expected_num_AAs:
#                 print(f"WARNING: Length of aligned segment [{len(aligned_segment)}] does not match the expected number of amino acids [{expected_num_AAs}]\n")
#                 # Manual sequence alignment
#                 aligned_segment = manual_alignment(query_sequence, aligned_segment, subject_sequence, output_file)
    
#         result.append(aligned_segment)
    
    
#     # Concatenate the indexed fasta sequences if len(cath_indices) > 1
#     updated_sequence = ''.join(result)
    
#     # Update the data frame to 1) Remove Case2 NaN values and 2) Handle Sequence Gaps
#     data.at[index, 'sequences'] = updated_sequence
#     print('[REPAIRED FULL SEQUENCE]')
#     print(f"Updated FASTA sequence for CATH ID {cath_id} :\n{updated_sequence}\n")

# except Exception as e:
#     print(f"Error: {str(e)}\n")
#     raise

In [13]:
# Confirm that all sequence gaps have been removed

In [14]:
# Add sequence motifs

# Structure Data

In [15]:
# Obtain and Store PDB files for non-gap sequences

In [16]:
# Obtain PDB files for gap sequences

In [17]:
# Remodel using SCWRL4 and Store

# Graph Database

In [18]:
# Create NetworkX graphical representations of stored PDB files

In [19]:
# Split data into train/test splits based on homological similarity