In [30]:
def pdb_to_sequence(pdb_file_path):
    """
    Extract amino acid sequence from a PDB file using all chains.
    Gets protein sequence from column 4 (residue name) and ignores chain IDs.
    
    Args:
        pdb_file_path (str): Path to the PDB file
    
    Returns:
        str: Combined amino acid sequence from all chains
    """
    
    # Three-letter to one-letter amino acid code mapping
    # Standard 20 amino acids
    aa_codes = {
        'ALA': 'A', 'CYS': 'C', 'ASP': 'D', 'GLU': 'E', 'PHE': 'F',
        'GLY': 'G', 'HIS': 'H', 'ILE': 'I', 'LYS': 'K', 'LEU': 'L',
        'MET': 'M', 'ASN': 'N', 'PRO': 'P', 'GLN': 'Q', 'ARG': 'R',
        'SER': 'S', 'THR': 'T', 'VAL': 'V', 'TRP': 'W', 'TYR': 'Y',
        
        # Non-standard amino acids with dedicated codes
        'SEC': 'U',  # Selenocysteine
        'PYL': 'O',  # Pyrrolysine
        'PYH': 'O',  # Alternative name for Pyrrolysine
        'UNK': 'X',  # Unknown amino acid
        
        # Ambiguity codes for uncertain residues
        'ASX': 'B',  # Asparagine or Aspartic acid (Asn or Asp)
        'GLX': 'Z',  # Glutamine or Glutamic acid (Gln or Glu)
        'XAA': 'X',  # Unknown or unspecified amino acid
        'XLE': 'J',  # Leucine or Isoleucine (Leu or Ile)
        
        # Additional common non-standard amino acids
        'MSE': 'M',  # Selenomethionine (treat as Methionine)
        'CSD': 'C',  # Cysteine sulfinic acid (treat as Cysteine)
        'CAS': 'C',  # S-(dimethylarsenic)cysteine (treat as Cysteine)
        'CAF': 'C',  # S-dimethylarsinoyl-cysteine (treat as Cysteine)
        'PCA': 'E',  # Pyroglutamic acid (treat as Glutamic acid)
        'HYP': 'P',  # Hydroxyproline (treat as Proline)
        'SEP': 'S',  # Phosphoserine (treat as Serine)
        'TPO': 'T',  # Phosphothreonine (treat as Threonine)
        'PTR': 'Y',  # Phosphotyrosine (treat as Tyrosine)
    }
    
    sequence = ''
    prev_residue = None
    
    try:
        with open(pdb_file_path, 'r') as file:
            for line in file:
                # Only process ATOM records for CA (alpha carbon) atoms
                if line.startswith('ATOM') and line[12:16].strip() == 'CA':
                    # Get residue name from column 4 (positions 17-19 in PDB format)
                    residue_name = line[17:20].strip()
                    residue_number = int(line[22:26].strip())
                    
                    # Process both standard and non-standard amino acids
                    if residue_name in aa_codes:
                        # Add residue to sequence (avoid duplicates from same residue number)
                        if prev_residue != residue_number:
                            sequence += aa_codes[residue_name]
                            prev_residue = residue_number
                    else:
                        # Handle completely unknown residues
                        print(f"Warning: Unknown residue '{residue_name}' at position {residue_number}, using 'X'")
                        if prev_residue != residue_number:
                            sequence += 'X'
                            prev_residue = residue_number
    
    except FileNotFoundError:
        raise FileNotFoundError(f"PDB file not found: {pdb_file_path}")
    except Exception as e:
        raise Exception(f"Error reading PDB file: {str(e)}")
    
    return sequence

#Example 1: Get all chains
sequence = pdb_to_sequence("4twp_protein.pdb")
print(sequence)

DKWEMERTDITMKHKLGGGQYGEVYEGVWKKYSLTVAVKTLEVEEFLKEAAVMKEIKHPNLVQLLGVCTREPPFYIIIEFMTYGNLLDYLRECNRQEVNAVVLLYMATQISSAMEYLEKKNFIHRDLAARNCLVGENHLVKVADFGLSRLMTGDTYTAHAGAKFPIKWTAPESLAYNKFSIKSDVWAFGVLLWEIATYGMSPYPGIDLSQVYELLEKDYRMERPEGCPEKVYELMRACWQWNPSDRPSFAEIHQAFETMFQESSISDKWEMERTDITMKHKLGGGQYGEVYEGVWKKYSLTVAVKTLKEDTMEVEEFLKEAAVMKEIKHPNLVQLLGVCTREPPFYIIIEFMTYGNLLDYLRECNRQEVNAVVLLYMATQISSAMEYLEKKNFIHRDLAARNCLVGENHLVKVADFGLSRLMTGDTYTAHAGAKFPIKWTAPESLAYNKFSIKSDVWAFGVLLWEIATYGMSPYPGIDLSQVYELLEKDYRMERPEGCPEKVYELMRACWQWNPSDRPSFAEIHQAFETMFQESSIS


In [31]:
def process_pdb_folders(csv_file_path, base_directory, output_csv):
    """
    Process PDB folders based on PDB codes from CSV file and extract protein sequences.
    
    Args:
        csv_file_path (str): Path to CSV file containing PDB codes
        base_directory (str): Path to directory containing PDB folders
        output_csv (str): Path for output CSV file
        
    Returns:
        pandas.DataFrame: DataFrame containing results
    """
    import pandas as pd
    import os
    import glob
    
    # Read the CSV file to get PDB codes
    df_pdb = pd.read_csv(csv_file_path)
    pdb_codes = df_pdb['PDB_Code'].tolist()
    
    data = []
    processed_count = 0
    missing_folders = []
    
    print(f"Processing {len(pdb_codes)} PDB codes...")
    
    for pdb_code in pdb_codes:
        pdb_folder_path = os.path.join(base_directory, pdb_code)
        
        # Check if folder exists
        if not os.path.exists(pdb_folder_path):
            missing_folders.append(pdb_code)
            continue
            
        # Find all PDB files in the folder (look for _protein.pdb pattern first, then any .pdb)
        protein_pdb_files = glob.glob(os.path.join(pdb_folder_path, "*_protein.pdb"))
        
        if not protein_pdb_files:
            # If no _protein.pdb files, look for any .pdb files
            protein_pdb_files = glob.glob(os.path.join(pdb_folder_path, "*.pdb"))
        
        if not protein_pdb_files:
            print(f"Warning: No PDB files found in {pdb_folder_path}")
            continue
            
        for pdb_file in protein_pdb_files:
            try:
                # Extract protein sequence
                sequence = pdb_to_sequence(pdb_file)
                
                if not sequence:
                    print(f"  Warning: No sequence found in {pdb_file}")
                    continue
                
                
                # Add results
                data.append({
                    'PDB_Code': pdb_code,
                    'PDB_File': os.path.basename(pdb_file),
                    'Sequence': sequence,
                    'Sequence_Length': len(sequence),
                    
                })
                
                print(f"  ✓ {pdb_code}: {len(sequence)} amino acids from {os.path.basename(pdb_file)}")
                processed_count += 1
                
            except Exception as e:
                print(f"  ✗ Error processing {pdb_file}: {e}")
    
    # Create DataFrame and save to CSV
    df_results = pd.DataFrame(data)
    
    if not df_results.empty:
        # For CSV output, we'll exclude the one-hot encoding (too complex for CSV)
        # Save a simplified version
        df_csv = df_results[['PDB_Code', 'Sequence', 'Sequence_Length']].copy()
        df_csv.to_csv(output_csv, index=False)
        
        print(f"\nResults saved to {output_csv}")
        print(f"Total protein sequences extracted: {len(df_results)}")
        print(f"Processed folders: {processed_count}")
    else:
        print("No protein sequences were extracted!")
    
    if missing_folders:
        print(f"\nMissing folders for PDB codes: {missing_folders[:10]}...")  # Show first 10
        if len(missing_folders) > 10:
            print(f"... and {len(missing_folders) - 10} more")
    
    return df_results


df1 = process_pdb_folders(
    csv_file_path='temp_full_w-bad-remove.csv',
    base_directory='/Users/bodenretherford/Desktop/demo/',  # Update this path
    output_csv='pdb_sequences.csv'
)
df1.head(10)

Processing 282 PDB codes...
  ✓ 4tmn: 316 amino acids from 4tmn_protein.pdb
  ✓ 5tmn: 316 amino acids from 5tmn_protein.pdb
  ✓ 1ydr: 354 amino acids from 1ydr_protein.pdb
  ✓ 1ydt: 354 amino acids from 1ydt_protein.pdb
  ✓ 1bcu: 249 amino acids from 1bcu_protein.pdb
  ✓ 1a30: 198 amino acids from 1a30_protein.pdb
  ✓ 1bzc: 297 amino acids from 1bzc_protein.pdb
  ✓ 1qf1: 316 amino acids from 1qf1_protein.pdb
  ✓ 1qkt: 248 amino acids from 1qkt_protein.pdb
  ✓ 1c5z: 235 amino acids from 1c5z_protein.pdb
  ✓ 1k1i: 220 amino acids from 1k1i_protein.pdb
  ✓ 1e66: 533 amino acids from 1e66_protein.pdb
  ✓ 1g2k: 198 amino acids from 1g2k_protein.pdb
  ✓ 1eby: 198 amino acids from 1eby_protein.pdb
  ✓ 1gpn: 529 amino acids from 1gpn_protein.pdb
  ✓ 1h22: 529 amino acids from 1h22_protein.pdb
  ✓ 1gpk: 529 amino acids from 1gpk_protein.pdb
  ✓ 1h23: 528 amino acids from 1h23_protein.pdb
  ✓ 1p1n: 258 amino acids from 1p1n_protein.pdb
  ✓ 1q8u: 361 amino acids from 1q8u_protein.pdb
  ✓ 1p1q: 77

Unnamed: 0,PDB_Code,PDB_File,Sequence,Sequence_Length
0,4tmn,4tmn_protein.pdb,ITGTSTVGVGRGVLGDQKNINTTYSTYYYLQDNTRGDGIFTYDAKY...,316
1,5tmn,5tmn_protein.pdb,ITGTSTVGVGRGVLGDQKNINTTYSTYYYLQDNTRGDGIFTYDAKY...,316
2,1ydr,1ydr_protein.pdb,VKEFLAKAKEDFLKKWENPAQNTAHLDQFERIKTLGTGSFGRVMLV...,354
3,1ydt,1ydt_protein.pdb,VKEFLAKAKEDFLKKWENPAQNTAHLDQFERIKTLGTGSFGRVMLV...,354
4,1bcu,1bcu_protein.pdb,AGLRPLFEKKSLEDIVEGSDAEIGMSPWQVMLFRKPQELLCGASLI...,249
5,1a30,1a30_protein.pdb,PQITLWKRPLVTIKIGGQLKEALLDTGADDTVIEEMSLPGRWKPKM...,198
6,1bzc,1bzc_protein.pdb,EMEKEFEQIDKSGSWAAIYQDIRHEASDFPCRVAKLPKNKNRNRYR...,297
7,1qf1,1qf1_protein.pdb,ITGTSTVGVGRGVLGDQKNINTTYSTYYYLQDNTRGDGIFTYDAKY...,316
8,1qkt,1qkt_protein.pdb,NSLALSLTADQMVSALLDAEPPILYSEYDPTRPFSEASMMGLLTNL...,248
9,1c5z,1c5z_protein.pdb,LKFQCGQKTIIGGEFTTIENQPWFAAIYRRHVTYVCGGSLMSPCWV...,235


In [32]:
import pandas as pd
# Read the CSV file to get PDB codes
df_pdb = pd.read_csv('temp_full_w-bad-remove.csv')
merged_df = pd.merge(df_pdb, df1, on='PDB_Code', how='left')

df_final = merged_df.drop(['MOL2_File', 'Molecule_Index', 'PDB_File'], axis=1)
df_final.head()
df_final.to_csv('final_w-bad-r.csv', index=False)