In [1]:
import pandas as pd
import numpy as np
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio.SeqUtils import molecular_weight, ProtParamData
import re
import warnings
from Bio import BiopythonWarning

# Ignore Biopython warnings for invalid amino acids
warnings.filterwarnings("ignore", category=BiopythonWarning)

def detail_proteins(proteins_df, sequence_col=None, group_col=None, output_file=None):
    """
    Process a dataframe containing protein sequences and add detailed molecular descriptors.

    Descriptors are computed row by row and attached to the input rows by
    position, so the result always has exactly one row per input row, even
    when the protein group column contains duplicate or missing values.

    Parameters:
    -----------
    proteins_df : pandas.DataFrame
        DataFrame containing protein information (it is not modified)
    sequence_col : str, optional
        Column name containing the amino acid sequence
        If None, will attempt to auto-detect based on common column names,
        then fall back to the first column whose sampled values look like
        amino acid sequences
    group_col : str, optional
        Column name containing the protein group ID
        If None, will attempt to auto-detect based on common column names;
        if none is found a 'ProteinGroup' column of the form 'Protein_<i>' is added
    output_file : str, optional
        If provided, save the detailed dataframe to this file path (CSV, no index)

    Returns:
    --------
    pandas.DataFrame
        Original columns followed by the protein descriptor columns, with a
        fresh 0..n-1 index. Rows whose sequence is empty, invalid or fails
        to process keep their original values and get NaN descriptors.

    Raises:
    -------
    ValueError
        If no sequence column is given and none can be auto-detected
    """
    # Make a copy to avoid modifying the original
    df = proteins_df.copy()

    # Try to auto-detect the sequence column if not provided
    if sequence_col is None:
        potential_seq_cols = ['Sequence', 'Amino Acid Sequence', 'AminoAcidSequence',
                              'Protein Sequence', 'ProteinSequence', 'AASequence', 'AA_Sequence']
        for col in potential_seq_cols:
            if col in df.columns:
                sequence_col = col
                break

        # If still not found, use the first column that might contain amino acid sequences
        if sequence_col is None:
            for col in df.columns:
                # Check a sample of values to see if they look like protein sequences
                sample = df[col].dropna().astype(str).iloc[:5].tolist()
                if any(re.match('^[ACDEFGHIKLMNPQRSTVWY]+$', s.upper()) for s in sample):
                    sequence_col = col
                    break

    # If we still couldn't find a sequence column, raise an error
    if sequence_col is None:
        raise ValueError("Could not identify a column containing protein sequences. Please specify using sequence_col parameter.")

    # Try to auto-detect the protein group column if not provided
    if group_col is None:
        potential_group_cols = ['ProteinGroup', 'Protein Group', 'Group', 'ProteinGroupID', 'Protein_Group']
        for col in potential_group_cols:
            if col in df.columns:
                group_col = col
                break

    # If no group column, create one with indices
    if group_col is None:
        df['ProteinGroup'] = [f"Protein_{i}" for i in range(len(df))]
        group_col = 'ProteinGroup'

    print(f"Using '{sequence_col}' as the sequence column and '{group_col}' as the protein group column")

    # One descriptor dict per input row, in row order. Rows that cannot be
    # processed contribute an empty dict so positions stay aligned with df.
    detailed_data = []

    for idx, row in df.iterrows():
        descriptors = {}
        try:
            protein_group = row[group_col]
            sequence = row[sequence_col]

            if pd.isna(sequence) or sequence == '':
                print(f"Warning: Empty sequence for {protein_group}, skipping")
            else:
                # Clean sequence (remove non-standard amino acids and spaces)
                sequence = re.sub(r'[^ACDEFGHIKLMNPQRSTVWY]', '', sequence.upper())

                if len(sequence) == 0:
                    print(f"Warning: No valid amino acids in sequence for {protein_group}")
                else:
                    descriptors = calculate_protein_descriptors(sequence)
        except Exception as e:
            # Best effort: report the failing row and carry on with the rest
            print(f"Error processing {idx}: {e}")
            descriptors = {}

        # The group value already lives in df; never let a descriptor shadow it
        descriptors.pop(group_col, None)
        detailed_data.append(descriptors)

    # Attach descriptors by row position rather than by joining on group_col:
    # a key-based merge multiplies rows whenever group IDs repeat.
    result = df.reset_index(drop=True)
    if any(detailed_data):
        detailed_df = pd.DataFrame(detailed_data)
        result = pd.merge(result, detailed_df, left_index=True, right_index=True, how='left')

    # Save to file if requested
    if output_file is not None:
        result.to_csv(output_file, index=False)
        print(f"Saved detailed dataset to {output_file}")

    return result

def calculate_protein_descriptors(sequence):
    """
    Calculate descriptors for a protein sequence.

    Most values come from Biopython's ProteinAnalysis. If that analysis
    raises, the error is printed and whatever was computed so far is kept;
    the sequence-pattern features are then added regardless.

    Parameters:
    -----------
    sequence : str
        Amino acid sequence (single letter code); expected to be non-empty
        and already restricted to the 20 standard residues

    Returns:
    --------
    dict
        Dictionary of calculated descriptors. Always contains
        "SequenceLength"; the flexibility entries are NaN when the sequence
        is too short for Biopython to produce a flexibility profile.
    """
    descriptors = {}

    # Basic properties
    descriptors["SequenceLength"] = len(sequence)

    try:
        # Use Biopython's ProteinAnalysis
        protanalysis = ProteinAnalysis(sequence)

        # Molecular weight and basic properties
        descriptors["MolecularWeight"] = protanalysis.molecular_weight()
        descriptors["Aromaticity"] = protanalysis.aromaticity()
        descriptors["InstabilityIndex"] = protanalysis.instability_index()
        # Instability index below 40 is labelled stable, otherwise unstable
        if descriptors["InstabilityIndex"] < 40:
            descriptors["Stability"] = "Stable"
        else:
            descriptors["Stability"] = "Unstable"

        # Amino acid composition
        aa_count = protanalysis.count_amino_acids()
        for aa, count in aa_count.items():
            descriptors[f"Count_{aa}"] = count
            descriptors[f"Percent_{aa}"] = (count / len(sequence)) * 100

        # Amino acid class percentages
        aa_classes = {
            'Hydrophobic': ['A', 'I', 'L', 'M', 'F', 'W', 'Y', 'V'],
            'Polar': ['N', 'C', 'Q', 'S', 'T'],
            'Charged': ['R', 'H', 'K', 'D', 'E'],
            'Positive': ['R', 'H', 'K'],
            'Negative': ['D', 'E']
        }

        for class_name, aas in aa_classes.items():
            class_count = sum(aa_count.get(aa, 0) for aa in aas)
            descriptors[f"{class_name}Count"] = class_count
            descriptors[f"{class_name}Percent"] = (class_count / len(sequence)) * 100

        # Secondary structure predictions (Biopython returns helix, turn, sheet)
        sec_struct = protanalysis.secondary_structure_fraction()
        descriptors["Helix"] = sec_struct[0]
        descriptors["Turn"] = sec_struct[1]
        descriptors["Sheet"] = sec_struct[2]

        # Flexibility
        flexibility = protanalysis.flexibility()
        if flexibility:
            descriptors["AverageFlexibility"] = sum(flexibility) / len(flexibility)
            descriptors["MaxFlexibility"] = max(flexibility)
            descriptors["MinFlexibility"] = min(flexibility)
        else:
            # Short peptides yield an empty sliding-window profile. Record
            # NaN instead of dividing by zero, which used to abort every
            # descriptor computed after this point.
            descriptors["AverageFlexibility"] = float("nan")
            descriptors["MaxFlexibility"] = float("nan")
            descriptors["MinFlexibility"] = float("nan")

        # Hydrophobicity
        descriptors["Gravy"] = protanalysis.gravy()  # Grand average of hydropathy

        # Isoelectric point
        descriptors["IsoelectricPoint"] = protanalysis.isoelectric_point()

        # Charge at pH 7.4
        descriptors["ChargeAtpH7.4"] = protanalysis.charge_at_pH(7.4)

        # Extinction coefficient: Biopython returns (reduced, cystines) in
        # that order, so unpack by name to label each column correctly.
        ec_reduced, ec_cystines = protanalysis.molar_extinction_coefficient()
        descriptors["ExtinctionCoefficient_Cystines"] = ec_cystines  # Assuming Cys form cystines
        descriptors["ExtinctionCoefficient_Reduced"] = ec_reduced    # Assuming Cys are reduced

    except Exception as e:
        # Best effort: keep whatever was computed before the failure
        print(f"Error in calculating protein descriptors: {e}")

    # Add protein-specific features
    add_protein_specific_features(descriptors, sequence)

    return descriptors

def add_protein_specific_features(descriptors, sequence):
    """
    Add protein-specific features to the descriptors dictionary.

    Adds motif counts ("Motif_<name>"), "PotentialDisulfideBonds",
    "EstimatedClass", "PotentialTMRegions" and "ComplexityScore". The
    dictionary is updated in place. Any error is printed and the features
    computed up to that point are kept.

    Parameters:
    -----------
    descriptors : dict
        Dictionary of descriptors to add to. "Helix" and "Sheet" are read
        if present (treated as 0 when missing).
    sequence : str
        Amino acid sequence

    Returns:
    --------
    dict
        The same descriptors dictionary that was passed in
    """
    # Calculate sequence-based motifs and patterns
    try:
        # Look for common motifs. The regex motifs are wrapped in a zero-width
        # lookahead so every start position is tested: plain findall skips
        # matches that overlap an earlier one (e.g. "NNSS" holds two sequons).
        motifs = {
            'NXS/T': len(re.findall(r'(?=N[^P][ST])', sequence)),         # N-glycosylation site
            'RGD': sequence.count('RGD'),                                 # Cell attachment motif
            'KDEL': sequence.count('KDEL'),                               # ER retention signal
            'NLS': len(re.findall(r'(?=K[KR].[KR])', sequence)),          # Nuclear localization signal
            'Phospho_S/T': len(re.findall(r'(?=[RK].{2}[ST])', sequence)) # Kinase phosphorylation site
        }

        for motif_name, count in motifs.items():
            descriptors[f"Motif_{motif_name}"] = count

        # Potential disulfide bonds (based on cysteine count)
        cys_count = sequence.count('C')
        descriptors["PotentialDisulfideBonds"] = cys_count // 2

        # Protein class estimation
        # This is a very simplified approach - real classification would be more complex
        helix_fraction = descriptors.get("Helix", 0)
        sheet_fraction = descriptors.get("Sheet", 0)
        if helix_fraction > 0.45:
            descriptors["EstimatedClass"] = "Alpha-rich"
        elif sheet_fraction > 0.45:
            descriptors["EstimatedClass"] = "Beta-rich"
        elif helix_fraction > 0.3 and sheet_fraction > 0.3:
            descriptors["EstimatedClass"] = "Alpha/Beta"
        else:
            descriptors["EstimatedClass"] = "Mixed"

        # Potential transmembrane regions (very simplified - just looking for hydrophobic stretches)
        hydrophobic_aas = set("AVILMFYW")
        tm_regions = 0
        hydrophobic_stretch = 0

        for aa in sequence:
            if aa in hydrophobic_aas:
                hydrophobic_stretch += 1
                if hydrophobic_stretch >= 20:  # Typical TM helix is ~20-25 AA
                    tm_regions += 1
                    # Restart the run so a 40-residue stretch counts as two regions
                    hydrophobic_stretch = 0
            else:
                hydrophobic_stretch = 0

        descriptors["PotentialTMRegions"] = tm_regions

        # Complexity score (simple heuristic)
        complexity = (
            len(set(sequence)) * 5 +  # Amino acid diversity
            (helix_fraction + sheet_fraction) * 100 +  # Secondary structure
            sum(motifs.values()) * 10 +  # Functional motifs
            tm_regions * 20  # Transmembrane regions
        )
        descriptors["ComplexityScore"] = complexity

    except Exception as e:
        print(f"Error calculating protein-specific features: {e}")

    return descriptors

# Demo: run the descriptor pipeline on three made-up proteins.
if __name__ == "__main__":
    # One record per protein, in the same column layout as the real table
    sample_df = pd.DataFrame(
        [
            (
                'PG1', 'P12345', 'PROT1_HUMAN', 'Sample protein 1',
                'MAEGEITTFTALTEKFNLPPGNYKKPKLLYCSNGGHFLRILPDGTVDGTRDRSDQHIQLQLSAESVGEVYIKSTETGQYLAMDTDGLLYGSQTPNEECLFLERLEENHYNTYISKKHAEKNWFVGLKKNGSCKRGPRTHYGQKAILFLPLPVSSD',
            ),
            (
                'PG2', 'P23456', 'PROT2_HUMAN', 'Sample protein 2',
                'MLSAEDMTFKTYEGWLLPESGGKKLLAQFTDNSSRNLDEDKPRQIFNILKDELEHLNNPQEEVLGLMNNSGESRNLRTWVSKDKKSHDIKGPLKLVNPDPTDLKKQINQLKKSKKNKTEEDNFNKQDTLPNEKKFTVPSNKIQEQPTKKQYF',
            ),
            (
                'PG3', 'P34567', 'PROT3_HUMAN', 'Sample protein 3',
                'MVTTFVALVLLSFASSTKPPTVTGRFADGPYGSSSAIKGWLLRDKNGKLNDLKHTTKSIQTQLSKFQSTLDELKFAVNQTLTRLKERVANLDVKPKLGYWVELDPANQAIMKGDIKNVNPRRLSIQHKGQVMTVWAFRFGVTEKEMLTEEQ',
            ),
        ],
        columns=['ProteinGroup', 'Accession', 'Uniprot', 'Description', 'Amino Acid Sequence'],
    )

    # Compute descriptors; this also writes the result to a CSV file
    detailed_df = detail_proteins(sample_df, output_file="detailed_proteins.csv")

    # Report which columns the pipeline added
    print(f"Original columns: {sample_df.columns.tolist()}")
    print(f"New columns: {detailed_df.columns.tolist()}")
    print(f"Added {detailed_df.shape[1] - sample_df.shape[1]} new columns")

Using 'Amino Acid Sequence' as the sequence column and 'ProteinGroup' as the protein group column
Saved detailed dataset to detailed_proteins.csv
Original columns: ['ProteinGroup', 'Accession', 'Uniprot', 'Description', 'Amino Acid Sequence']
New columns: ['ProteinGroup', 'Accession', 'Uniprot', 'Description', 'Amino Acid Sequence', 'SequenceLength', 'MolecularWeight', 'Aromaticity', 'InstabilityIndex', 'Stability', 'Count_A', 'Percent_A', 'Count_C', 'Percent_C', 'Count_D', 'Percent_D', 'Count_E', 'Percent_E', 'Count_F', 'Percent_F', 'Count_G', 'Percent_G', 'Count_H', 'Percent_H', 'Count_I', 'Percent_I', 'Count_K', 'Percent_K', 'Count_L', 'Percent_L', 'Count_M', 'Percent_M', 'Count_N', 'Percent_N', 'Count_P', 'Percent_P', 'Count_Q', 'Percent_Q', 'Count_R', 'Percent_R', 'Count_S', 'Percent_S', 'Count_T', 'Percent_T', 'Count_V', 'Percent_V', 'Count_W', 'Percent_W', 'Count_Y', 'Percent_Y', 'HydrophobicCount', 'HydrophobicPercent', 'PolarCount', 'PolarPercent', 'ChargedCount', 'ChargedPerc

In [6]:
import pandas as pd

# Load your protein data
# Each data line is split on whitespace: the first three tokens are the
# protein group, accession and Uniprot name, the last token is the amino
# acid sequence, and everything in between is the description.
rows = []
with open('../data/Protein-Sequence-Table.txt', 'r') as file:
    next(file, None)  # skip header line (tolerates an empty file)
    for line_number, line in enumerate(file, start=2):
        parts = line.split()

        if not parts:
            continue  # blank line, e.g. a trailing newline at end of file

        if len(parts) < 3:
            # Not enough fields to identify a protein; report and move on
            print(f"Warning: skipping malformed line {line_number}: {line.rstrip()!r}")
            continue

        protein_group, accession, uniprot = parts[:3]

        # Take the sequence as the last token rather than slicing at
        # rfind(' '): with no description present rfind returns -1 and the
        # slice would turn the sequence minus one character into the description.
        description = ' '.join(parts[3:-1])
        amino_acid_sequence = parts[-1] if len(parts) > 3 else ''

        rows.append([protein_group, accession, uniprot, description, amino_acid_sequence])


proteins = pd.DataFrame(rows, columns=['ProteinGroup', 'Accession', 'Uniprot', 'Description', 'Amino Acid Sequence'])

# Process the data (it will auto-detect your columns)
detailed_proteins = detail_proteins(proteins, output_file="protein_descriptors_output.csv")

# Save the enhanced dataset if desired
detailed_proteins.to_csv('../data/proteins_detailed.csv', index=False)

Using 'Amino Acid Sequence' as the sequence column and 'ProteinGroup' as the protein group column
Saved detailed dataset to protein_descriptors_output.csv
