In [2]:
import pandas as pd
import numpy as np
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio.SeqUtils import ProtParamData
from Bio.Seq import Seq
import os
import re
import warnings

def detail_proteins(proteins_df, sequence_col=None, name_col=None, output_file=None, monoisotopic=False):
    """
    Process a dataframe containing protein sequences and add detailed protein descriptors.

    Each row is analysed independently and the descriptors are attached to
    the row they were computed from (by position), so repeated or missing
    protein names never duplicate or mix up rows. Rows whose sequence is
    missing or cannot be analysed keep their original columns and get empty
    (NaN) descriptor columns.

    Parameters:
    -----------
    proteins_df : pandas.DataFrame
        DataFrame containing protein sequences
    sequence_col : str, optional
        Column name containing the protein sequence
        If None, will attempt to auto-detect based on common column names,
        then by sampling the values of each column
    name_col : str, optional
        Column name containing the protein name or ID
        If None, will attempt to auto-detect based on common column names;
        if none is found a 'ProteinID' column is created
    output_file : str, optional
        If provided, save the detailed dataframe to this file path (CSV)
    monoisotopic : bool, optional
        If True, calculate molecular weights using monoisotopic masses

    Returns:
    --------
    pandas.DataFrame
        Copy of the input (with a fresh 0..n-1 index) plus a 'CleanSequence'
        column and the protein descriptor columns

    Raises:
    -------
    ValueError
        If no sequence column is given and none can be detected
    """
    # Make a copy to avoid modifying the original
    df = proteins_df.copy()

    # Try to auto-detect the sequence column if not provided
    if sequence_col is None:
        potential_seq_cols = ['Sequence', 'ProteinSequence', 'AminoAcidSequence',
                              'Protein', 'AminoAcids', 'AA_Sequence', 'Seq', 'AASeq']
        sequence_col = next((col for col in potential_seq_cols if col in df.columns), None)

        # If still not found, use the first column whose values look like protein sequences
        if sequence_col is None:
            for col in df.columns:
                sample = df[col].dropna().astype(str).iloc[:5].tolist()
                # An empty sample (all-missing column) proves nothing: all([]) is True
                if sample and all(is_protein_sequence(s) for s in sample):
                    sequence_col = col
                    break

    # If we still couldn't find a sequence column, raise an error
    if sequence_col is None:
        raise ValueError("Could not identify a column containing protein sequences. Please specify using sequence_col parameter.")

    # Try to auto-detect the name column if not provided
    if name_col is None:
        potential_name_cols = ['Name', 'ID', 'ProteinName', 'ProteinID', 'Protein_Name', 'Protein_ID', 'AccessionNumber']
        name_col = next((col for col in potential_name_cols if col in df.columns), None)

    # If no name column, create one with indices
    if name_col is None:
        df['ProteinID'] = [f"Protein_{i}" for i in range(len(df))]
        name_col = 'ProteinID'

    print(f"Using '{sequence_col}' as the sequence column and '{name_col}' as the name column")

    # Clean sequences (remove all whitespace, convert to uppercase)
    df['CleanSequence'] = df[sequence_col].apply(_normalise_sequence)

    # One descriptor dict per row, in row order; {} marks a row that was skipped
    detailed_data = []
    for idx, row in df.iterrows():
        descriptors = {}
        try:
            descriptors = _describe_protein(row[name_col], row['CleanSequence'], monoisotopic)
        except Exception as e:
            # Best effort: one bad row must not abort the whole table
            print(f"Error processing protein at index {idx}: {e}")
        detailed_data.append(descriptors)

    # Attach descriptors by row position rather than by name, so duplicate
    # names cannot multiply rows. reindex() guarantees one descriptor row per
    # input row even when every dict is empty.
    detailed_df = pd.DataFrame(detailed_data).reindex(range(len(df)))
    result = pd.merge(df.reset_index(drop=True), detailed_df,
                      left_index=True, right_index=True, how='left')

    # Save to file if requested
    if output_file is not None:
        result.to_csv(output_file, index=False)
        print(f"Saved detailed dataset to {output_file}")

    return result


def _normalise_sequence(value):
    """Return *value* upper-cased with all whitespace removed; '' for missing values."""
    if not isinstance(value, str):
        # str(NaN) / str(None) would yield "NAN" / "NONE", which look like peptides
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return ''
        value = str(value)
    return ''.join(value.split()).upper()


def _describe_protein(protein_name, sequence, monoisotopic):
    """Return the descriptor dict for one cleaned sequence, or {} if it must be skipped."""
    if not sequence:
        print(f"Warning: Empty sequence for {protein_name}, skipping")
        return {}

    if not is_valid_protein_sequence(sequence):
        print(f"Warning: Sequence for {protein_name} contains invalid amino acid characters")
        # clean_protein_sequence keeps only valid characters, so no second
        # validity check is needed; a sequence emptied by cleaning is reported
        # below because calculate_protein_descriptors returns None for it.
        sequence = clean_protein_sequence(sequence)

    descriptors = calculate_protein_descriptors(sequence, monoisotopic=monoisotopic)
    if descriptors is None:
        print(f"Warning: Could not calculate descriptors for {protein_name}")
        return {}

    # Add protein-specific features (mutates descriptors in place)
    add_protein_specific_features(descriptors, sequence)
    return descriptors

def is_protein_sequence(sequence):
    """
    Heuristically decide whether a value looks like a protein sequence.

    Parameters:
    -----------
    sequence : str
        Candidate value; anything that is not a str is rejected

    Returns:
    --------
    bool
        True when, after removing spaces, the string has at least 5
        characters and at least 80% of them are one of the 20 standard
        amino acid letters (case-insensitive); False otherwise
    """
    if not isinstance(sequence, str):
        return False

    compact = sequence.replace(' ', '')
    total = len(compact)

    # Very short strings carry too little evidence either way
    if total < 5:
        return False

    standard_residues = "ACDEFGHIKLMNPQRSTVWY"
    matching = len([ch for ch in compact.upper() if ch in standard_residues])

    return matching / total >= 0.8

def is_valid_protein_sequence(sequence):
    """
    Tell whether every character of a sequence is an accepted residue symbol.

    Accepted symbols are the 20 standard amino acid letters plus the stop
    codon marker (*) and the gap marker (-); matching is case-insensitive.

    Parameters:
    -----------
    sequence : str
        Protein sequence

    Returns:
    --------
    bool
        True if no character falls outside the accepted symbols (an empty
        string is therefore valid), False otherwise
    """
    accepted = frozenset("ACDEFGHIKLMNPQRSTVWY*-")
    symbols_used = set(sequence.upper())
    return symbols_used.issubset(accepted)

def clean_protein_sequence(sequence):
    """
    Upper-case a protein sequence and drop every unaccepted character.

    Parameters:
    -----------
    sequence : str
        Protein sequence to clean

    Returns:
    --------
    str
        The upper-cased sequence containing only the 20 standard amino acid
        letters, stop codon markers (*) and gap markers (-), in their
        original order
    """
    accepted = frozenset("ACDEFGHIKLMNPQRSTVWY*-")
    kept = []
    for symbol in sequence.upper():
        if symbol in accepted:
            kept.append(symbol)
    return ''.join(kept)

def calculate_protein_descriptors(sequence, monoisotopic=False):
    """
    Calculate descriptors for a protein sequence.

    Every descriptor is computed on a best-effort basis: if one calculation
    fails its value is stored as None and the remaining descriptors are
    still computed.

    Parameters:
    -----------
    sequence : str
        Protein sequence; stop codon (*) and gap (-) characters are removed
        before analysis
    monoisotopic : bool
        If True, use monoisotopic masses instead of average masses

    Returns:
    --------
    dict or None
        Dictionary of calculated descriptors, or None when nothing is left
        after removing '*' and '-' or the ProteinAnalysis object cannot be
        created
    """
    # Remove any stop codons (*) or gaps (-) for analysis
    analysis_seq = sequence.replace('*', '').replace('-', '')

    # Ensure we have at least one amino acid
    if not analysis_seq:
        return None

    # Create ProteinAnalysis object (ignoring any warnings)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        try:
            protein = ProteinAnalysis(analysis_seq, monoisotopic=monoisotopic)
        except Exception as e:
            print(f"Error creating ProteinAnalysis object: {e}")
            return None

    descriptors = {}

    # Basic properties: raw length (including * and -) and analysed length
    descriptors["SequenceLength"] = len(sequence)
    descriptors["ValidSequenceLength"] = len(analysis_seq)

    # Amino acid composition
    aa_counts = protein.count_amino_acids()
    aa_percents = protein.amino_acids_percent

    # Add each amino acid count and percentage as a separate column
    for aa in sorted(aa_counts.keys()):
        descriptors[f"{aa}_count"] = aa_counts[aa]
        descriptors[f"{aa}_percent"] = aa_percents[aa]

    # Physical properties.
    # `except Exception` (not a bare except) keeps the best-effort behaviour
    # without swallowing KeyboardInterrupt / SystemExit.
    try:
        descriptors["MolecularWeight"] = protein.molecular_weight()
    except Exception:
        descriptors["MolecularWeight"] = None

    try:
        descriptors["IsoelectricPoint"] = protein.isoelectric_point()
    except Exception:
        descriptors["IsoelectricPoint"] = None

    # Calculate charge at various pH levels
    for ph in [5.0, 7.0, 9.0]:
        try:
            descriptors[f"Charge_pH{ph}"] = protein.charge_at_pH(ph)
        except Exception:
            descriptors[f"Charge_pH{ph}"] = None

    # Structural properties: all four values are kept or dropped together
    try:
        helix, turn, sheet = protein.secondary_structure_fraction()
        dominant = get_dominant_structure(helix, turn, sheet)
    except Exception:
        helix = turn = sheet = dominant = None
    descriptors["HelixFraction"] = helix
    descriptors["TurnFraction"] = turn
    descriptors["SheetFraction"] = sheet
    descriptors["PredictedSecondaryStructure"] = dominant

    # Stability and other properties
    try:
        descriptors["Aromaticity"] = protein.aromaticity()
    except Exception:
        descriptors["Aromaticity"] = None

    try:
        instability = protein.instability_index()
    except Exception:
        instability = None
    descriptors["InstabilityIndex"] = instability
    if instability is None:
        descriptors["Stability"] = None
    else:
        # Instability index below 40 is labelled stable
        descriptors["Stability"] = "Stable" if instability < 40 else "Unstable"

    # GRAVY (hydropathy)
    try:
        descriptors["GRAVY_KyteDoolitle"] = protein.gravy()
    except Exception:
        descriptors["GRAVY_KyteDoolitle"] = None

    # Try other hydrophobicity scales. Each scale is attempted on its own so
    # that one failing scale does not drop the ones after it. Scales whose
    # key is not in ProtParamData.gravy_scales are skipped without a column.
    # NOTE(review): "Hopp-Woods" maps to "hw", which scale_name_to_key's own
    # comment says lives in a different ProtParamData dict, so that column is
    # presumably never emitted - confirm.
    for scale_name in ["Eisenberg", "Hopp-Woods", "Fauchere", "Rose"]:
        try:
            scale_key = scale_name_to_key(scale_name)
            if scale_key in ProtParamData.gravy_scales:
                descriptors[f"GRAVY_{scale_name}"] = protein.gravy(scale=scale_key)
        except Exception:
            # Optional descriptor: leave the column out for this protein
            continue

    # Extinction coefficient (reduced cysteines, oxidized cysteines)
    try:
        ec_reduced, ec_oxidized = protein.molar_extinction_coefficient()
    except Exception:
        ec_reduced = ec_oxidized = None
    descriptors["ExtinctionCoefficient_reduced"] = ec_reduced
    descriptors["ExtinctionCoefficient_oxidized"] = ec_oxidized

    return descriptors

def scale_name_to_key(scale_name):
    """Translate a user-friendly scale name into its ProtParamData key.

    Names without an entry in the table are returned unchanged.
    """
    friendly_names = ("Kyte-Doolittle", "Hopp-Woods", "Eisenberg", "Fauchere", "Rose")
    # "hw" is in a different dict in ProtParamData
    data_keys = ("KyteDoolitle", "hw", "Eisenberg", "Fauchere", "Rose")
    lookup = dict(zip(friendly_names, data_keys))
    try:
        return lookup[scale_name]
    except KeyError:
        return scale_name

def get_dominant_structure(helix, turn, sheet):
    """
    Name the secondary structure with the largest fraction.

    Parameters:
    -----------
    helix : float
        Fraction of amino acids predicted to be in alpha helix
    turn : float
        Fraction of amino acids predicted to be in turns
    sheet : float
        Fraction of amino acids predicted to be in beta sheet

    Returns:
    --------
    str
        "Helix", "Turn" or "Sheet"; on a tie the earlier name in that order wins
    """
    winner, best = "Helix", helix
    # Strictly-greater comparisons keep the earlier candidate on ties
    if turn > best:
        winner, best = "Turn", turn
    if sheet > best:
        winner = "Sheet"
    return winner

def add_protein_specific_features(descriptors, sequence):
    """
    Add protein-specific features to the descriptors dictionary.

    Counts and ratios are computed on the sequence with gap (-) and stop
    codon (*) characters removed. All ratios are 0 and the aliphatic index
    is None when no residues remain. If anything fails part-way, the error
    is printed and the features computed so far are kept.

    Parameters:
    -----------
    descriptors : dict
        Dictionary of descriptors to add to (modified in place)
    sequence : str
        Protein sequence

    Returns:
    --------
    dict
        The same `descriptors` object that was passed in
    """
    # Create clean sequence without gaps or stop codons
    clean_seq = sequence.replace('-', '').replace('*', '')
    length = len(clean_seq)

    def count_of(residues):
        # Total occurrences of any of the given one-letter codes
        return sum(clean_seq.count(aa) for aa in residues)

    def ratio(count):
        # Fraction of the clean sequence; defined as 0 for an empty sequence
        return count / length if length > 0 else 0

    try:
        # Charged residues
        pos_charged = count_of("RK")  # Arg, Lys
        neg_charged = count_of("DE")  # Asp, Glu
        descriptors["PositivelyChargedResidues"] = pos_charged
        descriptors["NegativelyChargedResidues"] = neg_charged
        descriptors["NetCharge"] = pos_charged - neg_charged
        descriptors["ChargedResiduesRatio"] = ratio(pos_charged + neg_charged)

        # Polar and nonpolar residues
        polar = count_of("QNHSTYCRKDE")  # Gln, Asn, His, Ser, Thr, Tyr, Cys, Arg, Lys, Asp, Glu
        nonpolar = count_of("AVILMFWPG")  # Ala, Val, Ile, Leu, Met, Phe, Trp, Pro, Gly
        descriptors["PolarResidues"] = polar
        descriptors["NonpolarResidues"] = nonpolar
        descriptors["PolarRatio"] = ratio(polar)

        # Aromatic residues
        aromatic = count_of("FYW")  # Phe, Tyr, Trp
        descriptors["AromaticResidues"] = aromatic
        descriptors["AromaticRatio"] = ratio(aromatic)

        # Aliphatic residues
        aliphatic = count_of("AVIL")  # Ala, Val, Ile, Leu
        descriptors["AliphaticResidues"] = aliphatic
        descriptors["AliphaticRatio"] = ratio(aliphatic)

        # Aliphatic index (Ikai, 1980)
        # AI = X(Ala) + a * X(Val) + b * (X(Ile) + X(Leu))
        # where X(...) are mole fractions and a = 2.9, b = 3.9 are the
        # relative volumes of the aliphatic side chains.
        if length > 0:
            mole_fractions = {aa: clean_seq.count(aa) / length for aa in "AVIL"}
            aliphatic_index = (mole_fractions['A'] +
                               2.9 * mole_fractions['V'] +
                               3.9 * (mole_fractions['I'] + mole_fractions['L']))
            descriptors["AliphaticIndex"] = aliphatic_index * 100  # Convert to percentage
        else:
            # Undefined without residues (explicit check instead of catching ZeroDivisionError)
            descriptors["AliphaticIndex"] = None

        # Cysteine residues (important for disulfide bonds)
        cys_count = clean_seq.count('C')
        descriptors["CysteineCount"] = cys_count
        descriptors["CysteineRatio"] = ratio(cys_count)
        descriptors["PotentialDisulfideBonds"] = cys_count // 2

        # Potential post-translational modification sites
        # N-glycosylation: N-X-S/T where X is any amino acid except P.
        # Sliding over every 3-residue window counts overlapping motifs too.
        descriptors["NglycosylationSites"] = sum(
            1
            for first, middle, last in zip(clean_seq, clean_seq[1:], clean_seq[2:])
            if first == 'N' and middle != 'P' and last in 'ST'
        )

        # Phosphorylation sites (simplified): every S, T, Y
        descriptors["PotentialPhosphorylationSites"] = count_of("STY")

        # Determine protein class based on features
        descriptors["PredictedProteinClass"] = determine_protein_class(descriptors, clean_seq)

    except Exception as e:
        # Best effort: keep whatever was computed before the failure
        print(f"Error calculating protein-specific features: {e}")

    return descriptors

def determine_protein_class(descriptors, sequence):
    """
    Make a simple prediction of protein class based on properties.

    This is a rule-of-thumb decision chain, not a validated classifier. The
    rules are checked in order and the first match wins. Descriptors that
    are missing or stored as None (a calculation that failed) are treated as
    "no evidence" for their rule instead of raising an error.

    Parameters:
    -----------
    descriptors : dict
        Dictionary of protein descriptors. Keys read: "GRAVY_KyteDoolitle",
        "ChargedResiduesRatio", "CysteineRatio", "AromaticRatio",
        "Stability", "PositivelyChargedResidues",
        "NegativelyChargedResidues", "HelixFraction", "SheetFraction",
        "InstabilityIndex"
    sequence : str
        Protein sequence

    Returns:
    --------
    str
        Predicted protein class
    """
    def numeric(key, default):
        # Missing key and explicit None both fall back to the neutral default
        value = descriptors.get(key, default)
        return default if value is None else value

    # Membrane protein indicators
    hydrophobic_threshold = 0.0  # GRAVY score threshold for potential membrane proteins

    # Check for transmembrane helices - look for the longest uninterrupted
    # run of hydrophobic residues
    hydrophobic_stretch = 0
    max_hydrophobic_stretch = 0
    for aa in sequence:
        if aa in "AVILMFYW":  # Hydrophobic amino acids
            hydrophobic_stretch += 1
            max_hydrophobic_stretch = max(max_hydrophobic_stretch, hydrophobic_stretch)
        else:
            hydrophobic_stretch = 0

    # Potential transmembrane helix if we have a stretch of ~20 hydrophobic residues
    tm_helix_indicator = max_hydrophobic_stretch >= 18

    # Get key properties from descriptors; each default can never trigger its rule
    gravy = numeric("GRAVY_KyteDoolitle", -999)
    charged_ratio = numeric("ChargedResiduesRatio", 0)
    cys_ratio = numeric("CysteineRatio", 0)
    aromatic_ratio = numeric("AromaticRatio", 0)
    stability = numeric("Stability", "Unknown")

    # Decision chain for protein class
    if gravy > hydrophobic_threshold or tm_helix_indicator:
        return "Membrane Protein"
    if cys_ratio > 0.05:  # High cysteine content
        return "Disulfide-rich Protein"
    if charged_ratio > 0.25:  # High proportion of charged residues
        if numeric("PositivelyChargedResidues", 0) > numeric("NegativelyChargedResidues", 0):
            return "Basic Protein"
        return "Acidic Protein"
    if numeric("HelixFraction", 0) > 0.6:
        return "Alpha-helical Protein"
    if numeric("SheetFraction", 0) > 0.45:
        return "Beta-sheet Protein"
    if aromatic_ratio > 0.1:  # High aromatic content
        return "Aromatic-rich Protein"
    if stability == "Stable" and numeric("InstabilityIndex", 100) < 30:
        return "Highly Stable Protein"
    if stability == "Unstable" and numeric("InstabilityIndex", 0) > 50:
        return "Highly Unstable Protein"
    return "Globular Protein"



In [3]:
import pandas as pd

# Load your protein data.
# Each data line is whitespace-separated: protein group, accession, UniProt
# entry name, a free-text description (may contain spaces) and, as the last
# token, the amino acid sequence.
rows = []
with open('../data/Protein-Sequence-Table.txt', 'r') as file:
    next(file, None)  # skip header line (tolerates an empty file)
    for line_number, line in enumerate(file, start=2):
        parts = line.split()

        # Blank lines (e.g. a trailing newline at the end of the file) carry no record
        if not parts:
            continue
        if len(parts) < 3:
            print(f"Warning: skipping malformed line {line_number}: {line.strip()!r}")
            continue

        protein_group, accession, uniprot = parts[:3]

        # The last remaining token is the sequence; everything before it is
        # the description. Working on the token list (instead of searching a
        # joined string for its last space) also handles rows that have no
        # description at all.
        remainder = parts[3:]
        amino_acid_sequence = remainder[-1] if remainder else ''
        description = ' '.join(remainder[:-1])

        rows.append([protein_group, accession, uniprot, description, amino_acid_sequence])


proteins = pd.DataFrame(rows, columns=['ProteinGroup', 'Accession', 'Uniprot', 'Description', 'Amino Acid Sequence'])


# Process the sample dataframe
detailed_df = detail_proteins(proteins, output_file="detailed_proteins.csv")

# Print the columns in the new dataframe
print(f"Original columns: {proteins.columns.tolist()}")
print(f"New columns: {detailed_df.columns.tolist()}")
print(f"Added {len(detailed_df.columns) - len(proteins.columns)} new columns")

Using 'Amino Acid Sequence' as the sequence column and 'ProteinID' as the name column
Saved detailed dataset to detailed_proteins.csv
Original columns: ['ProteinGroup', 'Accession', 'Uniprot', 'Description', 'Amino Acid Sequence']
New columns: ['ProteinGroup', 'Accession', 'Uniprot', 'Description', 'Amino Acid Sequence', 'ProteinID', 'CleanSequence', 'SequenceLength', 'ValidSequenceLength', 'A_count', 'A_percent', 'C_count', 'C_percent', 'D_count', 'D_percent', 'E_count', 'E_percent', 'F_count', 'F_percent', 'G_count', 'G_percent', 'H_count', 'H_percent', 'I_count', 'I_percent', 'K_count', 'K_percent', 'L_count', 'L_percent', 'M_count', 'M_percent', 'N_count', 'N_percent', 'P_count', 'P_percent', 'Q_count', 'Q_percent', 'R_count', 'R_percent', 'S_count', 'S_percent', 'T_count', 'T_percent', 'V_count', 'V_percent', 'W_count', 'W_percent', 'Y_count', 'Y_percent', 'MolecularWeight', 'IsoelectricPoint', 'Charge_pH5.0', 'Charge_pH7.0', 'Charge_pH9.0', 'HelixFraction', 'TurnFraction', 'Sheet

In [4]:
detailed_df

Unnamed: 0,ProteinGroup,Accession,Uniprot,Description,Amino Acid Sequence,ProteinID,CleanSequence,SequenceLength,ValidSequenceLength,A_count,...,AromaticRatio,AliphaticResidues,AliphaticRatio,AliphaticIndex,CysteineCount,CysteineRatio,PotentialDisulfideBonds,NglycosylationSites,PotentialPhosphorylationSites,PredictedProteinClass
0,1,Q41358,SNAIB_SAMNI,Ribosome-inactivating protein SNAI,MRLVAKLLYLAVLAICGLGIHGALTHPRVTPPVYPSVSFNLTGADT...,Protein_0,MRLVAKLLYLAVLAICGLGIHGALTHPRVTPPVYPSVSFNLTGADT...,570,570,34,...,0.085965,176,0.308772,93.824561,12,0.021053,6,8,106,Globular Protein
1,2,P22972,LEC1_ULEEU,Anti-H(O) lectin 1,SDDLSFKFKNFSQNGKDLSFQGDASVIETGVLQLNKVGNNLPDETG...,Protein_1,SDDLSFKFKNFSQNGKDLSFQGDASVIETGVLQLNKVGNNLPDETG...,243,243,15,...,0.119342,64,0.263374,78.230453,1,0.004115,0,2,52,Aromatic-rich Protein
2,4,A8WDZ4,A8WDZ4_CANEN,Concanavalin A,MAISKKSSLFLPIFTFITMFLMVVNKVSSSTHETNALHFMFNQFSK...,Protein_2,MAISKKSSLFLPIFTFITMFLMVVNKVSSSTHETNALHFMFNQFSK...,290,290,22,...,0.096552,88,0.303448,88.413793,0,0.0,0,2,69,Globular Protein
3,6,P09382,LEG1_HUMAN,Galectin-1,MACGLVASNLNLKPGECLRVRGEVAPDAKSFVLNLGKDSNNLCLHF...,Protein_3,MACGLVASNLNLKPGECLRVRGEVAPDAKSFVLNLGKDSNNLCLHF...,135,135,14,...,0.096296,40,0.296296,78.074074,6,0.044444,3,1,11,Highly Stable Protein
4,7,P16045,LEG1_MOUSE,Galectin-1,MACGLVASNLNLKPGECLKVRGEVASDAKSFVLNLGKDSNNLCLHF...,Protein_4,MACGLVASNLNLKPGECLKVRGEVASDAKSFVLNLGKDSNNLCLHF...,135,135,12,...,0.088889,36,0.266667,73.037037,6,0.044444,3,0,13,Highly Stable Protein
5,8,P18891,LECF_ALEAU,Fucose-specific lectin,MPTEFLYTSKIAAISWAATGGRQQRVYFQDLNGKIREAQRGGDNPW...,Protein_5,MPTEFLYTSKIAAISWAATGGRQQRVYFQDLNGKIREAQRGGDNPW...,313,313,30,...,0.121406,84,0.268371,70.798722,2,0.00639,1,2,68,Aromatic-rich Protein
6,9,P18674,LECA_MACPO,Agglutinin alpha chain,GVTFDDGAYTGIREINFEYNSETAIGGLRVTYDLNGMPFVAEDHKS...,Protein_6,GVTFDDGAYTGIREINFEYNSETAIGGLRVTYDLNGMPFVAEDHKS...,133,133,3,...,0.165414,34,0.255639,84.887218,0,0.0,0,1,32,Membrane Protein
7,10,P10968,AGI1_WHEAT,Agglutinin isolectin 1,MKMMSTRALALGAAAVLAFAAATAQAQRCGEQGSNMECPNNLCCSQ...,Protein_7,MKMMSTRALALGAAAVLAFAAATAQAQRCGEQGSNMECPNNLCCSQ...,212,212,24,...,0.075472,39,0.183962,37.5,32,0.150943,16,1,32,Disulfide-rich Protein
8,13,P17931,LEG3_HUMAN,Galectin-3,MADNFSLHDALSGSGNPNPQGWPGAWGNQPAGAGGYPGASYPGAYP...,Protein_8,MADNFSLHDALSGSGNPNPQGWPGAWGNQPAGAGGYPGASYPGAYP...,250,250,27,...,0.1,66,0.264,66.04,1,0.004,0,1,34,Highly Stable Protein
9,16,Q71QF2,Q71QF2_AMACA,Agglutinin,MAGLPVIMCLKSNNNQKYLRYQSDNIQQYGLLQFSADKILDPLAQF...,Protein_9,MAGLPVIMCLKSNNNQKYLRYQSDNIQQYGLLQFSADKILDPLAQF...,304,304,14,...,0.128289,78,0.256579,80.460526,5,0.016447,2,3,54,Aromatic-rich Protein
