In [2]:
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import pandas as pd
import numpy as np
from scipy.spatial.distance import pdist, squareform
from scipy.stats import bootstrap
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from collections import Counter

In [4]:
# Load the tab-separated protein table; the path is relative to the notebook's
# working directory.
proteins = pd.read_csv(
    '../data/Protein-Sequence-Table.txt',
    sep='\t',
)


In [5]:
# Preview the first five rows to confirm the table parsed as expected.
proteins.head(n=5)

Unnamed: 0,ProteinGroup,Accession,Uniprot,Description,Amino Acid Sequence
0,1,Q41358,SNAIB_SAMNI,Ribosome-inactivating protein SNAI,MRLVAKLLYLAVLAICGLGIHGALTHPRVTPPVYPSVSFNLTGADT...
1,2,P22972,LEC1_ULEEU,Anti-H(O) lectin 1,SDDLSFKFKNFSQNGKDLSFQGDASVIETGVLQLNKVGNNLPDETG...
2,4,A8WDZ4,A8WDZ4_CANEN,Concanavalin A,MAISKKSSLFLPIFTFITMFLMVVNKVSSSTHETNALHFMFNQFSK...
3,6,P09382,LEG1_HUMAN,Galectin-1,MACGLVASNLNLKPGECLRVRGEVAPDAKSFVLNLGKDSNNLCLHF...
4,7,P16045,LEG1_MOUSE,Galectin-1,MACGLVASNLNLKPGECLKVRGEVASDAKSFVLNLGKDSNNLCLHF...


In [9]:
# Feature engineering: derive sequence-based descriptors for every protein and
# store each one as a new column on `proteins`.
#
# Each sequence is wrapped in ONE ProteinAnalysis object that is reused for every
# Biopython-derived feature, and the amino-acid composition and secondary-structure
# results are computed once per protein rather than once per feature.
protein_sequences = proteins['Amino Acid Sequence']
protein_analyses = [ProteinAnalysis(seq) for seq in protein_sequences]
aa_compositions = [analysis.get_amino_acids_percent() for analysis in protein_analyses]
secondary_structures = [analysis.secondary_structure_fraction() for analysis in protein_analyses]


def _composition_sum(composition, residues):
    """Return the summed composition values of `residues`; absent residues count as 0.0."""
    return sum(composition.get(aa, 0.0) for aa in residues)


def _count_n_glycosylation_sequons(seq):
    """Count N-X-S/T motifs where X is not proline; overlapping motifs are all counted."""
    return sum(
        1
        for i in range(len(seq) - 2)
        if seq[i] == 'N' and seq[i + 1] != 'P' and seq[i + 2] in ('S', 'T')
    )


def _mean_flexibility(analysis):
    """Return the mean of the flexibility profile, or NaN when the profile is empty."""
    scores = analysis.flexibility()
    # np.mean([]) would also give NaN but emits a RuntimeWarning; be explicit instead.
    return np.mean(scores) if len(scores) > 0 else np.nan


# Definition: Length of amino acid sequence
# Usability: Fundamental property related to protein size that affects binding potential
# Correlations: Correlated with molecular weight
proteins['length'] = protein_sequences.apply(len)

# Definition: Molecular weight in Daltons
# Usability: Important for understanding protein size and potential interactions with glycans
# Correlations: Correlated with sequence length
proteins['molecular_weight'] = [analysis.molecular_weight() for analysis in protein_analyses]

# Definition: Measure of protein stability (>40 considered unstable)
# Usability: Helps predict protein half-life and stability in solution
# Correlations: Can relate to protein function and glycan interaction capabilities
proteins['instability_index'] = [analysis.instability_index() for analysis in protein_analyses]

# Definition: Net charge at physiological pH
# Usability: Critical for understanding electrostatic interactions with charged glycans
# Correlations: Related to isoelectric point and acidic/basic amino acid content
proteins['net_charge_pH7'] = [analysis.charge_at_pH(7.0) for analysis in protein_analyses]

# Definition: pH at which the protein has no net charge
# Usability: Important for understanding protein behavior in different pH environments
# Correlations: Related to the ratio of acidic to basic residues
proteins['isoelectric_point'] = [analysis.isoelectric_point() for analysis in protein_analyses]

# Definition: Fraction of each amino acid in sequence
# Usability: Basic composition that influences various protein properties
# Correlations: Related to many other features like charge, hydrophobicity
for aa in 'ACDEFGHIKLMNPQRSTVWY':
    proteins[f'frac_{aa}'] = [composition.get(aa, 0.0) for composition in aa_compositions]

# NOTE: the "*_percent" columns below are sums of the same per-residue values stored
# in the frac_* columns, so they are on the same scale as those columns.

# Definition: Polar amino acids that often engage in hydrogen bonding with glycans
# Usability: High percentages may indicate glycan-binding potential
# Correlations: May correlate with hydrophilicity
proteins['polar_aa_percent'] = [
    _composition_sum(composition, ['N', 'Q', 'S', 'T']) for composition in aa_compositions
]

# Definition: Basic amino acids that can form hydrogen bonds and electrostatic interactions
# Usability: Important for ionic interactions with negatively charged glycans
# Correlations: Contributes to net positive charge
proteins['basic_aa_percent'] = [
    _composition_sum(composition, ['K', 'R', 'H']) for composition in aa_compositions
]

# Definition: Acidic amino acids that can interact with positively charged groups
# Usability: Can form salt bridges with positive regions of glycans
# Correlations: Contributes to net negative charge
proteins['acidic_aa_percent'] = [
    _composition_sum(composition, ['D', 'E']) for composition in aa_compositions
]

# Definition: Aromatic amino acids known to engage in CH-π interactions with glycans
# Usability: High aromatic content often indicates glycan-binding potential
# Correlations: Related to hydrophobicity
proteins['aromatic_aa_percent'] = [
    _composition_sum(composition, ['F', 'Y', 'W']) for composition in aa_compositions
]

# Definition: Weighted score emphasizing the importance of aromatics in glycan binding
# Usability: Custom metric specifically designed for glycan interaction potential
# Correlations: May correlate with glycan binding affinity
# Weights (F=1.0, Y=1.2, W=1.5) are the notebook's own heuristic choice.
proteins['aromatic_binding_score'] = [
    (composition.get('F', 0.0) * 1.0 +
     composition.get('Y', 0.0) * 1.2 +
     composition.get('W', 0.0) * 1.5)
    for composition in aa_compositions
]

# Definition: Hydrophobic amino acids that can participate in hydrophobic interactions
# Usability: Important for understanding protein folding and interaction surfaces
# Correlations: Related to overall hydrophobicity (GRAVY score)
proteins['hydrophobic_aa_percent'] = [
    _composition_sum(composition, ['A', 'I', 'L', 'M', 'F', 'V', 'W', 'Y'])
    for composition in aa_compositions
]

# secondary_structure_fraction() is indexed as [0]=helix, [1]=turn, [2]=sheet below.

# Definition: Fraction of residues predicted to be in helical conformation
# Usability: Helps understand potential structural elements that may interact with glycans
# Correlations: May relate to protein stability and binding surface availability
proteins['helix_fraction'] = [fractions[0] for fractions in secondary_structures]

# Definition: Fraction of residues predicted to be in turn conformation
# Usability: Turns often expose residues for interactions with glycans
# Correlations: May relate to binding site accessibility
proteins['turn_fraction'] = [fractions[1] for fractions in secondary_structures]

# Definition: Fraction of residues predicted to be in sheet conformation
# Usability: Sheets can create flat binding surfaces for glycan interactions
# Correlations: May relate to protein stability
proteins['sheet_fraction'] = [fractions[2] for fractions in secondary_structures]

# Definition: Measure of aromatic amino acid content
# Usability: Related to protein stability and potential for stacking interactions with glycans
# Correlations: Connected to hydrophobicity and aromatic binding potential
proteins['aromaticity'] = [analysis.aromaticity() for analysis in protein_analyses]

# Definition: Grand average of hydropathy (measure of hydrophobicity)
# Usability: Positive values indicate hydrophobic proteins, negative indicate hydrophilic
# Correlations: Related to solubility and membrane association
proteins['hydrophobicity_gravy'] = [analysis.gravy() for analysis in protein_analyses]

# Definition: Potential N-glycosylation sites (N-X-S/T where X is not P)
# Usability: Identifies potential glycosylation sites on the protein itself
# Correlations: May indicate proteins involved in glycan processing
# Asked ChatGPT for this
proteins['n_glycosylation_sites'] = [
    _count_n_glycosylation_sequons(seq) for seq in protein_sequences
]

# Definition: Count of potential O-glycosylation sites (S/T)
# Usability: Identifies residues where O-glycans might attach
# Correlations: Higher counts might indicate glycoprotein potential
# Asked ChatGPT for this
proteins['potential_o_glycosylation_sites'] = [
    seq.count('S') + seq.count('T') for seq in protein_sequences
]


# Definition: Average flexibility score based on amino acid composition
# Usability: Flexible regions may be important for accommodating glycan structures
# Correlations: May correlate with binding adaptability
proteins['avg_flexibility'] = [_mean_flexibility(analysis) for analysis in protein_analyses]

In [10]:
# Show a single row to check that the engineered feature columns were added.
proteins.head(n=1)

Unnamed: 0,ProteinGroup,Accession,Uniprot,Description,Amino Acid Sequence,length,molecular_weight,instability_index,net_charge_pH7,isoelectric_point,...,helix_fraction,turn_fraction,sheet_fraction,aromaticity,hydrophobicity_gravy,n_glycosylation_sites,potential_o_glycosylation_sites,aromatic_triplets,avg_flexibility,sequence_entropy
0,1,Q41358,SNAIB_SAMNI,Ribosome-inactivating protein SNAI,MRLVAKLLYLAVLAICGLGIHGALTHPRVTPPVYPSVSFNLTGADT...,570,63100.9796,30.327544,-8.842719,5.52392,...,0.249123,0.305263,0.412281,0.085965,-0.069298,8,87,7,0.994882,4.136523
