In [1]:
import pandas as pd
import rdkit.Chem as Chem
from rdkit.Chem import rdFingerprintGenerator
from scipy.spatial.distance import pdist, squareform
from sklearn.cluster import KMeans
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import numpy as np

In [48]:
def cluster_glycans(glycans, radius, fp_size, n_clusters):
    """Append Morgan count-fingerprint features and a KMeans cluster label to each glycan.

    Parameters
    ----------
    glycans : pandas.DataFrame
        Must contain a 'SMILES' column.
    radius : int
        Radius passed to the Morgan fingerprint generator.
    fp_size : int
        Length of the fingerprint vector (number of ``mf_<i>`` columns).
    n_clusters : int
        Number of KMeans clusters.

    Returns
    -------
    pandas.DataFrame
        A new frame: the input columns, then ``mf_0 .. mf_{fp_size-1}``,
        then ``cluster_label``. The caller's frame is not modified.
    """
    # Build the generator once instead of once per molecule.
    # includeChirality=True  -> enantiomers get different fingerprints
    # includeChirality=False -> mirror-image molecules are treated as the same
    morgan_generator = rdFingerprintGenerator.GetMorganGenerator(
        radius=radius, fpSize=fp_size, includeChirality=True
    )
    feature_names = [f"mf_{i}" for i in range(fp_size)]

    def get_morgan_count_fingerprint(smiles):
        # Missing (non-string) or unparsable SMILES both fall back to an all-zero vector.
        mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
        if mol is None:
            return dict.fromkeys(feature_names, 0)

        # Sparse {bit index: count} mapping, expanded to a full fp_size-length vector.
        bit_counts = morgan_generator.GetCountFingerprint(mol).GetNonzeroElements()
        return {name: bit_counts.get(i, 0) for i, name in enumerate(feature_names)}

    # One row per glycan, one column per fingerprint component: (n_glycans, fp_size).
    # Built in a single DataFrame construction (much faster than .apply(pd.Series)),
    # and indexed like `glycans` so the concat below aligns row by row.
    fingerprint_df = pd.DataFrame(
        [get_morgan_count_fingerprint(smiles) for smiles in glycans['SMILES']],
        index=glycans.index,
        columns=feature_names,
    )

    # Drop fingerprint columns left over from a previous call so that re-running
    # the notebook cell does not accumulate duplicate mf_* columns.
    glycans = pd.concat(
        [glycans.drop(columns=feature_names, errors='ignore'), fingerprint_df],
        axis=1,
    )

    finger_counts_matrix = fingerprint_df.values
    # pdist gives the Euclidean distance for every pair of glycans; squareform turns
    # that into a symmetric (n_glycans, n_glycans) matrix.
    dist_matrix = squareform(pdist(finger_counts_matrix, metric="euclidean"))

    # KMeans is fitted on the rows of the distance matrix, i.e. each glycan is
    # represented by its distances to every glycan, not by its raw fingerprint.
    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    labels = kmeans.fit_predict(dist_matrix)

    glycans['cluster_label'] = labels

    return glycans

def cluster_proteins(proteins, n_clusters):
    """Append sequence-derived features and a KMeans cluster label to each protein.

    Parameters
    ----------
    proteins : pandas.DataFrame
        Must contain an 'Amino Acid Sequence' column.
    n_clusters : int
        Number of KMeans clusters.

    Returns
    -------
    pandas.DataFrame
        A new frame: the input columns, then the feature columns computed below,
        then ``cluster_label``. The caller's frame is not modified.
    """

    def compute_protein_features(seq):
        """Return a dict of scalar features for one amino-acid sequence."""
        # ProteinAnalysis is a tool from Biopython
        analysis = ProteinAnalysis(seq)
        features = {}

        # Basic features
        features['length'] = len(seq)
        features['mw'] = analysis.molecular_weight()
        features['instability_index'] = analysis.instability_index()
        features['net_charge_pH7'] = analysis.charge_at_pH(7.0)

        aa_percent = analysis.get_amino_acids_percent()

        # N, Q, S, T: polar amino acids, often involved in hydrogen bonding with glycans
        # K, R: basic amino acids, can form hydrogen bonds and electrostatic bonds
        # D, E: acidic amino acids, can interact with positively charged groups of glycans
        for aa in ['N', 'Q', 'S', 'T', 'K', 'R', 'D', 'E']:
            features[f'frac_{aa}'] = aa_percent.get(aa, 0.0)

        # F, Y, W are aromatic amino acids which bind with glycans
        aromatic_residues = ['F', 'Y', 'W']
        for aa in aromatic_residues:
            features[f'frac_{aa}'] = aa_percent.get(aa, 0.0)

        # Computed once, after the loop (it used to be recomputed on every iteration).
        features['aromatic_binding_score'] = sum(
            aa_percent.get(aa, 0.0) for aa in aromatic_residues
        )

        features['aromaticity'] = analysis.aromaticity()
        features['hydrophobicity'] = analysis.gravy()

        return features

    feature_dicts = proteins['Amino Acid Sequence'].apply(compute_protein_features)
    # Reuse the index of `proteins` so the column-wise concat aligns row by row
    # even when `proteins` does not carry a default RangeIndex.
    features_df = pd.DataFrame(list(feature_dicts), index=proteins.index)

    # Drop feature columns left over from a previous call so that re-running the
    # notebook cell does not accumulate duplicate columns.
    proteins = pd.concat(
        [proteins.drop(columns=features_df.columns, errors='ignore'), features_df],
        axis=1,
    )

    # Cluster on exactly the freshly computed features.
    # NOTE(review): the features are passed to KMeans raw and unscaled - confirm
    # that this is intended, since 'length' and 'mw' are on a much larger scale.
    feature_data = features_df.values

    # apply k means clustering
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    proteins['cluster_label'] = kmeans.fit_predict(feature_data)

    return proteins

def _split_ids_by_cluster(entities_df, id_column, n_splits):
    """Shuffle the IDs of each cluster and cut them into n_splits contiguous chunks.

    Returns a dict {cluster_label: [ids_fold_0, ids_fold_1, ...]}. Shuffling uses
    the global NumPy random state, which the caller is expected to have seeded.
    """
    folds_by_cluster = {}
    for cluster in entities_df['cluster_label'].unique():
        in_cluster = entities_df['cluster_label'] == cluster
        cluster_ids = entities_df.loc[in_cluster, id_column].tolist()
        np.random.shuffle(cluster_ids)

        # Approximately equal sized groups; some may be empty for small clusters.
        n_ids = len(cluster_ids)
        folds_by_cluster[cluster] = [
            cluster_ids[int(i * n_ids / n_splits):int((i + 1) * n_ids / n_splits)]
            for i in range(n_splits)
        ]
    return folds_by_cluster


def stratified_kfold_split(fractions_df, glycans_df, proteins_df, n_splits=5, random_state=42):
    """
    Create a cluster-stratified k-fold split over glycans and proteins.

    For every fold, 1/n_splits of the glycans of each glycan cluster and
    1/n_splits of the proteins of each protein cluster are held out. A row of
    fractions_df goes to the test set when its GlycanID OR its ProteinGroup is
    held out; every other row goes to the training set. Consequently:
    1. No held-out GlycanID or ProteinGroup appears in the training rows
    2. Held-out IDs follow the cluster_label distribution of glycans and proteins
    3. The test share of rows is larger than 1/n_splits, so a larger n_splits
       (something like 8) is needed to get a test share of around 20%

    Parameters:
    -----------
    fractions_df : pandas.DataFrame
        DataFrame containing ['ObjId', 'ProteinGroup', 'Concentration', 'GlycanID', 'f']
    glycans_df : pandas.DataFrame
        DataFrame containing ['Name', 'cluster_label'] where Name maps to GlycanID
    proteins_df : pandas.DataFrame
        DataFrame containing ['ProteinGroup', 'cluster_label']
    n_splits : int, default=5
        Number of folds for cross-validation
    random_state : int, default=42
        Random seed for reproducibility (seeds the global NumPy random state)

    Returns:
    --------
    fold_indices : list of tuples
        List of (train_indices, test_indices) pairs for each fold, where both
        elements are index objects of fractions_df
    """
    # Set random seed
    np.random.seed(random_state)

    # {cluster: [ids of fold 0, ids of fold 1, ...]} - glycans are shuffled first,
    # then proteins, so the random stream is consumed in a fixed order.
    glycan_folds = _split_ids_by_cluster(glycans_df, 'Name', n_splits)
    protein_folds = _split_ids_by_cluster(proteins_df, 'ProteinGroup', n_splits)

    fold_indices = []
    for fold_idx in range(n_splits):
        # Held-out IDs of this fold: the fold_idx-th chunk of every cluster.
        test_glycans = [
            glycan for chunks in glycan_folds.values() for glycan in chunks[fold_idx]
        ]
        test_proteins = [
            protein for chunks in protein_folds.values() for protein in chunks[fold_idx]
        ]

        # A row is a test row if its glycan OR its protein is held out.
        # ex: n_splits=2 holds out 50% of glycans and 50% of proteins, which
        # puts far more than 50% of the rows in the test set.
        is_test = (fractions_df['GlycanID'].isin(test_glycans) |
                   fractions_df['ProteinGroup'].isin(test_proteins))

        test_indices = fractions_df[is_test].index
        train_indices = fractions_df[~is_test].index

        fold_indices.append((train_indices, test_indices))

    return fold_indices

In [49]:
# Load the three tab-separated input tables: glycan structures, protein
# sequences and the training fractions.
glycans_df, proteins_df, fractions_df = (
    pd.read_csv(table_path, sep='\t')
    for table_path in (
        '../data/Glycan-Structures-CFG611.txt',
        '../data/Protein-Sequence-Table.txt',
        '../pipeline/data/Train_Fractions.csv',
    )
)

In [75]:
# Median of the 'f' column of the training table
np.median(fractions_df['f'].to_numpy())

np.float64(4.435205128148839e-06)

# Train-Validation K-fold split

## Train-Validation OR Operation K-fold split

In [66]:
# Settings for the glycan fingerprints, both clusterings and the k-fold split
radius = 3
fp_size = 1024
n_clusters = 3
n_protein_clusters = 3
k_folds = 8
random_state = 42

# Both glycans and proteins need a cluster_label before the stratified
# k-fold split for training can be built.
glycans_df = cluster_glycans(
    glycans_df, radius=radius, fp_size=fp_size, n_clusters=n_clusters
)
proteins_df = cluster_proteins(proteins_df, n_clusters=n_protein_clusters)

full_indicies_OR = stratified_kfold_split(
    fractions_df,
    glycans_df,
    proteins_df,
    n_splits=k_folds,
    random_state=random_state,
)



In [67]:
# Report how many rows each OR-fold puts into training and validation
for fold_idx, (train_idx, test_idx) in enumerate(full_indicies_OR):
    # Rows belonging to this fold
    train_data = fractions_df.loc[train_idx]
    val_data = fractions_df.loc[test_idx]

    n_train, n_val = len(train_data), len(val_data)
    n_both = n_train + n_val
    train_pct = (n_train / n_both) * 100
    val_pct = (n_val / n_both) * 100

    print(f"Training with {n_train} samples: ({train_pct:.2f}%), "
          f"validating with {n_val} samples: ({val_pct:.2f}%) ")

Training with 64325 samples: (93.92%), validating with 4167 samples: (6.08%) 
Training with 62396 samples: (91.10%), validating with 6096 samples: (8.90%) 
Training with 54564 samples: (79.66%), validating with 13928 samples: (20.34%) 
Training with 44751 samples: (65.34%), validating with 23741 samples: (34.66%) 
Training with 51055 samples: (74.54%), validating with 17437 samples: (25.46%) 
Training with 49121 samples: (71.72%), validating with 19371 samples: (28.28%) 
Training with 48222 samples: (70.41%), validating with 20270 samples: (29.59%) 
Training with 46298 samples: (67.60%), validating with 22194 samples: (32.40%) 


## Train-Validation AND Operation K-fold split

In [72]:
n_splits = 4
np.random.seed(random_state)

# Initialize a list to store fold indices
fold_indices_AND = []

# Shuffle the IDs of every cluster and cut them into n_splits chunks:
# {cluster: [ids of fold 0, ids of fold 1, ...]}
# Glycans are processed first, then proteins, so the random stream is consumed
# in a fixed order. The per-cluster list is named `cluster_ids` on purpose:
# it must not rebind the cluster_glycans / cluster_proteins functions.
glycan_folds = {}
protein_folds = {}
for entities_df, id_column, folds_by_cluster in (
    (glycans_df, 'Name', glycan_folds),
    (proteins_df, 'ProteinGroup', protein_folds),
):
    for cluster in entities_df['cluster_label'].unique():
        cluster_ids = entities_df[entities_df['cluster_label'] == cluster][id_column].tolist()
        np.random.shuffle(cluster_ids)

        # Create approximately equal sized groups
        folds_by_cluster[cluster] = []
        for i in range(n_splits):
            start_idx = int(i * len(cluster_ids) / n_splits)
            end_idx = int((i + 1) * len(cluster_ids) / n_splits)
            folds_by_cluster[cluster].append(cluster_ids[start_idx:end_idx])

for fold_idx in range(n_splits):
    # Held-out glycans and proteins of this fold: the fold_idx-th chunk of every cluster
    test_glycans = []
    for fold_lists in glycan_folds.values():
        test_glycans.extend(fold_lists[fold_idx])

    test_proteins = []
    for fold_lists in protein_folds.values():
        test_proteins.extend(fold_lists[fold_idx])

    glycan_held_out = fractions_df['GlycanID'].isin(test_glycans)
    protein_held_out = fractions_df['ProteinGroup'].isin(test_proteins)

    # AND rule: a row is a test row only if BOTH its glycan and its protein are
    # held out, and a train row only if NEITHER is. Rows where exactly one side
    # is held out are used by neither set, so train + test < total.
    is_test = glycan_held_out & protein_held_out
    is_train = ~glycan_held_out & ~protein_held_out

    test_indices = fractions_df[is_test].index
    train_indices = fractions_df[is_train].index

    print(f'train size: {len(train_indices)}, test size: {len(test_indices)}, total: {len(fractions_df)}')

    print(f'train size: {round((len(train_indices)/len(fractions_df))*100, 2)}%, test size: {round((len(test_indices)/len(fractions_df))*100, 2)}%')

    fold_indices_AND.append((train_indices, test_indices))

train size: 58265, test size: 84, total: 68492
train size: 85.07%, test size: 0.12%
train size: 33787, test size: 5960, total: 68492
train size: 49.33%, test size: 8.7%
train size: 34496, test size: 5624, total: 68492
train size: 50.37%, test size: 8.21%
train size: 29955, test size: 7851, total: 68492
train size: 43.74%, test size: 11.46%


2 fold (50% of each class)

    train size: 28149, test size: 26509, total: 109768

    train size: 25.64%, test size: 24.15%



3 fold (33% of each glycan and protein classes)

    train size: 53445, test size: 9950, total: 109768

    train size: 48.69%, test size: 9.06%


4 fold (25%)

    train size: 67441, test size: 5029, total: 109768

    train size: 61.44%, test size: 4.58%

5 fold (20%)

    train size: 71899, test size: 3904, total: 109768
    
    train size: 65.5%, test size: 3.56%

In [30]:
# Report how many rows each AND-fold puts into training and validation
for fold_idx, (train_idx, test_idx) in enumerate(fold_indices_AND):
    # Rows belonging to this fold
    train_data = fractions_df.loc[train_idx]
    val_data = fractions_df.loc[test_idx]

    n_train, n_val = len(train_data), len(val_data)
    n_both = n_train + n_val
    train_pct = (n_train / n_both) * 100
    val_pct = (n_val / n_both) * 100

    print(f"Training with {n_train} samples: ({train_pct:.2f}%), "
          f"validating with {n_val} samples: ({val_pct:.2f}%) ")

Training with 75918 samples: (96.24%), validating with 2964 samples: (3.76%) 
Training with 66597 samples: (92.54%), validating with 5368 samples: (7.46%) 
Training with 71899 samples: (94.85%), validating with 3904 samples: (5.15%) 
Training with 71903 samples: (94.85%), validating with 3904 samples: (5.15%) 
Training with 64952 samples: (91.77%), validating with 5825 samples: (8.23%) 


In [42]:


# NOTE: this rebinds fractions_df to the full fractions table; cells above
# that are re-run afterwards will see this table instead of Train_Fractions.csv.
fractions_df = pd.read_csv('../data/Fractions-Bound-Table.txt', sep='\t')

np.random.seed(random_state)

# Filter out rows where ProteinGroup is not in proteins_df
valid_protein_groups = set(proteins_df['ProteinGroup'])
fractions_df = fractions_df[fractions_df['ProteinGroup'].isin(valid_protein_groups)].copy()

# Number of glycans / proteins per cluster, used to size the held-out sets
glycan_cluster_counts = glycans_df['cluster_label'].value_counts().to_dict()
protein_cluster_counts = proteins_df['cluster_label'].value_counts().to_dict()

test_sizes = [0.10, 0.20, 0.25, 0.27, 0.30, 0.40, 0.50]

for test_size in test_sizes:

    # Hold out ceil(count * test_size) IDs per cluster, but at least one
    glycan_test_counts = {cluster: max(1, int(np.ceil(count * test_size)))
                          for cluster, count in glycan_cluster_counts.items()}
    protein_test_counts = {cluster: max(1, int(np.ceil(count * test_size)))
                           for cluster, count in protein_cluster_counts.items()}

    # Select glycans and proteins for the test set while respecting cluster distributions.
    # The per-cluster lists get their own names so they do not rebind the
    # cluster_glycans / cluster_proteins functions.
    test_glycans = []
    for cluster, target_count in glycan_test_counts.items():
        glycan_names = glycans_df[glycans_df['cluster_label'] == cluster]['Name'].tolist()
        selected = np.random.choice(glycan_names, size=min(target_count, len(glycan_names)), replace=False)
        test_glycans.extend(selected)

    test_proteins = []
    for cluster, target_count in protein_test_counts.items():
        protein_groups = proteins_df[proteins_df['cluster_label'] == cluster]['ProteinGroup'].tolist()
        selected = np.random.choice(protein_groups, size=min(target_count, len(protein_groups)), replace=False)
        test_proteins.extend(selected)

    glycan_held_out = fractions_df['GlycanID'].isin(test_glycans)
    protein_held_out = fractions_df['ProteinGroup'].isin(test_proteins)

    # AND rule: test rows have BOTH glycan and protein held out, train rows have
    # NEITHER. Rows with exactly one side held out are dropped, which is why the
    # "Total % of dataset used" printed below is under 100%.
    is_test = glycan_held_out & protein_held_out
    is_train = ~glycan_held_out & ~protein_held_out

    test_indices = fractions_df[is_test].index
    train_indices = fractions_df[is_train].index

    print(f'-------------Test size: {test_size*100}% -------------')

    print(f'train size: {len(train_indices)}, test size: {len(test_indices)}, total: {len(fractions_df)}')

    print(f'train size: {round((len(train_indices)/len(fractions_df))*100, 2)}%, test size: {round((len(test_indices)/len(fractions_df))*100, 2)}%')

    print(f'test size % in terms of test/(training+test) size: {round((len(test_indices)/(len(train_indices)+len(test_indices)))*100, 2)}%')

    print(f'Total % of dataset used: {round(((len(train_indices)+len(test_indices))/len(fractions_df))*100, 2)}%\n')
    


-------------Test size: 10.0% -------------
train size: 99882, test size: 1860, total: 129489
train size: 77.14%, test size: 1.44%
test size % in terms of training+test set size: 1.83%
Total % of dataset used: 78.57%

-------------Test size: 20.0% -------------
train size: 80842, test size: 5697, total: 129489
train size: 62.43%, test size: 4.4%
test size % in terms of training+test set size: 6.58%
Total % of dataset used: 66.83%

-------------Test size: 25.0% -------------
train size: 69436, test size: 9240, total: 129489
train size: 53.62%, test size: 7.14%
test size % in terms of training+test set size: 11.74%
Total % of dataset used: 60.76%

-------------Test size: 27.0% -------------
train size: 66156, test size: 10506, total: 129489
train size: 51.09%, test size: 8.11%
test size % in terms of training+test set size: 13.7%
Total % of dataset used: 59.2%

-------------Test size: 30.0% -------------
train size: 63174, test size: 11773, total: 129489
train size: 48.79%, test size: 9.