In [1]:
import pandas as pd
import rdkit.Chem as Chem
from rdkit.Chem import rdFingerprintGenerator
from scipy.spatial.distance import pdist, squareform
from sklearn.cluster import KMeans
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import numpy as np

In [58]:
glycans = pd.read_csv('../data/Glycan-Structures-CFG611.txt', sep="\t")

In [59]:
def cluster_glycans(glycans, radius, fp_size, n_clusters):
    """
    Add Morgan count-fingerprint features and a k-means cluster label to a glycan table.

    Parameters
    ----------
    glycans : pandas.DataFrame
        Table with a 'SMILES' column (one glycan per row).
    radius : int
        Radius passed to the RDKit Morgan fingerprint generator.
    fp_size : int
        Length of the folded fingerprint; produces columns 'mf_0' .. 'mf_{fp_size-1}'.
    n_clusters : int
        Number of k-means clusters.

    Returns
    -------
    pandas.DataFrame
        A new frame: the input columns, the 'mf_*' count columns and 'cluster_label'.
        The input frame itself is not modified.
    """
    # includeChirality=True makes the fingerprint distinguish enantiomers
    # (mirror-image molecules are treated as different); False would ignore chirality.
    # The generator only depends on the parameters, so it is built once here
    # rather than once per molecule.
    morgan_generator = rdFingerprintGenerator.GetMorganGenerator(
        radius=radius, fpSize=fp_size, includeChirality=True
    )
    feature_names = [f"mf_{i}" for i in range(fp_size)]

    def get_morgan_count_fingerprint(smiles):
        """Return the fp_size-long count vector for one SMILES (all zeros if unusable)."""
        counts = np.zeros(fp_size, dtype=np.int64)
        # Missing values (e.g. NaN) are handled like unparseable SMILES: an all-zero row.
        mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
        if mol is None:
            return counts
        nonzero = morgan_generator.GetCountFingerprint(mol).GetNonzeroElements()
        for position, count in nonzero.items():
            counts[position] = count
        return counts

    # Matrix of fingerprint features: one row per glycan, one column per fingerprint
    # position, i.e. shape (len(glycans), fp_size).
    finger_counts_matrix = np.array(
        [get_morgan_count_fingerprint(smiles) for smiles in glycans['SMILES']],
        dtype=np.int64,
    ).reshape(len(glycans), fp_size)
    fingerprint_df = pd.DataFrame(finger_counts_matrix, columns=feature_names, index=glycans.index)

    # Drop fingerprint columns left over from a previous call so that re-running the
    # notebook cell (which reassigns `glycans`) does not create duplicate 'mf_*' columns.
    stale_columns = [
        col for col in glycans.columns
        if str(col).startswith("mf_") and str(col)[3:].isdigit()
    ]
    glycans = pd.concat([glycans.drop(columns=stale_columns), fingerprint_df], axis=1)

    # pdist computes the Euclidean distance between every pair of glycans and
    # squareform() expands it into a symmetric (n_glycans, n_glycans) matrix.
    dist_matrix = squareform(pdist(finger_counts_matrix, metric="euclidean"))

    # K-means is fit on the rows of the distance matrix (each glycan is described by
    # its distances to all glycans), not on the raw fingerprint counts.
    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    glycans['cluster_label'] = kmeans.fit_predict(dist_matrix)

    return glycans

In [60]:
# Tunable parameters: changing them affects clustering performance and computational cost.
# https://github.com/rdkit/rdkit/discussions/5390
radius = 3
fp_size = 1024
n_clusters = 3

glycans = cluster_glycans(glycans, radius, fp_size, n_clusters)

In [61]:
glycans['cluster_label'].value_counts()

cluster_label
0    486
1    109
2     16
Name: count, dtype: int64

In [62]:
proteins = pd.read_csv('../data/Protein-Sequence-Table.txt', sep='\t')

In [63]:
def cluster_proteins(proteins, n_clusters):
    """
    Add sequence-derived features and a k-means cluster label to a protein table.

    Parameters
    ----------
    proteins : pandas.DataFrame
        Table with an 'Amino Acid Sequence' column (one protein per row).
    n_clusters : int
        Number of k-means clusters.

    Returns
    -------
    pandas.DataFrame
        A new frame: the input columns, one column per computed feature and
        'cluster_label'. The input frame itself is not modified.
    """
    polar_and_charged = ['N', 'Q', 'S', 'T', 'K', 'R', 'D', 'E']
    aromatic = ['F', 'Y', 'W']

    # Per-residue hydrophobicity lookup (values appear to be the Kyte-Doolittle
    # hydropathy scale); residues missing from the table contribute 0.0.
    hydrophobicity_values = {
        'A': 1.8,  'C': 2.5,  'D': -3.5, 'E': -3.5,
        'F': 2.8,  'G': -0.4, 'H': -3.2, 'I': 4.5,
        'K': -3.9, 'L': 3.8,  'M': 1.9,  'N': -3.5,
        'P': -1.6, 'Q': -3.5, 'R': -4.5, 'S': -0.8,
        'T': -0.7, 'V': 4.2,  'W': -0.9, 'Y': -1.3
    }

    def compute_protein_features(seq):
        """Return a dict of numeric descriptors for one amino-acid sequence."""
        # ProteinAnalysis is the Biopython sequence-analysis helper.
        analysis = ProteinAnalysis(seq)

        # Basic whole-sequence descriptors.
        features = {
            'length': len(seq),
            'mw': analysis.molecular_weight(),
            'aromaticity': analysis.aromaticity(),
            'instability_index': analysis.instability_index(),
            'net_charge_pH7': analysis.charge_at_pH(7.0),
        }

        # Composition of selected residues, as reported by Biopython.
        aa_percent = analysis.get_amino_acids_percent()
        for aa in polar_and_charged + aromatic:
            features[f'frac_{aa}'] = aa_percent.get(aa, 0.0)

        # Combined F + Y + W content, computed once after the per-residue entries.
        features['aromatic_binding_score'] = sum(aa_percent.get(aa, 0.0) for aa in aromatic)

        if len(seq) > 0:
            hydro_scores = [hydrophobicity_values.get(res, 0.0) for res in seq]
            features['avg_hydrophobicity'] = np.mean(hydro_scores)
        else:
            features['avg_hydrophobicity'] = 0.0

        return features

    feature_dicts = proteins['Amino Acid Sequence'].apply(compute_protein_features)
    # Reuse the input index so the concat below lines rows up by label even when
    # `proteins` does not carry a default 0..n-1 index.
    features_df = pd.DataFrame(list(feature_dicts), index=proteins.index)

    # Drop feature columns left over from a previous call so that re-running the
    # notebook cell (which reassigns `proteins`) does not create duplicate columns.
    stale_columns = [col for col in features_df.columns if col in proteins.columns]
    proteins = pd.concat([proteins.drop(columns=stale_columns), features_df], axis=1)

    # All extracted features are used for clustering.
    # NOTE(review): the features are on very different scales (e.g. 'mw' vs. the
    # 'frac_*' columns), so Euclidean k-means is likely dominated by the large-magnitude
    # columns; consider standardising before clustering.
    feature_data = features_df.to_numpy()

    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    proteins['cluster_label'] = kmeans.fit_predict(feature_data)

    return proteins
    

In [64]:
n_clusters = 3
proteins = cluster_proteins(proteins, n_clusters)



In [9]:
fractions = pd.read_csv('../data/Fractions-Bound-Table.txt', sep="\t")

In [20]:
fractions['GlycanID'].value_counts()

GlycanID
CFG-007-Sp8     548
CFG-435-Sp21    548
CFG-427-Sp19    548
CFG-428-Sp24    548
CFG-429-Sp14    548
               ... 
CFG-189-Sp0     548
CFG-190-Sp9     548
CFG-640-Sp21    548
CFG-338-Sp12    539
CFG-051-Sp13    408
Name: count, Length: 611, dtype: int64

In [19]:
fractions['ProteinGroup'].value_counts()

ProteinGroup
1      5499
3      4277
2      4277
4      3666
5      3666
       ... 
118    1830
117    1830
145    1827
146    1827
147    1827
Name: count, Length: 147, dtype: int64

In [None]:
fractions.head()

In [37]:
frac_proteins = set(fractions['ProteinGroup'].unique())

In [49]:
len(frac_proteins)

147

In [28]:
proteins.head()

Unnamed: 0,ProteinGroup,Accession,Uniprot,Description,Amino Acid Sequence
0,1,Q41358,SNAIB_SAMNI,Ribosome-inactivating protein SNAI,MRLVAKLLYLAVLAICGLGIHGALTHPRVTPPVYPSVSFNLTGADT...
1,2,P22972,LEC1_ULEEU,Anti-H(O) lectin 1,SDDLSFKFKNFSQNGKDLSFQGDASVIETGVLQLNKVGNNLPDETG...
2,4,A8WDZ4,A8WDZ4_CANEN,Concanavalin A,MAISKKSSLFLPIFTFITMFLMVVNKVSSSTHETNALHFMFNQFSK...
3,6,P09382,LEG1_HUMAN,Galectin-1,MACGLVASNLNLKPGECLRVRGEVAPDAKSFVLNLGKDSNNLCLHF...
4,7,P16045,LEG1_MOUSE,Galectin-1,MACGLVASNLNLKPGECLKVRGEVASDAKSFVLNLGKDSNNLCLHF...


In [38]:
prot_proteins = set(proteins['ProteinGroup'].unique())

In [43]:
a = [int(val) for val in frac_proteins - prot_proteins]

In [50]:
len(a)

95

In [None]:
fractions[fractions['ProteinGroup'].isin(a)].shape

(205190, 5)

In [42]:
fractions.shape

(334679, 5)

In [47]:
proteins.shape

(52, 5)

In [48]:
len(proteins['ProteinGroup'].value_counts().to_dict())

52

In [65]:
# Keep only measurements whose ProteinGroup has a sequence entry in `proteins`.
# .copy() makes this an independent frame, so adding columns to it later does not
# raise pandas' SettingWithCopyWarning.
new_fractions = fractions[fractions['ProteinGroup'].isin(proteins['ProteinGroup'].unique())].copy()

In [68]:
old_train_data, old_test_data = create_stratified_train_test_split(new_fractions, glycans, proteins, test_size=0.1, random_state=42)

Test data size created: ~22.86%


In [103]:
new_fractions['GlycanID'].value_counts().to_dict()

{'CFG-007-Sp8': 212,
 'CFG-435-Sp21': 212,
 'CFG-427-Sp19': 212,
 'CFG-428-Sp24': 212,
 'CFG-429-Sp14': 212,
 'CFG-430-Sp0': 212,
 'CFG-431-Sp0': 212,
 'CFG-432-Sp8': 212,
 'CFG-434-Sp21': 212,
 'CFG-436-Sp0': 212,
 'CFG-487-Sp19': 212,
 'CFG-437-Sp0': 212,
 'CFG-438-Sp14': 212,
 'CFG-439-Sp14': 212,
 'CFG-440-Sp0': 212,
 'CFG-441-Sp14': 212,
 'CFG-442-Sp14': 212,
 'CFG-443-Sp14': 212,
 'CFG-426-Sp0': 212,
 'CFG-425-Sp14': 212,
 'CFG-424-Sp14': 212,
 'CFG-423-Sp14': 212,
 'CFG-408-Sp21': 212,
 'CFG-409-Sp21': 212,
 'CFG-410-Sp21': 212,
 'CFG-411-Sp21': 212,
 'CFG-412-Sp21': 212,
 'CFG-413-Sp0': 212,
 'CFG-414-Sp0': 212,
 'CFG-415-Sp14': 212,
 'CFG-416-Sp14': 212,
 'CFG-417-Sp0': 212,
 'CFG-418-Sp19': 212,
 'CFG-419-Sp19': 212,
 'CFG-420-Sp19': 212,
 'CFG-421-Sp12': 212,
 'CFG-422-Sp12': 212,
 'CFG-444-Sp22': 212,
 'CFG-445-Sp22': 212,
 'CFG-446-Sp19': 212,
 'CFG-466-Sp10': 212,
 'CFG-468-Sp8': 212,
 'CFG-469-Sp0': 212,
 'CFG-470-Sp0': 212,
 'CFG-471-Sp12': 212,
 'CFG-472-Sp12': 212,
 '

In [104]:
new_fractions['ProteinGroup'].value_counts().to_dict()

{1: 5499,
 2: 4277,
 4: 3666,
 6: 3055,
 7: 3055,
 8: 3055,
 9: 3055,
 10: 3055,
 13: 3055,
 55: 2444,
 48: 2444,
 49: 2444,
 52: 2444,
 64: 2444,
 56: 2444,
 60: 2444,
 61: 2444,
 43: 2444,
 82: 2444,
 47: 2444,
 53: 2444,
 40: 2444,
 39: 2444,
 38: 2444,
 36: 2444,
 34: 2444,
 33: 2444,
 30: 2444,
 28: 2444,
 27: 2444,
 19: 2444,
 17: 2444,
 16: 2444,
 84: 2440,
 83: 2440,
 81: 2440,
 80: 2440,
 79: 2440,
 72: 2440,
 70: 2440,
 89: 1833,
 91: 1833,
 92: 1833,
 94: 1833,
 98: 1833,
 107: 1833,
 111: 1833,
 112: 1833,
 118: 1830,
 137: 1830,
 138: 1830,
 146: 1827}

In [105]:
new_fractions['Combination'] = new_fractions['GlycanID'] + "_" + new_fractions['ProteinGroup'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_fractions['Combination'] = new_fractions['GlycanID'] + "_" + new_fractions['ProteinGroup'].astype(str)


In [106]:
new_fractions['Combination'].value_counts().to_dict()

{'CFG-007-Sp8_1': 9,
 'CFG-436-Sp0_1': 9,
 'CFG-428-Sp24_1': 9,
 'CFG-429-Sp14_1': 9,
 'CFG-430-Sp0_1': 9,
 'CFG-431-Sp0_1': 9,
 'CFG-432-Sp8_1': 9,
 'CFG-434-Sp21_1': 9,
 'CFG-435-Sp21_1': 9,
 'CFG-437-Sp0_1': 9,
 'CFG-407-Sp21_1': 9,
 'CFG-438-Sp14_1': 9,
 'CFG-439-Sp14_1': 9,
 'CFG-440-Sp0_1': 9,
 'CFG-441-Sp14_1': 9,
 'CFG-442-Sp14_1': 9,
 'CFG-443-Sp14_1': 9,
 'CFG-444-Sp22_1': 9,
 'CFG-427-Sp19_1': 9,
 'CFG-426-Sp0_1': 9,
 'CFG-425-Sp14_1': 9,
 'CFG-424-Sp14_1': 9,
 'CFG-409-Sp21_1': 9,
 'CFG-410-Sp21_1': 9,
 'CFG-411-Sp21_1': 9,
 'CFG-412-Sp21_1': 9,
 'CFG-413-Sp0_1': 9,
 'CFG-414-Sp0_1': 9,
 'CFG-415-Sp14_1': 9,
 'CFG-416-Sp14_1': 9,
 'CFG-417-Sp0_1': 9,
 'CFG-418-Sp19_1': 9,
 'CFG-419-Sp19_1': 9,
 'CFG-420-Sp19_1': 9,
 'CFG-421-Sp12_1': 9,
 'CFG-422-Sp12_1': 9,
 'CFG-423-Sp14_1': 9,
 'CFG-445-Sp22_1': 9,
 'CFG-446-Sp19_1': 9,
 'CFG-447-Sp14_1': 9,
 'CFG-467-Sp8_1': 9,
 'CFG-469-Sp0_1': 9,
 'CFG-470-Sp0_1': 9,
 'CFG-471-Sp12_1': 9,
 'CFG-472-Sp12_1': 9,
 'CFG-475-Sp14_1': 9,
 '

In [69]:
train = new_fractions.loc[old_train_data]
test = new_fractions.loc[old_test_data]

In [70]:
train.shape

(99882, 5)

In [71]:
test.shape

(29607, 5)

In [91]:
len(new_fractions['GlycanID'].unique())

611

In [73]:
test_glycans = test['GlycanID']

In [86]:
train_set = set(train['GlycanID'].unique())
test_set = set(test['GlycanID'].unique())

In [101]:
train['Combination'] = train['GlycanID'] + "_" + train['ProteinGroup'].astype(str)
test['Combination'] = test['GlycanID'] + "_" + test['ProteinGroup'].astype(str)


set_train_fractions = set(train['Combination'])
set_test_fractions = set(test['Combination'])


print('shared combinations:', set_train_fractions.intersection(set_test_fractions))

shared combinations: set()


In [97]:
train.shape

(99882, 6)

In [98]:
test.shape

(29607, 6)

In [95]:
len(set_new_fractions)

24695

In [96]:
len(test)

29607

In [93]:
shared_combinations

set()

In [89]:
len(train_set)

549

In [90]:
len(test_set)

611

In [88]:
len(train_set.intersection(test_set))

549

In [None]:
train['GlycanID'].value_counts().to_dict().keys()

GlycanID
CFG-008-Sp8     182
CFG-435-Sp21    182
CFG-427-Sp19    182
CFG-428-Sp24    182
CFG-430-Sp0     182
               ... 
CFG-192-Sp9     182
CFG-195-Sp12    182
CFG-640-Sp21    182
CFG-338-Sp12    179
CFG-051-Sp13    149
Name: count, Length: 549, dtype: int64

In [80]:
len(test['GlycanID'].value_counts())

611

In [72]:
test

Unnamed: 0,ObjId,ProteinGroup,Concentration,GlycanID,f
0,1004699,1,0.001,CFG-007-Sp8,0.000000
8,1004699,1,0.001,CFG-013-Sp11,0.000000
11,1004699,1,0.001,CFG-015-Sp8,0.000000
19,1004699,1,0.001,CFG-021-Sp8,0.000008
30,1004699,1,0.001,CFG-031-Sp8,0.000000
...,...,...,...,...,...
332830,1006330,146,200.000,CFG-618-Sp14,0.000000
332840,1006330,146,200.000,CFG-628-Sp14,0.000120
332841,1006330,146,200.000,CFG-630-Sp14,0.000766
332842,1006330,146,200.000,CFG-631-Sp14,0.002469


In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold

def create_stratified_train_test_split(fractions_df, glycans_df, proteins_df, test_size=0.1, random_state=42):
    """
    Hold out a cluster-stratified sample of glycans and proteins as the test set.

    From every glycan cluster and every protein cluster, ceil(count * test_size)
    entities (at least one) are drawn at random. Every row of fractions_df that
    involves a held-out glycan OR a held-out protein goes to the test set, and all
    remaining rows go to the training set. Consequences:

    * held-out glycans and proteins never appear in the training rows;
    * test rows also pair held-out entities with entities that remain in training,
      so the reverse isolation does not hold;
    * because the two selections are combined with OR, the test share of rows is
      larger than test_size (the actual share is printed).

    Parameters:
    -----------
    fractions_df : pandas.DataFrame
        DataFrame containing at least ['ProteinGroup', 'GlycanID']
    glycans_df : pandas.DataFrame
        DataFrame containing ['Name', 'cluster_label'] where Name maps to GlycanID
    proteins_df : pandas.DataFrame
        DataFrame containing ['ProteinGroup', 'cluster_label']
    test_size : float, default=0.1
        Proportion of each cluster's entities to hold out
    random_state : int, default=42
        Random seed for reproducibility

    Returns:
    --------
    train_indices : pandas.Index
        Index labels of fractions_df that belong to the training set
    test_indices : pandas.Index
        Index labels of fractions_df that belong to the test set
    """
    # A private generator draws the same sequence as np.random.seed(random_state)
    # followed by np.random.choice(...), but leaves the global NumPy RNG untouched.
    rng = np.random.RandomState(random_state)

    def _sample_per_cluster(entities_df, id_column):
        """Randomly pick ceil(count * test_size) ids (at least one) from each cluster."""
        # {cluster_id: number of entities in that cluster}
        # ex: {0: 486, 1: 109, 2: 16} with test_size=0.1 --> picks 49, 11 and 2 entities
        cluster_counts = entities_df['cluster_label'].value_counts().to_dict()
        selected = []
        for cluster, count in cluster_counts.items():
            target_count = max(1, int(np.ceil(count * test_size)))
            members = entities_df.loc[entities_df['cluster_label'] == cluster, id_column].tolist()
            selected.extend(rng.choice(members, target_count, replace=False))
        return selected

    # Glycans are drawn before proteins so the random stream is consumed in a fixed order.
    test_glycans = _sample_per_cluster(glycans_df, 'Name')
    test_proteins = _sample_per_cluster(proteins_df, 'ProteinGroup')

    # A row is a test row if it involves a held-out glycan or a held-out protein.
    is_test = (fractions_df['GlycanID'].isin(test_glycans) |
               fractions_df['ProteinGroup'].isin(test_proteins))

    test_indices = fractions_df.index[is_test]
    train_indices = fractions_df.index[~is_test]

    # Guard the ratio so an empty fractions_df reports 0% instead of dividing by zero.
    test_fraction = is_test.sum() / len(fractions_df) if len(fractions_df) else 0.0
    print(f"Test data size created: ~{test_fraction * 100:.2f}%")

    return train_indices, test_indices


def create_stratified_kfold(fractions_df, glycans_df, proteins_df, train_indices, n_splits=5, random_state=42):
    """
    Create stratified k-fold cross-validation splits on the training data.

    Folds are assigned per unique (GlycanID, ProteinGroup) combination, stratified on
    the pair "<glycan cluster>_<protein cluster>", so all rows of one combination
    always land in the same fold.

    Parameters:
    -----------
    fractions_df : pandas.DataFrame
        DataFrame containing at least ['ProteinGroup', 'GlycanID']
    glycans_df : pandas.DataFrame
        DataFrame containing ['Name', 'cluster_label'] where Name maps to GlycanID
    proteins_df : pandas.DataFrame
        DataFrame containing ['ProteinGroup', 'cluster_label']
    train_indices : index-like
        Index labels of fractions_df that belong to the training set
    n_splits : int, default=5
        Number of folds for cross-validation
    random_state : int, default=42
        Random seed for reproducibility

    Returns:
    --------
    cv_splits : list of tuples
        List of (train_fold_indices, val_fold_indices) for each fold, both expressed
        as index labels of fractions_df
    """
    combo_keys = ['GlycanID', 'ProteinGroup']

    # Filter to training data only
    train_data = fractions_df.loc[train_indices].copy()

    # Attach the cluster label of each row's glycan and protein
    glycan_cluster_map = dict(zip(glycans_df['Name'], glycans_df['cluster_label']))
    protein_cluster_map = dict(zip(proteins_df['ProteinGroup'], proteins_df['cluster_label']))
    train_data['glycan_cluster'] = train_data['GlycanID'].map(glycan_cluster_map)
    train_data['protein_cluster'] = train_data['ProteinGroup'].map(protein_cluster_map)

    # Unique (GlycanID, ProteinGroup) combinations with their clusters
    unique_combinations = train_data[combo_keys + ['glycan_cluster', 'protein_cluster']].drop_duplicates()

    # Combined stratification label (both cluster labels joined by '_')
    combined_strat = (
        unique_combinations['glycan_cluster'].astype(str)
        + '_'
        + unique_combinations['protein_cluster'].astype(str)
    )

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Record, for every unique combination, the fold in which it is validation data.
    combo_fold = np.empty(len(unique_combinations), dtype=np.int64)
    for fold_id, (_, fold_val_idx) in enumerate(skf.split(unique_combinations, combined_strat)):
        combo_fold[fold_val_idx] = fold_id

    # Map folds back to rows with one key-based join. This replaces a row-wise
    # apply that compared every row against every combination, which was
    # O(rows x combinations) per fold. A left merge keeps the rows of train_data
    # in their original order, so positions line up with train_data.index.
    fold_lookup = unique_combinations[combo_keys].assign(fold=combo_fold)
    row_fold = train_data[combo_keys].merge(fold_lookup, on=combo_keys, how='left')['fold'].to_numpy()

    cv_splits = []
    for fold_id in range(n_splits):
        in_validation = row_fold == fold_id
        cv_splits.append((train_data.index[~in_validation], train_data.index[in_validation]))

    return cv_splits


def summarize_split(fractions_df, glycans_df, proteins_df, train_indices, test_indices):
    """
    Report size, entity-overlap and cluster-distribution statistics for a train/test split.

    Parameters:
    -----------
    fractions_df : pandas.DataFrame
        DataFrame containing at least ['ProteinGroup', 'GlycanID']
    glycans_df : pandas.DataFrame
        DataFrame containing ['Name', 'cluster_label'] where Name maps to GlycanID
    proteins_df : pandas.DataFrame
        DataFrame containing ['ProteinGroup', 'cluster_label']
    train_indices : index-like
        Index labels of fractions_df that belong to the training set
    test_indices : index-like
        Index labels of fractions_df that belong to the test set

    Returns:
    --------
    summary : dict
        Row counts, the train share of all rows, numbers of unique glycans/proteins
        per side, how many glycans/proteins occur on both sides, and per-cluster
        counts of the unique glycans/proteins on each side.
    """
    glycan_to_cluster = dict(zip(glycans_df['Name'], glycans_df['cluster_label']))
    protein_to_cluster = dict(zip(proteins_df['ProteinGroup'], proteins_df['cluster_label']))

    def entities_of(indices):
        """Return the sets of glycans and proteins occurring in the given rows."""
        rows = fractions_df.loc[indices]
        return set(rows['GlycanID']), set(rows['ProteinGroup'])

    def cluster_histogram(entities, lookup):
        """Count entities per cluster, ignoring entities without a known cluster."""
        labels = [lookup.get(entity) for entity in entities if entity in lookup]
        return pd.Series(labels).value_counts().to_dict()

    glycans_in_train, proteins_in_train = entities_of(train_indices)
    glycans_in_test, proteins_in_test = entities_of(test_indices)

    n_train = len(train_indices)
    n_test = len(test_indices)

    return {
        'train_size': n_train,
        'test_size': n_test,
        'train_test_ratio': n_train / (n_train + n_test),
        'unique_glycans_train': len(glycans_in_train),
        'unique_glycans_test': len(glycans_in_test),
        'unique_proteins_train': len(proteins_in_train),
        'unique_proteins_test': len(proteins_in_test),
        # Entities that occur on both sides of the split
        'glycan_overlap': len(glycans_in_train & glycans_in_test),
        'protein_overlap': len(proteins_in_train & proteins_in_test),
        'train_glycan_cluster_counts': cluster_histogram(glycans_in_train, glycan_to_cluster),
        'test_glycan_cluster_counts': cluster_histogram(glycans_in_test, glycan_to_cluster),
        'train_protein_cluster_counts': cluster_histogram(proteins_in_train, protein_to_cluster),
        'test_protein_cluster_counts': cluster_histogram(proteins_in_test, protein_to_cluster),
    }


# Example usage
def run_full_workflow(fractions_df, glycans_df, proteins_df, test_size=0.1, n_splits=5, random_state=42):
    """
    Chain the three steps: train/test split, k-fold CV on the training rows, summary.

    Parameters:
    -----------
    fractions_df : pandas.DataFrame
        DataFrame containing ['ObjId', 'ProteinGroup', 'Concentration', 'GlycanID', 'f']
    glycans_df : pandas.DataFrame
        DataFrame containing ['Name', 'cluster_label'] where Name maps to GlycanID
    proteins_df : pandas.DataFrame
        DataFrame containing ['ProteinGroup', 'cluster_label']
    test_size : float, default=0.1
        Forwarded to create_stratified_train_test_split
    n_splits : int, default=5
        Number of folds for cross-validation
    random_state : int, default=42
        Random seed shared by the split and the k-fold step

    Returns:
    --------
    result : dict
        Keys 'train_indices', 'test_indices', 'cv_splits' and 'summary'
    """
    frames = (fractions_df, glycans_df, proteins_df)

    # 1) hold-out split
    train_indices, test_indices = create_stratified_train_test_split(*frames, test_size, random_state)

    # 2) cross-validation folds, built from the training rows only
    cv_splits = create_stratified_kfold(*frames, train_indices, n_splits, random_state)

    # 3) diagnostics for the hold-out split
    summary = summarize_split(*frames, train_indices, test_indices)

    return {
        'train_indices': train_indices,
        'test_indices': test_indices,
        'cv_splits': cv_splits,
        'summary': summary,
    }

In [56]:
run_full_workflow(fractions, glycans, proteins)

KeyboardInterrupt: 

In [141]:
fractions_df = new_fractions
test_size = 0.10

In [142]:
fractions_df['GlycanID'].value_counts()

GlycanID
CFG-007-Sp8     212
CFG-435-Sp21    212
CFG-427-Sp19    212
CFG-428-Sp24    212
CFG-429-Sp14    212
               ... 
CFG-189-Sp0     212
CFG-190-Sp9     212
CFG-640-Sp21    212
CFG-338-Sp12    209
CFG-051-Sp13    172
Name: count, Length: 611, dtype: int64

In [None]:
# range of 1800 - 5500 of valuecount for each proteingroup
fractions_df['ProteinGroup'].value_counts()

ProteinGroup
1      5499
2      4277
4      3666
6      3055
7      3055
8      3055
9      3055
10     3055
13     3055
55     2444
48     2444
49     2444
52     2444
64     2444
56     2444
60     2444
61     2444
43     2444
82     2444
47     2444
53     2444
40     2444
39     2444
38     2444
36     2444
34     2444
33     2444
30     2444
28     2444
27     2444
19     2444
17     2444
16     2444
84     2440
83     2440
81     2440
80     2440
79     2440
72     2440
70     2440
89     1833
91     1833
92     1833
94     1833
98     1833
107    1833
111    1833
112    1833
118    1830
137    1830
138    1830
146    1827
Name: count, dtype: int64

In [None]:
fractions_df[fractions_df['GlycanID'] == 'CFG-056-Sp14'].shape

(212, 6)

In [143]:
import random
from collections import deque

def create_isolated_split(fractions_df, test_size=0.1, random_state=42):
    """
    Create a data split where the test set contains GlycanIDs and ProteinGroups
    that do not appear in the training set.

    Glycans and proteins are treated as a bipartite graph (an edge for every
    glycan/protein pair that occurs in a row). Starting from randomly ordered
    glycans, a breadth-first search collects the whole connected component of the
    start glycan and moves all of its rows to the test set; further components are
    added until the target test size is reached.

    Because a component is always moved as a whole, the test set can overshoot
    test_size. In particular, when every glycan is linked to every other glycan
    through shared proteins (a single connected component), ALL rows end up in the
    test set and the training set is empty; a warning is printed in that case.

    Parameters:
    -----------
    fractions_df : pandas.DataFrame
        DataFrame containing 'GlycanID', 'ProteinGroup', etc.
    test_size : float
        Target proportion of data to be in the test set (default: 0.1)
    random_state : int
        Random seed for reproducibility

    Returns:
    --------
    train_df : pandas.DataFrame
        Training set
    test_df : pandas.DataFrame
        Test set with completely isolated GlycanIDs and ProteinGroups
    """
    # A private generator gives the same shuffle as random.seed(random_state) followed
    # by random.shuffle(...), without reseeding the global RNGs as a side effect.
    rng = random.Random(random_state)

    # Work on a copy so the caller's frame is never modified
    df = fractions_df.copy()
    total_rows = len(df)

    # Target number of samples for test set
    target_test_samples = int(total_rows * test_size)

    # Build both adjacency maps in one pass over the distinct (glycan, protein) pairs
    glycan_to_proteins = {}
    protein_to_glycans = {}
    distinct_pairs = df[['GlycanID', 'ProteinGroup']].drop_duplicates()
    for glycan, protein in zip(distinct_pairs['GlycanID'], distinct_pairs['ProteinGroup']):
        glycan_to_proteins.setdefault(glycan, set()).add(protein)
        protein_to_glycans.setdefault(protein, set()).add(glycan)

    # Shuffle the glycans to randomly select starting points
    all_glycans = list(df['GlycanID'].unique())
    rng.shuffle(all_glycans)

    # Row-level test membership, plus the glycans already moved to the test side
    test_mask = np.zeros(total_rows, dtype=bool)
    glycans_in_test = set()

    for start_glycan in all_glycans:
        # Stop once enough samples have been assigned to the test set
        if test_mask.sum() >= target_test_samples:
            break

        # A glycan that is already on the test side belongs to a handled component
        if start_glycan in glycans_in_test:
            continue

        # Breadth-first search over the glycan/protein graph; the two sets double
        # as "visited" markers, which also protects against cycles.
        component_glycans = set()
        component_proteins = set()
        queue = deque([('glycan', start_glycan)])

        while queue:
            entity_type, entity = queue.popleft()

            if entity_type == 'glycan':
                if entity in component_glycans:
                    continue
                component_glycans.add(entity)
                queue.extend(
                    ('protein', protein)
                    for protein in glycan_to_proteins.get(entity, ())
                    if protein not in component_proteins
                )
            else:  # entity_type == 'protein'
                if entity in component_proteins:
                    continue
                component_proteins.add(entity)
                queue.extend(
                    ('glycan', glycan)
                    for glycan in protein_to_glycans.get(entity, ())
                    if glycan not in component_glycans
                )

        # Move every row that touches the component to the test set
        component_mask = (
            df['GlycanID'].isin(component_glycans) |
            df['ProteinGroup'].isin(component_proteins)
        ).to_numpy()
        test_mask |= component_mask
        glycans_in_test |= component_glycans

        # Stop as soon as the requested share of rows is reached
        if test_mask.sum() / total_rows >= test_size:
            break

    # Create the train and test dataframes
    test_df = df[test_mask].copy()
    train_df = df[~test_mask].copy()

    # Final statistics (guarded so an empty input does not divide by zero)
    train_fraction = len(train_df) / total_rows if total_rows else 0.0
    actual_test_size = len(test_df) / total_rows if total_rows else 0.0

    # Check that train and test sets have completely isolated glycans and proteins
    train_glycans = set(train_df['GlycanID'].unique())
    train_proteins = set(train_df['ProteinGroup'].unique())

    test_glycans = set(test_df['GlycanID'].unique())
    test_proteins = set(test_df['ProteinGroup'].unique())

    glycan_overlap = train_glycans.intersection(test_glycans)
    protein_overlap = train_proteins.intersection(test_proteins)

    print(f"Train set: {len(train_df)} samples ({train_fraction:.2%})")
    print(f"Test set: {len(test_df)} samples ({actual_test_size:.2%})")
    print(f"Target test size: {test_size:.2%}")
    print(f"Actual test size: {actual_test_size:.2%}")
    print(f"Unique GlycanIDs in train: {len(train_glycans)}")
    print(f"Unique GlycanIDs in test: {len(test_glycans)}")
    print(f"Unique ProteinGroups in train: {len(train_proteins)}")
    print(f"Unique ProteinGroups in test: {len(test_proteins)}")
    print(f"Glycan overlap between train and test: {len(glycan_overlap)} (should be 0)")
    print(f"Protein overlap between train and test: {len(protein_overlap)} (should be 0)")

    if len(glycan_overlap) > 0 or len(protein_overlap) > 0:
        print("WARNING: There is overlap between train and test sets!")

    if total_rows and len(train_df) == 0:
        print("WARNING: Training set is empty - all glycans and proteins are connected, "
              "so strict isolation moved every sample to the test set.")

    return train_df, test_df

In [144]:
train_df, test_df = create_isolated_split(new_fractions, test_size=0.1, random_state=42)

Train set: 0 samples (0.00%)
Test set: 129489 samples (100.00%)
Target test size: 10.00%
Actual test size: 100.00%
Unique GlycanIDs in train: 0
Unique GlycanIDs in test: 611
Unique ProteinGroups in train: 0
Unique ProteinGroups in test: 52
Glycan overlap between train and test: 0 (should be 0)
Protein overlap between train and test: 0 (should be 0)


In [147]:

def create_limited_isolation_split(fractions_df, test_size=0.1, random_state=42):
    """
    Create a data split that attempts to isolate GlycanIDs and ProteinGroups
    between train and test sets while limiting the test set size.

    The samples are treated as a bipartite glycan-protein graph (a sample links
    its GlycanID to its ProteinGroup). Two strategies are tried in order:

    1. Whole connected components are added to the test set, smallest first,
       as long as the running total stays within the target size. A test set
       built this way shares no glycans and no proteins with the train set.
    2. If step 1 reaches less than half of the target size, its selection is
       discarded and whole glycans are added instead (fewest distinct
       proteins first). Glycans stay isolated, proteins may overlap.

    Parameters:
    -----------
    fractions_df : pandas.DataFrame
        DataFrame containing 'GlycanID', 'ProteinGroup', etc.
    test_size : float
        Target proportion of data to be in the test set (default: 0.1)
    random_state : int
        Random seed for reproducibility. Note that this seeds the *global*
        `random` and `numpy.random` generators.

    Returns:
    --------
    train_df : pandas.DataFrame
        Training set (rows of a re-indexed copy of `fractions_df`)
    test_df : pandas.DataFrame
        Test set
    glycan_overlap : set
        GlycanIDs present in both train and test
    protein_overlap : set
        ProteinGroups present in both train and test

    An empty input yields two empty frames and two empty sets. Summary
    statistics are printed as a side effect.
    """
    # Seed the global generators (kept as-is: callers may rely on this side effect)
    random.seed(random_state)
    np.random.seed(random_state)

    # Copy with a fresh 0..n-1 index so that row labels equal row positions
    df = fractions_df.copy().reset_index(drop=True)
    n_samples = len(df)

    # Target number of samples for test set
    target_test_samples = int(n_samples * test_size)

    # Build the glycan-protein network in a single pass over the two columns.
    # Dict insertion order (first appearance in df) is relied on below for
    # deterministic tie-breaking in the stable sorts.
    glycan_to_samples = {}
    protein_to_samples = {}
    glycan_to_proteins = {}
    protein_to_glycans = {}

    for i, (glycan, protein) in enumerate(zip(df['GlycanID'], df['ProteinGroup'])):
        glycan_to_samples.setdefault(glycan, []).append(i)
        protein_to_samples.setdefault(protein, []).append(i)
        glycan_to_proteins.setdefault(glycan, set()).add(protein)
        protein_to_glycans.setdefault(protein, set()).add(glycan)

    def calculate_component_size(start_glycan):
        """Breadth-first walk of the connected component containing a glycan.

        Returns (number of samples, glycans, proteins, sample indices).
        """
        queue = deque([('glycan', start_glycan)])
        visited_glycans = set()
        visited_proteins = set()
        component_indices = set()

        while queue:
            entity_type, entity = queue.popleft()

            if entity_type == 'glycan':
                if entity in visited_glycans:
                    continue
                visited_glycans.add(entity)
                component_indices.update(glycan_to_samples.get(entity, []))

                # Add connected proteins to queue
                for protein in glycan_to_proteins.get(entity, set()):
                    if protein not in visited_proteins:
                        queue.append(('protein', protein))
            else:  # protein
                if entity in visited_proteins:
                    continue
                visited_proteins.add(entity)
                component_indices.update(protein_to_samples.get(entity, []))

                # Add connected glycans to queue
                for glycan in protein_to_glycans.get(entity, set()):
                    if glycan not in visited_glycans:
                        queue.append(('glycan', glycan))

        return len(component_indices), visited_glycans, visited_proteins, component_indices

    # Visit glycans in random order; this only affects the order of equally
    # sized components after the stable sort below.
    all_glycans = list(glycan_to_samples.keys())
    random.shuffle(all_glycans)

    # One (size, sample indices) entry per connected component
    components = []
    processed_glycans = set()

    for glycan in all_glycans:
        if glycan in processed_glycans:
            continue

        size, component_glycans, _, indices = calculate_component_size(glycan)
        components.append((size, indices))
        processed_glycans.update(component_glycans)

    print(f"Found {len(components)} connected components in the dataset")

    # Smallest components first
    components.sort(key=lambda component: component[0])

    # Strategy 1: greedily add whole components while staying within the target
    test_indices = set()
    current_size = 0

    for size, indices in components:
        if current_size + size <= target_test_samples:
            test_indices.update(indices)
            current_size += size

    # Strategy 2: components are too large, so give up protein isolation
    if current_size < target_test_samples * 0.5:
        print("Connected components too large. Breaking isolation for some glycans/proteins.")

        # Reset and select individual glycans, least-connected first
        test_indices = set()
        current_size = 0
        sorted_glycans = sorted(glycan_to_samples, key=lambda g: len(glycan_to_proteins[g]))

        for glycan in sorted_glycans:
            # All samples containing this glycan
            new_indices = glycan_to_samples[glycan]

            # Check if adding would overshoot the target by more than 20%
            if current_size + len(new_indices) > target_test_samples * 1.2:
                # If we're already close to target, stop
                if current_size >= target_test_samples * 0.8:
                    break

                # Otherwise, try the next glycan
                continue

            test_indices.update(new_indices)
            current_size = len(test_indices)

            # Stop if we've reached target
            if current_size >= target_test_samples:
                break

    # Create test mask from indices (indices are row positions by construction)
    test_mask = np.zeros(n_samples, dtype=bool)
    if test_indices:
        test_mask[list(test_indices)] = True

    # Create train and test sets
    test_df = df[test_mask].copy()
    train_df = df[~test_mask].copy()

    # Fractions of the full dataset; guarded so an empty input does not divide by zero
    train_fraction = len(train_df) / n_samples if n_samples else 0.0
    actual_test_size = len(test_df) / n_samples if n_samples else 0.0

    # Check for overlaps
    train_glycans = set(train_df['GlycanID'].unique())
    train_proteins = set(train_df['ProteinGroup'].unique())

    test_glycans = set(test_df['GlycanID'].unique())
    test_proteins = set(test_df['ProteinGroup'].unique())

    glycan_overlap = train_glycans.intersection(test_glycans)
    protein_overlap = train_proteins.intersection(test_proteins)

    glycan_overlap_share = len(glycan_overlap) / len(test_glycans) if test_glycans else 0
    protein_overlap_share = len(protein_overlap) / len(test_proteins) if test_proteins else 0

    # Print statistics
    print(f"Train set: {len(train_df)} samples ({train_fraction:.2%})")
    print(f"Test set: {len(test_df)} samples ({actual_test_size:.2%})")
    print(f"Target test size: {test_size:.2%}")
    print(f"Actual test size: {actual_test_size:.2%}")
    print(f"Unique GlycanIDs in train: {len(train_glycans)}")
    print(f"Unique GlycanIDs in test: {len(test_glycans)}")
    print(f"Unique ProteinGroups in train: {len(train_proteins)}")
    print(f"Unique ProteinGroups in test: {len(test_proteins)}")
    print(f"Glycan overlap between train and test: {len(glycan_overlap)} ({glycan_overlap_share:.2%} of test glycans)")
    print(f"Protein overlap between train and test: {len(protein_overlap)} ({protein_overlap_share:.2%} of test proteins)")

    return train_df, test_df, glycan_overlap, protein_overlap

# Alternative approach: select a subset of glycans
def create_subset_glycan_split(fractions_df, test_size=0.1, overlap_tolerance=0.0, random_state=42):
    """
    Alternative approach that selects a subset of glycans for the test set
    while controlling the amount of overlap in proteins.

    Glycans are added to the test set whole, starting with those that have the
    fewest samples, until the target size is reached. If the share of test
    proteins that also occur in train exceeds `overlap_tolerance`, glycans are
    moved back to train (up to 100 times), each time picking a glycan that
    contributes to the least frequent overlapping protein in the test set.
    The last remaining test glycan is never moved, so the reduction step
    cannot leave the test set empty.

    Parameters:
    -----------
    fractions_df : pandas.DataFrame
        DataFrame containing 'GlycanID', 'ProteinGroup', etc.
    test_size : float
        Target proportion of data to be in the test set (default: 0.1)
    overlap_tolerance : float
        Maximum allowed protein overlap (as fraction of test proteins) (default: 0.0).
        A negative value disables the overlap-reduction step.
    random_state : int
        Random seed for reproducibility. Note that this seeds the *global*
        `random` and `numpy.random` generators.

    Returns:
    --------
    train_df : pandas.DataFrame
        Training set. Rows moved back from the test set are appended at the
        end, so the frame is not necessarily in original row order.
    test_df : pandas.DataFrame
        Test set
    glycan_overlap : set
        GlycanIDs present in both train and test
    protein_overlap : set
        ProteinGroups present in both train and test

    An empty input yields two empty frames and two empty sets. Summary
    statistics are printed as a side effect.
    """
    # Seed the global generators (kept as-is: callers may rely on this side effect)
    random.seed(random_state)
    np.random.seed(random_state)

    # Copy with a fresh 0..n-1 index so that row labels equal row positions
    df = fractions_df.copy().reset_index(drop=True)
    n_samples = len(df)

    # Target number of samples for test set
    target_test_samples = int(n_samples * test_size)

    # Row positions per glycan, collected in one pass instead of re-scanning
    # the whole frame for every glycan.
    glycan_to_rows = {}
    for position, glycan in enumerate(df['GlycanID']):
        glycan_to_rows.setdefault(glycan, []).append(position)

    # Shuffle first so that glycans with equal sample counts end up in random
    # order after the stable sort (smallest glycans first).
    glycan_list = list(glycan_to_rows)
    random.shuffle(glycan_list)
    glycan_list.sort(key=lambda g: len(glycan_to_rows[g]))

    # Select glycans for test set
    test_indices = set()
    current_size = 0

    for glycan in glycan_list:
        rows = glycan_to_rows[glycan]
        new_size = current_size + len(rows)

        # Skip glycans that would overshoot by >30% once we are within 30% of target
        if new_size > target_test_samples * 1.3 and current_size >= target_test_samples * 0.7:
            continue

        test_indices.update(rows)
        current_size = new_size

        # Stop if we've reached close to target
        if current_size >= target_test_samples:
            break

    # Create test mask from indices
    test_mask = np.zeros(n_samples, dtype=bool)
    if test_indices:
        test_mask[list(test_indices)] = True

    # Create train and test sets
    test_df = df[test_mask].copy()
    train_df = df[~test_mask].copy()

    def summarize_overlap(train_part, test_part):
        """Return the glycan/protein sets of both parts, their overlaps and the protein overlap ratio."""
        train_g = set(train_part['GlycanID'].unique())
        train_p = set(train_part['ProteinGroup'].unique())
        test_g = set(test_part['GlycanID'].unique())
        test_p = set(test_part['ProteinGroup'].unique())
        shared_p = train_p.intersection(test_p)
        ratio = len(shared_p) / len(test_p) if test_p else 0
        return train_g, train_p, test_g, test_p, train_g.intersection(test_g), shared_p, ratio

    (train_glycans, train_proteins, test_glycans, test_proteins,
     glycan_overlap, protein_overlap, protein_overlap_ratio) = summarize_overlap(train_df, test_df)

    # If protein overlap is too high, try to further reduce it
    if protein_overlap_ratio > overlap_tolerance and overlap_tolerance >= 0:
        print(f"Initial protein overlap too high: {protein_overlap_ratio:.2%}")
        print("Attempting to reduce protein overlap...")

        max_iterations = 100
        for _ in range(max_iterations):
            if not protein_overlap:
                break

            # Moving the last glycan out would empty the test set; an empty
            # test set has a 0% overlap ratio but is useless as a split.
            if len(test_glycans) <= 1:
                print("Cannot reduce protein overlap further without emptying the test set.")
                break

            # Least frequent overlapping protein in the test set; ties are
            # broken by name so the choice does not depend on set ordering.
            test_protein_counts = test_df['ProteinGroup'].value_counts()
            protein_to_remove = min(
                protein_overlap,
                key=lambda p: (test_protein_counts.get(p, 0), str(p)),
            )

            # Glycans that bring this protein into the test set
            contributing_glycans = test_df[test_df['ProteinGroup'] == protein_to_remove]['GlycanID'].unique()
            if len(contributing_glycans) == 0:
                break

            glycan_to_remove = contributing_glycans[0]
            remove_indices = test_df[test_df['GlycanID'] == glycan_to_remove].index

            # Move all rows of that glycan from test to train in one step
            train_df = pd.concat([train_df, df.loc[remove_indices]])
            test_df = test_df.drop(remove_indices)

            (train_glycans, train_proteins, test_glycans, test_proteins,
             glycan_overlap, protein_overlap, protein_overlap_ratio) = summarize_overlap(train_df, test_df)

            if protein_overlap_ratio <= overlap_tolerance:
                print(f"Successfully reduced protein overlap to {protein_overlap_ratio:.2%}")
                break

    # Print statistics (guarded so an empty input does not divide by zero)
    total_samples = len(test_df) + len(train_df)
    actual_test_size = len(test_df) / total_samples if total_samples else 0.0

    glycan_overlap_share = len(glycan_overlap) / len(test_glycans) if test_glycans else 0
    protein_overlap_share = len(protein_overlap) / len(test_proteins) if test_proteins else 0

    print(f"Train set: {len(train_df)} samples ({1-actual_test_size:.2%})")
    print(f"Test set: {len(test_df)} samples ({actual_test_size:.2%})")
    print(f"Target test size: {test_size:.2%}")
    print(f"Actual test size: {actual_test_size:.2%}")
    print(f"Unique GlycanIDs in train: {len(train_glycans)}")
    print(f"Unique GlycanIDs in test: {len(test_glycans)}")
    print(f"Unique ProteinGroups in train: {len(train_proteins)}")
    print(f"Unique ProteinGroups in test: {len(test_proteins)}")
    print(f"Glycan overlap between train and test: {len(glycan_overlap)} ({glycan_overlap_share:.2%} of test glycans)")
    print(f"Protein overlap between train and test: {len(protein_overlap)} ({protein_overlap_share:.2%} of test proteins)")

    return train_df, test_df, glycan_overlap, protein_overlap

In [148]:
# Best-effort isolated split; the overlap sets are returned for inspection in the cells below.
# NOTE(review): this cell passes `fractions_df`, while the create_isolated_split call above
# used `new_fractions` -- confirm which frame is intended.
train_df, test_df, glycan_overlap, protein_overlap = create_limited_isolation_split(fractions_df, test_size=0.1)

Found 1 connected components in the dataset
Connected components too large. Breaking isolation for some glycans/proteins.
Train set: 116388 samples (89.88%)
Test set: 13101 samples (10.12%)
Target test size: 10.00%
Actual test size: 10.12%
Unique GlycanIDs in train: 549
Unique GlycanIDs in test: 62
Unique ProteinGroups in train: 52
Unique ProteinGroups in test: 52
Glycan overlap between train and test: 0 (0.00% of test glycans)
Protein overlap between train and test: 52 (100.00% of test proteins)


In [149]:
# GlycanIDs that occur in both train and test
glycan_overlap

set()

In [151]:
# Number of ProteinGroups that occur in both train and test
len(protein_overlap)

52

In [None]:
# NOTE(review): repeats the earlier `glycan_overlap` display cell and has not been executed; consider removing.
glycan_overlap