In [389]:
import pandas as pd
import networkx as nx
from collections import defaultdict

In [390]:
# CLIENT LEVEL
path_to_data = '/home/yuliya/repos/cosybio/FedProt/data/TMT_data/balanced_data/04_Peptides_PG'
centers = ['Center3', 'Center2', 'Center3']

intensities = {}
mappings = {}

feature = 'Proteins'
# featiure = 'Gene.names'

for center in centers:
    aggregated_report = pd.read_csv(f'{path_to_data}/{center}/aggregated_NF.tsv', sep='\t')
    # add all to intensities dict with key = center
    intensities[center] = aggregated_report
    # add all to mappings dict with key = center
    mapping = aggregated_report[['Sequence', feature]]
    mappings[center] = mapping
    print(f'{center} done')
    print(f'Number of peptides: {len(aggregated_report)}')

Center3 done
Number of peptides: 5528
Center2 done
Number of peptides: 6303
Center3 done
Number of peptides: 5528


In [391]:
# SERCER side --- merge mappings from different centers

merged_mapping = None

for name, df in mappings.items():
    if merged_mapping is None:
        merged_mapping = df
    else:
        merged_mapping = pd.merge(merged_mapping, df, on=['Sequence', feature], how='outer')

print(f'Number of peptides in merged mapping: {len(merged_mapping)}')
# remove rows with NA
merged_mapping = merged_mapping.dropna(subset=['Sequence', feature])
print(f'Number of peptides in merged mapping without NAs: {len(merged_mapping)}')


Number of peptides in merged mapping: 7162
Number of peptides in merged mapping without NAs: 7162


In [392]:
def find_union(group, feature='Proteins'):
    # Find the intersection of lists in "Gene.names" within the group
    union = set(group[feature].iloc[0])
    for names in group[feature]:
        union |= set(names)
    # Return a Series with "Sequences" and the intersection of "Gene.names"
    return pd.Series({
        feature: ';'.join(union)
        # 'Gene.names': intersection
    })


merged_mapping[feature] = merged_mapping[feature].str.split(';')
merged_mapping = merged_mapping.groupby('Sequence').apply(lambda x: find_union(x, feature)).reset_index()

print(f'Number of unique peptides: {len(merged_mapping)}')

Number of unique peptides: 7022


In [393]:
### server side --- calculate unique and razor

df_exploded = merged_mapping.assign(**{feature: merged_mapping[feature].str.split(';')}).explode(feature)

unique_razor = df_exploded[feature].value_counts().rename_axis(feature).reset_index(name='Unique_razor')
unique_counts = merged_mapping[merged_mapping[feature].str.contains(';') == False][feature].value_counts().rename_axis(feature).reset_index(name='Unique')

result = pd.merge(unique_razor, unique_counts, on=feature, how='left').fillna({'Unique': 0})

In [394]:
peptides_mappings = pd.Series(merged_mapping[feature].values, index=merged_mapping['Sequence']).to_dict()
peptides_mapping_dict = {key: set(value.split(';')) for key, value in peptides_mappings.items()}

razor_uniq_dict = pd.Series(result['Unique_razor'].values, index=result[feature]).to_dict()
unique_genes = result[result['Unique'] > 0][feature].tolist()

In [395]:
def find_leading(features, more_then_half=False):
    # Filter the counts for the genes in the set
    relevant_counts = {feature: razor_uniq_dict[feature] for feature in features}
    # Find the max count
    max_count = max(relevant_counts.values())
    
    if more_then_half:
        leading_genes = [feature for feature, count in relevant_counts.items() if count >= max_count / 2]
    
    else:
        leading_genes = [feature for feature, count in relevant_counts.items() if count  == max_count]

    return set(sorted(leading_genes))



def build_peptide_gene_graph(peptides_mapping_dict):
    """
    Build a graph where peptides and proteins are nodes.
    """
    G = nx.Graph()
    for peptide, features in peptides_mapping_dict.items():
        for feature in features:
            G.add_node(peptide, type='peptide')
            G.add_node(feature, type='feature')
            G.add_edge(peptide, feature)
    return G


def sort_by_count(features_list):
    if len(features_list) == 1:
        return features_list
    # sort by razor unique count, from biggest to smallest
    return sorted(list(set(features_list)), key=lambda x: razor_uniq_dict[x], reverse=True)


In [396]:
G = build_peptide_gene_graph(peptides_mapping_dict)
connected_components = nx.connected_components(G)

final_protein_groups = []

for component in connected_components:
    component_copy = set(component)  
    features = [node for node in component_copy if G.nodes[node]['type'] == 'feature']
    peptides = [node for node in component_copy if G.nodes[node]['type'] == 'peptide']
    
    # find proteins with max count
    for feature in features:
        
        if len(peptides) == 0 or len(features) == 0:
            print('Error')
            print(f'Peptides: {peptides}')
            print(f'Features: {features}')
            raise ValueError('Peptides or features are not empty')
       
        leading = list(find_leading(features))
        leading_unique = set(leading) & set(unique_genes)
        if len(leading) > 1 and  len(leading_unique) > 0:
            # if intersect with unique_genes - take the first unique gene
            leading = list(leading_unique)[0]
        else:
            leading = leading[0]
                
        
        # Get all peptides directly connected to the leading protein
        leading_peptides = [peptide for peptide in peptides if G.has_edge(leading, peptide)]
        # Find other proteins connected only to these leading peptides
        other = set()

        for peptide in leading_peptides:
            connected_proteins = set(G.neighbors(peptide)) & set(features)  # Proteins connected to this peptide
            
            # Filter out proteins that are connected to peptides not in leading_peptides
            valid_features = set()
            for protein in connected_proteins:
                protein_peptides = set(G.neighbors(protein)) & set(peptides)  # Peptides connected to this protein
                if protein_peptides.issubset(set(leading_peptides)):
                    valid_features.add(protein)

            # Update 'other' with proteins that meet the criteria
            other.update(valid_features)

        other.discard(leading)
        leading_group = list(other) + [leading]

        razor_feature = find_leading(leading_group, more_then_half=False)
        if len(razor_feature) > 1:
            # keep only the first unique if there are more than one and unique is present
            if len(razor_feature & set(unique_genes)) > 0:
                razor_feature = sort_by_count(list(razor_feature & set(unique_genes)))[0]
            else:
                razor_feature = list(razor_feature)[0]
        
        final_protein_group = {
            'features': sort_by_count(leading_group),
            'peptides': sorted(list(set(leading_peptides))),
            'razor_feature': list(razor_feature)[0],
            'major_features': sort_by_count(list(find_leading(leading_group, more_then_half=True))),
        }
        
        final_protein_groups.append(final_protein_group)
            
        # Remove the proteins and peptides from the component_copy
        to_remove = set(leading_group) | set(leading_peptides)
        component_copy -= to_remove
        features = [feature for feature in features if feature not in set(leading_group)]
        peptides = [peptide for peptide in peptides if peptide not in set(leading_peptides)]

        if len(peptides) == 0 and len(features) == 0:
            break
    
    if len(peptides) == 0 and len(features) == 0:
        continue

In [397]:
protein_groups_as_strings = [';'.join(sorted(group['major_features'])) for group in final_protein_groups]
print(len(protein_groups_as_strings))


629


In [398]:
final_protein_groups_df = pd.DataFrame(final_protein_groups)
print(final_protein_groups_df.shape)

# split peptides to different rows 
final_matching = final_protein_groups_df.explode('peptides')
final_matching['major_features'] = final_matching['major_features'].apply(lambda x: ';'.join(x))
final_matching['features'] = final_matching['features'].apply(lambda x: ';'.join(x))

print(final_matching.shape)
final_matching.head()


(629, 4)
(7022, 4)


Unnamed: 0,features,peptides,razor_feature,major_features
0,P05154;G3V2M1;G3V264;G3V482;G3V265;G3V5I3;G3V4...,AAAATGTIFTFR,{P05154},P05154;G3V2M1
0,P05154;G3V2M1;G3V264;G3V482;G3V265;G3V5I3;G3V4...,AKWETSFNHK,{P05154},P05154;G3V2M1
0,P05154;G3V2M1;G3V264;G3V482;G3V265;G3V5I3;G3V4...,AVVEVDESGTR,{P05154},P05154;G3V2M1
0,P05154;G3V2M1;G3V264;G3V482;G3V265;G3V5I3;G3V4...,DFTFDLYR,{P05154},P05154;G3V2M1
0,P05154;G3V2M1;G3V264;G3V482;G3V265;G3V5I3;G3V4...,EDQYHYLLDR,{P05154},P05154;G3V2M1


In [399]:
for center in centers:
    # write to file
    final_matching.to_csv(f'{path_to_data}/{center}/mapping.tsv', sep='\t', index=False)