In [1]:
import pandas as pd
import networkx as nx

In [2]:
# # CLIENT LEVEL
# Alternative configuration (protein-group level instead of gene level):
# path_to_data = '/home/yuliya/repos/cosybio/FedProt/data/TMT_data/balanced_data/04_Peptides_PG'
# feature_column = 'Proteins'

path_to_data = '/home/yuliya/repos/cosybio/FedProt/data/TMT_data/balanced_data/03_Peptides_Genes'
feature_column = 'Gene.names'

centers = ['Center1', 'Center2', 'Center3']

# Both dicts are keyed by center name:
#   intensities -> the full peptide-level report of that center
#   mappings    -> only its (Sequence, feature_column) columns
intensities = {}
mappings = {}

for center in centers:
    report_path = f'{path_to_data}/{center}/aggregated_NF.tsv'
    aggregated_report = pd.read_csv(report_path, sep='\t')
    mapping = aggregated_report.loc[:, ['Sequence', feature_column]]
    intensities[center], mappings[center] = aggregated_report, mapping
    print(f'{center} done')
    print(f'Number of peptides: {aggregated_report.shape[0]}')

Center1 done
Number of peptides: 5958
Center2 done
Number of peptides: 6303
Center3 done
Number of peptides: 5528


In [3]:
# SERVER side --- merge mappings from different centers

# Outer-join the per-center (Sequence, feature) tables one after another, so a
# pair reported by at least one center ends up in the merged table.
join_keys = ['Sequence', feature_column]
merged_mapping = None

for name, df in mappings.items():
    merged_mapping = df if merged_mapping is None else merged_mapping.merge(df, on=join_keys, how='outer')

print(f'Number of peptides in merged mapping: {len(merged_mapping)}')
# rows where the sequence or the feature annotation is missing cannot be mapped
merged_mapping = merged_mapping.dropna(subset=join_keys)
print(f'Number of peptides in merged mapping without NAs: {len(merged_mapping)}')


Number of peptides in merged mapping: 7854
Number of peptides in merged mapping without NAs: 7628


In [4]:
def find_union(group, feature_column='Proteins'):
    """Collapse all feature lists of one group into a single ';'-joined string.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows belonging to one group; ``group[feature_column]`` must hold
        iterables of feature names (e.g. lists produced by ``str.split(';')``).
    feature_column : str
        Name of the column holding the feature lists.

    Returns
    -------
    pandas.Series
        One-entry Series ``{feature_column: 'A;B;C'}`` containing the union of
        all names in the group.
    """
    # Union (not intersection) of every list in the group.
    union = set().union(*group[feature_column])
    # Sort before joining: iteration order of a set of strings changes between
    # interpreter runs, which made the joined string non-reproducible.
    return pd.Series({
        feature_column: ';'.join(sorted(union))
    })


# Turn 'A;B' strings into lists, then collapse repeated sequences into one row
# each, holding the union of their features.
merged_mapping = merged_mapping.assign(
    **{feature_column: merged_mapping[feature_column].str.split(';')}
)
merged_mapping = (
    merged_mapping
    .groupby('Sequence')
    .apply(lambda rows: find_union(rows, feature_column))
    .reset_index()
)

print(f'Number of unique peptides: {len(merged_mapping)}')

Number of unique peptides: 7567


In [5]:
### server side --- calculate unique and razor

# One row per (peptide, feature) pair.
df_exploded = merged_mapping.assign(
    **{feature_column: merged_mapping[feature_column].str.split(';')}
).explode(feature_column)

# Unique_razor: number of peptides annotated with the feature (shared or not).
unique_razor = (
    df_exploded[feature_column]
    .value_counts()
    .rename_axis(feature_column)
    .reset_index(name='Unique_razor')
)

# Unique: number of peptides annotated with this feature only (no ';' in the annotation).
unique_counts = (
    merged_mapping.loc[merged_mapping[feature_column].str.contains(';') == False, feature_column]
    .value_counts()
    .rename_axis(feature_column)
    .reset_index(name='Unique')
)

# Features without any unique peptide get Unique = 0.
result = unique_razor.merge(unique_counts, on=feature_column, how='left').fillna({'Unique': 0})

In [6]:
# peptide sequence -> ';'-joined feature string, and the same as sets of names
peptides_mappings = merged_mapping.set_index('Sequence')[feature_column].to_dict()
peptides_mapping_dict = {
    sequence: set(joined.split(';')) for sequence, joined in peptides_mappings.items()
}

# feature -> number of peptides annotated with it; features with >= 1 unique peptide
razor_uniq_dict = result.set_index(feature_column)['Unique_razor'].to_dict()
unique_genes = result.loc[result['Unique'] > 0, feature_column].tolist()

In [7]:
def find_leading(features, more_then_half=False):
    """Return the features with the highest peptide count among `features`.

    Counts are looked up in the notebook-level `razor_uniq_dict`.

    With `more_then_half=False` only the features tied for the maximum count
    are returned; with `more_then_half=True` every feature whose count is at
    least half of the maximum is returned.

    Returns a set of feature names.
    """
    counts = {feature: razor_uniq_dict[feature] for feature in features}
    top = max(counts.values())
    # ">= top" selects exactly the features tied for the maximum.
    threshold = top / 2 if more_then_half else top
    return {feature for feature, count in counts.items() if count >= threshold}



def build_peptide_gene_graph(peptides_mapping_dict):
    """
    Build an undirected graph linking every peptide to each of its features.

    `peptides_mapping_dict` maps a peptide to an iterable of feature names.
    Nodes carry a `type` attribute ('peptide' or 'feature'); a peptide with no
    features is not added to the graph.
    """
    graph = nx.Graph()
    pairs = (
        (peptide, feature)
        for peptide, features in peptides_mapping_dict.items()
        for feature in features
    )
    for peptide, feature in pairs:
        graph.add_node(peptide, type='peptide')
        graph.add_node(feature, type='feature')
        graph.add_edge(peptide, feature)
    return graph


def sort_by_count(features_list):
    """Return the distinct features ordered by peptide count, largest first.

    Counts are looked up in the notebook-level `razor_uniq_dict`. Features with
    equal counts are ordered alphabetically, so the result (which is joined
    into the `features` / `major_features` strings) is reproducible between
    runs instead of depending on set iteration order.

    A one-element list is returned unchanged (same object).
    """
    if len(features_list) == 1:
        return features_list
    # Key: count descending, then name ascending as a deterministic tie-break.
    return sorted(set(features_list), key=lambda feature: (-razor_uniq_dict[feature], feature))


In [8]:
# Split the peptide/feature graph into connected components and, inside every
# component, repeatedly peel off one group:
#   1. pick the leading feature (highest peptide count; ties go to a feature
#      that has unique peptides, then to the alphabetically first name),
#   2. take all remaining peptides attached to it,
#   3. add every other feature whose remaining peptides are all among those,
#   4. record the group and remove its features and peptides.
# Ties are resolved with min() instead of list(set)[0], so the grouping no
# longer depends on the set iteration order of strings, which changes between
# kernel restarts.
G = build_peptide_gene_graph(peptides_mapping_dict)
connected_components = nx.connected_components(G)

# Hoisted: used for membership tests in every iteration.
unique_gene_set = set(unique_genes)

final_protein_groups = []

for component in connected_components:
    # Remaining (not yet grouped) nodes of this component, kept as sets.
    features = {node for node in component if G.nodes[node]['type'] == 'feature'}
    peptides = {node for node in component if G.nodes[node]['type'] == 'peptide'}

    # Every pass removes at least the leading feature, so the loop terminates.
    while features or peptides:

        # One side exhausted while the other still has nodes: inconsistent state.
        if len(peptides) == 0 or len(features) == 0:
            print('Error')
            print(f'Peptides: {sorted(peptides)}')
            print(f'Features: {sorted(features)}')
            raise ValueError('Peptides or features are not empty')

        # Features tied for the maximum count; prefer ones with unique peptides.
        leading_candidates = set(find_leading(features))
        leading = min((leading_candidates & unique_gene_set) or leading_candidates)

        # Remaining peptides directly connected to the leading feature.
        leading_peptides = {peptide for peptide in peptides if G.has_edge(leading, peptide)}

        # Remaining features touching at least one leading peptide ...
        touching = set()
        for peptide in leading_peptides:
            touching.update(set(G.neighbors(peptide)) & features)
        # ... of which only those with no remaining peptide outside the
        # leading peptides join the group.
        other = {
            protein for protein in touching
            if (set(G.neighbors(protein)) & peptides) <= leading_peptides
        }
        other.discard(leading)
        leading_group = sorted(other) + [leading]

        # Razor feature: max count within the group, same tie-break as above.
        razor_candidates = set(find_leading(leading_group, more_then_half=False))
        razor_feature = min((razor_candidates & unique_gene_set) or razor_candidates)

        final_protein_groups.append({
            'features': sort_by_count(leading_group),
            'peptides': sorted(leading_peptides),
            'razor_feature': razor_feature,
            'major_features': sort_by_count(list(find_leading(leading_group, more_then_half=True))),
        })

        # Remove the grouped features and peptides before the next pass.
        features -= set(leading_group)
        peptides -= leading_peptides

In [9]:
# One row per group; the list-valued feature columns become ';'-joined strings.
final_protein_groups_df = pd.DataFrame(final_protein_groups)
for list_column in ('major_features', 'features'):
    final_protein_groups_df[list_column] = final_protein_groups_df[list_column].map(';'.join)
print(final_protein_groups_df.shape)

# One row per (group, peptide): explode the list of peptides.
final_matching = final_protein_groups_df.reset_index(drop=True).explode('peptides')

print(final_matching.shape)
final_matching.head()


(612, 4)
(7567, 4)


Unnamed: 0,features,peptides,razor_feature,major_features
0,SERPINA3,AAAATGTIFTFR,SERPINA3,SERPINA3
0,SERPINA3,ADLSGITGAR,SERPINA3,SERPINA3
0,SERPINA3,AKWEMPFDPQDTHQSR,SERPINA3,SERPINA3
0,SERPINA3,AVLDVFEEGTEASAATAVK,SERPINA3,SERPINA3
0,SERPINA3,AVVEVDESGTR,SERPINA3,SERPINA3


In [10]:
# Distribute the same peptide -> group mapping to every center's folder.
for center in centers:
    mapping_path = f'{path_to_data}/{center}/mapping.tsv'
    final_matching.to_csv(mapping_path, sep='\t', index=False)

## Aggregate peptides to PG / Genes using mapping

In [11]:
# For every center: attach the shared peptide -> group mapping to the peptide-level
# report, sum the values per `major_features` group, mark groups dominated by
# reverse / contaminant peptides, and write the unfiltered and filtered tables.
for center in centers:
    aggregated_report = pd.read_csv(f'{path_to_data}/{center}/aggregated_NF.tsv', sep='\t')
    # NOTE: rebinds the notebook-level `final_matching` to the copy read back from disk.
    final_matching = pd.read_csv(f'{path_to_data}/{center}/mapping.tsv', sep='\t')

    ################################  MERGE MAPPING WITH AGGREGATED REPORT  ################################
    # rename peptides to Sequence
    final_matching = final_matching.rename(columns={'peptides': 'Sequence'})
    # merge final_matching with aggregated_report (left join) using 'Sequence' as a key
    aggregated_report = pd.merge(aggregated_report, final_matching, on='Sequence', how='left')
    # write to file
    aggregated_report.to_csv(f'{path_to_data}/{center}/aggregated_NF_mapped.tsv', sep='\t', index=False)
    # remove unnecessary columns (replaces the raw report stored in `intensities[center]` earlier)
    intensities[center] = aggregated_report.drop(columns=['Sequence', 'Proteins', 'Gene.names', 'features', 'razor_feature'])
    # drop rows where major_features is NA (peptides that got no group in the mapping)
    intensities[center].dropna(subset=['major_features'], inplace=True)

    ################################  CHECK FOR CONTAMINANTS  ################################
    # Check for contaminants
    #  group by major_features and sum intensities for each group (if there is NA in any column, it will be ignored)
    # Per group: number of rows flagged '+' in Reverse / Potential.contaminant, and the
    # group size (stored under the 'major_features' column until it is renamed below).
    # NOTE(review): columns whose name contains 'intensity' are summed here AND again in the
    # groupby-sum further down; if such columns exist they would presumably appear twice
    # (suffixed) after the merge - confirm against the written files.
    reverse_contaminants = intensities[center].groupby('major_features').agg({**{col: 'sum' for col in intensities[center].columns if 'intensity' in col},
                                            'Reverse': lambda x: (x == '+').sum(),
                                            'Potential.contaminant': lambda x: (x == '+').sum(),
                                            'major_features': 'size'})

    # A group is marked '+' when strictly more than half of its rows carry the flag;
    # otherwise the literal string 'NA' is stored.
    # NOTE(review): the "Check on Center 2" section of this notebook uses `>=` for the same
    # rule - confirm which threshold is intended.
    reverse_contaminants['Reverse_mark'] = reverse_contaminants.apply(lambda x: '+' if x['Reverse'] > x['major_features'] / 2 else 'NA', axis=1)
    reverse_contaminants['Potential.contaminant_mark'] = reverse_contaminants.apply(lambda x: '+' if x['Potential.contaminant'] > x['major_features'] / 2 else 'NA', axis=1)
    # rename major_features (group size) to unique_razor_counts
    reverse_contaminants = reverse_contaminants.rename(columns={'major_features': 'unique_razor_counts'})
    reverse_contaminants.drop(columns=['Reverse', 'Potential.contaminant'], inplace=True)

    ################################  CALCULATE INTENSITIES  ################################
    # Group by major_features and sum intensities for each group
    # NOTE(review): `sum()` skips NA, so a group whose values are all NA presumably ends up
    # as 0 rather than NA - confirm this is the intended encoding of missing intensities.
    intensities[center].drop(columns=['Reverse', 'Potential.contaminant'], inplace=True)
    intensities[center] = intensities[center].groupby('major_features').sum().reset_index()
    # `reverse_contaminants` is still indexed by major_features, so the join key refers to
    # the left column and the right index level of that name.
    intensities[center] = pd.merge(intensities[center], reverse_contaminants, on='major_features', how='left')

    ################################  WRITE TO FILE  ################################
    # write to file (all groups, including the mark columns)
    intensities[center].to_csv(f'{path_to_data}/{center}/intensities_counts_ALL.tsv', sep='\t', index=False)

    # filter out contaminants and reverse hits
    intensities[center] = intensities[center][intensities[center]['Reverse_mark'] != '+']
    intensities[center] = intensities[center][intensities[center]['Potential.contaminant_mark'] != '+']
    intensities[center].drop(columns=['Reverse_mark', 'Potential.contaminant_mark'], inplace=True)

    # save only intensities to file (without unique_razor_counts column)
    intensities[center].drop(columns=['unique_razor_counts']).to_csv(f'{path_to_data}/{center}/intensities_filtered.tsv', sep='\t', index=False)
    # save only unique_razor_counts to file
    intensities[center][['major_features', 'unique_razor_counts']].to_csv(f'{path_to_data}/{center}/counts_filtered.tsv', sep='\t', index=False)


    

# Check on Center 2 values only

In [84]:
import pandas as pd
import networkx as nx

In [85]:
# # CLIENT LEVEL
# Alternative configuration (protein-group level instead of gene level):
# path_to_data = '/home/yuliya/repos/cosybio/FedProt/data/TMT_data/balanced_data/04_Peptides_PG'
# output = '/home/yuliya/repos/cosybio/FedProt/data/TMT_data/balanced_data/Check_center_2'
# feature_column = 'Proteins'

path_to_data = '/home/yuliya/repos/cosybio/FedProt/data/TMT_data/balanced_data/03_Peptides_Genes'
output = '/home/yuliya/repos/cosybio/FedProt/data/TMT_data/balanced_data/Check_center_2'
feature_column = 'Gene.names'

center = 'Center2'

# Single-center run: the "merged" mapping is just this center's own mapping.
report_path = f'{path_to_data}/{center}/aggregated_NF.tsv'
aggregated_report = pd.read_csv(report_path, sep='\t')
mapping = aggregated_report.loc[:, ['Sequence', feature_column]]
merged_mapping = mapping
print(f'{center} done')
print(f'Number of peptides: {aggregated_report.shape[0]}')
# rows where the sequence or the feature annotation is missing cannot be mapped
merged_mapping = merged_mapping.dropna(subset=['Sequence', feature_column])
print(f'Number of peptides in merged mapping without NAs: {len(merged_mapping)}')

Center2 done
Number of peptides: 6303
Number of peptides in merged mapping without NAs: 6143


In [86]:
### server side --- calculate unique and razor

# One row per (peptide, feature) pair.
df_exploded = merged_mapping.assign(
    **{feature_column: merged_mapping[feature_column].str.split(';')}
).explode(feature_column)

# Unique_razor: number of peptides annotated with the feature (shared or not).
unique_razor = (
    df_exploded[feature_column]
    .value_counts()
    .rename_axis(feature_column)
    .reset_index(name='Unique_razor')
)

# Unique: number of peptides annotated with this feature only (no ';' in the annotation).
unique_counts = (
    merged_mapping.loc[merged_mapping[feature_column].str.contains(';') == False, feature_column]
    .value_counts()
    .rename_axis(feature_column)
    .reset_index(name='Unique')
)

# Features without any unique peptide get Unique = 0.
result = unique_razor.merge(unique_counts, on=feature_column, how='left').fillna({'Unique': 0})

In [87]:
# peptide sequence -> ';'-joined feature string, and the same as sets of names
peptides_mappings = merged_mapping.set_index('Sequence')[feature_column].to_dict()
peptides_mapping_dict = {
    sequence: set(joined.split(';')) for sequence, joined in peptides_mappings.items()
}

# feature -> number of peptides annotated with it; features with >= 1 unique peptide
razor_uniq_dict = result.set_index(feature_column)['Unique_razor'].to_dict()
unique_genes = result.loc[result['Unique'] > 0, feature_column].tolist()

In [88]:
def find_leading(features, more_then_half=False):
    """Return the features with the highest peptide count among `features`.

    Counts are looked up in the notebook-level `razor_uniq_dict`.

    With `more_then_half=False` only the features tied for the maximum count
    are returned; with `more_then_half=True` every feature whose count is at
    least half of the maximum is returned.

    Returns a set of feature names.
    """
    counts = {feature: razor_uniq_dict[feature] for feature in features}
    top = max(counts.values())
    # ">= top" selects exactly the features tied for the maximum.
    threshold = top / 2 if more_then_half else top
    return {feature for feature, count in counts.items() if count >= threshold}



def build_peptide_gene_graph(peptides_mapping_dict):
    """
    Build an undirected graph linking every peptide to each of its features.

    `peptides_mapping_dict` maps a peptide to an iterable of feature names.
    Nodes carry a `type` attribute ('peptide' or 'feature'); a peptide with no
    features is not added to the graph.
    """
    graph = nx.Graph()
    pairs = (
        (peptide, feature)
        for peptide, features in peptides_mapping_dict.items()
        for feature in features
    )
    for peptide, feature in pairs:
        graph.add_node(peptide, type='peptide')
        graph.add_node(feature, type='feature')
        graph.add_edge(peptide, feature)
    return graph


def sort_by_count(features_list):
    """Return the distinct features ordered by peptide count, largest first.

    Counts are looked up in the notebook-level `razor_uniq_dict`. Features with
    equal counts are ordered alphabetically, so the result (which is joined
    into the `features` / `major_features` strings) is reproducible between
    runs instead of depending on set iteration order.

    A one-element list is returned unchanged (same object).
    """
    if len(features_list) == 1:
        return features_list
    # Key: count descending, then name ascending as a deterministic tie-break.
    return sorted(set(features_list), key=lambda feature: (-razor_uniq_dict[feature], feature))


In [89]:
# Split the peptide/feature graph into connected components and, inside every
# component, repeatedly peel off one group:
#   1. pick the leading feature (highest peptide count; ties go to a feature
#      that has unique peptides, then to the alphabetically first name),
#   2. take all remaining peptides attached to it,
#   3. add every other feature whose remaining peptides are all among those,
#   4. record the group and remove its features and peptides.
# Ties are resolved with min() instead of list(set)[0], so the grouping no
# longer depends on the set iteration order of strings, which changes between
# kernel restarts.
G = build_peptide_gene_graph(peptides_mapping_dict)
connected_components = nx.connected_components(G)

# Hoisted: used for membership tests in every iteration.
unique_gene_set = set(unique_genes)

final_protein_groups = []

for component in connected_components:
    # Remaining (not yet grouped) nodes of this component, kept as sets.
    features = {node for node in component if G.nodes[node]['type'] == 'feature'}
    peptides = {node for node in component if G.nodes[node]['type'] == 'peptide'}

    # Every pass removes at least the leading feature, so the loop terminates.
    while features or peptides:

        # One side exhausted while the other still has nodes: inconsistent state.
        if len(peptides) == 0 or len(features) == 0:
            print('Error')
            print(f'Peptides: {sorted(peptides)}')
            print(f'Features: {sorted(features)}')
            raise ValueError('Peptides or features are not empty')

        # Features tied for the maximum count; prefer ones with unique peptides.
        leading_candidates = set(find_leading(features))
        leading = min((leading_candidates & unique_gene_set) or leading_candidates)

        # Remaining peptides directly connected to the leading feature.
        leading_peptides = {peptide for peptide in peptides if G.has_edge(leading, peptide)}

        # Remaining features touching at least one leading peptide ...
        touching = set()
        for peptide in leading_peptides:
            touching.update(set(G.neighbors(peptide)) & features)
        # ... of which only those with no remaining peptide outside the
        # leading peptides join the group.
        other = {
            protein for protein in touching
            if (set(G.neighbors(protein)) & peptides) <= leading_peptides
        }
        other.discard(leading)
        leading_group = sorted(other) + [leading]

        # Razor feature: max count within the group, same tie-break as above.
        razor_candidates = set(find_leading(leading_group, more_then_half=False))
        razor_feature = min((razor_candidates & unique_gene_set) or razor_candidates)

        final_protein_groups.append({
            'features': sort_by_count(leading_group),
            'peptides': sorted(leading_peptides),
            'razor_feature': razor_feature,
            'major_features': sort_by_count(list(find_leading(leading_group, more_then_half=True))),
        })

        # Remove the grouped features and peptides before the next pass.
        features -= set(leading_group)
        peptides -= leading_peptides

In [90]:
# One row per group; the list-valued feature columns become ';'-joined strings.
final_protein_groups_df = pd.DataFrame(final_protein_groups)
for list_column in ('major_features', 'features'):
    final_protein_groups_df[list_column] = final_protein_groups_df[list_column].map(';'.join)
print(final_protein_groups_df.shape)
# One row per (group, peptide): explode the list of peptides.
final_matching = final_protein_groups_df.reset_index(drop=True).explode('peptides')
print(final_matching.shape)
# Count distinct group keys (major_features strings).
n_groups = final_matching['major_features'].nunique(dropna=False)
print(f"Number of proteins in grouped data: {n_groups}")


(526, 4)
(6143, 4)
Number of proteins in grouped data: 526


In [91]:
# Persist the Center 2 peptide -> group mapping in the check folder.
mapping_path = f'{output}/mapping.tsv'
final_matching.to_csv(mapping_path, sep='\t', index=False)

In [92]:
# Center 2 check: attach the Center-2-only mapping to the peptide-level report, sum the
# values per `major_features` group, mark groups dominated by reverse / contaminant
# peptides, and write the unfiltered and filtered tables to `output`.
aggregated_report = pd.read_csv(f'{path_to_data}/{center}/aggregated_NF.tsv', sep='\t')
# NOTE: rebinds `final_matching` to the copy read back from disk.
final_matching = pd.read_csv(f'{output}/mapping.tsv', sep='\t')

################################  MERGE MAPPING WITH AGGREGATED REPORT  ################################
# rename peptides to Sequence
final_matching = final_matching.rename(columns={'peptides': 'Sequence'})
# merge final_matching with aggregated_report (left join) using 'Sequence' as a key
aggregated_report = pd.merge(aggregated_report, final_matching, on='Sequence', how='left')
# write to file
aggregated_report.to_csv(f'{output}/aggregated_NF_mapped{feature_column}.tsv', sep='\t', index=False)
# remove unnecessary columns
aggregated_report = aggregated_report.drop(columns=['Sequence', 'Proteins', 'Gene.names', 'features', 'razor_feature'])
# drop rows where major_features is NA (peptides that got no group in the mapping)
aggregated_report.dropna(subset=['major_features'], inplace=True)

################################  CHECK FOR CONTAMINANTS  ################################
# Check for contaminants
#  group by major_features and sum intensities for each group (if there is NA in any column, it will be ignored)
# Per group: number of rows flagged '+' in Reverse / Potential.contaminant, and the
# group size (stored under the 'major_features' column until it is renamed below).
# NOTE(review): columns whose name contains 'intensity' are summed here AND again in the
# groupby-sum further down; if such columns exist they would presumably appear twice
# (suffixed) after the merge - confirm against the written files.
reverse_contaminants = aggregated_report.groupby('major_features').agg({**{col: 'sum' for col in aggregated_report.columns if 'intensity' in col},
                                        'Reverse': lambda x: (x == '+').sum(),
                                        'Potential.contaminant': lambda x: (x == '+').sum(),
                                        'major_features': 'size'})

# A group is marked '+' when at least half of its rows carry the flag; otherwise the
# literal string 'NA' is stored.
# NOTE(review): the all-centers aggregation earlier in this notebook uses a strict `>` for
# the same rule - confirm which threshold is intended.
reverse_contaminants['Reverse_mark'] = reverse_contaminants.apply(lambda x: '+' if x['Reverse'] >= x['major_features'] / 2 else 'NA', axis=1)
reverse_contaminants['Potential.contaminant_mark'] = reverse_contaminants.apply(lambda x: '+' if x['Potential.contaminant'] >= x['major_features'] / 2 else 'NA', axis=1)
# rename major_features (group size) to unique_razor_counts
reverse_contaminants = reverse_contaminants.rename(columns={'major_features': 'unique_razor_counts'})
reverse_contaminants.drop(columns=['Reverse', 'Potential.contaminant'], inplace=True)

################################  CALCULATE INTENSITIES  ################################
# Group by major_features and sum intensities for each group
# NOTE(review): `sum()` skips NA, so a group whose values are all NA presumably ends up
# as 0 rather than NA - confirm this is the intended encoding of missing intensities.
aggregated_report.drop(columns=['Reverse', 'Potential.contaminant'], inplace=True)
aggregated_report = aggregated_report.groupby('major_features').sum().reset_index()
# `reverse_contaminants` is still indexed by major_features, so the join key refers to
# the left column and the right index level of that name.
aggregated_report = pd.merge(aggregated_report, reverse_contaminants, on='major_features', how='left')

################################  WRITE TO FILE  ################################
# write to file (all groups, including the mark columns)
aggregated_report.to_csv(f'{output}/intensities_counts_ALL_pep_to{feature_column}.tsv', sep='\t', index=False)

# filter out contaminants and reverse hits
aggregated_report = aggregated_report[aggregated_report['Reverse_mark'] != '+']
aggregated_report = aggregated_report[aggregated_report['Potential.contaminant_mark'] != '+']
aggregated_report.drop(columns=['Reverse_mark', 'Potential.contaminant_mark'], inplace=True)

# save only intensities to file (without unique_razor_counts column)
aggregated_report.drop(columns=['unique_razor_counts']).to_csv(f'{output}/intensities_filtered_pep_to{feature_column}.tsv', sep='\t', index=False)
# NOTE: at this point there is one row per major_features group, so the number printed
# is the count of groups left after filtering, not a count of peptides.
print(f'Number of unique peptides: {len(aggregated_report)}')
# save only unique_razor_counts to file
aggregated_report[['major_features', 'unique_razor_counts']].to_csv(f'{output}/counts_filtered_pep_to{feature_column}.tsv', sep='\t', index=False)


Number of unique peptides: 501
