In [3]:
import sys
sys.path.append('/home/yuliya/repos/cosybio/FedProt/evaluation_utils/')

import pandas as pd
from preprocessing_MQ import razor_unique

In [5]:
# # # CLIENT LEVEL
# Exactly one (path_to_data, feature_column) pair has to be active: the loop below
# needs both names, and with every option commented out this cell fails with a
# NameError on a fresh kernel.
# NOTE(review): the peptides -> protein groups pair is enabled because the recorded
# outputs of this section match it (protein accessions in the groups, no rows lost
# to NA filtering) — confirm this is the run you want before re-executing.
path_to_data = '/home/yuliya/repos/cosybio/FedProt/data/TMT_data/balanced_data/04_Peptides_PG'
feature_column = 'Proteins'

# Alternative: aggregate peptides to genes
# path_to_data = '/home/yuliya/repos/cosybio/FedProt/data/TMT_data/balanced_data/03_Peptides_Genes'
# feature_column = 'Gene.names'

# Alternative: filter_rev check
# path_to_data = '/home/yuliya/repos/cosybio/FedProt/data/TMT_data/mapping_filtering_check/custom_NR/'
# feature_column = 'Proteins'

centers = ['Center1', 'Center2', 'Center3']

# Both dicts are keyed by center name.
intensities = {}   # full peptide-level report of each center
mappings = {}      # 'Sequence' -> feature_column table of each center

for center in centers:
    aggregated_report = pd.read_csv(f'{path_to_data}/{center}/aggregated_NF.tsv', sep='\t')
    intensities[center] = aggregated_report
    # Only the peptide sequence and its feature annotation are needed for the mapping.
    mapping = aggregated_report[['Sequence', feature_column]]
    mappings[center] = mapping
    print(f'{center} done')
    print(f'Number of peptides: {len(aggregated_report)}')

Center1 done
Number of peptides: 5958
Center2 done
Number of peptides: 6303
Center3 done
Number of peptides: 5528


In [6]:
# SERVER side --- combine the per-center peptide -> feature mappings into one table

merge_keys = ['Sequence', feature_column]
merged_mapping = None

for center_mapping in mappings.values():
    # The first center seeds the table; every following one is outer-joined on both
    # columns, so a (peptide, feature) pair reported by any center is retained.
    merged_mapping = (
        center_mapping
        if merged_mapping is None
        else merged_mapping.merge(center_mapping, on=merge_keys, how='outer')
    )

print(f'Number of peptides in merged mapping: {len(merged_mapping)}')
# Rows lacking either the sequence or the feature annotation cannot be grouped.
merged_mapping = merged_mapping.dropna(subset=merge_keys)
print(f'Number of peptides in merged mapping without NAs: {len(merged_mapping)}')


Number of peptides in merged mapping: 8824
Number of peptides in merged mapping without NAs: 8824


In [7]:
def find_union(group, feature_column='Proteins'):
    """Collapse every feature ID listed for one peptide into a single string.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows belonging to one peptide. Each value of ``group[feature_column]``
        must be an iterable of IDs (e.g. the list produced by ``str.split(';')``).
    feature_column : str, default 'Proteins'
        Name of the column holding the ID collections.

    Returns
    -------
    pandas.Series
        One-element Series keyed by ``feature_column`` whose value is the union
        of all IDs in the group, sorted and joined with ';'. An empty group
        yields an empty string.
    """
    # Union (not intersection): an ID reported in any row is kept.
    union = set().union(*group[feature_column])
    # Sorting makes the joined string reproducible; plain set iteration order for
    # strings can differ from one Python process to the next.
    return pd.Series({feature_column: ';'.join(sorted(union))})


# Turn each ';'-separated annotation into a list so the IDs can be merged per peptide.
split_features = merged_mapping[feature_column].str.split(';')
merged_mapping[feature_column] = split_features

# One row per peptide sequence, carrying the union of the IDs reported for it.
collapsed = merged_mapping.groupby('Sequence').apply(find_union, feature_column=feature_column)
merged_mapping = collapsed.reset_index()

print(f'Number of unique peptides: {len(merged_mapping)}')

Number of unique peptides: 7793


In [8]:
### server side --- calculate unique and razor
# Pass the merged peptide -> feature table to the project helper
# `razor_unique.peptide_grouping` (imported from preprocessing_MQ).
# NOTE(review): the helper's source is not visible here. Judging by the following
# cell, its result can be wrapped in pd.DataFrame and provides 'features',
# 'peptides', 'razor_feature' and 'major_features' — confirm against the helper.
final_protein_groups = razor_unique.peptide_grouping(merged_mapping, feature_column)

In [9]:
final_protein_groups_df = pd.DataFrame(final_protein_groups)

# Both columns hold collections of IDs; flatten each one into a ';'-joined string.
for list_column in ('major_features', 'features'):
    final_protein_groups_df[list_column] = final_protein_groups_df[list_column].map(';'.join)
print(final_protein_groups_df.shape)

# Long format: one row per (group, peptide) pair.
final_matching = (
    final_protein_groups_df
    .reset_index(drop=True)
    .explode('peptides')
)

print(final_matching.shape)
final_matching.head()


(700, 4)
(7793, 4)


Unnamed: 0,features,peptides,razor_feature,major_features
0,P05154;G3V2M1;G3V264;G3V482;G3V265;G3V3F5;G3V5...,AAAATGTIFTFR,P05154,P05154;G3V2M1
0,P05154;G3V2M1;G3V264;G3V482;G3V265;G3V3F5;G3V5...,AKWETSFNHK,P05154,P05154;G3V2M1
0,P05154;G3V2M1;G3V264;G3V482;G3V265;G3V3F5;G3V5...,AVVEVDESGTR,P05154,P05154;G3V2M1
0,P05154;G3V2M1;G3V264;G3V482;G3V265;G3V3F5;G3V5...,DFTFDLYR,P05154,P05154;G3V2M1
0,P05154;G3V2M1;G3V264;G3V482;G3V265;G3V3F5;G3V5...,EDQYHYLLDR,P05154,P05154;G3V2M1


In [11]:
# Every center receives an identical copy of the shared peptide -> group mapping.
for center in centers:
    mapping_file = f'{path_to_data}/{center}/mapping.tsv'
    final_matching.to_csv(mapping_file, sep='\t', index=False)

## Aggregate peptides to PG / Genes using mapping

In [13]:
for center in centers:
    # All files of a center live in its own directory. Building the directory once
    # keeps the '/' separator consistent for every read and write below (the input
    # paths previously lacked it and pointed at non-existent files).
    center_dir = f'{path_to_data}/{center}'
    aggregated_report = pd.read_csv(f'{center_dir}/aggregated_NF.tsv', sep='\t')
    final_matching = pd.read_csv(f'{center_dir}/mapping.tsv', sep='\t')

    ################################  MERGE MAPPING WITH AGGREGATED REPORT  ################################
    # The mapping names the peptide column 'peptides'; the report calls it 'Sequence'.
    final_matching = final_matching.rename(columns={'peptides': 'Sequence'})
    # Left join: every peptide of the report is kept, mapped or not.
    aggregated_report = pd.merge(aggregated_report, final_matching, on='Sequence', how='left')
    aggregated_report.to_csv(f'{center_dir}/aggregated_NF_mapped.tsv', sep='\t', index=False)
    # Keep only the grouping key, the Reverse / contaminant flags and the value columns.
    intensities[center] = aggregated_report.drop(columns=['Sequence', 'Proteins', 'Gene.names', 'features', 'razor_feature'])
    # Drop rows (peptides) that were not assigned to any group.
    intensities[center] = intensities[center].dropna(subset=['major_features'])

    ################################  CHECK FOR CONTAMINANTS  ################################
    # Per group: number of peptides flagged '+' as Reverse / Potential.contaminant and
    # the total number of peptides ('size'). Columns whose name contains 'intensity'
    # are summed as well, but those sums are not used further down.
    reverse_contaminants = intensities[center].groupby('major_features').agg({**{col: 'sum' for col in intensities[center].columns if 'intensity' in col},
                                            'Reverse': lambda x: (x == '+').sum(),
                                            'Potential.contaminant': lambda x: (x == '+').sum(),
                                            'major_features': 'size'})

    # A group is marked when more than half of its peptides carry the flag.
    # NOTE(review): the Center2 check at the end of this notebook uses '>=' for the
    # same rule — confirm which threshold is intended.
    reverse_contaminants['Reverse_mark'] = reverse_contaminants.apply(lambda x: '+' if x['Reverse'] > x['major_features'] / 2 else 'NA', axis=1)
    reverse_contaminants['Potential.contaminant_mark'] = reverse_contaminants.apply(lambda x: '+' if x['Potential.contaminant'] > x['major_features'] / 2 else 'NA', axis=1)
    # The per-group size is the number of unique + razor peptides.
    reverse_contaminants = reverse_contaminants.rename(columns={'major_features': 'unique_razor_counts'})
    reverse_contaminants = reverse_contaminants.drop(columns=['Reverse', 'Potential.contaminant'])

    ################################  CALCULATE INTENSITIES  ################################
    # Sum every remaining column within each group.
    intensities[center] = intensities[center].drop(columns=['Reverse', 'Potential.contaminant'])
    intensities[center] = intensities[center].groupby('major_features').sum().reset_index()

    ################################  WRITE TO FILE  ################################
    # Unfiltered group-level sums.
    intensities[center].to_csv(f'{center_dir}/intensities_counts_ALL.tsv', sep='\t', index=False)

    # Filter out contaminants and reverse hits. Only the count and the two marks are
    # joined in: merging the whole of reverse_contaminants would also bring its own
    # 'intensity' sums, which collide with the columns already present and come out
    # duplicated with '_x' / '_y' suffixes.
    group_flags = reverse_contaminants[['unique_razor_counts', 'Reverse_mark', 'Potential.contaminant_mark']]
    intensities[center] = pd.merge(intensities[center], group_flags, on='major_features', how='left')
    intensities[center] = intensities[center][intensities[center]['Reverse_mark'] != '+']
    intensities[center] = intensities[center][intensities[center]['Potential.contaminant_mark'] != '+']
    intensities[center] = intensities[center].drop(columns=['Reverse_mark', 'Potential.contaminant_mark'])

    # save only intensities to file (without unique_razor_counts column)
    intensities[center].drop(columns=['unique_razor_counts']).to_csv(f'{center_dir}/intensities_filtered.tsv', sep='\t', index=False)
    # save only unique_razor_counts to file
    # intensities[center][['major_features', 'unique_razor_counts']].to_csv(f'{path_to_data}/{center}/counts_filtered_withinCenter.tsv', sep='\t', index=False)


    

# Check on Center 2 values only

In [2]:
import pandas as pd
import networkx as nx

In [3]:
# # CLIENT LEVEL
# Alternative configuration (peptides -> protein groups):
# path_to_data = '/home/yuliya/repos/cosybio/FedProt/data/TMT_data/balanced_data/04_Peptides_PG'
# output = '/home/yuliya/repos/cosybio/FedProt/data/TMT_data/balanced_data/Check_center_2'
# feature_column = 'Proteins'

# Active configuration (peptides -> genes):
path_to_data = '/home/yuliya/repos/cosybio/FedProt/data/TMT_data/balanced_data/03_Peptides_Genes'
output = '/home/yuliya/repos/cosybio/FedProt/data/TMT_data/balanced_data/Check_center_2'
feature_column = 'Gene.names'

center = 'Center2'

# Peptide-level report of the single center under inspection.
aggregated_report = pd.read_csv(f'{path_to_data}/{center}/aggregated_NF.tsv', sep='\t')
# Peptide sequence plus its feature annotation.
mapping = aggregated_report[['Sequence', feature_column]]
print(f'{center} done')
print(f'Number of peptides: {len(aggregated_report)}')

# With one center there is nothing to merge; only the rows missing either the
# sequence or the annotation are discarded.
merged_mapping = mapping.dropna(subset=['Sequence', feature_column])
print(f'Number of peptides in merged mapping without NAs: {len(merged_mapping)}')

Center2 done
Number of peptides: 6303
Number of peptides in merged mapping without NAs: 6143


In [4]:
### server side --- calculate unique and razor
# NOTE(review): `razor_unique` is imported only in the very first cell of the notebook
# (from preprocessing_MQ, after the sys.path extension); the import cell of this
# section brings in pandas and networkx only. If the kernel is started at this
# section the call below would presumably raise NameError — confirm the run order.
# NOTE(review): the helper's source is not visible here; the following cell wraps its
# result in pd.DataFrame and reads 'major_features', 'features' and 'peptides'.
final_protein_groups = razor_unique.peptide_grouping(merged_mapping, feature_column)

In [5]:
final_protein_groups_df = pd.DataFrame(final_protein_groups)

# Both columns hold collections of IDs; flatten each one into a ';'-joined string.
for list_column in ('major_features', 'features'):
    final_protein_groups_df[list_column] = final_protein_groups_df[list_column].map(';'.join)
print(final_protein_groups_df.shape)

# Long format: one row per (group, peptide) pair.
final_matching = final_protein_groups_df.reset_index(drop=True).explode('peptides')
print(final_matching.shape)

# Distinct 'major_features' values, i.e. the number of groups.
n_groups = final_matching['major_features'].nunique(dropna=False)
print(f"Number of proteins in grouped data: {n_groups}")


(526, 4)
(6143, 4)
Number of proteins in grouped data: 526


In [6]:
# Store the peptide -> group mapping in the check directory (`output`) as a
# tab-separated file without the index column; the next cell reads it back from there.
final_matching.to_csv(f'{output}/mapping.tsv', sep='\t', index=False)

In [7]:
aggregated_report = pd.read_csv(f'{path_to_data}/{center}/aggregated_NF.tsv', sep='\t')
final_matching = pd.read_csv(f'{output}/mapping.tsv', sep='\t')

################################  MERGE MAPPING WITH AGGREGATED REPORT  ################################
# The mapping names the peptide column 'peptides'; the report calls it 'Sequence'.
final_matching = final_matching.rename(columns={'peptides': 'Sequence'})
# Left join: every peptide of the report is kept, mapped or not.
aggregated_report = pd.merge(aggregated_report, final_matching, on='Sequence', how='left')
aggregated_report.to_csv(f'{output}/aggregated_NF_mapped{feature_column}.tsv', sep='\t', index=False)
# Keep only the grouping key, the Reverse / contaminant flags and the value columns.
aggregated_report = aggregated_report.drop(columns=['Sequence', 'Proteins', 'Gene.names', 'features', 'razor_feature'])
# Drop rows (peptides) that were not assigned to any group.
aggregated_report = aggregated_report.dropna(subset=['major_features'])

################################  CHECK FOR CONTAMINANTS  ################################
# Per group: number of peptides flagged '+' as Reverse / Potential.contaminant and
# the total number of peptides ('size'). Columns whose name contains 'intensity'
# are summed as well, but those sums are not used further down.
reverse_contaminants = aggregated_report.groupby('major_features').agg({**{col: 'sum' for col in aggregated_report.columns if 'intensity' in col},
                                        'Reverse': lambda x: (x == '+').sum(),
                                        'Potential.contaminant': lambda x: (x == '+').sum(),
                                        'major_features': 'size'})

# A group is marked when at least half of its peptides carry the flag.
# NOTE(review): the multi-center loop earlier in this notebook uses a strict '>' for
# the same rule — confirm which threshold is intended.
reverse_contaminants['Reverse_mark'] = reverse_contaminants.apply(lambda x: '+' if x['Reverse'] >= x['major_features'] / 2 else 'NA', axis=1)
reverse_contaminants['Potential.contaminant_mark'] = reverse_contaminants.apply(lambda x: '+' if x['Potential.contaminant'] >= x['major_features'] / 2 else 'NA', axis=1)
# The per-group size is the number of unique + razor peptides.
reverse_contaminants = reverse_contaminants.rename(columns={'major_features': 'unique_razor_counts'})
reverse_contaminants = reverse_contaminants.drop(columns=['Reverse', 'Potential.contaminant'])

################################  CALCULATE INTENSITIES  ################################
# Sum every remaining column within each group.
aggregated_report = aggregated_report.drop(columns=['Reverse', 'Potential.contaminant'])
aggregated_report = aggregated_report.groupby('major_features').sum().reset_index()
# Only the count and the two marks are joined in: merging the whole of
# reverse_contaminants would also bring its own 'intensity' sums, which collide with
# the columns already present and come out duplicated with '_x' / '_y' suffixes.
group_flags = reverse_contaminants[['unique_razor_counts', 'Reverse_mark', 'Potential.contaminant_mark']]
aggregated_report = pd.merge(aggregated_report, group_flags, on='major_features', how='left')

################################  WRITE TO FILE  ################################
# Unfiltered group-level sums together with the counts and marks.
aggregated_report.to_csv(f'{output}/intensities_counts_ALL_pep_to{feature_column}.tsv', sep='\t', index=False)

# filter out contaminants and reverse hits
aggregated_report = aggregated_report[aggregated_report['Reverse_mark'] != '+']
aggregated_report = aggregated_report[aggregated_report['Potential.contaminant_mark'] != '+']
# Reassign instead of dropping in place: the frame is the result of boolean filtering.
aggregated_report = aggregated_report.drop(columns=['Reverse_mark', 'Potential.contaminant_mark'])

# save only intensities to file (without unique_razor_counts column)
aggregated_report.drop(columns=['unique_razor_counts']).to_csv(f'{output}/intensities_filtered_pep_to{feature_column}.tsv', sep='\t', index=False)
# Rows are 'major_features' groups at this point, not peptides, so label them as such.
print(f'Number of feature groups after filtering: {len(aggregated_report)}')
# save only unique_razor_counts to file
aggregated_report[['major_features', 'unique_razor_counts']].to_csv(f'{output}/counts_filtered_pep_to{feature_column}.tsv', sep='\t', index=False)


Number of unique peptides: 501
