In [270]:
import pandas as pd
import numpy as np


# Aggregate to Gene level

In [295]:
# CLIENT LEVEL

path_to_data = '/home/yuliya/repos/cosybio/FedProt/data/TMT_data/balanced_data/03_Peptides_Genes'
centers = ['Center1', 'Center2', 'Center3']

intensities = {}
mappings = {}

for center in centers:
    aggregated_report = pd.read_csv(f'{path_to_data}/{center}/aggregated_NF.tsv', sep='\t')
    # add all to intensities dict with key = center
    intensities[center] = aggregated_report
    # add all to mappings dict with key = center
    # mapping = aggregated_report[['Sequence', 'Proteins']]
    mapping = aggregated_report[['Sequence', 'Gene.names']]
    #rename Gene.names to Proteins
    mapping = mapping.rename(columns={'Gene.names': 'Proteins'})
    # transform Sequence column to index
    # mapping = mapping.set_index('Sequence')
    mappings[center] = mapping
    print(f'{center} done')
    print(f'Number of peptides: {len(aggregated_report)}')

Center1 done
Number of peptides: 5958
Center2 done
Number of peptides: 6303
Center3 done
Number of peptides: 5528


In [296]:
# SERCER side --- merge mappings
merged_mapping = None

for name, df in mappings.items():
    if merged_mapping is None:
        merged_mapping = df
    else:
        # merged_mapping = pd.merge(merged_mapping, df, on=['Sequence', 'Gene.names'], how='outer')
        merged_mapping = pd.merge(merged_mapping, df, on=['Sequence', 'Proteins'], how='outer')

print(f'Number of peptides in merged mapping: {len(merged_mapping)}')
# remove rows with NA
# merged_mapping = merged_mapping.dropna(subset=['Sequence', 'Gene.names'])
merged_mapping = merged_mapping.dropna(subset=['Sequence', 'Proteins'])
print(f'Number of peptides in merged mapping without NAs: {len(merged_mapping)}')



# keep only unique values in Sequence column by intersect Gene.names
def find_intersection(group):
    # Find the intersection of lists in "Gene.names" within the group
    intersection = set(group['Proteins'].iloc[0])
    for names in group['Proteins']:
        intersection &= set(names)
    # Return a Series with "Sequences" and the intersection of "Gene.names"
    return pd.Series({
        'Proteins': ';'.join(intersection)
        # 'Gene.names': intersection
    })


merged_mapping['Proteins'] = merged_mapping['Proteins'].str.split(';')
merged_mapping_unique = merged_mapping.groupby('Sequence').apply(find_intersection).reset_index()

print(f'Number of unique peptides: {len(merged_mapping_unique)}')

Number of peptides in merged mapping: 7854
Number of peptides in merged mapping without NAs: 7628
Number of unique peptides: 7567


In [297]:
# print non-unique rows
non_unique = merged_mapping[merged_mapping.duplicated(subset='Sequence', keep=False)]
non_unique.sort_values(by='Sequence')

Unnamed: 0,Sequence,Proteins
30,AAPSVTLFPPSSEELQANK,"[IGLC6, IGLC7]"
7435,AAPSVTLFPPSSEELQANK,[IGLC6]
57,ADGSPVK,"[IGLC1, IGLC7, IGLL5]"
7439,ADGSPVK,"[IGLC1, IGLL5]"
74,ADTLTDEINFLR,[KRT6A]
...,...,...
5736,YEDEINKR,"[KRT1, KRT2, KRT5, KRT6A, KRT75, KRT77, KRT8]"
7385,YEDEINKR,"[KRT1, KRT2, KRT5, KRT77, KRT8]"
7843,YEDEINKR,"[KRT1, KRT2, KRT5, KRT6A, KRT6C, KRT75, KRT77,..."
5742,YEELQVTAGR,"[KRT6A, KRT75]"


In [298]:
### server side --- calculate unique and razor

df_exploded = merged_mapping_unique.assign(**{'Proteins': merged_mapping_unique['Proteins'].str.split(';')}).explode('Proteins')

unique_razor = df_exploded['Proteins'].value_counts().rename_axis('Proteins').reset_index(name='Unique_razor')
unique_counts = merged_mapping_unique[merged_mapping_unique['Proteins'].str.contains(';') == False]['Proteins'].value_counts().rename_axis('Proteins').reset_index(name='Unique')

result = pd.merge(unique_razor, unique_counts, on='Proteins', how='left').fillna({'Unique': 0})

In [299]:
new_for_dict = merged_mapping_unique.groupby('Proteins')['Sequence'].apply(lambda x: ';'.join(x)).reset_index()
peptides_mappings = pd.Series(new_for_dict['Proteins'].values, index=new_for_dict['Sequence']).to_dict()
peptides_mapping_dict = {key: set(value.split(';')) for key, value in peptides_mappings.items()}

razor_uniq_dict = pd.Series(result['Unique_razor'].values, index=result['Proteins']).to_dict()
unique_genes = result[result['Unique'] > 0]['Proteins'].tolist()
multiple_non_unique_genes = result[(result['Unique'] == 0) & (result['Unique_razor'] > 1)]['Proteins'].tolist()

In [300]:
def find_leading_genes(genes):
    # Filter the counts for the genes in the set
    relevant_counts = {gene: razor_uniq_dict[gene] for gene in genes}
    # Find the max count
    max_count = max(relevant_counts.values())
    # Find all genes with the max count
    leading_genes = [gene for gene, count in relevant_counts.items() if count == max_count]
    return set(sorted(leading_genes))

In [301]:
i = 0
new_peptides_genes = peptides_mapping_dict.copy()


while True:

    print(f'Iteration {i}')
    new_dict = new_peptides_genes.copy()
    found = False

    for peptide_1, genes_1 in new_peptides_genes.items():
        for peptide_2, genes_2 in new_peptides_genes.items():
            if peptide_1 == peptide_2:
                continue

            common_genes = genes_1.intersection(genes_2)
            if len(common_genes) < 1:
                continue
            
            leading_1 = find_leading_genes(genes_1)
            leading_2 = find_leading_genes(genes_2)
            merged_peptide_key = f"{peptide_1};{peptide_2}"

            if leading_1 == leading_2:
               found = True
               new_dict[merged_peptide_key] = genes_1.union(genes_2)
               del new_dict[peptide_1]
               del new_dict[peptide_2]

            elif razor_uniq_dict[list(leading_1)[0]] > razor_uniq_dict[list(leading_2)[0]]:
               found = True
               new_dict[peptide_1] = new_dict[peptide_1] - common_genes

            elif razor_uniq_dict[list(leading_1)[0]] < razor_uniq_dict[list(leading_2)[0]]:
               found = True
               new_dict[peptide_2] = new_dict[peptide_2] - common_genes

            # case if leading genes are not the same but intersection exist
            # elif leading_1.issubset(common_genes) and leading_1 < leading_2:
            #     found = True
            #     new_dict[peptide_2] = new_dict[peptide_2] - common_genes

            # elif leading_2.issubset(common_genes) and leading_2 < leading_1:
            #     found = True
            #     new_dict[peptide_1] = new_dict[peptide_1] - common_genes
               
            elif len(leading_1) > len(leading_2):
                found = True
                new_dict[peptide_1] = new_dict[peptide_1] - common_genes
            
            elif len(leading_1) < len(leading_2):
                found = True
                new_dict[peptide_2] = new_dict[peptide_2] - common_genes

            elif len(genes_1) > len(genes_2):
                found = True
                new_dict[peptide_1] = new_dict[peptide_1] - common_genes
            
            elif len(genes_1) < len(genes_2):
                found = True
                new_dict[peptide_2] = new_dict[peptide_2] - common_genes

            else:
              print("\n\nOOPS! WARNING!")
              print(f'{genes_1} vs {genes_2}')
              print(f'{leading_1} vs {leading_2}')
              print(f'{common_genes}')
              print('\n\n')
              raise ValueError

            if found:
                break

        if found:
            break

    if new_dict == new_peptides_genes:
        break

    else:
        i += 1
        new_peptides_genes = new_dict

Iteration 0
Iteration 1
Iteration 2
Iteration 3
Iteration 4
Iteration 5
Iteration 6
Iteration 7
Iteration 8
Iteration 9
Iteration 10
Iteration 11
Iteration 12
Iteration 13
Iteration 14
Iteration 15
Iteration 16
Iteration 17
Iteration 18
Iteration 19
Iteration 20
Iteration 21
Iteration 22
Iteration 23
Iteration 24
Iteration 25
Iteration 26
Iteration 27
Iteration 28
Iteration 29
Iteration 30
Iteration 31
Iteration 32
Iteration 33
Iteration 34
Iteration 35
Iteration 36
Iteration 37
Iteration 38
Iteration 39
Iteration 40
Iteration 41
Iteration 42
Iteration 43
Iteration 44
Iteration 45
Iteration 46
Iteration 47
Iteration 48
Iteration 49
Iteration 50
Iteration 51
Iteration 52
Iteration 53
Iteration 54
Iteration 55
Iteration 56
Iteration 57
Iteration 58
Iteration 59
Iteration 60
Iteration 61
Iteration 62
Iteration 63
Iteration 64
Iteration 65
Iteration 66
Iteration 67
Iteration 68
Iteration 69
Iteration 70
Iteration 71
Iteration 72
Iteration 73
Iteration 74
Iteration 75
Iteration 76
Iteration

In [302]:
# Each value in iter_peptides_mapping_dict collapse using ; and transform to pandas df
collapsed_PG = pd.DataFrame({'Sequence': list(new_peptides_genes.keys()), 'Proteins': [sorted(value) for value in new_peptides_genes.values()]})


def find_leading_genes(genes):
    # Filter the counts for the genes in the set
    relevant_counts = {gene: razor_uniq_dict[gene] for gene in genes}
    # Find the max count
    max_count = max(relevant_counts.values())
    # Find all genes with the max count
    leading_genes = [gene for gene, count in relevant_counts.items() if count >= (max_count / 2)]
    return leading_genes


# add column with leading gene
collapsed_PG['Leading'] = collapsed_PG['Proteins'].apply(find_leading_genes)
# join Leading column valies to string using ;
collapsed_PG['Leading'] = collapsed_PG['Leading'].apply(lambda x: ';'.join(x))
collapsed_PG['All.proteins'] = collapsed_PG['Proteins'].apply(lambda x: ';'.join(x))
collapsed_PG

Unnamed: 0,Sequence,Proteins,Leading,All.proteins
0,ATWSGAVLAGR;CEGPIPDVTFELLR;CEGPIPDVTFELLREGETK...,[A1BG],A1BG,A1BG
1,ETDPLGKPR;IYLSDSLTGK;NPLGEGPVSNTVAFSTESADPR;TG...,[ABI3BP],ABI3BP,ABI3BP
2,AALPAQELEEYNK;AILQFYPK;DMVGLDALDAQPLLK;EAGYEGP...,[ACE],ACE,ACE
3,YPIEHGIITNWDDMEK,"[ACTA1, ACTA2, ACTC1, ACTG2]",ACTA1;ACTA2;ACTC1;ACTG2,ACTA1;ACTA2;ACTC1;ACTG2
4,GYFTHHHQR;HLLGPDYTETLYSPR,[ADAMDEC1],ADAMDEC1,ADAMDEC1
...,...,...,...,...
608,IQLVEEELDR;IQLVEEELDRAQER;LATALQK;AEGDVAALNR;A...,"[CCDC57, TPM4]",TPM4,CCDC57;TPM4
609,EDAANNYAR;AVCMLSNTTAIAEAWAR;QLFHPEQLITGK;VGINY...,"[TUBA3E, TUBA4A, TUBA8, TUBAL3]",TUBA3E;TUBA4A,TUBA3E;TUBA4A;TUBA8;TUBAL3
610,SYELPDGQVITIGNER;AVFPSIVGR;AVFPSIVGRPR;AGFAGDD...,"[ACTB, ACTBL2, ACTG1, POTEE, POTEF, POTEKP]",ACTB;ACTG1,ACTB;ACTBL2;ACTG1;POTEE;POTEF;POTEKP
611,KDAEAWFNEK;LASYLDKVR;LASYLDK;LAADDFR;QSVEADING...,"[KRT10, KRT18, KRT24, KRT26]",KRT10,KRT10;KRT18;KRT24;KRT26


In [303]:
# Load the original data
original_data = pd.read_csv('/home/yuliya/repos/cosybio/FedProt/data/TMT_data/raw_MaxQuant_reports/Combined/proteinGroups.txt', sep='\t')

# filter if Reverse or Potential contaminant columns contain +, if the value is empty - keep it
# original_data = original_data[~original_data['Reverse'].str.contains('\+', na=False)]
# original_data = original_data[~original_data['Potential contaminant'].str.contains('\+', na=False)]
# original_data = original_data[~original_data['Only identified by site'].str.contains('\+', na=False)]


# original_data = original_data[['Majority protein IDs']]
# original_data = original_data[['Protein IDs']]
original_data = original_data[['Gene.names']]
print(f'Number of proteins in original data: {len(original_data)}')

# For each value in Protein IDs, split by ;, sort and join by ;
# original_data['Proteins'] = original_data['Majority protein IDs'].str.split(';').apply(lambda x: str(';'.join(sorted(x))))
# original_data['Proteins'] = original_data['Protein IDs'].str.split(';').apply(lambda x: str(';'.join(sorted(x))))
original_data['Proteins'] = original_data['Gene.names'].str.split(';').apply(lambda x: str(';'.join(sorted(x))))
original_data
original_PG = set(original_data[['Proteins']].drop_duplicates().values.flatten().tolist())
original_PG = set(original_data[['Proteins']].drop_duplicates().values.flatten().tolist())
print(f'Number of proteins in original data after dropping duplicates: {len(original_PG)}')


KeyError: "None of [Index(['Gene.names'], dtype='object')] are in the [columns]"

In [289]:
# collapsed_PG_PG = set(collapsed_PG['Leading'].drop_duplicates().values.flatten().tolist())
collapsed_PG_PG = set(collapsed_PG['All.proteins'].drop_duplicates().values.flatten().tolist())
print(f'Number of proteins in collapsed data: {len(collapsed_PG_PG)}')

Number of proteins in collapsed data: 700


In [290]:
original_PG - collapsed_PG_PG

{'A0A024R6N5;A0A0B4J278;A0A0G2JRN3;G3V2B9;G3V387;G3V544;G3V5R8;P20848',
 'A0A075B6E2;M0QYF7;M0R140;M0R2L9;P39019',
 'A0A075B6K2;A0A075B6K5',
 'A0A075B6P5;A0A075B6S6;A0A087WW87;A0A0A0MRZ7;P01614;P01615',
 'A0A075B6Q5;A0A075B7F1;A0A0B4J1V0;A0A0C4DH36;A0A0J9YX35;P0DTE1;S4R3C0',
 'A0A075B6S5;A0A0C4DH67;A0A0C4DH69',
 'A0A075B727;P16284',
 'A0A075B7E8;A0A0B4J1X5;A0A0C4DH42;A0A0J9YVY3;P01767',
 'A0A087WTE4;A0A087WTF6;A0A087WTR3;A0A087WV75;A0A087WVD0;A0A087WVU1;A0A087WWD4;A0A087WWJ5;A0A087WX77;A0A087WZS4;A0A087X1V2;A0A0C4DGS4;A0A0D9SF30;P13591',
 'A0A087WV01;A0A087WVQ9;A0A2R8Y488;A0A2R8Y660;A0A2R8YDN5;A0A2U3TZH3;A0A6Q8PFK6;A0A7I2V3H3;A0A7I2V5N4;A0A7I2V659;A0A9L9PXK0;A0A9L9PYI8;A6PW80;P68104;Q05639;Q5JR01;Q5VTE0',
 'A0A087WWU8;A0A2R2Y2Q3;A0A494C0G0;A0A494C0P6;D6R904;H0YKP3;H0YL80;J3KN67;P06753',
 'A0A087WWY3;A0A669KBC6;A0A7I2V3E6;A0A7P0NMY4;A0A804HJC2;A0A804HK76;A0A804HL72;E7EN95;H0Y5C6;H0Y5F3;H7C5L4;O75369;P21333;Q14315;Q60FE5',
 'A0A087WYQ1;A4QPE4;O15020',
 'A0A087X0D5;A0A3B3IRM7;A0A7I2V2E4;A

In [291]:
collapsed_PG_PG - original_PG

{'A0A024R6N5;A0A0G2JRN3',
 'A0A075B6K0;P01717;P01718',
 'A0A075B6K2;P80748',
 'A0A075B6K5',
 'A0A075B6P5;A0A087WW87;P01614;P01615',
 'A0A075B6S6;A0A0A0MRZ7;P06310',
 'A0A075B736;A6NNZ2;CON__ENSEMBL:ENSBTAP00000025008;P68371;Q13885;Q3ZCM7;Q5SQY0;Q9BVA1;Q9H4B7',
 'A0A075B7E8;A0A0B4J1X5',
 'A0A075B7F0;A0A0B4J1V1;A0A0C4DH32;P01762;P01763;P01780',
 'A0A087WTE4;A0A087WTF6;A0A087WTR3;A0A087WVD0;A0A087WVU1;A0A087WWJ5;A0A087WZS4;A0A087X1V2;A0A0C4DGS4;A0A0D9SF30;A0A0D9SF98;H7BYX6',
 'A0A087WV01;A0A087WVQ9;A0A2R8Y488;A0A2R8Y660;A0A2U3TZH3;A0A7I2V3H3;A0A7I2V5N4;A0A7I2V659;A0A9L9PXK0;A0A9L9PYI8;A6PW80;P68104;Q05639;Q5JR01;Q5VTE0',
 'A0A087WV75;A0A087WWD4;A0A087WX77;P13591',
 'A0A087WWT3;A0A0C4DGB6;B7WNR0;C9JKR2;CON__P02768-1;D6RHD5;H0YA55;H7C013;P02768',
 'A0A087WWU8;A0A2R2Y2Q3;A0A494C0G0;A0A494C0P6;D6R904;J3KN67;P06753;Q5VU61',
 'A0A087WWY3;A0A669KBC6;A0A7I2V3E6;A0A7P0NMY4;A0A804HJC2;A0A804HK76;A0A804HL72;E7EN95;H0Y5C6;H0Y5F3;H7C2E7;H7C5L4;O75369;P21333;Q14315;Q60FE5',
 'A0A0A0MR79;A0A8I5KQY2;A0A8

In [292]:
len(original_PG - collapsed_PG_PG) / len(original_PG)

0.20270270270270271

In [293]:
len(collapsed_PG_PG - original_PG) / len(collapsed_PG_PG)

0.24142857142857144

In [665]:
# # check if gene names contains more than one unique gene
# # if yes - keep only unqiue with with max number of peptides in razor_uniq_dict
# for peptide,genes in peptides_mapping_dict.items():
#     if len(genes) > 1:
#         unique_ones = genes.intersection(unique_genes)
#         if len(unique_ones) > 1:
#             # remove unique ones that are not max
#             max_gene = max(unique_ones, key=lambda x: razor_uniq_dict[x])
#             unique_ones.remove(max_gene)
#             for gene in unique_ones:
#                 genes.remove(gene)
#             peptides_mapping_dict[peptide] = genes



In [638]:
def gene_in_dict_with_max(gene, max_gene, dictionary):
    """Check if gene and max_gene are in any entry together."""
    for genes in dictionary.values():
        if gene in genes and max_gene in genes:
            return True
    return False


i = 0
iter_peptides_mapping_dict = peptides_mapping_dict.copy()

while True:
    new_dict = iter_peptides_mapping_dict.copy()
    found = False

    for peptide_1, genes_1 in iter_peptides_mapping_dict.items():
        for peptide_2, genes_2 in iter_peptides_mapping_dict.items():
            if peptide_1 == peptide_2:
                continue

            common_genes = genes_1.intersection(genes_2)
            if not common_genes:
                continue
            
            union_genes = genes_1.union(genes_2)
            max_gene = max(union_genes, key=lambda x: razor_uniq_dict[x])

            # Logic 1: If gene lists are identical
            if genes_1 == genes_2:
                found = True
                new_dict[peptide_1] = genes_1
                del new_dict[peptide_2]
                
            
            # Logic 2: If the largest protein in union not in intersection
            elif max_gene not in common_genes:
                found = True
                # Remove intersection part as specified
                if max_gene in genes_1:
                    new_dict[peptide_1] = genes_1 - common_genes
                if max_gene in genes_2:
                    new_dict[peptide_2] = genes_2 - common_genes
                
            
            # Logic 3: If the largest protein is in the intersection
            else:
              found = True
              
              non_intersecting_genes_1 = genes_1 - common_genes
              non_intersecting_genes_2 = genes_2 - common_genes


              # Move genes to common if they appear with max_gene in any entry
              for gene in list(non_intersecting_genes_1):
                  if gene_in_dict_with_max(gene, max_gene, iter_peptides_mapping_dict):
                      non_intersecting_genes_1.remove(gene)
                      common_genes.add(gene)

              for gene in list(non_intersecting_genes_2):
                  if gene_in_dict_with_max(gene, max_gene, iter_peptides_mapping_dict):
                      non_intersecting_genes_2.remove(gene)
                      common_genes.add(gene)

              # Merge the entries for peptides if common_genes is modified
              if common_genes:
                  merged_peptide_key = f"{peptide_1};{peptide_2}"
                  new_dict[merged_peptide_key] = common_genes
                  del new_dict[peptide_1]
                  del new_dict[peptide_2]
                  

              # Add remaining non-intersecting genes under unique keys if any
              if non_intersecting_genes_1:
                  new_dict[f"{peptide_1}_unique"] = non_intersecting_genes_1
              if non_intersecting_genes_2:
                  new_dict[f"{peptide_2}_unique"] = non_intersecting_genes_2

            if found:
                break

        if found:
            break

    if new_dict == iter_peptides_mapping_dict:
        break
    else:
        print(f'Iteration {i}')
        i += 1
        iter_peptides_mapping_dict = new_dict


Iteration 0
Iteration 1
Iteration 2
Iteration 3
Iteration 4
Iteration 5
Iteration 6
Iteration 7
Iteration 8
Iteration 9
Iteration 10
Iteration 11
Iteration 12
Iteration 13
Iteration 14
Iteration 15
Iteration 16
Iteration 17
Iteration 18
Iteration 19
Iteration 20
Iteration 21
Iteration 22
Iteration 23
Iteration 24
Iteration 25
Iteration 26
Iteration 27
Iteration 28
Iteration 29
Iteration 30
Iteration 31
Iteration 32
Iteration 33
Iteration 34
Iteration 35
Iteration 36
Iteration 37
Iteration 38
Iteration 39
Iteration 40
Iteration 41
Iteration 42
Iteration 43
Iteration 44
Iteration 45
Iteration 46
Iteration 47
Iteration 48
Iteration 49
Iteration 50
Iteration 51
Iteration 52


Iteration 53
Iteration 54
Iteration 55
Iteration 56
Iteration 57
Iteration 58
Iteration 59
Iteration 60
Iteration 61
Iteration 62
Iteration 63
Iteration 64
Iteration 65
Iteration 66
Iteration 67
Iteration 68
Iteration 69
Iteration 70
Iteration 71
Iteration 72
Iteration 73
Iteration 74
Iteration 75
Iteration 76
Iteration 77
Iteration 78
Iteration 79
Iteration 80
Iteration 81
Iteration 82
Iteration 83
Iteration 84
Iteration 85
Iteration 86
Iteration 87
Iteration 88
Iteration 89
Iteration 90
Iteration 91
Iteration 92
Iteration 93
Iteration 94
Iteration 95
Iteration 96
Iteration 97
Iteration 98
Iteration 99
Iteration 100
Iteration 101
Iteration 102
Iteration 103
Iteration 104
Iteration 105
Iteration 106
Iteration 107
Iteration 108
Iteration 109
Iteration 110
Iteration 111
Iteration 112
Iteration 113
Iteration 114
Iteration 115
Iteration 116
Iteration 117
Iteration 118
Iteration 119
Iteration 120
Iteration 121
Iteration 122
Iteration 123
Iteration 124
Iteration 125
Iteration 126
Iteration 1

In [639]:
# Each value in iter_peptides_mapping_dict collapse using ; and transform to pandas df
collapsed_PG = pd.DataFrame({'Sequence': list(iter_peptides_mapping_dict.keys()), 'Proteins': [';'.join(sorted(value)) for value in iter_peptides_mapping_dict.values()]})
collapsed_PG

Unnamed: 0,Sequence,Proteins
0,LLIYGASTR,A0A075B6H7;A0A0C4DH55;A0A0C4DH90;P01624
1,FSGSSSGAER;GDGIPDRFSGSSSGAER,A0A075B6H9
2,SSQSLLHSDGK,A0A075B6S2;A2NJV5
3,VEDTAVYYCAR,A0A075B7B8
4,GLVWVSR,A0A075B7E8;A0A0B4J1X5
...,...,...
451,IYHSHIDAPK;DIASGLIGPLIICK;DIASGLIGPLIICKK;DSLD...,CON__ENSEMBL:ENSBTAP00000031900;E9PFZ2;H7C5N5;...
452,PNKPGVYVR;YILQGVTSWGLGCARPNKPGVYVR;APWCHTTNSQV...,A0A9L9PXP2;A0A9L9PY04;A0A9L9PYG2;A6PVI2;CON__P...
453,ALQEYRK;FMETVAEK;LFDSDPITVTVPVEVSR;LFDSDPITVTV...,E5RG36;E5RGB0;E5RJZ5;E7ERK6;E7ETB4;H0YAS8;H0YC...
454,QEYDESGPSIVHR;VAPEEHPVLLTEAPLNPK;EITALAPSTMK;D...,A0A2R8Y793;A0A2R8YEA7;A0A2R8YFE2;A0A2R8YGF8;A0...


In [646]:
# Load the original data
original_data = pd.read_csv('/home/yuliya/repos/cosybio/FedProt/data/TMT_data/raw_MaxQuant_reports/Center3/proteinGroups.txt', sep='\t')

# filter if Reverse or Potential contaminant columns contain +, if the value is empty - keep it
# original_data = original_data[~original_data['Reverse'].str.contains('\+', na=False)]
# original_data = original_data[~original_data['Potential contaminant'].str.contains('\+', na=False)]
# original_data = original_data[~original_data['Only identified by site'].str.contains('\+', na=False)]


original_data = original_data[['Protein IDs']]
print(f'Number of proteins in original data: {len(original_data)}')

# For each value in Protein IDs, split by ;, sort and join by ;
original_data['Proteins'] = original_data['Protein IDs'].str.split(';').apply(lambda x: str(';'.join(sorted(x))))
original_data
original_PG = set(original_data[['Proteins']].drop_duplicates().values.flatten().tolist())
print(f'Number of proteins in original data after dropping duplicates: {len(original_PG)}')


Number of proteins in original data: 505
Number of proteins in original data after dropping duplicates: 505


In [647]:
collapsed_PG_PG = set(collapsed_PG['Proteins'].drop_duplicates().values.flatten().tolist())
print(f'Number of proteins in collapsed data: {len(collapsed_PG_PG)}')

Number of proteins in collapsed data: 456


In [648]:
collapsed_PG_PG - original_PG

{'A0A024R6N5;A0A0G2JRN3',
 'A0A0A0MS51;A0A0A0MT01;P06396',
 'A0A0B4J278;G3V2B9;G3V387;G3V544;G3V5R8;P01009;P20848',
 'A0A0G2JH38;A0A0G2JHM4;A0A0G2JI59;A0A0G2JJ82;A0A0G2JJM0;A0A8V8TMI9;A0A8V8TNU0;A2ABG0;B4E1Z4;C9JYQ5;F8WCJ9;H7C5H1;P00751',
 'A0A0G2JIE7;A0A0G2JK28;A0A0G2JL69;A0A8Q3WKM6;A0A8Q3WKM9;A0A8Q3WKN5;A0A8Q3WL70;B4DQI1;E9PDZ0;F2Z3N2;H0Y3H6;H0Y868;P06681;Q5ST52;Q8N6L6',
 'A0A7P0T945;A0A7P0TA31;A6NHL2;C9J2C0;C9JDL2;C9K0S6;F5H5D3;F8VQQ4;F8VRZ4;F8VS66;F8VVB9;F8VWV9;F8VX09;P0DPH7;P0DPH8;P68363;Q6PEY2;Q71U36;Q9BQE3;Q9NY65',
 'A0A8Q3SI05;A0A8Q3SI34',
 'A0A8Q3SI45;A0A8Q3WKN7;A0A8Q3WLS3;M0QXZ3;M0QYC8;M0R1Q1;O95568;P01024',
 'A0A8Q3WM02',
 'A0A8V8TND7',
 'B1AHL2',
 'B7ZKJ8',
 'C9JEV8;C9JJQ8;C9JQ00;P68366',
 'C9JMA2;P48740',
 'CON__Q3T052;Q14624',
 'F8W876'}

In [649]:
original_PG - collapsed_PG_PG

{'A0A024R6N5;A0A0B4J278;A0A0G2JRN3;G3V2B9;G3V387;G3V544;G3V5R8;P20848',
 'A0A075B736;A6NNZ2;CON__ENSEMBL:ENSBTAP00000025008;Q3ZCM7;Q5SQY0;Q9H4B7',
 'A0A087WWT3;A0A0C4DGB6;B7WNR0;C9JKR2;CON__P02768-1;D6RHD5;H0YA55;H7C013;P02768',
 'A0A0A0MS51;A0A0A0MT01;A0A0U1RQL8;A0A8V8TND7;Q5T0H8;REV__A0A6Q8PG26;REV__F8W8F9;REV__H0YJM7;REV__Q6TDU7',
 'A0A0G2JH38;A0A8V8TMI9;A0A8V8TNU0;B4E1Z4;H7C5H1;P00751',
 'A0A0G2JHM4;A0A0G2JI59;A0A0G2JIE7;A0A0G2JJ82;A0A0G2JJM0;A0A0G2JK28;A0A0G2JL69;A0A8Q3WKM6;A0A8Q3WKM9;A0A8Q3WKN5;A0A8Q3WL70;A2ABG0;B4DQI1;C9JYQ5;E9PDZ0;F2Z3N2;F8WCJ9;H0Y3H6;H0Y868;P06681;Q5ST52;Q8N6L6',
 'A0A0J9YYC8;A6XMV9;CON__P07477;E7EQ64;H0Y8D1;P07477;P07478;Q8NHM4',
 'A0A0U1RRA4;B1AHM6;B1AHM7;B1AHM8;B1AHM9;B1AHN3;CON__ENSEMBL:ENSBTAP00000016046;P23142',
 'A0A1B0GTT5;A0A1W2PR46;A0A1X7SBR3;A0A1X7SCE1;B0YJC4;B4DIR1;CON__H-INV:HIT000292931;CON__O95678;CON__P02538;CON__P04259;CON__P05787;CON__P07744;CON__P08729;CON__P48668;CON__Q3KNV1;CON__Q6ISB0;CON__Q8VED5;CON__Q9DCV7;CON__Q9H552;CON__Q9NSB2;F8VUG2

In [644]:
len(original_PG - collapsed_PG_PG) / len(original_PG)

0.03508771929824561

In [645]:
len(collapsed_PG_PG - original_PG) / len(collapsed_PG_PG)

0.03508771929824561

In [357]:
peptides_mapping_dict

{'LLIYGASTR': {'A0A075B6H7', 'A0A0C4DH55', 'A0A0C4DH90', 'P01624'},
 'FSGSILGNK;SSGVPDR': {'A0A075B6I0'},
 'ITCSGDALPK;YAYWYQQK': {'A0A075B6K4'},
 'LSSVTAADTAVYYCAR': {'A0A075B6R2',
  'A0A087WSY4',
  'A0A087WW49',
  'A0A0C4DH41',
  'P01824',
  'P01825',
  'P06331',
  'P0DP06',
  'P0DP07',
  'P0DP08'},
 'GLVWVSR': {'A0A075B7E8', 'A0A0B4J1X5'},
 'YPLYVLK': {'A0A087WSY5',
  'A0A6Q8PG06',
  'A0A6Q8PHS9',
  'CON__Q2KIG3',
  'Q96IY4'},
 'ASQSVSSNLAWYQQK;ASQSVSSNLAWYQQKPGQAPR': {'A0A087WSY6', 'P01624'},
 'AADDTWEPFASGK;ALGISPFHEHAEVVFTANDSGPR;ALGISPFHEHAEVVFTANDSGPRR;GSPAINVAVHVFR;GSPAINVAVHVFRK;KAADDTWEPFASGK;RYTIAALLSPYSYSTTAVVTNPK;RYTIAALLSPYSYSTTAVVTNPKE;TSESGELHGLTTEEEFVEGIYK;YTIAALLSPYSYSTTAVVTNPKE': {'A0A087WT59',
  'P02766'},
 'AGEQDATIHLK;CVVTGEDGSESEATVNVK;FFLCQVAGDAK;LTPNQQR;NAPTPQEFR;VDKNDEAEYICIAENK': {'A0A087WTE4',
  'A0A087WTF6',
  'A0A087WV75',
  'A0A087WWD4',
  'A0A087WX77',
  'H7BYX6',
  'P13591'},
 'LPSGSDHVMLK': {'A0A087WTE4',
  'A0A087WTF6',
  'A0A087WTR3',
  'A0A087WV75'

In [360]:
res = []
for peptide, genes in peptides_mapping_dict.items():
    
    if genes.intersection({'A0A0D9SFE8', 'A0A2R8Y793', 'A0A2R8YEA7', 'A0A2R8YFE2', 'A0A2R8YGF8', 'A0A494C1H2', 'A0A6Q8PFE4', 'A0A6Q8PGD7', 'A0A6Q8PH58', 'A0A7P0TBL1', 'A0A804GS07', 'A0A804HKV3', 'A5A3E0', 'A6NL76', 'B8ZZJ2', 'C9JFL5', 'C9JTX5', 'C9JUM1', 'C9JZR7', 'E7EVS6', 'F6QUT6', 'F6UVQ4', 'F8WB63', 'F8WCH0', 'G5E9R0', 'I3L1U9', 'I3L3I0', 'I3L3R2', 'I3L4N8', 'J3KT65', 'K7EM38', 'P0CG38', 'P0CG39', 'P60709', 'P62736', 'P63261', 'P63267', 'P68032', 'P68133', 'Q562R1', 'Q6S8J3', 'Q9BYX7'}):
        print(sorted(genes))
        res += list(genes)

res = set(res)
print(" \n", sorted(res))

['A0A0D9SFE8', 'A0A494C1H2', 'P0CG38', 'P0CG39']
['A0A2R8YFE2', 'A0A6Q8PFE4', 'A0A804GS07', 'A5A3E0', 'I3L4N8', 'P0CG38', 'P0CG39', 'P60709', 'P63261', 'Q6S8J3', 'Q9BYX7']
['A0A2R8Y793', 'A0A2R8YGF8', 'A0A6Q8PFE4', 'A0A6Q8PH58', 'A0A804GS07', 'E7EVS6', 'G5E9R0', 'I3L1U9', 'I3L3I0', 'I3L3R2', 'I3L4N8', 'J3KT65', 'K7EM38', 'P60709', 'P63261']
['A0A2R8YFE2', 'A0A6Q8PFE4', 'A0A804GS07', 'A0A804HKV3', 'A6NL76', 'E7EVS6', 'I3L4N8', 'P60709', 'P62736', 'P63261', 'P63267', 'P68032', 'P68133']
['A0A2R8Y793', 'A0A2R8YEA7', 'A0A6Q8PFE4', 'A0A804GS07', 'A0A804HKV3', 'A6NL76', 'E7EVS6', 'I3L1U9', 'I3L3I0', 'I3L4N8', 'P60709', 'P62736', 'P63261', 'P63267', 'P68032', 'P68133', 'Q562R1']
['A0A2R8Y793', 'A0A2R8YEA7', 'A0A6Q8PFE4', 'A0A804GS07', 'A0A804HKV3', 'A5A3E0', 'A6NL76', 'E7EVS6', 'I3L4N8', 'P60709', 'P62736', 'P63261', 'P63267', 'P68032', 'P68133', 'Q562R1', 'Q6S8J3', 'Q9BYX7']
['A0A2R8Y793', 'A0A2R8YGF8', 'A0A6Q8PGD7', 'A0A6Q8PH58', 'C9JTX5', 'C9JUM1', 'C9JZR7', 'E7EVS6', 'G5E9R0', 'P60709']
[

In [None]:
#

In [380]:
# get all values from razor_uniq_dict for genes in res
res_1 = {gene:razor_uniq_dict[gene] for gene in res}
# print sorted values from res_1 by values
print(sorted(res_1.items(), key=lambda x: x[1], reverse=True), sep='\n')


# do the same for multiple_non_unique_genes

[('P60709', 17), ('P63261', 16), ('I3L4N8', 16), ('E7EVS6', 15), ('A0A804GS07', 14), ('A0A2R8Y793', 14), ('P68032', 12), ('P68133', 12), ('A0A6Q8PFE4', 12), ('P62736', 11), ('I3L3I0', 11), ('A6NL76', 11), ('I3L1U9', 11), ('P63267', 11), ('A0A2R8YGF8', 9), ('G5E9R0', 9), ('A0A6Q8PH58', 9), ('C9JZR7', 8), ('I3L3R2', 8), ('A0A804HKV3', 8), ('C9JUM1', 8), ('K7EM38', 8), ('J3KT65', 8), ('Q6S8J3', 7), ('B8ZZJ2', 7), ('F6QUT6', 7), ('C9JFL5', 7), ('F6UVQ4', 7), ('F8WB63', 7), ('C9JTX5', 7), ('A5A3E0', 7), ('P0CG38', 7), ('P0CG39', 5), ('Q562R1', 4), ('A0A2R8YEA7', 4), ('F8WCH0', 3), ('A0A2R8YFE2', 3), ('Q9BYX7', 3), ('A0A6Q8PGD7', 2), ('A0A0D9SFE8', 1), ('A0A494C1H2', 1), ('A0A7P0TBL1', 1)]


In [363]:
result[result['Proteins'].isin(['A0A0D9SFE8', 'A0A2R8Y793', 'A0A2R8YEA7', 'A0A2R8YFE2', 'A0A2R8YGF8', 'A0A494C1H2', 'A0A6Q8PFE4', 'A0A6Q8PGD7', 'A0A6Q8PH58', 'A0A7P0TBL1', 'A0A804GS07', 'A0A804HKV3', 'A5A3E0', 'A6NL76', 'B8ZZJ2', 'C9JFL5', 'C9JTX5', 'C9JUM1', 'C9JZR7', 'E7EVS6', 'F6QUT6', 'F6UVQ4', 'F8WB63', 'F8WCH0', 'G5E9R0', 'I3L1U9', 'I3L3I0', 'I3L3R2', 'I3L4N8', 'J3KT65', 'K7EM38', 'P0CG38', 'P0CG39', 'P60709', 'P62736', 'P63261', 'P63267', 'P68032', 'P68133', 'Q562R1', 'Q6S8J3', 'Q9BYX7'])].sort_values(by='Unique', ascending=False)

Unnamed: 0,Proteins,Unique_razor,Unique
187,P60709,17,0.0
222,I3L4N8,16,0.0
223,P63261,16,0.0
236,E7EVS6,15,0.0
243,A0A2R8Y793,14,0.0
258,A0A804GS07,14,0.0
282,P68133,12,0.0
284,P68032,12,0.0
291,A0A6Q8PFE4,12,0.0
333,I3L3I0,11,0.0


In [300]:
peptides_mapping_dict

{'ATWSGAVLAGR;CEGPIPDVTFELLR;CLAPLEGAR;FALVREDR;GVTFLLR;HQFLLTGDTQGR;IFFHLNAVALGDGGHYTCR;LELHVDGPPPRPQLR;LLELTGPK;NGVAQEPVHLDSPAIK;SGLSTGWTQLSK;SLPAPWLSMAPVSWITPGLK;TPGAAANLELIFVGPQHAGNYR;VTLTCVAPLSGVDFQLR': {'A1BG'},
 'AAQVTIQSSGTFSSK;AIGYLNTGYQR;DMYSFLEDMGLK;DNSVHWERPQKPK;DTVIKPLLVEPEGLEK;EQAPHCICANGR;ETTFNSLLCPSGGEVSEELSLK;FEVQVTVPK;FQVDNNNR;FSGQLNSHGCFYQQVK;GHFSISIPVK;HYDGSYSTFGER;IAQWQSFQLEGGLK;LHTEAQIQEEGTVVELTGR;LLIYAVLPTGDVIGDSAK;LLLQQVSLPELPGEYSMK;LVHVEEPHTETVR;MCPQLQQYEMHGPEGLR;NALFCLESAWK;NEDSLVFVQTDK;PQYMVLVPSLLHTETTEK;QFSFPLSSEPFQGSYK;QGIPFFGQVR;QKDNGCFR;QQNAQGGFSSTQDTVVALHALSK;QSSEITR;RTTVMVK;SASNMAIVDVK;SDIAPVAR;SLNEEAVK;SLNEEAVKK;SSSNEEVMFLTVQVK;TAQEGDHGSHVYTK;TEHPFTVEEFVLPK;TEVSSNHVLIYLDK;TGTHGLLVK;VDLSFSPSQSLPASHAHLR;VGFYESDVMGR;VSVQLEASPAFLAVPVEK;VTAAPQSVCALR;VTGEGCVYLQTSLK;VYDYYETDEFAIAEYNAPCSK;YDVENCLANK;YSDASDCHGEDSQAFCEK': {'A2M'},
 'ETDPLGKPR;IYLSDSLTGK;NPLGEGPVSNTVAFSTESADPR;TGQQLTSDQLPIK;VSEPVSAGR;YIQKPDNSPCSITDSVK': {'ABI3BP'},
 'AALPAQELEEYNK;AILQFYPK;DMVGLD

In [281]:
i = 0
last = False

while True:
    new_dict = peptides_mapping_dict.copy()

    found = False

    for peptide_1,genes_1 in peptides_mapping_dict.items():
        for peptide_2, genes_2 in peptides_mapping_dict.items():
            
            if peptide_1 == peptide_2:
                continue

            if genes_1 == genes_2 or genes_2 == genes_1:
                found = True
                # remove both keys from dict and add new key with both peptides with ";"
                new_key = peptide_1 + ';' + peptide_2
                del new_dict[peptide_1]
                del new_dict[peptide_2]
                new_dict[new_key] = genes_1

            elif genes_1.intersection(genes_2):

                common_genes = genes_1.intersection(genes_2)
                both_genes = genes_1.union(genes_2)  


            if found:
                break

        if found:
            break

    print(f'Cycle: {i}')
    i += 1
    if new_dict == peptides_mapping_dict:
        if last:
            print('Done')
            break
        last = True
    peptides_mapping_dict = new_dict
        

Cycle: 0
Cycle: 1
Cycle: 2
Cycle: 3
2
{'ACTA1', 'ACTC1', 'ACTA2', 'ACTG2'} {'ACTA1', 'ACTC1', 'ACTA2', 'ACTG1', 'POTEKP', 'ACTB', 'ACTG2', 'ACTBL2', 'POTEF', 'POTEE'}
Result:  set() {'ACTA1', 'ACTC1', 'ACTA2', 'ACTG1', 'POTEKP', 'ACTB', 'ACTG2', 'ACTBL2', 'POTEF', 'POTEE'}
Cycle: 4
Cycle: 5
1
{'ACTB', 'POTEJ', 'POTEI', 'POTEF', 'POTEE', 'ACTG1', 'POTEKP'} {'POTEJ', 'POTEI'}
Result:  {'ACTB', 'POTEJ', 'POTEI', 'POTEF', 'POTEE', 'ACTG1', 'POTEKP'} set()
Cycle: 6
Cycle: 7
Cycle: 8
Cycle: 9
Cycle: 10
Cycle: 11
Cycle: 12
2
{'APOC4-APOC2', 'APOC2'} {'APOC4-APOC2', 'APOC4'}
Result:  {'APOC2'} {'APOC4-APOC2', 'APOC4'}
Cycle: 13
Cycle: 14
Cycle: 15
Cycle: 16
Cycle: 17
Cycle: 18
Cycle: 19
Cycle: 20
Cycle: 21
Cycle: 22
Cycle: 23
Cycle: 24
Cycle: 25


Cycle: 26
Cycle: 27
Cycle: 28
Cycle: 29
Cycle: 30
Cycle: 31
Cycle: 32
Cycle: 33
Cycle: 34
Cycle: 35
Cycle: 36
Cycle: 37
Cycle: 38
Cycle: 39
Cycle: 40
Cycle: 41
2
{'HSPA6', 'HSPA1L', 'HSPA8', 'HSPA7', 'HSPA2', 'HSPA1A', 'HSPA1B'} {'HSPA2', 'HSPA1L', 'HSPA5', 'HSPA8'}
Result:  {'HSPA1A', 'HSPA1B', 'HSPA7', 'HSPA6'} {'HSPA2', 'HSPA1L', 'HSPA5', 'HSPA8'}
Cycle: 42
2
{'HSPA8', 'HSPA2'} {'HSPA2', 'HSPA1L', 'HSPA5', 'HSPA8'}
Result:  set() {'HSPA2', 'HSPA1L', 'HSPA5', 'HSPA8'}
Cycle: 43
Cycle: 44
Cycle: 45
Cycle: 46
Cycle: 47
Cycle: 48
Cycle: 49
Cycle: 50
Cycle: 51
Cycle: 52
Cycle: 53
Cycle: 54
Cycle: 55
2
{'IGHV3-74', 'IGHV3-66', 'IGHV3-21'} {'IGHV3-43', 'IGHV3-21'}
Result:  {'IGHV3-74', 'IGHV3-66'} {'IGHV3-43', 'IGHV3-21'}
Cycle: 56
Cycle: 57
Cycle: 58
Cycle: 59
Cycle: 60
Cycle: 61
Cycle: 62
Cycle: 63
Cycle: 64
Cycle: 65
Cycle: 66
Cycle: 67
Cycle: 68
Cycle: 69
Cycle: 70
Cycle: 71
Cycle: 72
Cycle: 73
Cycle: 74
Cycle: 75
Cycle: 76
Cycle: 77
Cycle: 78
Cycle: 79
Cycle: 80
Cycle: 81
Cycle: 82
Cy

In [250]:
result[(result['Gene.names'] == 'SAA4') | (result['Gene.names'] == 'SAA1')]

Unnamed: 0,Gene.names,Unique_razor,Unique
251,SAA4,6,0.0
301,SAA1,4,2.0


TypeError: unsupported operand type(s) for +: 'set' and 'set'

In [247]:
result

Unnamed: 0,Gene.names,Unique_razor,Unique
0,APOB,302,302.0
1,C3,129,128.0
2,FN1,98,98.0
3,C5,90,90.0
4,C4B,87,5.0
...,...,...,...
612,HLA-H,1,0.0
613,IGHV3OR16-13,1,0.0
614,HLA-A,1,0.0
615,EPX,1,0.0


In [283]:
peptides_mapping_dict

{'ATWSGAVLAGR;CEGPIPDVTFELLR;CLAPLEGAR;FALVREDR;GVTFLLR;HQFLLTGDTQGR;IFFHLNAVALGDGGHYTCR;LELHVDGPPPRPQLR;LLELTGPK;NGVAQEPVHLDSPAIK;SGLSTGWTQLSK;SLPAPWLSMAPVSWITPGLK;TPGAAANLELIFVGPQHAGNYR;VTLTCVAPLSGVDFQLR': {'A1BG'},
 'ETDPLGKPR;IYLSDSLTGK;NPLGEGPVSNTVAFSTESADPR;TGQQLTSDQLPIK;VSEPVSAGR;YIQKPDNSPCSITDSVK': {'ABI3BP'},
 'AALPAQELEEYNK;AILQFYPK;DMVGLDALDAQPLLK;EAGYEGPLHQCDIYR;TLGSANLPLAK;VLQAGSSRPWQEVLK': {'ACE'},
 'AGAQQPAVALETCNPQPCPAR;AGLAWSPCSR;AIGESFIMK;CVEAQGSLLK;EVCQAVPCPAR;FDLELPDGNR;GPGQADCAVAIGRPLGEVVTLR;LFINVAPHAR;LLDMTFSSK;SLVELTPIAAVHGR;VALTEDRLPR;VPVQEELCGLASKPGSR;YGSQLAPETFYR': {'ADAMTS13'},
 'KPPEESTCFERPCFK;SGPECGLAK': {'ADAMTSL2'},
 'GGPLGYQK;ILWIPAGALR;RPDGCGVCGGDDSTCR': {'ADAMTSL4'},
 'GDIGETGVPGAEGPR;GDPGLIGPK;IFYNQQNHYDGSTGK;SAFSVGLETYVTIPNMPIR': {'ADIPOQ'},
 'AESPEVCFNEESPK;AFSSYQK;AIPVTQYLK;CQAYESNR;DADPDTFFAK;DMVEYKDR;ESLLNHFLYEVAR;FLVNLVK;FTDSENVCQER;FTDSENVCQERDADPDTFFAK;FTFEYSR;GQCIINSNK;HFQNLGK;HNFSHCCSK;HPDLSIPELLR;HVCGALLK;IAPQLSTEELVSLGEK;ICAMEGLPQK;KSDVGF

In [273]:
len(peptides_mapping_dict)

472

In [142]:
result[result['Gene.names'].apply(lambda x: any(gene in x.split(';') for gene in ['ACTA1', 'ACTA2', 'ACTB', 'ACTBL2', 'ACTC1', 'ACTG1', 'ACTG2']))]


Unnamed: 0,Gene.names,Unique_razor,Unique
98,ACTB,20,1.0
99,ACTG1,19,0.0
177,ACTA1,12,0.0
178,ACTC1,12,0.0
195,ACTG2,11,0.0
196,ACTA2,11,0.0
372,ACTBL2,5,0.0


In [91]:
result

Unnamed: 0,Gene.names,Unique_razor,Unique
0,APOB,342,342.0
1,C3,170,168.0
2,FN1,115,115.0
3,VWF,107,107.0
4,C4B,98,5.0
...,...,...,...
758,TUBB3,1,0.0
759,TUBB2B,1,0.0
760,TUBB2A,1,0.0
761,TUBB4B,1,0.0


In [36]:
### Server side --- calculate PG for the next merging

def get_leading_unique(gene_names_list):
    # Filter gene names that are considered unique based on unique_peptides list
    unique_genes_in_row = [gene for gene in gene_names_list if gene in unique_genes]
    
    # If there's only one gene name, or none of the genes are unique, return as is
    if not unique_genes_in_row:
        print(f'No unique genes in {gene_names_list}')  
        return gene_names_list[0]  
    elif len(gene_names_list) == 1:
        return gene_names_list[0]
    else:
        # If multiple unique gene names, find the one with the highest Unique_razor value
        highest_razor_gene = result[result['Gene.names'].isin(unique_genes_in_row)].sort_values(by='Unique_razor', ascending=False).iloc[0]['Gene.names']
        return highest_razor_gene

merged_mapping_unique['Gene.names.list'] = merged_mapping_unique['Gene.names'].str.split(';')
merged_mapping_unique['Leading_unique'] = merged_mapping_unique['Gene.names.list'].apply(get_leading_unique)
# merged_mapping_unique.drop(columns=['Gene.names.list'], inplace=True)


No unique genes in ['HBXBP', 'ASGR2']
No unique genes in ['CALM3', 'CALM1', 'CALM2']
No unique genes in ['HIST1H2AG', 'HIST3H2A', 'H2AFX', 'HIST1H2AA', 'H2AFV', 'HIST1H2AC', 'HIST2H2AB', 'H2AFJ', 'HIST1H2AD', 'HIST1H2AB', 'HIST2H2AA3', 'H2AFZ', 'HIST2H2AC', 'HIST1H2AH', 'HIST1H2AJ']
No unique genes in ['SIRPA', 'SIRPB1']
No unique genes in ['HIST1H2BL', 'HIST1H2BM', 'HIST1H2BJ', 'HIST1H2BH', 'HIST1H2BN', 'HIST2H2BF', 'HIST1H2BB', 'HIST1H2BD', 'HIST1H2BO', 'HIST2H2BE', 'H2BFS', 'HIST1H2BC', 'HIST3H2BB', 'HIST1H2BK']
No unique genes in ['HSPA8', 'HSPA2']
No unique genes in ['HIST1H1C', 'HIST1H1D', 'HIST1H1E']
No unique genes in ['IGKV2-40', 'IGKV2D-28']
No unique genes in ['H2AFV', 'H2AFZ']
No unique genes in ['HBXBP', 'ASGR2']
No unique genes in ['SAA4', 'SAA2-SAA4']
No unique genes in ['TUBA1B', 'TUBA3E', 'TUBA1A', 'TUBA1C']
No unique genes in ['CALM3', 'CALM1', 'CALM2']
No unique genes in ['SAA4', 'SAA2-SAA4']
No unique genes in ['LILRB1', 'LILRB2']
No unique genes in ['EEF1A1', 'EEF1

In [7]:
result.head()

Unnamed: 0,Gene.names,Unique_razor,Unique
0,APOB,342,342.0
1,C3,170,168.0
2,FN1,115,115.0
3,VWF,107,107.0
4,C5,98,98.0


In [24]:
merged_mapping_unique

Unnamed: 0,Sequence,Gene.names,Gene.names.list,Leading_unique
0,AAAATGTIFTFR,SERPINA3;SERPINA5,"[SERPINA3, SERPINA5]",SERPINA3
1,AAAEVAGQFVIK,TFRC,[TFRC],TFRC
2,AAALAHLDR,AOC2;AOC3,"[AOC2, AOC3]",AOC3
3,AAANQMR,C4A;C4B,"[C4A, C4B]",C4B
4,AAAPNTPK,PRG4,[PRG4],PRG4
...,...,...,...,...
6915,YYSFFDLDPK,FERMT3,[FERMT3],FERMT3
6916,YYTYLIMNK,C3,[C3],C3
6917,YYVTIIDAPGHR,EEF1A1;EEF1A1P5,"[EEF1A1, EEF1A1P5]",EEF1A1
6918,YYYDGKDYIEFNK,AZGP1,[AZGP1],AZGP1


In [113]:
### Server side --- secong step preparation for creation of PG groups


# Function to update the dictionary with Leading_unique values
def update_dict(row):
    # Split Gene.names into individual gene names
    genes = row['Gene.names'].split(';')
    # For each gene name, append the Leading_unique value to the corresponding list in the dict
    for gene in genes:
        if gene in gene_dict:  # Check if the gene is a key in the dictionary
            gene_dict[gene].append(row['Leading_unique'])

gene_dict = {gene: [] for gene in set(result['Gene.names'].unique()) - set(unique_genes)}
# do not return anything, just update gene_dict
_ = merged_mapping_unique.apply(update_dict, axis=1)

# iterate over gene_dict and transform list to set
for gene, values in gene_dict.items():
    gene_dict[gene] = set(values)


# keep only Gene namse with biggest value in Unique_razor column
# Function to find the gene with the highest Unique_razor value
def find_highest_razor_gene(genes):
    # Filter the result DataFrame to include only the relevant genes
    filtered_result = result[result['Gene.names'].isin(genes)]
    # Find the gene with the maximum Unique_razor value
    highest_razor_gene = filtered_result.loc[filtered_result['Unique_razor'].idxmax()]['Gene.names']
    return highest_razor_gene

# Iterate through each gene in gene_dict
for gene, associated_genes in gene_dict.items():
    # Ensure associated genes are unique before proceeding
    unique_associated_genes = list(set(associated_genes))
    # If more than one unique associated gene, find the one with the highest Unique_razor
    if len(unique_associated_genes) > 1:
        highest_razor_gene = find_highest_razor_gene(unique_associated_genes)
        # Update the gene_dict to keep only the highest_razor_gene
        gene_dict[gene] = [highest_razor_gene]
    else:
        # Update with the unique list to remove potential duplicates
        gene_dict[gene] = unique_associated_genes



In [114]:
### server side --- finaly, create PG groups


def filter_gene_names(row):
    # Split Gene.names into a list for easier comparison
    genes_list = row['Gene.names'].split(';')
    # Filter the list based on the condition described
    filtered_genes = [gene for gene in genes_list if gene in gene_dict and gene_dict[gene][0] == row['Leading_unique']]
    # Return the filtered list as a string, joined by ';'
    return ';'.join(filtered_genes)

# Apply the filter function to each row of merged_mapping_unique
merged_mapping_unique['Gene.names'] = merged_mapping_unique.apply(filter_gene_names, axis=1)


# Step 1 & 2: Group by Leading_unique and aggregate Gene.names into unique, sorted strings
pg_groups = merged_mapping_unique.groupby('Leading_unique')['Gene.names'].apply(lambda x: ';'.join(sorted(set(sorted(x))))).reset_index()
pg_groups.rename(columns={'Gene.names': 'Groups'}, inplace=True)

# Ensure the Leading_unique column in merged_mapping_unique matches the type in pg_groups for accurate mapping
merged_mapping_unique['Leading_unique'] = merged_mapping_unique['Leading_unique'].astype(pg_groups['Leading_unique'].dtype)

# Step 3: Map the unique, sorted PG strings back to merged_mapping_unique
# Create a mapping dictionary from Leading_unique to PG
pg_mapping = dict(zip(pg_groups['Leading_unique'], pg_groups['Groups']))

# Apply the mapping to add the PG column to merged_mapping_unique
merged_mapping_unique['Groups'] = merged_mapping_unique['Leading_unique'].map(pg_mapping)

# Function to clean and sort gene names, removing duplicates
def clean_sort_genes(gene_str):
    unique_genes = sorted(set(gene_str.split(';')))
    return ';'.join(unique_genes)

# Apply this function to the 'PG' column to ensure each gene name is unique and sorted
merged_mapping_unique['Groups'] = merged_mapping_unique['Groups'].apply(clean_sort_genes)
merged_mapping_unique = merged_mapping_unique.drop(columns=['Gene.names', 'Leading_unique'])

In [115]:
merged_mapping_unique[merged_mapping_unique['Groups'].str.startswith(';') == True]

Unnamed: 0,Sequence,Gene.names.list,Groups
1,AAAEVAGQFVIK,[TFRC],;TFR2
2,AAALAHLDR,"[AOC3, AOC2]",;AOC2
8,AACLPLPGYR,[LCP1],;PLS1;PLS3
11,AAFCYQIR,[MST1],;MST1L
14,AAGHPGDPESQQR,[TLN1],;TLN2
...,...,...,...
6867,YTLNILEEIGGGQK,[LCP1],;PLS1;PLS3
6871,YTPVQQGPVGVNVTYGGDPIPK,[FLNA],;FLNB;FLNC
6879,YVDGGFGMGK,[AOC3],;AOC2
6906,YYGGGYGSTQATFMVFQALAQYQK,[C3],;METTL18


In [116]:
merged_mapping_unique[merged_mapping_unique['Groups'] == '']

Unnamed: 0,Sequence,Gene.names.list,Groups
0,AAAATGTIFTFR,"[SERPINA5, SERPINA3]",
3,AAANQMR,"[C4A, C4B]",
4,AAAPNTPK,[PRG4],
5,AAATGPSFWLGNETLK,[PEPD],
6,AAATLMSER,[SLC4A1],
...,...,...,...
6913,YYPYQSFQTPQHPSFLFQDK,[LGALS3BP],
6914,YYQENFCEQICSK,[C6],
6915,YYSFFDLDPK,[FERMT3],
6918,YYYDGKDYIEFNK,[AZGP1],


In [117]:
# return to the client side

## CLIENT side --- merge intensities by Groups
for center in centers:
    
    intensities[center] = pd.merge(intensities[center], merged_mapping_unique, on='Sequence', how='left')
    # group by Groups and summarize intensities in each column for the group
    intensities[center] = intensities[center].dropna(subset=['Groups'])
    # remove Sequence column and Proteins column and Gene.names column
    # remove also rows with NA in Groups column
    intensities[center] = intensities[center].drop(columns=['Sequence', 'Proteins', 'Gene.names'])
    intensities[center] = intensities[center].groupby('Groups').sum().reset_index()
    

In [118]:
intensities['Center1'] 

Unnamed: 0,Groups,P_1.RIC_1,P_1.RIC_2,P_1.RIC_3,P_1.RIC_4,P_1.RIC_5,P_1.RIC_6,P_1.RIC_7,P_1.RIC_8,P_1.RIC_9,...,P_2.RIC_3,P_2.RIC_4,P_2.RIC_5,P_2.RIC_6,P_2.RIC_7,P_2.RIC_8,P_2.RIC_9,P_2.RIC_10,P_2.RIC_11,Gene.names.list
0,,1.284534e+08,1.298108e+08,1.318517e+08,1.124940e+08,1.407478e+08,1.162514e+08,1.386974e+08,1.053387e+08,1.264607e+08,...,1.182479e+08,1.117078e+08,1.348059e+08,1.116023e+08,1.206488e+08,1.084519e+08,1.215506e+08,1.048167e+08,1.321534e+08,"[SERPINA5, SERPINA3, PRG4, PEPD, C4A, C4B, TTR..."
1,;ACTA1;ACTA2;ACTBL2;ACTC1;ACTG1;ACTG2;POTEE;PO...,2.318205e+05,3.530995e+05,4.941979e+05,3.319458e+05,3.432104e+05,2.678653e+05,5.389929e+05,2.907673e+05,4.879625e+05,...,3.813681e+05,4.304619e+05,3.842753e+05,4.660764e+05,3.102147e+05,4.395342e+05,3.183050e+05,3.024991e+05,3.866126e+05,"[POTEJ, POTEI, ACTG2, ACTC1, ACTA2, ACTB, ACTA..."
2,;ACTN4,1.835200e+04,2.796430e+04,2.860940e+04,2.643930e+04,2.479970e+04,2.333600e+04,4.431260e+04,2.399040e+04,3.709690e+04,...,1.002350e+04,1.279990e+04,9.976400e+03,1.397520e+04,9.842200e+03,1.277900e+04,9.250600e+03,6.677800e+03,1.188250e+04,"[ACTN1, ACTN4, ACTN1, ACTN1, ACTN1, ACTN1, ACT..."
3,;AOC2,1.475140e+04,1.127460e+04,1.028160e+04,9.714200e+03,1.121850e+04,1.165250e+04,1.176800e+04,1.013810e+04,1.029630e+04,...,3.292300e+03,2.598600e+03,4.602800e+03,2.428900e+03,3.237800e+03,3.254700e+03,2.267600e+03,2.587000e+03,3.069400e+03,"[AOC3, AOC3]"
4,;APOC4-APOC2,8.828990e+04,9.741310e+04,8.806520e+04,8.120570e+04,4.740346e+05,1.105680e+05,1.963853e+05,8.338500e+04,9.742690e+04,...,1.174983e+05,1.207656e+05,1.949836e+05,1.096520e+05,9.947970e+04,8.725530e+04,1.570494e+05,1.151250e+05,1.579420e+05,"[APOC4, APOC4, APOC4, APOC4-APOC2, APOC4, APOC..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77,IGKV3-7,6.950400e+03,1.055800e+04,6.776600e+03,6.617700e+03,6.404500e+03,7.863600e+03,9.239200e+03,8.769100e+03,6.074300e+03,...,6.790400e+03,9.633700e+03,3.285600e+03,8.991600e+03,9.883900e+03,8.767900e+03,8.497600e+03,8.591600e+03,7.034500e+03,"[IGKV3D-7, IGKV3-7, IGKV3OR2-268]"
78,RPS27A;UBA52;UBB;UBC,1.804620e+04,2.249030e+04,3.009730e+04,2.296350e+04,2.776010e+04,1.825560e+04,2.517140e+04,2.227620e+04,2.884850e+04,...,3.481800e+03,3.231900e+03,4.890100e+03,2.884300e+03,2.809800e+03,3.084600e+03,3.119000e+03,2.881000e+03,3.434900e+03,"[UBC, UBB, RPS27A, UBA52, UBC, UBB, RPS27A, UB..."
79,SAA2-SAA4;SAA4,2.532270e+05,2.710454e+05,2.109990e+05,2.250424e+05,4.647940e+05,2.198176e+05,3.298730e+05,2.079793e+05,2.130006e+05,...,3.333882e+05,2.607482e+05,4.242393e+05,2.874436e+05,2.374499e+05,2.877665e+05,3.699380e+05,3.266270e+05,3.218000e+05,"[SAA4, SAA2-SAA4, SAA4, SAA2-SAA4, SAA4, SAA2-..."
80,SIRPA;SIRPB1;SIRPG,2.285170e+04,2.368640e+04,1.998480e+04,1.892830e+04,1.735840e+04,1.735870e+04,2.302190e+04,2.219890e+04,2.067020e+04,...,1.192000e+04,2.546300e+04,1.629100e+04,2.084200e+04,2.291400e+04,2.437600e+04,1.231900e+04,2.368700e+04,1.779200e+04,"[SIRPB1, SIRPA, SIRPB1, SIRPG, SIRPA]"
