In [603]:
import pandas as pd
import numpy as np


# Aggregate to Gene level

In [604]:
# CLIENT LEVEL

path_to_data = '/home/yuliya/repos/cosybio/FedProt/data/TMT_data/balanced_data/04_Peptides_PG'
centers = ['Center1', 'Center2', 'Center3']

intensities = {}
mappings = {}

for center in centers:
    aggregated_report = pd.read_csv(f'{path_to_data}/{center}/aggregated_NF.tsv', sep='\t')
    # add all to intensities dict with key = center
    intensities[center] = aggregated_report
    # add all to mappings dict with key = center
    mapping = aggregated_report[['Sequence', 'Proteins']]
    # mapping = aggregated_report[['Sequence', 'Gene.names']]
    #rename Gene.names to Proteins
    # mapping = mapping.rename(columns={'Gene.names': 'Proteins'})
    # transform Sequence column to index
    # mapping = mapping.set_index('Sequence')
    mappings[center] = mapping
    print(f'{center} done')
    print(f'Number of peptides: {len(aggregated_report)}')

Center1 done
Number of peptides: 5958
Center2 done
Number of peptides: 6303
Center3 done
Number of peptides: 5528


In [605]:
# SERCER side --- merge mappings
merged_mapping = None

for name, df in mappings.items():
    if merged_mapping is None:
        merged_mapping = df
    else:
        # merged_mapping = pd.merge(merged_mapping, df, on=['Sequence', 'Gene.names'], how='outer')
        merged_mapping = pd.merge(merged_mapping, df, on=['Sequence', 'Proteins'], how='outer')

print(f'Number of peptides in merged mapping: {len(merged_mapping)}')
# remove rows with NA
# merged_mapping = merged_mapping.dropna(subset=['Sequence', 'Gene.names'])
merged_mapping = merged_mapping.dropna(subset=['Sequence', 'Proteins'])
print(f'Number of peptides in merged mapping without NAs: {len(merged_mapping)}')



# keep only unique values in Sequence column by intersect Gene.names
def find_intersection(group):
    # Find the intersection of lists in "Gene.names" within the group
    intersection = set(group['Proteins'].iloc[0])
    for names in group['Proteins']:
        intersection &= set(names)
    # Return a Series with "Sequences" and the intersection of "Gene.names"
    return pd.Series({
        'Proteins': ';'.join(intersection)
        # 'Gene.names': intersection
    })


def find_union(group):
    # Find the intersection of lists in "Gene.names" within the group
    union = set(group['Proteins'].iloc[0])
    for names in group['Proteins']:
        union |= set(names)
    # Return a Series with "Sequences" and the intersection of "Gene.names"
    return pd.Series({
        'Proteins': ';'.join(union)
        # 'Gene.names': intersection
    })

merged_mapping['Proteins'] = merged_mapping['Proteins'].str.split(';')
merged_mapping_unique = merged_mapping.groupby('Sequence').apply(find_union).reset_index()

print(f'Number of unique peptides: {len(merged_mapping_unique)}')

Number of peptides in merged mapping: 8044
Number of peptides in merged mapping without NAs: 8044
Number of unique peptides: 7793


In [606]:
# print non-unique rows
non_unique = merged_mapping[merged_mapping.duplicated(subset='Sequence', keep=False)]
non_unique.sort_values(by='Sequence')

Unnamed: 0,Sequence,Proteins
30,AAPSVTLFPPSSEELQANK,"[A0A5H1ZRQ7, A0M8Q6, P0CF74, P0DOY2, P0DOY3]"
7516,AAPSVTLFPPSSEELQANK,"[P0CF74, P0DOY2, P0DOY3]"
57,ADGSPVK,"[A0A0B4J231, A0A5H1ZRQ4, A0A5H1ZRQ7, A0M8Q6, B..."
7520,ADGSPVK,"[A0A0B4J231, A0A5H1ZRQ4, B9A064, P0CG04]"
74,ADTLTDEINFLR,"[CON__P02538, P02538]"
...,...,...
8039,YRVFALDQK,"[A0A0G2JPR0, A0A140TA49, A0A8V8TLP6, CON__P010..."
5917,YTVNQCR,[P00450]
8041,YTVNQCR,"[E9PFZ2, H7C5R1, P00450]"
5923,YVLPNFEVK,"[A0A0G2JPR0, A0A140TA49, A0A8V8TLP6, CON__ENSE..."


In [607]:
### server side --- calculate unique and razor

df_exploded = merged_mapping_unique.assign(**{'Proteins': merged_mapping_unique['Proteins'].str.split(';')}).explode('Proteins')

unique_razor = df_exploded['Proteins'].value_counts().rename_axis('Proteins').reset_index(name='Unique_razor')
unique_counts = merged_mapping_unique[merged_mapping_unique['Proteins'].str.contains(';') == False]['Proteins'].value_counts().rename_axis('Proteins').reset_index(name='Unique')

result = pd.merge(unique_razor, unique_counts, on='Proteins', how='left').fillna({'Unique': 0})

In [608]:
new_for_dict = merged_mapping_unique.groupby('Proteins')['Sequence'].apply(lambda x: ';'.join(x)).reset_index()
peptides_mappings = pd.Series(new_for_dict['Proteins'].values, index=new_for_dict['Sequence']).to_dict()
peptides_mapping_dict = {key: set(value.split(';')) for key, value in peptides_mappings.items()}

razor_uniq_dict = pd.Series(result['Unique_razor'].values, index=result['Proteins']).to_dict()
unique_genes = result[result['Unique'] > 0]['Proteins'].tolist()
multiple_non_unique_genes = result[(result['Unique'] == 0) & (result['Unique_razor'] > 1)]['Proteins'].tolist()

In [609]:
def find_leading_genes(genes, more_then_half=False):
    # Filter the counts for the genes in the set
    relevant_counts = {gene: razor_uniq_dict[gene] for gene in genes}
    # Find the max count
    max_count = max(relevant_counts.values())
    
    if more_then_half:
        leading_genes = [gene for gene, count in relevant_counts.items() if count == max_count]
    
    else:
        leading_genes = [gene for gene, count in relevant_counts.items() if count >= max_count / 2]

    return set(sorted(leading_genes))

In [610]:
i = 0
new_peptides_genes = peptides_mapping_dict.copy()


while True:

    print(f'Iteration {i}')
    new_dict = new_peptides_genes.copy()
    found = False

    for peptide_1, genes_1 in new_peptides_genes.items():
        for peptide_2, genes_2 in new_peptides_genes.items():
            if peptide_1 == peptide_2:
                continue

            common_genes = genes_1.intersection(genes_2)
            if len(common_genes) < 1:
                continue
            
            leading_1 = find_leading_genes(genes_1)
            leading_2 = find_leading_genes(genes_2)
            merged_peptide_key = f"{peptide_1};{peptide_2}"

            unique_1 = [gene for gene in genes_1 if gene in unique_genes]
            unique_2 = [gene for gene in genes_2 if gene in unique_genes]

            if leading_1 == leading_2:
                found = True
                new_dict[merged_peptide_key] = genes_1.union(genes_2)
                del new_dict[peptide_1]
                del new_dict[peptide_2]

            elif razor_uniq_dict[list(leading_1)[0]] > razor_uniq_dict[list(leading_2)[0]]:
                found = True
                new_dict[peptide_1] = new_dict[peptide_1] - common_genes

            elif razor_uniq_dict[list(leading_1)[0]] < razor_uniq_dict[list(leading_2)[0]]:
                found = True
                new_dict[peptide_2] = new_dict[peptide_2] - common_genes

            elif unique_1 == unique_2 and len(unique_1) > 0 and all(element in common_genes for element in unique_1) and all(element in leading_1 for element in unique_1):
                found = True
                new_dict[merged_peptide_key] = genes_1.union(genes_2)
                del new_dict[peptide_1]
                del new_dict[peptide_2]
               
            elif len(unique_1) > len(unique_2):
                found = True
                new_dict[peptide_1] = new_dict[peptide_1] - common_genes

            elif len(unique_1) < len(unique_2):
                found = True
                new_dict[peptide_2] = new_dict[peptide_2] - common_genes

            elif len(leading_1) > len(leading_2):
                found = True
                new_dict[peptide_1] = new_dict[peptide_1] - common_genes
            
            elif len(leading_1) < len(leading_2):
                found = True
                new_dict[peptide_2] = new_dict[peptide_2] - common_genes

            elif len(genes_1) > len(genes_2):
                found = True
                new_dict[peptide_1] = new_dict[peptide_1] - common_genes
            
            elif len(genes_1) < len(genes_2):
                found = True
                new_dict[peptide_2] = new_dict[peptide_2] - common_genes

            elif len(unique_1) == len(unique_2) == 0:
                found = True
                new_dict[peptide_1] = new_dict[peptide_1] - common_genes

            else:
              print("\n\nOOPS! WARNING!")
              print(f'Genes: {genes_1} vs {genes_2}')
              print(f'Leading genes: {leading_1} vs {leading_2}')
              print(f'Common genes: {common_genes}')
              print(f'Unique genes: {unique_1} vs {unique_2}')
              print('\n\n')
              raise ValueError

            if found:
                break

        if found:
            break

    if new_dict == new_peptides_genes:
        break

    else:
        i += 1
        new_peptides_genes = new_dict

Iteration 0
Iteration 1
Iteration 2
Iteration 3
Iteration 4
Iteration 5
Iteration 6
Iteration 7
Iteration 8
Iteration 9
Iteration 10
Iteration 11
Iteration 12
Iteration 13
Iteration 14
Iteration 15
Iteration 16
Iteration 17
Iteration 18
Iteration 19
Iteration 20
Iteration 21
Iteration 22
Iteration 23
Iteration 24
Iteration 25
Iteration 26
Iteration 27
Iteration 28
Iteration 29
Iteration 30
Iteration 31
Iteration 32
Iteration 33
Iteration 34
Iteration 35
Iteration 36
Iteration 37
Iteration 38
Iteration 39
Iteration 40
Iteration 41
Iteration 42
Iteration 43
Iteration 44
Iteration 45
Iteration 46
Iteration 47
Iteration 48
Iteration 49
Iteration 50
Iteration 51
Iteration 52
Iteration 53
Iteration 54
Iteration 55
Iteration 56
Iteration 57
Iteration 58
Iteration 59
Iteration 60
Iteration 61
Iteration 62
Iteration 63
Iteration 64
Iteration 65
Iteration 66
Iteration 67
Iteration 68
Iteration 69
Iteration 70
Iteration 71
Iteration 72
Iteration 73
Iteration 74
Iteration 75
Iteration 76
Iteration

In [612]:
# Each value in iter_peptides_mapping_dict collapse using ; and transform to pandas df
collapsed_PG = pd.DataFrame({'Sequence': list(new_peptides_genes.keys()), 'Proteins': [sorted(value) for value in new_peptides_genes.values()]})

# add column with leading gene
collapsed_PG['Major'] = collapsed_PG['Proteins'].apply(lambda x: find_leading_genes(x, more_then_half=True))
collapsed_PG['Leading'] = collapsed_PG['Proteins'].apply(find_leading_genes)
# join Leading column valies to string using ;
collapsed_PG['Major'] = collapsed_PG['Major'].apply(lambda x: ';'.join(sorted(x)))
collapsed_PG['Leading'] = collapsed_PG['Leading'].apply(lambda x: ';'.join(sorted(x)))
collapsed_PG['All.proteins'] = collapsed_PG['Proteins'].apply(lambda x: ';'.join(x))
collapsed_PG

ValueError: max() arg is an empty sequence

In [613]:
collapsed_PG[collapsed_PG['Proteins'].apply(len) == 0]

Unnamed: 0,Sequence,Proteins
9,AGEQDATIHLK;CVVTGEDGSESEATVNVK;DKDISWFSPNGEK;F...,[]
10,ASWTRPEK,[]
16,EEHLCTQR,[]
23,LTVNSLK,[]
25,EPQVYTLPPSRDELTK;GPSVFPLAPSSK;GPSVFPLAPSSKSTSG...,[]
...,...,...
913,GSCGIGGGIGGGSSR;ISSVLAGGSCR;IRDWYQR,[]
914,EVATNSELVQSGK;TEELNREVATNSELVQSGK;APSTYGGGLSVS...,[]
926,DSLDKEK;ELHHLQEQNVSNAFLDK;FNKNNEGTYYSPNYNPQSR;...,[]
940,DVDAAYMNKVELQAK;GMQDLVEDFK;KQCANLQAAIADAEQR;QC...,[]


In [None]:
# split values in Sequence column to list
# split this list to separate rows
# to each row add corresponding values from Leading column

resulting_mapping = collapsed_PG.assign(**{'Sequence': collapsed_PG['Sequence'].str.split(';')}).explode('Sequence')
resulting_mapping = resulting_mapping[['Sequence', 'Leading', 'Major']]
resulting_mapping.reset_index(drop=True, inplace=True)
resulting_mapping.head()

# send this to the client side

Unnamed: 0,Sequence,Leading,Major
0,ATWSGAVLAGR,A1BG,A1BG
1,CEGPIPDVTFELLR,A1BG,A1BG
2,CEGPIPDVTFELLREGETK,A1BG,A1BG
3,CLAPLEGAR,A1BG,A1BG
4,FALVREDR,A1BG,A1BG


In [None]:
for center in centers:
    # write to file
    resulting_mapping.to_csv(f'{path_to_data}/{center}/mapping.tsv', sep='\t', index=False)

In [640]:
peptides_mapping_dict

{'DTEEEDFHVDQATTVK;WERPFEVKDTEEEDFHVDQATTVK': {'A0A024R6N5', 'A0A0G2JRN3'},
 'TDTSHHDQDHPTFNK': {'A0A024R6N5',
  'A0A0B4J278',
  'A0A0G2JRN3',
  'G3V387',
  'G3V544',
  'G3V5R8',
  'P01009'},
 'TLNQPDSQLQLTTGNGLFLSEGLK': {'A0A024R6N5',
  'A0A0G2JRN3',
  'G3V2B9',
  'G3V544',
  'P01009'},
 'FLENEDR;FLENEDRR;GKWERPFEVK;KLSSWVLLMK;LGMFNIQHCK;LQHLENELTHDIITK;LSITGTYDLK;LSSWVLLMK;RLGMFNIQHCK;RSASLHLPK;SASLHLPK;SVLGQLGITK;VFSNGADLSGVTEEAPLK;VFSNGADLSGVTEEAPLKLSK;WERPFEVK': {'A0A024R6N5',
  'A0A0G2JRN3',
  'P01009'},
 'FLEDVKK;KQINDYVEK;LVDKFLEDVK;LVDKFLEDVKK;LYHSEAFTVNFGDTEEAK;LYHSEAFTVNFGDTEEAKK;QINDYVEK': {'A0A024R6N5',
  'A0A0G2JRN3',
  'G3V2B9',
  'P01009'},
 'AVLTIDEK;FNKPFVFLMIEQNTK;GTEAAGAMFLEAIPMSIPPEVK;SPLFMGK': {'A0A024R6N5',
  'P01009'},
 'VVNPTQK': {'A0A024R6N5', 'P01009', 'P20848'},
 'FSGSSSGAER;GDGIPDRFSGSSSGAER': {'A0A075B6H9'},
 'FSGSILGNK;SSGVPDR': {'A0A075B6I0'},
 'ITCSGDALPK': {'A0A075B6K4'},
 'YAYWYQQK': {'A0A075B6K0', 'A0A075B6K4'},
 'DSNRPSGIPER': {'A0A075B6K5'},
 'FSGS

In [643]:
import networkx as nx
from collections import defaultdict


def build_peptide_gene_graph(peptides_mapping_dict):
    """
    Build a graph where peptides and genes are nodes.
    An edge between a peptide and a gene node indicates association.
    """
    G = nx.Graph()
    for peptide, proteins in peptides_mapping_dict.items():
        for protein in proteins:
            G.add_node(peptide, type='peptide')
            G.add_node(protein, type='protein')
            G.add_edge(peptide, protein)
    return G


peptides_mappings = pd.Series(new_for_dict['Proteins'].values, index=new_for_dict['Sequence']).to_dict()
peptides_mapping_dict = {key: set(value.split(';')) for key, value in peptides_mappings.items()}

G = build_peptide_gene_graph(peptides_mapping_dict)
connected_components = nx.connected_components(G)

# Creating a mapping from peptides to proteins
peptide_to_proteins = defaultdict(set)
for peptide, proteins in peptides_mapping_dict.items():
    for protein in proteins:
        peptide_to_proteins[peptide].add(protein)


protein_groups = []

for component in connected_components:
    sub_G = G.subgraph(component).copy()
    proteins_in_group = [node for node in sub_G.nodes if sub_G.nodes[node]['type'] == 'protein']
    peptides_in_group = [node for node in sub_G.nodes if sub_G.nodes[node]['type'] == 'peptide']

    # Dictionary to count peptides per protein
    peptide_counts = {protein: len(list(sub_G.neighbors(protein))) for protein in proteins_in_group}

    # Sorting proteins by their peptide count (descending)
    proteins_sorted_by_peptide_count = sorted(peptide_counts, key=peptide_counts.get, reverse=True)

    # Creating the protein group based on the sorted proteins
    protein_group = {
        'proteins': proteins_sorted_by_peptide_count,
        'peptides': peptides_in_group,
        'razor_protein': proteins_sorted_by_peptide_count[0]
    }

    protein_groups.append(protein_group)



# Identify all peptides and the groups they belong to
peptide_to_groups = defaultdict(set)

for i, group in enumerate(protein_groups):
    for peptide in group['peptides']:
        peptide_to_groups[peptide].add(i)

# Function to determine the group with the majority of unique + razor peptides
def determine_dominant_group(peptide, groups):
    max_peptides = -1
    dominant_group = None
    for group_idx in groups:
        # Count the total number of unique and shared peptides preferring this group
        total_peptides = len(protein_groups[group_idx]['peptides'])
        if total_peptides > max_peptides:
            max_peptides = total_peptides
            dominant_group = group_idx
    return dominant_group

# Adjust peptide assignments based on the razor-unique logic
for peptide, groups in peptide_to_groups.items():
    if len(groups) > 1:  # The peptide is shared
        dominant_group = determine_dominant_group(peptide, groups)
        # Assign the peptide to the dominant group and remove from others
        for group_idx in groups:
            if group_idx != dominant_group:
                protein_groups[group_idx]['peptides'].remove(peptide)

# Optionally, reconstruct the protein group structures if needed
for group in protein_groups:
    group['peptides'] = list(set(group['peptides']))  # Ensure peptides are unique


# Finalize the protein group assignments
final_protein_groups = []

for group in protein_groups:
    # Extracting the final list of peptides for each group
    final_peptides = group['peptides']
    
    # You may also want to identify the final list of unique proteins in each group,
    # especially if peptides were reassigned in the previous step.
    # This can involve identifying which proteins are still connected to the peptides in the group.
    connected_proteins = {protein for peptide in final_peptides for protein in peptide_to_proteins[peptide]}
    
    final_group = {
        'proteins': list(connected_proteins),
        'peptides': final_peptides,
        'razor_protein': group['razor_protein']  # Assuming you still want to keep track of the razor protein
    }
    
    final_protein_groups.append(final_group)



NameError: name 'peptide_to_proteins' is not defined

In [639]:
# Finalize the protein group assignments
final_protein_groups = []

for group in protein_groups:
    # Extracting the final list of peptides for each group
    final_peptides = group['peptides']
    
    # You may also want to identify the final list of unique proteins in each group,
    # especially if peptides were reassigned in the previous step.
    # This can involve identifying which proteins are still connected to the peptides in the group.
    connected_proteins = {protein for peptide in final_peptides for protein in peptide_to_proteins[peptide]}
    
    final_group = {
        'proteins': list(connected_proteins),
        'peptides': final_peptides,
        'razor_protein': group['razor_protein']  # Assuming you still want to keep track of the razor protein
    }
    
    final_protein_groups.append(final_group)


NameError: name 'peptide_to_proteins' is not defined

# Check the results with central approach

In [488]:
# Load the original data
original_data = pd.read_csv('/home/yuliya/repos/cosybio/FedProt/data/TMT_data/raw_MaxQuant_reports/Combined/proteinGroups.txt', sep='\t')

original_data = original_data[['Majority protein IDs']]
print(f'Number of proteins in original data: {len(original_data)}')

# For each value in Protein IDs, split by ;, sort and join by ;
original_data['Proteins'] = original_data['Majority protein IDs'].str.split(';').apply(lambda x: str(';'.join(sorted(x))))
original_PG = set(original_data[['Proteins']].drop_duplicates().values.flatten().tolist())
print(f'Number of proteins in original data after dropping duplicates: {len(original_PG)}')


Number of proteins in original data: 768
Number of proteins in original data after dropping duplicates: 660


In [489]:
collapsed_PG_PG = set(collapsed_PG['Leading'].drop_duplicates().values.flatten().tolist())
# collapsed_PG_PG = set(collapsed_PG['All.proteins'].drop_duplicates().values.flatten().tolist())
print(f'Number of proteins in collapsed data: {len(collapsed_PG_PG)}')

Number of proteins in collapsed data: 613


In [490]:
original_PG - collapsed_PG_PG

{'AHNAK',
 'ARPC4;ARPC4-TTLL3',
 'B3GNT8',
 'B4GAT1',
 'BEGAIN',
 'CCDC168',
 'CD34',
 'CLSTN1',
 'CNTN4',
 'COL3A1',
 'CRELD1',
 'CSF1',
 'CTSA',
 'CTSG',
 'CTSH',
 'CTSK;CTSL;CTSL3P;CTSV',
 'DKFZp781D1416;STAG1;STAG2;STAG3;STAG3L4',
 'DRP2',
 'FUCA1',
 'GPR126',
 'GTF2I',
 'H2AFJ;H2AFX;HIST1H2AA;HIST1H2AB;HIST1H2AC;HIST1H2AD;HIST1H2AG;HIST1H2AH;HIST1H2AJ;HIST2H2AA3;HIST2H2AB;HIST2H2AC;HIST3H2A',
 'HLA-A;HLA-H',
 'HLA-B;HLA-C;HLA-Cw',
 'HLCS',
 'HPSE',
 'HSPA6;HSPA7',
 'HSPB1',
 'HTR7',
 'IGHV3-66;IGHV3-74',
 'IGKV1-27;IGKV1-8;IGKV1-9',
 'IGKV2-40;IGKV2D-28',
 'IGKV3D-15;IGKV3D-7;IGKV3OR2-268',
 'INPP5D',
 'KRT13',
 'KRT6B',
 'KRT71;KRT73;KRT74',
 'LCN1;LCN1P1',
 'LGALSL',
 'LILRA2',
 'LILRA6;LILRB3',
 'LMNA',
 'LSAMP',
 'MARCO',
 'MSTN',
 'MYL6',
 'NIPBL',
 'NOTCH3',
 'OPTN',
 'PDGFRB',
 'PECAM1',
 'PFKL',
 'PFKM;PFKP',
 'PGLYRP1',
 'PIGR',
 'PLEKHG4B',
 'PLXNB2',
 'PNP',
 'POGLUT1',
 'PRKCSH',
 'PRRC2C',
 'PRSS1;PRSS2;PRSS3P2',
 'PSMA5',
 'PXDC1',
 'RAB11A;RAB11B',
 'RAI14',
 'RELN'

In [491]:
collapsed_PG_PG - original_PG

{'ADGRG6;GPR126',
 'ARF1;ARF3',
 'CCDC40',
 'CDC14B;CDC14C',
 'CEP250',
 'CFH',
 'CFHR3',
 'EGFR',
 'H2AFJ;H2AFX;HIST1H2AA;HIST1H2AB;HIST1H2AC;HIST1H2AD;HIST1H2AG;HIST1H2AH;HIST1H2AJ;HIST2H2AA3;HIST2H2AC;HIST3H2A',
 'HLA-B;HLA-C',
 'IGHV3-74',
 'IGKV1-12;IGKV1-6',
 'IGKV2-40;IGKV2D-26;IGKV2D-28;IGKV2D-30',
 'IGKV3-7;IGKV3D-7;IGKV3OR2-268',
 'IGKV3D-15',
 'IGLV3-16',
 'KDM8',
 'KIF5A;KIF5B;KIF5C',
 'KRT12',
 'KRT20',
 'KRT28',
 'MED12',
 'MORC1',
 'MYH10',
 'PF4',
 'PKLR',
 'POTEI;POTEJ',
 'PRSS1;PRSS2;PRSS3P2;TMPRSS13',
 'RAB10;RAB13;RAB15;RAB1A;RAB1B;RAB1C;RAB8A;RAB8B',
 'SERPINB9',
 'SRGAP1',
 'TTN',
 'TUBA1A;TUBA1B;TUBA1C',
 'TUBA3E;TUBA4A',
 'TUBB1;TUBB2A;TUBB2B;TUBB4B;TUBB8',
 'TUBB;TUBB3;TUBB4A;TUBB6',
 'VPS13C',
 'YWHAH'}

In [492]:
len(original_PG - collapsed_PG_PG) / len(original_PG)

0.12878787878787878

In [493]:
len(collapsed_PG_PG - original_PG) / len(collapsed_PG_PG)

0.06199021207177814