## Process protein complex membership

**Input:** Protein complexes from CORUM + CERES gene scores + processed paralog pairs

**Output:** Paralog pairs annotated with protein complex features

CORUM: http://mips.helmholtz-muenchen.de/corum/

Note: For some complexes, more genes are listed in the gene name column than in the Entrez id column (i.e. there are not enough 'None' placeholders). For example, in complex 824 neither SMN1 nor SMN2 is included in the id list, but there is only one 'None' entry.

In [19]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

def get_data_path(folders, fname):
    """Return the normalised path of a file below the third-party data root.

    The root is read from the ``3RD_PARTY_DIR`` environment variable on every call,
    so a ``KeyError`` is raised if that variable is not set.

    folders: list of sub-folder names below the root (may be empty)
    fname: file name
    """
    return os.path.normpath(os.environ['3RD_PARTY_DIR']+'/'+'/'.join(folders) +'/'+ fname)


def get_local_data_path(folders, fname):
    """Return the normalised path of a file below ``../../local_data/``.

    folders: list of sub-folder names below the local data folder (may be empty)
    fname: file name
    """
    return os.path.normpath('../../local_data/' +'/'.join(folders) +'/'+ fname)

# Inputs
# CORUM complex table (read as tab-separated text by load_corum_complexes)
file_CORUM_complexes = get_data_path(['CORUM'], 'allComplexes.txt')
# Gene id map with 'entrez_id' and 'symbol' columns (read in create_gene_complex_id_map)
file_gene_id_map = get_local_data_path(['processed'], 'HGNC_gene_id_map.csv')
# Gene score matrix used to compute per-gene essentiality
file_gene_scores = get_local_data_path(['processed', 'depmap20Q2'], 'gene_scores_16_04_21.csv')
# Paralog pairs that get annotated with the complex features
file_paralog_pairs = get_local_data_path(['processed','ensembl93'], 'unique_pairs.csv')

# Output
# CSV with one row of protein complex features per paralog pair
file_complex_features = get_local_data_path(['processed','paralog_features'], 'protein_complex_features.csv')

### Make map of entrez-id/symbol to complex id

Some sub-units are associated with multiple genes. For example, complex 3298 has the gene name "SMN1; SMN2" for one of its sub-units. In the sub-unit gene list these cases are indicated by a space after the semicolon, whereas distinct sub-units are separated by a semicolon alone.

In [2]:
def load_corum_complexes(fname):
    """Read the CORUM complex table and return the human complexes.

    fname: path to the tab-separated CORUM file.

    Returns a DataFrame with the columns ComplexID, ComplexName, subunits_entrez_id and
    subunits_symbol. In subunits_symbol, ';' separates sub-units and '|' separates
    several gene symbols that belong to the same sub-unit.
    Prints the number of complexes and displays two preview tables.
    """
    corum = pd.read_csv(fname, sep='\t')

    # Keep the human complexes and only the id / name / sub-unit columns
    wanted_cols = ['ComplexID', 'ComplexName', 'subunits(Entrez IDs)', 'subunits(Gene name)']
    is_human = corum['Organism'] == 'Human'
    human_complexes = corum[is_human].reset_index(drop=True)[wanted_cols]
    human_complexes = human_complexes.rename(columns={'subunits(Entrez IDs)': 'subunits_entrez_id',
                                                      'subunits(Gene name)': 'subunits_symbol'})

    # Collapse doubled separators in the symbol list
    human_complexes['subunits_symbol'] = human_complexes['subunits_symbol'].map(lambda s: s.replace(';;', ';'))
    print('Num complexes:', human_complexes.ComplexID.nunique())
    display(human_complexes[:1])

    # '; ' marks several gene names for one entrez entry: give those their own separator
    piped_symbols = human_complexes['subunits_symbol'].map(lambda s: s.replace('; ', '|'))
    complexes = human_complexes.assign(subunits_symbol=piped_symbols)
    display(complexes[complexes.ComplexID == 3298])

    return complexes

In [3]:
def create_gene_complex_id_map(complexes, gene_id_map=None):
    """Build a long (complex_id, entrez_id) membership table from the CORUM complexes.

    complexes: DataFrame with the columns ComplexID, subunits_entrez_id (';'-separated ids,
        'None' where CORUM has no id) and subunits_symbol (';'-separated sub-units, with
        '|' between several symbols of one sub-unit).
    gene_id_map: optional DataFrame with 'entrez_id' and 'symbol' columns. When omitted it
        is read from the notebook-level path `file_gene_id_map`.

    Returns a DataFrame with the integer columns complex_id and entrez_id, sorted by
    complex_id, with one row per (complex, gene).

    Raises AssertionError if the id list and the symbol list of any complex do not line up
    position by position, or if filling in 'None' ids changes the number of rows.
    """
    by_complex = complexes.set_index('ComplexID')

    # One row per (complex, sub-unit) for the entrez ids and for the symbols
    df_entrez = by_complex.subunits_entrez_id.str.split(';').explode().rename('entrez_id').reset_index()
    df_symbol = by_complex.subunits_symbol.str.split(';').explode().rename('symbol').reset_index()

    # The two tables are joined by position below, so they must agree row by row on the
    # complex id. Comparing only the total row counts would miss per-complex mismatches
    # that happen to cancel out.
    aligned = (df_entrez.shape[0] == df_symbol.shape[0] and
               bool((df_entrez.ComplexID.values == df_symbol.ComplexID.values).all()))
    if not aligned:
        raise AssertionError('Entrez id and symbol sub-unit lists do not line up for every complex')
    subunits_map = pd.concat([df_entrez, df_symbol[['symbol']]], axis=1)
    display(subunits_map[:1])

    # Split out symbols with | (one row per symbol)
    subunits_map = subunits_map.assign(symbol=subunits_map.symbol.str.split('|'))\
                               .explode('symbol').reset_index(drop=True)

    # Entrez ids are compared as strings because the CORUM ids come from a split string
    if gene_id_map is None:
        gene_id_map = pd.read_csv(file_gene_id_map)
    gene_id_map = gene_id_map.dropna(subset=['entrez_id'])[['entrez_id','symbol']]
    gene_id_map = gene_id_map.astype({'entrez_id':'int'}).astype({'entrez_id':'str'})

    # Report ids missing from the gene id map; the 'None' placeholder is not a real id
    known_id = subunits_map.entrez_id.isin(gene_id_map.entrez_id)
    is_placeholder = subunits_map.entrez_id == 'None'
    print('N entrez ids not in HGNC:', subunits_map[~known_id & ~is_placeholder].entrez_id.nunique())

    # Keep rows whose id or symbol is known - rows where neither is known are dropped.
    # NOTE(review): a row with an unknown id but a known symbol keeps its CORUM id.
    subunits_map = subunits_map[known_id | subunits_map.symbol.isin(gene_id_map.symbol)].reset_index(drop=True)

    # Fill in the entrez ids where ids were recorded as None in CORUM
    missing_id = subunits_map.entrez_id == 'None'
    filled = pd.merge(subunits_map[missing_id].drop(columns=['entrez_id']), gene_id_map, on='symbol')
    if filled.shape[0] != missing_id.sum():
        raise AssertionError("Filling in 'None' entrez ids by symbol changed the number of rows")
    complex_map = pd.concat([subunits_map[~missing_id], filled]).reset_index(drop=True)

    # Clean up: a sub-unit listed with several symbols for one id would otherwise be
    # counted once per symbol, so duplicate (complex, gene) rows are removed
    complex_map = complex_map.rename(columns={'ComplexID':'complex_id'}).drop(columns=['symbol'])
    complex_map = complex_map.astype({'entrez_id':'int'}).drop_duplicates().sort_values('complex_id')
    print('Num genes:', complex_map.entrez_id.nunique())
    print('Num complexes:', complex_map.complex_id.nunique())
    return complex_map

In [4]:
# Load the human CORUM complexes (the call prints the complex count and displays two preview tables)
all_complexes = load_corum_complexes(file_CORUM_complexes)

Num complexes: 2916


Unnamed: 0,ComplexID,ComplexName,subunits_entrez_id,subunits_symbol
0,1,BCL6-HDAC4 complex,604;9759,BCL6;HDAC4


Unnamed: 0,ComplexID,ComplexName,subunits_entrez_id,subunits_symbol
1481,3298,"SMN complex (GEMIN2,5, SMN)",8487;None;25929,GEMIN2;SMN1|SMN2;GEMIN5


In [5]:
# Long table of (complex_id, entrez_id) memberships built from the sub-unit lists
complex_map = create_gene_complex_id_map(all_complexes)
# Preview the first two rows
complex_map[:2]

Unnamed: 0,ComplexID,entrez_id,symbol
0,1,604,BCL6


N entrez ids not in HGNC: 9
Num genes: 3649
Num complexes: 2916


Unnamed: 0,complex_id,entrez_id
0,1,604
1,1,9759


### Calculate complex membership for paralog pairs

In [6]:
def compute_complex_membership(pairs, complex_map):
    # Set of complexes that each gene is a member in
    complexes_per_gene = complex_map.groupby('entrez_id').agg({'complex_id': set}).reset_index()
    display(complexes_per_gene[:2])

    # Merge with each gene in all pairs 
    df = pd.merge(pairs[['A1_entrez','A2_entrez']], 
                  complexes_per_gene.rename(columns={'entrez_id':'A1_entrez','complex_id':'A1_complex_ids'}), how='left')
    df = pd.merge(df, complexes_per_gene.rename(columns={'entrez_id':'A2_entrez','complex_id':'A2_complex_ids'}), how='left')

    # Fill NaNs (gene not in a complex) with empty sets
    df['A1_complex_ids'] = df['A1_complex_ids'].apply(lambda d: d if not pd.isnull(d) else set())
    df['A2_complex_ids'] = df['A2_complex_ids'].apply(lambda d: d if not pd.isnull(d) else set())

    # Calculate complex intersection and union
    complex_membership = df.assign(
        complex_intersection=df.apply(lambda x: x.A1_complex_ids.intersection(x.A2_complex_ids), axis=1),
        complex_union=df.apply(lambda x: x.A1_complex_ids.union(x.A2_complex_ids), axis=1))
    
    # Calculate features
    complex_membership['either_in_complex'] = complex_membership.complex_union.apply(lambda x: len(x) > 0)
    complex_membership['in_same_complex'] = complex_membership.complex_intersection.apply(lambda x: len(x) > 0)
    
    return complex_membership

In [7]:
# Load the paralog pairs (first CSV column is the index) and keep the symbol and entrez id columns
paralog_pairs = pd.read_csv(file_paralog_pairs, index_col=0)[['A1','A2','A1_entrez','A2_entrez']]
print('Num paralog pairs:', paralog_pairs.shape[0])
# Preview the first row
paralog_pairs[:1]

Num paralog pairs: 36648


Unnamed: 0,A1,A2,A1_entrez,A2_entrez
0,A1BG,OSCAR,1,126014


In [8]:
# Complex sets, intersection/union and the two boolean membership features for every paralog pair
complex_membership = compute_complex_membership(paralog_pairs, complex_map)

Unnamed: 0,entrez_id,complex_id
0,2,{2710}
1,12,{5389}


In [9]:
n_either_in_complex = sum(complex_membership.either_in_complex)
print('Either in complex:', n_either_in_complex)

# Cross-check the feature against a direct lookup of both genes in the complex map
a1_in_complex = paralog_pairs.A1_entrez.isin(complex_map.entrez_id)
a2_in_complex = paralog_pairs.A2_entrez.isin(complex_map.entrez_id)
assert(n_either_in_complex == paralog_pairs[a1_in_complex | a2_in_complex].shape[0])

print('In same complex:', sum(complex_membership.in_same_complex))
# Preview two pairs whose genes share at least one complex
complex_membership[complex_membership.in_same_complex][:2]

Either in complex: 8060
In same complex: 402


Unnamed: 0,A1_entrez,A2_entrez,A1_complex_ids,A2_complex_ids,complex_intersection,complex_union,either_in_complex,in_same_complex
15,10236,10492,"{3082, 1181, 1223}","{5385, 1307, 1181, 6838}",{1181},"{6838, 1223, 5385, 3082, 1307, 1181}",True,True
205,64240,64241,{7263},{7263},{7263},{7263},True,True


### Get essentiality for each complex sub-unit

#### Load gene scores

In [10]:
# Load gene scores
gene_scores_raw = pd.read_csv(file_gene_scores, index_col=0)
print('Gene scores:', gene_scores_raw.shape)
gene_scores_raw[:1]

Gene scores: (769, 16438)


Unnamed: 0,1,29974,2,144568,127550,53947,51146,8086,65985,13,...,221302,9183,55055,11130,79364,440590,79699,7791,23140,26009
ACH-000004,0.153,0.0372,-0.2442,-0.0256,-0.0196,-0.208,0.3096,-0.4438,0.2257,0.1447,...,-0.24,-0.1982,-0.132,-0.4609,0.1545,0.17,-0.4775,0.2669,0.1061,-0.2168


In [11]:
# Compute % of cell lines in which gene is essential + the avg. CERES score for each gene
gene_scores = pd.merge(gene_scores_raw.apply(lambda x: (x < -0.6).sum() / gene_scores_raw.shape[0]).reset_index(),
                       gene_scores_raw.mean().reset_index(), on=['index'])
gene_scores = gene_scores.rename(columns={'index':'entrez_id', '0_x':'essential_percent', '0_y':'avg_ceres_score'})
gene_scores = gene_scores.astype({'entrez_id':'int'})
print('Mean gene essentiality: %.2f%%' % (gene_scores.essential_percent.mean()*100))
display(gene_scores[:1])

Mean gene essentiality: 10.60%


Unnamed: 0,entrez_id,essential_percent,avg_ceres_score
0,1,0.0,0.092581


In [13]:
# Merge essentiality for each subunit gene
subunit_essentiality = pd.merge(complex_map, gene_scores, how='left').rename(columns={'entrez_id':'subunit_id'})
print('N subunits w/out score:', subunit_essentiality[subunit_essentiality.essential_percent.isna()].subunit_id.nunique())
display(subunit_essentiality[:2])

N subunits w/out score: 254


Unnamed: 0,complex_id,subunit_id,essential_percent,avg_ceres_score
0,1,604,0.022107,-0.076935
1,1,9759,0.055917,-0.278913


### Calculate essentiality of complexes in which pairs are members

In [14]:
def compute_essentiality_of_complex_members(complex_members, subunit_ess):
    """Average the essentiality of the other sub-units of every complex a pair belongs to.

    complex_members: DataFrame with the columns A1_entrez, A2_entrez and complex_union
        (set of complex ids that contain A1 or A2). Rows with an empty set contribute nothing.
    subunit_ess: DataFrame with the columns complex_id, subunit_id, essential_percent and
        avg_ceres_score (one row per complex sub-unit).

    Returns (complex_ess, avg_complex_ess):
      complex_ess: one row per (A1_entrez, A2_entrez, complex_id) with mean_essentiality and
        mean_ceres_score, computed over the sub-units other than A1 and A2.
      avg_complex_ess: one row per (A1_entrez, A2_entrez) with the mean of those per-complex
        values (mean_complex_essentiality, mean_complex_ceres_score) and flag_missing_scores,
        which is True when no CERES score was available.

    Raises AssertionError if a pair is lost when joining with the sub-unit scores.
    """
    pair_cols = ['A1_entrez', 'A2_entrez']

    # Stack complex membership (union of complexes) for paralog pairs
    # This results in one row for each (A1, A2, complex_id) tuple
    stacked = complex_members[pair_cols].assign(complex_id=complex_members['complex_union'].map(list))\
                                        .explode('complex_id').dropna(subset=['complex_id'])\
                                        .astype({'complex_id':'int'}).reset_index(drop=True)
    display(stacked[:2])
    n_pairs_in = stacked.drop_duplicates(subset=pair_cols).shape[0]

    # Merge complex_id with subunit scores (taken from the subunit_ess argument)
    # This results in df expanding to multiple rows per (A1, A2, complex_id) tuple, 1 per complex subunit
    per_subunit = pd.merge(stacked, subunit_ess, on='complex_id')
    if per_subunit.drop_duplicates(subset=pair_cols).shape[0] != n_pairs_in:
        raise AssertionError('Some pairs have no sub-unit scores for any of their complexes')
    display(per_subunit[:2])

    # Filter out sub-units that are either A1 or A2 before computing essentiality of complex
    is_other_subunit = (per_subunit.A1_entrez != per_subunit.subunit_id) & \
                       (per_subunit.A2_entrez != per_subunit.subunit_id)
    per_subunit = per_subunit[is_other_subunit].reset_index(drop=True)

    # Compute essentiality scores for each (A1, A2, complex) tuple
    complex_ess = per_subunit.groupby(pair_cols + ['complex_id'])\
                             .agg({'essential_percent':'mean', 'avg_ceres_score':'mean'})\
                             .rename(columns={'essential_percent':'mean_essentiality',
                                              'avg_ceres_score':'mean_ceres_score'})\
                             .reset_index()

    print('Mean complex essentiality:', complex_ess.mean_essentiality.mean())
    display(complex_ess[:2])

    # Compute essentiality scores for each (A1, A2) pair
    # This is the avg of the complex essentiality for all the A1/A2 complexes
    avg_complex_ess = complex_ess.groupby(pair_cols)\
                                 .agg({'mean_essentiality':'mean', 'mean_ceres_score':'mean'})\
                                 .rename(columns={'mean_essentiality':'mean_complex_essentiality',
                                                  'mean_ceres_score':'mean_complex_ceres_score'})\
                                 .reset_index()
    avg_complex_ess['flag_missing_scores'] = avg_complex_ess.mean_complex_ceres_score.isna()

    return complex_ess, avg_complex_ess

In [15]:
# Only pairs with at least one gene in a complex have complexes to score
complex_members = complex_membership[complex_membership.either_in_complex].reset_index(drop=True)
complex_ess, avg_complex_ess = compute_essentiality_of_complex_members(complex_members, subunit_essentiality)
# Preview the per-pair averages
avg_complex_ess[:2]

Unnamed: 0,A1_entrez,A2_entrez,complex_id
0,29974,10236,3082
1,29974,10236,1181


Unnamed: 0,A1_entrez,A2_entrez,complex_id,subunit_id,essential_percent,avg_ceres_score
0,29974,10236,3082,3609,0.988296,-1.139846
1,29974,10236,3082,10236,0.323797,-0.526903


Mean complex essentiality: 0.21189717380900638


Unnamed: 0,A1_entrez,A2_entrez,complex_id,mean_essentiality,mean_ceres_score
0,2,718,2710,0.009753,-0.130928
1,2,718,6107,0.0013,0.003951


Unnamed: 0,A1_entrez,A2_entrez,mean_complex_essentiality,mean_complex_ceres_score,flag_missing_scores
0,2,718,0.004551,-0.052164,False
1,2,720,0.009753,-0.130928,False


In [16]:
# Pairs flagged because their averaged CERES score is NaN
print('N missing scores:', avg_complex_ess[avg_complex_ess.flag_missing_scores].shape[0])
print('Mean complex membership essentiality:', avg_complex_ess.mean_complex_essentiality.mean())
avg_complex_ess[:2]

N missing scores: 60
Mean complex membership essentiality: 0.1986149177303116


Unnamed: 0,A1_entrez,A2_entrez,mean_complex_essentiality,mean_complex_ceres_score,flag_missing_scores
0,2,718,0.004551,-0.052164,False
1,2,720,0.009753,-0.130928,False


### Export protein complex features for paralog pairs

In [17]:
# Merge with all pairs (NaNs for pairs not in any complexes)
complex_features = pd.merge(complex_membership[['A1_entrez', 'A2_entrez', 'either_in_complex', 'in_same_complex']], 
                            avg_complex_ess, how='left')
complex_features = complex_features.fillna({'flag_missing_scores':False})

print('Either in complex:', sum(complex_features.either_in_complex),'/',complex_features.shape[0])
print('Same complex:', sum(complex_features.in_same_complex))
print('Avg complex essentiality: %.2f %%' % (complex_features.mean_complex_essentiality.mean()*100))
print('Complex members without scores b/c of missing gene scores (CERES):', 
       complex_features[complex_features.flag_missing_scores].shape[0])

assert(complex_features.shape[0] == paralog_pairs.shape[0])
complex_features[:1]

Either in complex: 8060 / 36648
Same complex: 402
Avg complex essentiality: 19.86 %
Complex members without scores b/c of missing gene scores (CERES): 60


Unnamed: 0,A1_entrez,A2_entrez,either_in_complex,in_same_complex,mean_complex_essentiality,mean_complex_ceres_score,flag_missing_scores
0,1,126014,False,False,,,False


In [20]:
# Write the features without the row index (index takes a boolean, so pass False rather than 0)
complex_features.to_csv(file_complex_features, index=False)

### Complex membership and essentiality for *ASF1A/ASF1B*

In [21]:
# Select the ASF1A/ASF1B pair once and reuse it for both tables
asf1_pair = paralog_pairs[(paralog_pairs.A1=='ASF1A') & (paralog_pairs.A2=='ASF1B')]
display(pd.merge(asf1_pair, complex_membership))

# Per-complex essentiality for the pair, plus the number of mapped sub-units and the complex name
asf1_subunit_counts = complex_map.groupby('complex_id').count().rename(columns={'entrez_id':'n_subunits'}).reset_index()
asf1_complex_names = all_complexes[['ComplexID','ComplexName']].rename(columns={'ComplexID':'complex_id'})
df = pd.merge(pd.merge(asf1_pair, complex_ess), asf1_subunit_counts)
pd.merge(df, asf1_complex_names)

Unnamed: 0,A1,A2,A1_entrez,A2_entrez,A1_complex_ids,A2_complex_ids,complex_intersection,complex_union,either_in_complex,in_same_complex
0,ASF1A,ASF1B,25842,55723,"{6149, 2235, 2236, 1149, 1150}","{2235, 2236, 1149, 1150}","{2235, 2236, 1149, 1150}","{6149, 2235, 2236, 1149, 1150}",True,True


Unnamed: 0,A1,A2,A1_entrez,A2_entrez,complex_id,mean_essentiality,mean_ceres_score,n_subunits,ComplexName
0,ASF1A,ASF1B,25842,55723,1149,0.553533,-0.810183,8,Histone H3.1 complex
1,ASF1A,ASF1B,25842,55723,1150,0.439272,-0.530142,7,Histone H3.3 complex
2,ASF1A,ASF1B,25842,55723,2235,0.957521,-1.361864,5,ASF1-interacting protein complex
3,ASF1A,ASF1B,25842,55723,2236,0.355657,-0.641063,6,ASF1-histone containing complex
4,ASF1A,ASF1B,25842,55723,6149,0.425228,-0.631792,3,Codanin-1-Asf1\u2013histone H3.1-histone H4\u2...


In [22]:
# Select the COPS7A/COPS7B pair once and reuse it for both tables
cops7_pair = paralog_pairs[(paralog_pairs.A1=='COPS7A') & (paralog_pairs.A2=='COPS7B')]
display(pd.merge(cops7_pair, complex_membership))

# Per-complex essentiality for the pair, plus the number of mapped sub-units and the complex name
cops7_subunit_counts = complex_map.groupby('complex_id').count().rename(columns={'entrez_id':'n_subunits'}).reset_index()
cops7_complex_names = all_complexes[['ComplexID','ComplexName']].rename(columns={'ComplexID':'complex_id'})
df = pd.merge(pd.merge(cops7_pair, complex_ess), cops7_subunit_counts)
pd.merge(df, cops7_complex_names)

Unnamed: 0,A1,A2,A1_entrez,A2_entrez,A1_complex_ids,A2_complex_ids,complex_intersection,complex_union,either_in_complex,in_same_complex
0,COPS7A,COPS7B,50813,64708,"{2179, 726, 727, 728, 2174}",{},{},"{2179, 726, 727, 728, 2174}",True,False


Unnamed: 0,A1,A2,A1_entrez,A2_entrez,complex_id,mean_essentiality,mean_ceres_score,n_subunits,ComplexName
0,COPS7A,COPS7B,50813,64708,726,0.785672,-1.002744,12,DDB2 complex
1,COPS7A,COPS7B,50813,64708,727,0.78319,-0.985889,12,CSA complex
2,COPS7A,COPS7B,50813,64708,728,0.78319,-0.985889,13,CSA-POLIIa complex
3,COPS7A,COPS7B,50813,64708,2174,0.964889,-1.18437,8,COP9 signalosome complex
4,COPS7A,COPS7B,50813,64708,2179,0.845254,-1.012608,9,CNS-P53 complex
