## Process protein-protein interactions

**Input**: PPI pairs from BIOGRID + CERES gene scores + processed paralog pairs

**Output**: Paralog pairs annotated with PPI features

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns

def get_data_path(folders, fname):
    """Return the normalised path to `fname` inside the third-party data directory.

    The base directory is read from the `3RD_PARTY_DIR` environment variable (a KeyError is
    raised if it is not set); `folders` is a list of sub-folder names below that directory.
    """
    base_dir = os.environ['3RD_PARTY_DIR']
    return os.path.normpath('/'.join([base_dir, '/'.join(folders), fname]))


def get_local_data_path(folders, fname):
    """Return the normalised path to `fname` inside `../../local_data/<folders...>`."""
    sub_dir = '/'.join(folders)
    return os.path.normpath('../../local_data/' + sub_dir + '/' + fname)

# Inputs
file_biogrid_ppi = get_data_path(
    folders=['biogrid'],
    fname='BIOGRID-ORGANISM-Homo_sapiens-3.5.187.tab3.txt',
)
file_paralog_pairs = get_local_data_path(
    folders=['processed', 'ensembl93'],
    fname='unique_pairs.csv',
)
file_gene_scores = get_local_data_path(
    folders=['processed', 'depmap20Q2'],
    fname='gene_scores_26_05_20.csv',
)

# Output
file_biogrid_features = get_local_data_path(
    folders=['processed', 'paralog_features'],
    fname='biogrid_ppi_features.csv',
)

### Protein-Protein Interactions (PPI) from BioGRID

An interaction between A1 and A2 is called if it is reported by any physical experimental system (detection method).

Multiple BioGRID entries can refer to the same interaction when it was reported with a different Experimental System and/or a different Throughput.

In [2]:
# Read the tab-separated BioGRID interaction table and preview its first row
biogrid_raw = pd.read_csv(
    file_biogrid_ppi,
    sep='\t',
    low_memory=False,
)
biogrid_raw.head(1)

Unnamed: 0,#BioGRID Interaction ID,Entrez Gene Interactor A,Entrez Gene Interactor B,BioGRID ID Interactor A,BioGRID ID Interactor B,Systematic Name Interactor A,Systematic Name Interactor B,Official Symbol Interactor A,Official Symbol Interactor B,Synonyms Interactor A,...,REFSEQ Accessions Interactor A,SWISS-PROT Accessions Interactor B,TREMBL Accessions Interactor B,REFSEQ Accessions Interactor B,Ontology Term IDs,Ontology Term Names,Ontology Term Categories,Ontology Term Qualifier IDs,Ontology Term Qualifier Names,Ontology Term Types
0,103,6416,2318,112315,108607,-,-,MAP2K4,FLNC,JNKK|JNKK1|MAPKK4|MEK4|MKK4|PRKMK4|SAPKK-1|SAP...,...,NP_003001|NP_001268364,Q14315,Q59H94,NP_001120959|NP_001449,-,-,-,-,-,-


In [5]:
# Filter interactions down to physical interactions between two human proteins
hsapien = 9606  # organism ID used for human in the BioGRID organism columns
biogrid_ppi = biogrid_raw[(biogrid_raw['Organism Interactor A']==hsapien) & (biogrid_raw['Organism Interactor B']==hsapien) &
                          (biogrid_raw['Experimental System Type']=='physical')]

# Clean up: keep one row per (A1, A2, experimental system) combination
biogrid_ppi = biogrid_ppi.rename(columns={'Entrez Gene Interactor A':'A1_entrez', 'Entrez Gene Interactor B':'A2_entrez',
                                          'Experimental System':'experimental_system'})
biogrid_ppi = biogrid_ppi[['A1_entrez', 'A2_entrez', 'experimental_system']].drop_duplicates().reset_index(drop=True)

# Filter out self interactions
biogrid_ppi = biogrid_ppi[biogrid_ppi.A1_entrez != biogrid_ppi.A2_entrez]
# NOTE: this count is per (A1, A2, experimental system) row, not per unique gene pair
print('N physical interactions:', biogrid_ppi.shape[0])
display(biogrid_ppi.experimental_system.value_counts())

# Sort the two genes within each row so that A-B and B-A become the same row, then drop the
# repeats (same pair reported by several experimental systems / in both orientations) so the
# table really holds unique interaction pairs
biogrid_ppi = biogrid_ppi.astype({'A1_entrez':'int', 'A2_entrez':'int'})
biogrid_ppi = pd.DataFrame(np.sort(biogrid_ppi[['A1_entrez', 'A2_entrez']].values, axis=1),
                           columns=['A1_entrez', 'A2_entrez'])
biogrid_ppi = biogrid_ppi.drop_duplicates().reset_index(drop=True)
print('N unique interacting pairs:', biogrid_ppi.shape[0])

print('N genes in interaction map:', pd.concat([biogrid_ppi.A1_entrez, biogrid_ppi.A2_entrez]).nunique())
display(biogrid_ppi[:2])

N physical interactions: 499929


Affinity Capture-MS              219685
Two-hybrid                        97984
Affinity Capture-Western          48943
Co-fractionation                  43010
Reconstituted Complex             26801
Proximity Label-MS                22391
Affinity Capture-RNA              18549
Biochemical Activity               8591
Co-localization                    3410
Affinity Capture-Luminescence      1969
FRET                               1917
Protein-peptide                    1806
Co-purification                    1528
Co-crystal Structure               1390
PCA                                 715
Far Western                         696
Protein-RNA                         544
Name: experimental_system, dtype: int64

N genes in interaction map: 18501


Unnamed: 0,A1_entrez,A2_entrez
0,2318,6416
1,88,84665


### Calculate shared and total protein-protein interactions for paralog pairs

In [7]:
def compute_ppi_summary_for_pairs(ppi, paralog_pairs):
    """Annotate paralog pairs with direct-interaction and shared-interactor features.

    Parameters
    ----------
    ppi : pandas.DataFrame
        Interaction table with columns `A1_entrez` and `A2_entrez`. Rows may be given in
        either orientation and may be repeated: the table is mirrored and de-duplicated
        here before use.
    paralog_pairs : pandas.DataFrame
        Table with (at least) the columns `A1_entrez` and `A2_entrez`.

    Returns
    -------
    pandas.DataFrame
        The `A1_entrez` / `A2_entrez` columns of `paralog_pairs` plus:

        - `interact`: True if the pair itself is present in `ppi`
        - `shared_ppi`: set of interactors common to both genes (NaN if either gene has
          no interactors in `ppi`)
        - `n_total_ppi`, `n_shared_ppi`: size of the union / intersection of the two
          interactor sets (each gene is removed from its partner's set first)
        - `shared_ppi_jaccard_idx`: n_shared_ppi / n_total_ppi (0 when nothing is shared)
        - `fet_ppi_overlap`: -log10(p) of a Fisher's exact test on the overlap, negated
          when the odds ratio is below 1

        The four numeric features are 0 for pairs where either gene has no interactors.

    Side effects: prints the number of directly interacting pairs and displays the first
    row of the per-gene interactor table.
    """
    pair_cols = ['A1_entrez', 'A2_entrez']

    # Make ppi df symmetrical (each interaction present as A-B and B-A, without repeats)
    ppi_flipped = ppi.rename(columns={'A1_entrez': 'A2_entrez', 'A2_entrez': 'A1_entrez'})
    ppi_symmetric = pd.concat([ppi, ppi_flipped]).reset_index(drop=True).drop_duplicates()

    # Merge with paralog pairs; the merge indicator tells which pairs are in the ppi table
    paralog_ppi = pd.merge(paralog_pairs[pair_cols], ppi_symmetric, how='left', indicator='interact')
    paralog_ppi['interact'] = paralog_ppi['interact'] == 'both'
    print('N. pairs that interact:', int(paralog_ppi['interact'].sum()))

    # Use symmetric version of ppi table to get all interactors for each gene
    ppi_per_gene = (ppi_symmetric.groupby('A1_entrez').agg({'A2_entrez': set}).reset_index()
                                 .rename(columns={'A1_entrez': 'gene', 'A2_entrez': 'ppi'}))
    display(ppi_per_gene[:1])

    # Inner merges: df holds only the pairs where BOTH genes have 1+ interactors.
    # Note: pairs can have shared interactors even if there is no evidence they interact themselves
    # NOTE(review): pairs where only one gene has interactors are dropped here and therefore end
    # up with n_total_ppi = 0 after the fillna below - confirm this is intended.
    df = pd.merge(paralog_ppi, ppi_per_gene.rename(columns={'gene': 'A1_entrez', 'ppi': 'A1_ppi'}))
    df = pd.merge(df, ppi_per_gene.rename(columns={'gene': 'A2_entrez', 'ppi': 'A2_ppi'}))

    # Remove A2 gene in the set of interactors for A1 gene (and v.v.)
    # Don't want to include these in union for other calculations
    # (plain comprehensions instead of DataFrame.apply(axis=1): faster, and they also work
    # when df is empty, where apply returns a DataFrame rather than a Series)
    a1_sets = [interactors - {partner} for interactors, partner in zip(df['A1_ppi'], df['A2_entrez'])]
    a2_sets = [interactors - {partner} for interactors, partner in zip(df['A2_ppi'], df['A1_entrez'])]

    # Calculate total num interactors + shared interactors
    shared_sets = [first & second for first, second in zip(a1_sets, a2_sets)]
    n_a1 = [len(s) for s in a1_sets]
    n_a2 = [len(s) for s in a2_sets]
    n_total = [len(first | second) for first, second in zip(a1_sets, a2_sets)]
    n_shared = [len(s) for s in shared_sets]

    # Calculate jaccard index for shared interactors (|intersection| / |union|)
    jaccard = [0.0 if n_both == 0 else n_both / n_union for n_both, n_union in zip(n_shared, n_total)]

    # Calculate FET for overlap, N = all genes involved in interactions
    N = len(pd.concat([ppi.A1_entrez, ppi.A2_entrez]).unique())
    assert ppi_per_gene.shape[0] == N, 'per-gene interactor table does not cover all genes in ppi'

    #       A2   NA2
    # A1  |    |    |
    # NA1 |    |    |
    def signed_log10_fet(n_first, n_second, n_both):
        table = [[n_both, n_first - n_both],
                 [n_second - n_both, N - (n_first + n_second - n_both)]]
        (odds_ratio, pval) = stats.fisher_exact(table)
        if pval == 1:
            return 0.0
        if pval == 0: # Use smallest float64 number, to apply log10
            pval = np.nextafter(0, 1)
        log_pval = -np.log10(pval)
        # Sign encodes direction: negative = less overlap than expected (odds ratio < 1)
        return -log_pval if odds_ratio < 1 else log_pval

    # The test only depends on the three counts, so identical count triples are computed once
    fet_by_counts = {}
    fet = []
    for counts in zip(n_a1, n_a2, n_shared):
        if counts not in fet_by_counts:
            fet_by_counts[counts] = signed_log10_fet(*counts)
        fet.append(fet_by_counts[counts])

    def as_column(values, dtype):
        return pd.Series(values, index=df.index, dtype=dtype)

    df = df.assign(A1_ppi=as_column(a1_sets, object), A2_ppi=as_column(a2_sets, object))
    df = df.assign(n_A1_ppi=as_column(n_a1, 'int64'), n_A2_ppi=as_column(n_a2, 'int64'))
    df = df.assign(shared_ppi=as_column(shared_sets, object))
    df = df.assign(n_total_ppi=as_column(n_total, 'int64'))
    df = df.assign(n_shared_ppi=as_column(n_shared, 'int64'))
    df = df.assign(shared_ppi_jaccard_idx=as_column(jaccard, 'float64'))
    df = df.assign(fet_ppi_overlap=as_column(fet, 'float64'))

    # Bring back the pairs that were dropped by the inner merges; their numeric features are 0
    ret_df = pd.merge(paralog_ppi, df, how='left')
    ret_df = ret_df.fillna({'n_shared_ppi': 0, 'n_total_ppi': 0, 'shared_ppi_jaccard_idx': 0, 'fet_ppi_overlap': 0})
    ret_df = ret_df.drop(columns=['A1_ppi', 'A2_ppi', 'n_A1_ppi', 'n_A2_ppi'])

    return ret_df

In [6]:
# Load the paralog pairs; the first CSV column is used as the row index
paralog_pairs = pd.read_csv(
    file_paralog_pairs,
    index_col=0,
)
print('Num pairs:', len(paralog_pairs))

Num pairs: 36648


In [8]:
# Annotate every paralog pair with its BioGRID interaction features and preview the first row
biogrid_ppi_summary = compute_ppi_summary_for_pairs(
    ppi=biogrid_ppi,
    paralog_pairs=paralog_pairs,
)
biogrid_ppi_summary.head(1)

N. pairs that interact: 2853


Unnamed: 0,gene,ppi
0,1,"{284161, 1026, 11010, 63891, 23198, 7083, 1054..."


Unnamed: 0,A1_entrez,A2_entrez,interact,shared_ppi,n_total_ppi,n_shared_ppi,shared_ppi_jaccard_idx,fet_ppi_overlap
0,1,126014,False,,0.0,0.0,0.0,0.0


### Compute essentiality of shared interactors

In [9]:
# Load gene scores
gene_scores_raw = pd.read_csv(file_gene_scores, index_col=0)
print('Gene scores:', gene_scores_raw.shape)
gene_scores_raw[:1]

Gene scores: (769, 16445)


Unnamed: 0,1,29974,2,144568,127550,53947,51146,8086,65985,13,...,221302,9183,55055,11130,79364,440590,79699,7791,23140,26009
ACH-001382,0.652347,-0.020793,-0.270112,-0.211408,0.06271,-0.081674,0.37684,-0.767385,0.166991,0.112045,...,0.182055,-0.724872,0.499652,-0.141164,-0.336501,0.250665,-0.676302,-0.098001,-0.677497,-0.506629


In [10]:
# Per gene (column of gene_scores_raw): the fraction of rows with a score below -0.6
# ("essential_percent") and the mean score over all rows ("avg_ceres_score")
gene_scores = (
    pd.concat(
        [
            ((gene_scores_raw < -0.6).sum() / gene_scores_raw.shape[0]).rename('essential_percent'),
            gene_scores_raw.mean().rename('avg_ceres_score'),
        ],
        axis=1,
    )
    .rename_axis('entrez_id')
    .reset_index()
    .astype({'entrez_id': 'int'})
)
print('Mean essential %%: %.2f, mean CERES: %.2f' % (gene_scores['essential_percent'].mean() * 100,
                                                     gene_scores['avg_ceres_score'].mean()))
display(gene_scores.head(1))

Mean essential %: 10.62, mean CERES: -0.16


Unnamed: 0,entrez_id,essential_percent,avg_ceres_score
0,1,0.0,0.092681


In [12]:
def compute_shared_ppi_essentiality(shared_ppi_summary, interactor_scores=None):
    """Summarise the essentiality of the interactors shared by each gene pair.

    Parameters
    ----------
    shared_ppi_summary : pandas.DataFrame
        Table with columns `A1_entrez`, `A2_entrez`, `n_shared_ppi` and `shared_ppi`
        (a set of interactor ids per pair). Only rows with `n_shared_ppi > 0` are used.
    interactor_scores : pandas.DataFrame, optional
        Per-gene table with columns `entrez_id`, `essential_percent` and
        `avg_ceres_score`. Defaults to the notebook-level `gene_scores` table.

    Returns
    -------
    pandas.DataFrame
        One row per pair with shared interactors, with columns `A1_entrez`, `A2_entrez`,
        `shared_ppi_mean_essentiality`, `shared_ppi_percent_essential` and
        `shared_ppi_mean_ceres_score`. All three features are NaN when none of the pair's
        shared interactors has a score.

    Side effects: prints how many distinct shared interactors have no essentiality score.
    """
    if interactor_scores is None:
        interactor_scores = gene_scores

    # One row per (pair, shared interactor). explode() avoids building the wide, NaN-padded
    # frame (one column per set position) that .apply(pd.Series).stack() creates.
    pairs_with_shared = shared_ppi_summary[shared_ppi_summary.n_shared_ppi > 0]
    df = (pairs_with_shared.set_index(['A1_entrez', 'A2_entrez'])['shared_ppi']
                           .apply(list)
                           .explode()
                           .rename('shared_interactor')
                           .reset_index()
                           .astype({'shared_interactor': 'int'}))

    # Annotate each interactor w/ its essentiality and aggregate
    # NOTE: don't want to assign 0 if no scores are available
    df = pd.merge(df, interactor_scores.rename(columns={'entrez_id': 'shared_interactor'}),
                  how='left', on='shared_interactor')
    print('Interactors w/out essentiality:', df[df.essential_percent.isna()].shared_interactor.nunique(), '/',
          df.shared_interactor.nunique())

    # Compute avg interactor essentiality and % interactors that are broadly essential (> 90% of cell lines)
    def percent_broadly(x):
        # NOTE(review): len(x) also counts interactors without a score, so they lower this
        # fraction, whereas 'mean' skips them - confirm that this asymmetry is intended.
        if len(x) > 0 and not x.isna().all():
            return np.sum(x > 0.9) / len(x)
        return float('nan')

    ppi_essentiality = df.groupby(['A1_entrez', 'A2_entrez'])\
                         .agg({'essential_percent': ['mean', percent_broadly], 'avg_ceres_score': 'mean'})
    ppi_essentiality.columns = ['shared_ppi_mean_essentiality', 'shared_ppi_percent_essential',
                                'shared_ppi_mean_ceres_score']
    ppi_essentiality = ppi_essentiality.reset_index()

    return ppi_essentiality

In [14]:
# Essentiality of the interactors shared by each paralog pair, plus a preview of the first row
biogrid_ppi_essentiality = compute_shared_ppi_essentiality(shared_ppi_summary=biogrid_ppi_summary)
print('Mean percent essential:',
      biogrid_ppi_essentiality['shared_ppi_percent_essential'].mean()*100)
biogrid_ppi_essentiality.head(1)

Interactors w/out essentiality: 1182 / 10370
Mean percent essential: 12.384835164209688


Unnamed: 0,A1_entrez,A2_entrez,shared_ppi_mean_essentiality,shared_ppi_percent_essential,shared_ppi_mean_ceres_score
0,2,718,0.177503,0.166667,-0.26044


### Merge all PPI features

In [17]:
# Attach the shared-interactor essentiality features to the per-pair PPI summary.
# The join keys are spelled out so the merge cannot silently pick up another shared column,
# and validate= checks that the essentiality table has at most one row per pair.
biogrid_features = pd.merge(biogrid_ppi_summary.drop(columns=['shared_ppi']), biogrid_ppi_essentiality,
                            how='left', on=['A1_entrez', 'A2_entrez'], validate='many_to_one')

# Count the pairs that have shared interactors but no essentiality value for them.
# Nothing is filled in: the essentiality features of these pairs stay NaN.
print('Shared interactors w/out essentiality:', 
      biogrid_features[(biogrid_features.n_shared_ppi>0) & biogrid_features.shared_ppi_percent_essential.isna()].shape[0])

# The merges must not add or drop paralog pairs
assert biogrid_features.shape[0] == paralog_pairs.shape[0], 'number of rows differs from number of paralog pairs'
print('Interactions among paralogs:', sum(biogrid_features.interact),'/',biogrid_features.shape[0])
biogrid_features[:1]

Shared interactors w/out essentiality: 439
Interactions among paralogs: 2853 / 36648


Unnamed: 0,A1_entrez,A2_entrez,interact,n_total_ppi,n_shared_ppi,shared_ppi_jaccard_idx,fet_ppi_overlap,shared_ppi_mean_essentiality,shared_ppi_percent_essential,shared_ppi_mean_ceres_score
0,1,126014,False,0.0,0.0,0.0,0.0,,,


In [18]:
# Create the output folder if it does not exist yet, then write the features without the row index
os.makedirs(os.path.dirname(file_biogrid_features), exist_ok=True)
biogrid_features.to_csv(file_biogrid_features, index=False)