## Process protein-protein interactions

**Input**: PPI pairs from BIOGRID + CERES gene scores + processed paralog pairs

**Output**: Paralog pairs annotated with PPI features

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns
import re

get_data_path = lambda folders, fname: os.path.normpath(os.environ['3RD_PARTY_DIR']+'/'+'/'.join(folders) +'/'+ fname)
get_local_data_path = lambda folders, fname: os.path.normpath('../../local_data/' +'/'.join(folders) +'/'+ fname)
ensembl_version = '93'

# Inputs
file_biogrid_ppi = get_data_path(['biogrid'], 'BIOGRID-ORGANISM-Homo_sapiens-3.5.187.tab3.txt')
file_paralog_pairs = get_local_data_path(['processed', 'ensembl93'], 'unique_pairs.csv')
file_gene_scores = get_local_data_path(['processed', 'depmap20Q2'], 'gene_scores_16_04_21.csv')

# Output
file_ppi_features = get_local_data_path(['processed','paralog_features'], 'ppi_features.csv')

### Paralog pairs

In [2]:
# Unique, sorted list
paralog_pairs = pd.read_csv(file_paralog_pairs, index_col=0)
print('Num pairs:', paralog_pairs.shape[0])
paralog_genes = pd.concat([paralog_pairs.A1_entrez, paralog_pairs.A2_entrez]).unique()
print('N paralog genes:', len(paralog_genes))
paralog_pairs[:1]

Num pairs: 36648
N paralog genes: 13320


Unnamed: 0,A1,A2,min_seq_id,max_seq_id,singh_wgd,makino_wgd,WGD,same_chr,closest,family_size,family_id,cds_length_ratio,A1_entrez,A1_ensembl,A2_entrez,A2_ensembl
0,A1BG,OSCAR,0.127273,0.22028,False,False,False,True,False,3,3046,0.578629,1,ENSG00000121410,126014,ENSG00000170909


### Protein-Protein Interactions (PPI) from BioGRID

Using all methods for calling an interaction between A1 and A2.

There are techniques that measure direct physical interactions between protein pairs, named “binary” methods, while there are other techniques that measure physical interactions among groups of proteins, without pairwise determination of protein partners, named “co-complex” methods.

Multiple entries can refer to the same interaction but based on different Experimental System / different Throughput

In [3]:
biogrid_raw = pd.read_csv(file_biogrid_ppi, sep='\t',  low_memory=False)
biogrid_raw[:1]

Unnamed: 0,#BioGRID Interaction ID,Entrez Gene Interactor A,Entrez Gene Interactor B,BioGRID ID Interactor A,BioGRID ID Interactor B,Systematic Name Interactor A,Systematic Name Interactor B,Official Symbol Interactor A,Official Symbol Interactor B,Synonyms Interactor A,...,REFSEQ Accessions Interactor A,SWISS-PROT Accessions Interactor B,TREMBL Accessions Interactor B,REFSEQ Accessions Interactor B,Ontology Term IDs,Ontology Term Names,Ontology Term Categories,Ontology Term Qualifier IDs,Ontology Term Qualifier Names,Ontology Term Types
0,103,6416,2318,112315,108607,-,-,MAP2K4,FLNC,JNKK|JNKK1|MAPKK4|MEK4|MKK4|PRKMK4|SAPKK-1|SAP...,...,NP_003001|NP_001268364,Q14315,Q59H94,NP_001120959|NP_001449,-,-,-,-,-,-


In [4]:
def process_biogrid_ppi_data(data):
    # Filter interactions down to physical interactions between two human proteins
    hsapien = 9606
    biogrid_ppi = data[(data['Organism Interactor A']==hsapien) & (data['Organism Interactor B']==hsapien) &
                       (data['Experimental System Type']=='physical')]
    # Filter to columns of interest + rename
    biogrid_ppi = biogrid_ppi[['Entrez Gene Interactor A', 'Entrez Gene Interactor B', 'Experimental System']]
    biogrid_ppi = biogrid_ppi.rename(columns={'Entrez Gene Interactor A':'A1_entrez', 'Entrez Gene Interactor B':'A2_entrez',
                                              'Experimental System':'experimental_system'})
    biogrid_ppi = biogrid_ppi.drop_duplicates().reset_index(drop=True).astype({'A1_entrez':'int', 'A2_entrez':'int'})
    
    # Filter out self interactions
    biogrid_ppi = biogrid_ppi[biogrid_ppi.A1_entrez != biogrid_ppi.A2_entrez].reset_index(drop=True)
    print('N interactions:', biogrid_ppi.shape[0])
    
    # Sort interacting genes - to find unique interaction pairs
    biogrid_unique = pd.DataFrame(np.sort(biogrid_ppi[['A1_entrez','A2_entrez']], axis=1), 
                                  columns=['A1_entrez', 'A2_entrez']).drop_duplicates()
    assert(biogrid_unique.shape[0] <= biogrid_ppi.shape[0])

    print('N interactions after sorting for unique pairs:', biogrid_unique.shape[0])
    print('N genes in interaction map:', pd.concat([biogrid_unique.A1_entrez, biogrid_unique.A2_entrez]).nunique())
    
    return biogrid_unique

In [5]:
biogrid_unique = process_biogrid_ppi_data(biogrid_raw)
biogrid_genes = pd.concat([biogrid_unique.A1_entrez, biogrid_unique.A2_entrez]).unique()
display(biogrid_unique[:2])

N interactions: 495572
N interactions after sorting for unique pairs: 420095
N genes in interaction map: 18492


Unnamed: 0,A1_entrez,A2_entrez
0,2318,6416
1,88,84665


### Calculate shared and total protein-protein interactions for paralog pairs

In [6]:
# Assumes PPI has columns [A1_entrez, A2 entrez] and is sorted --> not symmetric and duplicates have been removed
# Assumes paralog_pairs is the unique list
def compute_ppi_summary_for_pairs(ppi, paralog_pairs):
    
    # Make ppi df symmetrical for merging with paralog pairs (which are the unique list)
    ppi_symmetric = pd.concat([ppi, ppi.rename(columns={'A1_entrez':'A2_entrez','A2_entrez':'A1_entrez'})])
    ppi_symmetric = ppi_symmetric.reset_index(drop=True)
    assert(ppi_symmetric.shape[0] == ppi.shape[0]*2)
    
    # Merge with paralog pairs - label interaction if paralog pair is found in PPI df
    paralog_ppi = pd.merge(paralog_pairs[['A1_entrez','A2_entrez']], ppi_symmetric, how='left', indicator='interact')
    paralog_ppi.interact = paralog_ppi.interact=='both'
    assert(paralog_ppi.shape[0] == paralog_pairs.shape[0])
    print('N. paralog pairs that interact:', sum(paralog_ppi.interact))
    
    # Use symmetric version of ppi table to get set of all interactors for each gene
    ppi_per_gene = ppi_symmetric.groupby('A1_entrez').agg({'A2_entrez':set}).reset_index()\
                                .rename(columns={'A1_entrez':'gene', 'A2_entrez':'ppi'})
    assert(ppi_per_gene.shape[0] == pd.concat([ppi.A1_entrez, ppi.A2_entrez]).nunique())

    # Merge ppi_per_gene with each A1 and A2 in all pairs
    # Note: pairs can have shared interactors even if there is no evidence they interact themselves
    df = pd.merge(paralog_ppi, ppi_per_gene.rename(columns={'gene':'A1_entrez','ppi':'A1_ppi'}), how='left')
    df = pd.merge(df, ppi_per_gene.rename(columns={'gene':'A2_entrez','ppi':'A2_ppi'}), how='left')
    assert(df.shape[0] == paralog_pairs.shape[0])
    print('N paralog pairs w/ 1+ interactor (A1 and/or A2):', df[(~df.A1_ppi.isna()) | (~df.A2_ppi.isna())].shape[0])
    
    # Fill NaNs with empty sets
    df['A1_ppi'] = df['A1_ppi'].apply(lambda d: d if not pd.isnull(d) else set())
    df['A2_ppi'] = df['A2_ppi'].apply(lambda d: d if not pd.isnull(d) else set())
    
    # Remove A2 gene in the set of interactors for A1 gene (and vice versa)
    # Don't want to include these in union for other calculations
    df.A1_ppi = df.apply(lambda x: x.A1_ppi - {x.A2_entrez}, axis=1)
    df.A2_ppi = df.apply(lambda x: x.A2_ppi - {x.A1_entrez}, axis=1)

    # Calculate total num interactors + shared interactors
    df['n_A1_ppi'] = df.apply(lambda x: len(x.A1_ppi), axis=1)
    df['n_A2_ppi'] = df.apply(lambda x: len(x.A2_ppi), axis=1)
    df['shared_ppi'] = df.apply(lambda x: x.A1_ppi.intersection(x.A2_ppi), axis=1)
    df['n_total_ppi'] = df.apply(lambda x: len(x.A1_ppi.union(x.A2_ppi)), axis=1)
    df['n_shared_ppi'] = df.apply(lambda x: len(x.shared_ppi), axis=1)

    # Calculate jaccard index for shared interactors
    def calc_jaccard_index(x):
        if x.n_shared_ppi == 0: return 0
        return x.n_shared_ppi / ((x.n_A1_ppi + x.n_A2_ppi) - x.n_shared_ppi)

    df['shared_ppi_jaccard_idx'] = df.apply(calc_jaccard_index, axis=1)

    # Calculate FET for overlap of interactors, N = all genes involved in interactions
    N = len(pd.concat([ppi.A1_entrez, ppi.A2_entrez]).unique())
    print('N genes involded in interactions:', N)
    assert(ppi_per_gene.shape[0] == N)
    # ctab:   | A2      | Not A2
    #      A1 | shared  | A1 only
    #  Not A1 | A2 only | N - union(A1, A2)
    def calc_fet_shared_ppi(x):
        ctab = pd.DataFrame({'A2': [x.n_shared_ppi, x.n_A2_ppi - x.n_shared_ppi],
                            'NA2': [x.n_A1_ppi - x.n_shared_ppi, N - x.n_total_ppi]}, index=['A1', 'NA1'])
        (OR, pval) = stats.fisher_exact(ctab)
        if pval==0: # Use smallest float64 number, to apply log10
            pval = np.nextafter(0, 1)
        log_pval = (-np.log10(pval)) if pval != 1 else 0
        log_pval = -log_pval if OR < 1 else log_pval # return negative pval if signif depletion in overlap
        return log_pval

    df['fet_ppi_overlap'] = df.apply(calc_fet_shared_ppi, axis=1)
    
    df = df.drop(columns=['A1_ppi','A2_ppi','n_A1_ppi','n_A2_ppi'])
    
    return df

In [7]:
biogrid_ppi_summary = compute_ppi_summary_for_pairs(biogrid_unique, paralog_pairs)
biogrid_ppi_summary[:1]

N. paralog pairs that interact: 2853
N paralog pairs w/ 1+ interactor (A1 and/or A2): 34457
N genes involded in interactions: 18492


Unnamed: 0,A1_entrez,A2_entrez,interact,shared_ppi,n_total_ppi,n_shared_ppi,shared_ppi_jaccard_idx,fet_ppi_overlap
0,1,126014,False,{},22,0,0.0,0.0


### Compute essentiality of shared interactors

#### Load Gene Scores

In [8]:
# Load gene scores
gene_scores_raw = pd.read_csv(file_gene_scores, index_col=0)
print('Gene scores:', gene_scores_raw.shape)
gene_scores_raw[:1]

Gene scores: (769, 16438)


Unnamed: 0,1,29974,2,144568,127550,53947,51146,8086,65985,13,...,221302,9183,55055,11130,79364,440590,79699,7791,23140,26009
ACH-000004,0.153,0.0372,-0.2442,-0.0256,-0.0196,-0.208,0.3096,-0.4438,0.2257,0.1447,...,-0.24,-0.1982,-0.132,-0.4609,0.1545,0.17,-0.4775,0.2669,0.1061,-0.2168


In [9]:
# Compute % of cell lines in which gene is essential + the avg. CERES score for each gene
gene_scores = pd.merge(gene_scores_raw.apply(lambda x: (x < -0.6).sum() / gene_scores_raw.shape[0]).reset_index(),
                       gene_scores_raw.mean().reset_index(), on=['index'])
gene_scores = gene_scores.rename(columns={'index':'entrez_id', '0_x':'essential_percent', '0_y':'avg_ceres_score'})
gene_scores = gene_scores.astype({'entrez_id':'int'})
print('Mean essential %%: %.2f, mean CERES: %.2f' % (gene_scores.essential_percent.mean()*100, 
                                                     gene_scores.avg_ceres_score.mean()))
display(gene_scores[:1])

Mean essential %: 10.60, mean CERES: -0.16


Unnamed: 0,entrez_id,essential_percent,avg_ceres_score
0,1,0.0,0.092581


In [10]:
def compute_shared_ppi_essentiality(ppi_summary, gene_scores):
    # Stack shared interactors set
    df = ppi_summary[ppi_summary.n_shared_ppi > 0].reset_index(drop=True)
    df = df.set_index(['A1_entrez','A2_entrez'])['shared_ppi'].apply(list).apply(pd.Series).stack()
    df = df.reset_index().drop(columns=['level_2']).rename(columns={0:'shared_interactor'}).astype({'shared_interactor':'int'})

    # Annotate each interactor w/ its essentiality and aggregate
    # NOTE: don't want to assign 0 if no scores are available
    df = pd.merge(df, gene_scores.rename(columns={'entrez_id':'shared_interactor'}), how='left')
    print('Interactors w/out essentiality:', df[df.essential_percent.isna()].shared_interactor.nunique(), '/', 
          df.shared_interactor.nunique())
    #df = df.fillna({'essential_percent':gene_scores.essential_percent.mean()})

    # Compute avg interactor essentiality and % interactors that are broadly essential (> 90% of cell lines)
    percent_broadly = lambda x: np.sum(x > 0.9) / len(x) if len(x)>0 and not x.isna().all() else float('nan')
    ppi_essentiality = df.groupby(['A1_entrez','A2_entrez'])\
                         .agg({'essential_percent':['mean', percent_broadly], 'avg_ceres_score':'mean'})
    ppi_essentiality.columns = ['shared_ppi_mean_essentiality','shared_ppi_percent_essential','shared_ppi_mean_ceres_score']
    ppi_essentiality = ppi_essentiality.reset_index()  

    return ppi_essentiality

In [11]:
biogrid_essentiality = compute_shared_ppi_essentiality(biogrid_ppi_summary, gene_scores)
print('Mean percent essential:', biogrid_essentiality.shared_ppi_percent_essential.mean())
biogrid_essentiality[:1]

Interactors w/out essentiality: 1185 / 10370
Mean percent essential: 0.1191932511829696


Unnamed: 0,A1_entrez,A2_entrez,shared_ppi_mean_essentiality,shared_ppi_percent_essential,shared_ppi_mean_ceres_score
0,2,718,0.177503,0.166667,-0.260447


### Merge all PPI features

In [12]:
biogrid_features = pd.merge(biogrid_ppi_summary.drop(columns=['shared_ppi']), biogrid_essentiality, how='left')
assert(biogrid_features.shape[0] == paralog_pairs.shape[0])

print('Shared interactors w/out essentiality:', 
      biogrid_features[(biogrid_features.n_shared_ppi>0) & biogrid_features.shared_ppi_percent_essential.isna()].shape[0])
print('Interactions between paralogs (biogrid all):', sum(biogrid_features.interact),'/',biogrid_features.shape[0])

biogrid_features[:1]

Shared interactors w/out essentiality: 477
Interactions between paralogs (biogrid all): 2853 / 36648


Unnamed: 0,A1_entrez,A2_entrez,interact,n_total_ppi,n_shared_ppi,shared_ppi_jaccard_idx,fet_ppi_overlap,shared_ppi_mean_essentiality,shared_ppi_percent_essential,shared_ppi_mean_ceres_score
0,1,126014,False,22,0,0.0,0.0,,,


In [13]:
biogrid_features.to_csv(file_ppi_features, index=0)