## Feature calculation/assembling - annotate all paralog pairs

Annotate all paralog pairs from Ensembl with a minimum of 20% sequence identity (in at least one direction)

**Inputs:**
* Ensembl paralog pairs
* Protein complex features (pre-computed)
* BioGRID PPI features (pre-computed)
* Ortholog/conservation features (pre-computed)
* Expression features (pre-computed)
* Protein age from Protein Historian
* Pfam domains from Ensembl
* Subcellular location from Protein Atlas

**Output:**
* All paralog pairs annotated with all features



In [1]:
import pandas as pd
import numpy as np
import os

def get_data_path(folders, fname):
    """Build the normalised path of a third-party data file.

    The root directory is read from the `3RD_PARTY_DIR` environment variable at call time.

    Parameters
    ----------
    folders : list of str
        Sub-folder names below the root, outermost first (may be empty).
    fname : str
        File name.

    Returns
    -------
    str
        `<3RD_PARTY_DIR>/<folders...>/<fname>` passed through `os.path.normpath`.

    Raises
    ------
    KeyError
        If `3RD_PARTY_DIR` is not set in the environment.
    """
    return os.path.normpath(os.environ['3RD_PARTY_DIR'] + '/' + '/'.join(folders) + '/' + fname)


def get_local_data_path(folders, fname):
    """Build the normalised path of a file below the `../../local_data/` folder.

    Parameters
    ----------
    folders : list of str
        Sub-folder names below `../../local_data/`, outermost first (may be empty).
    fname : str
        File name.

    Returns
    -------
    str
        `../../local_data/<folders...>/<fname>` passed through `os.path.normpath`.
    """
    return os.path.normpath('../../local_data/' + '/'.join(folders) + '/' + fname)

# Folder list shared by all pre-computed feature tables (and by the output file)
paralog_features_dir = ['processed', 'paralog_features']

# Inputs: locally processed files
file_unique_pairs = get_local_data_path(['processed', 'ensembl93'], 'unique_pairs.csv')
file_complex_features = get_local_data_path(paralog_features_dir, 'protein_complex_features.csv')
file_biogrid_features = get_local_data_path(paralog_features_dir, 'biogrid_ppi_features.csv')
file_ortholog_features = get_local_data_path(paralog_features_dir, 'ortholog_features.csv')
file_gtex_expr_feats = get_local_data_path(paralog_features_dir, 'gtex_expr_features.csv')

# Inputs: third-party downloads
file_protein_age = get_data_path(['protein_historian'], 'HUMAN_PPODv4_PTHR7-OrthoMCL_wagner1.0_ages.txt')
file_pfam_domains = get_data_path(['ensembl', '93'], 'paralog_pfam_domain_ids.txt')
file_protein_location = get_data_path(['protein_atlas'], 'subcellular_location_v19.3.tsv')

# Output: all paralog pairs annotated with all features
file_all_features = get_local_data_path(paralog_features_dir, 'all_features.csv')

In [2]:
# Load the paralog pairs; the first CSV column is used as the row index
pairs = pd.read_csv(file_unique_pairs, index_col=0)

# Distinct genes are counted over both members (A1 and A2) of every pair
n_pairs = len(pairs)
n_genes = pd.concat([pairs.A1_ensembl, pairs.A2_ensembl]).nunique()
print('Num pairs:', n_pairs, ', num genes:', n_genes)
pairs.head(1)

Num pairs: 36648 , num genes: 13320


Unnamed: 0,A1,A2,min_seq_id,max_seq_id,singh_wgd,makino_wgd,WGD,same_chr,closest,family_size,cds_length_ratio,A1_entrez,A1_ensembl,A2_entrez,A2_ensembl
0,A1BG,OSCAR,0.127273,0.22028,False,False,False,True,False,3,0.578629,1,ENSG00000121410,126014,ENSG00000170909


### 1. Protein complex features
- Either gene in (essential) complex
- Avg. essentiality of complexes in which gene(s) are members

In [3]:
# Inner-join the pre-computed complex features onto the pairs' Entrez id columns
# (pandas joins on the columns the two frames have in common)
pair_entrez_ids = pairs[['A1_entrez','A2_entrez']]
complex_features = pd.merge(pair_entrez_ids, pd.read_csv(file_complex_features))

# Every pair must have exactly one row of complex features
assert(len(complex_features) == len(pairs))

n_either_in_complex = sum(complex_features.either_in_complex)
n_either_in_essential_complex = sum(complex_features.either_in_essential_complex)
avg_complex_essentiality = complex_features.mean_complex_essentiality.mean()
print('Either in complex:', n_either_in_complex)
print('Either in essential complex:', n_either_in_essential_complex)
print('Avg. complex essentiality: %.2f%%' % (avg_complex_essentiality*100))
complex_features.head(1)

Either in complex: 8060
Either in essential complex: 2487
Avg. complex essentiality: 19.86%


Unnamed: 0,A1_entrez,A2_entrez,either_in_complex,in_same_complex,either_in_essential_complex,mean_complex_essentiality,flag_missing_scores
0,1,126014,False,False,False,,False


### 2. Protein-protein interactions
- Whether genes in a pair have a direct or indirect protein interaction with each other
- Total num interactors
- Fraction of interactors that are shared
- Avg essentiality of shared interactors

In [4]:
# Inner-join the pre-computed BioGRID PPI features onto the pairs' Entrez id columns
# (pandas joins on the columns the two frames have in common)
pair_entrez_ids = pairs[['A1_entrez','A2_entrez']]
ppi_features = pd.merge(pair_entrez_ids, pd.read_csv(file_biogrid_features))

# Every pair must have exactly one row of PPI features
assert(len(ppi_features) == len(pairs))

n_interacting_pairs = sum(ppi_features.interact)
print('Interactions between paralog pairs:', n_interacting_pairs)
ppi_features.head(1)

Interactions between paralog pairs: 2853


Unnamed: 0,A1_entrez,A2_entrez,interact,n_total_ppi,n_shared_ppi,shared_ppi_jaccard_idx,fet_ppi_overlap,shared_ppi_mean_essentiality,shared_ppi_percent_essential,shared_ppi_mean_ceres_score
0,1,126014,False,0.0,0.0,0.0,0.0,,,


### 3. Protein domains
* Jaccard index of shared domains

In [5]:
# Load the gene / Pfam domain table; rows with any missing value are dropped
domains = (pd.read_csv(file_pfam_domains)
             .rename(columns={'Gene stable ID':'ensembl_id','Pfam domain ID':'pfam_id'})
             .dropna())
domain_rows_per_gene = domains.groupby('ensembl_id').pfam_id.count()
print('Avg domains per gene:', domain_rows_per_gene.mean())

# Get domains for each gene in each pair
domains_per_gene = domains.groupby('ensembl_id').agg({'pfam_id': set}).reset_index()
display(domains_per_gene[:1])
pair_ensembl_ids = pairs[['A1_ensembl','A2_ensembl']]
a1_domains = domains_per_gene.rename(columns={'ensembl_id':'A1_ensembl'})
a2_domains = domains_per_gene.rename(columns={'ensembl_id':'A2_ensembl'})
# Inner joins: only pairs where both genes have domains survive here.
# After the second join, pfam_id_x holds A1's domain set and pfam_id_y holds A2's.
df = pd.merge(pair_ensembl_ids, a1_domains)
df = pd.merge(df, a2_domains, on=['A2_ensembl'])

# Calculate num shared domains and jaccard index (|intersection| / |union|)
df['n_shared_domains'] = [len(a1 & a2) for a1, a2 in zip(df.pfam_id_x, df.pfam_id_y)]
df['shared_domains'] = [len(a1 & a2) / len(a1 | a2) for a1, a2 in zip(df.pfam_id_x, df.pfam_id_y)]

# Merge back with pairs; pairs dropped by the inner joins above get 0 for both features
domain_features = pd.merge(pair_ensembl_ids, df.drop(columns=['pfam_id_x','pfam_id_y']), how='left')
domain_features = domain_features.fillna({'n_shared_domains':0, 'shared_domains':0})
assert(domain_features.shape[0] == pairs.shape[0])

print('Pairs w/ 1+ shared domain:', (domain_features.n_shared_domains > 0).sum())
print('Pairs w/ 2+ shared domain:', (domain_features.n_shared_domains > 1).sum())
domain_features.head(1)

Avg domains per gene: 1.7544378698224852


Unnamed: 0,ensembl_id,pfam_id
0,ENSG00000000003,{PF00335}


Pairs w/ 1+ shared domain: 35476
Pairs w/ 2+ shared domain: 14019


Unnamed: 0,A1_ensembl,A2_ensembl,n_shared_domains,shared_domains
0,ENSG00000121410,ENSG00000170909,1.0,1.0


### 4. Orthologs in S. pombe, S. cerevisiae + cross-species conservation
* Has pombe/cerevisiae ortholog
* Has essential pombe/cerevisiae ortholog
* Conservation score

In [6]:
# Inner-join the pre-computed ortholog features with the pairs' gene symbols
# (pandas joins on the columns the two frames have in common)
pair_symbols = pairs[['A1','A2']]
ortholog_features = pd.merge(pd.read_csv(file_ortholog_features), pair_symbols)

# Every pair must have exactly one row of ortholog features
assert(len(ortholog_features) == len(pairs))

n_pombe = sum(ortholog_features.has_pombe_ortholog)
n_cerevisiae = sum(ortholog_features.has_cerevisiae_ortholog)
print('Pairs w/ pombe, cerevisiae ortholog:', n_pombe, ',', n_cerevisiae)
print('Avg. conservation score:', ortholog_features.conservation_score.mean())
ortholog_features.head(1)

Pairs w/ pombe, cerevisiae ortholog: 5757 , 8172
Avg. conservation score: 4.812404496834752


Unnamed: 0,A1,A2,A1_ensembl,A2_ensembl,has_cerevisiae_ortholog,has_essential_cerevisiae_ortholog,has_single_essential_cerevisiae_ortholog,has_pombe_ortholog,has_essential_pombe_ortholog,has_single_essential_pombe_ortholog,conservation_score
0,A1BG,OSCAR,ENSG00000121410,ENSG00000170909,False,False,False,False,False,False,3


### 5. Protein age from ProteinHistorian
Mean age for A1 and A2

In [7]:
# The age file has no header row; lines starting with '#' are skipped as comments
protein_ages = pd.read_csv(file_protein_age, sep='\t', comment='#', names=['protein','age','taxon'])
print('N genes w/ ages:', protein_ages.shape[0])
display(protein_ages[:1])

# Extract ensembl id: the part of the protein label before the first '~'
protein_ages['ensembl_id'] = protein_ages.protein.apply(lambda x: x.split('~')[0])
protein_ages = protein_ages[['ensembl_id','age']]

# Assign age for each gene (left joins: genes without an age get NaN)
# After the second join, age_x is A1's age and age_y is A2's age.
paralog_ages = pd.merge(pairs[['A1_ensembl','A2_ensembl']], 
                        protein_ages.rename(columns={'ensembl_id':'A1_ensembl'}), how='left')
paralog_ages = pd.merge(paralog_ages, protein_ages.rename(columns={'ensembl_id':'A2_ensembl'}), on=['A2_ensembl'], how='left')

# A left join cannot lose pairs, but an ensembl id occurring more than once in the age
# table would duplicate them; fail here rather than silently carrying extra rows forward.
assert(paralog_ages.shape[0] == pairs.shape[0])

paralog_ages = paralog_ages.astype({'age_x':'float', 'age_y':'float'})
print('A1-A2 age correlation:', paralog_ages.age_x.corr(paralog_ages.age_y))

# Calculate mean age (row-wise mean of the two ages)
paralog_ages['mean_age'] = paralog_ages[['age_x','age_y']].mean(axis=1)
print('N. pairs NA:', paralog_ages[paralog_ages.mean_age.isna()].shape[0])
print('Avg. age of pairs:', paralog_ages.mean_age.mean())

age_features = paralog_ages[['A1_ensembl','A2_ensembl','mean_age']]
age_features[:1]

N genes w/ ages: 12625


Unnamed: 0,protein,age,taxon
0,ENSG00000092850~Q9UIF3~TEKT2,842.0,Deuterostomia


A1-A2 age correlation: 0.5437136119265842
N. pairs NA: 1513
Avg. age of pairs: 591.6427380105307


Unnamed: 0,A1_ensembl,A2_ensembl,mean_age
0,ENSG00000121410,ENSG00000170909,210.95


### 6. Gene expression features
- Correlation of A1 and A2 gene expression - using all available GTEx expression data
- Min/max mean expression
- Expression asymmetry

In [8]:
gtex_expr_features = pd.read_csv(file_gtex_expr_feats)

# Replace A1/A2 mean expression with min/max mean expression.
# Python's built-in min()/max() are order-dependent when one argument is NaN
# (min(nan, x) is nan, but min(x, nan) is x), so the result would depend on which gene
# happens to be A1. The row-wise pandas reductions with skipna=False are symmetric:
# if either mean is missing, both min and max are NaN.
mean_expr = gtex_expr_features[['A1_mean_expr', 'A2_mean_expr']]
gtex_expr_features = gtex_expr_features.assign(
    min_mean_expr = mean_expr.min(axis=1, skipna=False),
    max_mean_expr = mean_expr.max(axis=1, skipna=False))
gtex_expr_features = gtex_expr_features.drop(columns=['A1_mean_expr', 'A2_mean_expr'])

display(gtex_expr_features[:1])

Unnamed: 0,A1_ensembl,A2_ensembl,spearman_corr,pearson_corr,min_mean_expr,max_mean_expr
0,ENSG00000092850,ENSG00000163060,0.457234,0.965852,1.181367,4.524118


In [9]:
# Add prefix for data source to every column except the two pair id columns
pair_id_cols = ['A1_ensembl','A2_ensembl']
prefixed_names = {col: 'gtex_' + col for col in gtex_expr_features.columns if col not in pair_id_cols}
expr_features = gtex_expr_features.rename(columns=prefixed_names)

# Left join onto all pairs: pairs without expression features keep NaN
expr_features = pd.merge(pairs[pair_id_cols], expr_features, how='left')
assert(len(expr_features) == len(pairs))
expr_features.head(1)

Unnamed: 0,A1_ensembl,A2_ensembl,gtex_spearman_corr,gtex_pearson_corr,gtex_min_mean_expr,gtex_max_mean_expr
0,ENSG00000121410,ENSG00000170909,0.273243,-0.030284,9.456192,10.980585


### 7. Protein Subcellular Location

Could treat this as binary feature (T if both paralogs have at least one common localization) or do Jaccard (common localizations / union of localizations).

Using all annotations for Main and Additional locations, except those with reliability=='Uncertain'  
https://www.proteinatlas.org/about/assays+annotation#ifre 


Alternative? https://compartments.jensenlab.org/Downloads

In [10]:
# Load the tab-separated Protein Atlas subcellular location table
protein_atlas_location_raw = pd.read_csv(file_protein_location, sep='\t')
n_atlas_genes = protein_atlas_location_raw['Gene'].nunique()
print('N genes:', n_atlas_genes)
display(protein_atlas_location_raw.head(1))

N genes: 12390


Unnamed: 0,Gene,Gene name,Reliability,Main location,Additional location,Extracellular location,Enhanced,Supported,Approved,Uncertain,Single-cell variation intensity,Single-cell variation spatial,Cell cycle dependency,GO id
0,ENSG00000000003,TSPAN6,Approved,Cell Junctions;Cytosol,Nucleoli fibrillar center,,,,Cell Junctions;Cytosol;Nucleoli fibrillar center,,Cytosol,,,Cell Junctions (GO:0030054);Cytosol (GO:000582...


In [11]:
def _stack_locations(annotations, column):
    """Turn one ';'-separated location column into long format.

    Returns a frame with columns 'Gene' and 'location', one row per (gene, location).
    A missing annotation contributes only a NaN, which stack() is relied on to drop.
    """
    per_gene = annotations.set_index('Gene')[column]
    wide = per_gene.apply(lambda x: pd.Series(x.split(';') if not pd.isna(x) else x))
    stacked = wide.stack().reset_index(level=1, drop=True)
    return stacked.reset_index().rename(columns={0:'location'})

# Filter out uncertain annotations
is_reliable = protein_atlas_location_raw.Reliability != 'Uncertain'
locations = protein_atlas_location_raw[is_reliable].reset_index()
locations = locations[['Gene', 'Gene name', 'Main location', 'Additional location']]

# Stack location mappings
df_main = _stack_locations(locations, 'Main location')
df_add = _stack_locations(locations, 'Additional location')

# Merge into id - locations set map (main and additional locations pooled per gene)
location_map = pd.concat([df_main, df_add]).drop_duplicates()
location_map = location_map.groupby('Gene').location.apply(set).reset_index()
print('N genes:', len(location_map))
location_map.head(1)

N genes: 11723


Unnamed: 0,Gene,location
0,ENSG00000000003,"{Cytosol, Cell Junctions, Nucleoli fibrillar c..."


In [12]:
# Merge in set for A1 and A2, treating lack of data as empty set 
df = pd.merge(pairs[['A1_ensembl', 'A2_ensembl']], 
              location_map.rename(columns={'Gene':'A1_ensembl', 'location':'A1_loc'}), how='left')
df = pd.merge(df, location_map.rename(columns={'Gene':'A2_ensembl', 'location':'A2_loc'}), how='left')
# The left joins must keep exactly one row per pair
assert(df.shape[0] == pairs.shape[0])

# Genes without annotation come out of the left joins as NaN. Replace them element-wise:
# assigning `set()` through .loc depends on how pandas coerces an empty set (list-like of
# length 0 vs. scalar), whereas this form is explicit and gives each row its own empty set.
for loc_col in ['A1_loc', 'A2_loc']:
    df[loc_col] = df[loc_col].apply(lambda x: x if isinstance(x, set) else set())

df['loc_intersection'] = df.apply(lambda x: x.A1_loc.intersection(x.A2_loc), axis=1)
df['loc_union'] = df.apply(lambda x: x.A1_loc.union(x.A2_loc), axis=1)
# Jaccard index of the two location sets; 0 when neither gene has any location
df['colocalisation'] = df.apply(lambda x: len(x.loc_intersection)/len(x.loc_union) if len(x.loc_union)>0 else 0, axis=1)
# Binary version: True if the two genes share at least one location
df['same_subcell_loc'] = df.loc_intersection.apply(lambda x: len(x) > 0)

print('N w/ shared subcellular location:', df[df.same_subcell_loc].shape[0])
location_features = df[['A1_ensembl', 'A2_ensembl', 'colocalisation', 'same_subcell_loc']]
location_features[:1]

N w/ shared subcellular location: 7220


Unnamed: 0,A1_ensembl,A2_ensembl,colocalisation,same_subcell_loc
0,ENSG00000121410,ENSG00000170909,0.0,False


### Merge all features together

In [13]:
# Fill in mean for pairs where gene scores / expr / age was not available

def impute_missing_values(pairs):
    """Return a copy of `pairs` in which every missing value has been filled in.

    Imputation rules, applied in this order:
      1. `mean_complex_essentiality`: NaN replaced by the column mean, only in rows
         where `flag_missing_scores` is True.
      2. `shared_ppi_mean_essentiality`, `shared_ppi_percent_essential` and
         `shared_ppi_mean_ceres_score`: NaN replaced by the column mean, only in rows
         where `n_shared_ppi` > 0.
      3. Columns whose name starts with 'gtex', and `mean_age`: every NaN replaced by
         the column mean.
      4. Any NaN still left, in any column, replaced by 0.

    Rules 1-3 skip target columns that are absent. The columns `flag_missing_scores`,
    `n_shared_ppi` and `shared_ppi_percent_essential` are required.

    Two summary counts are printed before imputing.

    Parameters
    ----------
    pairs : pandas.DataFrame
        Feature table; it is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the same rows and columns and no missing values.
    """
    # Work on a copy so the .loc assignments below never leak into the caller's frame
    pairs = pairs.copy()

    missing_complex_scores = pairs.flag_missing_scores
    has_shared_ppi = pairs.n_shared_ppi > 0

    print('Pairs in complex, missing scores:', pairs[missing_complex_scores].shape[0])
    print('Pairs w/ shared ppi, missing scores:', 
          pairs[has_shared_ppi & pairs.shared_ppi_percent_essential.isna()].shape[0])

    # Conditional imputation: assign only the target column, so the dtypes of all other
    # columns are left alone (whole-row assignment could disturb them).
    for col in ['mean_complex_essentiality']:
        if col in pairs.columns:
            pairs.loc[missing_complex_scores & pairs[col].isna(), col] = pairs[col].mean()

    for col in ['shared_ppi_mean_essentiality', 'shared_ppi_percent_essential', 'shared_ppi_mean_ceres_score']:
        if col in pairs.columns:
            pairs.loc[has_shared_ppi & pairs[col].isna(), col] = pairs[col].mean()

    # Unconditional mean imputation for expression and age features
    for col in [col for col in pairs.columns if col.startswith('gtex') or col in ['mean_age']]:
        pairs = pairs.fillna({col:pairs[col].mean()})

    # Whatever is still missing (e.g. scores of pairs that were not flagged above) becomes 0
    pairs = pairs.fillna(0)
    return pairs

In [14]:
# Some features were already in the paralog pairs file
pair_file_columns = ['A1','A2','A1_ensembl','A2_ensembl','A1_entrez','A2_entrez', 'closest',
                     'min_seq_id', 'max_seq_id', 'WGD', 'family_size','same_chr', 'cds_length_ratio']
data = pairs[pair_file_columns].rename(
    columns={'min_seq_id':'min_sequence_identity', 'max_seq_id':'max_sequence_identity'})

# Inner-join each feature table in turn; pandas joins on the columns shared with `data`
feature_tables = [complex_features, ppi_features, domain_features, expr_features,
                  ortholog_features, age_features, location_features]
for feature_table in feature_tables:
    data = pd.merge(data, feature_table)

# Show the columns that still contain missing values, with their NaN counts
na_counts = data.isna().sum()
display(na_counts[na_counts>0])

data = impute_missing_values(data)
# No pair may be lost or duplicated by the joins above
assert(pairs.shape[0]==data.shape[0])
data.head(3)

mean_complex_essentiality       28728
shared_ppi_mean_essentiality    22342
shared_ppi_percent_essential    22342
shared_ppi_mean_ceres_score     22342
gtex_spearman_corr                507
gtex_pearson_corr                 507
gtex_min_mean_expr                 85
gtex_max_mean_expr                 85
mean_age                         1513
dtype: int64

Pairs in complex, missing scores: 60
Pairs w/ shared ppi, missing scores: 477


Unnamed: 0,A1,A2,A1_ensembl,A2_ensembl,A1_entrez,A2_entrez,closest,min_sequence_identity,max_sequence_identity,WGD,...,has_cerevisiae_ortholog,has_essential_cerevisiae_ortholog,has_single_essential_cerevisiae_ortholog,has_pombe_ortholog,has_essential_pombe_ortholog,has_single_essential_pombe_ortholog,conservation_score,mean_age,colocalisation,same_subcell_loc
0,A1BG,OSCAR,ENSG00000121410,ENSG00000170909,1,126014,False,0.127273,0.22028,False,...,False,False,False,False,False,False,3,210.95,0.0,False
1,A1BG,TARM1,ENSG00000121410,ENSG00000248385,1,441864,False,0.149495,0.265233,False,...,False,False,False,False,False,False,3,97.4,0.0,False
2,OSCAR,TARM1,ENSG00000170909,ENSG00000248385,126014,441864,True,0.269231,0.275986,False,...,False,False,False,False,False,False,3,324.5,0.0,False


In [15]:
# Show the column names of the merged feature table
data.columns

Index(['A1', 'A2', 'A1_ensembl', 'A2_ensembl', 'A1_entrez', 'A2_entrez',
       'closest', 'min_sequence_identity', 'max_sequence_identity', 'WGD',
       'family_size', 'same_chr', 'cds_length_ratio', 'either_in_complex',
       'in_same_complex', 'either_in_essential_complex',
       'mean_complex_essentiality', 'flag_missing_scores', 'interact',
       'n_total_ppi', 'n_shared_ppi', 'shared_ppi_jaccard_idx',
       'fet_ppi_overlap', 'shared_ppi_mean_essentiality',
       'shared_ppi_percent_essential', 'shared_ppi_mean_ceres_score',
       'n_shared_domains', 'shared_domains', 'gtex_spearman_corr',
       'gtex_pearson_corr', 'gtex_min_mean_expr', 'gtex_max_mean_expr',
       'has_cerevisiae_ortholog', 'has_essential_cerevisiae_ortholog',
       'has_single_essential_cerevisiae_ortholog', 'has_pombe_ortholog',
       'has_essential_pombe_ortholog', 'has_single_essential_pombe_ortholog',
       'conservation_score', 'mean_age', 'colocalisation', 'same_subcell_loc'],
      dtype='obj

In [16]:
# List the columns that still contain at least one NaN
column_has_na = data.isna().any()
column_has_na[column_has_na]

Series([], dtype: bool)

In [17]:
# Write the annotated pairs to disk without the row index
data.to_csv(file_all_features, index=False)