## Protein-coding paralog genes set from Ensembl data

**Input:** Ensembl list of paralogs & gene id, WGD and protein complex data

**Outputs:**  
- All protein coding paralog pairs  
- Closest protein coding paralog pairs  
- Paralog gene summary

Using Ensembl release 93: http://jul2018.archive.ensembl.org/index.html

Selected attributes in Ensembl query:
- Gene stable ID
- Human paralogue gene stable ID
- Paralogue %id. target Human gene identical to query gene
- Paralogue %id. query gene identical to target Human gene

Notes:  
Ensembl paralog data is listed as symmetric paralog pairs (each pair appears once in each direction).  
The sequence identity of a paralog pair is not necessarily symmetric because the paralogs may have different lengths.  

In [9]:
import pandas as pd
import numpy as np
import os
import re
import seaborn as sns
import matplotlib.pyplot as plt

def get_data_path(folders, fname):
    """Build the normalised path of a third-party file.

    The path is rooted at the directory named by the `3RD_PARTY_DIR` environment variable,
    followed by the sub-folders in `folders` and finally the file name `fname`.
    """
    root = os.environ['3RD_PARTY_DIR']
    sub_path = '/'.join(folders)
    return os.path.normpath(root + '/' + sub_path + '/' + fname)


def get_local_data_path(folders, fname):
    """Build the normalised path of a file below the relative `../data/` directory."""
    sub_path = '/'.join(folders)
    return os.path.normpath('../data/' + sub_path + '/' + fname)

# Inputs: third-party downloads are resolved by get_data_path, locally processed files by get_local_data_path
file_ensembl_data = get_data_path(folders=['ensembl', '93'], fname='paralog_pairs_all.txt')
file_id_map = get_data_path(folders=['HGNC'], fname='non_alt_loci_set_01_02_19.txt')
file_singh_wgds = get_data_path(folders=['WGD_sources', 'singh_2015'], fname='Pairs_HUMAN_Strict.txt')
file_makino_wgds = get_data_path(folders=['WGD_sources'], fname='makino_2010_ohnologs.xls')
file_complexes_map = get_local_data_path(folders=['processed'], fname='CORUM_entrez_map.csv')

# Outputs: all three tables are written to the same processed/ensembl93 folder
output_folders = ['processed', 'ensembl93']
file_all_pairs = get_local_data_path(folders=output_folders, fname='all_pairs.csv')
file_closest_pairs = get_local_data_path(folders=output_folders, fname='closest_pairs.csv')
file_paralog_summary = get_local_data_path(folders=output_folders, fname='paralog_summary.csv')

### Paralog pairs - HGNC protein-coding 

In [2]:
# Load the Ensembl paralog export (parsed as comma-separated, the read_csv default)
raw_data = pd.read_csv(file_ensembl_data, sep=',')

In [3]:
# Clean up columns: drop the reverse-direction identity, shorten the remaining names and
# convert the identity from a percentage to a fraction
column_names = {
    'Gene stable ID': 'A1_ensembl',
    'Gene name': 'symbol',
    'Human paralogue gene stable ID': 'A2_ensembl',
    'Paralogue %id. target Human gene identical to query gene': 'percent_matched_in_paralog',
}
data = (
    raw_data
    .drop(columns=['Paralogue %id. query gene identical to target Human gene'])
    .rename(columns=column_names)
    .assign(percent_matched_in_paralog=lambda renamed: renamed.percent_matched_in_paralog / 100)
)
print('N', data.shape[0])

N 3654800


In [4]:
# HGNC table (tab-separated), reduced to the columns needed for the Ensembl -> locus type / Entrez mapping
hgnc_columns = ['ensembl_gene_id', 'locus_type', 'entrez_id']
id_map = pd.read_csv(file_id_map, sep='\t', low_memory=False).loc[:, hgnc_columns]
id_map.head(1)

Unnamed: 0,ensembl_gene_id,locus_type,entrez_id
0,ENSG00000121410,gene with protein product,1.0


In [5]:
# Discard rows that list no paralog for the query gene
paralog_data = data[data.A2_ensembl.notna()].reset_index(drop=True)

# Attach the HGNC locus type of each pair member; the second merge finds `locus_type` already
# present, so pandas suffixes the columns as locus_type_x (A1) and locus_type_y (A2)
locus_types = id_map.drop(columns=['entrez_id'])
for gene_col in ['A1_ensembl', 'A2_ensembl']:
    paralog_data = pd.merge(paralog_data, locus_types.rename(columns={'ensembl_gene_id': gene_col}),
                            on=gene_col, how='inner')

# Only keep pairs where A1 and A2 are both protein-coding
both_coding = paralog_data[['locus_type_x', 'locus_type_y']].eq('gene with protein product').all(axis=1)
coding_pairs = (
    paralog_data[both_coding]
    .drop(columns=['locus_type_x', 'locus_type_y'])
    .reset_index(drop=True)
)
# The index must be a gap-free 0..n-1 range after the reset
assert coding_pairs.index[-1] == coding_pairs.shape[0] - 1

print('Protein coding genes w/ a paralog in Ensembl:', coding_pairs.A1_ensembl.nunique())
coding_pairs[:2]

Protein coding genes w/ a paralog in Ensembl: 13684


Unnamed: 0,A1_ensembl,A2_ensembl,percent_matched_in_paralog
0,ENSG00000092850,ENSG00000163060,0.293023
1,ENSG00000167858,ENSG00000163060,0.375598


In [6]:
# Build an order-independent key for every pair and determine its minimum sequence identity
pair_cols = ['A1_ensembl', 'A2_ensembl']
key_cols = ['A1_sorted', 'A2_sorted']

# np.sort orders the two IDs within each row, so (A, B) and (B, A) share the same key
pair_keys = pd.DataFrame(np.sort(coding_pairs[pair_cols]), columns=key_cols)
sorted_pairs = pd.concat([pair_keys, coding_pairs[pair_cols + ['percent_matched_in_paralog']]], axis=1)
# Number of rows per unordered pair (not used by the other cells shown in this notebook)
num_duplicated = sorted_pairs.groupby(key_cols).A1_ensembl.count()
# Sanity check: exactly half of the rows repeat an already-seen unordered pair,
# as expected when every pair is listed once in each direction
assert sorted_pairs.duplicated(subset=key_cols).sum() * 2 == coding_pairs.shape[0]

# Min. sequence identity over the rows of each unordered pair
sorted_pairs['min_seq_id'] = sorted_pairs.groupby(key_cols)['percent_matched_in_paralog'].transform('min')

# Keep only pairs whose minimum (i.e. reciprocal) sequence identity is at least 20%
passes_threshold = sorted_pairs.min_seq_id >= 0.2
pairs_min_20 = sorted_pairs[passes_threshold].reset_index(drop=True)
print('Unique protein coding pairs w/ >=20% seq id:', pairs_min_20.drop_duplicates(subset=key_cols).shape[0])

# Drop the helper key columns to return to the original (directional) pair list
pairs_min_20 = pairs_min_20.drop(columns=key_cols)
print('A1s in those paralog pairs:', pairs_min_20.A1_ensembl.nunique())
pairs_min_20[:1]

Unique protein coding pairs w/ >=20% seq id: 28538
A1s in those paralog pairs: 12138


Unnamed: 0,A1_ensembl,A2_ensembl,percent_matched_in_paralog,min_seq_id
0,ENSG00000092850,ENSG00000163060,0.293023,0.289655


### Annotations

In [10]:
# 1. Add duplication mode - union of WGDs from two sources
def _symmetric_ohnolog_pairs(raw_pairs, id_columns):
    """Return the ohnolog pairs of `raw_pairs` as a symmetric table with columns Ohno1 and Ohno2.

    The two IDs in `id_columns` are sorted within each row so that repeated listings of the same
    pair collapse to one row; every unique pair is then emitted in both orientations.
    """
    unique_pairs = pd.DataFrame(np.sort(raw_pairs[id_columns]), columns=['Ohno1', 'Ohno2']).drop_duplicates()
    flipped = unique_pairs.rename(columns={'Ohno1': 'Ohno2', 'Ohno2': 'Ohno1'})
    return pd.concat([unique_pairs, flipped], sort=True)


singh_raw = pd.read_csv(file_singh_wgds, sep='\t')
display(singh_raw[:1])
singh_wgd = _symmetric_ohnolog_pairs(singh_raw, ['Ohnolog-1 Id', 'Ohnolog-2 Id'])
# The table holds both orientations, hence the division by two
print('All singh WGDs:', int(singh_wgd.shape[0]/2))

makino_raw = pd.read_excel(file_makino_wgds, skiprows=1, skipfooter=3, sheet_name='Table S7')
display(makino_raw[:1])
makino_wgd = _symmetric_ohnolog_pairs(makino_raw, ['Ohnolog1', 'Ohnolog2'])
print('All makino WGDs:', int(makino_wgd.shape[0]/2))

# Outer merge on the shared Ohno1/Ohno2 columns = union of the two sources
all_wgds = singh_wgd.merge(makino_wgd, how="outer")

# Left-merge the WGD union onto the paralog pairs; the merge indicator is "both"
# exactly for the pairs that are also present in the WGD union
wgd_lookup = all_wgds.rename(columns={'Ohno1': 'A1_ensembl', 'Ohno2': 'A2_ensembl'})
pairs_annot = pairs_min_20.merge(wgd_lookup, how='left', indicator='WGD')
pairs_annot['WGD'] = pairs_annot['WGD'] == "both"
print('N WGD among Ensembl paralog pairs:', int(pairs_annot[pairs_annot.WGD].shape[0]/2))
pairs_annot[:1]

Unnamed: 0,Ohnolog-1 Id,Ohnolog-2 Id,Ohnolog-1 Symbol,Ohnolog-2 Symbol,Synteny Outgroup Support,Combine p-value(self) from all vetebrates,Combine p-value(outgroup) from all vetebrates,p-value from self comparison,Combined p-value for all outgroups,P-value for Amphioxus,P-value for Ciona intestinalis,P-value for Ciona savignyi,P-value for Drosophila,P-value for Sea Urchin,P-value for Worm,Duplication node form Ensembl
0,ENSG00000095464,ENSG00000132915,PDE6C,PDE6A,4.0,8e-06,7.6e-05,7.96e-12,1.09e-09,2.63e-08,,0.431946,0.101149,,0.951,Vertebrata


All singh WGDs: 2695


Unnamed: 0,Ohnolog1,chr_ohno1,Position1,Ohnolog2,chr_ohno2,Position2,C_int,C_sav,Amphi,Sea_u,Fly,Worm
0,ENSG00000130762,1,3361100,ENSG00000142632,1,16411691,,,,Ohno,Ohno,


All makino WGDs: 9057
N WGD among Ensembl paralog pairs: 6215


Unnamed: 0,A1_ensembl,A2_ensembl,percent_matched_in_paralog,min_seq_id,WGD
0,ENSG00000092850,ENSG00000163060,0.293023,0.289655,True


In [11]:
# 2. Protein complex membership
# Map complex membership from Entrez IDs to Ensembl IDs and collect the set of complexes per gene
complex_map = pd.read_csv(file_complexes_map, index_col=0)
complex_map = pd.merge(complex_map, id_map, on='entrez_id').drop(columns=['entrez_id'])
complexes_per_gene = complex_map.groupby('ensembl_gene_id').agg({'complex_id': set}).reset_index()

# Merge with each gene in all pairs and fill in empty sets
df = pd.merge(pairs_annot[['A1_ensembl','A2_ensembl']], 
              complexes_per_gene.rename(columns={'ensembl_gene_id':'A1_ensembl','complex_id':'A1_complex_ids'}), how='left')
df = pd.merge(df, complexes_per_gene.rename(columns={'ensembl_gene_id':'A2_ensembl','complex_id':'A2_complex_ids'}), how='left')
# Genes without any complex come out of the left merges as NaN. Replace each NaN element-wise with
# its own empty set: assigning `set()` through .loc hands pandas a zero-length iterable, whose
# broadcasting depends on the installed pandas/numpy version and would share one set across rows.
for ids_col in ['A1_complex_ids', 'A2_complex_ids']:
    df[ids_col] = df[ids_col].apply(lambda ids: ids if isinstance(ids, set) else set())

# Calculate complex intersection and union
complex_membership = df.assign(shared_complexes=df.apply(lambda x: x.A1_complex_ids.intersection(x.A2_complex_ids), axis=1),
                                all_complexes=df.apply(lambda x: x.A1_complex_ids.union(x.A2_complex_ids), axis=1))

# A non-empty union means at least one member is in a complex; a non-empty intersection means
# the two members share at least one complex
complex_membership['either_in_complex'] = complex_membership.all_complexes.apply(lambda x: len(x) > 0)
complex_membership['in_same_complex'] = complex_membership.shared_complexes.apply(lambda x: len(x) > 0)
complex_membership = complex_membership[['A1_ensembl','A2_ensembl','either_in_complex','in_same_complex']]
display(complex_membership[:1])
print('Either in a complex:', complex_membership[complex_membership.either_in_complex].shape[0])
print('In same complex:', complex_membership[complex_membership.in_same_complex].shape[0])

# Attach the two flags to the annotated pairs (merge on the shared A1/A2 columns)
pairs_annot2 = pd.merge(pairs_annot, complex_membership, how='left').fillna(False)
pairs_annot2[:1]

Unnamed: 0,A1_ensembl,A2_ensembl,either_in_complex,in_same_complex
0,ENSG00000092850,ENSG00000163060,False,False


Either in a complex: 11334
In same complex: 676


Unnamed: 0,A1_ensembl,A2_ensembl,percent_matched_in_paralog,min_seq_id,WGD,either_in_complex,in_same_complex
0,ENSG00000092850,ENSG00000163060,0.293023,0.289655,True,False,False


In [13]:
# 3. Symbol and entrez id
# Load the HGNC table into a dedicated name. Rebinding `id_map` here would replace its
# `locus_type` column with `symbol`, so re-running the earlier locus-type merge after this
# cell would fail; keeping the two frames separate makes the cells safe to re-run in any order.
symbol_map = pd.read_csv(file_id_map, sep='\t', low_memory=False).dropna(subset=['ensembl_gene_id', 'entrez_id'])
symbol_map = symbol_map[['ensembl_gene_id', 'symbol', 'entrez_id']]

# Inner merges on the Ensembl ID of each pair member add its symbol and Entrez ID
pairs_annot3 = pd.merge(pairs_annot2, 
                        symbol_map.rename(columns={'ensembl_gene_id':'A1_ensembl','symbol':'A1_symbol','entrez_id':'A1_entrez'}))
pairs_annot3 = pd.merge(pairs_annot3,
                        symbol_map.rename(columns={'ensembl_gene_id':'A2_ensembl','symbol':'A2_symbol','entrez_id':'A2_entrez'}))
# Entrez IDs are read as floats (see the 1.0 in the id_map preview); store them as integers
pairs_annot3 = pairs_annot3.astype({'A1_entrez':'int', 'A2_entrez':'int'})
# The inner merges must neither drop nor duplicate any pair
assert(pairs_annot2.shape[0] == pairs_annot3.shape[0])
pairs_annot3[:2]

Unnamed: 0,A1_ensembl,A2_ensembl,percent_matched_in_paralog,min_seq_id,WGD,either_in_complex,in_same_complex,A1_symbol,A1_entrez,A2_symbol,A2_entrez
0,ENSG00000092850,ENSG00000163060,0.293023,0.289655,True,False,False,TEKT2,27285,TEKT4,150483
1,ENSG00000167858,ENSG00000163060,0.375598,0.36092,True,False,False,TEKT1,83659,TEKT4,150483


In [14]:
# 4. Whether each pair is the closest pair for its A1 gene (ties can give more than one)
# Highest identity observed for every A1 gene
max_per_gene = pairs_annot3.groupby('A1_ensembl')['percent_matched_in_paralog'].max().reset_index()
# Left-merge on (A1_ensembl, percent_matched_in_paralog): only rows that reach their gene's
# maximum find a partner, which the merge indicator reports as 'both'
pairs_annot4 = pairs_annot3.merge(max_per_gene, how='left', indicator='closest')
pairs_annot4['closest'] = pairs_annot4['closest'] == 'both'
pairs_annot4[:2]

Unnamed: 0,A1_ensembl,A2_ensembl,percent_matched_in_paralog,min_seq_id,WGD,either_in_complex,in_same_complex,A1_symbol,A1_entrez,A2_symbol,A2_entrez,closest
0,ENSG00000092850,ENSG00000163060,0.293023,0.289655,True,False,False,TEKT2,27285,TEKT4,150483,False
1,ENSG00000167858,ENSG00000163060,0.375598,0.36092,True,False,False,TEKT1,83659,TEKT4,150483,True


### Export files

In [15]:
# Build the export table as a new frame with the boolean WGD flag relabelled as 'WGD' / 'SSD'.
# `all_pairs = pairs_annot4` would only alias the frame, so relabelling in place would also
# overwrite pairs_annot4.WGD with strings; on a second run of this cell every label would
# then become 'SSD' because 'WGD' == True is False.
all_pairs = pairs_annot4.assign(WGD=np.where(pairs_annot4['WGD'], 'WGD', 'SSD'))
print('Num pairs:', all_pairs.shape[0], ', num A1s:', all_pairs.A1_ensembl.nunique())
all_pairs.to_csv(file_all_pairs)
all_pairs[:1]

Num pairs: 57076 , num A1s: 12138


Unnamed: 0,A1_ensembl,A2_ensembl,percent_matched_in_paralog,min_seq_id,WGD,either_in_complex,in_same_complex,A1_symbol,A1_entrez,A2_symbol,A2_entrez,closest
0,ENSG00000092850,ENSG00000163060,0.293023,0.289655,WGD,False,False,TEKT2,27285,TEKT4,150483,False


In [16]:
# Keep only the rows flagged as closest paralog of their A1 gene
is_closest = all_pairs['closest'].eq(True)
closest_pairs = all_pairs.loc[is_closest]

# A1 genes that occur more than once here have several equally close paralogs
has_ties = closest_pairs.duplicated(subset=['A1_ensembl'], keep=False)
multiple_closest = closest_pairs.loc[has_ties].sort_values('A1_ensembl')
print('Num A1s w/ multiple closest paralogs: {}'.format(multiple_closest.A1_ensembl.nunique()))
print('Num paralog pairs when keeping only closest paralog(s): {}'.format(closest_pairs.shape[0]))
closest_pairs.to_csv(file_closest_pairs)
closest_pairs.head(1)

Num A1s w/ multiple closest paralogs: 418
Num paralog pairs when keeping only closest paralog(s): 13107


Unnamed: 0,A1_ensembl,A2_ensembl,percent_matched_in_paralog,min_seq_id,WGD,either_in_complex,in_same_complex,A1_symbol,A1_entrez,A2_symbol,A2_entrez,closest
1,ENSG00000167858,ENSG00000163060,0.375598,0.36092,WGD,False,False,TEKT1,83659,TEKT4,150483,True


In [17]:
# Paralog summary:
# One row per A1 gene (the pair table is symmetric, so this covers every paralog gene) with its
# highest sequence identity and its number of paralogs.
# The duplication mode is 'WGD' if the gene occurs in any pair of the WGD union, else 'SSD'.
gene_keys = ['A1_ensembl', 'A1_symbol', 'A1_entrez']
summary_names = {'A2_ensembl': 'num_paralogs', 'A1_ensembl': 'ensembl_id',
                 'A1_symbol': 'symbol', 'A1_entrez': 'entrez_id'}
paralogs_summary = (
    all_pairs
    .groupby(gene_keys)
    .agg({'percent_matched_in_paralog': 'max', 'A2_ensembl': 'count'})
    .reset_index()
    .rename(columns=summary_names)
)
in_any_wgd_pair = paralogs_summary.ensembl_id.isin(all_wgds.Ohno1) | paralogs_summary.ensembl_id.isin(all_wgds.Ohno2)
paralogs_summary['WGD'] = in_any_wgd_pair.map({True: 'WGD', False: 'SSD'})
print('Num paralog genes:', paralogs_summary.shape[0])
paralogs_summary.to_csv(file_paralog_summary)
paralogs_summary[:2]

Num paralog genes: 12138


Unnamed: 0,ensembl_id,symbol,entrez_id,percent_matched_in_paralog,num_paralogs,WGD
0,ENSG00000000003,TSPAN6,7105,0.583673,8,WGD
1,ENSG00000000005,TNMD,64102,0.365931,1,SSD
