Trying to organize the paralog data

Current problems:

- the paralog sequence identity file is old and missing data
- the updated paralog sequence identity file uses different isoform IDs

There are four main categories in the Y2H data (the `y2h.category.value_counts()` output below also lists `lit_bm_*` and `rrs_*` categories):

- `tf_isoform_ppis`      -- isoforms
- `tf_paralog_ppis`      -- paralogs
- `non_paralog_control`  -- random other pairs of genes that are not paralogs
- `paralog_with_PDI`     -- **TODO: I don't know what this category is; find out**

In [1]:
import os

import numpy as np
from scipy import stats
from matplotlib import pyplot as plt
import pandas as pd
import seaborn as sns

import ccsblib

from data_loading import (load_isoform_and_paralog_y2h_data,
                          load_valid_isoform_clones,
                          load_paralog_pairs,
                          load_ppi_partner_categories)
from plotting import y2h_ppi_per_paralog_pair_plot

# Let rich DataFrame displays show up to 50 columns before eliding the middle.
pd.options.display.max_columns = 50

In [2]:
# Read the three tab-separated inputs in one pass, in this order:
#   df     -- the "tested" isoform / paralog / non-paralog table
#   aa_old -- the 2018-11-30 amino-acid sequence identity file
#   aa_new -- the 2019-07-01 amino-acid sequence identity file
df, aa_old, aa_new = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in (
        '../../data/a_tf_iso_paralog_nonparalogs_tested.tsv',
        '../../data/tf_AA_seq_identities/b_2018-11-30_AA_seq_identity_Paralog_comparisons_unique_acc.txt',
        '../../data/tf_AA_seq_identities/2019-07-01_AA_seq_identity_all_paralog_pair_comparisons.txt',
    )
)

In [12]:
# Preview the first five rows of the old identity table (note its ID format).
aa_old.head(n=5)

Unnamed: 0,gene1,ref_iso1,gene2,ref_iso2,comparison_type,AAseq_identity%
0,AEBP2,AEBP2|1/3|11E07,ATF2,ATF2|1/6|12H04,ref-ref,7.6
1,AEBP2,AEBP2|1/3|11E07,ATF3,ATF3|1/2|08B04,ref-ref,13.1
2,AEBP2,AEBP2|1/3|11E07,BLZF1,BLZF1|1/3|01A11,ref-ref,12.2
3,AEBP2,AEBP2|1/3|11E07,CREB3L3,CREB3L3|1/2|07C03,ref-ref,15.0
4,AEBP2,AEBP2|1/3|11E07,DDIT3,DDIT3|1/2|05B05,ref-ref,12.6


In [18]:
# Preview the first five rows of the new identity table (different isoform IDs).
aa_new.head(n=5)

Unnamed: 0,gene1_name,iso1,gene2_name,iso2,comparison_type,AAseq_identity%
0,AEBP2,AEBP2_p1_g03.TRINITY_DN3_c2_g1_i1,NR0B1,NR0B1_p2_g04.TRINITY_DN9_c7_g1_i1,paralogs,11.7
1,AEBP2,AEBP2_p1_g03.TRINITY_DN3_c2_g1_i1,NR0B1,NR0B1_pb061,paralogs,6.0
2,AEBP2,AEBP2_p1_g03.TRINITY_DN3_c2_g1_i1,TSC22D1,orfid8601,paralogs,7.6
3,AEBP2,AEBP2_p1_g03.TRINITY_DN3_c2_g1_i1,TSC22D1,TSC22D1_p3_g01.TRINITY_DN2_c2_g1_i1,paralogs,2.3
4,AEBP2,AEBP2_p1_g03.TRINITY_DN3_c2_g1_i1,TSC22D1,TSC22D1_p4_g01.TRINITY_DN0_c0_g1_i1,paralogs,7.6


In [2]:
# Fetch the project datasets through the data_loading helpers; the calls run
# in the same order as the names on the left-hand side.
y2h, isoforms, pairs = (
    load_isoform_and_paralog_y2h_data(),
    load_valid_isoform_clones(),
    load_paralog_pairs(),
)

In [4]:
# Print the (rows, columns) of the paralog-pair table, then show its first rows.
print(pairs.shape)
pairs.head(n=5)

(197, 4)


Unnamed: 0,tf_gene_a,tf_gene_b,is_paralog_pair,pct_aa_seq_identity
0,ZNF18,ZNF263,True,28.9
1,ZNF24,ZNF394,True,30.5
2,ZSCAN21,ZSCAN9,True,37.3
3,LHX8,LHX9,True,30.0
4,ELF2,ETV6,True,10.7


In [7]:
# All paralog pairs in which RXRB is either member of the pair.
pairs.loc[pairs[['tf_gene_a', 'tf_gene_b']].eq('RXRB').any(axis=1)]

Unnamed: 0,tf_gene_a,tf_gene_b,is_paralog_pair,pct_aa_seq_identity
15,NR2F2,RXRB,True,
18,RXRB,RXRG,True,
28,HNF4A,RXRB,True,
67,RXRA,RXRB,True,


In [6]:
# Number of Y2H rows in each category, most frequent first.
y2h['category'].value_counts()

tf_isoform_ppis        5959
tf_paralog_ppis        3836
non_paralog_control    2580
paralog_with_PDI       1063
lit_bm_isoforms         384
lit_bm_paralogs         311
rrs_isoforms            306
rrs_paralogs            285
Name: category, dtype: int64

In [None]:
# Draw one Y2H PPI figure per paralog pair and save each as
# '<tf_gene_a>_<tf_gene_b>.pdf' in the paralog_pairs figure directory.
#
# Relies on `os` being imported in the notebook's import cell.
fig_dir = '../../figures/paralog_pairs'

# The output directory does not depend on the pair, so create it once up
# front instead of on every iteration.
os.makedirs(fig_dir, exist_ok=True)

# Only the two gene-name columns are needed, so iterate over them directly
# rather than materialising a full row Series with iterrows().
for gene_a, gene_b in zip(pairs['tf_gene_a'], pairs['tf_gene_b']):
    fig, ax = plt.subplots(1, 1)
    try:
        y2h_ppi_per_paralog_pair_plot(gene_a, gene_b, y2h, ax=ax)
        # Save through the figure object so the file always contains this
        # figure, not whatever pyplot currently considers "current".
        fig.savefig(os.path.join(fig_dir, gene_a + '_' + gene_b + '.pdf'),
                    bbox_inches='tight')
    finally:
        # Always release the figure, even if plotting or saving fails, so a
        # long run over many pairs does not accumulate open figures.
        plt.close(fig)