Build a table of reference-vs-alternative isoform pairs to test the dominant-negative idea.

Variables:

- DBD intact?
- Dimerizing PPI intact?
- M1H - loss of activation
- Y1H - loss of PDIs

In [1]:
import numpy as np
import pandas as pd

from data_loading import (load_annotated_6k_collection,
                          load_y2h_isoform_data,
                          load_y1h_pdi_data,
                          load_m1h_activation_data,
                          load_valid_isoform_clones)
from isoform_pairwise_metrics import pairs_of_isoforms_comparison_table

In [2]:
# TODO: move to data loading
# TF family names that this notebook treats as dimerizing. Used below to flag
# genes (is_dimerizing_TF_family) and same-family Y2H pairs (is_dimerizing_ppi).
DIMERIZING_TF_FAMILIES = {
    'bHLH', 'bZIP', 'Nuclear receptor', 'E2F',
    'CENPB', 'MADS box', 'Grainyhead', 'SAND',
    'Rel', 'EBF1', 'STAT', 'IRF',
    'RFX', 'HSF', 'p53', 'AP-2',
    'GCM', 'BED ZF', 'MADF', 'ARID/BRIGHT',
    'Myb/SANT', 'SMAD',
}

In [3]:
# Annotated TF gene collection. Later cells iterate it with tfs.values() and read
# per-gene attributes such as cloned_reference_isoform and cloned_isoforms.
tfs = load_annotated_6k_collection()

reading from cache


In [4]:
# Assay tables used throughout the notebook:
#   m1h - activation data (M1H_rep1..3 columns are averaged further down)
#   y1h - PDI data (deduplicated and re-indexed by unique_acc further down)
#   y2h - PPI data with default loader filtering (an unfiltered copy is loaded later)
m1h = load_m1h_activation_data()
y1h = load_y1h_pdi_data()
y2h = load_y2h_isoform_data()

In [5]:
from isoform_pairwise_metrics import _pairs_comparison_table


def pairs_of_ref_vs_alt_isoforms_comparison_table(tfs, y2h=None, y1h=None, m1h=None):
    """Build one row per (reference isoform, alternative isoform) pair.

    For every gene in ``tfs`` the cloned reference isoform is paired with each
    other cloned isoform of the same gene; the pair whose names are equal
    (the reference with itself) is skipped.

    Parameters
    ----------
    tfs : mapping
        Mapping whose values are gene objects exposing ``name``,
        ``ensembl_gene_id``, ``tf_family``, ``cloned_reference_isoform``,
        ``cloned_isoforms``, ``cloned_MANE_select_isoform`` and
        ``splicing_categories(ref_name, alt_name)``.
    y2h, y1h, m1h : optional
        Accepted for call-site compatibility; not used by this function.
        The assay columns are added by later cells.

    Returns
    -------
    pandas.DataFrame
        Gene/isoform annotation columns plus one boolean-style column per
        splicing category. Empty (but with all columns) when there are no pairs.
    """
    # (splicing_categories key, output column) in output column order
    splicing_columns = [
        ("alternative N-terminal", "is_alternative_N_terminal"),
        ("alternative C-terminal", "is_alternative_C_terminal"),
        ("alternative internal exon", "is_alternative_internal_exon"),
        ("alternative 5' splice site", "is_alternative_5_prime_donor"),
        ("alternative 3' splice site", "is_alternative_3_prime_acceptor"),
        ("exon skipping", "is_exon_skipping"),
        ("mutually exclusive exons", "is_mutually_exclusive_exons"),
        ("intron retention", "is_intron_retention"),
    ]

    def _joined_transcript_ids(iso):
        # '|'-joined transcript IDs, or NaN when the isoform has none recorded
        ids = iso.ensembl_transcript_ids
        return '|'.join(ids) if ids is not None else np.nan

    iso_pairs = []
    for tf in tfs.values():
        ref = tf.cloned_reference_isoform
        for alt in tf.cloned_isoforms:
            if alt.name == ref.name:
                continue
            # Look the categories up once per pair instead of once per column.
            categories = tf.splicing_categories(ref.name, alt.name)
            iso_pairs.append((tf.name,
                              tf.ensembl_gene_id,
                              tf.tf_family,
                              tf.tf_family in DIMERIZING_TF_FAMILIES,
                              ref.clone_acc,
                              alt.clone_acc,
                              _joined_transcript_ids(ref),
                              _joined_transcript_ids(alt),
                              ref.is_novel_isoform(),
                              alt.is_novel_isoform(),
                              tf.cloned_MANE_select_isoform,
                              len(ref.aa_seq),
                              len(alt.aa_seq),
                              len(ref.exons),
                              len(alt.exons),
                              ) + tuple(categories[key] for key, _ in splicing_columns))
    iso_pairs = pd.DataFrame(
        data=iso_pairs,
        columns=["gene_symbol",
                 "Ensembl_gene_ID",
                 "family",
                 "is_dimerizing_TF_family",
                 "clone_acc_ref",
                 "clone_acc_alt",
                 "Ensembl_transcript_IDs_ref",
                 "Ensembl_transcript_IDs_alt",
                 "is_ref_novel_isoform",
                 "is_alt_novel_isoform",
                 "is_MANE_select_isoform_cloned",
                 "n_aa_ref",
                 "n_aa_alt",
                 "n_exons_ref",
                 "n_exons_alt",
                 ] + [column for _, column in splicing_columns]
    )
    return iso_pairs



# Base table: one row per (reference, alternative) isoform pair. The assay frames are
# passed through but the function does not use them; assay columns are added below.
df = pairs_of_ref_vs_alt_isoforms_comparison_table(tfs, y2h=y2h, y1h=y1h, m1h=m1h)

In [6]:
# DBD intact
from data_loading import load_dbd_accessions

def load_dbd_affected():
    """Summarise DBD disruption for each (gene, ref_iso, alt_iso) group.

    Reads the notebook-level ``tfs`` collection: every gene's
    ``aa_feature_disruption`` table (relative to its cloned reference isoform)
    is concatenated, restricted to features whose ``accession`` is listed by
    ``load_dbd_accessions()``, and aggregated per group.

    Returns
    -------
    pandas.DataFrame
        Columns ``gene``, ``ref_iso``, ``alt_iso``, ``dbd_insertion_n_aa``
        (summed ``insertion``) and ``dbd_pct_lost``
        (100 * (deletion + frameshift) / length, each summed over the group).
    """
    disruptions = pd.concat(
        [gene.aa_feature_disruption(gene.cloned_reference_isoform.name)
         for gene in tfs.values()]
    )
    disruptions['is_DBD'] = disruptions['accession'].isin(load_dbd_accessions())
    # One grouping of the DBD rows, reused for every aggregate below.
    per_pair = (disruptions.loc[disruptions['is_DBD'], :]
                .groupby(['gene', 'ref_iso', 'alt_iso']))
    n_aa_lost = per_pair[['deletion', 'frameshift']].sum().sum(axis=1)
    n_aa_total = per_pair['length'].sum()
    summary = (n_aa_lost / n_aa_total).to_frame(name='dbd_fraction')
    summary['dbd_insertion_n_aa'] = per_pair['insertion'].sum()
    summary = summary.reset_index()
    summary['dbd_pct_lost'] = summary['dbd_fraction'] * 100.
    return summary.drop(columns=['dbd_fraction'])


dbd = load_dbd_affected()
# Isoform name -> clone accession, built once and used for both ref and alt columns.
clone_acc_by_iso_name = {iso.name: iso.clone_acc
                         for tf in tfs.values()
                         for iso in tf.cloned_isoforms}
dbd['clone_acc_ref'] = dbd['ref_iso'].map(clone_acc_by_iso_name)
dbd['clone_acc_alt'] = dbd['alt_iso'].map(clone_acc_by_iso_name)
dbd = dbd.drop(columns=['gene', 'ref_iso', 'alt_iso'])
df = df.merge(dbd, how='left', on=['clone_acc_ref', 'clone_acc_alt'])
# Pairs without a DBD row keep NaN in dbd_pct_lost after the left merge;
# NaN > 0 evaluates to False, so they are reported as not affected.
df['dbd_affected'] = df['dbd_pct_lost'] > 0

In [7]:
from data_loading import load_seq_comparison_data

aa_ident = load_seq_comparison_data()
# Lookup key: the two clone accessions in sorted order, joined by "_".
pair_keys = df.apply(
    lambda pair: "_".join(sorted([pair["clone_acc_ref"], pair["clone_acc_alt"]])),
    axis=1,
)
df["aa_seq_pct_id"] = pair_keys.map(aa_ident)
# Every pair is expected to have an identity value; stop the notebook otherwise.
if df['aa_seq_pct_id'].isnull().any():
    raise UserWarning('Unexpected missing sequence similarity values')

In [8]:
# now do the assays

# y2h n_positive_ref n_positive_ref_filtered n_shared_ref_alt
# y2h n successfully tested in both
# M1H at least one isoform of gene has |activation| >= 2 fold

y2h_complete = load_y2h_isoform_data(require_at_least_one_ppi_per_isoform=False)
n_ppi = (y2h_complete.loc[(y2h_complete['Y2H_result'] == True), :]
         .groupby('ad_clone_acc')
         .size())
# Clones that have at least one negative Y2H call.
clones_with_negative_call = y2h_complete.loc[(y2h_complete['Y2H_result'] == False),
                                             'ad_clone_acc'].unique()
for role in ('ref', 'alt'):
    acc_col = 'clone_acc_' + role
    count_col = 'n_positive_PPI_' + role
    df[count_col] = df[acc_col].map(n_ppi)
    # Mapping leaves NaN for clones with no positive call. Clones that do have
    # negative calls get an explicit 0 here; the rest stay NaN.
    needs_zero = df[count_col].isnull() & df[acc_col].isin(clones_with_negative_call)
    df.loc[needs_zero, count_col] = 0

In [9]:
from isoform_pairwise_metrics import (
 number_tested_partners,
  number_shared_partners,
  jaccard_index)

def ppi_metric(row, data, function, suffixes=('_a', '_b')):
    """Apply a set-based metric to the Y2H partners of two clones.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``"clone_acc" + suffixes[0]`` and ``"clone_acc" + suffixes[1]``.
    data : pandas.DataFrame
        Needs the columns ``ad_clone_acc``, ``db_gene_symbol`` and ``Y2H_result``.
    function : callable
        Called as ``function(partners_first, partners_second)`` with the sets of
        ``db_gene_symbol`` values that are positive for each clone.
    suffixes : tuple of str
        Suffixes selecting the two clone accession fields of ``row``.

    Returns
    -------
    The value returned by ``function``, or NaN when either clone has no rows in
    ``data`` or no partner remains after filtering.
    """
    first_acc = row["clone_acc" + suffixes[0]]
    second_acc = row["clone_acc" + suffixes[1]]
    in_pair = data["ad_clone_acc"].isin([first_acc, second_acc])
    # partners as rows, the two clones as columns
    calls = data.loc[in_pair, :].pivot(
        index="db_gene_symbol", columns="ad_clone_acc", values="Y2H_result"
    )
    if not (first_acc in calls.columns and second_acc in calls.columns):
        return np.nan
    # remove any partner with AA / NC / NS / NaN in either
    has_both_calls = calls.notnull().all(axis=1)
    calls = calls.loc[has_both_calls, :].astype(int).astype(bool)
    # remove partners that tested negative in both
    calls = calls.loc[calls.any(axis=1), :]
    if calls.shape[0] == 0:
        return np.nan
    partners_first = set(calls.index[calls[first_acc]].values)
    partners_second = set(calls.index[calls[second_acc]].values)
    return function(partners_first, partners_second)

# Output column -> set metric handed to ppi_metric (ref partners first, alt second).
# Insertion order of the dict is the order in which the columns are added.
ppi_pair_metrics = {
    'n_PPI_successfully_tested_in_ref_and_alt': number_tested_partners,
    'n_positive_PPI_ref_filtered': lambda a, b: len(a),
    'n_positive_PPI_alt_filtered': lambda a, b: len(b),
    'n_shared_PPI': number_shared_partners,
    'PPI_jaccard': jaccard_index,
}
for column_name, metric in ppi_pair_metrics.items():
    df[column_name] = df.apply(
        ppi_metric,
        data=y2h_complete,
        suffixes=("_ref", "_alt"),
        function=metric,
        axis=1,
    )

In [10]:
from data_loading import load_tf_families
from data_loading import load_human_tf_db
from data_loading import load_ppi_partner_categories


ppi_partner_cats = load_ppi_partner_categories()
tfdb = load_human_tf_db()
fam = load_tf_families()
# Annotate the (filtered) y2h table in place with the TF family of each side.
y2h['ad_tf_family'] = y2h['ad_gene_symbol'].map(fam)
y2h['db_tf_family'] = y2h['db_gene_symbol'].map(fam)
# Dimerizing PPI: AD protein is in a dimerizing family and the DB partner is in the same family.
y2h['is_dimerizing_ppi'] = (y2h['ad_tf_family'].isin(DIMERIZING_TF_FAMILIES) &
                        (y2h['ad_tf_family'] == y2h['db_tf_family']))
# TF-TF PPI: the DB partner's symbol appears in the 'HGNC symbol' column of tfdb.
y2h['is_tf_tf_ppi'] = y2h['db_gene_symbol'].isin(tfdb['HGNC symbol'].unique())

# of reference dimer PPI, are all lost, some lost, none lost
def ppi_pertubation(row, ppi):
    """Label how the alternative isoform's PPIs compare with the reference's.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``clone_acc_ref`` and ``clone_acc_alt``.
    ppi : pandas.DataFrame
        Needs the columns ``ad_clone_acc``, ``db_gene_symbol`` and ``Y2H_result``.

    Returns
    -------
    str or float
        NaN when either clone has no rows in ``ppi`` or when no partner is
        positive for at least one clone after dropping partners with a missing
        call. Otherwise one of:
        ``'retains all'`` - every remaining partner is positive for both clones;
        ``'loses all'``   - the alternative has no positive partner;
        ``'gains some'``  - the alternative has more positives than the reference;
        ``'loses some'``  - anything else (including equal counts with different partners).
    """
    ref_acc = row['clone_acc_ref']
    alt_acc = row['clone_acc_alt']
    tested_accs = ppi['ad_clone_acc'].unique()
    if not (ref_acc in tested_accs and alt_acc in tested_accs):
        return np.nan
    selected = ppi.loc[ppi['ad_clone_acc'].isin([ref_acc, alt_acc]),
                       ['ad_clone_acc', 'db_gene_symbol', 'Y2H_result']]
    # partners as rows, the two clones as columns; keep partners called in both
    calls = selected.pivot(index='db_gene_symbol',
                           columns='ad_clone_acc',
                           values='Y2H_result').dropna()
    # ignore partners negative for both clones
    calls = calls.loc[calls.any(axis=1), :]
    if calls.shape[0] == 0:
        return np.nan
    if calls.all().all():
        return 'retains all'
    if not calls[alt_acc].any():
        return 'loses all'
    if calls[alt_acc].sum() > calls[ref_acc].sum():
        return 'gains some'
    return 'loses some'

# Named row masks over y2h, so each partner subset below is defined once.
is_dimer_ppi = y2h['is_dimerizing_ppi']
is_tf_tf_ppi = y2h['is_tf_tf_ppi']
cofactor_partners = ppi_partner_cats.loc[ppi_partner_cats['category'] == 'cofactor',
                                         'partner'].unique()
# NOTE(review): 'protein traffiking' is kept exactly as written; presumably it has to
# match the category labels in ppi_partner_cats - verify against the data.
signalling_categories = ['cell cycle', 'protein traffiking', 'protein turnover',
                         'signaling', 'cell-cell signaling']
signalling_partners = ppi_partner_cats.loc[
    ppi_partner_cats['category'].isin(signalling_categories), 'partner'].unique()
is_cofactor_partner = y2h['db_gene_symbol'].isin(cofactor_partners)
is_signalling_partner = y2h['db_gene_symbol'].isin(signalling_partners)

# Output column -> y2h row mask. Cofactor excludes TF-TF pairs; signalling
# additionally excludes cofactor partners.
ppi_subsets = {
    'dimer_ppi': is_dimer_ppi,
    'other_than_dimer_ppi': ~is_dimer_ppi,
    'tf_tf_ppi': is_tf_tf_ppi,
    'tf_cofactor_ppi': is_cofactor_partner & ~is_tf_tf_ppi,
    'tf_signalling_ppi': is_signalling_partner & ~is_tf_tf_ppi & ~is_cofactor_partner,
}
for column_name, subset_mask in ppi_subsets.items():
    df[column_name] = df.apply(ppi_pertubation,
                               ppi=y2h.loc[subset_mask, :],
                               axis=1)

In [11]:
# TODO: move code to data loading
# Keep only the first row for each unique_acc (drop_duplicates default).
y1h = y1h.drop_duplicates(subset=['unique_acc'])

In [16]:
# Preview: columns are tf, unique_acc, then one boolean column per bait.
y1h.head()

Unnamed: 0,tf,unique_acc,HS1022,HS1043,HS1067,HS1139,HS1142,HS118,HS1181,HS1185,HS1199,HS1208,HS1242,HS1271,HS129,HS1315,HS1329,HS1339,HS135,HS1419,HS1436,HS1480,HS149,HS151,HS1516,HS1551,HS1597,HS162,HS1657,HS169,HS170,HS1707,HS174,HS181,HS1811,HS1833,HS1867,HS189,HS194,HS20,HS204,HS2048,HS205,HS2062,HS2081,HS213,HS215,HS218,HS238,HS240,...,HS816,HS818,HS836,HS863,HS864,HS865,HS878,HS930,HS932,HS935,HS974,HS978,HS990,MUT_112,MUT_115,MUT_116,MUT_118,MUT_119,MUT_120,MUT_129,MUT_137,MUT_142,MUT_143,MUT_144,MUT_15,MUT_156,MUT_158,MUT_162,MUT_163,MUT_166,MUT_17,MUT_187,MUT_19,MUT_193,MUT_20,MUT_207,MUT_214,MUT_218,MUT_219,MUT_260,MUT_281,MUT_30,MUT_32,MUT_36,MUT_37,MUT_40,MUT_41,MUT_46,MUT_64,MUT_75
162,CREB1,CREB1|1/2|02E01,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
0,CREB1,CREB1|2/2|01F12,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,DLX1,DLX1|1/2|07A12,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
174,DLX1,DLX1|2/2|07E09,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,DLX4,DLX4|1/3|11A04,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [18]:
# Index by clone accession: pdi_metric selects rows via data.index and skips the
# first remaining column (tf) with data.columns[1:].
y1h = y1h.set_index('unique_acc')

In [20]:
def pdi_metric(row, data, function, suffixes=('_a', '_b')):
    """Apply a set-based metric to the positive Y1H baits of two clones.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``"clone_acc" + suffixes[0]`` and ``"clone_acc" + suffixes[1]``.
    data : pandas.DataFrame
        Indexed by clone accession. The first column is skipped
        (``data.columns[1:]``); the remaining columns are used as boolean bait calls.
    function : callable
        Called as ``function(baits_first, baits_second)`` with the sets of
        column names that are True for each clone.
    suffixes : tuple of str
        Suffixes selecting the two clone accession fields of ``row``.

    Returns
    -------
    The value returned by ``function``, or NaN when fewer than two matching rows
    are found or no bait is positive for either clone.
    """
    first_acc = row["clone_acc" + suffixes[0]]
    second_acc = row["clone_acc" + suffixes[1]]
    is_member = (data.index == first_acc) | (data.index == second_acc)
    calls = data.loc[is_member, data.columns[1:]].copy()
    if calls.shape[0] < 2:
        return np.nan
    # order rows as (first, second) and keep only baits positive for at least one clone
    positive_somewhere = calls.any(axis=0)
    calls = calls.loc[[first_acc, second_acc], positive_somewhere]
    if calls.shape[1] == 0:
        return np.nan
    baits_first = set(calls.columns[calls.iloc[0]])
    baits_second = set(calls.columns[calls.iloc[1]])
    return function(baits_first, baits_second)

# Number of True bait columns for each clone (the tf column is dropped before summing).
n_pdi = y1h.drop(columns=['tf']).sum(axis=1)
for role in ('ref', 'alt'):
    df['n_positive_PDI_' + role] = df['clone_acc_' + role].map(n_pdi)

# Output column -> set metric handed to pdi_metric (ref baits first, alt second).
# Insertion order of the dict is the order in which the columns are added.
pdi_pair_metrics = {
    'n_PDI_successfully_tested_in_ref_and_alt': number_tested_partners,
    'n_positive_PDI_ref_filtered': lambda a, b: len(a),
    'n_positive_PDI_alt_filtered': lambda a, b: len(b),
    'n_shared_PDI': number_shared_partners,
    'PDI_jaccard': jaccard_index,
}
for column_name, metric in pdi_pair_metrics.items():
    df[column_name] = df.apply(
        pdi_metric,
        data=y1h,
        suffixes=("_ref", "_alt"),
        function=metric,
        axis=1,
    )

In [22]:
# Mean of the three replicates and its absolute value, stored on m1h itself.
replicate_columns = ['M1H_rep1', 'M1H_rep2', 'M1H_rep3']
m1h['mean'] = m1h[replicate_columns].mean(axis=1)
m1h['abs_mean'] = m1h['mean'].abs()
# Per gene: does any isoform reach |mean| >= 1?
# NOTE(review): the column name says "2fold", so values are presumably log2 - confirm.
gene_reaches_threshold = m1h.groupby('gene')['abs_mean'].max() >= 1
df['at_least_one_isoform_in_gene_abs_activation_gte_2fold'] = df['gene_symbol'].map(gene_reaches_threshold)
mean_by_clone = m1h.set_index('clone_acc')['mean']
df['activation_ref'] = df['clone_acc_ref'].map(mean_by_clone)
df['activation_alt'] = df['clone_acc_alt'].map(mean_by_clone)
# alt minus ref
df['activation_fold_change_log2'] = df['activation_alt'] - df['activation_ref']

In [23]:
# Write the assembled ref-vs-alt table as tab-separated text, without the row index.
df.to_csv('../output/TF-iso_ref-vs-alt.tsv', sep='\t', index=False)

In [4]:
# NOTE: rebinds df, replacing the ref-vs-alt table built above with the table returned
# by pairs_of_isoforms_comparison_table (clone_acc_a / clone_acc_b columns are used below).
df = pairs_of_isoforms_comparison_table(isoforms=load_valid_isoform_clones(),
                                        m1h=load_m1h_activation_data(),
                                        y1h=load_y1h_pdi_data(),
                                        y2h=load_y2h_isoform_data())

In [5]:
# Clone accessions of each gene's cloned reference isoform.
ref_clones = {tf.cloned_reference_isoform.clone_acc for tf in tfs.values()}
# Keep only pairs in which at least one member is a reference clone.
df = df.loc[df['clone_acc_a'].isin(ref_clones) | df['clone_acc_b'].isin(ref_clones)]

In [6]:
# Orient each pair: clone_acc_ref holds the reference clone, clone_acc_alt the other one.
# The previous form multiplied string columns by boolean masks and concatenated the
# results, which only works on object-dtype columns. np.where selects directly and gives
# the same result whenever exactly one member of the pair is a reference clone.
a_is_ref = df['clone_acc_a'].isin(ref_clones)
df['clone_acc_ref'] = np.where(a_is_ref, df['clone_acc_a'], df['clone_acc_b'])
df['clone_acc_alt'] = np.where(a_is_ref, df['clone_acc_b'], df['clone_acc_a'])

In [10]:
from data_loading import load_tf_families

# Reload y2h (default loader filtering) and re-annotate the TF family of each side.
y2h = load_y2h_isoform_data()
fam = load_tf_families()
# NOTE: this is not complete. See issue #69
y2h['ad_tf_family'] = y2h['ad_gene_symbol'].map(fam)
y2h['db_tf_family'] = y2h['db_gene_symbol'].map(fam)
# Dimerizing PPI: AD protein is in a dimerizing family and the DB partner is in the same family.
y2h['is_dimerizing_ppi'] = (y2h['ad_tf_family'].isin(DIMERIZING_TF_FAMILIES) &
                        (y2h['ad_tf_family'] == y2h['db_tf_family']))

In [11]:
# of reference dimer PPI, are all lost, some lost, none lost
def ppi_pertubation(row, ppi):
    """Label how the alternative isoform's PPIs compare with the reference's.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``clone_acc_ref`` and ``clone_acc_alt``.
    ppi : pandas.DataFrame
        Needs the columns ``ad_clone_acc``, ``db_gene_symbol`` and ``Y2H_result``.

    Returns
    -------
    str or float
        NaN when either clone has no rows in ``ppi`` or when no partner is
        positive for at least one clone after dropping partners with a missing
        call. Otherwise one of:
        ``'retains all'`` - every remaining partner is positive for both clones;
        ``'loses all'``   - the alternative has no positive partner;
        ``'gains some'``  - the alternative has more positives than the reference;
        ``'loses some'``  - anything else (including equal counts with different partners).
    """
    ref_acc = row['clone_acc_ref']
    alt_acc = row['clone_acc_alt']
    tested_accs = ppi['ad_clone_acc'].unique()
    if not (ref_acc in tested_accs and alt_acc in tested_accs):
        return np.nan
    selected = ppi.loc[ppi['ad_clone_acc'].isin([ref_acc, alt_acc]),
                       ['ad_clone_acc', 'db_gene_symbol', 'Y2H_result']]
    # partners as rows, the two clones as columns; keep partners called in both
    calls = selected.pivot(index='db_gene_symbol',
                           columns='ad_clone_acc',
                           values='Y2H_result').dropna()
    # ignore partners negative for both clones
    calls = calls.loc[calls.any(axis=1), :]
    if calls.shape[0] == 0:
        return np.nan
    if calls.all().all():
        return 'retains all'
    if not calls[alt_acc].any():
        return 'loses all'
    if calls[alt_acc].sum() > calls[ref_acc].sum():
        return 'gains some'
    return 'loses some'

# Classify each pair separately on dimerizing PPIs and on all remaining PPIs.
is_dimer_ppi = y2h['is_dimerizing_ppi']
for column_name, subset_mask in (('dimer_ppi', is_dimer_ppi),
                                 ('other_ppi', ~is_dimer_ppi)):
    df[column_name] = df.apply(ppi_pertubation,
                               ppi=y2h.loc[subset_mask, :],
                               axis=1)

In [13]:
# Number of pairs per dimer_ppi label (pairs with NaN are not counted by value_counts).
df['dimer_ppi'].value_counts()

retains all    23
loses some      4
loses all       3
gains some      1
Name: dimer_ppi, dtype: int64

In [117]:
# Pairs labelled 'loses some' on dimerizing PPIs.
df.loc[(df['dimer_ppi'] == 'loses some'), :]

Unnamed: 0_level_0,tf_gene_symbol,clone_acc_a,clone_acc_b,ppi_n_tested,ppi_n_shared,ppi_n_min,ppi_n_min_diff,ppi_jaccard,ppi_simpson,ppi_n_diff,ppi_delta_n,pdi_n_tested,pdi_n_shared,pdi_n_min,pdi_n_min_diff,pdi_jaccard,pdi_simpson,pdi_n_diff,pdi_delta_n,m1h_min,m1h_max,activation_fold_change,activation_abs_fold_change,aa_seq_pct_id,clone_acc_ref,clone_acc_alt,dimer_ppi,other_ppi
pair,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
ATF2|1/6|12H04_ATF2|4/6|09B05,ATF2,ATF2|1/6|12H04,ATF2|4/6|09B05,3.0,1.0,1.0,0.0,0.333333,1.0,2.0,-2.0,,,,,,,,,-0.125596,0.361183,-0.486778,0.486778,45.7,ATF2|1/6|12H04,ATF2|4/6|09B05,loses some,loses all
E2F3|2/4|10A08_E2F3|3/4|10B08,E2F3,E2F3|2/4|10A08,E2F3|3/4|10B08,5.0,2.0,2.0,0.0,0.4,1.0,3.0,-3.0,,,,,,,,,-0.989488,6.444157,-7.433645,7.433645,63.4,E2F3|2/4|10A08,E2F3|3/4|10B08,loses some,loses all
TCF12|1/3|07D07_TCF12|3/3|07B07,TCF12,TCF12|1/3|07D07,TCF12|3/3|07B07,32.0,14.0,14.0,0.0,0.4375,1.0,18.0,-18.0,,,,,,,,,2.668221,5.255564,-2.587343,2.587343,68.6,TCF12|1/3|07D07,TCF12|3/3|07B07,loses some,loses some
TCF4|1/9|07E01_TCF4|6/9|07H03,TCF4,TCF4|1/9|07E01,TCF4|6/9|07H03,156.0,39.0,39.0,0.0,0.25,1.0,117.0,-117.0,,,,,,,,,2.107094,3.705451,-1.598357,1.598357,72.0,TCF4|1/9|07E01,TCF4|6/9|07H03,loses some,loses some


In [149]:
# Pairs labelled 'loses all' on dimerizing PPIs.
df.loc[(df['dimer_ppi'] == 'loses all'), :]

Unnamed: 0,tf_gene_symbol,clone_acc_a,clone_acc_b,ppi_n_tested,ppi_n_shared,ppi_n_min,ppi_n_min_diff,ppi_jaccard,ppi_simpson,ppi_n_diff,ppi_delta_n,pdi_n_tested,pdi_n_shared,pdi_n_min,pdi_n_min_diff,pdi_jaccard,pdi_simpson,pdi_n_diff,pdi_delta_n,m1h_min,m1h_max,activation_fold_change,activation_abs_fold_change,aa_seq_pct_id,clone_acc_ref,clone_acc_alt,dimer_ppi,other_ppi,gene,ref_iso,alt_iso,dbd_insertion_n_aa,dbd_pct_lost
10,ATF2,ATF2|1/6|12H04,ATF2|5/6|09A05,3.0,1.0,1.0,0.0,0.333333,1.0,2.0,-2.0,,,,,,,,,0.361183,3.668095,3.306912,3.306912,41.4,ATF2|1/6|12H04,ATF2|5/6|09A05,loses all,retains all,ATF2,ATF2-1,ATF2-5,0.0,100.0
11,ATF2,ATF2|1/6|12H04,ATF2|6/6|09H03,3.0,1.0,1.0,0.0,0.333333,1.0,2.0,-2.0,,,,,,,,,0.110722,0.361183,-0.250461,0.250461,22.4,ATF2|1/6|12H04,ATF2|6/6|09H03,loses all,retains all,ATF2,ATF2-1,ATF2-6,0.0,71.875
173,NR4A2,NR4A2|1/3|06E07,NR4A2|2/3|06H07,4.0,1.0,2.0,1.0,0.25,0.5,3.0,-1.0,,,,,,,,,2.557452,3.232589,-0.675137,0.675137,75.9,NR4A2|1/3|06E07,NR4A2|2/3|06H07,loses all,loses some,NR4A2,NR4A2-1,NR4A2-2,0.0,0.0


In [120]:
# Pairs labelled 'gains some' on dimerizing PPIs.
df.loc[(df['dimer_ppi'] == 'gains some'), :]

Unnamed: 0_level_0,tf_gene_symbol,clone_acc_a,clone_acc_b,ppi_n_tested,ppi_n_shared,ppi_n_min,ppi_n_min_diff,ppi_jaccard,ppi_simpson,ppi_n_diff,ppi_delta_n,pdi_n_tested,pdi_n_shared,pdi_n_min,pdi_n_min_diff,pdi_jaccard,pdi_simpson,pdi_n_diff,pdi_delta_n,m1h_min,m1h_max,activation_fold_change,activation_abs_fold_change,aa_seq_pct_id,clone_acc_ref,clone_acc_alt,dimer_ppi,other_ppi
pair,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
DDIT3|1/2|05B05_DDIT3|2/2|05G05,DDIT3,DDIT3|1/2|05B05,DDIT3|2/2|05G05,13.0,12.0,12.0,0.0,0.923077,1.0,1.0,-1.0,,,,,,,,,,,,,88.0,DDIT3|2/2|05G05,DDIT3|1/2|05B05,gains some,retains all


In [158]:
# retains dimer PPI and loses PDI or M1H or DBD
# NOTE: this cell only applies the M1H criterion (activation_abs_fold_change >= 1);
# PDI and DBD are not checked here. .shape gives (n_rows, n_columns).
df.loc[(df['dimer_ppi'] == 'retains all') & 
       (df['activation_abs_fold_change'] >= 1)].shape

(7, 33)

In [157]:
# Pairs that retain all dimerizing PPIs and meet none of the other criteria:
# activation change >= 1, any PDI difference, any DBD loss, or loss of other PPIs.
# Comparisons against NaN are False, so pairs with missing assay values also pass this filter.
df.loc[(df['dimer_ppi'] == 'retains all') & 
       ~(
       (df['activation_abs_fold_change'] >= 1) |
       (df['pdi_n_diff'].abs() > 0) |
       (df['dbd_pct_lost'] > 0) |
       (df['other_ppi'].isin(['loses some', 'loses all']))
       )]

Unnamed: 0,tf_gene_symbol,clone_acc_a,clone_acc_b,ppi_n_tested,ppi_n_shared,ppi_n_min,ppi_n_min_diff,ppi_jaccard,ppi_simpson,ppi_n_diff,ppi_delta_n,pdi_n_tested,pdi_n_shared,pdi_n_min,pdi_n_min_diff,pdi_jaccard,pdi_simpson,pdi_n_diff,pdi_delta_n,m1h_min,m1h_max,activation_fold_change,activation_abs_fold_change,aa_seq_pct_id,clone_acc_ref,clone_acc_alt,dimer_ppi,other_ppi,gene,ref_iso,alt_iso,dbd_insertion_n_aa,dbd_pct_lost
4,ARNTL2,ARNTL2|1/5|10H01,ARNTL2|2/5|12C06,2.0,2.0,2.0,0.0,1.0,1.0,0.0,0.0,,,,,,,,,,,,,94.6,ARNTL2|2/5|12C06,ARNTL2|1/5|10H01,retains all,,ARNTL2,ARNTL2-2,ARNTL2-1,34.0,0.0
5,ARNTL2,ARNTL2|2/5|12C06,ARNTL2|3/5|10B01,2.0,2.0,2.0,0.0,1.0,1.0,0.0,0.0,,,,,,,,,0.22533,0.363099,-0.137769,0.137769,87.0,ARNTL2|2/5|12C06,ARNTL2|3/5|10B01,retains all,,ARNTL2,ARNTL2-2,ARNTL2-3,34.0,0.0
155,NFE2L2,NFE2L2|1/3|12H08,NFE2L2|2/3|01F05,4.0,4.0,4.0,0.0,1.0,1.0,0.0,0.0,,,,,,,,,8.724385,9.265946,0.541562,0.541562,97.4,NFE2L2|1/3|12H08,NFE2L2|2/3|01F05,retains all,retains all,NFE2L2,NFE2L2-1,NFE2L2-2,0.0,0.0
227,SMAD4,SMAD4|1/2|01A02,SMAD4|2/2|01F07,4.0,4.0,4.0,0.0,1.0,1.0,0.0,0.0,,,,,,,,,,,,,87.5,SMAD4|2/2|01F07,SMAD4|1/2|01A02,retains all,retains all,SMAD4,SMAD4-2,SMAD4-1,0.0,0.0


In [115]:
# Pairs that retain all dimerizing PPIs but differ in at least one PDI.
df.loc[(df['dimer_ppi'] == 'retains all') & 
       (df['pdi_n_diff'].abs() > 0)]

Unnamed: 0_level_0,tf_gene_symbol,clone_acc_a,clone_acc_b,ppi_n_tested,ppi_n_shared,ppi_n_min,ppi_n_min_diff,ppi_jaccard,ppi_simpson,ppi_n_diff,ppi_delta_n,pdi_n_tested,pdi_n_shared,pdi_n_min,pdi_n_min_diff,pdi_jaccard,pdi_simpson,pdi_n_diff,pdi_delta_n,m1h_min,m1h_max,activation_fold_change,activation_abs_fold_change,aa_seq_pct_id,clone_acc_ref,clone_acc_alt,dimer_ppi,other_ppi
pair,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
TCF4|1/9|07E01_TCF4|2/9|07E03,TCF4,TCF4|1/9|07E01,TCF4|2/9|07E03,157.0,156.0,156.0,0.0,0.993631,1.0,1.0,1.0,18.0,16.0,16.0,0.0,0.888889,1.0,2.0,-2.0,3.705451,3.808061,0.10261,0.10261,99.7,TCF4|1/9|07E01,TCF4|2/9|07E03,retains all,gains some
TCF4|1/9|07E01_TCF4|4/9|07G02,TCF4,TCF4|1/9|07E01,TCF4|4/9|07G02,155.0,153.0,153.0,0.0,0.987097,1.0,2.0,-2.0,19.0,15.0,16.0,1.0,0.789474,0.9375,4.0,-2.0,3.705451,4.405441,0.69999,0.69999,99.3,TCF4|1/9|07E01,TCF4|4/9|07G02,retains all,loses some
TCF4|1/9|07E01_TCF4|8/9|07A05,TCF4,TCF4|1/9|07E01,TCF4|8/9|07A05,156.0,73.0,73.0,0.0,0.467949,1.0,83.0,-83.0,20.0,17.0,18.0,1.0,0.85,0.944444,3.0,1.0,3.705451,4.161332,0.455881,0.455881,72.0,TCF4|1/9|07E01,TCF4|8/9|07A05,retains all,loses some


In [151]:
# retains dimer PPI loses other PPI
# (other_ppi labelled 'loses some' or 'loses all')
df.loc[(df['dimer_ppi'] == 'retains all') & 
       (df['other_ppi'].isin(['loses some', 'loses all']))]

Unnamed: 0,tf_gene_symbol,clone_acc_a,clone_acc_b,ppi_n_tested,ppi_n_shared,ppi_n_min,ppi_n_min_diff,ppi_jaccard,ppi_simpson,ppi_n_diff,ppi_delta_n,pdi_n_tested,pdi_n_shared,pdi_n_min,pdi_n_min_diff,pdi_jaccard,pdi_simpson,pdi_n_diff,pdi_delta_n,m1h_min,m1h_max,activation_fold_change,activation_abs_fold_change,aa_seq_pct_id,clone_acc_ref,clone_acc_alt,dimer_ppi,other_ppi,gene,ref_iso,alt_iso,dbd_insertion_n_aa,dbd_pct_lost
1,ARNT2,ARNT2|1/6|08C12,ARNT2|2/6|09E01,10.0,6.0,6.0,0.0,0.6,1.0,4.0,-4.0,,,,,,,,,2.337698,4.432498,-2.0948,2.0948,93.0,ARNT2|1/6|08C12,ARNT2|2/6|09E01,retains all,loses some,ARNT2,ARNT2-1,ARNT2-2,0.0,0.0
8,ATF2,ATF2|1/6|12H04,ATF2|3/6|09C05,3.0,2.0,2.0,0.0,0.666667,1.0,1.0,-1.0,,,,,,,,,-0.344877,0.361183,-0.706059,0.706059,86.4,ATF2|1/6|12H04,ATF2|3/6|09C05,retains all,loses all,ATF2,ATF2-1,ATF2-3,0.0,0.0
14,CREB5,CREB5|1/3|08E11,CREB5|2/3|08A12,10.0,1.0,1.0,0.0,0.1,1.0,9.0,-9.0,,,,,,,,,-0.600782,1.741278,-2.34206,2.34206,71.1,CREB5|1/3|08E11,CREB5|2/3|08A12,retains all,loses all,CREB5,CREB5-1,CREB5-2,0.0,0.0
47,ESRRA,ESRRA|1/2|09B12,ESRRA|2/2|01H06,13.0,10.0,10.0,0.0,0.769231,1.0,3.0,-3.0,,,,,,,,,-0.02188,0.656687,-0.678567,0.678567,66.2,ESRRA|1/2|09B12,ESRRA|2/2|01H06,retains all,loses some,ESRRA,ESRRA-1,ESRRA-2,0.0,95.714286
48,ESRRG,ESRRG|1/2|09B10,ESRRG|2/2|09C10,10.0,8.0,9.0,1.0,0.8,0.888889,2.0,0.0,,,,,,,,,1.379506,3.335048,1.955542,1.955542,92.3,ESRRG|1/2|09B10,ESRRG|2/2|09C10,retains all,loses some,ESRRG,ESRRG-1,ESRRG-2,0.0,0.0
59,FOS,FOS|1/4|03D05,FOS|2/4|03E05,7.0,6.0,6.0,0.0,0.857143,1.0,1.0,-1.0,,,,,,,,,6.269662,6.825441,-0.555779,0.555779,61.1,FOS|1/4|03D05,FOS|2/4|03E05,retains all,loses some,FOS,FOS-1,FOS-2,0.0,22.222222
61,FOSB,FOSB|1/3|05D03,FOSB|2/3|05E04,8.0,3.0,3.0,0.0,0.375,1.0,5.0,-5.0,,,,,,,,,4.731216,7.024391,-2.293175,2.293175,70.1,FOSB|1/3|05D03,FOSB|2/3|05E04,retains all,loses some,FOSB,FOSB-1,FOSB-2,0.0,0.0
82,GRHL3,GRHL3|3/7|08G09,GRHL3|4/7|08F09,2.0,1.0,1.0,0.0,0.5,1.0,1.0,-1.0,,,,,,,,,0.041061,0.436213,-0.395152,0.395152,92.0,GRHL3|3/7|08G09,GRHL3|4/7|08F09,retains all,loses all,GRHL3,GRHL3-3,GRHL3-4,0.0,0.0
201,PPARG,PPARG|1/4|10G02,PPARG|2/4|10F03,6.0,5.0,5.0,0.0,0.833333,1.0,1.0,-1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,2.212049,2.423347,-0.211298,0.211298,94.5,PPARG|1/4|10G02,PPARG|2/4|10F03,retains all,loses some,PPARG,PPARG-1,PPARG-2,0.0,0.0
216,RFX4,RFX4|1/3|10C04,RFX4|2/3|10D02,8.0,5.0,5.0,0.0,0.625,1.0,3.0,-3.0,,,,,,,,,-1.550829,-0.890237,-0.660592,0.660592,95.9,RFX4|1/3|10C04,RFX4|2/3|10D02,retains all,loses some,RFX4,RFX4-1,RFX4-2,0.0,0.0


In [14]:
# DBD intact
from data_loading import load_dbd_accessions

def load_dbd_affected():
    """Summarise DBD disruption for each (gene, ref_iso, alt_iso) combination.

    Concatenates the ``aa_feature_disruption`` table of every gene in the
    notebook-level ``tfs`` collection (each called with the name of that
    gene's cloned reference isoform), keeps only the rows whose ``accession``
    is returned by ``load_dbd_accessions()``, and aggregates them per
    (gene, ref_iso, alt_iso).

    Returns
    -------
    pandas.DataFrame
        One row per (gene, ref_iso, alt_iso) that has at least one DBD row,
        with columns ``gene``, ``ref_iso``, ``alt_iso``,
        ``dbd_insertion_n_aa`` (summed ``insertion``) and ``dbd_pct_lost``
        (100 * summed ``deletion`` + ``frameshift`` divided by summed
        ``length``). Combinations without any DBD row are absent.
    """
    disruption = pd.concat([gene.aa_feature_disruption(gene.cloned_reference_isoform.name)
                            for gene in tfs.values()])
    disruption['is_DBD'] = disruption['accession'].isin(load_dbd_accessions())

    # Group the DBD rows once and reuse the grouping for every aggregate.
    dbd_by_pair = (disruption.loc[disruption['is_DBD'], :]
                             .groupby(['gene', 'ref_iso', 'alt_iso']))
    n_lost = dbd_by_pair[['deletion', 'frameshift']].sum().sum(axis=1)
    n_total = dbd_by_pair['length'].sum()

    summary = pd.DataFrame({'dbd_insertion_n_aa': dbd_by_pair['insertion'].sum(),
                            'dbd_pct_lost': (n_lost / n_total) * 100.})
    return summary.reset_index()


# Translate isoform names to clone accessions so the DBD summary can be joined
# onto the pairs table, which is keyed by clone accession.
iso_name_to_clone_acc = {iso.name: iso.clone_acc
                         for tf in tfs.values()
                         for iso in tf.cloned_isoforms}
dbd = load_dbd_affected()
dbd['clone_acc_ref'] = dbd['ref_iso'].map(iso_name_to_clone_acc)
dbd['clone_acc_alt'] = dbd['alt_iso'].map(iso_name_to_clone_acc)
# NOTE: this rebinds `df`; running the cell a second time merges `dbd` onto the
# already-merged frame and yields suffixed duplicate columns.
df = df.merge(dbd, how='left', on=['clone_acc_ref', 'clone_acc_alt'])

  dbd = dbd.append(
  dbd = dbd.append(
  dbd = dbd.append(
  dbd = dbd.append(
  dbd = dbd.append(
  dbd = dbd.append(
  dbd = dbd.append(
  dbd = dbd.append(
  dbd = dbd.append(
  dbd = dbd.append(
  dbd = dbd.append(
  dbd = dbd.append(
  dbd = dbd.append(
  dbd = dbd.append(
  dbd = dbd.append(
  dbd = dbd.append(
  dbd = dbd.append(
  dbd = dbd.append(
  dbd = dbd.append(
  dbd = dbd.append(
  dbd = dbd.append(
  dbd = dbd.append(
  dbd = dbd.append(
  dbd = dbd.append(
  dbd = dbd.append(
  dbd = dbd.append(
  dbd = dbd.append(
  dbd = dbd.append(
  dbd = dbd.append(
  dbd = dbd.append(
  dbd = dbd.append(
  dbd = dbd.append(
  dbd = dbd.append(
  dbd = dbd.append(
  dbd = dbd.append(
  dbd = dbd.append(
  dbd = dbd.append(
  dbd = dbd.append(
  dbd = dbd.append(
  dbd = dbd.append(
  dbd = dbd.append(
  dbd = dbd.append(
  dbd = dbd.append(
  dbd = dbd.append(
  dbd = dbd.append(
  dbd = dbd.append(
  dbd = dbd.append(
  dbd = dbd.append(
  dbd = dbd.append(
  dbd = dbd.append(


In [15]:
# Pairs where the alternative isoform keeps every dimerizing PPI but has lost part of the DBD.
df.loc[df['dimer_ppi'].eq('retains all') & df['dbd_pct_lost'].gt(0)]

Unnamed: 0,tf_gene_symbol,clone_acc_a,clone_acc_b,ppi_n_tested,ppi_n_shared,ppi_n_min,ppi_n_min_diff,ppi_jaccard,ppi_simpson,ppi_n_diff,ppi_delta_n,pdi_n_tested,pdi_n_shared,pdi_n_min,pdi_n_min_diff,pdi_jaccard,pdi_simpson,pdi_n_diff,pdi_delta_n,m1h_min,m1h_max,activation_fold_change,activation_abs_fold_change,aa_seq_pct_id,clone_acc_ref,clone_acc_alt,dimer_ppi,other_ppi,gene,ref_iso,alt_iso,dbd_insertion_n_aa,dbd_pct_lost
47,ESRRA,ESRRA|1/2|09B12,ESRRA|2/2|01H06,13.0,10.0,10.0,0.0,0.769231,1.0,3.0,-3.0,,,,,,,,,-0.02188,0.656687,-0.678567,0.678567,66.2,ESRRA|1/2|09B12,ESRRA|2/2|01H06,retains all,loses some,ESRRA,ESRRA-1,ESRRA-2,0.0,95.714286
59,FOS,FOS|1/4|03D05,FOS|2/4|03E05,7.0,6.0,6.0,0.0,0.857143,1.0,1.0,-1.0,,,,,,,,,6.269662,6.825441,-0.555779,0.555779,61.1,FOS|1/4|03D05,FOS|2/4|03E05,retains all,loses some,FOS,FOS-1,FOS-2,0.0,22.222222
209,RARB,RARB|1/2|12H03,RARB|2/2|08B05,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,,,,,,,,,3.59394,4.731984,1.138044,1.138044,75.0,RARB|1/2|12H03,RARB|2/2|08B05,retains all,,RARB,RARB-1,RARB-2,0.0,48.571429


In [137]:
# Number of pairs without a DBD summary value.
df['dbd_pct_lost'].isna().sum()

45

In [161]:
# Count pairs by whether their gene's family is listed in DIMERIZING_TF_FAMILIES.
(df['tf_gene_symbol']
   .map(fam)
   .isin(DIMERIZING_TF_FAMILIES)
   .value_counts())

False    294
True     153
Name: tf_gene_symbol, dtype: int64

In [16]:
# Annotate each pair with its TF family and whether that family is in DIMERIZING_TF_FAMILIES.
df['tf_family'] = df['tf_gene_symbol'].map(fam)
df['is_dimer_fam'] = df['tf_family'].isin(DIMERIZING_TF_FAMILIES)
# NOTE(review): a missing dbd_pct_lost compares as False, so pairs without a
# DBD summary are flagged as "not affected" rather than unknown.
df['dbd_affected'] = df['dbd_pct_lost'].gt(0)

In [18]:
# Compare the fraction of pairs with an affected DBD between dimerizing and
# non-dimerizing families. Possible confounding factors to look into:
#  - length of the DBD / fraction of the protein taken up by the DBD
#  - identity of the isoforms
df['dbd_affected'].groupby(df['is_dimer_fam']).mean()

is_dimer_fam
False    0.418367
True     0.248366
Name: dbd_affected, dtype: float64

In [41]:
# Fraction of pairs with an affected DBD, per TF family.
df.groupby('tf_family')['dbd_affected'].agg('mean')

tf_family
AP-2                       0.000000
AT hook                    0.000000
C2H2 ZF                    0.464789
C2H2 ZF; AT hook           1.000000
CBF/NF-Y                   0.000000
CCCH ZF                    0.000000
CSL                        0.000000
CxxC                       0.000000
E2F                        0.428571
EBF1                       0.800000
Ets                        0.000000
Forkhead                   0.363636
GATA                       0.000000
Grainyhead                 0.142857
HMG/Sox                    0.333333
HSF                        0.500000
Homeodomain                0.285714
Homeodomain; Paired box    1.000000
IRF                        0.000000
Myb/SANT                   0.000000
Nuclear receptor           0.137931
Paired box                 1.000000
RFX                        0.250000
Rel                        0.500000
SAND                       1.000000
SMAD                       0.166667
STAT                       0.333333
T-box             

In [165]:
# TODO before sending the table:
#  - tidy up and reorder the columns
#  - add loss of ligand binding domain?
#  - can we add dimer domains?
df.head(n=5)

Unnamed: 0,tf_gene_symbol,clone_acc_a,clone_acc_b,ppi_n_tested,ppi_n_shared,ppi_n_min,ppi_n_min_diff,ppi_jaccard,ppi_simpson,ppi_n_diff,ppi_delta_n,pdi_n_tested,pdi_n_shared,pdi_n_min,pdi_n_min_diff,pdi_jaccard,pdi_simpson,pdi_n_diff,pdi_delta_n,m1h_min,m1h_max,activation_fold_change,activation_abs_fold_change,aa_seq_pct_id,clone_acc_ref,clone_acc_alt,dimer_ppi,other_ppi,gene,ref_iso,alt_iso,dbd_insertion_n_aa,dbd_pct_lost,is_dimer_fam,dbd_affected
0,AEBP2,AEBP2|2/3|05F03,AEBP2|3/3|05E07,,,,,,,,,,,,,,,,,-0.234322,0.301297,-0.535619,0.535619,82.0,AEBP2|2/3|05F03,AEBP2|3/3|05E07,,,AEBP2,AEBP2-2,AEBP2-3,0.0,0.0,False,False
1,ARNT2,ARNT2|1/6|08C12,ARNT2|2/6|09E01,10.0,6.0,6.0,0.0,0.6,1.0,4.0,-4.0,,,,,,,,,2.337698,4.432498,-2.0948,2.0948,93.0,ARNT2|1/6|08C12,ARNT2|2/6|09E01,retains all,loses some,ARNT2,ARNT2-1,ARNT2-2,0.0,0.0,True,False
2,ARNT2,ARNT2|1/6|08C12,ARNT2|3/6|10D11,,,,,,,,,,,,,,,,,,,,,95.7,ARNT2|1/6|08C12,ARNT2|3/6|10D11,,,ARNT2,ARNT2-1,ARNT2-3,0.0,0.0,True,False
3,ARNTL,ARNTL|1/2|08H08,ARNTL|2/2|08E08,,,,,,,,,,,,,,,,,,,,,90.3,ARNTL|1/2|08H08,ARNTL|2/2|08E08,,,ARNTL,ARNTL-1,ARNTL-2,0.0,0.0,True,False
4,ARNTL2,ARNTL2|1/5|10H01,ARNTL2|2/5|12C06,2.0,2.0,2.0,0.0,1.0,1.0,0.0,0.0,,,,,,,,,,,,,94.6,ARNTL2|2/5|12C06,ARNTL2|1/5|10H01,retains all,,ARNTL2,ARNTL2-2,ARNTL2-1,34.0,0.0,True,False


In [19]:
# Goal: add a flag for alternative isoforms that lose all PPIs.
# TODO: check how this is done in the website figures.
# Reload the Y2H table with require_at_least_one_ppi_per_isoform switched off
# (presumably so isoforms without any positive PPI are kept; confirm in data_loading).
y2h_complete = load_y2h_isoform_data(require_at_least_one_ppi_per_isoform=False)

In [20]:
def ppi_total_loss(row, ppi):
    """Tell whether the alternative isoform of a pair has no positive PPI left.

    Parameters
    ----------
    row : mapping
        Must provide ``'clone_acc_ref'`` and ``'clone_acc_alt'``.
    ppi : pandas.DataFrame
        Table with columns ``ad_clone_acc``, ``db_gene_symbol`` and
        ``Y2H_result``.

    Returns
    -------
    bool or float
        ``np.nan`` when either clone is absent from ``ppi['ad_clone_acc']``,
        or when no partner that has a result for both clones is truthy for at
        least one of them. Otherwise ``True`` if the alternative clone has no
        truthy result among those partners, else ``False``.
    """
    ref_acc, alt_acc = row['clone_acc_ref'], row['clone_acc_alt']
    tested_accs = ppi['ad_clone_acc'].unique()
    if not (ref_acc in tested_accs and alt_acc in tested_accs):
        return np.nan

    pair_rows = ppi.loc[ppi['ad_clone_acc'].isin([ref_acc, alt_acc]),
                        ['ad_clone_acc', 'db_gene_symbol', 'Y2H_result']]
    # One row per partner, one column per clone; keep partners with a result for both.
    by_partner = pair_rows.pivot(values='Y2H_result',
                                 index='db_gene_symbol',
                                 columns='ad_clone_acc').dropna()
    # Only partners with a truthy result for at least one of the two clones count.
    by_partner = by_partner.loc[by_partner.any(axis=1), :]
    if len(by_partner) == 0:
        return np.nan
    return not by_partner[alt_acc].any()


# Evaluate every ref/alt pair against the complete Y2H table.
df['ppi_alternative_loses_all'] = df.apply(
    lambda pair: ppi_total_loss(pair, ppi=y2h_complete),
    axis=1,
)

In [21]:
# Number of pairs flagged True (missing values are skipped).
df['ppi_alternative_loses_all'].sum(skipna=True)

54

In [22]:
# Keep only pairs with data from at least one assay. Pairs where the
# alternative isoform loses all PPIs have no ppi_n_tested value but still
# count as having PPI data, so they are kept explicitly.
no_assay_data = ~(df['ppi_n_tested'].notnull() |
                  (df['ppi_alternative_loses_all'] == True) |
                  df['activation_fold_change'].notnull() |
                  df['pdi_jaccard'].notnull())
df = df.loc[~no_assay_data, :]

In [28]:
# Map each clone accession to its '|'-joined Ensembl transcript IDs
# (NaN when the isoform has no transcript IDs).
clone_acc_to_enst = {
    iso.clone_acc: (np.nan if iso.ensembl_transcript_ids is None
                    else '|'.join(iso.ensembl_transcript_ids))
    for tf in tfs.values()
    for iso in tf.cloned_isoforms
}
df['ENST_ref'] = df['clone_acc_ref'].map(clone_acc_to_enst)
df['ENST_alt'] = df['clone_acc_alt'].map(clone_acc_to_enst)

In [31]:
# TODO, possible additions to the exported table:
#  - change to ppi_n_ref / alt / shared
#  - add has PPI/PDI/M1H data
#  - add M1H absolute highest across the whole gene
#  - add splicing type?
#  - add length in aa of ref and alt
#  - add ref is MANE
#  - add ref(?) / alt is novel
#  - add expression data?

# Columns written to the output file, in order.
cols = [
    # gene / isoform identifiers
    'tf_gene_symbol', 'tf_family', 'is_dimer_fam',
    'clone_acc_ref', 'clone_acc_alt', 'ENST_ref', 'ENST_alt',
    # sequence identity and DBD
    'aa_seq_pct_id', 'dbd_affected', 'dbd_insertion_n_aa', 'dbd_pct_lost',
    # PPI
    'ppi_n_tested', 'ppi_jaccard', 'dimer_ppi', 'other_ppi',
    'ppi_alternative_loses_all',
    # PDI
    'pdi_n_union', 'pdi_n_shared', 'pdi_jaccard',
    # M1H
    'm1h_min', 'm1h_max', 'activation_abs_fold_change',
]

# The PDI count is exported under the name 'pdi_n_union'.
df = df.rename(columns={'pdi_n_tested': 'pdi_n_union'})
df[cols].to_csv('../output/TF-iso_ref-vs-alt.tsv', sep='\t', index=False)

In [18]:
# This cell writes to the same path as the curated export in the cell above.
# Dumping the whole frame here would overwrite that file with every
# intermediate column on a top-to-bottom run, so export the same `cols`
# selection instead.
df.loc[:, cols].to_csv('../output/TF-iso_ref-vs-alt.tsv', sep='\t', index=False)

In [40]:
# All Y2H rows whose DB partner is ID3.
y2h.loc[y2h['db_gene_symbol'].eq('ID3')]

Unnamed: 0,ad_clone_acc,ad_gene_symbol,db_gene_symbol,Y2H_result,ad_tf_family,db_tf_family,is_dimerizing_ppi
6988,TCF4|1/9|07E01,TCF4,ID3,True,bHLH,,False
6989,TCF4|2/9|07E03,TCF4,ID3,True,bHLH,,False
6990,TCF4|3/9|07C04,TCF4,ID3,True,bHLH,,False
6991,TCF4|4/9|07G02,TCF4,ID3,True,bHLH,,False
6992,TCF4|5/9|07C02,TCF4,ID3,True,bHLH,,False
6993,TCF4|6/9|07H03,TCF4,ID3,True,bHLH,,False
6994,TCF4|7/9|07G03,TCF4,ID3,True,bHLH,,False
6995,TCF4|8/9|07A05,TCF4,ID3,True,bHLH,,False
9935,TCF12|3/3|07B07,TCF12,ID3,True,bHLH,,False
9947,TCF12|2/3|07A09,TCF12,ID3,True,bHLH,,False


In [138]:
# DEBUG: show the pairs whose dbd_pct_lost is missing
df[df['dbd_pct_lost'].isna()]

Unnamed: 0,tf_gene_symbol,clone_acc_a,clone_acc_b,ppi_n_tested,ppi_n_shared,ppi_n_min,ppi_n_min_diff,ppi_jaccard,ppi_simpson,ppi_n_diff,ppi_delta_n,pdi_n_tested,pdi_n_shared,pdi_n_min,pdi_n_min_diff,pdi_jaccard,pdi_simpson,pdi_n_diff,pdi_delta_n,m1h_min,m1h_max,activation_fold_change,activation_abs_fold_change,aa_seq_pct_id,clone_acc_ref,clone_acc_alt,dimer_ppi,other_ppi,gene,ref_iso,alt_iso,dbd_insertion_n_aa,dbd_pct_lost
88,HIF1A,HIF1A|1/4|11G01,HIF1A|2/4|08C04,,,,,,,,,,,,,,,,,,,,,92.9,HIF1A|1/4|11G01,HIF1A|2/4|08C04,,,,,,,
89,HIF1A,HIF1A|1/4|11G01,HIF1A|3/4|08H04,,,,,,,,,,,,,,,,,,,,,88.6,HIF1A|1/4|11G01,HIF1A|3/4|08H04,,,,,,,
130,MAZ,MAZ|2/3|01G05,MAZ|3/3|09D04,,,,,,,,,,,,,,,,,-1.096721,2.088471,-3.185192,3.185192,44.9,MAZ|2/3|01G05,MAZ|3/3|09D04,,,,,,,
132,MEIS1,MEIS1|1/2|09A04,MEIS1|2/2|09G04,,,,,,,,,,,,,,,,,-1.243943,-0.954097,-0.289846,0.289846,78.3,MEIS1|1/2|09A04,MEIS1|2/2|09G04,,,,,,,
133,MEIS2,MEIS2|1/4|08H09,MEIS2|2/4|12A07,19.0,19.0,19.0,0.0,1.0,1.0,0.0,0.0,,,,,,,,,-1.914068,-0.581469,-1.332599,1.332599,98.5,MEIS2|1/4|08H09,MEIS2|2/4|12A07,,retains all,,,,,
134,MEIS2,MEIS2|1/4|08H09,MEIS2|3/4|11H09,19.0,1.0,1.0,0.0,0.052632,1.0,18.0,-18.0,,,,,,,,,,,,,77.3,MEIS2|1/4|08H09,MEIS2|3/4|11H09,,loses some,,,,,
135,MEIS3,MEIS3|1/5|07C11,MEIS3|4/5|07H10,,,,,,,,,,,,,,,,,-0.944228,-0.422032,0.522196,0.522196,72.9,MEIS3|4/5|07H10,MEIS3|1/5|07C11,,,,,,,
136,MEIS3,MEIS3|2/5|07B10,MEIS3|4/5|07H10,,,,,,,,,,,,,,,,,-1.007193,-0.422032,0.58516,0.58516,80.1,MEIS3|4/5|07H10,MEIS3|2/5|07B10,,,,,,,
137,MEIS3,MEIS3|3/5|07D10,MEIS3|4/5|07H10,,,,,,,,,,,,,,,,,-0.91482,-0.422032,0.492787,0.492787,76.5,MEIS3|4/5|07H10,MEIS3|3/5|07D10,,,,,,,
138,MEIS3,MEIS3|4/5|07H10,MEIS3|5/5|07F09,,,,,,,,,,,,,,,,,-0.422032,0.933959,1.355991,1.355991,95.2,MEIS3|4/5|07H10,MEIS3|5/5|07F09,,,,,,,
