## TODO

- map partners to domains
- domain motif interactions
- map motifs to isoforms
- visualize motifs

In [9]:
import pandas as pd

from ccsblib import huri

from data_loading import load_isoform_and_paralog_y2h_data

In [2]:
pd.set_option('display.max_columns', 100)

In [3]:
pfam = huri.load_pfam_domains()

In [4]:
pfam.head()

Unnamed: 0,orf_id,pfam_accession,domain_name,domain_description,start,stop,domain_length
10,1022,PF00812,Ephrin,Ephrin,31,163,139
16,6750,PF02089,Palm_thioest,Palmitoyl protein thioesterase,86,274,279
17,6750,PF00561,Abhydrolase_1,alpha/beta hydrolase fold,37,137,256
18,6750,PF00561,Abhydrolase_1,alpha/beta hydrolase fold,216,254,256
23,7051,PF14555,UBA_4,UBA-like domain,15,55,43


In [5]:
dlm = pd.read_csv('../../data/external/elm_interaction_domains.tsv',
                  sep='\t')

In [6]:
dlm.head()

Unnamed: 0,ELM identifier,Interaction Domain Id,Interaction Domain Description,Interaction Domain Name
0,CLV_NRD_NRD_1,PF00675,Peptidase_M16,Insulinase (Peptidase family M16)
1,CLV_PCSK_FUR_1,PF00082,Peptidase_S8,Subtilase family
2,CLV_PCSK_PC1ET2_1,PF00082,Peptidase_S8,Subtilase family
3,CLV_PCSK_PC7_1,PF00082,Peptidase_S8,Subtilase family
4,CLV_PCSK_SKI1_1,PF00082,Peptidase_S8,Subtilase family


In [21]:
orfs_with_slim_dom = pfam.loc[pfam['pfam_accession'].isin(dlm['Interaction Domain Id']),
                              'orf_id'].unique()

In [30]:
ppi.loc[:, ['ad_gene_symbol', 'db_gene_symbol']].drop_duplicates().shape[0]

1827

In [29]:
ppi.loc[ppi['db_orf_id'].isin(orfs_with_slim_dom),
        ['ad_gene_symbol', 'db_gene_symbol']].drop_duplicates().shape[0]

512

In [32]:
ppi['gene_level_pair'] = ppi['ad_gene_symbol'] + '_' + ppi['db_gene_symbol']

In [34]:
# gene-level pairs whose DB ORF carries a Pfam domain that is listed in the
# ELM interaction-domain table (see `orfs_with_slim_dom`)
dom_slim_ppi = set(ppi.loc[ppi['db_orf_id'].isin(orfs_with_slim_dom),
                       'gene_level_pair'].unique())
# gene-level pairs with at least one row scored '1' / at least one row scored '0'
pos_ppi = set(ppi.loc[(ppi['score'] == '1'), 'gene_level_pair'].unique())
neg_ppi = set(ppi.loc[(ppi['score'] == '0'), 'gene_level_pair'].unique())
# pairs present in all three sets: domain-carrying partner, and the pair has
# both a '1' and a '0' score among its rows
diff_slim_ppi = dom_slim_ppi.intersection(pos_ppi.intersection(neg_ppi))
len(diff_slim_ppi)

212

In [36]:
len({p.split('_')[0] for p in diff_slim_ppi})  # number of TF genes with differentially interacting dom-slim pairs

56

In [None]:
# subset of interactions that are in some, not in others

In [27]:
iso = load_valid_isoform_clones()

In [31]:
iso.head()

Unnamed: 0,gene,clone_acc,aa_seq,num_aa
0,AEBP2,AEBP2|2/3|05F03,MDVDSTISSGRSTPAMMNGQGSTTSSSKNIAYNCCWDQCQACFNSS...,268
1,AEBP2,AEBP2|3/3|05E07,MYTRRYSSISSTIMDVDSTISSGRSTPAMMNGQGSTTSSSKNIAYN...,263
2,ARNT2,ARNT2|1/6|08C12,MATPAAVNPPEMASDIPGSVTLPVAPMAATGQVRMAGAMPARGGKR...,717
3,ARNT2,ARNT2|2/6|09E01,MASDIPGSVTLPVAPMAATGQVRMAGAMPARGGKRRSGMDFDDEDG...,683
4,ARNT2,ARNT2|3/6|10D11,MATPAAVNPPEMASDIPGSVTLPVAPMAATGQVRMAGAMPARGGKR...,716


In [45]:
iso = iso.set_index('clone_acc')

In [48]:
from ccsblib.utils import _get_data_file_path

def match_elm_motifs(orfs, elm=None):
    """Match every ELM motif regex against the given amino acid sequences.

    Args:
        orfs (Series): amino acid sequences indexed by ORF ID
        elm (DataFrame, optional): ELM class table with at least the columns
            'Accession', 'ELMIdentifier' and 'Regex'. If omitted, the
            'elms_index.tsv' file located via `_get_data_file_path` is read.
    Returns:
        DataFrame: one row for each separate (non-overlapping) match of a
            motif to an ORF, with columns 'orf_id', 'region_id', 'ELM_ID',
            'start', 'end' (1-based, inclusive), 'source' and 'type'
    """
    import re  # local import: the notebook only imports `re` in a later cell

    if elm is None:
        file_path = _get_data_file_path('elms_index.tsv')
        elm = pd.read_csv(file_path, header=0, comment='#', sep='\t')
    # missing sequences (NaN) cannot be searched
    seqs = orfs.dropna()
    motifs = []
    for _, elm_row in elm.iterrows():
        # The regex is used unmodified: match spans do not depend on whether
        # groups capture, so there is no need to rewrite '(' into '(?:'
        # (which would corrupt lookaheads, escaped parentheses, etc.).
        pattern = re.compile(elm_row['Regex'])
        for orf_id, seq in seqs.items():
            for hit in pattern.finditer(seq):
                # Take the position from the match itself; searching the
                # sequence for the matched text would always return the
                # first occurrence of a repeated hit.
                # Switch from 0-based half-open to 1-based inclusive.
                motifs.append((orf_id,
                               elm_row['Accession'],
                               elm_row['ELMIdentifier'],
                               hit.start() + 1,
                               hit.end()))
    motifs = pd.DataFrame(data=motifs,
                          columns=['orf_id', 'region_id', 'ELM_ID', 'start', 'end'])
    motifs['source'] = 'ELM_prediction'
    motifs['type'] = 'SLiM'
    return motifs



slims = match_elm_motifs(iso['aa_seq'])

In [49]:
slims.head()

Unnamed: 0,orf_id,region_id,ELM_ID,start,end,source,type
0,ARNT2|1/6|08C12,ELME000321,CLV_C14_Caspase3-7,144,148,ELM_prediction,SLiM
1,ARNT2|1/6|08C12,ELME000321,CLV_C14_Caspase3-7,481,485,ELM_prediction,SLiM
2,ARNT2|2/6|09E01,ELME000321,CLV_C14_Caspase3-7,133,137,ELM_prediction,SLiM
3,ARNT2|2/6|09E01,ELME000321,CLV_C14_Caspase3-7,470,474,ELM_prediction,SLiM
4,ARNT2|3/6|10D11,ELME000321,CLV_C14_Caspase3-7,144,148,ELM_prediction,SLiM


In [10]:
ppi = load_isoform_and_paralog_y2h_data()
# Keep only the TF isoform PPI rows. .copy() makes `ppi` an independent frame,
# so that adding columns to it in other cells (e.g. 'gene_level_pair') does
# not act on a slice of the loaded table.
ppi = ppi.loc[ppi['category'] == 'tf_isoform_ppis', :].copy()

In [11]:
import numpy as np

from data_loading import load_aligned_aa_seqs


def _partial_alignment_regions(algn):
    """Split an alignment into runs of columns shared by one isoform subset.

    Args:
        algn (dict): aligned sequences (gaps written as '-') keyed by
            isoform ID
    Returns:
        list: one (set of isoform IDs, sequence) tuple per maximal run of
            consecutive columns that is present in some, but not all,
            isoforms
    """
    if not algn:
        return []
    n_isoforms = len(algn)
    n_columns = len(next(iter(algn.values())))
    regions = []

    def close_run(subset, start, stop):
        # only keep runs that are present in some, but not all, isoforms
        if subset and len(subset) < n_isoforms:
            # first member in alignment order, so the reported sequence does
            # not depend on set iteration order
            representative = next(k for k in algn if k in subset)
            regions.append((subset, algn[representative][start:stop]))

    run_subset, run_start = None, 0
    for i in range(n_columns):
        subset = {k for k, v in algn.items() if v[i] != '-'}
        if subset != run_subset:
            close_run(run_subset, run_start, i)
            run_subset, run_start = subset, i
    # close the run that reaches the end of the alignment, including its
    # last column
    close_run(run_subset, run_start, n_columns)
    return regions


def isoform_specific_regions(gene_name):
    """Find the regions of a gene that are not common to all of its isoforms.

    The name is a bit misleading because a region need not be specific to
    one isoform, just absent from at least one of them.

    Args:
        gene_name: passed to `load_aligned_aa_seqs` to get the aligned
            isoform sequences
    Returns:
        list: contiguous regions as (set of isoform IDs containing the
            region, AA sequence of the region) tuples, in alignment order.
            The sequence is taken from the first isoform of the subset in
            alignment order.

    TODO:
     - frame shifts
    """
    return _partial_alignment_regions(load_aligned_aa_seqs(gene_name))


def ppi_tf_gene(data, gene_name):
    """Build the isoform x partner table of interaction calls for one TF gene.

    Args:
        data (DataFrame): PPI table with the columns 'category',
            'ad_gene_symbol', 'ad_clone_acc', 'db_gene_symbol' and 'score'
        gene_name (str): value of 'ad_gene_symbol' to select
    Returns:
        DataFrame: indexed by 'ad_clone_acc', one column per
            'db_gene_symbol'; values are True (score '1'), False
            (score '0') or NaN (scores 'AA', 'NC' and anything else)
    """
    score_to_call = {'1': True,
                     '0': False,
                     'AA': np.nan,
                     'NC': np.nan}
    is_tf_row = data['category'] == 'tf_isoform_ppis'
    is_gene_row = data['ad_gene_symbol'] == gene_name
    wanted_columns = ['ad_clone_acc', 'db_gene_symbol', 'score']
    calls = data.loc[is_tf_row & is_gene_row, wanted_columns]
    # assign() returns a new frame, leaving `data` untouched
    calls = calls.assign(score=calls['score'].map(score_to_call))
    return calls.pivot(index='ad_clone_acc',
                       columns='db_gene_symbol',
                       values='score')


def ppi_linked_isoform_specific_regions(ppi_data, gene_name):
    """Link interaction partners to the isoform-specific regions of a gene.

    A partner is linked to a region when the set of isoforms it interacts
    with is exactly the set of isoforms that contain the region. For now,
    partners with a missing call for any isoform are not used.

    Args:
        ppi_data (DataFrame): PPI table, as accepted by `ppi_tf_gene`
        gene_name (str): gene to analyse
    Returns:
        dict: partner gene symbol -> (set of isoform IDs, AA sequence of
            the region). If several regions share the same isoform set,
            the last one is kept.
    """
    regions = isoform_specific_regions(gene_name)
    calls = ppi_tf_gene(ppi_data, gene_name)

    # isoforms with a positive call, for every partner tested in all isoforms
    interacting_isoforms = {}
    for partner in calls.columns:
        partner_calls = calls[partner]
        if partner_calls.isnull().any():
            continue
        interacting_isoforms[partner] = set(calls.index[partner_calls])

    linked = {}
    for partner, isoform_ids in interacting_isoforms.items():
        for region_isoforms, region_seq in regions:
            if region_isoforms == isoform_ids:
                linked[partner] = (region_isoforms, region_seq)
    return linked


ppi_isr = ppi_linked_isoform_specific_regions(ppi, 'ATF2')
ppi_isr

{'ATF2': ({'ATF2|1/6|12H04',
   'ATF2|2/6|09E05',
   'ATF2|3/6|09C05',
   'ATF2|4/6|09B05'},
  'RLKAALTQQHPPVTNGDTVKGHGSGLVRTQSEESRPQSLQQPATSTTETPASPAHTTPQTQSTSGRRRRAANEDPDEKRRKFLERNRAAASRCRQKRKVWVQSLEKKAEDLSSLNGQLQ'),
 'FOSL2': ({'ATF2|1/6|12H04',
   'ATF2|2/6|09E05',
   'ATF2|3/6|09C05',
   'ATF2|4/6|09B05'},
  'RLKAALTQQHPPVTNGDTVKGHGSGLVRTQSEESRPQSLQQPATSTTETPASPAHTTPQTQSTSGRRRRAANEDPDEKRRKFLERNRAAASRCRQKRKVWVQSLEKKAEDLSSLNGQLQ'),
 'JDP2': ({'ATF2|1/6|12H04',
   'ATF2|2/6|09E05',
   'ATF2|3/6|09C05',
   'ATF2|4/6|09B05'},
  'RLKAALTQQHPPVTNGDTVKGHGSGLVRTQSEESRPQSLQQPATSTTETPASPAHTTPQTQSTSGRRRRAANEDPDEKRRKFLERNRAAASRCRQKRKVWVQSLEKKAEDLSSLNGQLQ'),
 'MAPK9': ({'ATF2|1/6|12H04', 'ATF2|2/6|09E05', 'ATF2|5/6|09A05'},
  'RFTNEDHLAVHKHKHEMTLKFGPARNDSVIVAD')}

In [12]:
# Pfam domain annotations joined to the ELM interaction-domain table:
# one row per (domain instance, ELM class that lists that Pfam accession)
slim_binding_domains = pd.merge(pfam,
                                dlm,
                                how='inner',
                                left_on='pfam_accession',
                                right_on='Interaction Domain Id')
# attach the PPI rows whose DB ORF carries such a domain
slim_binding_domains = pd.merge(slim_binding_domains,
                                ppi,
                                how='inner',
                                left_on='orf_id',
                                right_on='db_orf_id')
# collapse to unique (AD gene, DB gene, domain, ELM class) combinations
slim_ppis = slim_binding_domains.loc[:, ['ad_gene_symbol', 
                                        'db_gene_symbol',
                                        'pfam_accession',
                                        'domain_name',
                                        'domain_description',
                                        'ELM identifier']].drop_duplicates()

In [13]:
slim_ppis

Unnamed: 0,ad_gene_symbol,db_gene_symbol,pfam_accession,domain_name,domain_description,ELM identifier
0,TCF4,KLC3,PF00515,TPR_1,Tetratricopeptide repeat,LIG_TPR
8,TCF4,KLC3,PF00515,TPR_1,Tetratricopeptide repeat,TRG_PTS1
16,TCF4,KLC3,PF00515,TPR_1,Tetratricopeptide repeat,LIG_APCC_Cbox_2
24,TCF4,KLC3,PF00515,TPR_1,Tetratricopeptide repeat,LIG_APCC_Cbox_1
160,STAT3,IFIT2,PF00515,TPR_1,Tetratricopeptide repeat,LIG_TPR
...,...,...,...,...,...,...
24156,FOXP2,CTBP2,PF00389,2-Hacid_dh,"D-isomer specific 2-hydroxyacid dehydrogenase,...",LIG_CtBP_PxDLS_1
24158,ZBTB18,CTBP2,PF00389,2-Hacid_dh,"D-isomer specific 2-hydroxyacid dehydrogenase,...",LIG_CtBP_PxDLS_1
24160,SOX6,CTBP2,PF00389,2-Hacid_dh,"D-isomer specific 2-hydroxyacid dehydrogenase,...",LIG_CtBP_PxDLS_1
24163,IKZF2,CTBP2,PF00389,2-Hacid_dh,"D-isomer specific 2-hydroxyacid dehydrogenase,...",LIG_CtBP_PxDLS_1


In [14]:
ppi_isr = {}
for gene_name in slim_ppis['ad_gene_symbol'].unique():
    ppi_isr[gene_name] = ppi_linked_isoform_specific_regions(ppi, gene_name)

In [16]:
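# Draft (unfinished) -- planned check for each SLiM-mediated PPI:
#    1. check that the partner is linked to an isoform-specific region (isr)
#    2. check that the isr contains any of the corresponding SLiMs
# NOTE(review): iterating over a DataFrame yields its column labels, not its
# rows, and the last `if` below is incomplete.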

for row in slim_ppis:
    if row['db_gene_symbol'] in ppi_isr[row['ad_gene_symbol']]:
        subset, aa_seq = ppi_isr[row['ad_gene_symbol']][row['db_gene_symbol']]
        if aa_seq 

Unnamed: 0,ELM identifier,Interaction Domain Id,Interaction Domain Description,Interaction Domain Name
85,LIG_TPR,PF00515,TPR_1,Tetratricopeptide repeat
136,TRG_PTS1,PF00515,TPR_1,Tetratricopeptide repeat
267,LIG_APCC_Cbox_2,PF00515,TPR_1,Tetratricopeptide repeat
268,LIG_APCC_Cbox_1,PF00515,TPR_1,Tetratricopeptide repeat


In [21]:
# Keep only the pairs whose partner is linked to an isoform-specific region.
# .copy() makes the result an independent frame, so that the column
# assignment below does not act on a slice of the previous `slim_ppis`.
slim_ppis = slim_ppis.loc[slim_ppis.apply(lambda x: x['db_gene_symbol'] in ppi_isr[x['ad_gene_symbol']],
                                          axis=1),
                          :].copy()
# AA sequence of the region linked to each pair (second tuple element)
slim_ppis['aa_seq_isr'] = slim_ppis.apply(lambda x: ppi_isr[x['ad_gene_symbol']][x['db_gene_symbol']][1], axis=1)

In [28]:
help(re.match)

Help on function match in module re:

match(pattern, string, flags=0)
    Try to apply the pattern at the start of the string, returning
    a Match object, or None if no match was found.



In [61]:
import re


elm = pd.read_csv('../../data/external/elm_classes.tsv', header=0, comment='#', sep='\t')

def isr_contains_slim(row):
    """Test whether a row's isoform-specific region matches its ELM motif.

    Args:
        row (Series): must provide 'ELM identifier' and 'aa_seq_isr'
    Returns:
        bool: True if the regex of the ELM class hits anywhere in
            row['aa_seq_isr']; False if there is no hit or if the
            identifier is absent from the module-level `elm` table (in
            which case a message is printed)
    """
    regexes = elm.loc[elm['ELMIdentifier'] == row['ELM identifier'], 'Regex']
    if regexes.empty:
        print('Missing ELM entry for: ', row['ELM identifier'])
        return False
    # The regex is used unmodified: re.search only tells whether there is a
    # hit, which does not depend on groups being capturing, and rewriting
    # '(' into '(?:' would corrupt lookaheads and escaped parentheses.
    return bool(re.search(regexes.values[0], row['aa_seq_isr']))


slim_ppis['slim_match'] = slim_ppis.apply(isr_contains_slim, axis=1)

Missing ELM entry for:  MOD_PLK
Missing ELM entry for:  MOD_PLK
Missing ELM entry for:  MOD_PLK
Missing ELM entry for:  MOD_PLK
Missing ELM entry for:  MOD_PLK
Missing ELM entry for:  LIG_CaM_1-26_7
Missing ELM entry for:  LIG_CaM_1-17_6
Missing ELM entry for:  LIG_CaM_1-8-9-10_5
Missing ELM entry for:  LIG_CaM_1-8-14_4
Missing ELM entry for:  LIG_CaM_1-5-10-14_3
Missing ELM entry for:  LIG_CaM_1-8_REV_2
Missing ELM entry for:  LIG_CaM_1-14-15-16_REV_1
Missing ELM entry for:  LIG_CaM_1-26_7
Missing ELM entry for:  LIG_CaM_1-17_6
Missing ELM entry for:  LIG_CaM_1-8-9-10_5
Missing ELM entry for:  LIG_CaM_1-8-14_4
Missing ELM entry for:  LIG_CaM_1-5-10-14_3
Missing ELM entry for:  LIG_CaM_1-8_REV_2
Missing ELM entry for:  LIG_CaM_1-14-15-16_REV_1
Missing ELM entry for:  DEG_COP1
Missing ELM entry for:  DEG_COP1
Missing ELM entry for:  MOD_PLK
Missing ELM entry for:  MOD_PLK
Missing ELM entry for:  MOD_PLK
Missing ELM entry for:  MOD_PLK
Missing ELM entry for:  MOD_PLK
Missing ELM entry fo

In [64]:
(slim_ppis.loc[slim_ppis['slim_match'],
               ['ad_gene_symbol', 'db_gene_symbol']]
          .drop_duplicates()
          .shape[0])

33

In [66]:
match_pairs = (slim_ppis.loc[slim_ppis['slim_match'],
               ['ad_gene_symbol', 'db_gene_symbol']]
                .drop_duplicates())
match_pairs

Unnamed: 0,ad_gene_symbol,db_gene_symbol
3648,STAT3,BLK
3992,PAX5,STAC
4026,MEIS2,HCK
4418,STAT3,TXK
6807,GATA1,ARMC7
7537,MEIS2,LNX1
8266,MEIS2,IL16
11833,ZBTB48,WDR5
14036,TCF12,EPHB6
14624,IKZF2,CDC7


In [70]:
match_pairs['ad_gene_symbol'].value_counts().head()

STAT3     3
MEIS2     3
FOSB      2
HMBOX1    2
GATA1     2
Name: ad_gene_symbol, dtype: int64

In [69]:
match_pairs['db_gene_symbol'].value_counts().head()

PIN1      5
MAPK9     2
PRKAA2    2
EPHB6     1
CTBP2     1
Name: db_gene_symbol, dtype: int64

In [90]:
slim_ppis.loc[slim_ppis['slim_match'] & (slim_ppis['db_gene_symbol'] == 'PIN1'), :]

Unnamed: 0,ad_gene_symbol,db_gene_symbol,pfam_accession,domain_name,domain_description,ELM identifier,aa_seq_isr,slim_match,isr_length
23489,FOXN4,PIN1,PF00397,WW,WW domain,DOC_WW_Pin1_4,MIESDTSSIMSGIIRNSGQNHHPSPQEYRLLATTSDDDLPGDLQSL...,True,199
23491,PBX1,PIN1,PF00397,WW,WW domain,DOC_WW_Pin1_4,DGLAASQMYSPQGISANGGWQDATTPSSVTSPTEGPGSVHSDTS,True,44
23492,TCF4,PIN1,PF00397,WW,WW domain,DOC_WW_Pin1_4,SNNDDEDLTPEQKAEREKERRMANNARERLRVRDINEAFKELGRMV...,True,121
23513,FOS,PIN1,PF00397,WW,WW domain,DOC_WW_Pin1_4,MAAAKCRNRRRELTDTLQAETDQLEDEKSALQTEIANLLKEKEKLE...,True,231
23521,MEOX1,PIN1,PF00397,WW,WW domain,DOC_WW_Pin1_4,MDPAASSCMRSLQPPAPVWGCLRNPHSEGNGASGLPHYPPTPFSFH...,True,115


Pin1 Links the Activities of c-Abl
and p300 in Regulating p73 Function, Mol. Cell, 2004

It has been shown that TP73 isoforms lacking the PIN1-interacting motif have reduced transcriptional activity.

Binding of PIN1 to its motif has also been shown for mouse FOS.

In [87]:
ppi.loc[ppi['db_gene_symbol'] == 'PIN1',
        'ad_gene_symbol'].nunique()
ppi.loc[ppi['db_gene_symbol'] == 'PIN1',
        'ad_gene_symbol'].unique()

array(['FOXN4', 'GLI1', 'PBX1', 'TCF4', 'FOXP2', 'TCF12', 'TP63', 'ETV6',
       'HOMEZ', 'ZBTB7B', 'FOS', 'RELA', 'ZNF784', 'GATA1', 'MEOX1',
       'ZNF212'], dtype=object)

In [89]:
# smallest region
slim_ppis['isr_length'] = slim_ppis['aa_seq_isr'].str.len()
(slim_ppis.loc[slim_ppis['slim_match'], :]
          .sort_values('isr_length')
          .drop_duplicates(['ad_gene_symbol', 'db_gene_symbol', 'pfam_accession'])
          .head(30))

Unnamed: 0,ad_gene_symbol,db_gene_symbol,pfam_accession,domain_name,domain_description,ELM identifier,aa_seq_isr,slim_match,isr_length
14036,TCF12,EPHB6,PF00069,Pkinase,Protein kinase domain,MOD_Plk_4,KNRVEQQLHEHLQDAMSFLKDVCE,True,24
3648,STAT3,BLK,PF00069,Pkinase,Protein kinase domain,MOD_GSK3_1,NNGEGAEPSAGGQFESLTFDMELTSECATSP,True,31
18071,ATF2,MAPK9,PF00069,Pkinase,Protein kinase domain,DOC_MAPK_gen_1,RFTNEDHLAVHKHKHEMTLKFGPARNDSVIVAD,True,33
23973,HMBOX1,UBE2Z,PF00179,UQ_con,Ubiquitin-conjugating enzyme,MOD_SUMO_for_1,DDSTSHSDHQDPISLAVEMAAVNHTILALARQGANEIKTEALDD,True,44
15877,HMBOX1,CDK18,PF00069,Pkinase,Protein kinase domain,MOD_Plk_1,DDSTSHSDHQDPISLAVEMAAVNHTILALARQGANEIKTEALDD,True,44
23491,PBX1,PIN1,PF00397,WW,WW domain,DOC_WW_Pin1_4,DGLAASQMYSPQGISANGGWQDATTPSSVTSPTEGPGSVHSDTS,True,44
14813,IKZF2,CDC7,PF00069,Pkinase,Protein kinase domain,MOD_Plk_1,VPPMEDCKEQEPIMDNNISLVPFERPAVIEKLTGNMGKRKSSTPQKFV,True,48
24111,NFIX,MLH1,PF16413,Mlh1_C,DNA mismatch repair protein Mlh1 C-terminus,LIG_MLH1_MIPbox_1,SPRATASALHFPSTSIIQQSSPYFTHPTIRYHHHHGQDSLKEFVQF...,True,58
16837,ZNF263,CLK3,PF00069,Pkinase,Protein kinase domain,MOD_CK1_1,EKPYKCTLCGENFSHRSNLIRHQRIHTGEKPYTCHECGDSFSHSSN...,True,83
19551,ZNF263,CLK2,PF00069,Pkinase,Protein kinase domain,MOD_NEK2_1,EKPYKCTLCGENFSHRSNLIRHQRIHTGEKPYTCHECGDSFSHSSN...,True,83


In [92]:
# most specific regex
slim_ppis['motif_probability'] = slim_ppis['ELM identifier'].map(elm.set_index('ELMIdentifier')['Probability'])
(slim_ppis.loc[slim_ppis['slim_match'], :]
          .sort_values('motif_probability')
          .drop_duplicates(['ad_gene_symbol', 'db_gene_symbol', 'pfam_accession'])
          .head(30))

Unnamed: 0,ad_gene_symbol,db_gene_symbol,pfam_accession,domain_name,domain_description,ELM identifier,aa_seq_isr,slim_match,isr_length,motif_probability
22622,CREBZF,HCFC1,PF13415,Kelch_3,"Galactose oxidase, central domain",LIG_HCF-1_HBM_1,MRHSLTKLLAASGSNSPTRSESPEPAATCSLPSDLTRAAAGEEETA...,True,354,5.1e-05
24111,NFIX,MLH1,PF16413,Mlh1_C,DNA mismatch repair protein Mlh1 C-terminus,LIG_MLH1_MIPbox_1,SPRATASALHFPSTSIIQQSSPYFTHPTIRYHHHHGQDSLKEFVQF...,True,58,6.2e-05
8266,MEIS2,IL16,PF00595,PDZ,PDZ domain (Also known as DHR or GLGF),LIG_PDZ_Class_2,GLQSMPGDYVSQGGPMGMSMAQPSYTPPQMTPHPTQLRHGPPMHSY...,True,94,7.9e-05
7537,MEIS2,LNX1,PF00595,PDZ,PDZ domain (Also known as DHR or GLGF),LIG_PDZ_Class_2,GLQSMPGDYVSQGGPMGMSMAQPSYTPPQMTPHPTQLRHGPPMHSY...,True,94,7.9e-05
24172,SOX5,CTBP2,PF00389,2-Hacid_dh,"D-isomer specific 2-hydroxyacid dehydrogenase,...",LIG_CtBP_PxDLS_1,MSVMSSKRPASPYGEADGEVAMVTSRQKVEEEESDGLPAFHLPLHV...,True,378,0.000117
23823,FOSB,HOMER3,PF00568,WH1,WH1 domain,LIG_EVH1_1,VRDLPGSAPAKEDGFSWLLPPPPPPPLPFQTSQDAPPNLTASLFTH...,True,100,0.000124
18719,ATF2,MAPK9,PF00069,Pkinase,Protein kinase domain,DOC_MAPK_NFAT4_5,RFTNEDHLAVHKHKHEMTLKFGPARNDSVIVAD,True,33,0.000163
18739,CREB5,MAPK9,PF00069,Pkinase,Protein kinase domain,DOC_MAPK_NFAT4_5,MNLEQERPFVCSAPGCSQRFPTEDHLMIHRHKHEMTLKFPSIKTDN...,True,148,0.000163
11833,ZBTB48,WDR5,PF00400,WD40,"WD domain, G-beta repeat",DEG_APCC_KENBOX_2,PPRPLEAEGAQLQGGSNEWEVVVQVEDDGDGDYMSEPEAVLTRRKS...,True,475,0.000184
6817,GATA1,ARMC7,PF00514,Arm,Armadillo/beta-catenin-like repeat,TRG_NLS_MonoCore_2,HQVNRPLTMRKDGIQTRNRKASGKGKKKRGSSLGGTGAAEGPAGGF...,True,113,0.000234


In [98]:
slim_ppis.loc[slim_ppis['slim_match'] & (slim_ppis['ad_gene_symbol'] == 'STAT3'), :]

Unnamed: 0,ad_gene_symbol,db_gene_symbol,pfam_accession,domain_name,domain_description,ELM identifier,aa_seq_isr,slim_match,isr_length,motif_probability
3648,STAT3,BLK,PF00069,Pkinase,Protein kinase domain,MOD_GSK3_1,NNGEGAEPSAGGQFESLTFDMELTSECATSP,True,31,0.026787
3735,STAT3,BLK,PF00069,Pkinase,Protein kinase domain,MOD_Plk_2-3,NNGEGAEPSAGGQFESLTFDMELTSECATSP,True,31,0.002175
4418,STAT3,TXK,PF00069,Pkinase,Protein kinase domain,MOD_CK1_1,MAQWNQLQQLDTRYLEQLHQLYSDSFPMELRQFLAPWIESQDWAYA...,True,98,0.017041
4613,STAT3,TXK,PF00069,Pkinase,Protein kinase domain,MOD_NEK2_1,MAQWNQLQQLDTRYLEQLHQLYSDSFPMELRQFLAPWIESQDWAYA...,True,98,0.009798
4795,STAT3,TXK,PF00069,Pkinase,Protein kinase domain,MOD_Plk_4,MAQWNQLQQLDTRYLEQLHQLYSDSFPMELRQFLAPWIESQDWAYA...,True,98,0.006019
4899,STAT3,TXK,PF00017,SH2,SH2 domain,LIG_SH2_STAT3,MAQWNQLQQLDTRYLEQLHQLYSDSFPMELRQFLAPWIESQDWAYA...,True,98,0.000797
4912,STAT3,TXK,PF00017,SH2,SH2 domain,LIG_SH2_STAT5,MAQWNQLQQLDTRYLEQLHQLYSDSFPMELRQFLAPWIESQDWAYA...,True,98,0.003296
5003,STAT3,TXK,PF00017,SH2,SH2 domain,LIG_SH2_STAP1,MAQWNQLQQLDTRYLEQLHQLYSDSFPMELRQFLAPWIESQDWAYA...,True,98,0.001026
15502,STAT3,BMX,PF00069,Pkinase,Protein kinase domain,MOD_CK1_1,MAQWNQLQQLDTRYLEQLHQLYSDSFPMELRQFLAPWIESQDWAYA...,True,98,0.017041
15547,STAT3,BMX,PF00069,Pkinase,Protein kinase domain,MOD_NEK2_1,MAQWNQLQQLDTRYLEQLHQLYSDSFPMELRQFLAPWIESQDWAYA...,True,98,0.009798


In [96]:
#ppi.loc[ppi['ad_gene_symbol'] == 'STAT3', :]