In [1]:
import pandas as pd
import numpy as np

from data_loading import (load_y2h_isoform_data, 
                          load_y1h_pdi_data, 
                          load_valid_isoform_clones)


# --- Edge table: positive PPI (Y2H) and PDI (Y1H) calls for valid clones ---
clones = load_valid_isoform_clones()
ppi = load_y2h_isoform_data()
pdi = load_y1h_pdi_data()

valid_clone_accs = clones['clone_acc'].values

# Reshape the PDI table from wide to long: every column other than the two id
# columns becomes one (partner, result) row. `value_vars` is left at its
# default (all non-id columns) so this does not depend on the id columns
# being the first two columns of the loaded table.
pdi_id_cols = ['gene_symbol', 'clone_acc']
pdi = pdi.melt(id_vars=pdi_id_cols,
               value_name='result',
               var_name='partner')

# Keep only positive calls for valid clones. The explicit `== True` is kept
# on purpose: it yields a plain boolean mask even if the column holds
# missing values.
pdi = (pdi.loc[(pdi['result'] == True)
               & pdi['clone_acc'].isin(valid_clone_accs),
               pdi_id_cols + ['partner']]
          .assign(interaction_type='PDI',
                  partner_type='DNA'))

# Same filter for the PPI table, renamed to the column names shared with the
# PDI edges (the AD clone is the TF isoform, the DB gene is the partner).
ppi = (ppi.loc[(ppi['Y2H_result'] == True)
               & ppi['ad_clone_acc'].isin(valid_clone_accs),
               ['ad_gene_symbol', 'ad_clone_acc', 'db_gene_symbol']]
          .rename(columns={
              'ad_gene_symbol': 'gene_symbol',
              'ad_clone_acc': 'clone_acc',
              'db_gene_symbol': 'partner',
          })
          .assign(interaction_type='PPI',
                  partner_type='Protein'))

# PPI rows first, then PDI rows, written as one tab-separated edge list.
pd.concat([ppi, pdi], axis=0).to_csv('../output/combined-network-for-cytoscape_edges.tsv',
                                     index=False,
                                     sep='\t')
# Node table, written separately from the edges so that nodes without any
# edge of their own are still listed: every clone of a gene that appears in
# the PPI or the PDI edge table is included.
gene_in_ppi = clones['gene_symbol'].isin(ppi['gene_symbol'].unique())
gene_in_pdi = clones['gene_symbol'].isin(pdi['gene_symbol'].unique())

tf_iso_nodes = clones.loc[gene_in_ppi | gene_in_pdi, ['gene_symbol', 'clone_acc']]
tf_iso_nodes = tf_iso_nodes.rename(columns={'clone_acc': 'node_ID'})
tf_iso_nodes = tf_iso_nodes.assign(node_type='TF isoform')

# Partner nodes (protein and DNA) are identified by the partner name and
# carry no gene symbol of their own.
partner_to_node_columns = {'partner': 'node_ID',
                           'partner_type': 'node_type'}

ppi_partner_nodes = ppi[['partner', 'partner_type']].drop_duplicates()
ppi_partner_nodes = ppi_partner_nodes.rename(columns=partner_to_node_columns)
ppi_partner_nodes = ppi_partner_nodes.assign(gene_symbol=np.nan)

dna_nodes = pdi[['partner', 'partner_type']].drop_duplicates()
dna_nodes = dna_nodes.rename(columns=partner_to_node_columns)
dna_nodes = dna_nodes.assign(gene_symbol=np.nan)

all_nodes = pd.concat([tf_iso_nodes, ppi_partner_nodes, dna_nodes], axis=0)
all_nodes.to_csv('../output/combined-network-for-cytoscape_nodes.tsv',
                 index=False,
                 sep='\t')

In [2]:
# inspect hubs: the 20 TF genes with the most rows in the PPI edge table
ppi_edges_per_gene = ppi['gene_symbol'].value_counts()
ppi_edges_per_gene.head(20)

TCF4      973
TCF12     211
SOX6      118
HMBOX1     90
IKZF2      70
ZNF451     67
ZBTB44     54
MEOX1      52
MEIS2      47
NFYA       42
FOXP2      40
ZNF438     29
PATZ1      29
TP63       28
TFCP2      28
ZNF212     28
DDIT3      26
ESRRG      24
ESRRA      24
ZBTB16     22
Name: gene_symbol, dtype: int64

In [3]:
# PDI hubs: the 10 TF genes with the most rows in the PDI edge table
pdi_edges_per_gene = pdi['gene_symbol'].value_counts()
pdi_edges_per_gene.head(10)

GRHL3     258
MAX        75
TCF4       67
RXRG       48
ZIC3       43
TFAP2A     32
EBF3       32
TBX5       27
PKNOX1     20
PRRX1      20
Name: gene_symbol, dtype: int64

In [6]:
# duplicate partner nodes to make a cleaner display

def split_shared_partners(edges, partner_col='partner', gene_col='gene_symbol'):
    """Give every TF gene its own copy of each partner it shares with others.

    A partner linked to two or more distinct genes is renamed, per gene, to
    ``<partner>_<k>``, where ``k`` is the 1-based position of the gene in
    order of first appearance among that partner's rows. Partners linked to
    a single gene keep their name.

    Parameters
    ----------
    edges : pandas.DataFrame
        Edge table containing ``partner_col`` and ``gene_col``. Partner
        values must be strings, since a suffix is appended to them.
    partner_col, gene_col : str
        Column names of the partner and of the TF gene.

    Returns
    -------
    pandas.DataFrame
        A copy of ``edges`` with the shared partners renamed; the input
        frame is left untouched.

    Notes
    -----
    Rows with a missing gene are never renamed, and a missing gene does not
    use up a suffix number.
    """
    if edges.empty:
        return edges.copy()

    genes_by_partner = edges.groupby(partner_col)[gene_col]
    # Number of distinct genes attached to the partner of each row.
    n_genes = genes_by_partner.transform('nunique')
    # factorize() numbers values in order of first appearance, which is the
    # order the suffixes are handed out in.
    copy_number = genes_by_partner.transform(
        lambda genes: pd.factorize(genes)[0] + 1)

    needs_copy = (n_genes > 1) & edges[gene_col].notna()
    renamed = edges.copy()
    renamed.loc[needs_copy, partner_col] = (
        renamed.loc[needs_copy, partner_col]
        + '_'
        + copy_number[needs_copy].astype(int).astype(str))
    return renamed


# `ppi` and `pdi` are rebound on purpose: from here on they hold the
# per-gene partner copies.
ppi = split_shared_partners(ppi)
pdi = split_shared_partners(pdi)

# NOTE(review): only the edge list is written for the duplicated network;
# confirm whether a matching node table is needed as well.
pd.concat([ppi, pdi], axis=0).to_csv('../output/combined-network-for-cytoscape_duplicate-nodes_edges.tsv',
                                     index=False,
                                     sep='\t')