TODO:

- change the scale and x-label on M1H
- improve the subtle splice display
- show uncloned isoforms smaller font size 
    - and format buttons better
- add downloads
- check that there are no singletons there

- fix the overlaping effector domains
- add domains to exons
- add a logo
- Make look nice on different displays
    - below a certain width the search bar disappears

In [2]:
import shutil
from pathlib import Path

from matplotlib import pyplot as plt
import pandas as pd
from jinja2 import Environment, FileSystemLoader

from data_loading import (load_y2h_isoform_data,
                          load_y1h_pdi_data,
                          load_m1h_activation_data,
                          load_annotated_TFiso1_collection,
                          load_gtex_remapped,
                          load_developmental_tissue_expression_remapped)
from plotting import (y2h_ppi_per_tf_gene_plot,
                      y1h_pdi_per_tf_gene_plot,
                      m1h_activation_per_tf_gene_plot)

In [3]:
tf_webpage_dir = Path('../website')
shared_fig_dir = tf_webpage_dir / 'media'

y2h = load_y2h_isoform_data(add_missing_data=True,
    require_at_least_one_ppi_per_isoform=False)
y1h = load_y1h_pdi_data(add_missing_data=True)
m1h = load_m1h_activation_data(add_missing_data=True)
tfs = load_annotated_TFiso1_collection()

  df = pd.concat(


reading from cache


In [20]:
# max height of top is in pixels but should be
# fraction of display
env = Environment(loader=FileSystemLoader('.'))

tf_datalist = '\n'.join('        <option value="{}"/>'.format(name) for name in sorted(tfs.keys()))

with open(tf_webpage_dir / 'index.html', 'w') as f_index:
    f_index_template = env.get_template('index_template.html')
    f_index.write(f_index_template.render(tf_gene_name_list=tf_datalist))  
with open(tf_webpage_dir / 'about.html', 'w') as f_about:
    f_about_template = env.get_template('about_template.html')
    f_about.write(f_about_template.render(tf_gene_name_list=tf_datalist))
with open(tf_webpage_dir / 'contact.html', 'w') as f_contact:
    f_contact_template = env.get_template('contact_template.html')
    f_contact.write(f_contact_template.render(tf_gene_name_list=tf_datalist))
with open(tf_webpage_dir / 'download.html', 'w') as f_dl:
    f_dl_template = env.get_template('download_template.html')
    f_dl.write(f_dl_template.render(tf_gene_name_list=tf_datalist))


template = env.get_template('gene_summary_template.html')
for tf in tfs.values():
    # TODO:
        # align the DAPI/GFP labels properly
    cond_imgs = " <div class='container'> "
    cond_imgs += " <h4 style='text-align: center;'>HEK293T cells</h4> "
    cond_imgs += " <p style='text-align: center; word-spacing: 170px;'>DAPI GFP Merge</p> "
    for iso in tf.cloned_isoforms:
        cond_imgs += " <div class='image-row'> "
        cond_imgs += f"<p class='microscopy-label'>{iso.name}</p>"
        cond_imgs += f"<img class='microscopy-image' src='../media/condensate_images/HEK293T_r1/{tf.name}/{iso.name}/dapi.jpg' onerror='disableCondensates();'>"
        cond_imgs += f"<img class='microscopy-image' src='../media/condensate_images/HEK293T_r1/{tf.name}/{iso.name}/green.jpg'>"
        cond_imgs += f"<img class='microscopy-image' src='../media/condensate_images/HEK293T_r1/{tf.name}/{iso.name}/merge.jpg'>"
        cond_imgs += ' </div> '
    cond_imgs += " <p'></p> "
    cond_imgs += " <h4 style='text-align: center;'>U2OS cells</h4> "
    cond_imgs += " <p style='text-align: center; word-spacing: 170px;'>DAPI GFP Merge</p> "
    for iso in tf.cloned_isoforms:
        cond_imgs += " <div class='image-row'> "
        cond_imgs += f"<p class='microscopy-label'>{iso.name}</p>"
        cond_imgs += f"<img class='microscopy-image' src='../media/condensate_images/U2OS_r1/{tf.name}/{iso.name}/dapi.jpg'>"
        cond_imgs += f"<img class='microscopy-image' src='../media/condensate_images/U2OS_r1/{tf.name}/{iso.name}/green.jpg'>"
        cond_imgs += f"<img class='microscopy-image' src='../media/condensate_images/U2OS_r1/{tf.name}/{iso.name}/merge.jpg'>"
        cond_imgs += ' </div> '
    cond_imgs += ' </div> '

    with open(tf_webpage_dir / 'pages/{}.html'.format(tf.name), 'w') as f:
        f.write(template.render(gene_name=tf.name,
                                ensembl_gene_id=tf.ensembl_gene_id,
                                uniprot_ac=tf.uniprot_ac,
                                tf_family=tf.tf_family,
                                tf_gene_name_list=tf_datalist,
                                condensate_images=cond_imgs,
                                ))
shutil.copyfile('gene_summary.css', tf_webpage_dir / 'gene_summary.css')

with open('molstar_template.js', 'r') as f:
    template = f.read()

for gene in tfs.values():
    exon_colors = gene._get_exon_colors()
    iso_json = '['
    for isoform in gene.cloned_isoforms:
        exon_pos = 1
        nt_diff = 0
        iso_json += '\n{name: \'' + isoform.name + '\','
        exon_boundaries_in_aa = []
        exon_colors_for_iso = []
        for exon in isoform.exons:
            n_nt_to_exon = (exon.end - exon.start) + nt_diff
            n_aa_exon = round(n_nt_to_exon / 3)
            nt_diff = {0: 0, 1: 1, 2: -1}[n_nt_to_exon % 3]
            color = exon_colors[(isoform.name, exon.start, exon.end)]
            start = exon_pos
            end = (exon_pos + n_aa_exon) - 1
            exon_boundaries_in_aa.append([start, end])
            exon_colors_for_iso.append(color)
            exon_pos += n_aa_exon
        iso_json += '\nexon_boundaries: ' + str(exon_boundaries_in_aa) + ','
        iso_json += '\nexon_colors: [' + ', '.join([f'{{r: {int(r * 255)}, g: {int(g * 255)}, b: {int(b * 255)}}}' for r, g, b in exon_colors_for_iso]) + ']'
        iso_json += '},'
    iso_json += ']'
    with open(tf_webpage_dir / 'pages/{}-molstar.js'.format(gene.name), 'w') as f:
        f.write(template.replace('cloned_isoform_json_array', iso_json))

In [6]:
for tf in tfs.values():
    fig, ax = plt.subplots(1, 1)
    has_ppis = y2h_ppi_per_tf_gene_plot(tf.name, 
                             ax=ax, 
                             data=y2h,
                             iso_order=[iso.name for iso in tf.cloned_isoforms],
                             )
    if not has_ppis:
        plt.close(plt.gcf())
        continue
    n_ppi_partners = ax.get_xlim()[1] + 0.5
    fig.set_size_inches(1 + 0.35 * n_ppi_partners, 1 + 0.35 * len(tf.cloned_isoforms))
    for fmt in ['.svg']:
        plt.savefig(shared_fig_dir / '{}_y2h-profile{}'.format(tf.name, fmt),
                    bbox_inches='tight')
    plt.close(plt.gcf())

In [8]:
for tf in tfs.values():
    fig, ax = plt.subplots(1, 1)
    fig.set_size_inches(6, 0.5 * len(tf.cloned_isoforms))
    has_act = m1h_activation_per_tf_gene_plot(tf.name,
                                    data=m1h,
                                    iso_order=[iso.name for iso in tf.cloned_isoforms],
                                    ax=ax,
                                    )
    if not has_act:
        plt.close(plt.gcf())
        continue
    for fmt in ['.svg']:
        plt.savefig(shared_fig_dir / '{}_m1h-profile{}'.format(tf.name, fmt),
                    bbox_inches='tight')
    plt.close(plt.gcf())

  plt.savefig(shared_fig_dir / '{}_m1h-profile{}'.format(tf.name, fmt),
  plt.savefig(shared_fig_dir / '{}_m1h-profile{}'.format(tf.name, fmt),
  plt.savefig(shared_fig_dir / '{}_m1h-profile{}'.format(tf.name, fmt),
  plt.savefig(shared_fig_dir / '{}_m1h-profile{}'.format(tf.name, fmt),
  plt.savefig(shared_fig_dir / '{}_m1h-profile{}'.format(tf.name, fmt),
  plt.savefig(shared_fig_dir / '{}_m1h-profile{}'.format(tf.name, fmt),
  plt.savefig(shared_fig_dir / '{}_m1h-profile{}'.format(tf.name, fmt),
  plt.savefig(shared_fig_dir / '{}_m1h-profile{}'.format(tf.name, fmt),
  plt.savefig(shared_fig_dir / '{}_m1h-profile{}'.format(tf.name, fmt),
  plt.savefig(shared_fig_dir / '{}_m1h-profile{}'.format(tf.name, fmt),
  plt.savefig(shared_fig_dir / '{}_m1h-profile{}'.format(tf.name, fmt),
  plt.savefig(shared_fig_dir / '{}_m1h-profile{}'.format(tf.name, fmt),
  plt.savefig(shared_fig_dir / '{}_m1h-profile{}'.format(tf.name, fmt),
  plt.savefig(shared_fig_dir / '{}_m1h-profile{}'.format(tf.name

In [10]:
for tf in tfs.values():
    fig, ax = plt.subplots(1, 1)
    has_pdis = y1h_pdi_per_tf_gene_plot(tf.name,
                             data=y1h,
                             ax=ax,
                             iso_order=[iso.name for iso in tf.cloned_isoforms],
                             )
    if not has_pdis:
        plt.close(plt.gcf())
        continue
    n_pdi_partners = ax.get_xlim()[1] + 0.5
    fig.set_size_inches(1 + 0.35 * n_pdi_partners, 1 + 0.35 * len(tf.cloned_isoforms))
    for fmt in ['.svg']:
        plt.savefig(shared_fig_dir / '{}_y1h-profile{}'.format(tf.name, fmt),
                    bbox_inches='tight')
    plt.close(plt.gcf())

In [10]:
for tf in tfs.values():
    fig, ax = plt.subplots(1, 1)
    fig.set_size_inches(8, 0.65 * len(tf.cloned_isoforms))
    tf.protein_diagram(ax=ax)
    for fmt in ['.svg']:
        plt.savefig(shared_fig_dir / '{}_cloned-isoforms_protein-diagram{}'.format(tf.name, fmt),
                    bbox_inches='tight')
    plt.close(plt.gcf())

  plt.savefig(shared_fig_dir / '{}_cloned-isoforms_protein-diagram{}'.format(tf.name, fmt),


In [7]:
for tf in tfs.values():
    fig, ax = plt.subplots(1, 1)
    fig.set_size_inches(8, 0.35 * len(tf.isoforms))
    tf.exon_diagram(ax=ax, show_matched_transcripts=True)
    for fmt in ['.svg']:
        plt.savefig(shared_fig_dir / '{}_cloned-plus-ensembl-isoforms_exon-diagram{}'.format(tf.name, fmt),
                    bbox_inches='tight')
    plt.close(plt.gcf())

  plt.savefig(shared_fig_dir / '{}_cloned-plus-ensembl-isoforms_exon-diagram{}'.format(tf.name, fmt),


In [8]:
for tf in tfs.values():
    fig, ax = plt.subplots(1, 1)
    fig.set_size_inches(w=8, h=0.35 * len(tf.cloned_isoforms))
    tf.exon_diagram(ax=ax, show_matched_transcripts=True, show_uncloned_isoforms=False)
    for fmt in ['.svg']:
        plt.savefig(shared_fig_dir / '{}_cloned-isoforms_exon-diagram{}'.format(tf.name, fmt),
                    bbox_inches='tight')
    plt.close(plt.gcf())

  plt.savefig(shared_fig_dir / '{}_cloned-isoforms_exon-diagram{}'.format(tf.name, fmt),


In [29]:
gtex, metadata_gtex, genes = load_gtex_remapped()
means = gtex.groupby(gtex.columns.map(metadata_gtex['body_site']), axis=1).mean()


def presentable_names(s):
    clones, ensembls = s.split(' ')
    if clones == 'noclone':
        a = 'not cloned'
    else:
        a = '/'.join([x.split('|')[0] + '-' + x.split('|')[1].split('/')[0] for x in clones.split('_')])
    if ensembls == 'nomatch':
        b = ' novel isoform'
    else:
        b = '/'.join(ensembls.split('_'))
    return a + ' – ' + b


def get_pos(s):
    try:
        if s.startswith('not cloned'):
            iso_name = s.split(' ')[-1].split('/')[0]
        else:
            iso_name = s.split(' ')[0]
        gene_name = '-'.join(iso_name.split('-')[:-1])
        for i, iso in enumerate(tfs[gene_name].isoforms):
            if iso.name == iso_name:
                return i
        raise UserWarning('couldnt find isoform')
    except:
        print(s)
        print(iso_name, gene_name)
        raise


means.index = means.index.map(presentable_names)
genes.index = genes.index.map(presentable_names)
means = means.loc[sorted(means.index.values, key=get_pos), :]



def tissue_expression_plot(gene_name, means=means, genes=genes):
    fig, axes = plt.subplots(2, 1, sharex=True)
    fig.set_size_inches(12, 6)
    ### bar chart ###
    (means.loc[genes == gene_name, :]
          .T
          .plot.bar(ax=axes[0],
                    legend=False,
                    width=0.7))
    ### percentages ###
    raw_means = 2 ** means.loc[genes == gene_name] - 1.
    (raw_means.div(raw_means.sum(axis=0))
              .T.plot.bar(ax=axes[1], 
                          stacked=True,
                          legend=False))
    axes[0].set_ylabel('Mean log2(TPM + 1)')
    axes[1].set_ylabel('Fraction for each isoform')
    axes[1].set_yticklabels(['{:.0%}'.format(t) for t in axes[1].get_yticks()])
    axes[1].legend(loc='lower left', bbox_to_anchor=(1, 0))
    plt.subplots_adjust(hspace=0.05)
    plt.savefig(shared_fig_dir / (gene_name + '_tissue-expression.svg'),
                bbox_inches='tight')

for tf in tfs.values():
    tissue_expression_plot(tf.name)
    plt.close(plt.gcf())

reading from cache


  axes[1].set_yticklabels(['{:.0%}'.format(t) for t in axes[1].get_yticks()])
  plt.subplots_adjust(hspace=0.05)
  plt.savefig(shared_fig_dir / (gene_name + '_tissue-expression.svg'),


In [33]:
df_dev, metadata_dev, genes = load_developmental_tissue_expression_remapped()

rename_dev_stage = {'8 week post conception,embryo': '08',
'11 week post conception,late embryo': '11',
'embryo,7 week post conception': '07',
'infant': 'infant',
'10 week post conception,late embryo': '10',
'young adult': 'young adult',
'13 week post conception,late embryo': '13',
'16 week post conception,late embryo': '16',
'4 week post conception,embryo': '04',
'neonate': 'neonate',
'19 week post conception,late embryo': '19',
'9 week post conception,late embryo': '09',
'adolescent': 'adolescent',
'5 week post conception,embryo': '05',
'embryo,6 week post conception': '06',
'12 week post conception,late embryo': '12',
'18 week post conception,late embryo': '18',
'toddler': 'toddler',
'elderly': 'elderly',
'middle adult': 'adult',
'school age child': 'child'}
metadata_dev['dev_stage'] = metadata_dev['Developmental_Stage'].map(rename_dev_stage)
means = (df_dev.groupby(df_dev.columns.map(metadata_dev['organism_part'] + ' ' + metadata_dev['dev_stage']), axis=1)
           .mean())
means.index = means.index.map(presentable_names)
genes.index = genes.index.map(presentable_names)
means = means.loc[sorted(means.index.values, key=get_pos), :]

def developmental_tissue_expression_plot(gene_name, means=means, genes=genes):
    fig, axes = plt.subplots(2, 1, sharex=True)
    fig.set_size_inches(48, 6)
    ### bar chart ###
    (means.loc[genes == gene_name, :]
          .T
          .plot.bar(ax=axes[0],
                    legend=False,
                    width=0.7))
    ### percentages ###
    raw_means = 2 ** means.loc[genes == gene_name] - 1.
    (raw_means.div(raw_means.sum(axis=0))
              .T.plot.bar(ax=axes[1], 
                          stacked=True,
                          legend=False))
    axes[0].set_ylabel('Mean log2(TPM + 1)')
    axes[1].set_ylabel('Fraction for each isoform')
    axes[1].set_yticklabels(['{:.0%}'.format(t) for t in axes[1].get_yticks()])
    axes[1].legend(loc='lower left', bbox_to_anchor=(1, 0))
    axes[0].axhline(y=1, color='grey', linewidth=0.5)
    plt.subplots_adjust(hspace=0.05)
    plt.savefig(shared_fig_dir / (gene_name + '_developmental-tissue-expression.svg'),
                bbox_inches='tight')

 
for tf in tfs.values():
    developmental_tissue_expression_plot(tf.name)
    plt.close(plt.gcf())

reading from cache


  axes[1].set_yticklabels(['{:.0%}'.format(t) for t in axes[1].get_yticks()])
  plt.subplots_adjust(hspace=0.05)


In [11]:
df = pd.read_csv('../data/external/clinvar_variant_summary_2022-04-03.txt',
                 sep='\t')
sig_cats = {'Pathogenic', 
            'Likely pathogenic',
            'Pathogenic/Likely pathogenic'}
df = df.loc[(df['Assembly'] == 'GRCh38') &
            df['ClinicalSignificance'].apply(lambda x: any(y in sig_cats for y in x.split(';'))) &
            (df['Type'] == 'single nucleotide variant'),
            :]
df['aa_change'] = df['Name'].str.extract('.*\(p\.(.+)\)', expand=False)
df = df.loc[~df['aa_change'].isnull(), :]
df = df.loc[~df['aa_change'].str.contains('='), :]
df['nt_change'] = df['ReferenceAlleleVCF'] + '>' + df['AlternateAlleleVCF']
for tf in tfs.values():
    for _i, mut in df.loc[df['GeneSymbol'] == tf.name, :].iterrows():
        if 'chr' + str(mut['Chromosome']) != tf.chrom:
            msg = 'Inconsistent chromosomes for {}\nUs: {}\nclinvar: {}'.format(tf.name, tf.chrom, 'chr' + str(mut['Chromosome']))
            raise UserWarning(msg)
        tf.add_pathogenic_coding_SNP(position=mut['Start'] - 1,  # switch from 1-based to 0-based indexing
                                    nt_change=mut['nt_change'],
                                    aa_change=mut['aa_change'],
                                    disease=mut['PhenotypeList'],
                                    mutation_id=mut['VariationID'])

for tf in tfs.values():
    if len(tf.pathogenic_coding_SNPs) == 0:
        continue
    fig, ax = plt.subplots(1, 1)
    fig.set_size_inches(8, 0.5 * len(tf.isoforms))
    tf.exon_diagram(ax=ax, show_matched_transcripts=True, show_pathogenic_variants=True)
    for fmt in ['.svg']:
        plt.savefig(shared_fig_dir / '{}_clinvar-pathogenic-SNPs_exon-diagram{}'.format(tf.name, fmt),
                    bbox_inches='tight')
    plt.close(plt.gcf())

  df = pd.read_csv('../data/external/clinvar_variant_summary_2022-04-03.txt',
  plt.savefig(shared_fig_dir / '{}_clinvar-pathogenic-SNPs_exon-diagram{}'.format(tf.name, fmt),
