In [4]:
#%%appyter init
from appyter import magic
# Initialise the appyter magics with a callable that returns this notebook's
# global namespace. `globals` is bound as a default argument and invoked inside
# the lambda, so the dict returned is the one of the module defining the lambda.
# NOTE(review): presumably appyter uses it to evaluate templated cells against
# the notebook's variables — confirm against the appyter documentation.
magic.init(lambda _=globals: _())

In [5]:
%%appyter markdown

# Gene Expression by Tissue
This appyter takes the input of a human gene and displays its expression across human cells and tissues 
utilizing a variety of processed datasets from healthy tissues. If the gene is not contained in one of the datasets, a plot will not be produced for that resource.


# Gene Expression by Tissue
This appyter takes the input of a human gene and displays its expression across human cells and tissues 
utilizing a variety of datasets from healthy tissues. If the gene is not contained in one of the datasets, a plot will not be produced for that resource.

In [6]:
%%appyter hide_code



{# Declares the 'primary' section of the input form; the other fields in this
   cell attach to it through section='primary'. Title typo "Protien" fixed. #}
{% do SectionField(
    name='primary',
    title='Gene and Protein Expression across Human Cells and Tissues',
    img='gene-expr.png'
) %}

{# Descriptive text attached to the 'primary' section of the input form. #}
{% do DescriptionField(
    name='data_file_description',
    text='''
    This appyter takes the input of a human gene and displays its expression across human cells and tissues 
    utilizing a variety of processed datasets from healthy tissues.
    If the gene is not contained in one of the datasets, a plot will not be produced for that resource.''',
    section='primary',
) %}



{# Form input for the gene of interest, bound to the template variable `gene`
   (rendered later as `{{ gene }}`). Defaults to 'A1CF'.
   NOTE(review): file_path presumably supplies the autocomplete choices — confirm. #}
{% set gene = AutocompleteField(name = 'gene',
                                label = 'Human gene symbol',
                                default = 'A1CF',
                                description = 'Enter the gene of interest',
                                file_path = 'https://raw.githubusercontent.com/giacomomarino/SearchProject_2/master/genes.json',
                                section = 'primary'
)%}


In [7]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import s3fs
from IPython.display import HTML, display, Markdown
from maayanlab_bioinformatics.harmonization.ncbi_genes import ncbi_genes_lookup
from maayanlab_bioinformatics.harmonization.transcripts import transcripts_to_genes
# Build the identifier resolver once. Later cells call `lookup(<identifier>)`
# and use the returned value as the gene key for grouping and matching.
# NOTE(review): presumably maps identifiers/synonyms to official NCBI gene
# symbols and returns None for unknown identifiers — confirm.
lookup = ncbi_genes_lookup()


In [8]:
%%appyter code_exec
# Bind the value of the `gene` form field to a Python variable used by the cells below.
gene = {{ gene }}

```python

gene = 'A1CF'
```

In [56]:
%%appyter markdown

## Load in normal gene expression
Utilize processed datasets containing gene expression by cell type and tissue ([GTEx](https://gtexportal.org/home/) [1] and [ARCHS4](https://maayanlab.cloud/archs4/) [2])


[1] Lonsdale, John, et al. "The genotype-tissue expression (GTEx) project." Nature genetics 45.6 (2013): 580-585. https://doi.org/10.1038/ng.2653
        
[2] Lachmann A, Torre D, Keenan AB, Jagodnik KM, Lee HJ, Wang L, Silverstein MC, Ma'ayan A. Massive mining of publicly available RNA-seq data from human and mouse. Nature Communications 9. Article number: 1366 (2018), https://doi.org/10.1038/s41467-018-03751-6


## Load in normal gene expression
Utilize processed datasets containing gene expression by cell type and tissue ([GTEx](https://gtexportal.org/home/) [1] and [ARCHS4](https://maayanlab.cloud/archs4/) [2])


[1] Lonsdale, John, et al. "The genotype-tissue expression (GTEx) project." Nature genetics 45.6 (2013): 580-585. https://doi.org/10.1038/ng.2653
        
[2] Lachmann A, Torre D, Keenan AB, Jagodnik KM, Lee HJ, Wang L, Silverstein MC, Ma'ayan A. Massive mining of publicly available RNA-seq data from human and mouse. Nature Communications 9. Article number: 1366 (2018), https://doi.org/10.1038/s41467-018-03751-6

In [10]:
# GTEx summary statistics, read with a two-level index (identifier, statistic).
df_bg_stats_gtex = pd.read_csv(
    "s3://storage/Tumor_Gene_Target_Screener/gtex-gene-stats.tsv",
    sep='\t',
    index_col=[0, 1],
    storage_options={
        "client_kwargs": {"endpoint_url": "https://appyters.maayanlab.cloud"},
        "anon": True,
    },
)
# One row per identifier, with the statistic level moved into the columns.
gtex_wide = df_bg_stats_gtex.unstack()
# Resolve each identifier via `lookup`, ignoring anything after the first '.'.
df_bg_genes_gtex = gtex_wide.index.map(lambda raw_id: lookup(raw_id.partition('.')[0]))
# Identifiers resolving to the same gene are merged by median; back to long form.
df_bg_stats_gtex = gtex_wide.groupby(df_bg_genes_gtex, observed=True).median().stack()
del gtex_wide  # only needed transiently; do not keep the wide frame alive
# Quartile rows only, one row per gene.
df_bg_expr_gtex = df_bg_stats_gtex.loc[pd.IndexSlice[:, ['25%', '50%', '75%']], :].unstack()

In [11]:
# ARCHS4 "anatomy" summary statistics, read with a two-level index (identifier, statistic).
df_bg_stats_archs4 = pd.read_csv(
    "s3://storage/Tumor_Gene_Target_Screener/archs4-gene-anatomy-stats.tsv",
    sep='\t',
    index_col=[0, 1],
    storage_options={
        "client_kwargs": {"endpoint_url": "https://appyters.maayanlab.cloud"},
        "anon": True,
    },
)
# One row per identifier, with the statistic level moved into the columns.
archs4_wide = df_bg_stats_archs4.unstack()
# Resolve each identifier via `lookup`, ignoring anything after the first '.'.
df_bg_genes_archs4 = archs4_wide.index.map(lambda raw_id: lookup(raw_id.partition('.')[0]))
# Identifiers resolving to the same gene are merged by median; back to long form.
df_bg_stats_archs4 = archs4_wide.groupby(df_bg_genes_archs4, observed=True).median().stack()
del archs4_wide  # only needed transiently; do not keep the wide frame alive
# Quartile rows only, one row per gene.
df_bg_expr_archs4 = df_bg_stats_archs4.loc[pd.IndexSlice[:, ['25%', '50%', '75%']], :].unstack()

In [12]:
# ARCHS4 "extra" summary statistics, read with a two-level index (identifier, statistic).
df_bg_stats_archs4ext = pd.read_csv(
    "s3://storage/Tumor_Gene_Target_Screener/archs4-gene-extra-stats.tsv",
    sep='\t',
    index_col=[0, 1],
    storage_options={
        "client_kwargs": {"endpoint_url": "https://appyters.maayanlab.cloud"},
        "anon": True,
    },
)
# One row per identifier, with the statistic level moved into the columns.
archs4ext_wide = df_bg_stats_archs4ext.unstack()
# Resolve each identifier via `lookup`, ignoring anything after the first '.'.
df_bg_genes_archs4ext = archs4ext_wide.index.map(lambda raw_id: lookup(raw_id.partition('.')[0]))
# Identifiers resolving to the same gene are merged by median; back to long form.
df_bg_stats_archs4ext = archs4ext_wide.groupby(df_bg_genes_archs4ext, observed=True).median().stack()
del archs4ext_wide  # only needed transiently; do not keep the wide frame alive
# Quartile rows only, one row per gene.
df_bg_expr_archs4ext = df_bg_stats_archs4ext.loc[pd.IndexSlice[:, ['25%', '50%', '75%']], :].unstack()

In [13]:
# Show which RNA-seq background datasets contain the selected gene.
# The table holds only the gene string and booleans, and `gene` originates from
# the input form, so pandas' default HTML escaping is left enabled (the cell
# previously passed escape=False, which would render markup in `gene` verbatim).
available = pd.DataFrame({
    'Selected Gene': [gene],
    'in GTEx - Gene': [gene in df_bg_expr_gtex.index],
    'in ARCHS4 Gene - Anatomy': [gene in df_bg_expr_archs4.index],
    'in ARCHS4 Gene - Extra': [gene in df_bg_expr_archs4ext.index],
})
display(HTML(available.to_html(notebook=True)))

Unnamed: 0,Selected Gene,in GTEx - Gene,in ARCHS4 Gene - Anatomy,in ARCHS4 Gene - Extra
0,A1CF,True,True,True


In [14]:
%%appyter markdown

## Load plots based on the selected gene


## Load plots based on the selected gene

In [80]:
def _background_box_figure(stats, gene_symbol, title, height):
    """Build a horizontal box plot for one gene from precomputed statistics.

    Parameters
    ----------
    stats : pandas.DataFrame
        Frame indexed by (gene, statistic) that provides the rows 'min',
        '25%', '50%', '75%', 'max', 'mean' and 'std' for `gene_symbol`.
        One box is drawn per column of `stats`.
    gene_symbol : str
        First-level index key selecting the gene.
    title : str
        Figure title.
    height : int
        Figure height in pixels.

    Returns
    -------
    plotly.graph_objects.Figure
    """
    def stat(label):
        # Row of `stats` holding one statistic for the gene (one value per column).
        return stats.loc[(gene_symbol, label)]

    q1, q3 = stat('25%'), stat('75%')
    iqr = q3 - q1
    fig = go.Figure()
    fig.add_trace(go.Box(
        # Whiskers extend 1.5*IQR beyond the quartiles, clipped to the observed min/max.
        lowerfence=np.maximum(stat('min'), q1 - (1.5 * iqr)),
        q1=q1,
        median=stat('50%'),
        q3=q3,
        upperfence=np.minimum(stat('max'), q3 + (1.5 * iqr)),
        mean=stat('mean'),
        sd=stat('std'),
        y=stats.columns,
        name='Background',
        orientation='h',
    ))
    fig.update_layout(title=title, height=height)
    return fig


c = gene

in_gtex = c in df_bg_expr_gtex.index.values
in_archs4ext = c in df_bg_expr_archs4ext.index.values

# Show the gene heading whenever at least one plot follows; previously it was
# only shown when the gene was present in GTEx.
if in_gtex or in_archs4ext:
    display(Markdown(f"### {c}"))

if in_gtex:
    fig = _background_box_figure(df_bg_stats_gtex, c, c + " (RNA-seq) GTEx", 1200)
    fig.show()

if in_archs4ext:
    # The ARCHS4 "extra" table has many more columns than GTEx, hence the taller figure.
    fig = _background_box_figure(df_bg_stats_archs4ext, c, c + " (RNA-seq) ARCHS4", 5000)
    fig.show()

### A1CF

In [57]:
%%appyter markdown

## Proteomics Expression Levels

Proteomics data were obtained from the [Human Protein Atlas](https://www.proteinatlas.org/about/download) (HPA) [3] with IHC-based expression profiling, the [Human Proteome Map](https://www.humanproteomemap.org/download.php) (HPM) [4] with MS-based expression quantification, and a [GTEx proteome project](https://doi.org/10.1016/j.cell.2020.08.036) [5] using TMT MS. 
These datasets contain protein expression levels detected in normal tissues and cell types. The selected gene may not be present in every dataset (the table below shows which proteomics datasets contain it). Plots show expression levels (HPA), average spectral counts (HPM), or log-transformed relative abundance (GTEx) by tissue/cell type for the selected gene; HPA expression levels whose [reliability score](https://www.proteinatlas.org/about/assays+annotation) is "Uncertain" are excluded.


[3] Uhlén M et al. "Tissue-based map of the human proteome." Science (New York, N.Y.) vol. 347,6220 (2015): 1260419. https://doi.org/10.1126/science.1260419

[4] Kim, Min-Sik et al. “A draft map of the human proteome.” Nature vol. 509,7502 (2014): 575-81. https://doi.org/10.1038/nature13302

[5] Jiang, Lihua et al. “A Quantitative Proteome Map of the Human Body.” Cell vol. 183,1 (2020): 269-283.e19. https://doi.org/10.1016/j.cell.2020.08.036


## Proteomics Expression Levels

Proteomics data were obtained from the [Human Protein Atlas](https://www.proteinatlas.org/about/download) (HPA) [3] with IHC-based expression profiling, the [Human Proteome Map](https://www.humanproteomemap.org/download.php) (HPM) [4] with MS-based expression quantification, and a [GTEx proteome project](https://doi.org/10.1016/j.cell.2020.08.036) [5] using TMT MS. 
These datasets contain protein expression levels detected in normal tissues and cell types. The selected gene may not be present in every dataset (the table below shows which proteomics datasets contain it). Plots show expression levels (HPA), average spectral counts (HPM), or log-transformed relative abundance (GTEx) by tissue/cell type for the selected gene; HPA expression levels whose [reliability score](https://www.proteinatlas.org/about/assays+annotation) is "Uncertain" are excluded.


[3] Uhlén M et al. "Tissue-based map of the human proteome." Science (New York, N.Y.) vol. 347,6220 (2015): 1260419. https://doi.org/10.1126/science.1260419

[4] Kim, Min-Sik et al. “A draft map of the human proteome.” Nature vol. 509,7502 (2014): 575-81. https://doi.org/10.1038/nature13302

[5] Jiang, Lihua et al. “A Quantitative Proteome Map of the Human Body.” Cell vol. 183,1 (2020): 269-283.e19. https://doi.org/10.1016/j.cell.2020.08.036

In [17]:
# Human Proteome Map table, indexed by its first column.
hpm = pd.read_csv(
    "s3://storage/Tumor_Gene_Target_Screener/hpm.tsv",
    sep='\t',
    header=0,
    index_col=0,
    storage_options={
        "client_kwargs": {"endpoint_url": "https://appyters.maayanlab.cloud"},
        "anon": True,
    },
)
# Human Protein Atlas table, indexed by its second column.
hpa = pd.read_csv(
    "s3://storage/Tumor_Gene_Target_Screener/hpa.tsv",
    sep='\t',
    header=0,
    index_col=1,
    storage_options={
        "client_kwargs": {"endpoint_url": "https://appyters.maayanlab.cloud"},
        "anon": True,
    },
)
# GTEx proteomics table, kept with the default integer index.
gtexp = pd.read_csv(
    "s3://storage/Tumor_Gene_Target_Screener/gtex_proteomics.tsv",
    sep='\t',
    header=0,
    storage_options={
        "client_kwargs": {"endpoint_url": "https://appyters.maayanlab.cloud"},
        "anon": True,
    },
)
# Resolve each 'gene.id' value through `lookup` and store the result as 'Name'.
gtexp['Name'] = gtexp['gene.id'].map(lambda gene_id: lookup(gene_id))

In [18]:
# Show which proteomics datasets contain the selected gene.
# The table holds only the gene string and booleans, and `gene` originates from
# the input form, so pandas' default HTML escaping is left enabled (the cell
# previously passed escape=False, which would render markup in `gene` verbatim).
available = pd.DataFrame({
    'Selected Gene': [gene],
    'in HPM': [gene in hpm.index],
    'in HPA': [gene in hpa.index],
    'in GTEx Proteomics': [gene in gtexp.Name.values],
})
display(HTML(available.to_html(notebook=True)))

Unnamed: 0,Selected Gene,in HPM,in HPA,in GTEx Proteomics
0,A1CF,True,True,True


In [19]:
# Re-bind the selected gene here so this cell does not depend on `c` leaking
# from an earlier cell.
c = gene

# Derive the plotting frame instead of mutating `hpa` in place: the previous
# in-place update appended the cell type to 'Tissue' again on every re-run of
# this cell. Rows whose reliability is "Uncertain" are dropped.
hpa_reliable = hpa.assign(Tissue=hpa["Tissue"] + ", " + hpa["Cell.type"])
hpa_reliable = hpa_reliable[hpa_reliable['Reliability'] != "Uncertain"]

# Missing specificity labels are shown as the literal 'NA' in the hover text
# (this assignment is idempotent, so re-running the cell is harmless).
gtexp['tissue_specificity'] = gtexp.tissue_specificity.fillna('NA')

if c in gtexp.Name.values:
    # GTEx proteomics: individual values as a strip plot with a box overlaid per tissue.
    d = gtexp[gtexp['Name'] == c]
    fig = px.strip(d, y="tissue", x="value",
                   orientation='h',
                   stripmode="overlay",
                   hover_data=["tissue_specificity"],
                   height=30*d['tissue'].nunique())
    fig.add_trace(go.Box(x=d['value'],
                         y=d['tissue'],
                         orientation='h',
                         marker=dict(color='#636EFA'),
                         name="n > 1"))
    fig.update_layout(title="(GTEx Proteomics)",
                      autosize=True,
                      showlegend=False)
    fig.update_xaxes(title="log2(relative abundance)")
    fig.update_yaxes(title=None)
    fig.show()

if c in hpm.index:
    # HPM: one point per row for the gene; figure height scales with the row count.
    hpm_gene = hpm.loc[[c]]
    fig = px.scatter(hpm_gene,
                     y="Tissue", x="value",
                     height=20*hpm_gene.shape[0])
    fig.update_layout(title="(HPM)",
                      autosize=True)
    fig.update_xaxes(title="Average Spectral Counts")
    fig.update_yaxes(title=None)
    fig.show()

if c in hpa_reliable.index:
    # HPA: categorical expression level per "tissue, cell type" label.
    hpa_gene = hpa_reliable.loc[[c]]
    fig = px.scatter(hpa_gene,
                     y="Tissue", x="Level",
                     category_orders={"Level": ["Not detected", "Low", "Medium", "High"]},
                     hover_data=["Reliability"],
                     hover_name="Tissue",
                     height=20*hpa_gene.shape[0])
    fig.update_layout(title="(HPA)",
                      showlegend=False,
                      autosize=True,
                      xaxis={'tickmode':'array',
                             'tickvals':[0, 1, 2, 3],
                             'ticktext':["Not detected", "Low", "Medium", "High"]})
    fig.update_xaxes(title="Tissue Expression Level")
    fig.update_yaxes(title=None)
    fig.show()

In [78]:
# Sanity check: number of columns in the ARCHS4 "extra" quartile table.
df_bg_expr_archs4ext.shape[1]

618

In [79]:
# Sanity check: number of columns in the ARCHS4 "anatomy" quartile table.
df_bg_expr_archs4.shape[1]

81