In [None]:
#%%appyter init
# Bootstrap appyter for this notebook. The lambda's default argument captures the
# builtin `globals`, so calling the lambda returns the global namespace of the
# module in which it is defined, i.e. this notebook.
# NOTE(review): presumably magic.init uses that namespace to set up the
# `%%appyter` cell magics used in later cells - confirm against the appyter docs.
from appyter import magic
magic.init(lambda _=globals: _())

# Long Non-coding RNA (lncRNA) Appyter 

Using lncRNA-gene co-expression, this Appyter can predict the biological functions of ~5000 lncRNAs.

In [None]:
import pandas as pd 
import numpy as np
import h5py as h5
from plotly.offline import iplot
import plotly.graph_objs as go
from plotly.subplots import make_subplots
from IPython.display import display,FileLink, Markdown
import ssl
import os
import urllib.request
import s3fs

In [None]:
%%appyter hide_code

{# Input form: one section ('section1') holding a single free-text field named 'gene_symbol' (default 'HOTAIR'); the field object is bound to the template variable `query`. #}
{% do SectionField(name='section1', title = '1. Input a Gene Symbol or Ensembl ID', subtitle = '', img = 'lncRNA_appyter_logo.png')%}
{% set query = StringField(name='gene_symbol', label='Gene Symbol/Ensembl ID', default='HOTAIR', description='',section = 'section1') %}

In [None]:
%%appyter code_exec
# The template expression on the right-hand side is filled in by appyter before
# this cell runs. NOTE(review): presumably it renders as a quoted Python string
# holding the submitted gene symbol / Ensembl ID - confirm.
query = {{ query }}

In [None]:
# lncRNA of interest: normalise the submitted text so it can be compared with the
# gene symbols / Ensembl IDs read from the co-expression file. Surrounding
# whitespace (easily pasted into a form field by accident) is dropped first,
# because it would otherwise make every later lookup miss; then the identifier
# is upper-cased.
query = query.strip().upper()

%%appyter markdown
### Import gene-lncRNA co-expression matrix

This lncRNA-gene matrix was computed using Pearson Correlation on 10,000 randomly selected bulk RNA-seq samples from Recount3[1]. 

In [None]:
# Import lncRNA-gene co-expression matrix
def _decode_labels(dataset):
    """Return the byte-string entries of `dataset` as a list of UTF-8 decoded str."""
    return [entry.decode('UTF-8') for entry in dataset]


# Anonymous access to the Appyters S3 endpoint; the HDF5 file is opened
# read-only directly on top of the remote file object.
s3 = s3fs.S3FileSystem(
    anon=True,
    client_kwargs={'endpoint_url': 'https://s3.appyters.maayanlab.cloud'},
)
f = h5.File(s3.open('storage/lncRNA_Appyter/Recount3_lncRNA_pcorr.h5', 'rb'), 'r')

# `corr` is only a handle into the open file; rows are read when it is indexed.
corr = f["data/correlation"]

# Labels of the matrix columns and rows (rows carry both symbol and Ensembl ID).
col_genes = _decode_labels(f["meta/columns/genes"])
row_genes = _decode_labels(f["meta/rows/genes"])
row_genes_ensembl = _decode_labels(f["meta/rows/ensembl"])

In [None]:
# Convert input Ensembl ID to gene symbol
# Rows are labelled with both an Ensembl ID and a gene symbol; the rest of the
# notebook works with the symbol, so an Ensembl ID query is translated here.
ensembl_2_genes = dict(zip(row_genes_ensembl,row_genes))
if query in ensembl_2_genes:
    query_new = ensembl_2_genes[query]
    if query != query_new:
        print('Predicting functions for ' + query_new + '(' + query + ')')
        query = query_new
    else:
        print('Predicting functions for ' + query)
elif query in row_genes:
    print('Predicting functions for ' + query )
else:
    # Fail here with an explicit message instead of letting the row lookup
    # further down die with an opaque "index 0 is out of bounds" IndexError.
    raise ValueError(
        '"' + query + '" was not found among the gene symbols or Ensembl IDs '
        'of the lncRNA co-expression matrix.'
    )

%%appyter markdown
### Top correlated genes with {{query.raw_value}}

Similarly to Geneshot[2], gene-gene similarities are predicted using co-expression. All genes are ranked by Pearson Correlation with the input lncRNA.

In [None]:
# Find the genes and lncRNAs most correlated with the input lncRNA
# exist_ok=True already makes this a no-op when the folder is present, so no
# separate existence check is needed.
os.makedirs("gene_correlations/", exist_ok=True)

# Get index of lncRNA of interest (row position of the query in the matrix)
idx_query = np.where(np.asarray(row_genes) == query)[0][0]

# Rank genes based on Pearson correlation with the lncRNA of interest:
# read the query's row, label it with the column gene names, sort best-first.
lncRNA_coexp = pd.DataFrame(
    {'Pearson Correlation': corr[idx_query, :]},
    index=col_genes,
).sort_values(by='Pearson Correlation', ascending=False)

# Show the 20 strongest correlations as a rendered table
display(lncRNA_coexp.head(20))

# save gene correlations to csv file
lncRNA_coexp.to_csv('gene_correlations/'+ query + '_correlated_genes.csv')

In [None]:
# Download link for the full gene ranking written by the previous cell
display(
    FileLink(
        f'gene_correlations/{query}_correlated_genes.csv',
        result_html_prefix='Download Table 1: ',
    )
)

%%appyter markdown
### Top correlated lncRNAs with {{query.raw_value}}

lncRNAs are ranked by Pearson Correlation with the input lncRNA.

In [None]:
# Most correlated lncRNAs: keep only the entries of the gene ranking whose
# label is one of the matrix row labels (`row_genes`), then rank them best-first.
lncRNA_lncRNA_coexp = lncRNA_coexp.loc[row_genes]
lncRNA_lncRNA_coexp = lncRNA_lncRNA_coexp.sort_values(by='Pearson Correlation', ascending=False)

# Show the 20 strongest correlations as a rendered table
display(lncRNA_lncRNA_coexp.head(20))

# Save the lncRNA-only ranking to csv. This must be the filtered table, not
# `lncRNA_coexp`, otherwise Table 2 is just a copy of the all-genes Table 1.
lncRNA_lncRNA_coexp.to_csv('gene_correlations/' + query + '_correlated_lncRNAs.csv')

In [None]:
# Download link for the lncRNA ranking written by the previous cell
display(
    FileLink(
        f'gene_correlations/{query}_correlated_lncRNAs.csv',
        result_html_prefix='Download Table 2: ',
    )
)

In [None]:
%%appyter markdown
### Predicted Biological Functions of {{query.raw_value}}

For each Enrichr library, the mean Pearson Correlation is calculated between each gene set and the lncRNA of interest. Terms with a high mean Pearson Correlation are prioritized and predicted to be associated with the lncRNA.

In [None]:
# Load Enrichr libraries
def loadLibrary(library: str, overwrite: bool = False) -> str:
    """Download an Enrichr gene-set library as a GMT file, reusing a cached copy.

    Parameters
    ----------
    library : str
        Enrichr library name. It is appended to the request URL and also
        used as the cache file name ``gmts/<library>.gmt``.
    overwrite : bool, default False
        When True, download the library again even if the cached file exists.

    Returns
    -------
    str
        Relative path of the GMT file, ``"gmts/<library>.gmt"``.
    """
    # NOTE(review): this replaces the process-wide default HTTPS context with an
    # unverified one, i.e. TLS certificates are no longer checked for later
    # urllib requests either. Kept for compatibility; consider removing it.
    ssl._create_default_https_context = ssl._create_unverified_context

    gmt_path = "gmts/" + library + ".gmt"

    # `overwrite` has to be tested on its own: placed inside os.path.exists()
    # as `path or overwrite` it is swallowed, because the non-empty path string
    # is always truthy and `or` never reaches the flag.
    if overwrite or not os.path.exists(gmt_path):
        os.makedirs("gmts", exist_ok=True)
        print("Download Enrichr geneset library")
        urllib.request.urlretrieve("https://maayanlab.cloud/Enrichr/geneSetLibrary?mode=text&libraryName="+library, gmt_path)
    else:
        print("File cached. To reload use loadLibrary(\""+library+"\", overwrite=True) instead.")
    return gmt_path

In [None]:
# Predict functions based on mean Pearson correlation for each term in a library
def predict_functions(library, matrix, query):
    """Score every term of an Enrichr library by its genes' mean correlation.

    Parameters
    ----------
    library : str
        Enrichr library name, passed to ``loadLibrary`` to obtain the GMT path.
    matrix : pandas.DataFrame
        Table indexed by gene label with a 'Pearson Correlation' column.
    query : str
        Label of the gene of interest; it is left out of every gene set so
        that it does not contribute to the score.

    Returns
    -------
    pandas.DataFrame
        Columns 'Term' and 'Mean Pearson Correlation', sorted by score in
        descending order. A term with no gene present in ``matrix`` gets a
        NaN score.
    """
    library_path = loadLibrary(library)

    # GMT lines are tab-separated: the first field is the term, the second
    # field is skipped, and the remaining fields are the genes of the set.
    library_dict = {}
    with open(library_path, 'r') as open_gmt:
        for line in open_gmt:
            line = line.strip()
            if not line:
                # A blank line would otherwise register an empty-named term.
                continue
            fields = line.split('\t')
            library_dict[fields[0]] = fields[2:]

    # The set of known genes does not change between terms, so build it once.
    known_genes = set(matrix.index)

    all_terms = []
    all_scores = []
    for lib_term, gene_set in library_dict.items():
        lib_term_set = [gene for gene in set(gene_set) & known_genes if gene != query]
        all_terms.append(lib_term)
        all_scores.append(np.mean(matrix.loc[lib_term_set, 'Pearson Correlation']))

    df_results = pd.DataFrame({'Term': all_terms, 'Mean Pearson Correlation': all_scores})
    df_results = df_results.sort_values(by='Mean Pearson Correlation', ascending=False)
    return df_results

In [None]:
# Plot the top terms for each prediction library
def plot_results(library_names, results_dfs, top_results=20):
    """Show a two-panel figure of horizontal bar charts, one panel per library.

    Parameters
    ----------
    library_names : sequence of str
        Panel titles; the first two entries are used.
    results_dfs : sequence of pandas.DataFrame
        Tables with 'Term' and 'Mean Pearson Correlation' columns; the first
        two entries are used, in the same order as ``library_names``.
    top_results : int, default 20
        Number of leading rows taken from each table.

    Returns
    -------
    None
        The figure is displayed with ``iplot``.
    """
    score_col = 'Mean Pearson Correlation'
    fig = make_subplots(rows=1, cols=2, print_grid=False, shared_xaxes=False)
    panel_maxima = []

    for panel in (0, 1):
        subplot_col = panel + 1

        # Leading rows of the table, re-sorted by ascending score for plotting.
        top_terms = results_dfs[panel][0:top_results].sort_values(by=score_col)
        panel_title = library_names[panel]
        panel_maxima.append(np.max(top_terms[score_col]))
        shown_rows = top_terms[0:top_results]

        hover_labels = [
            '<b>Term: {Term}</b><br><b>Mean Pearson Correlation</b>: <i>{Mean Pearson Correlation:.3}</i>'.format(**row)
            for _, row in shown_rows.iterrows()
        ]
        bar = go.Bar(
            x=top_terms[score_col],
            y=top_terms['Term'],
            orientation='h',
            name=panel_title,
            showlegend=False,
            hovertext=hover_labels,
            hoverinfo='text',
            marker={'color': 'dodgerblue'},
        )
        fig.append_trace(bar, 1, subplot_col)

        # Term names are drawn as text inside the panel, anchored at 1/50 of
        # the longest bar, because the y tick labels are switched off below.
        term_labels = go.Scatter(
            x=[max(bar['x'])/50 for _ in range(len(bar['y']))],
            y=bar['y'],
            mode='text',
            hoverinfo='none',
            showlegend=False,
            text=['<b>{}</b>'.format(row['Term']) for _, row in shown_rows.iterrows()],
            textposition="middle right",
            textfont={'color': 'black', 'size': 8},
        )
        fig.append_trace(term_labels, 1, subplot_col)

    # One centred title above each panel (paper coordinates).
    annotations = [
        {
            'x': x_center,
            'y': 1.1,
            'text': '<span style="color: black; font-size: 15pt; font-weight: 600;">' + panel_name + '</span>',
            'showarrow': False,
            'xref': 'paper',
            'yref': 'paper',
            'xanchor': 'center',
        }
        for x_center, panel_name in ((0.25, library_names[0]), (0.75, library_names[1]))
    ]
    fig['layout'].update(height = 500, hovermode='closest', annotations=annotations)
    fig.update_layout(title='', height = 500, title_font_size = 25, title_x=0.5)

    # Each x axis gets half of the figure width and runs from 0 to just past
    # the largest score shown in its panel.
    x_axes = (
        ('xaxis1', [0, 0.49], panel_maxima[0]),
        ('xaxis2', [0.51, 1], panel_maxima[1]),
    )
    for axis_key, axis_domain, peak in x_axes:
        fig['layout'][axis_key].update(
            domain=axis_domain,
            title='Mean Pearson Correlation',
            range=(0, peak + peak*.01),
        )
    for axis_key in ('yaxis1', 'yaxis2'):
        fig['layout'][axis_key].update(showticklabels=False)
    fig['layout']['margin'].update(l=30, t=65, r=30, b=35)

    iplot(fig)

In [None]:
# Make function predictions
prediction_libraries = ['MGI_Mammalian_Phenotype_Level_4_2021','GO_Biological_Process_2021','KEGG_2021_Human','DisGeNET']

# Output folder for the prediction tables; created once, before the loop
# (exist_ok=True makes this a no-op when it is already there).
os.makedirs("predicted_functions/", exist_ok=True)

# plot_results draws two panels per figure, so the libraries are processed in
# consecutive pairs.
for pair_start in range(0, len(prediction_libraries), 2):
    group = prediction_libraries[pair_start:pair_start + 2]
    predictions = [predict_functions(pred_library, lncRNA_coexp, query) for pred_library in group]
    library_names = [pred_library.replace('_', ' ') for pred_library in group]
    plot_results(library_names, predictions)

    # Save Predictions: one csv per library, each with its own download link
    for library_name, prediction in zip(library_names, predictions):
        csv_path = "predicted_functions/" + library_name + '_' + query + '.csv'
        prediction.to_csv(csv_path)
        display(FileLink(csv_path, result_html_prefix='Download predictions: '))

In [None]:
# Close the HDF5 co-expression file opened when the matrix was imported.
# NOTE(review): `corr` was obtained from `f`, so it presumably can no longer be
# read after this point - re-run the import cell before re-running earlier cells.
f.close()

### References
[1] Wilks C, Zheng SC, Chen FY, Charles R, Solomon B, Ling JP, Imada EL, Zhang D, Joseph L, Leek JT: recount3: summaries and queries for large-scale RNA-seq expression and splicing. bioRxiv 2021:2021.05.21.445138.

[2] Lachmann A, Schilder BM, Wojciechowicz ML, Torre D, Kuleshov MV, Keenan AB, Ma’ayan A: Geneshot: search engine for ranking genes from arbitrary text queries. Nucleic Acids Research 2019, 47(W1):W571-W577.