In [1]:
from typing import List
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import io
import base64
import pandas as pd
import scanpy as sc
from joblib import Parallel, delayed, cpu_count
import numpy as np
import scipy
from numba import njit
from tqdm import tqdm
import torch

def sql_query(query):
    from postgres_utils import engine
    from sqlalchemy import text
    with engine.connect() as connection:
        r = connection.execute(text(query))            
        return [row._mapping for row in r.fetchall()]    
    r = plpy.execute(query)
    return [row for row in r]

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
study_id = 3
omics_id = 12652
data = sql_query(f"""
    SELECT s.filename,so.h5ad_var_index FROM study s
    JOIN study_omics so
      ON s.study_id=so.study_id
    WHERE omics_id={omics_id} AND s.study_id={study_id}
    """).pop()

In [4]:
fn = data.get('filename').replace('/h5ad_store','../scratch')
goi = data.get('h5ad_var_index')

In [5]:
adata = sc.read(fn)
df = adata.X.T.todense()

In [48]:
@njit
def pearson_corr(m):
    return np.corrcoef(m)[0,1]

def compute_correlation(m, genes, goi):
    collect = []
    for gene in genes:
        r = pearson_corr(m[[goi,gene]])
        if r>=0.2:
            collect.append({'h5ad_var_index': gene, 'r':r})
    return pd.DataFrame(collect)

In [49]:
chunks = np.array_split([_ for _ in range(0,adata.n_vars) if _!=goi],8)

In [57]:
%%time
result = Parallel(n_jobs=cpu_count(), backend="threading")(delayed(compute_correlation)(m = df, genes=chunk, goi=goi) for chunk in chunks)


CPU times: user 31.4 s, sys: 2.03 s, total: 33.4 s
Wall time: 10.3 s


In [58]:
result = pd.concat(result)

In [74]:
geneids = tuple(result.h5ad_var_index.tolist())

In [77]:
res = sql_query(f'''
    SELECT so.omics_id, ob.display_symbol, ob.display_name, so.h5ad_var_index FROM study_omics so
      JOIN omics_base ob
        ON ob.omics_id = so.omics_id
     WHERE  so.study_id = {study_id}
       AND so.h5ad_var_index in {geneids}''')


In [86]:
out = pd.DataFrame(res).merge(result, on = 'h5ad_var_index').drop('h5ad_var_index', axis = 1)

In [88]:
out = out[['omics_id','display_symbol','display_name','r']]

In [91]:
from pathlib import Path

In [92]:
Path('/h5ad_store') / 'asdf'

PosixPath('/h5ad_store/asdf')

In [111]:
fn = Path('../scratch') / Path(data[0].get('filename')).name

In [112]:
fn

PosixPath('../scratch/blood_covid.h5ad')

In [113]:
import pandas as pd
import scanpy as sc
from joblib import Parallel, delayed, cpu_count
import numpy as np
import scipy
from numba import njit

def sql_query(query):
    # postgres data retrieval with consistent output, both in the jupyter development
    # environment (plpy is not available) and at runtime inside a plpython3u stored procedure
    try:
        import plpy
    except:
        from postgres_utils import engine
        from sqlalchemy import text
        with engine.connect() as connection:
            r = connection.execute(text(query))
            return [row._mapping for row in r.fetchall()]
    r = plpy.execute(query)
    return [row for row in r]

@njit
def pearson_corr(m):
    return np.corrcoef(m)[0,1]

def compute_correlation(m, genes, goi):
    collect = []
    for gene in genes:
        r = pearson_corr(m[[goi,gene]])
        if r>=0.2:
            collect.append({'h5ad_var_index': gene, 'r':r})
    return pd.DataFrame(collect)


data = sql_query(f"""
    SELECT s.filename,so.h5ad_var_index FROM study s
    JOIN study_omics so
      ON s.study_id=so.study_id
    WHERE omics_id={omics_id} AND s.study_id={study_id}
    """)
#fn = Path('/h5ad_store') / data[0].get('filename')
fn = Path('../scratch') / Path(data[0].get('filename')).name
goi = data[0].get('h5ad_var_index')

adata = sc.read(fn)
df = adata.X.T.todense()

chunks = np.array_split([_ for _ in range(0,adata.n_vars) if _!=goi],8)
result = Parallel(n_jobs=cpu_count(), backend="threading")(delayed(compute_correlation)(m = df, genes=chunk, goi=goi) for chunk in chunks)
result = pd.concat(result)

geneids = tuple(result.h5ad_var_index.tolist())
ret = sql_query(f'''
    SELECT so.omics_id, ob.display_symbol, ob.display_name, so.h5ad_var_index FROM study_omics so
      JOIN omics_base ob
        ON ob.omics_id = so.omics_id
     WHERE  so.study_id = {study_id}
       AND so.h5ad_var_index in {geneids}
''')
out = pd.DataFrame(ret).merge(result, on = 'h5ad_var_index').drop('h5ad_var_index', axis =1)
out = out[['omics_id','display_symbol','display_name','r']]

return out.to_records(index=False)

SyntaxError: 'return' outside function (975559305.py, line 63)

In [114]:
out.to_records(index=False)

rec.array([( 6925, 'ADAM19', 'ADAM metallopeptidase domain 19', 0.24688988),
           ( 7112, 'ALDH1L2', 'aldehyde dehydrogenase 1 family member L2', 0.28362993),
           (11419, 'AQP3', 'aquaporin 3 (Gill blood group)', 0.241767  ),
           (12175, 'ARF4', 'ADP ribosylation factor 4', 0.21127773),
           ( 4618, 'BMP8B', 'bone morphogenetic protein 8b', 0.20916775),
           (12469, 'BUB1', 'BUB1 mitotic checkpoint serine/threonine kinase', 0.32998851),
           (14299, 'CALR', 'calreticulin', 0.38251754),
           ( 5978, 'CALU', 'calumenin', 0.29816905),
           ( 5820, 'CANX', 'calnexin', 0.2810183 ),
           ( 3206, 'CAV1', 'caveolin 1', 0.21814871),
           (   50, 'CD38', 'CD38 molecule', 0.45091937),
           ( 1993, 'CDC6', 'cell division cycle 6', 0.23401663),
           ( 2347, 'CEP128', 'centrosomal protein 128', 0.2250923 ),
           ( 5389, 'CHPF', 'chondroitin polymerizing factor', 0.26863922),
           (12436, 'CLIC4', 'chloride intracel