__Author:__ Bram Van de Sande

__Date:__ 15 JAN 2018

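__Outline:__ Load a single-cell expression matrix and a list of transcription factors, infer a co-expression network with GRNBoost2 (arboretum), and save the result as a tab-separated file.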

In [1]:
import pandas as pd
import numpy as np
import os

from arboretum.algo import grnboost2
from arboretum.utils import load_tf_names

In [2]:
# Input and output locations.
# Each path can be overridden with an environment variable of the same name so
# the notebook can run on another machine without editing this cell; the
# defaults are the original hard-coded local paths.
RESOURCES_FOLDER = os.environ.get(
    "RESOURCES_FOLDER",
    "/Users/bramvandesande/Projects/lcb/resources/sc-experiments/clean/")
DATA_FOLDER = os.environ.get(
    "DATA_FOLDER",
    "/Users/bramvandesande/Projects/lcb/tmp")

In [3]:
# List the files in the resources folder (shell command; the Python value of
# RESOURCES_FOLDER is interpolated into the command line).
!ls {RESOURCES_FOLDER}

GSE103322.mtx.tsv GSE70630.mtx.tsv  GSE76312.mtx.txt  tfs.txt
GSE103322.mtx.txt GSE70630.mtx.txt  GSE81861.mtx.txt
GSE57872.mtx.txt  GSE72056.mtx.txt  GSE89567.mtx.tsv
GSE69405.mtx.txt  GSE75688.mtx.txt  GSE89567.mtx.txt


## Load the expression matrix

In [4]:
# Read the tab-separated expression matrix, using the first column as row
# labels, then transpose it so that rows are cells and columns are genes.
fname = os.path.join(RESOURCES_FOLDER, "GSE103322.mtx.tsv")
ex_matrix = pd.read_csv(
    fname,
    sep="\t",
    index_col=0,
).T

In [5]:
# Preview the first five rows of the transposed expression matrix.
ex_matrix.head(n=5)

Unnamed: 0,SNORD113-9,MAGEB16,SNORA49,MIR26A1,MIR485,MIR3909,TTTY23,MIR519E,C10orf53,MIR3684,...,GAPDH,UBB,FTL,RPL7,MTRNR2L8,FTH1,ACTB,TMSB4X,MTRNR2L2,B2M
HN28_P15_D06_S330_comb,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.945653,0.842134,0.833173,0.824638,1.093898,0.831537,1.072929,0.917554,1.167422,1.061845
HN28_P6_G05_S173_comb,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.971751,0.862773,0.968909,0.953399,1.124328,0.923339,0.996584,0.954457,1.170053,0.940295
HN26_P14_D11_S239_comb,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.10017,0.821669,0.956934,0.953593,1.119356,0.952363,0.821784,0.819039,1.194907,0.820894
HN26_P14_H05_S281_comb,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.705589,0.977141,0.941196,0.881868,1.049143,0.954018,0.916003,0.984108,1.171271,1.080044
HN26_P25_H09_S189_comb,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.001442,0.850375,0.874529,0.973545,1.156655,0.938911,0.720506,0.85775,1.227803,0.934479


In [6]:
# Dimensions of the expression matrix as (rows, columns).
ex_matrix.shape

(5902, 20684)

## Derive list of Transcription Factors (TF) for _Homo sapiens_

In [7]:
# Load the transcription-factor names from the tfs.txt file in the resources folder.
tf_file = os.path.join(RESOURCES_FOLDER, 'tfs.txt')
tf_names = load_tf_names(tf_file)

## Run GRNBoost to infer co-expression modules

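The arboretum package is used for this phase of the pipeline. Only a sample of cells is used for the co-expression module inference; the sample size is set by `N_SAMPLES` in the next cell. Note that this text originally quoted 1,000 cells, whereas `N_SAMPLES` is currently set to 5902, which is the full number of cells in the expression matrix.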

In [8]:
# Number of cells (rows of ex_matrix) drawn for network inference.
# 5902 matches the row count reported by ex_matrix.shape, i.e. all cells are used.
N_SAMPLES = 5_902

In [9]:
# Infer the TF -> target network with GRNBoost2 on a random draw of cells.
#
# - random_state pins the draw, so the same cells (in the same order) are
#   selected on every run.
# - n is capped at the number of available cells, because DataFrame.sample
#   raises ValueError when asked for more rows than exist (without replacement).
#
# NOTE(review): grnboost2 itself may still be non-deterministic; check whether
# the installed arboretum version accepts a seed argument.
SAMPLE_SEED = 42
network = grnboost2(
    expression_data=ex_matrix.sample(n=min(N_SAMPLES, len(ex_matrix)),
                                     random_state=SAMPLE_SEED),
    tf_names=tf_names, verbose=True)

preparing dask client
parsing input
creating dask graph
computing dask graph
shutting down client and local cluster


tornado.application - ERROR - Exception in callback functools.partial(<function wrap.<locals>.null_wrapper at 0x10e078ae8>, <tornado.concurrent.Future object at 0x104645f98>)
Traceback (most recent call last):
  File "/Users/bramvandesande/miniconda3/envs/pyscenic_dev/lib/python3.6/site-packages/tornado/ioloop.py", line 605, in _run_callback
    ret = callback()
  File "/Users/bramvandesande/miniconda3/envs/pyscenic_dev/lib/python3.6/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/Users/bramvandesande/miniconda3/envs/pyscenic_dev/lib/python3.6/site-packages/tornado/ioloop.py", line 626, in _discard_future_result
    future.result()
  File "/Users/bramvandesande/miniconda3/envs/pyscenic_dev/lib/python3.6/site-packages/tornado/concurrent.py", line 238, in result
    raise_exc_info(self._exc_info)
  File "<string>", line 4, in raise_exc_info
  File "/Users/bramvandesande/miniconda3/envs/pyscenic_dev/lib/python3.6/site-packages/tor

finished


KeyboardInterrupt: 

In [19]:
# Preview the first five inferred links (columns: TF, target, importance).
network.head(n=5)

Unnamed: 0,TF,target,importance
35,Rpl7,Rpl34-ps1,86.11987
172,Olig1,Cnp,70.127927
172,Olig1,Tspan2,69.595029
155,Neurod6,Hpca,68.236759
172,Olig1,Cers2,67.195168


In [27]:
# Total number of rows (inferred links) in the network table.
network.shape[0]

4109019

In [28]:
# Save the network as a tab-separated file without the index column.
# to_csv does not create missing directories, so make sure the output folder exists first.
os.makedirs(DATA_FOLDER, exist_ok=True)
network.to_csv(os.path.join(DATA_FOLDER, "coexpression-modules.tsv"), index=False, sep='\t')