__Author:__ Bram Van de Sande

__Date:__ 7 FEB 2018

__Outline:__ This notebook clarifies the process by which the co-expression modules derived from GENIE3 can be refined into true regulomes (i.e. excluding indirect targets of transcription factors). Aka "RcisTarget".

In [1]:
import os
import glob
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pyscenic.rnkdb import FeatherRankingDatabase as RankingDatabase, SQLiteRankingDatabase, MemoryDecorator
from pyscenic.genesig import GeneSignature, Regulome
from pyscenic.regulome import module2regulome_bincount_impl, derive_regulomes, module2regulome_numba_impl
from pyscenic.utils import load_motif_annotations

from dask import delayed
from dask.dot import dot_graph
from dask.multiprocessing import get
from dask.diagnostics import Profiler, ResourceProfiler, CacheProfiler
from dask.diagnostics import ProgressBar
from distributed import LocalCluster, Client
from bokeh.io import output_notebook, push_notebook, show
output_notebook()
from dask.diagnostics import visualize

In [2]:
%load_ext snakeviz
%load_ext line_profiler

In [3]:
DATA_FOLDER="/Users/bramvandesande/Projects/lcb/tmp"
RESOURCES_FOLDER="/Users/bramvandesande/Projects/lcb/resources"
DATABASE_FOLDER = "/Users/bramvandesande/Projects/lcb/databases/"

SQLITE_GLOB = os.path.join(DATABASE_FOLDER, "mm9-*.db")
FEATHER_GLOB = os.path.join(DATABASE_FOLDER, "mm9-*.feather")

MOTIF_ANNOTATIONS_FNAME = os.path.join(RESOURCES_FOLDER, "motifs-v9-nr.mgi-m0.001-o0.0.tbl")

NOMENCLATURE = "MGI"

Make databases in feather format are available.

In [4]:
if False:
    def derive_db_name(fname):
        return os.path.basename(fname).split(".")[0]

    from pyscenic.rnkdb import convert2feather
    
    for fname in glob.glob(SQLITE_GLOB):
        convert2feather(fname, DATABASE_FOLDER, derive_db_name(fname), NOMENCLATURE)

### Load resources

Co-expression modules were derived from GENIE3 output.

In [5]:
with open(os.path.join(DATA_FOLDER,'modules.pickle'), 'rb') as f:
    modules = pickle.load(f)

In [6]:
len(modules)

5106

In [9]:
from pyscenic.genesig import Regulome

In [10]:
reg = Regulome(name=modules[0].name,
              nomenclature=modules[0].nomenclature,
              gene2weights=modules[0].gene2weights,
              score=modules[0].score,
              transcription_factor=modules[0].transcription_factor,
              context=modules[0].context)

In [11]:
reg

Regulome(name='Regulome for Aatf', nomenclature='MGI', gene2weights=<frozendict {'Atf5': 13.628211359846899, 'Deaf1': 8.3850504437361248, 'Dok2': 8.3233304633881566, 'Paip2': 6.7175165666883823, 'Pgm3': 5.8792248348304188, 'Agpat5': 4.996717722589108, 'Adcy6': 4.9605282019996233, 'Cxcl14': 4.783492652598742, 'Hexim1': 4.7226024093035903, 'Mrpl42': 4.3856724408212697, 'Lrtm2': 4.257191006843466, 'Mapk3': 4.1840167297841093, 'Pcnxl3': 3.986172042990844, 'Lox': 3.8977304501673782, 'Gtl3': 3.7825547315635082, 'Tle2': 3.6846843032583778, 'Nudt2': 3.5190526704907703, 'Tbc1d31': 3.4007651541383868, 'Cnn2': 3.1783416148680734, 'Kpna1': 3.0481701901346274, 'Slc35c2': 3.0192877084254488, 'Deptor': 2.862210385415044, 'C920021L13Rik': 2.8045861777103847, 'Tank': 2.7947470621579544, 'Kat6a': 2.7758812289456785, 'Mettl3': 2.7059026258485459, '4930547E08Rik': 2.6712692415847687, 'Nr2f2': 2.6174209568578197, 'Hnrnpl': 2.5865850789861411, 'Gpx4': 2.5432381871481593, 'Bcl6': 2.5142917330086632, 'Slc19a2

In [17]:
Regulome.to_yaml(yaml.Dumper, reg)

TypeError: represent_mapping() missing 1 required positional argument: 'mapping'

### Load whole genome ranking databases

All implementations of the database are loaded for performance testing.

In [7]:
def name(fname):
    return os.path.basename(fname).split(".")[0]

In [8]:
db_fnames = glob.glob(FEATHER_GLOB)
dbs = [RankingDatabase(fname=fname, name=name(fname), nomenclature="MGI") for fname in db_fnames]

In [9]:
len(dbs)

6

In [10]:
sqldb_fnames = glob.glob(SQLITE_GLOB)
sqldbs = [SQLiteRankingDatabase(fname=fname, name=name(fname), nomenclature="MGI") for fname in sqldb_fnames]

In [11]:
len(sqldbs)

6

In [12]:
memdb = MemoryDecorator(dbs[0])

### Load motif annotations

In [13]:
motif_annotations = load_motif_annotations(MOTIF_ANNOTATIONS_FNAME)

In [14]:
motif_annotations.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,motif_similarity_qvalue,orthologous_identity,description
gene_name,#motif_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Hoxa9,bergman__Abd-B,0.0006,1.0,gene is annotated for similar motif cisbp__M10...
Zfp128,bergman__Aef1,0.0,0.220264,motif is annotated for orthologous gene FBgn00...
Zfp853,bergman__Cf2,0.0,0.166667,motif is annotated for orthologous gene FBgn00...
Nr1h2,bergman__EcR_usp,0.0,0.378924,gene is orthologous to FBgn0000546 in D. melan...
Nr1h3,bergman__EcR_usp,0.0,0.408989,gene is orthologous to FBgn0000546 in D. melan...


### Single-thread pipeline

Before scaling it via dask to work on the full combinatorial space of databases x modules.

In [15]:
module2regulome = module2regulome_bincount_impl

#### Feather-based storage + bincount implementation

In [16]:
%lprun -f module2regulome list((idx, module2regulome(dbs[0], module, motif_annotations)) for idx, module in enumerate(modules[0:25]))

In [17]:
%%snakeviz
regulomes = list((idx, module2regulome(dbs[0], module, motif_annotations)) for idx, module in enumerate(modules[0:25]))

 
*** Profile stats marshalled to file '/var/folders/cj/xhw0rd3s7hg5k4p78t4s3hph0000gn/T/tmpyziub1gk'. 


1. General performance is 78s for executing `module2regulome` 25 times.
1. 79% of time is spent at `recovery` and 18% at `db.load`.

#### SQLite-based storage + bincount implementation

In [18]:
%lprun -f module2regulome list((idx, module2regulome(sqldbs[0], module, motif_annotations)) for idx, module in enumerate(modules[0:25]))

In [19]:
%%snakeviz
regulomes = list((idx, module2regulome(sqldbs[0], module, motif_annotations)) for idx, module in enumerate(modules[0:25]))

 
*** Profile stats marshalled to file '/var/folders/cj/xhw0rd3s7hg5k4p78t4s3hph0000gn/T/tmpzsg2nvjf'. 


1. General performance is 83s for executing `module2regulome` 25 times.
1. 42% of time is spent at `recovery` and 56% at `db.load`.

#### In-memory database + bincount implementation

In [20]:
%lprun -f module2regulome list((idx, module2regulome(memdb, module, motif_annotations)) for idx, module in enumerate(modules[0:25]))

In [23]:
%%snakeviz
regulomes = list((idx, module2regulome(memdb, module, motif_annotations)) for idx, module in enumerate(modules[0:25]))

 
*** Profile stats marshalled to file '/var/folders/cj/xhw0rd3s7hg5k4p78t4s3hph0000gn/T/tmpfwp_6eat'. 


1. General performance is 78s for executing `module2regulome` 25 times.
1. 89% of time is spent at `recovery` and 8.5% at `db.load`.

#### In-memory database + bincount implementation: assess effect of reducing rank_threshold parameter

In [21]:
%lprun -f module2regulome list((idx, module2regulome(memdb, module, motif_annotations, auc_threshold=0.01, rank_threshold=750)) for idx, module in enumerate(modules[0:25]))

1. General performance is 69s for executing `module2regulome` 25 times.
1. 93% of time is spent at `recovery` and 4.4% at `db.load`.

#### Feather-based storage + numba implementation

In [15]:
%%snakeviz
regulomes = list((idx, module2regulome_numba_impl(dbs[0], module, motif_annotations)) for idx, module in enumerate(modules[0:25]))

 
*** Profile stats marshalled to file '/var/folders/cj/xhw0rd3s7hg5k4p78t4s3hph0000gn/T/tmppk10rqe2'. 


1. General performance is 49s for executing `module2regulome` 25 times.
1. 47% of time is spent at `recovery` and 24% at `db.load`.

#### Approach combining all potential improvements (in-memory database, auc-only calculation to assess enriched features and numba JIT implementation).

In [16]:
%%snakeviz
regulomes = list((idx, module2regulome_numba_impl(memdb, module, motif_annotations)) for idx, module in enumerate(modules[0:25]))

 
*** Profile stats marshalled to file '/var/folders/cj/xhw0rd3s7hg5k4p78t4s3hph0000gn/T/tmpqoyeb21k'. 


1. General performance is 39s for executing `module2regulome` 25 times.
1. 81% of time is spent at `recovery`

### Parallelized pipeline

#### Python multiprocessing implementation (db-dedicated workers using in memory copy + numba implementation of auc calculation).

Loading the database is also part of the overall timing. This will however dwarf when the number of modules increases.

In [17]:
%%timeit -n1 -r1 -o -q
regulomes = derive_regulomes(dbs[0:2], modules[0:50], MOTIF_ANNOTATIONS_FNAME)
print(len(regulomes))

Using 2 workers.
Worker for mm9-500bp-upstream-7species: database loaded in memory.
Worker for mm9-500bp-upstream-7species: motif annotations loaded in memory.
Worker for mm9-tss-centered-10kb-10species: database loaded in memory.
Worker for mm9-tss-centered-10kb-10species: motif annotations loaded in memory.
Worker for mm9-500bp-upstream-7species: 4 regulomes created.
Worker for mm9-tss-centered-10kb-10species: 12 regulomes created.
Worker for mm9-tss-centered-10kb-10species: Done.
Worker for mm9-500bp-upstream-7species: Done.
16


<TimeitResult : 1min 45s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)>

#### Dask framework using multiprocessing

In [15]:
with ProgressBar():
    with Profiler() as prof, ResourceProfiler(dt=0.25) as rprof, CacheProfiler() as cprof:
        regulomes = derive_regulomes(dbs[0:2], modules[0:50], MOTIF_ANNOTATIONS_FNAME,
                                     client_or_address="dask_multiprocessing")

[########################################] | 100% Completed |  1min 54.9s


In [16]:
len(regulomes)

16

In [17]:
visualize([prof, rprof, cprof])

#### Dask framework with custom client

In [15]:
local_cluster = LocalCluster(n_workers=6, 
                             threads_per_worker=1)

custom_client = Client(local_cluster)

In [16]:
custom_client

0,1
Client  Scheduler: tcp://127.0.0.1:51083  Dashboard: http://127.0.0.1:8787,Cluster  Workers: 6  Cores: 6  Memory: 12.88 GB


In [17]:
regulomes = derive_regulomes(dbs[0:2],
                             modules[0:50],
                             MOTIF_ANNOTATIONS_FNAME,
                             client_or_address=custom_client)

In [19]:
regulomes

In [21]:
regulomes.result()

[Regulome(name='Regulome for Arntl', nomenclature='MGI', gene2weights=<frozendict {'Zfp668': 1.0, 'Bag6': 1.0, 'Rnf167': 1.0, 'Hmg20a': 1.0, 'Cog3': 1.0, 'Snrpb': 1.0, 'Pafah1b2': 1.0, 'Eif3i': 1.0, 'Vps52': 1.0, 'Trmt112': 1.0, 'Psma6': 1.0, 'Dnaja2': 1.0, 'Cdk13': 1.0, 'Cpsf3': 1.0, 'Cstf2t': 1.0, 'Ncstn': 1.0, 'Fam188a': 1.0, 'Tmem161b': 1.0, 'Emg1': 1.0, 'Mtmr4': 1.0, 'Cap1': 1.0, 'Epn1': 1.0, 'Ttpal': 1.0, 'Nipbl': 1.0, 'Zfp410': 1.0, 'Krr1': 1.0, 'Rpl37': 1.0, 'Dalrd3': 1.0, 'Ebna1bp2': 1.0, 'Lrrc49': 1.0, 'Pick1': 1.0, 'Ngrn': 1.0, 'Uggt1': 1.0, 'Psmd13': 1.0, 'Zfp384': 1.0, 'Sptlc2': 1.0, 'Gatad1': 1.0, 'B3gat3': 1.0, 'Tmcc1': 1.0, 'Ppme1': 1.0, 'Rhot2': 1.0, 'Fbxw2': 1.0, 'Dctn5': 1.0, 'Ndufs3': 1.0, 'Pgap3': 1.0, 'Trim46': 1.0, 'Pfn1': 1.0, 'Lyrm1': 1.0, 'Ddost': 1.0, 'Ovca2': 1.0, 'Clasp1': 1.0, 'Vma21': 1.0, 'Zfp622': 1.0, 'Exoc1': 1.0, 'Tomm6': 1.0, 'Zfp3': 1.0, 'Erh': 1.0, 'Txnl4b': 1.0, 'Prpf19': 1.0, 'Ubl7': 1.0, '1110001J03Rik': 1.0, 'Lsm1': 1.0, 'Fmr1': 1.0, 'Rars': 1

In [22]:
custom_client.close()
local_cluster.close()