In [1]:
import argparse
import logging
from operator import mul
import time
import os

import pubweb.singlecell # import AnnDataSparse
from pubweb.hdf5 import Hdf5
from pubweb.commands.convert.singlecell.anndata import ImportAnndata
from pubweb.commands.convert.singlecell.cellranger import ImportCellRanger
from pubweb.commands.validate.dimensions import ValidateDimensions
from pubweb.commands.annotate.geneid import AnnotateGeneId
from pubweb.commands.annotate.geneset import AnnotateGeneset
from pubweb.commands.export.lists import ExportLists
from pubweb.commands.export.attributes import ExportAttributes
from pubweb.commands.export.tables import ExportTables
from pubweb.commands.export.projections import ExportProjections
from pubweb.commands.export.spatial import ExportSpatial
from pubweb.commands.export.matrix_sparse import ExportMatrixSparse
from pubweb.commands.export.matrix_dense import ExportMatrixDense
from pubweb.commands.summarize.genes import SummarizeGenes
from pubweb.commands.summarize.genemap import SummarizeGeneMap
from pubweb.commands.summarize.colors import SummarizeColors
from pubweb.commands.summarize.manifest import SummerizeManifest


In [2]:
import importlib

importlib.reload(pubweb.singlecell)
importlib.reload(pubweb.hdf5)
importlib.reload(pubweb.commands.convert.singlecell.anndata)
importlib.reload(pubweb.commands.convert.singlecell.cellranger)
importlib.reload(pubweb.commands.validate.dimensions)
importlib.reload(pubweb.commands.annotate.geneid)
importlib.reload(pubweb.commands.annotate.geneset)
importlib.reload(pubweb.commands.export)
importlib.reload(pubweb.commands.export.lists)
importlib.reload(pubweb.commands.export.attributes)
importlib.reload(pubweb.commands.export.tables)
importlib.reload(pubweb.commands.export.projections)
importlib.reload(pubweb.commands.export.spatial)
importlib.reload(pubweb.commands.export.matrix_sparse)
importlib.reload(pubweb.commands.export.matrix_dense)
importlib.reload(pubweb.commands.summarize.genes)
importlib.reload(pubweb.commands.summarize.genemap)
importlib.reload(pubweb.commands.summarize.colors)
importlib.reload(pubweb.commands.summarize.manifest)


<module 'pubweb.commands.summarize.manifest' from '/root/anaconda3/lib/python3.8/site-packages/pubweb/commands/summarize/manifest.py'>

In [3]:
logging.basicConfig(level='DEBUG')

In [4]:
datasetName='lung-upper-airway-h1299'
inputFile = '/data/notebooks/input/convert.hdf5'
outputFolder = '/data/notebooks/pubweb'
species = 'human'
overwriteHdf5 = True
python_wd = '/opt/pubweb'


In [5]:
#dir(pubweb.singlecell)

In [6]:
# anndatasparse
outputFile = f'{outputFolder}/pubweb.hdf5'
if os.path.exists(outputFile) and overwriteHdf5:
    os.remove(outputFile)
hdf5 = Hdf5.load(outputFile, "a")

In [7]:
hdf5.uri

'/data/notebooks/pubweb/pubweb.hdf5'

In [8]:
%time hdf5 | ImportAnndata(inputFile, datasetName)
# 345

CPU times: user 464 ms, sys: 6.55 s, total: 7.01 s
Wall time: 6.97 s


<pubweb.hdf5.LocalHdf5 at 0x7ff519147e20>

In [9]:
hdf5.getDatasets()

['pubweb/lung-upper-airway-h1299']

In [10]:
hdf5.h5py['pubweb/lung-upper-airway-h1299/matrix']

<HDF5 dataset "matrix": shape (81736, 27072), type "<f4">

In [11]:
%time hdf5 | AnnotateGeneId(species=species)
# 1min28s

INFO:root:AnnotateGeneId: pubweb/lung-upper-airway-h1299/features/gene


Annotate Gene Id


DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): mygene.info:80


querying 1-1000...

DEBUG:urllib3.connectionpool:http://mygene.info:80 "POST /v3/query/ HTTP/1.1" 200 None


done.


DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): mygene.info:80


querying 1001-2000...

DEBUG:urllib3.connectionpool:http://mygene.info:80 "POST /v3/query/ HTTP/1.1" 200 None


done.


DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): mygene.info:80


querying 2001-3000...

DEBUG:urllib3.connectionpool:http://mygene.info:80 "POST /v3/query/ HTTP/1.1" 200 None


done.


DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): mygene.info:80


querying 3001-4000...

DEBUG:urllib3.connectionpool:http://mygene.info:80 "POST /v3/query/ HTTP/1.1" 200 None


done.


DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): mygene.info:80


querying 4001-5000...

DEBUG:urllib3.connectionpool:http://mygene.info:80 "POST /v3/query/ HTTP/1.1" 200 None


done.


DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): mygene.info:80


querying 5001-6000...

DEBUG:urllib3.connectionpool:http://mygene.info:80 "POST /v3/query/ HTTP/1.1" 200 None


done.


DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): mygene.info:80


querying 6001-7000...

DEBUG:urllib3.connectionpool:http://mygene.info:80 "POST /v3/query/ HTTP/1.1" 200 None


done.


DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): mygene.info:80


querying 7001-8000...

DEBUG:urllib3.connectionpool:http://mygene.info:80 "POST /v3/query/ HTTP/1.1" 200 None


done.


DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): mygene.info:80


querying 8001-9000...

DEBUG:urllib3.connectionpool:http://mygene.info:80 "POST /v3/query/ HTTP/1.1" 200 None


done.


DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): mygene.info:80


querying 9001-10000...

DEBUG:urllib3.connectionpool:http://mygene.info:80 "POST /v3/query/ HTTP/1.1" 200 None


done.


DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): mygene.info:80


querying 10001-11000...

DEBUG:urllib3.connectionpool:http://mygene.info:80 "POST /v3/query/ HTTP/1.1" 200 None


done.


DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): mygene.info:80


querying 11001-12000...

DEBUG:urllib3.connectionpool:http://mygene.info:80 "POST /v3/query/ HTTP/1.1" 200 None


done.


DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): mygene.info:80


querying 12001-13000...

DEBUG:urllib3.connectionpool:http://mygene.info:80 "POST /v3/query/ HTTP/1.1" 200 None


done.


DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): mygene.info:80


querying 13001-14000...

DEBUG:urllib3.connectionpool:http://mygene.info:80 "POST /v3/query/ HTTP/1.1" 200 None


done.


DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): mygene.info:80


querying 14001-15000...

DEBUG:urllib3.connectionpool:http://mygene.info:80 "POST /v3/query/ HTTP/1.1" 200 None


done.


DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): mygene.info:80


querying 15001-16000...

DEBUG:urllib3.connectionpool:http://mygene.info:80 "POST /v3/query/ HTTP/1.1" 200 None


done.


DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): mygene.info:80


querying 16001-17000...

DEBUG:urllib3.connectionpool:http://mygene.info:80 "POST /v3/query/ HTTP/1.1" 200 None


done.


DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): mygene.info:80


querying 17001-18000...

DEBUG:urllib3.connectionpool:http://mygene.info:80 "POST /v3/query/ HTTP/1.1" 200 None


done.


DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): mygene.info:80


querying 18001-19000...

DEBUG:urllib3.connectionpool:http://mygene.info:80 "POST /v3/query/ HTTP/1.1" 200 None


done.


DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): mygene.info:80


querying 19001-20000...

DEBUG:urllib3.connectionpool:http://mygene.info:80 "POST /v3/query/ HTTP/1.1" 200 None


done.


DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): mygene.info:80


querying 20001-21000...

DEBUG:urllib3.connectionpool:http://mygene.info:80 "POST /v3/query/ HTTP/1.1" 200 None


done.


DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): mygene.info:80


querying 21001-22000...

DEBUG:urllib3.connectionpool:http://mygene.info:80 "POST /v3/query/ HTTP/1.1" 200 None


done.


DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): mygene.info:80


querying 22001-23000...

DEBUG:urllib3.connectionpool:http://mygene.info:80 "POST /v3/query/ HTTP/1.1" 200 None


done.


DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): mygene.info:80


querying 23001-24000...

DEBUG:urllib3.connectionpool:http://mygene.info:80 "POST /v3/query/ HTTP/1.1" 200 None


done.


DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): mygene.info:80


querying 24001-25000...

DEBUG:urllib3.connectionpool:http://mygene.info:80 "POST /v3/query/ HTTP/1.1" 200 None


done.


DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): mygene.info:80


querying 25001-26000...

DEBUG:urllib3.connectionpool:http://mygene.info:80 "POST /v3/query/ HTTP/1.1" 200 None


done.


DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): mygene.info:80


querying 26001-27000...

DEBUG:urllib3.connectionpool:http://mygene.info:80 "POST /v3/query/ HTTP/1.1" 200 None


done.


DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): mygene.info:80
DEBUG:urllib3.connectionpool:http://mygene.info:80 "POST /v3/query/ HTTP/1.1" 200 None


querying 27001-27072...done.


DEBUG:root:Now creating datasets in pubweb/lung-upper-airway-h1299/features, existing ones are <KeysViewHDF5 ['Selected', 'gene', 'vst_mean', 'vst_variable', 'vst_variance', 'vst_variance_expected', 'vst_variance_standardized']>


Finished.
1223 input query terms found dup hits:
	[('MMP23A', 2), ('LINC01346', 2), ('LINC00337', 2), ('SNORA59B', 2), ('PRAMEF22', 2), ('PRAMEF34P', 
2036 input query terms found no hit:
	['LOC100133331', 'MIR6723', 'LOC100130417', 'FAM132A', 'CPSF3L', 'LOC148413', 'LOC102724312', 'C1orf
CPU times: user 4.67 s, sys: 53.5 ms, total: 4.72 s
Wall time: 1min 17s


<pubweb.hdf5.LocalHdf5 at 0x7ff519147e20>

In [12]:
# save hdf5_geneid
print(type(hdf5))

<class 'pubweb.hdf5.LocalHdf5'>


In [13]:
hdf5.getDatasetsWithPath('pubweb/lung-upper-airway-h1299')

['pubweb/lung-upper-airway-h1299/matrix']

In [14]:
hdf5.getDatasets()

['pubweb/lung-upper-airway-h1299']

In [15]:
%time hdf5 | ExportMatrixDense(outputFolder)
# 14.1s

Export Matrix
Writing cols 0 to 100
Writing cols 100 to 200
Writing cols 200 to 300
Writing cols 300 to 400
Writing cols 400 to 500
Writing cols 500 to 600
Writing cols 600 to 700
Writing cols 700 to 800
Writing cols 800 to 900
Writing cols 900 to 1000
Writing cols 1000 to 1100
Writing cols 1100 to 1200
Writing cols 1200 to 1300
Writing cols 1300 to 1400
Writing cols 1400 to 1500
Writing cols 1500 to 1600
Writing cols 1600 to 1700
Writing cols 1700 to 1800
Writing cols 1800 to 1900
Writing cols 1900 to 2000
Writing cols 2000 to 2100
Writing cols 2100 to 2200
Writing cols 2200 to 2300
Writing cols 2300 to 2400
Writing cols 2400 to 2500
Writing cols 2500 to 2600
Writing cols 2600 to 2700
Writing cols 2700 to 2800
Writing cols 2800 to 2900
Writing cols 2900 to 3000
Writing cols 3000 to 3100
Writing cols 3100 to 3200
Writing cols 3200 to 3300
Writing cols 3300 to 3400
Writing cols 3400 to 3500
Writing cols 3500 to 3600
Writing cols 3600 to 3700
Writing cols 3700 to 3800
Writing cols 3800 t

<pubweb.hdf5.LocalHdf5 at 0x7ff519147e20>

In [16]:
%time hdf5 | ExportProjections(outputFolder)
# 3min3s

Export Dataset Projections
CPU times: user 164 µs, sys: 282 µs, total: 446 µs
Wall time: 429 µs


<pubweb.hdf5.LocalHdf5 at 0x7ff519147e20>

In [17]:
%time hdf5 | ExportTables(outputFolder)
# 426us

Export Dataset Tables
CPU times: user 424 µs, sys: 0 ns, total: 424 µs
Wall time: 408 µs


<pubweb.hdf5.LocalHdf5 at 0x7ff519147e20>

In [18]:
%time hdf5 | ExportLists(outputFolder)
#480us

Export Dataset Lists
CPU times: user 154 µs, sys: 264 µs, total: 418 µs
Wall time: 401 µs


<pubweb.hdf5.LocalHdf5 at 0x7ff519147e20>

In [19]:
%time hdf5 | ExportAttributes(outputFolder)
# 2min 7 s

DEBUG:root:data has shape (81736,)
DEBUG:root:data has shape (81736,)


Export Attributes: observations


DEBUG:root:data has shape (81736,)
DEBUG:root:data has shape (81736,)
DEBUG:root:valueMin 1323.0, valueMax 176802.0, data is <class 'numpy.ndarray'>, shape (81736,)
DEBUG:root:data has shape (81736,)
DEBUG:root:valueMin 774.0, valueMax 18939.0, data is <class 'numpy.ndarray'>, shape (81736,)
DEBUG:root:data has shape (81736,)
DEBUG:root:valueMin 1001, valueMax 10585, data is <class 'numpy.ndarray'>, shape (81736,)
DEBUG:root:data has shape (81736,)
DEBUG:root:valueMin 628, valueMax 5438, data is <class 'numpy.ndarray'>, shape (81736,)
DEBUG:root:data has shape (81736,)
DEBUG:root:data has shape (81736,)
DEBUG:root:data has shape (81736,)
DEBUG:root:data has shape (81736,)
DEBUG:root:data has shape (27072,)
DEBUG:root:valueMin 0.0, valueMax 1.0, data is <class 'numpy.ndarray'>, shape (27072,)
DEBUG:root:data has shape (27072,)
DEBUG:root:data has shape (27071,)
DEBUG:root:data has shape (27071,)


Export Attributes: features


DEBUG:root:data has shape (27071,)
DEBUG:root:data has shape (27071,)
DEBUG:root:data has shape (27072,)
DEBUG:root:valueMin 0.0, valueMax 91.63963317871094, data is <class 'numpy.ndarray'>, shape (27072,)
DEBUG:root:data has shape (27072,)
DEBUG:root:valueMin 0, valueMax 1, data is <class 'numpy.ndarray'>, shape (27072,)
DEBUG:root:data has shape (27072,)
DEBUG:root:valueMin 0.0, valueMax 10331.2373046875, data is <class 'numpy.ndarray'>, shape (27072,)
DEBUG:root:data has shape (27072,)
DEBUG:root:valueMin 0.0, valueMax 19461.0234375, data is <class 'numpy.ndarray'>, shape (27072,)
DEBUG:root:data has shape (27072,)
DEBUG:root:valueMin 0.0, valueMax 19.242284774780273, data is <class 'numpy.ndarray'>, shape (27072,)


CPU times: user 2.11 s, sys: 19 ms, total: 2.13 s
Wall time: 2.12 s


<pubweb.hdf5.LocalHdf5 at 0x7ff519147e20>

In [20]:
%time hdf5 | SummarizeColors(outputFolder)
# 59.4ms

INFO:root:Reading from /data/notebooks/pubweb/features/pw_symbol/metadata.json for /data/notebooks/pubweb/summary/color/features
INFO:root:Reading from /data/notebooks/pubweb/features/pw_ensembl/metadata.json for /data/notebooks/pubweb/summary/color/features
INFO:root:Reading from /data/notebooks/pubweb/features/gene/metadata.json for /data/notebooks/pubweb/summary/color/features
INFO:root:Reading from /data/notebooks/pubweb/features/vst_variance_expected/metadata.json for /data/notebooks/pubweb/summary/color/features
INFO:root:Reading from /data/notebooks/pubweb/features/vst_mean/metadata.json for /data/notebooks/pubweb/summary/color/features
INFO:root:Reading from /data/notebooks/pubweb/features/pw_hcid/metadata.json for /data/notebooks/pubweb/summary/color/features
INFO:root:Reading from /data/notebooks/pubweb/features/vst_variable/metadata.json for /data/notebooks/pubweb/summary/color/features
INFO:root:Reading from /data/notebooks/pubweb/features/Selected/metadata.json for /data/n

Summarize Colors
Export Colors: features
Export Colors: observations
CPU times: user 19.4 ms, sys: 5.26 ms, total: 24.7 ms
Wall time: 20.5 ms


<pubweb.hdf5.LocalHdf5 at 0x7ff519147e20>

In [21]:
%time hdf5 | SummerizeManifest(outputFolder)
# 4.2ms

matrix: /data/notebooks/pubweb/matrix
placeholder
CPU times: user 79 µs, sys: 2.94 ms, total: 3.02 ms
Wall time: 2.37 ms
