From 2054379224a06d81123147b9353eb2cb5b44b73c Mon Sep 17 00:00:00 2001 From: Robin Andeer Date: Fri, 4 Sep 2015 11:13:31 +0200 Subject: [PATCH 01/11] enable sample if override from command line, fix #145 --- chanjo/cli/load.py | 10 ++++++---- chanjo/load/sambamba.py | 30 ++++++++++-------------------- 2 files changed, 16 insertions(+), 24 deletions(-) diff --git a/chanjo/cli/load.py b/chanjo/cli/load.py index 48cbfa63..5a3c3b00 100644 --- a/chanjo/cli/load.py +++ b/chanjo/cli/load.py @@ -13,25 +13,27 @@ @click.command() +@click.option('-s', '--sample', help='override sample id from file') @click.option('-g', '--group', help='id to group related samples') @click.argument('bed_stream', callback=validate_stdin, type=click.File(encoding='utf-8'), default='-', required=False) @click.pass_context -def load(context, group, bed_stream): +def load(context, sample, group, bed_stream): """Load Sambamba output into the database for a sample.""" chanjo_db = Store(uri=context.obj['database']) try: - load_sambamba(chanjo_db, bed_stream, group_id=group) + load_sambamba(chanjo_db, bed_stream, sample_id=sample, group_id=group) except IntegrityError: logger.error('sample already loaded, rolling back') chanjo_db.session.rollback() context.abort() -def load_sambamba(chanjo_db, bed_iterable, group_id=None): +def load_sambamba(chanjo_db, bed_iterable, sample_id=None, group_id=None): """Load Sambamba BED output from a stream.""" rows = bed.chanjo(bed_iterable) - stats = sambamba.rows(chanjo_db.session, rows, group_id=group_id) + stats = sambamba.rows(chanjo_db.session, rows, sample_id=sample_id, + group_id=group_id) for index, stat in enumerate(stats): chanjo_db.add(stat) if index % 10000 == 0: diff --git a/chanjo/load/sambamba.py b/chanjo/load/sambamba.py index f9b76ca7..e8825099 100644 --- a/chanjo/load/sambamba.py +++ b/chanjo/load/sambamba.py @@ -7,7 +7,7 @@ from .utils import get_or_build_exon -def rows(session, row_data, group_id=None): +def rows(session, row_data, sample_id=None, group_id=None): """Handle rows of sambamba output. N.B. only handles single sample annotations. @@ -15,16 +15,20 @@ def rows(session, row_data, group_id=None): Args: session (Session): database session object row_data (dict): parsed sambamba output rows + sample_id (Optional[str]): id to reference sample group_id (Optional[str]): id to group samples together Yields: ExonStatistic: stats model linked to exon and sample """ - # use first row to get information on sample - first_row = next(iter(row_data)) - sample_obj = sample(first_row, group_id=group_id) - # place the first row back in the stream - all_data = cons(first_row, row_data) + if sample_id is None: + # use first row to get information on sample + first_row = next(iter(row_data)) + sample_id = first_row['sampleName'] + # place the first row back in the stream + all_data = cons(first_row, row_data) + + sample_obj = Sample(sample_id=sample_id, group_id=group_id) nested_stats = (row(session, data, sample_obj) for data in all_data) # flatten 2D nested list return (stat for stats in nested_stats for stat in stats) @@ -46,20 +50,6 @@ def row(session, data, sample_obj): return stats -def sample(data, group_id=None): - """Create sample model. - - Args: - data (dict): parsed sambamba output row - group_id (Optional[str]): id to group samples together - - Returns: - Sample: sample database model - """ - sample_obj = Sample(sample_id=data['sampleName'], group=group_id) - return sample_obj - - def statistics(data, sample_obj, exon_obj): """Create models from a sambamba output row. From 52a638da3120a339ef25b1eb64b8217b472f7a73 Mon Sep 17 00:00:00 2001 From: Robin Andeer Date: Fri, 4 Sep 2015 11:19:19 +0200 Subject: [PATCH 02/11] update readme, add @moonso as collaborator --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 2278dbd4..d21004a6 100644 --- a/README.md +++ b/README.md @@ -33,8 +33,7 @@ Chanjo exposes a composable command line interface with a nifty config file impl $ chanjo init --setup $ chanjo load /path/to/sambamba.output.bed $ chanjo calculate mean sample1 -#sampleId mean-coverage -sample10 176.513223249 +{"completeness_10": 96.09, "mean_coverage": 240.96, "sample_id": "ADM980A4"} ``` ## Documentation @@ -56,6 +55,7 @@ Chanjo is not the right choice if you care about coverage for every base across - Robin Andeer - Luca Beltrame ([lbeltrame](https://github.com/lbeltrame)) - John Kern ([kern3020](https://github.com/kern3020)) +- Måns Magnusson ([moonso](https://github.com/moonso)) ## License MIT. See the [LICENSE](LICENSE) file for more details. From 04791ea02ab55c1ab6a4c352ab27516a8a9e926f Mon Sep 17 00:00:00 2001 From: Robin Andeer Date: Fri, 4 Sep 2015 11:39:28 +0200 Subject: [PATCH 03/11] update Sample model field group > group_id some PEP8 fixes :) --- chanjo/annotate/sambamba/run.py | 21 ++++++++------------- chanjo/cli/root.py | 21 ++++++++++----------- chanjo/cli/sambamba.py | 12 ++++++------ chanjo/log.py | 24 +++++++++++------------- chanjo/store/__init__.py | 4 ++-- chanjo/store/api.py | 2 +- chanjo/store/models.py | 31 +++++++++---------------------- 7 files changed, 47 insertions(+), 68 deletions(-) diff --git a/chanjo/annotate/sambamba/run.py b/chanjo/annotate/sambamba/run.py index 8388d3a8..f9e66ba4 100644 --- a/chanjo/annotate/sambamba/run.py +++ b/chanjo/annotate/sambamba/run.py @@ -7,10 +7,7 @@ def run_sambamba(bam_file, region_file, outfile=None, cov_treshold=()): - """ - Run sambamba from chanjo. - - """ + """Run sambamba from chanjo.""" logger = logging.getLogger(__name__) logger = logging.getLogger("chanjo.sambamba") log_stream = get_log_stream(logger) @@ -22,15 +19,14 @@ def run_sambamba(bam_file, region_file, outfile=None, cov_treshold=()): region_file, bam_file ] - + if outfile: sambamba_call += ["-o", outfile] - + for coverage_treshold in cov_treshold: sambamba_call += ['-T', str(coverage_treshold)] - - logger.info("Running sambamba with call: {0}".format(' '.join(sambamba_call))) - + + logger.info("Running sambamba with call: %s", ' '.join(sambamba_call)) try: subprocess.check_call( sambamba_call, @@ -40,10 +36,9 @@ def run_sambamba(bam_file, region_file, outfile=None, cov_treshold=()): logger.critical("sambamba seems to not exist on your system.") raise e except CalledProcessError as e: - logger.critical("Something went wrong when running sambamba. "\ - "Please see sambamba error output.") + logger.critical("Something went wrong when running sambamba. " + "Please see sambamba error output.") raise e - + logger.debug("sambamba run successfull") - return diff --git a/chanjo/cli/root.py b/chanjo/cli/root.py index 0a03110e..1fef934a 100644 --- a/chanjo/cli/root.py +++ b/chanjo/cli/root.py @@ -9,8 +9,6 @@ """ import click -import chanjo - from chanjo.compat import text_type from chanjo.config import Config, CONFIG_FILE_NAME, markup from chanjo.log import init_log, LEVELS @@ -18,6 +16,7 @@ from chanjo import __version__, logger + def print_version(ctx, param, value): """Callback function for printing version and exiting Args: @@ -33,21 +32,21 @@ def print_version(ctx, param, value): ctx.exit() @click.group(cls=EntryPointsCLI) -@click.option('-c', '--config', - default=CONFIG_FILE_NAME, - type=click.Path(), +@click.option('-c', '--config', + default=CONFIG_FILE_NAME, + type=click.Path(), help='path to config file' ) -@click.option('-d', '--database', +@click.option('-d', '--database', type=text_type, help='path/URI of the SQL database' ) -@click.option('-v', '--verbose', +@click.option('-v', '--verbose', count=True, default=0, help="Increase output verbosity. Can be used multiple times, eg. -vv" ) -@click.option('--log_file', +@click.option('--log_file', type=click.Path() ) @click.option('--version', @@ -60,10 +59,10 @@ def print_version(ctx, param, value): def root(context, config, database, verbose, log_file): """Clinical sequencing coverage analysis tool.""" # setup logging - - loglevel = LEVELS.get(min(verbose,2), "WARNING") + + loglevel = LEVELS.get(min(verbose, 2), "WARNING") init_log(logger, loglevel=loglevel, filename=log_file) - logger.info("version {0}".format( __version__)) + logger.info("version {0}".format(__version__)) # avoid setting global defaults in Click options, do it below when # updating the config object diff --git a/chanjo/cli/sambamba.py b/chanjo/cli/sambamba.py index cbebcfe4..7fa45ae1 100644 --- a/chanjo/cli/sambamba.py +++ b/chanjo/cli/sambamba.py @@ -5,6 +5,7 @@ from chanjo.annotate.sambamba import run_sambamba + @click.command() @click.argument('bam_file', type=click.Path(exists=True), @@ -25,7 +26,7 @@ "the percentage of bases in the region"\ "where coverage is more than this value" ) -@click.option('-o', '--outfile', +@click.option('-o', '--outfile', type=click.Path(exists=False), help='Specify the path to a file where results should be stored.' ) @@ -33,24 +34,23 @@ def sambamba(context, bam_file, exon_bed, gene_bed, cov_treshold, outfile): """Run Sambamba from chanjo.""" logger = logging.getLogger(__name__) - #For testing only: + # For testing only: logger = logging.getLogger("chanjo.cli.sambamba") logger.info("Running chanjo sambamba") - + if not (exon_bed or gene_bed): logger.warning("Please provide a region file in BED format") sys.exit() if exon_bed and gene_bed: logger.warning("Only one region file at a time") sys.exit() - + region_file = exon_bed if gene_bed: region_file = gene_bed - + try: run_sambamba(bam_file, region_file, outfile, cov_treshold) except Exception as e: logger.debug(e) click.Abort() - diff --git a/chanjo/log.py b/chanjo/log.py index 75579833..0a9f5d67 100644 --- a/chanjo/log.py +++ b/chanjo/log.py @@ -1,18 +1,16 @@ -import sys import logging LEVELS = { - 0 : 'WARNING', - 1 : 'INFO', - 2 : 'DEBUG', + 0: 'WARNING', + 1: 'INFO', + 2: 'DEBUG', } + def init_log(logger, filename=None, loglevel=None): - """ - Initializes the log file in the proper format. + """Initializes the log file in the proper format. Arguments: - filename (str): Path to a file. Or None if logging is to be disabled. loglevel (str): Determines the level of the log output. @@ -42,15 +40,15 @@ def init_log(logger, filename=None, loglevel=None): logger.addHandler(console) + def get_log_stream(logger): - """ - Returns a stream to the root log file. + """Returns a stream to the root log file. + If there is no logfile return the stderr log stream - + Returns: A stream to the root log file or stderr stream. """ - file_stream = None log_stream = None for handler in logger.handlers: @@ -58,8 +56,8 @@ def get_log_stream(logger): file_stream = handler.stream else: log_stream = handler.stream - + if file_stream: return file_stream - + return log_stream diff --git a/chanjo/store/__init__.py b/chanjo/store/__init__.py index 2d6de445..6d021a95 100644 --- a/chanjo/store/__init__.py +++ b/chanjo/store/__init__.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- from .api import ChanjoAPI from .core import Store -from .models import (Exon, ExonStatistic, Exon_Transcript, Gene, GeneStatistic, - Sample, Transcript) +from .models import (Exon, ExonStatistic, Exon_Transcript, Gene, Sample, + Transcript) diff --git a/chanjo/store/api.py b/chanjo/store/api.py index 450ea966..c8aae3e0 100644 --- a/chanjo/store/api.py +++ b/chanjo/store/api.py @@ -28,7 +28,7 @@ def filter_samples(query, group_id=None, sample_ids=None): """ if group_id: logger.debug('filter based on group') - return query.filter(Sample.group == group_id) + return query.filter(Sample.group_id == group_id) elif sample_ids: logger.debug('filter based on list of samples') return query.filter(Sample.sample_id.in_(sample_ids)) diff --git a/chanjo/store/models.py b/chanjo/store/models.py index 6f9cb107..c00e3955 100644 --- a/chanjo/store/models.py +++ b/chanjo/store/models.py @@ -135,7 +135,7 @@ class Sample(BASE): id = Column(Integer, primary_key=True) sample_id = Column(String(32), unique=True) - group = Column(String(32), index=True) + group_id = Column(String(32), index=True) created_at = Column(DateTime, default=datetime.now) updated_at = Column(DateTime, default=datetime.now, onupdate=datetime.now) @@ -148,37 +148,24 @@ class Statistic(object): """Coverage metrics for a single element and a given sample. Args: - sample_id (int): unique sample identifier - group_id (str): group identifier + metric (str): identifier for the metric + value (float): value for the metric """ - id = Column(Integer, primary_key=True) + id = Column(Integer, primary_key=True) metric = Column(String(32)) value = Column(Float) -class GeneStatistic(Statistic, BASE): - - """ - - Args: - parent_id (int): parent record Gene id - """ - - __tablename__ = 'gene_stat' - - sample_id = Column(Integer, ForeignKey('sample.id')) - sample = relationship(Sample, backref=backref('gene_stats')) - gene_id = Column(Integer, ForeignKey('gene.id')) - gene = relationship(Gene, backref=backref('stats')) - - class ExonStatistic(Statistic, BASE): - """ + """Statistics on the exon level, related to sample and exon. Args: - parent_id (int): parent record Exon id + sample (Sample): parent record Sample + exon (Exon): parent record Exon + sample_id (int): parent record Sample id + exon_id (int): parent record Exon id """ __tablename__ = 'exon_stat' From 4fc0eb5f13fc6a4de544a1517fa5e4c543c5eb56 Mon Sep 17 00:00:00 2001 From: Robin Andeer Date: Mon, 7 Sep 2015 09:33:40 +0200 Subject: [PATCH 04/11] refactor API to accept prepared queries add converter mixin API class make group by field function lazy --- chanjo/cli/calculate.py | 6 +- chanjo/store/api.py | 229 ++++++++++++-------------------------- chanjo/store/converter.py | 101 +++++++++++++++++ chanjo/store/utils.py | 59 +++++++--- 4 files changed, 220 insertions(+), 175 deletions(-) create mode 100644 chanjo/store/converter.py diff --git a/chanjo/cli/calculate.py b/chanjo/cli/calculate.py index 8b3b0e8e..5ad36c0c 100644 --- a/chanjo/cli/calculate.py +++ b/chanjo/cli/calculate.py @@ -4,6 +4,7 @@ import click from chanjo.store import ChanjoAPI +from chanjo.store.utils import filter_samples from .utils import dump_json logger = logging.getLogger(__name__) @@ -21,7 +22,10 @@ def calculate(context): @click.pass_context def mean(context, samples): """Report mean coverage for a list of samples.""" - results = context.parent.api.mean(*samples) + api = context.parent.api + query = filter_samples(api.query(), sample_ids=samples) + results = ({'sample_id': sample_id, 'metrics': data} + for sample_id, data in api.means(query)) dump_json(*results) diff --git a/chanjo/store/api.py b/chanjo/store/api.py index c8aae3e0..d1556dbf 100644 --- a/chanjo/store/api.py +++ b/chanjo/store/api.py @@ -5,38 +5,16 @@ from sqlalchemy.sql import func from chanjo.compat import itervalues +from chanjo.utils import list_get from .core import Store -from .models import Exon, ExonStatistic, Gene, Sample, Transcript -from .utils import group_by_field, predict_gender +from .converter import ChanjoConverterMixin +from .models import Exon, ExonStatistic, Sample, Transcript +from .utils import filter_samples, group_by_field, predict_gender logger = logging.getLogger(__name__) -def filter_samples(query, group_id=None, sample_ids=None): - """Filter a query to a subset of samples. - - Will return an unaltered query if none of the optional parameters - are set. `group_id` takes precedence over `sample_ids`. - - Args: - query (Query): SQLAlchemy query object - group_id (Optional[str]): sample group identifier - sample_ids (Optional[List[str]]): sample ids - - Returns: - Query: filtered query object - """ - if group_id: - logger.debug('filter based on group') - return query.filter(Sample.group_id == group_id) - elif sample_ids: - logger.debug('filter based on list of samples') - return query.filter(Sample.sample_id.in_(sample_ids)) - else: - return query - - -class ChanjoAPI(Store): +class ChanjoAPI(Store, ChanjoConverterMixin): """Interface to chanjo database for asking interesting questions. @@ -71,7 +49,7 @@ def weighted_average(self): weight = Exon.end - Exon.start total_weight = func.sum(weight) total_value = func.sum(ExonStatistic.value * weight) - weighted_mean = total_value / total_weight + weighted_mean = (total_value / total_weight).label('weighted_mean') return weighted_mean def samples(self, group_id=None, sample_ids=None): @@ -88,24 +66,24 @@ def samples(self, group_id=None, sample_ids=None): query = filter_samples(query, group_id=group_id, sample_ids=sample_ids) return query - def mean(self, *samples): - r"""Calculate mean values for all metrics on a per sample basis. + def means(self, query=None): + """Calculate means on a per sample basis. Args: - \*samples (Optional[List[str]]): filter by sample id + query (Optional[Query]): initialized and filtered query Returns: - dict: weighted averages grouped by sample + List[tuple]: sample id and grouped mean metrics """ - results = (self.query(Sample.sample_id, ExonStatistic.metric, - self.weighted_average) - .join(ExonStatistic.sample, ExonStatistic.exon) - .group_by(Sample.sample_id, ExonStatistic.metric)) - if samples: - results = results.filter(Sample.sample_id.in_(samples)) - - sample_groups = group_by_field(results, name='sample_id') - return sample_groups + query = query or self.query() + ready_query = (query.add_columns(Sample.sample_id, + ExonStatistic.metric, + self.weighted_average) + .join(ExonStatistic.sample, ExonStatistic.exon) + .group_by(Sample.sample_id, ExonStatistic.metric) + .order_by(Sample.sample_id)) + results = group_by_field(ready_query) + return results def region_alt(self, region_id, sample_id=None, per=None): """Parse region id as input for `region` method. @@ -169,16 +147,16 @@ def gene(self, *gene_ids): samples = {} for gene_id in gene_ids: logger.debug('figure out which transcripts the gene belongs to') - tx_ids = list(self.gene_to_transcripts(gene_id)) - if len(tx_ids) == 0: + exon_objs = self.gene_exons(gene_id).all() + if len(exon_objs) == 0: raise AttributeError("gene id not in database: {}" .format(gene_id)) - - results = self.transcript_to_exons(*tx_ids) - for data in group_by_field(results, name='sample_id'): - if data['sample_id'] not in samples: - samples[data['sample_id']] = {'sample_id': data['sample_id']} - samples[data['sample_id']][gene_id] = data + exon_ids = [exon_obj.exon_id for exon_obj in exon_objs] + query = (self.query().filter(Exon.exon_id.in_(exon_ids))) + for sample_id, data in self.means(query): + if sample_id not in samples: + samples[sample_id] = {'sample_id': sample_id, 'genes': {}} + samples[sample_id]['genes'][gene_id] = data return itervalues(samples) @@ -202,61 +180,7 @@ def transcripts(self, *transcript_ids): Transcript.transcript_id)) return results - def all_transcripts(self, sample_id, *gene_ids): - r"""Fetch all transcripts from the database. - - Args: - sample_id (str): restrict query to a sample id - \*gene_ids (List[str]): genes to convert to transcripts - - Returns: - List[Transcript]: transcript models related to the genes - """ - query = (self.query(Transcript) - .join(Transcript.exons, Exon.stats, ExonStatistic.sample) - .filter(Sample.sample_id == sample_id)) - if gene_ids: - tx_ids = self.gene_to_transcripts(*gene_ids) - query = query.filter(Transcript.transcript_id.in_(tx_ids)) - return query - - def transcripts_to_genes(self, transcript_ids, db_ids=False): - """Fetch a list of genes related to some exons. - - Args: - transcript_ids (List[str]): transcript ids to convert to genes - db_ids (Optional[bool]): if sending in primary key ids - - Returns: - List[Gene]: gene models related to the transcripts - """ - results = self.query(Gene).join(Gene.transcripts) - if db_ids: - results = results.filter(Transcript.id.in_(transcript_ids)) - else: - condition = Transcript.transcript_id.in_(transcript_ids) - results = results.filter(condition) - return results - - def exons_to_transcripts(self, exons_ids, db_ids=False): - """Fetch a list of transcripts related to some exons. - - Args: - exon_ids (List[str]): list of exon ids - db_ids (Optional[bool]): if sending in primary key ids - - Returns: - List[Transcript]: transcripts related to the exons - """ - results = self.query(Transcript).join(Transcript.exons) - if db_ids: - results = results.filter(Exon.id.in_(exons_ids)) - else: - results = results.filter(Exon.exon_id.in_(exons_ids)) - return results - - def incomplete_exons(self, level=10, threshold=100, group_id=None, - sample_ids=None): + def incomplete_exons(self, query=None, level=10, threshold=100): """Fetch exons with incomplete coverage at a specifc level. Args: @@ -266,64 +190,19 @@ def incomplete_exons(self, level=10, threshold=100, group_id=None, sample_ids (Optional[List[str]]): sample ids Returns: - List[tuple]: sample, exon, completeness value + List[tuple]: sample_id, dict with exons and completeness """ completeness = "completeness_{}".format(level) - query = (self.query(Sample.sample_id, - Exon.exon_id, - ExonStatistic.value) - .join(ExonStatistic.exon, ExonStatistic.sample) - .filter(ExonStatistic.metric == completeness, - ExonStatistic.value < threshold)) - query = filter_samples(query, group_id=group_id, sample_ids=sample_ids) - return query - - def gene_panel(self, gene_ids, group_id=None, sample_ids=None): - """Report mean coverage (+ completeness) for a panel of genes. - - Args: - gene_ids (List[str]): gene ids for the panel - group_id (Optional[str]): sample group identifier - sample_ids (Optional[List[str]]): sample ids - - Returns: - List[tuple]: sample, metric, weighted average value - """ - tx_ids = list(self.gene_to_transcripts(*gene_ids)) - query = self.transcript_to_exons(*tx_ids) - query = filter_samples(query, group_id=group_id, sample_ids=sample_ids) - return query - - def transcript_to_exons(self, *transcript_ids): - """Fetch a unique list of exons related to some transcripts. - - Args: - transcript_ids (List[str]): transcript ids to look up - - Returns: - List[tuple]: sample, metric, weighted average value - """ - results = (self.query(Sample.sample_id, ExonStatistic.metric, - self.weighted_average) - .join(ExonStatistic.sample, ExonStatistic.exon, - Exon.transcripts) - .filter(Transcript.transcript_id.in_(transcript_ids)) - .group_by(Sample.sample_id, ExonStatistic.metric)) - return results - - def gene_to_transcripts(self, *gene_ids): - r"""Fetch a unique list of transcripts related to some genes. - - Args: - \*gene_ids (List[str]): gene ids - - Returns: - List[str]: transcript ids - """ - results = (self.query(Transcript.transcript_id) - .join(Transcript.gene) - .filter(Gene.gene_id.in_(gene_ids))) - return (result[0] for result in results) + query = query or self.query() + ready_query = (self.query(Sample.sample_id, + Exon.exon_id, + ExonStatistic.value) + .join(ExonStatistic.exon, ExonStatistic.sample) + .filter(ExonStatistic.metric == completeness, + ExonStatistic.value < threshold) + .order_by(Sample.sample_id)) + exon_samples = group_by_field(ready_query) + return exon_samples def sex_coverage(self, sex_chromosomes=('X', 'Y')): """Query for average on X/Y chromsomes. @@ -363,3 +242,33 @@ def sex_check(self, group_id=None, sample_ids=None): # run the predictor gender = predict_gender(*sex_coverage) yield sample_id, gender, sex_coverage[0], sex_coverage[1] + + def diagnostic_yield(self, query, level=10, threshold=100): + """Calculate transcripts that aren't completely covered.""" + base_query = (query.add_columns(Transcript) + .join(ExonStatistic.exon, Exon.transcripts)) + level = "completeness_{}".format(level) + yield_query = (base_query.group_by(Transcript.id, ExonStatistic.metric) + .filter(func.avg(ExonStatistic.value) < threshold, + ExonStatistic.metric == level)) + all_tx_count = base_query.count() + tx_count = yield_query.count() + diagnostic_yield = 100 - (tx_count/all_tx_count * 100) + return { + "diagnostic_yield": diagnostic_yield, + "count": tx_count, + "total_count": all_tx_count, + "transcripts": yield_query + } + + def completeness_levels(self): + """Return a list of included completeness levels.""" + metrics = (self.query(ExonStatistic.metric) + .distinct(ExonStatistic.metric)) + # for all completeness levels, extract the level as int and full name + levels = ((int(field[0].split('_')[-1]), field[0]) + for field in metrics + if field[0].startswith('completeness')) + # sort them based on the level int + sorted_levels = sorted(levels, key=lambda level: list_get(level, 0)) + return sorted_levels diff --git a/chanjo/store/converter.py b/chanjo/store/converter.py new file mode 100644 index 00000000..3a14d413 --- /dev/null +++ b/chanjo/store/converter.py @@ -0,0 +1,101 @@ +# -*- coding: utf-8 -*- +from .models import Exon, ExonStatistic, Gene, Sample, Transcript + + +class ChanjoConverterMixin(object): + + """docstring for ChanjoConverterMixin""" + + def gene_transcripts(self, *gene_ids): + """Fetch transcripts related to a list of genes. + + Args: + *gene_ids (List[str]): gene ids + + Returns: + List[Transcript]: transcript objects related to the genes + """ + query = (self.query(Transcript) + .join(Exon.transcripts) + .filter(Gene.gene_id.in_(gene_ids))) + return query + + def gene_exons(self, *gene_ids): + """Fetch exons related to a list of genes. + + Args: + *gene_ids (List[str]): gene ids + + Returns: + List[Exon]: exon objects related to the genes + """ + query = (self.query(Exon) + .join(Exon.transcripts, Transcript.gene) + .filter(Gene.gene_id.in_(gene_ids))) + return query + + def transcript_genes(self, transcript_ids, db_ids=False): + """Fetch a lists of genes related to some exons. + + Args: + transcript_ids (List[str]): transcript ids to convert to genes + db_ids (Optional[bool]): if sending in primary key ids + + Returns: + List[Gene]: gene models related to the transcripts + """ + results = self.query(Gene).join(Gene.transcripts) + if db_ids: + results = results.filter(Transcript.id.in_(transcript_ids)) + else: + condition = Transcript.transcript_id.in_(transcript_ids) + results = results.filter(condition) + return results + + def transcript_exons(self, *transcript_ids): + """Fetch a unique list of exons related to some transcripts. + + Args: + transcript_ids (List[str]): transcript ids to look up + + Returns: + List[tuple]: sample, metric, weighted average value + """ + results = (self.query(Sample.sample_id, ExonStatistic.metric, + self.weighted_average) + .join(ExonStatistic.sample, ExonStatistic.exon, + Exon.transcripts) + .filter(Transcript.transcript_id.in_(transcript_ids)) + .group_by(Sample.sample_id, ExonStatistic.metric)) + return results + + def exon_transcripts(self, exons_ids, db_ids=False): + """Fetch a list of transcripts related to some exons. + + Args: + exon_ids (List[str]): list of exon ids + db_ids (Optional[bool]): if sending in primary key ids + + Returns: + List[Transcript]: transcripts related to the exons + """ + results = self.query(Transcript).join(Transcript.exons) + if db_ids: + results = results.filter(Exon.id.in_(exons_ids)) + else: + results = results.filter(Exon.exon_id.in_(exons_ids)) + return results + + def exon_genes(self, *exon_ids): + """Fetch genes related to a list of exons. + + Args: + *exon_ids (List[str]): exon ids + + Returns: + List[Gene]: gene objects related to the exons + """ + query = (self.query(Gene) + .join(Gene.transcripts, Transcript.exons) + .filter(Exon.exon_id.in_(exon_ids))) + return query diff --git a/chanjo/store/utils.py b/chanjo/store/utils.py index 4df369de..dc698a28 100644 --- a/chanjo/store/utils.py +++ b/chanjo/store/utils.py @@ -1,27 +1,58 @@ # -*- coding: utf-8 -*- -from chanjo.compat import itervalues +import logging +from .models import Sample -def group_by_field(results, name='field_id'): +logger = logging.getLogger(__name__) + + +def filter_samples(query, group_id=None, sample_ids=None): + """Filter a query to a subset of samples. + + Will return an unaltered query if none of the optional parameters + are set. `group_id` takes precedence over `sample_ids`. + + Args: + query (Query): SQLAlchemy query object + group_id (Optional[str]): sample group identifier + sample_ids (Optional[List[str]]): sample ids + + Returns: + Query: filtered query object + """ + if group_id: + logger.debug('filter based on group') + return query.filter(Sample.group_id == group_id) + elif sample_ids: + logger.debug('filter based on list of samples') + return query.filter(Sample.sample_id.in_(sample_ids)) + else: + return query + + +def group_by_field(results): """Group items based on the initial field. + Assumes a sorted list of results. + Args: results (List[tuple]): list of fields to group - name (Optional[str]): what to call the first field - Returns: - List[dict]: grouped dicts + Yields: + str, dict: next group of results along with the group id """ - groups = {} - # loop over results - for field_id, metric, value in results: - if field_id not in groups: - # init a new group - groups[field_id] = {name: field_id} + results = iter(results) + current_group_id, metric, value = next(results) + group = {metric: value} + for group_id, metric, value in results: + if current_group_id != group_id: + yield current_group_id, group + # reset the group + current_group_id = group_id + group = {} # store the metric under the correct group - groups[field_id][metric] = value - - return itervalues(groups) + group[metric] = value + yield current_group_id, group def predict_gender(x_coverage, y_coverage): From 910b9a70e426dadd335e052e8f79c7adeeb276e0 Mon Sep 17 00:00:00 2001 From: Robin Andeer Date: Mon, 7 Sep 2015 11:53:05 +0200 Subject: [PATCH 05/11] fix diagnostic yield, limit to a single sample per call! --- chanjo/store/api.py | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/chanjo/store/api.py b/chanjo/store/api.py index d1556dbf..ae14e2fd 100644 --- a/chanjo/store/api.py +++ b/chanjo/store/api.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +from __future__ import division import itertools import logging @@ -243,22 +244,31 @@ def sex_check(self, group_id=None, sample_ids=None): gender = predict_gender(*sex_coverage) yield sample_id, gender, sex_coverage[0], sex_coverage[1] - def diagnostic_yield(self, query, level=10, threshold=100): + def diagnostic_yield(self, sample_id, query=None, level=10, threshold=100): """Calculate transcripts that aren't completely covered.""" - base_query = (query.add_columns(Transcript) - .join(ExonStatistic.exon, Exon.transcripts)) + query = query or self.query() level = "completeness_{}".format(level) - yield_query = (base_query.group_by(Transcript.id, ExonStatistic.metric) - .filter(func.avg(ExonStatistic.value) < threshold, - ExonStatistic.metric == level)) - all_tx_count = base_query.count() - tx_count = yield_query.count() - diagnostic_yield = 100 - (tx_count/all_tx_count * 100) + + all_query = (query.add_columns(Transcript.transcript_id) + .join(Exon.transcripts)) + all_count = all_query.count() + + logger.debug('find out which exons failed') + yield_query = (query.add_columns(Exon.exon_id) + .join(ExonStatistic.exon) + .group_by(ExonStatistic.metric) + .filter(ExonStatistic.metric == level, + ExonStatistic.value < threshold)) + exon_ids = [row[0] for row in yield_query] + transcripts = self.exon_transcripts(exon_ids) + + tx_count = transcripts.count() + diagnostic_yield = 100 - (tx_count/all_count * 100) return { "diagnostic_yield": diagnostic_yield, "count": tx_count, - "total_count": all_tx_count, - "transcripts": yield_query + "total_count": all_count, + "transcripts": transcripts } def completeness_levels(self): From 0b982ace3d6494dd9193c90767815d7b9f67f36f Mon Sep 17 00:00:00 2001 From: Robin Andeer Date: Mon, 7 Sep 2015 13:21:58 +0200 Subject: [PATCH 06/11] fix bug in diag. yield --- chanjo/store/api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/chanjo/store/api.py b/chanjo/store/api.py index ae14e2fd..aa598d33 100644 --- a/chanjo/store/api.py +++ b/chanjo/store/api.py @@ -250,13 +250,13 @@ def diagnostic_yield(self, sample_id, query=None, level=10, threshold=100): level = "completeness_{}".format(level) all_query = (query.add_columns(Transcript.transcript_id) + .distinct(Transcript.transcript_id) .join(Exon.transcripts)) all_count = all_query.count() logger.debug('find out which exons failed') yield_query = (query.add_columns(Exon.exon_id) .join(ExonStatistic.exon) - .group_by(ExonStatistic.metric) .filter(ExonStatistic.metric == level, ExonStatistic.value < threshold)) exon_ids = [row[0] for row in yield_query] From db81e001673002ee1753132c56e0c73c79d65034 Mon Sep 17 00:00:00 2001 From: Robin Andeer Date: Mon, 7 Sep 2015 18:32:12 +0200 Subject: [PATCH 07/11] generalise bed parser, use for "link" doesn't have to be sambamba output --- chanjo/__init__.py | 2 +- chanjo/cli/load.py | 8 +++---- chanjo/load/utils.py | 3 ++- chanjo/log.py | 2 +- chanjo/parse/bed/parse.py | 50 +++++++++++++++++++++++++++++---------- chanjo/store/converter.py | 2 +- chanjo/utils.py | 1 - 7 files changed, 47 insertions(+), 21 deletions(-) diff --git a/chanjo/__init__.py b/chanjo/__init__.py index bbc7d601..6c031726 100644 --- a/chanjo/__init__.py +++ b/chanjo/__init__.py @@ -32,4 +32,4 @@ __email__ = 'robin.andeer@gmail.com' __license__ = 'MIT' -__copyright__ = 'Copyright 2014 Robin Andeer' +__copyright__ = 'Copyright 2015 Robin Andeer' diff --git a/chanjo/cli/load.py b/chanjo/cli/load.py index 5a3c3b00..da1f9068 100644 --- a/chanjo/cli/load.py +++ b/chanjo/cli/load.py @@ -4,8 +4,8 @@ import click from sqlalchemy.exc import IntegrityError -from chanjo.load import sambamba -from chanjo.parse import bed +from chanjo.load.sambamba import rows as sambamba_rows +from chanjo.parse import sambamba from chanjo.store import Store from chanjo.utils import validate_stdin @@ -31,8 +31,8 @@ def load(context, sample, group, bed_stream): def load_sambamba(chanjo_db, bed_iterable, sample_id=None, group_id=None): """Load Sambamba BED output from a stream.""" - rows = bed.chanjo(bed_iterable) - stats = sambamba.rows(chanjo_db.session, rows, sample_id=sample_id, + rows = sambamba.depth_output(bed_iterable) + stats = sambamba_rows(chanjo_db.session, rows, sample_id=sample_id, group_id=group_id) for index, stat in enumerate(stats): chanjo_db.add(stat) diff --git a/chanjo/load/utils.py b/chanjo/load/utils.py index 9b84dd17..7864a4d2 100644 --- a/chanjo/load/utils.py +++ b/chanjo/load/utils.py @@ -11,7 +11,8 @@ def _exon_kwargs(data): Returns: dict: kwargs prepared for Exon model """ - return {'exon_id': data['name'], 'chromosome': data['chrom'], + exon_id = data.get('name') or data['extraFields'][0] + return {'exon_id': exon_id, 'chromosome': data['chrom'], 'start': data['chromStart'], 'end': data['chromEnd']} diff --git a/chanjo/log.py b/chanjo/log.py index 0a9f5d67..29f3f895 100644 --- a/chanjo/log.py +++ b/chanjo/log.py @@ -15,7 +15,7 @@ def init_log(logger, filename=None, loglevel=None): be disabled. loglevel (str): Determines the level of the log output. """ - template = '[%(asctime)s] %(levelname)-8s: %(name)-25s: %(message)s' + template = "[%(asctime)s] %(levelname)-8s: %(name)-25s: %(message)s" formatter = logging.Formatter(template) if loglevel: diff --git a/chanjo/parse/bed/parse.py b/chanjo/parse/bed/parse.py index 63afd615..a959d15d 100644 --- a/chanjo/parse/bed/parse.py +++ b/chanjo/parse/bed/parse.py @@ -1,14 +1,12 @@ # -*- coding: utf-8 -*- """Parse BED files including Sambamba output files.""" from chanjo.compat import zip -from chanjo.parse import sambamba +from chanjo.exc import BedFormattingError from chanjo.utils import list_get def chanjo(handle): - """Parse a general BED file. - - Parses the optional columns not covered by Sambamba. + """Parse the chanjo specific columns in a BED file. Args: handle (iterable): Chanjo-formatted BED lines @@ -16,14 +14,42 @@ def chanjo(handle): Yields: dict: representation of row in BED file """ - row_data = sambamba.depth_output(handle) - for data in row_data: - data['name'] = list_get(data['extraFields'], 0) - data['score'] = int(list_get(data['extraFields'], 1, default=0)) - data['strand'] = list_get(data['extraFields'], 2) - element_combos = extra_fields(data['extraFields'][3:]) - data['elements'] = element_combos - yield data + lines = (line.strip() for line in handle if not line.startswith('#')) + rows = (line.split('\t') for line in lines) + for row in rows: + yield expand_row(row) + + +def expand_row(row): + """Parse a BED row. + + Args: + List[str]: BED row + + Returns: + dict: formatted BED row as a dict + + Raises: + BedFormattingError: failure to parse first three columns + """ + try: + data = { + 'chrom': row[0], + 'chromStart': int(row[1]), + 'chromEnd': int(row[2]) + } + except IndexError: + raise BedFormattingError('make sure fields are tab-separated') + except ValueError: + raise BedFormattingError("positions malformatted: {}".format(row)) + + data['name'] = list_get(row, 3) + data['score'] = list_get(row, 4) + data['strand'] = list_get(row, 5) + element_combos = extra_fields(row[6:8]) + data['elements'] = element_combos + data['extra_fields'] = row[8:] + return data def extra_fields(columns): diff --git a/chanjo/store/converter.py b/chanjo/store/converter.py index 3a14d413..46ad8042 100644 --- a/chanjo/store/converter.py +++ b/chanjo/store/converter.py @@ -4,7 +4,7 @@ class ChanjoConverterMixin(object): - """docstring for ChanjoConverterMixin""" + """Mixin to provide conversions between genomic elements.""" def gene_transcripts(self, *gene_ids): """Fetch transcripts related to a list of genes. diff --git a/chanjo/utils.py b/chanjo/utils.py index 85c080ca..c3bac1aa 100644 --- a/chanjo/utils.py +++ b/chanjo/utils.py @@ -6,7 +6,6 @@ A few general utility functions that might also be useful outside Chanjo. """ -from __future__ import division import pkg_resources import random import sys From bfa8d42fb39f38062fb2f6e993defaa5cbf12c3d Mon Sep 17 00:00:00 2001 From: Robin Andeer Date: Mon, 7 Sep 2015 18:43:30 +0200 Subject: [PATCH 08/11] update tests for new parsing --- chanjo/parse/bed/parse.py | 6 +++++- tests/load/test_sambamba.py | 4 ++-- tests/parse/bed/test_bed_parse.py | 15 ++++++--------- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/chanjo/parse/bed/parse.py b/chanjo/parse/bed/parse.py index a959d15d..ef2f7e46 100644 --- a/chanjo/parse/bed/parse.py +++ b/chanjo/parse/bed/parse.py @@ -43,8 +43,12 @@ def expand_row(row): except ValueError: raise BedFormattingError("positions malformatted: {}".format(row)) + try: + data['score'] = int(list_get(row, 4)) + except ValueError: + raise BedFormattingError('invalid BED syntax, score column is int') + data['name'] = list_get(row, 3) - data['score'] = list_get(row, 4) data['strand'] = list_get(row, 5) element_combos = extra_fields(row[6:8]) data['elements'] = element_combos diff --git a/tests/load/test_sambamba.py b/tests/load/test_sambamba.py index f00efc61..8d5ffe99 100644 --- a/tests/load/test_sambamba.py +++ b/tests/load/test_sambamba.py @@ -4,7 +4,7 @@ from chanjo.store import Store from chanjo.load import sambamba as load_sambamba -from chanjo.parse import bed as parse_bed +from chanjo.parse import sambamba as parse_sambamba from get_bedrows import get_bedlines bed_lines = get_bedlines() @@ -14,7 +14,7 @@ class TestSambamba: def setup(self): self.store = Store('sqlite://') self.store.set_up() - self.row_data = parse_bed.chanjo(bed_lines) + self.row_data = parse_sambamba.depth_output(bed_lines) def teardown(self): self.store.tear_down() diff --git a/tests/parse/bed/test_bed_parse.py b/tests/parse/bed/test_bed_parse.py index 280db4c4..21993d2d 100644 --- a/tests/parse/bed/test_bed_parse.py +++ b/tests/parse/bed/test_bed_parse.py @@ -1,4 +1,7 @@ # -*- coding: utf-8 -*- +import pytest + +from chanjo.exc import BedFormattingError from chanjo.parse import bed TEST_OUTPUT = 'tests/fixtures/test.sambamba.bed' @@ -23,16 +26,10 @@ def test_chanjo(): assert list(data['elements']) == [('CCDS30547.1', 'OR4F5')] -def test_chanjo_without_elements(): +def test_chanj_with_sambamba(): bed_lines = ["# chrom\tchromStart\tchromEnd\treadCount\tmeanCoverage" "\tsampleName", "1\t69089\t70007\t232\t25.4946\tADM992A10\t"] - data_rows = list(bed.chanjo(bed_lines)) - data = data_rows[0] - - assert len(data_rows) == 1 - assert data['name'] is None - assert data['score'] == 0 - assert data['strand'] is None - assert list(data['elements']) == [] + with pytest.raises(BedFormattingError): + list(bed.chanjo(bed_lines)) From 48524de14c87d251a668d62fb774a998521385be Mon Sep 17 00:00:00 2001 From: Robin Andeer Date: Mon, 7 Sep 2015 19:07:37 +0200 Subject: [PATCH 09/11] parse list of cov thresholds for sambamba command/init --- chanjo/cli/init.py | 7 ++++--- chanjo/config/questions.py | 3 +++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/chanjo/cli/init.py b/chanjo/cli/init.py index 1e3db8dd..121e73db 100644 --- a/chanjo/cli/init.py +++ b/chanjo/cli/init.py @@ -18,8 +18,9 @@ def init(context, setup, reset, automate): click.echo(chanjo.__banner__) if not automate: - questions = [('annotate.cutoff', 'sufficient coverage', - context.obj.get('annotate', {}).get('cutoff', 10)), + questions = [('sambamba.cov_treshold', 'sufficient coverage', + context.obj.get('sambamba', {}).get('cov_treshold', + [10, 20])), ('database', 'central database path/URI', context.obj['database'])] # launch init pipeline @@ -47,5 +48,5 @@ def init_pipeline(program, config, questions): for dot_key, value in user_defaults.items(): config.set(dot_key, value, scope=config.user_data) - # Write to the config file + # write to the config file config.save(default_flow_style=False) diff --git a/chanjo/config/questions.py b/chanjo/config/questions.py index b0a48628..fe1be21c 100644 --- a/chanjo/config/questions.py +++ b/chanjo/config/questions.py @@ -96,6 +96,9 @@ def ask(prompt, default=None, color='cyan'): # write default option in parentheses, use it as response if nothing # was submitted by user. response = input(build_prompt(prompt, default_string)) or default + if isinstance(default, list) and isinstance(response, str): + sep = ',' if ',' in response else None + response = [int(item) for item in response.split(sep)] # print the updated confirmation line by replacing the previous echo(MOVE_CURSOR_UP + ERASE_LINE From 7c7ef8b1e6b986d540bc0a3d597c18ae5f87a0a9 Mon Sep 17 00:00:00 2001 From: Robin Andeer Date: Mon, 7 Sep 2015 19:22:05 +0200 Subject: [PATCH 10/11] update demo fix output of calc region --- chanjo/cli/calculate.py | 4 ++- chanjo/store/api.py | 5 ++-- docs/introduction.rst | 60 ++++++++++++++++++++++------------------- 3 files changed, 39 insertions(+), 30 deletions(-) diff --git a/chanjo/cli/calculate.py b/chanjo/cli/calculate.py index 5ad36c0c..9d2f637f 100644 --- a/chanjo/cli/calculate.py +++ b/chanjo/cli/calculate.py @@ -53,7 +53,9 @@ def region(context, sample, per, chromosome, start, end): logger.debug('region id detected, parse string') results = api.region_alt(chromosome, sample_id=sample, per=per) else: - results = api.region(chromosome, start, end, sample_id=sample, per=per) + query = api.region(chromosome, start, end, sample_id=sample, per=per) + results = ({'exon_id': exon_id, 'metrics': data} + for exon_id, data in query) if per == 'exon': dump_json(*results) else: diff --git a/chanjo/store/api.py b/chanjo/store/api.py index aa598d33..475bb672 100644 --- a/chanjo/store/api.py +++ b/chanjo/store/api.py @@ -122,7 +122,8 @@ def region(self, chromosome, start, end, sample_id=None, per=None): .filter(Exon.chromosome == chromosome, Exon.start >= start, Exon.end <= end) - .group_by(ExonStatistic.metric)) + .group_by(ExonStatistic.metric) + .order_by(Exon.exon_id)) if sample_id: results = (results.join(ExonStatistic.sample) @@ -130,7 +131,7 @@ def region(self, chromosome, start, end, sample_id=None, per=None): if per == 'exon': results = results.group_by(Exon.exon_id) - exon_groups = group_by_field(results, name='exon_id') + exon_groups = group_by_field(results) return exon_groups else: data = {metric: value for exon_id, metric, value in results} diff --git a/docs/introduction.rst b/docs/introduction.rst index aea4e174..03a15b02 100644 --- a/docs/introduction.rst +++ b/docs/introduction.rst @@ -83,53 +83,59 @@ We now have some information loaded for a few samples and we can now start explo $ chanjo calculate mean sample1 { - "completeness_20": 96.02074970588237, - "completeness_10": 96.80992352941175, - "completeness_100": 66.78868541470588, - "mean_coverage": 171.9479456635295, + "metrics": { + "completeness_20": 90.38, + "completeness_10": 90.92, + "completeness_100": 70.62, + "mean_coverage": 193.85 + }, "sample_id": "sample1" } $ chanjo calculate gene FAP MUSK { - "MUSK": { - "completeness_20": 100.0, - "completeness_10": 100.0, - "completeness_100": 92.58771999999999, - "mean_coverage": 324.4876066666667, - "sample_id": "sample5" - }, - "FAP": { - "completeness_20": 97.08153846153847, - "completeness_10": 100.0, - "completeness_100": 45.32870461538461, - "mean_coverage": 137.20929999999998, - "sample_id": "sample5" + "genes": { + "MUSK": { + "completeness_20": 100.0, + "completeness_10": 100.0, + "completeness_100": 99.08, + "mean_coverage": 370.36 + }, + "FAP": { + "completeness_20": 97.24, + "completeness_10": 100.0, + "completeness_100": 50.19, + "mean_coverage": 151.63 + } }, "sample_id": "sample5" } - ... + [...] $ chanjo calculate region 11 619304 619586 { "completeness_20": 100.0, "completeness_10": 100.0, "completeness_100": 100.0, - "mean_coverage": 253.81090000000003 + "mean_coverage": 258.24 } $ chanjo calculate region 11 619304 619586 --per exon { - "completeness_20": 100.0, - "completeness_10": 100.0, - "completeness_100": 100.0, - "mean_coverage": 223.3904, + "metrics": { + "completeness_20": 100.0, + "completeness_10": 100.0, + "completeness_100": 100.0, + "mean_coverage": 223.3904 + }, "exon_id": "11-619305-619389" } { - "completeness_20": 100.0, - "completeness_10": 100.0, - "completeness_100": 100.0, - "mean_coverage": 284.2314, + "metrics": { + "completeness_20": 100.0, + "completeness_10": 100.0, + "completeness_100": 100.0, + "mean_coverage": 284.23 + }, "exon_id": "11-619473-619586" } From 04c7632e972b26fd48386f4676c807d135318886 Mon Sep 17 00:00:00 2001 From: Robin Andeer Date: Mon, 7 Sep 2015 19:29:01 +0200 Subject: [PATCH 11/11] update README about 3.0 --- README.md | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index d21004a6..83cd567c 100644 --- a/README.md +++ b/README.md @@ -10,8 +10,13 @@ Chanjo is coverage analysis for clinical sequencing. It's implemented in Python with a command line interface that adheres to [UNIX pipeline philisophy][unix]. +## Whats new in Chanjo 3.0? +Hey - exiting things are coming to the new version of Chanjo :smile: + +The primary change is [Sambamba][sambamba] integration. Just run `sambamba depth region` and load the output into Chanjo for further data exploration. Chanjo is now more flexible, accurate, and much easier to install. We have also built in some basic commands to quickly extract statistics from the database right from the command line. + ## Installation -Chanjo is distruibuted through "pip". Install the latest release by running: +Chanjo is distruibuted through "pip". Install the latest stable release by running: ```bash $ pip install chanjo @@ -33,7 +38,7 @@ Chanjo exposes a composable command line interface with a nifty config file impl $ chanjo init --setup $ chanjo load /path/to/sambamba.output.bed $ chanjo calculate mean sample1 -{"completeness_10": 96.09, "mean_coverage": 240.96, "sample_id": "ADM980A4"} +{"metrics": {"completeness_10": 90.92, "mean_coverage": 193.85}, "sample_id": "sample1"} ``` ## Documentation @@ -69,6 +74,7 @@ Anyone can help make this project better - read [CONTRIBUTION](CONTRIBUTION.md) [bedtools]: http://bedtools.readthedocs.org/en/latest/ [thesis]: https://s3.amazonaws.com/tudo/chanjo/RobinAndeerMastersThesisFinal_2013.pdf [report]: https://github.com/robinandeer/chanjo-report +[sambamba]: http://lomereiter.github.io/sambamba/ [coveralls-url]: https://coveralls.io/r/robinandeer/chanjo [coveralls-image]: https://img.shields.io/coveralls/robinandeer/chanjo.svg?style=flat