diff --git a/README.md b/README.md index 2278dbd4..83cd567c 100644 --- a/README.md +++ b/README.md @@ -10,8 +10,13 @@ Chanjo is coverage analysis for clinical sequencing. It's implemented in Python with a command line interface that adheres to [UNIX pipeline philisophy][unix]. +## Whats new in Chanjo 3.0? +Hey - exiting things are coming to the new version of Chanjo :smile: + +The primary change is [Sambamba][sambamba] integration. Just run `sambamba depth region` and load the output into Chanjo for further data exploration. Chanjo is now more flexible, accurate, and much easier to install. We have also built in some basic commands to quickly extract statistics from the database right from the command line. + ## Installation -Chanjo is distruibuted through "pip". Install the latest release by running: +Chanjo is distruibuted through "pip". Install the latest stable release by running: ```bash $ pip install chanjo @@ -33,8 +38,7 @@ Chanjo exposes a composable command line interface with a nifty config file impl $ chanjo init --setup $ chanjo load /path/to/sambamba.output.bed $ chanjo calculate mean sample1 -#sampleId mean-coverage -sample10 176.513223249 +{"metrics": {"completeness_10": 90.92, "mean_coverage": 193.85}, "sample_id": "sample1"} ``` ## Documentation @@ -56,6 +60,7 @@ Chanjo is not the right choice if you care about coverage for every base across - Robin Andeer - Luca Beltrame ([lbeltrame](https://github.com/lbeltrame)) - John Kern ([kern3020](https://github.com/kern3020)) +- Måns Magnusson ([moonso](https://github.com/moonso)) ## License MIT. See the [LICENSE](LICENSE) file for more details. @@ -69,6 +74,7 @@ Anyone can help make this project better - read [CONTRIBUTION](CONTRIBUTION.md) [bedtools]: http://bedtools.readthedocs.org/en/latest/ [thesis]: https://s3.amazonaws.com/tudo/chanjo/RobinAndeerMastersThesisFinal_2013.pdf [report]: https://github.com/robinandeer/chanjo-report +[sambamba]: http://lomereiter.github.io/sambamba/ [coveralls-url]: https://coveralls.io/r/robinandeer/chanjo [coveralls-image]: https://img.shields.io/coveralls/robinandeer/chanjo.svg?style=flat diff --git a/chanjo/__init__.py b/chanjo/__init__.py index bbc7d601..6c031726 100644 --- a/chanjo/__init__.py +++ b/chanjo/__init__.py @@ -32,4 +32,4 @@ __email__ = 'robin.andeer@gmail.com' __license__ = 'MIT' -__copyright__ = 'Copyright 2014 Robin Andeer' +__copyright__ = 'Copyright 2015 Robin Andeer' diff --git a/chanjo/annotate/sambamba/run.py b/chanjo/annotate/sambamba/run.py index 8388d3a8..f9e66ba4 100644 --- a/chanjo/annotate/sambamba/run.py +++ b/chanjo/annotate/sambamba/run.py @@ -7,10 +7,7 @@ def run_sambamba(bam_file, region_file, outfile=None, cov_treshold=()): - """ - Run sambamba from chanjo. - - """ + """Run sambamba from chanjo.""" logger = logging.getLogger(__name__) logger = logging.getLogger("chanjo.sambamba") log_stream = get_log_stream(logger) @@ -22,15 +19,14 @@ def run_sambamba(bam_file, region_file, outfile=None, cov_treshold=()): region_file, bam_file ] - + if outfile: sambamba_call += ["-o", outfile] - + for coverage_treshold in cov_treshold: sambamba_call += ['-T', str(coverage_treshold)] - - logger.info("Running sambamba with call: {0}".format(' '.join(sambamba_call))) - + + logger.info("Running sambamba with call: %s", ' '.join(sambamba_call)) try: subprocess.check_call( sambamba_call, @@ -40,10 +36,9 @@ def run_sambamba(bam_file, region_file, outfile=None, cov_treshold=()): logger.critical("sambamba seems to not exist on your system.") raise e except CalledProcessError as e: - logger.critical("Something went wrong when running sambamba. "\ - "Please see sambamba error output.") + logger.critical("Something went wrong when running sambamba. " + "Please see sambamba error output.") raise e - + logger.debug("sambamba run successfull") - return diff --git a/chanjo/cli/calculate.py b/chanjo/cli/calculate.py index 8b3b0e8e..9d2f637f 100644 --- a/chanjo/cli/calculate.py +++ b/chanjo/cli/calculate.py @@ -4,6 +4,7 @@ import click from chanjo.store import ChanjoAPI +from chanjo.store.utils import filter_samples from .utils import dump_json logger = logging.getLogger(__name__) @@ -21,7 +22,10 @@ def calculate(context): @click.pass_context def mean(context, samples): """Report mean coverage for a list of samples.""" - results = context.parent.api.mean(*samples) + api = context.parent.api + query = filter_samples(api.query(), sample_ids=samples) + results = ({'sample_id': sample_id, 'metrics': data} + for sample_id, data in api.means(query)) dump_json(*results) @@ -49,7 +53,9 @@ def region(context, sample, per, chromosome, start, end): logger.debug('region id detected, parse string') results = api.region_alt(chromosome, sample_id=sample, per=per) else: - results = api.region(chromosome, start, end, sample_id=sample, per=per) + query = api.region(chromosome, start, end, sample_id=sample, per=per) + results = ({'exon_id': exon_id, 'metrics': data} + for exon_id, data in query) if per == 'exon': dump_json(*results) else: diff --git a/chanjo/cli/init.py b/chanjo/cli/init.py index 1e3db8dd..121e73db 100644 --- a/chanjo/cli/init.py +++ b/chanjo/cli/init.py @@ -18,8 +18,9 @@ def init(context, setup, reset, automate): click.echo(chanjo.__banner__) if not automate: - questions = [('annotate.cutoff', 'sufficient coverage', - context.obj.get('annotate', {}).get('cutoff', 10)), + questions = [('sambamba.cov_treshold', 'sufficient coverage', + context.obj.get('sambamba', {}).get('cov_treshold', + [10, 20])), ('database', 'central database path/URI', context.obj['database'])] # launch init pipeline @@ -47,5 +48,5 @@ def init_pipeline(program, config, questions): for dot_key, value in user_defaults.items(): config.set(dot_key, value, scope=config.user_data) - # Write to the config file + # write to the config file config.save(default_flow_style=False) diff --git a/chanjo/cli/load.py b/chanjo/cli/load.py index 48cbfa63..da1f9068 100644 --- a/chanjo/cli/load.py +++ b/chanjo/cli/load.py @@ -4,8 +4,8 @@ import click from sqlalchemy.exc import IntegrityError -from chanjo.load import sambamba -from chanjo.parse import bed +from chanjo.load.sambamba import rows as sambamba_rows +from chanjo.parse import sambamba from chanjo.store import Store from chanjo.utils import validate_stdin @@ -13,25 +13,27 @@ @click.command() +@click.option('-s', '--sample', help='override sample id from file') @click.option('-g', '--group', help='id to group related samples') @click.argument('bed_stream', callback=validate_stdin, type=click.File(encoding='utf-8'), default='-', required=False) @click.pass_context -def load(context, group, bed_stream): +def load(context, sample, group, bed_stream): """Load Sambamba output into the database for a sample.""" chanjo_db = Store(uri=context.obj['database']) try: - load_sambamba(chanjo_db, bed_stream, group_id=group) + load_sambamba(chanjo_db, bed_stream, sample_id=sample, group_id=group) except IntegrityError: logger.error('sample already loaded, rolling back') chanjo_db.session.rollback() context.abort() -def load_sambamba(chanjo_db, bed_iterable, group_id=None): +def load_sambamba(chanjo_db, bed_iterable, sample_id=None, group_id=None): """Load Sambamba BED output from a stream.""" - rows = bed.chanjo(bed_iterable) - stats = sambamba.rows(chanjo_db.session, rows, group_id=group_id) + rows = sambamba.depth_output(bed_iterable) + stats = sambamba_rows(chanjo_db.session, rows, sample_id=sample_id, + group_id=group_id) for index, stat in enumerate(stats): chanjo_db.add(stat) if index % 10000 == 0: diff --git a/chanjo/cli/root.py b/chanjo/cli/root.py index 0a03110e..1fef934a 100644 --- a/chanjo/cli/root.py +++ b/chanjo/cli/root.py @@ -9,8 +9,6 @@ """ import click -import chanjo - from chanjo.compat import text_type from chanjo.config import Config, CONFIG_FILE_NAME, markup from chanjo.log import init_log, LEVELS @@ -18,6 +16,7 @@ from chanjo import __version__, logger + def print_version(ctx, param, value): """Callback function for printing version and exiting Args: @@ -33,21 +32,21 @@ def print_version(ctx, param, value): ctx.exit() @click.group(cls=EntryPointsCLI) -@click.option('-c', '--config', - default=CONFIG_FILE_NAME, - type=click.Path(), +@click.option('-c', '--config', + default=CONFIG_FILE_NAME, + type=click.Path(), help='path to config file' ) -@click.option('-d', '--database', +@click.option('-d', '--database', type=text_type, help='path/URI of the SQL database' ) -@click.option('-v', '--verbose', +@click.option('-v', '--verbose', count=True, default=0, help="Increase output verbosity. Can be used multiple times, eg. -vv" ) -@click.option('--log_file', +@click.option('--log_file', type=click.Path() ) @click.option('--version', @@ -60,10 +59,10 @@ def print_version(ctx, param, value): def root(context, config, database, verbose, log_file): """Clinical sequencing coverage analysis tool.""" # setup logging - - loglevel = LEVELS.get(min(verbose,2), "WARNING") + + loglevel = LEVELS.get(min(verbose, 2), "WARNING") init_log(logger, loglevel=loglevel, filename=log_file) - logger.info("version {0}".format( __version__)) + logger.info("version {0}".format(__version__)) # avoid setting global defaults in Click options, do it below when # updating the config object diff --git a/chanjo/cli/sambamba.py b/chanjo/cli/sambamba.py index cbebcfe4..7fa45ae1 100644 --- a/chanjo/cli/sambamba.py +++ b/chanjo/cli/sambamba.py @@ -5,6 +5,7 @@ from chanjo.annotate.sambamba import run_sambamba + @click.command() @click.argument('bam_file', type=click.Path(exists=True), @@ -25,7 +26,7 @@ "the percentage of bases in the region"\ "where coverage is more than this value" ) -@click.option('-o', '--outfile', +@click.option('-o', '--outfile', type=click.Path(exists=False), help='Specify the path to a file where results should be stored.' ) @@ -33,24 +34,23 @@ def sambamba(context, bam_file, exon_bed, gene_bed, cov_treshold, outfile): """Run Sambamba from chanjo.""" logger = logging.getLogger(__name__) - #For testing only: + # For testing only: logger = logging.getLogger("chanjo.cli.sambamba") logger.info("Running chanjo sambamba") - + if not (exon_bed or gene_bed): logger.warning("Please provide a region file in BED format") sys.exit() if exon_bed and gene_bed: logger.warning("Only one region file at a time") sys.exit() - + region_file = exon_bed if gene_bed: region_file = gene_bed - + try: run_sambamba(bam_file, region_file, outfile, cov_treshold) except Exception as e: logger.debug(e) click.Abort() - diff --git a/chanjo/config/questions.py b/chanjo/config/questions.py index b0a48628..fe1be21c 100644 --- a/chanjo/config/questions.py +++ b/chanjo/config/questions.py @@ -96,6 +96,9 @@ def ask(prompt, default=None, color='cyan'): # write default option in parentheses, use it as response if nothing # was submitted by user. response = input(build_prompt(prompt, default_string)) or default + if isinstance(default, list) and isinstance(response, str): + sep = ',' if ',' in response else None + response = [int(item) for item in response.split(sep)] # print the updated confirmation line by replacing the previous echo(MOVE_CURSOR_UP + ERASE_LINE diff --git a/chanjo/load/sambamba.py b/chanjo/load/sambamba.py index f9b76ca7..e8825099 100644 --- a/chanjo/load/sambamba.py +++ b/chanjo/load/sambamba.py @@ -7,7 +7,7 @@ from .utils import get_or_build_exon -def rows(session, row_data, group_id=None): +def rows(session, row_data, sample_id=None, group_id=None): """Handle rows of sambamba output. N.B. only handles single sample annotations. @@ -15,16 +15,20 @@ def rows(session, row_data, group_id=None): Args: session (Session): database session object row_data (dict): parsed sambamba output rows + sample_id (Optional[str]): id to reference sample group_id (Optional[str]): id to group samples together Yields: ExonStatistic: stats model linked to exon and sample """ - # use first row to get information on sample - first_row = next(iter(row_data)) - sample_obj = sample(first_row, group_id=group_id) - # place the first row back in the stream - all_data = cons(first_row, row_data) + if sample_id is None: + # use first row to get information on sample + first_row = next(iter(row_data)) + sample_id = first_row['sampleName'] + # place the first row back in the stream + all_data = cons(first_row, row_data) + + sample_obj = Sample(sample_id=sample_id, group_id=group_id) nested_stats = (row(session, data, sample_obj) for data in all_data) # flatten 2D nested list return (stat for stats in nested_stats for stat in stats) @@ -46,20 +50,6 @@ def row(session, data, sample_obj): return stats -def sample(data, group_id=None): - """Create sample model. - - Args: - data (dict): parsed sambamba output row - group_id (Optional[str]): id to group samples together - - Returns: - Sample: sample database model - """ - sample_obj = Sample(sample_id=data['sampleName'], group=group_id) - return sample_obj - - def statistics(data, sample_obj, exon_obj): """Create models from a sambamba output row. diff --git a/chanjo/load/utils.py b/chanjo/load/utils.py index 9b84dd17..7864a4d2 100644 --- a/chanjo/load/utils.py +++ b/chanjo/load/utils.py @@ -11,7 +11,8 @@ def _exon_kwargs(data): Returns: dict: kwargs prepared for Exon model """ - return {'exon_id': data['name'], 'chromosome': data['chrom'], + exon_id = data.get('name') or data['extraFields'][0] + return {'exon_id': exon_id, 'chromosome': data['chrom'], 'start': data['chromStart'], 'end': data['chromEnd']} diff --git a/chanjo/log.py b/chanjo/log.py index 75579833..29f3f895 100644 --- a/chanjo/log.py +++ b/chanjo/log.py @@ -1,23 +1,21 @@ -import sys import logging LEVELS = { - 0 : 'WARNING', - 1 : 'INFO', - 2 : 'DEBUG', + 0: 'WARNING', + 1: 'INFO', + 2: 'DEBUG', } + def init_log(logger, filename=None, loglevel=None): - """ - Initializes the log file in the proper format. + """Initializes the log file in the proper format. Arguments: - filename (str): Path to a file. Or None if logging is to be disabled. loglevel (str): Determines the level of the log output. """ - template = '[%(asctime)s] %(levelname)-8s: %(name)-25s: %(message)s' + template = "[%(asctime)s] %(levelname)-8s: %(name)-25s: %(message)s" formatter = logging.Formatter(template) if loglevel: @@ -42,15 +40,15 @@ def init_log(logger, filename=None, loglevel=None): logger.addHandler(console) + def get_log_stream(logger): - """ - Returns a stream to the root log file. + """Returns a stream to the root log file. + If there is no logfile return the stderr log stream - + Returns: A stream to the root log file or stderr stream. """ - file_stream = None log_stream = None for handler in logger.handlers: @@ -58,8 +56,8 @@ def get_log_stream(logger): file_stream = handler.stream else: log_stream = handler.stream - + if file_stream: return file_stream - + return log_stream diff --git a/chanjo/parse/bed/parse.py b/chanjo/parse/bed/parse.py index 63afd615..ef2f7e46 100644 --- a/chanjo/parse/bed/parse.py +++ b/chanjo/parse/bed/parse.py @@ -1,14 +1,12 @@ # -*- coding: utf-8 -*- """Parse BED files including Sambamba output files.""" from chanjo.compat import zip -from chanjo.parse import sambamba +from chanjo.exc import BedFormattingError from chanjo.utils import list_get def chanjo(handle): - """Parse a general BED file. - - Parses the optional columns not covered by Sambamba. + """Parse the chanjo specific columns in a BED file. Args: handle (iterable): Chanjo-formatted BED lines @@ -16,14 +14,46 @@ def chanjo(handle): Yields: dict: representation of row in BED file """ - row_data = sambamba.depth_output(handle) - for data in row_data: - data['name'] = list_get(data['extraFields'], 0) - data['score'] = int(list_get(data['extraFields'], 1, default=0)) - data['strand'] = list_get(data['extraFields'], 2) - element_combos = extra_fields(data['extraFields'][3:]) - data['elements'] = element_combos - yield data + lines = (line.strip() for line in handle if not line.startswith('#')) + rows = (line.split('\t') for line in lines) + for row in rows: + yield expand_row(row) + + +def expand_row(row): + """Parse a BED row. + + Args: + List[str]: BED row + + Returns: + dict: formatted BED row as a dict + + Raises: + BedFormattingError: failure to parse first three columns + """ + try: + data = { + 'chrom': row[0], + 'chromStart': int(row[1]), + 'chromEnd': int(row[2]) + } + except IndexError: + raise BedFormattingError('make sure fields are tab-separated') + except ValueError: + raise BedFormattingError("positions malformatted: {}".format(row)) + + try: + data['score'] = int(list_get(row, 4)) + except ValueError: + raise BedFormattingError('invalid BED syntax, score column is int') + + data['name'] = list_get(row, 3) + data['strand'] = list_get(row, 5) + element_combos = extra_fields(row[6:8]) + data['elements'] = element_combos + data['extra_fields'] = row[8:] + return data def extra_fields(columns): diff --git a/chanjo/store/__init__.py b/chanjo/store/__init__.py index 2d6de445..6d021a95 100644 --- a/chanjo/store/__init__.py +++ b/chanjo/store/__init__.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- from .api import ChanjoAPI from .core import Store -from .models import (Exon, ExonStatistic, Exon_Transcript, Gene, GeneStatistic, - Sample, Transcript) +from .models import (Exon, ExonStatistic, Exon_Transcript, Gene, Sample, + Transcript) diff --git a/chanjo/store/api.py b/chanjo/store/api.py index 450ea966..475bb672 100644 --- a/chanjo/store/api.py +++ b/chanjo/store/api.py @@ -1,42 +1,21 @@ # -*- coding: utf-8 -*- +from __future__ import division import itertools import logging from sqlalchemy.sql import func from chanjo.compat import itervalues +from chanjo.utils import list_get from .core import Store -from .models import Exon, ExonStatistic, Gene, Sample, Transcript -from .utils import group_by_field, predict_gender +from .converter import ChanjoConverterMixin +from .models import Exon, ExonStatistic, Sample, Transcript +from .utils import filter_samples, group_by_field, predict_gender logger = logging.getLogger(__name__) -def filter_samples(query, group_id=None, sample_ids=None): - """Filter a query to a subset of samples. - - Will return an unaltered query if none of the optional parameters - are set. `group_id` takes precedence over `sample_ids`. - - Args: - query (Query): SQLAlchemy query object - group_id (Optional[str]): sample group identifier - sample_ids (Optional[List[str]]): sample ids - - Returns: - Query: filtered query object - """ - if group_id: - logger.debug('filter based on group') - return query.filter(Sample.group == group_id) - elif sample_ids: - logger.debug('filter based on list of samples') - return query.filter(Sample.sample_id.in_(sample_ids)) - else: - return query - - -class ChanjoAPI(Store): +class ChanjoAPI(Store, ChanjoConverterMixin): """Interface to chanjo database for asking interesting questions. @@ -71,7 +50,7 @@ def weighted_average(self): weight = Exon.end - Exon.start total_weight = func.sum(weight) total_value = func.sum(ExonStatistic.value * weight) - weighted_mean = total_value / total_weight + weighted_mean = (total_value / total_weight).label('weighted_mean') return weighted_mean def samples(self, group_id=None, sample_ids=None): @@ -88,24 +67,24 @@ def samples(self, group_id=None, sample_ids=None): query = filter_samples(query, group_id=group_id, sample_ids=sample_ids) return query - def mean(self, *samples): - r"""Calculate mean values for all metrics on a per sample basis. + def means(self, query=None): + """Calculate means on a per sample basis. Args: - \*samples (Optional[List[str]]): filter by sample id + query (Optional[Query]): initialized and filtered query Returns: - dict: weighted averages grouped by sample + List[tuple]: sample id and grouped mean metrics """ - results = (self.query(Sample.sample_id, ExonStatistic.metric, - self.weighted_average) - .join(ExonStatistic.sample, ExonStatistic.exon) - .group_by(Sample.sample_id, ExonStatistic.metric)) - if samples: - results = results.filter(Sample.sample_id.in_(samples)) - - sample_groups = group_by_field(results, name='sample_id') - return sample_groups + query = query or self.query() + ready_query = (query.add_columns(Sample.sample_id, + ExonStatistic.metric, + self.weighted_average) + .join(ExonStatistic.sample, ExonStatistic.exon) + .group_by(Sample.sample_id, ExonStatistic.metric) + .order_by(Sample.sample_id)) + results = group_by_field(ready_query) + return results def region_alt(self, region_id, sample_id=None, per=None): """Parse region id as input for `region` method. @@ -143,7 +122,8 @@ def region(self, chromosome, start, end, sample_id=None, per=None): .filter(Exon.chromosome == chromosome, Exon.start >= start, Exon.end <= end) - .group_by(ExonStatistic.metric)) + .group_by(ExonStatistic.metric) + .order_by(Exon.exon_id)) if sample_id: results = (results.join(ExonStatistic.sample) @@ -151,7 +131,7 @@ def region(self, chromosome, start, end, sample_id=None, per=None): if per == 'exon': results = results.group_by(Exon.exon_id) - exon_groups = group_by_field(results, name='exon_id') + exon_groups = group_by_field(results) return exon_groups else: data = {metric: value for exon_id, metric, value in results} @@ -169,16 +149,16 @@ def gene(self, *gene_ids): samples = {} for gene_id in gene_ids: logger.debug('figure out which transcripts the gene belongs to') - tx_ids = list(self.gene_to_transcripts(gene_id)) - if len(tx_ids) == 0: + exon_objs = self.gene_exons(gene_id).all() + if len(exon_objs) == 0: raise AttributeError("gene id not in database: {}" .format(gene_id)) - - results = self.transcript_to_exons(*tx_ids) - for data in group_by_field(results, name='sample_id'): - if data['sample_id'] not in samples: - samples[data['sample_id']] = {'sample_id': data['sample_id']} - samples[data['sample_id']][gene_id] = data + exon_ids = [exon_obj.exon_id for exon_obj in exon_objs] + query = (self.query().filter(Exon.exon_id.in_(exon_ids))) + for sample_id, data in self.means(query): + if sample_id not in samples: + samples[sample_id] = {'sample_id': sample_id, 'genes': {}} + samples[sample_id]['genes'][gene_id] = data return itervalues(samples) @@ -202,61 +182,7 @@ def transcripts(self, *transcript_ids): Transcript.transcript_id)) return results - def all_transcripts(self, sample_id, *gene_ids): - r"""Fetch all transcripts from the database. - - Args: - sample_id (str): restrict query to a sample id - \*gene_ids (List[str]): genes to convert to transcripts - - Returns: - List[Transcript]: transcript models related to the genes - """ - query = (self.query(Transcript) - .join(Transcript.exons, Exon.stats, ExonStatistic.sample) - .filter(Sample.sample_id == sample_id)) - if gene_ids: - tx_ids = self.gene_to_transcripts(*gene_ids) - query = query.filter(Transcript.transcript_id.in_(tx_ids)) - return query - - def transcripts_to_genes(self, transcript_ids, db_ids=False): - """Fetch a list of genes related to some exons. - - Args: - transcript_ids (List[str]): transcript ids to convert to genes - db_ids (Optional[bool]): if sending in primary key ids - - Returns: - List[Gene]: gene models related to the transcripts - """ - results = self.query(Gene).join(Gene.transcripts) - if db_ids: - results = results.filter(Transcript.id.in_(transcript_ids)) - else: - condition = Transcript.transcript_id.in_(transcript_ids) - results = results.filter(condition) - return results - - def exons_to_transcripts(self, exons_ids, db_ids=False): - """Fetch a list of transcripts related to some exons. - - Args: - exon_ids (List[str]): list of exon ids - db_ids (Optional[bool]): if sending in primary key ids - - Returns: - List[Transcript]: transcripts related to the exons - """ - results = self.query(Transcript).join(Transcript.exons) - if db_ids: - results = results.filter(Exon.id.in_(exons_ids)) - else: - results = results.filter(Exon.exon_id.in_(exons_ids)) - return results - - def incomplete_exons(self, level=10, threshold=100, group_id=None, - sample_ids=None): + def incomplete_exons(self, query=None, level=10, threshold=100): """Fetch exons with incomplete coverage at a specifc level. Args: @@ -266,64 +192,19 @@ def incomplete_exons(self, level=10, threshold=100, group_id=None, sample_ids (Optional[List[str]]): sample ids Returns: - List[tuple]: sample, exon, completeness value + List[tuple]: sample_id, dict with exons and completeness """ completeness = "completeness_{}".format(level) - query = (self.query(Sample.sample_id, - Exon.exon_id, - ExonStatistic.value) - .join(ExonStatistic.exon, ExonStatistic.sample) - .filter(ExonStatistic.metric == completeness, - ExonStatistic.value < threshold)) - query = filter_samples(query, group_id=group_id, sample_ids=sample_ids) - return query - - def gene_panel(self, gene_ids, group_id=None, sample_ids=None): - """Report mean coverage (+ completeness) for a panel of genes. - - Args: - gene_ids (List[str]): gene ids for the panel - group_id (Optional[str]): sample group identifier - sample_ids (Optional[List[str]]): sample ids - - Returns: - List[tuple]: sample, metric, weighted average value - """ - tx_ids = list(self.gene_to_transcripts(*gene_ids)) - query = self.transcript_to_exons(*tx_ids) - query = filter_samples(query, group_id=group_id, sample_ids=sample_ids) - return query - - def transcript_to_exons(self, *transcript_ids): - """Fetch a unique list of exons related to some transcripts. - - Args: - transcript_ids (List[str]): transcript ids to look up - - Returns: - List[tuple]: sample, metric, weighted average value - """ - results = (self.query(Sample.sample_id, ExonStatistic.metric, - self.weighted_average) - .join(ExonStatistic.sample, ExonStatistic.exon, - Exon.transcripts) - .filter(Transcript.transcript_id.in_(transcript_ids)) - .group_by(Sample.sample_id, ExonStatistic.metric)) - return results - - def gene_to_transcripts(self, *gene_ids): - r"""Fetch a unique list of transcripts related to some genes. - - Args: - \*gene_ids (List[str]): gene ids - - Returns: - List[str]: transcript ids - """ - results = (self.query(Transcript.transcript_id) - .join(Transcript.gene) - .filter(Gene.gene_id.in_(gene_ids))) - return (result[0] for result in results) + query = query or self.query() + ready_query = (self.query(Sample.sample_id, + Exon.exon_id, + ExonStatistic.value) + .join(ExonStatistic.exon, ExonStatistic.sample) + .filter(ExonStatistic.metric == completeness, + ExonStatistic.value < threshold) + .order_by(Sample.sample_id)) + exon_samples = group_by_field(ready_query) + return exon_samples def sex_coverage(self, sex_chromosomes=('X', 'Y')): """Query for average on X/Y chromsomes. @@ -363,3 +244,42 @@ def sex_check(self, group_id=None, sample_ids=None): # run the predictor gender = predict_gender(*sex_coverage) yield sample_id, gender, sex_coverage[0], sex_coverage[1] + + def diagnostic_yield(self, sample_id, query=None, level=10, threshold=100): + """Calculate transcripts that aren't completely covered.""" + query = query or self.query() + level = "completeness_{}".format(level) + + all_query = (query.add_columns(Transcript.transcript_id) + .distinct(Transcript.transcript_id) + .join(Exon.transcripts)) + all_count = all_query.count() + + logger.debug('find out which exons failed') + yield_query = (query.add_columns(Exon.exon_id) + .join(ExonStatistic.exon) + .filter(ExonStatistic.metric == level, + ExonStatistic.value < threshold)) + exon_ids = [row[0] for row in yield_query] + transcripts = self.exon_transcripts(exon_ids) + + tx_count = transcripts.count() + diagnostic_yield = 100 - (tx_count/all_count * 100) + return { + "diagnostic_yield": diagnostic_yield, + "count": tx_count, + "total_count": all_count, + "transcripts": transcripts + } + + def completeness_levels(self): + """Return a list of included completeness levels.""" + metrics = (self.query(ExonStatistic.metric) + .distinct(ExonStatistic.metric)) + # for all completeness levels, extract the level as int and full name + levels = ((int(field[0].split('_')[-1]), field[0]) + for field in metrics + if field[0].startswith('completeness')) + # sort them based on the level int + sorted_levels = sorted(levels, key=lambda level: list_get(level, 0)) + return sorted_levels diff --git a/chanjo/store/converter.py b/chanjo/store/converter.py new file mode 100644 index 00000000..46ad8042 --- /dev/null +++ b/chanjo/store/converter.py @@ -0,0 +1,101 @@ +# -*- coding: utf-8 -*- +from .models import Exon, ExonStatistic, Gene, Sample, Transcript + + +class ChanjoConverterMixin(object): + + """Mixin to provide conversions between genomic elements.""" + + def gene_transcripts(self, *gene_ids): + """Fetch transcripts related to a list of genes. + + Args: + *gene_ids (List[str]): gene ids + + Returns: + List[Transcript]: transcript objects related to the genes + """ + query = (self.query(Transcript) + .join(Exon.transcripts) + .filter(Gene.gene_id.in_(gene_ids))) + return query + + def gene_exons(self, *gene_ids): + """Fetch exons related to a list of genes. + + Args: + *gene_ids (List[str]): gene ids + + Returns: + List[Exon]: exon objects related to the genes + """ + query = (self.query(Exon) + .join(Exon.transcripts, Transcript.gene) + .filter(Gene.gene_id.in_(gene_ids))) + return query + + def transcript_genes(self, transcript_ids, db_ids=False): + """Fetch a lists of genes related to some exons. + + Args: + transcript_ids (List[str]): transcript ids to convert to genes + db_ids (Optional[bool]): if sending in primary key ids + + Returns: + List[Gene]: gene models related to the transcripts + """ + results = self.query(Gene).join(Gene.transcripts) + if db_ids: + results = results.filter(Transcript.id.in_(transcript_ids)) + else: + condition = Transcript.transcript_id.in_(transcript_ids) + results = results.filter(condition) + return results + + def transcript_exons(self, *transcript_ids): + """Fetch a unique list of exons related to some transcripts. + + Args: + transcript_ids (List[str]): transcript ids to look up + + Returns: + List[tuple]: sample, metric, weighted average value + """ + results = (self.query(Sample.sample_id, ExonStatistic.metric, + self.weighted_average) + .join(ExonStatistic.sample, ExonStatistic.exon, + Exon.transcripts) + .filter(Transcript.transcript_id.in_(transcript_ids)) + .group_by(Sample.sample_id, ExonStatistic.metric)) + return results + + def exon_transcripts(self, exons_ids, db_ids=False): + """Fetch a list of transcripts related to some exons. + + Args: + exon_ids (List[str]): list of exon ids + db_ids (Optional[bool]): if sending in primary key ids + + Returns: + List[Transcript]: transcripts related to the exons + """ + results = self.query(Transcript).join(Transcript.exons) + if db_ids: + results = results.filter(Exon.id.in_(exons_ids)) + else: + results = results.filter(Exon.exon_id.in_(exons_ids)) + return results + + def exon_genes(self, *exon_ids): + """Fetch genes related to a list of exons. + + Args: + *exon_ids (List[str]): exon ids + + Returns: + List[Gene]: gene objects related to the exons + """ + query = (self.query(Gene) + .join(Gene.transcripts, Transcript.exons) + .filter(Exon.exon_id.in_(exon_ids))) + return query diff --git a/chanjo/store/models.py b/chanjo/store/models.py index 6f9cb107..c00e3955 100644 --- a/chanjo/store/models.py +++ b/chanjo/store/models.py @@ -135,7 +135,7 @@ class Sample(BASE): id = Column(Integer, primary_key=True) sample_id = Column(String(32), unique=True) - group = Column(String(32), index=True) + group_id = Column(String(32), index=True) created_at = Column(DateTime, default=datetime.now) updated_at = Column(DateTime, default=datetime.now, onupdate=datetime.now) @@ -148,37 +148,24 @@ class Statistic(object): """Coverage metrics for a single element and a given sample. Args: - sample_id (int): unique sample identifier - group_id (str): group identifier + metric (str): identifier for the metric + value (float): value for the metric """ - id = Column(Integer, primary_key=True) + id = Column(Integer, primary_key=True) metric = Column(String(32)) value = Column(Float) -class GeneStatistic(Statistic, BASE): - - """ - - Args: - parent_id (int): parent record Gene id - """ - - __tablename__ = 'gene_stat' - - sample_id = Column(Integer, ForeignKey('sample.id')) - sample = relationship(Sample, backref=backref('gene_stats')) - gene_id = Column(Integer, ForeignKey('gene.id')) - gene = relationship(Gene, backref=backref('stats')) - - class ExonStatistic(Statistic, BASE): - """ + """Statistics on the exon level, related to sample and exon. Args: - parent_id (int): parent record Exon id + sample (Sample): parent record Sample + exon (Exon): parent record Exon + sample_id (int): parent record Sample id + exon_id (int): parent record Exon id """ __tablename__ = 'exon_stat' diff --git a/chanjo/store/utils.py b/chanjo/store/utils.py index 4df369de..dc698a28 100644 --- a/chanjo/store/utils.py +++ b/chanjo/store/utils.py @@ -1,27 +1,58 @@ # -*- coding: utf-8 -*- -from chanjo.compat import itervalues +import logging +from .models import Sample -def group_by_field(results, name='field_id'): +logger = logging.getLogger(__name__) + + +def filter_samples(query, group_id=None, sample_ids=None): + """Filter a query to a subset of samples. + + Will return an unaltered query if none of the optional parameters + are set. `group_id` takes precedence over `sample_ids`. + + Args: + query (Query): SQLAlchemy query object + group_id (Optional[str]): sample group identifier + sample_ids (Optional[List[str]]): sample ids + + Returns: + Query: filtered query object + """ + if group_id: + logger.debug('filter based on group') + return query.filter(Sample.group_id == group_id) + elif sample_ids: + logger.debug('filter based on list of samples') + return query.filter(Sample.sample_id.in_(sample_ids)) + else: + return query + + +def group_by_field(results): """Group items based on the initial field. + Assumes a sorted list of results. + Args: results (List[tuple]): list of fields to group - name (Optional[str]): what to call the first field - Returns: - List[dict]: grouped dicts + Yields: + str, dict: next group of results along with the group id """ - groups = {} - # loop over results - for field_id, metric, value in results: - if field_id not in groups: - # init a new group - groups[field_id] = {name: field_id} + results = iter(results) + current_group_id, metric, value = next(results) + group = {metric: value} + for group_id, metric, value in results: + if current_group_id != group_id: + yield current_group_id, group + # reset the group + current_group_id = group_id + group = {} # store the metric under the correct group - groups[field_id][metric] = value - - return itervalues(groups) + group[metric] = value + yield current_group_id, group def predict_gender(x_coverage, y_coverage): diff --git a/chanjo/utils.py b/chanjo/utils.py index 85c080ca..c3bac1aa 100644 --- a/chanjo/utils.py +++ b/chanjo/utils.py @@ -6,7 +6,6 @@ A few general utility functions that might also be useful outside Chanjo. """ -from __future__ import division import pkg_resources import random import sys diff --git a/docs/introduction.rst b/docs/introduction.rst index aea4e174..03a15b02 100644 --- a/docs/introduction.rst +++ b/docs/introduction.rst @@ -83,53 +83,59 @@ We now have some information loaded for a few samples and we can now start explo $ chanjo calculate mean sample1 { - "completeness_20": 96.02074970588237, - "completeness_10": 96.80992352941175, - "completeness_100": 66.78868541470588, - "mean_coverage": 171.9479456635295, + "metrics": { + "completeness_20": 90.38, + "completeness_10": 90.92, + "completeness_100": 70.62, + "mean_coverage": 193.85 + }, "sample_id": "sample1" } $ chanjo calculate gene FAP MUSK { - "MUSK": { - "completeness_20": 100.0, - "completeness_10": 100.0, - "completeness_100": 92.58771999999999, - "mean_coverage": 324.4876066666667, - "sample_id": "sample5" - }, - "FAP": { - "completeness_20": 97.08153846153847, - "completeness_10": 100.0, - "completeness_100": 45.32870461538461, - "mean_coverage": 137.20929999999998, - "sample_id": "sample5" + "genes": { + "MUSK": { + "completeness_20": 100.0, + "completeness_10": 100.0, + "completeness_100": 99.08, + "mean_coverage": 370.36 + }, + "FAP": { + "completeness_20": 97.24, + "completeness_10": 100.0, + "completeness_100": 50.19, + "mean_coverage": 151.63 + } }, "sample_id": "sample5" } - ... + [...] $ chanjo calculate region 11 619304 619586 { "completeness_20": 100.0, "completeness_10": 100.0, "completeness_100": 100.0, - "mean_coverage": 253.81090000000003 + "mean_coverage": 258.24 } $ chanjo calculate region 11 619304 619586 --per exon { - "completeness_20": 100.0, - "completeness_10": 100.0, - "completeness_100": 100.0, - "mean_coverage": 223.3904, + "metrics": { + "completeness_20": 100.0, + "completeness_10": 100.0, + "completeness_100": 100.0, + "mean_coverage": 223.3904 + }, "exon_id": "11-619305-619389" } { - "completeness_20": 100.0, - "completeness_10": 100.0, - "completeness_100": 100.0, - "mean_coverage": 284.2314, + "metrics": { + "completeness_20": 100.0, + "completeness_10": 100.0, + "completeness_100": 100.0, + "mean_coverage": 284.23 + }, "exon_id": "11-619473-619586" } diff --git a/tests/load/test_sambamba.py b/tests/load/test_sambamba.py index f00efc61..8d5ffe99 100644 --- a/tests/load/test_sambamba.py +++ b/tests/load/test_sambamba.py @@ -4,7 +4,7 @@ from chanjo.store import Store from chanjo.load import sambamba as load_sambamba -from chanjo.parse import bed as parse_bed +from chanjo.parse import sambamba as parse_sambamba from get_bedrows import get_bedlines bed_lines = get_bedlines() @@ -14,7 +14,7 @@ class TestSambamba: def setup(self): self.store = Store('sqlite://') self.store.set_up() - self.row_data = parse_bed.chanjo(bed_lines) + self.row_data = parse_sambamba.depth_output(bed_lines) def teardown(self): self.store.tear_down() diff --git a/tests/parse/bed/test_bed_parse.py b/tests/parse/bed/test_bed_parse.py index 280db4c4..21993d2d 100644 --- a/tests/parse/bed/test_bed_parse.py +++ b/tests/parse/bed/test_bed_parse.py @@ -1,4 +1,7 @@ # -*- coding: utf-8 -*- +import pytest + +from chanjo.exc import BedFormattingError from chanjo.parse import bed TEST_OUTPUT = 'tests/fixtures/test.sambamba.bed' @@ -23,16 +26,10 @@ def test_chanjo(): assert list(data['elements']) == [('CCDS30547.1', 'OR4F5')] -def test_chanjo_without_elements(): +def test_chanj_with_sambamba(): bed_lines = ["# chrom\tchromStart\tchromEnd\treadCount\tmeanCoverage" "\tsampleName", "1\t69089\t70007\t232\t25.4946\tADM992A10\t"] - data_rows = list(bed.chanjo(bed_lines)) - data = data_rows[0] - - assert len(data_rows) == 1 - assert data['name'] is None - assert data['score'] == 0 - assert data['strand'] is None - assert list(data['elements']) == [] + with pytest.raises(BedFormattingError): + list(bed.chanjo(bed_lines))