Skip to content

Commit

Permalink
Merge ca87919 into fdf8a3d
Browse files Browse the repository at this point in the history
  • Loading branch information
adrosenbaum committed May 13, 2019
2 parents fdf8a3d + ca87919 commit 5c2144a
Show file tree
Hide file tree
Showing 6 changed files with 350 additions and 2 deletions.
45 changes: 45 additions & 0 deletions mutacc_auto/cli/export_command.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import click
import logging
from pathlib import Path

import yaml

from mutacc_auto.recipes.export_recipe import export_dataset


# Module-level logger, named after this module.
LOG = logging.getLogger(__name__)

def parse_path(ctx, param, value):
    """
        Click option callback that normalises a path given on the command line

        Args:
            ctx: click context (unused)
            param: click parameter the callback is attached to (unused)
            value: raw option value; may be None or empty

        Returns:
            value (str): user-expanded, absolute, resolved path as a string.
                         Falsy values are returned unchanged.
    """
    # Nothing to normalise when the option was left out
    if not value:
        return value

    expanded = Path(str(value)).expanduser()
    return str(expanded.absolute().resolve())

@click.command('export')
@click.option('-C', '--mutacc-config',
              type=click.Path(exists=True),
              callback=parse_path,
              help="configuration file used for mutacc")
@click.option('-o', '--vcf-out',
              type=click.Path(exists=False),
              callback=parse_path,
              help="Path to created vcf-file")
@click.option('-b', '--background',
              type=click.Path(exists=True),
              required=True,
              help="yaml file with genomic backgrounds for each sample in trio")
@click.pass_context
def export_command(ctx, mutacc_config, vcf_out, background):
    """Export a synthetic trio dataset from the mutacc database."""

    # ctx: click context; ctx.obj supplies 'mutacc_config' (fallback when -C
    #      is not given) and optionally 'mutacc_binary'
    # mutacc_config: path to the mutacc configuration file, or None
    # vcf_out: path where the merged vcf is written, or None
    # background: path to the yaml file with background files per sample.
    #             The option is required since the file is always opened below.

    # An explicitly given config takes precedence over the one in the context
    mutacc_config = mutacc_config or ctx.obj['mutacc_config']
    mutacc_binary = ctx.obj.get('mutacc_binary')

    # safe_load only builds plain python objects. yaml.load without an
    # explicit Loader can construct arbitrary objects and is rejected by
    # recent PyYAML versions.
    with open(background, 'r') as background_handle:
        background_datasets = yaml.safe_load(background_handle)

    files = export_dataset(mutacc_config=mutacc_config,
                           background=background_datasets,
                           mutacc_binary=mutacc_binary,
                           case_query='{}',
                           merged_vcf_path=vcf_out)

    LOG.debug("files created: %s", files)
2 changes: 2 additions & 0 deletions mutacc_auto/cli/root.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from mutacc_auto import __version__
from .extract_command import extract_command
from .import_command import import_command
from .export_command import export_command

LOG_LEVELS = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']
LOG = logging.getLogger(__name__)
Expand Down Expand Up @@ -36,3 +37,4 @@ def cli(ctx, loglevel, config_file):

cli.add_command(extract_command)
cli.add_command(import_command)
cli.add_command(export_command)
4 changes: 2 additions & 2 deletions mutacc_auto/commands/command.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ def check_output(self):

command_stdout = command_stdout.decode('utf-8')

LOG.debug("Executed:".format(str(self)))
LOG.debug("Executed: {}".format(str(self)))

return command_stdout

Expand All @@ -96,6 +96,6 @@ def call(self):

raise

LOG.debug("Executed:".format(str(self)))
LOG.debug("Executed: {}".format(str(self)))

return result
38 changes: 38 additions & 0 deletions mutacc_auto/commands/mutacc_command.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,41 @@ def __init__(self, config_file, extracted_case_file, mutacc_binary=None):
self.add_subcommand('db')
self.add_subcommand('import')
self.add_argument(str(extracted_case_file))

class MutaccExport(MutaccCommand):
    """Command object for 'db export' under the mutacc base command."""

    def __init__(self, config_file, mutacc_binary=None, case_query=None, variant_query=None,
                 proband=False, member='affected', sample_name=None):

        super().__init__(config_file, mutacc_binary=mutacc_binary)

        for subcommand in ('db', 'export'):
            self.add_subcommand(subcommand)

        # Options that take a value are only added when a value was given
        valued_options = (('variant-query', variant_query),
                          ('case-query', case_query),
                          ('sample-name', sample_name))
        for option_name, option_value in valued_options:
            if option_value is not None:
                self.add_option(option_name, option_value)

        # 'proband' is a flag without a value
        if proband:
            self.add_option('proband')

        self.add_option('member', member)
        self.add_option('json-out')

class MutaccSynthesize(MutaccCommand):
    """Command object for 'synthesize' under the mutacc base command."""

    def __init__(self, config_file, fastq1, bam_file, query_file, fastq2=None,
                 mutacc_binary=None):

        super().__init__(config_file, mutacc_binary=mutacc_binary)

        self.add_subcommand('synthesize')

        # Background files; the second fastq is only passed when given
        background_options = [('background-bam', bam_file),
                              ('background-fastq', fastq1)]
        if fastq2 is not None:
            background_options.append(('background-fastq2', fastq2))
        for option_name, option_value in background_options:
            self.add_option(option_name, option_value)

        self.add_option('query', query_file)
        self.add_option('json-out')
37 changes: 37 additions & 0 deletions mutacc_auto/commands/vcf_command.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
from .command import Command as BaseCommand

# Command names handed to the base Command constructor by the classes below.
BGZIP_COMMAND = 'bgzip'
TABIX_COMMAND = 'tabix'
# Base command for merging; the 'merge' subcommand is added by BcftoolsMergeCommand.
BCFTOOLS_MERGE_COMMAND = 'bcftools'

class BgzipCommand(BaseCommand):
    """Command object for 'bgzip' with a single vcf file as argument."""

    def __init__(self, vcf_file):

        super().__init__(BGZIP_COMMAND)

        # The file is the only argument
        self.add_argument(vcf_file)

class TabixCommand(BaseCommand):
    """Command object for 'tabix' with a single vcf file as argument."""

    def __init__(self, vcf_file):

        super().__init__(TABIX_COMMAND)

        # The file is the only argument
        self.add_argument(vcf_file)

class BcftoolsMergeCommand(BaseCommand):
    """Command object for 'bcftools merge' over a collection of vcf files."""

    def __init__(self, vcf_files, out_vcf=None):

        super().__init__(BCFTOOLS_MERGE_COMMAND)

        self.add_subcommand('merge')

        # The output option is only added when an output path was given
        if out_vcf:
            self.add_option('output', out_vcf)

        self.add_option('output-type', 'z')

        # Each input file is added as an argument, in the order given
        for input_vcf in vcf_files:
            self.add_argument(input_vcf)
226 changes: 226 additions & 0 deletions mutacc_auto/recipes/export_recipe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,226 @@
"""
Functions to export synthetic dataset
"""

import logging
import json
import os
from pathlib import Path

from mutacc_auto.commands.mutacc_command import (MutaccExport, MutaccSynthesize)
from mutacc_auto.commands.vcf_command import (BgzipCommand, TabixCommand, BcftoolsMergeCommand)

# Module-level logger, named after this module.
LOG = logging.getLogger(__name__)


def run_mutacc_export_command(mutacc_config, mutacc_binary=None, case_query=None,
                              variant_query=None, proband=False, member='affected',
                              sample_name=None):

    """
        Runs the 'mutacc db export' command and parses its output

        Args:
            mutacc_config (Path): Path to mutacc config file
            mutacc_binary (Path): path to mutacc binary
            case_query (string): json string with query against case collection
            variant_query (string): json string with query against variant collection
            proband (bool): True if sample is proband, False if not
            member (string): affected|father|mother|child
            sample_name (string): name of created sample
        Returns:
            parsed stdout of the command, decoded from json
    """

    export_options = dict(config_file=mutacc_config,
                          mutacc_binary=mutacc_binary,
                          case_query=case_query,
                          variant_query=variant_query,
                          proband=proband,
                          member=member,
                          sample_name=sample_name)
    raw_stdout = MutaccExport(**export_options).check_output()
    # The command is built with the 'json-out' option, so stdout is parsed as json
    return json.loads(raw_stdout)

def export_trio(mutacc_config, mutacc_binary=None, case_query=None, variant_query=None):

    """
        Exports each member of a trio (child, father, mother) from mutaccDB

        Args:
            mutacc_config (Path): Path to mutacc config file
            mutacc_binary (Path): path to mutacc binary
            case_query (string): json string with query against case collection
            variant_query (string): json string with query against variant collection
        Returns:
            mutacc_files (dict): export output for each member, keyed by member name
    """

    # The child is exported as proband; the member name doubles as sample name
    return {
        trio_member: run_mutacc_export_command(mutacc_config=mutacc_config,
                                               mutacc_binary=mutacc_binary,
                                               case_query=case_query,
                                               variant_query=variant_query,
                                               proband=(trio_member == 'child'),
                                               member=trio_member,
                                               sample_name=trio_member)
        for trio_member in ('child', 'father', 'mother')
    }

def bgzip_vcf_file(vcf_file):

    """
        Runs command line tool 'bgzip' on a vcf file

        Args:
            vcf_file (Path): path to vcf
        Returns:
            (str): the given path with '.gz' appended
    """

    BgzipCommand(vcf_file).call()
    return "{}.gz".format(vcf_file)

def index_vcf_file(vcf_file):

    """
        Runs command line tool 'tabix' on a vcf file

        Args:
            vcf_file (Path): path to vcf
        Returns:
            (str): the given path with '.tbi' appended
    """

    TabixCommand(vcf_file).call()
    return "{}.tbi".format(vcf_file)

def merge_vcf_files(vcf_files, out_file=None):

    """
        Runs 'bcftools merge' on a list of vcf files

        Args:
            vcf_files (list): list of paths to vcf files
            out_file (Path): path passed as output of the merge; may be None
        Returns:
            out_file (Path): the out_file argument, returned unchanged
    """

    BcftoolsMergeCommand(vcf_files, out_vcf=out_file).call()
    return out_file

def synthesize_dataset(sample, mutacc_binary=None, mutacc_config=None):

    """
        Uses 'mutacc synthesize' to make synthetic dataset

        Args:
            sample (dict): Dictionary with keys 'fastq1', 'bam' and 'query',
                           and optionally 'fastq2'
            mutacc_config (Path): Path to mutacc config file
            mutacc_binary (Path): path to mutacc binary
        Returns:
            dataset: value of 'fastq_files' in the json output of the command
        Raises:
            KeyError: if a mandatory key is missing from sample, or if the
                      command output has no 'fastq_files' entry
    """

    # MutaccSynthesize treats the second fastq as optional, so a sample
    # without a 'fastq2' key must not fail here.
    synthesize_command = MutaccSynthesize(config_file=mutacc_config,
                                          mutacc_binary=mutacc_binary,
                                          fastq1=sample['fastq1'],
                                          fastq2=sample.get('fastq2'),
                                          bam_file=sample['bam'],
                                          query_file=sample['query'])

    synthesize_out = synthesize_command.check_output()
    return json.loads(synthesize_out)['fastq_files']


def synthesize_trio(mutacc_config, samples, mutacc_binary=None):
    """
        Synthesizes a dataset for every sample given

        Args:
            mutacc_config (Path): Path to mutacc config file
            mutacc_binary (Path): path to mutacc binary
            samples (dict(dict)): fastq files, bam, query for each sample in trio
        Returns:
            datasets (dict): result of synthesize_dataset for each sample,
                             keyed like samples
    """
    return {
        member: synthesize_dataset(mutacc_config=mutacc_config,
                                   mutacc_binary=mutacc_binary,
                                   sample=member_sample)
        for member, member_sample in samples.items()
    }


def export_dataset(mutacc_config, background=None, mutacc_binary=None, case_query=None,
                   variant_query=None, merged_vcf_path=None):

    """
        Export a synthetic trio

        Args:
            mutacc_config (Path): Path to mutacc config file
            mutacc_binary (Path): path to mutacc binary
            background (dict): dictionary with background files to be used for each
                               sample; each entry needs 'fastq1' and 'bam', and may
                               have 'fastq2'
            case_query (string): json string with query against case collection
            variant_query (string): json string with query against variant collection
            merged_vcf_path (Path): path where to create vcf file; defaults to
                                    'merged_mutacc_set.vcf.gz' in the working directory
        Returns:
            datasets (dict): fastq files for each sample
        Raises:
            ValueError: if no background is given
    """

    # Fail before any command is run or any file is touched; without this
    # the missing background only surfaces after the export and merge.
    if background is None:
        raise ValueError("background files are required to synthesize a dataset")

    files = export_trio(mutacc_config=mutacc_config,
                        mutacc_binary=mutacc_binary,
                        case_query=case_query,
                        variant_query=variant_query)
    members = list(files)

    # Compress all vcf files first, then index the compressed files
    zipped_vcf_files = [bgzip_vcf_file(files[member]['vcf_file']) for member in members]
    indexed_vcf_files = [index_vcf_file(vcf_file) for vcf_file in zipped_vcf_files]

    vcf_path = merged_vcf_path or Path.cwd().joinpath('merged_mutacc_set.vcf.gz')
    vcf_path = merge_vcf_files(vcf_files=zipped_vcf_files, out_file=vcf_path)
    LOG.debug("merged vcf: %s", vcf_path)

    # Remove individual vcf files and their indices
    for zipped, indexed in zip(zipped_vcf_files, indexed_vcf_files):
        os.remove(zipped)
        os.remove(indexed)

    LOG.debug("FILES: %s", files)
    LOG.debug("BACKGROUNDS: %s", background)

    # The second fastq is optional, matching the MutaccSynthesize command
    samples = {member: {'fastq1': background[member]['fastq1'],
                        'fastq2': background[member].get('fastq2'),
                        'bam': background[member]['bam'],
                        'query': files[member]['query_file']} for member in members}

    datasets = synthesize_trio(mutacc_config=mutacc_config,
                               samples=samples,
                               mutacc_binary=mutacc_binary)

    return datasets

0 comments on commit 5c2144a

Please sign in to comment.