Skip to content

Commit

Permalink
Merge 9774198 into f7e26c6
Browse files Browse the repository at this point in the history
  • Loading branch information
adrosenbaum committed Mar 12, 2019
2 parents f7e26c6 + 9774198 commit ef5a76a
Show file tree
Hide file tree
Showing 6 changed files with 222 additions and 7 deletions.
5 changes: 4 additions & 1 deletion mutacc_auto/commands/scout_command.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,13 @@ def __init__(self, case_id=None):

class ScoutExportCausativeVariants(ScoutCommand):

def __init__(self, case_id):
def __init__(self, case_id, json_output = True):

super(ScoutExportCausativeVariants, self).__init__()

self.add_subcommand('export')
self.add_subcommand('variants')

if json_output: self.add_option('json')

self.add_option('case-id', value=case_id)
151 changes: 150 additions & 1 deletion mutacc_auto/parse/parse_scout.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,13 @@
from datetime import datetime, timedelta

from mutacc_auto.commands.scout_command import ScoutExportCases
from mutacc_auto.parse.vcf_constants import (SCOUT_TO_FORMAT,
SCOUT_TO_INFO,
HEADER,
NEWLINE,
TAB,
COLUMN_NAMES,
SCOUT_TO_COLUMNS)

#The timestamp in the scout database seems to be given with
#millisecond precision, it is therefor necessary to divide the
Expand All @@ -19,7 +26,7 @@ def get_cases_from_scout(scout_output, days_ago=None):
scout_output (str): output from scout command
days_ago (int): number of days since case updated
Returns (list(dict)): list of dictionaries representing the cases
Returns (list(dict)): list of dictionaries representing the cases
"""

cases = json.loads(scout_output)
Expand All @@ -39,3 +46,145 @@ def get_cases_from_scout(scout_output, days_ago=None):
recent_cases.append(case)

return recent_cases





def get_vcf_from_json(scout_vcf_output):

"""
Reconstructs vcf from scout variant object
Args:
scout_vcf_output (str): string returned by command 'scout export variants --json'
Returns:
vcf_string (str): string with vcf content
"""

scout_vcf_output = json.loads(scout_vcf_output)

vcf_string = ""

#Write header of vcf
for header_line in HEADER:
vcf_string += header_line + NEWLINE

#Get samples
samples = [sample['sample_id'] for sample in scout_vcf_output[0]['samples']]

#Append sample names to the COLUMN_NAMES list
column_names = COLUMN_NAMES + samples
column_names = TAB.join(column_names)

vcf_string += column_names + NEWLINE

#Write variants
for variant in scout_vcf_output:

#Write column values
record = get_columns(variant)

#write INFO
info = get_info(variant)
record.append(info)

#Write the format a
format = ':'.join([SCOUT_TO_FORMAT[ID] for ID in SCOUT_TO_FORMAT.keys()])
record.append(format)

#write genotypes for each sample
samples = get_genotypes(variant)
record.append(samples)

record = TAB.join(record) + NEWLINE

#Add variant record to vcf_string
vcf_string += record

return vcf_string

def get_columns(variant):
"""
Given a variant object from scout, write the columns CHR - FILTER
as a string with values separated by tab
Args:
variant (dict): dictionary of scout variant object
Returns:
record (str): values CHR-FILTER as a string
"""
record = []

for column in SCOUT_TO_COLUMNS:

if type(variant[column]) == list:
column_value = ','.join([str(element) for element in variant[column]])

else:
column_value = str(variant[column])

record.append(column_value)

return record

def get_info(variant):
"""
Given a variant object from scout, write the INFO column
for a variant.
Args:
variant (dict): dictionary of scout variant object
Returns:
info (str): INFO string
"""
info = []
for ID in SCOUT_TO_INFO.keys():

info_string = f"{SCOUT_TO_INFO[ID]}={int(variant[ID])}"
info.append(info_string)

if variant['category'].lower() == 'snv':
info_string = f"TYPE={variant['sub_category']}"

else:
info_string = f"SVTYPE={variant['sub_category']}"

info.append(info_string)

info = ';'.join(info)

return info

def get_genotypes(variant):
"""
Given a variant object from scout, write the genotypes column for each
sample.
Args:
variant (dict): dictionary of scout variant object
Returns:
samples (str): genotypes for each sample
"""
samples = []
for sample in variant['samples']:

gt_calls = []
for ID in SCOUT_TO_FORMAT.keys():

if type(sample[ID]) == list:

ID_value = ','.join([str(element) for element in sample[ID]])

else:
ID_value = str(sample[ID])

gt_calls.append(ID_value)

gt_calls = ':'.join(gt_calls)
samples.append(gt_calls)

samples = TAB.join(samples)

return samples
60 changes: 60 additions & 0 deletions mutacc_auto/parse/vcf_constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#Scout fields name: vcf ID
SCOUT_TO_FORMAT = {

'genotype_call': 'GT',
'allele_depths': 'AD',
'read_depth': 'DP',
'genotype_quality': 'GQ'

}

SCOUT_TO_INFO = {

'end': 'END',
'rank_score': 'RankScore'
}

HEADER = (

'##fileformat=VCFv4.2',

'##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">',
'##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">',
'##INFO=<ID=TYPE,Number=A,Type=String,Description="The type of allele, either snp, mnp, ins, del, or complex.">',
'##INFO=<ID=RankScore,Number=.,Type=String,Description="The rank score for this variant in this family. family_id:rank_score.">',


'##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
'##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">',
'##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">',
'##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">'

)

NEWLINE = '\n'

TAB = '\t'

COLUMN_NAMES = [
'#CHROM',
'POS',
'ID',
'REF',
'ALT',
'QUAL',
'FILTER',
'INFO',
'FORMAT'
]

SCOUT_TO_COLUMNS = (

'chromosome', #CHROM
'position', #POS
'dbsnp_id', #ID
'reference', #REF
'alternative',#ALT
'quality', #QUAL
'filters' #FILTER

)
5 changes: 3 additions & 2 deletions mutacc_auto/recipes/input_recipe.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from mutacc_auto.utils.tmp_dir import TemporaryDirectory
from mutacc_auto.commands.scout_command import ScoutExportCases, ScoutExportCausativeVariants
from mutacc_auto.commands.housekeeper_command import HousekeeperCommand
from mutacc_auto.parse.parse_scout import get_cases_from_scout
from mutacc_auto.parse.parse_scout import get_cases_from_scout, get_vcf_from_json
from mutacc_auto.parse.parse_housekeeper import get_bams_from_housekeeper
from mutacc_auto.build_input.input_assemble import get_case

Expand Down Expand Up @@ -66,7 +66,8 @@ def write_vcf(case_id, directory):
) as vcf_handle:

vcf_command = ScoutExportCausativeVariants(case_id)
vcf_content = vcf_command.check_output()
vcf_scout_output = vcf_command.check_output()
vcf_content = get_vcf_from_json(vcf_scout_output)
vcf_handle.write(vcf_content)

vcf_path = vcf_handle.name
Expand Down
1 change: 1 addition & 0 deletions tests/fixtures/scout_variant_output.json

Large diffs are not rendered by default.

7 changes: 4 additions & 3 deletions tests/recipes/test_input_recipe.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
HK_OUT_FILE = "tests/fixtures/HK_output_test.txt"
SCOUT_OUT_FILE = "tests/fixtures/scout_output.json"
TEST_VCF = "tests/fixtures/test_vcf.vcf"
TEST_SCOUT_VARIANT = "tests/fixtures/scout_variant_output.json"

def mock_hk_output(case_id):

Expand All @@ -28,9 +29,9 @@ def mock_scout_output(case_id):

return scout_out

def mock_vcf(case_id):
def mock_scout_variant(case_id):

with open(TEST_VCF) as vcf_handle:
with open(TEST_SCOUT_VARIANT) as vcf_handle:

vcf_out = vcf_handle.read()

Expand All @@ -49,7 +50,7 @@ def test_get_bams():

assert len(bams) == 3

@patch.object(Command, 'check_output', mock_vcf)
@patch.object(Command, 'check_output', mock_scout_variant)
def test_write_vcf(tmpdir):
tmp_dir = Path(tmpdir.mkdir('test_write_vcf'))
vcf_path = write_vcf('case_id', tmp_dir)
Expand Down

0 comments on commit ef5a76a

Please sign in to comment.