Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Recreate vcf from variants in scout #6

Merged
merged 3 commits into from
Mar 13, 2019
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion mutacc_auto/commands/scout_command.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,13 @@ def __init__(self, case_id=None):

class ScoutExportCausativeVariants(ScoutCommand):

def __init__(self, case_id):
def __init__(self, case_id, json_output = True):

super(ScoutExportCausativeVariants, self).__init__()

self.add_subcommand('export')
self.add_subcommand('variants')

if json_output: self.add_option('json')

self.add_option('case-id', value=case_id)
98 changes: 97 additions & 1 deletion mutacc_auto/parse/parse_scout.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from datetime import datetime, timedelta

from mutacc_auto.commands.scout_command import ScoutExportCases
from mutacc_auto.parse.vcf_constants import *
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this mean import all? If so, it is usually better to be explicit. Otherwise your namespace might be polluted without you knowing it.


#The timestamp in the scout database seems to be given with
#millisecond precision, it is therefor necessary to divide the
Expand All @@ -19,7 +20,7 @@ def get_cases_from_scout(scout_output, days_ago=None):
scout_output (str): output from scout command
days_ago (int): number of days since case updated

Returns (list(dict)): list of dictionaries representing the cases
Returns (list(dict)): list of dictionaries representing the cases
"""

cases = json.loads(scout_output)
Expand All @@ -39,3 +40,98 @@ def get_cases_from_scout(scout_output, days_ago=None):
recent_cases.append(case)

return recent_cases





def get_vcf_from_json(scout_vcf_output):

"""
Reconstructs vcf from scout variant object

Args:
scout_vcf_output (str): string returned by command 'scout export variants --json'

Returns:
vcf_string (str): string with vcf content
"""

scout_vcf_output = json.loads(scout_vcf_output)

vcf_string = ""

#Write header of vcf
for header_line in HEADER:
vcf_string += header_line + '\n'
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like a named constant + NEWLINE


#Get samples
samples = [sample['sample_id'] for sample in scout_vcf_output[0]['samples']]
samples = '\t'.join(samples)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same here TAB


vcf_string += f"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t{samples}\n"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would like a simple "TAB.join(#CHROM, POS, ...etc)" if possible or even better
header_column_names = ["#CHROM", "POS", ..etc]
TAB.join(header_column_names, samples) + NEWLINE

That is a better description of what the parts are and more readable. You will have to excuse my python, but I think you get the idea



#Write variants
for variant in scout_vcf_output:

entry= []
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

entry = []

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And a line is vcf is usually called a "record" I think...

entry.append(str(variant['chromosome'])) #CHROM
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do you not make this into a for loop and append each iteration using the key as iterator.
something like:
entry = []
for column in scout_header_column_names:
if column in variant:
entry.append(str(variant[column] or '.')) # Since it was only uninitilized place

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Handle the PASS outside loop if it is not part of variant

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Make this into a def

entry.append(str(variant['position'])) #POS
entry.append(str(variant['dbsnp_id'] or '.')) #ID
entry.append(str(variant['reference'])) #REF
entry.append(str(variant['alternative'])) #ALT
entry.append(str(variant['quality'])) #QUAL
entry.append('PASS') #FILTER

#write INFO
info = []
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

make this into a def

for ID in SCOUT_TO_INFO.keys():

info_string = f"{SCOUT_TO_INFO[ID]}={int(variant[ID])}"
info.append(info_string)

if variant['category'].lower() == 'snv':
info_string = f"TYPE={variant['sub_category']}"

else:
info_string = f"SVTYPE={variant['sub_category']}"

info.append(info_string)

info = ';'.join(info)

entry.append(info)

#Write the format and genotype calls
format = ':'.join([SCOUT_TO_FORMAT[ID] for ID in SCOUT_TO_FORMAT.keys()])

entry.append(format)

samples = []
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

make this into a def

for sample in variant['samples']:

gt_calls = []
for ID in SCOUT_TO_FORMAT.keys():

if type(sample[ID]) == list:

ID_value = ','.join([str(element) for element in sample[ID]])

else:
ID_value = str(sample[ID])

gt_calls.append(ID_value)

gt_calls = ':'.join(gt_calls)
samples.append(gt_calls)

samples = '\t'.join(samples)

entry.append(samples)

entry = '\t'.join(entry) + '\n'

vcf_string += entry

return vcf_string
32 changes: 32 additions & 0 deletions mutacc_auto/parse/vcf_constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#Scout fields name: vcf ID
SCOUT_TO_FORMAT = {

'genotype_call': 'GT',
'allele_depths': 'AD',
'read_depth': 'DP',
'genotype_quality': 'GQ'

}

SCOUT_TO_INFO = {

'end': 'END',
'rank_score': 'RankScore'
}

HEADER = (

'##fileformat=VCFv4.2',

'##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">',
'##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">',
'##INFO=<ID=TYPE,Number=A,Type=String,Description="The type of allele, either snp, mnp, ins, del, or complex.">',
'##INFO=<ID=RankScore,Number=.,Type=String,Description="The rank score for this variant in this family. family_id:rank_score.">',


'##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
'##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">',
'##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">',
'##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">'

)
5 changes: 3 additions & 2 deletions mutacc_auto/recipes/input_recipe.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from mutacc_auto.utils.tmp_dir import TemporaryDirectory
from mutacc_auto.commands.scout_command import ScoutExportCases, ScoutExportCausativeVariants
from mutacc_auto.commands.housekeeper_command import HousekeeperCommand
from mutacc_auto.parse.parse_scout import get_cases_from_scout
from mutacc_auto.parse.parse_scout import get_cases_from_scout, get_vcf_from_json
from mutacc_auto.parse.parse_housekeeper import get_bams_from_housekeeper
from mutacc_auto.build_input.input_assemble import get_case

Expand Down Expand Up @@ -66,7 +66,8 @@ def write_vcf(case_id, directory):
) as vcf_handle:

vcf_command = ScoutExportCausativeVariants(case_id)
vcf_content = vcf_command.check_output()
vcf_scout_output = vcf_command.check_output()
vcf_content = get_vcf_from_json(vcf_scout_output)
vcf_handle.write(vcf_content)

vcf_path = vcf_handle.name
Expand Down
1 change: 1 addition & 0 deletions tests/fixtures/scout_variant_output.json

Large diffs are not rendered by default.

7 changes: 4 additions & 3 deletions tests/recipes/test_input_recipe.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
HK_OUT_FILE = "tests/fixtures/HK_output_test.txt"
SCOUT_OUT_FILE = "tests/fixtures/scout_output.json"
TEST_VCF = "tests/fixtures/test_vcf.vcf"
TEST_SCOUT_VARIANT = "tests/fixtures/scout_variant_output.json"

def mock_hk_output(case_id):

Expand All @@ -28,9 +29,9 @@ def mock_scout_output(case_id):

return scout_out

def mock_vcf(case_id):
def mock_scout_variant(case_id):

with open(TEST_VCF) as vcf_handle:
with open(TEST_SCOUT_VARIANT) as vcf_handle:

vcf_out = vcf_handle.read()

Expand All @@ -49,7 +50,7 @@ def test_get_bams():

assert len(bams) == 3

@patch.object(Command, 'check_output', mock_vcf)
@patch.object(Command, 'check_output', mock_scout_variant)
def test_write_vcf(tmpdir):
tmp_dir = Path(tmpdir.mkdir('test_write_vcf'))
vcf_path = write_vcf('case_id', tmp_dir)
Expand Down