Merge 9774198 into f7e26c6

Clinical-Genomics · Mar 12, 2019 · ef5a76a · ef5a76a
2 parents f7e26c6 + 9774198
commit ef5a76a
Show file tree

Hide file tree

Showing 6 changed files with 222 additions and 7 deletions.
diff --git a/mutacc_auto/commands/scout_command.py b/mutacc_auto/commands/scout_command.py
@@ -29,10 +29,13 @@ def __init__(self, case_id=None):
 
 class ScoutExportCausativeVariants(ScoutCommand):
 
-    def __init__(self, case_id):
+    def __init__(self, case_id, json_output = True):
 
         super(ScoutExportCausativeVariants, self).__init__()
 
         self.add_subcommand('export')
         self.add_subcommand('variants')
+
+        if json_output: self.add_option('json')
+
         self.add_option('case-id', value=case_id)
diff --git a/mutacc_auto/parse/parse_scout.py b/mutacc_auto/parse/parse_scout.py
@@ -2,6 +2,13 @@
 from datetime import datetime, timedelta
 
 from mutacc_auto.commands.scout_command import ScoutExportCases
+from mutacc_auto.parse.vcf_constants import (SCOUT_TO_FORMAT,
+                                             SCOUT_TO_INFO,
+                                             HEADER,
+                                             NEWLINE,
+                                             TAB,
+                                             COLUMN_NAMES,
+                                             SCOUT_TO_COLUMNS)
 
 #The timestamp in the scout database seems to be given with
 #millisecond precision; it is therefore necessary to divide the
@@ -19,7 +26,7 @@ def get_cases_from_scout(scout_output, days_ago=None):
             scout_output (str): output from scout command
             days_ago (int): number of days since case updated
 
-        Returns (list(dict)): list of dictionaries representing the cases 
+        Returns (list(dict)): list of dictionaries representing the cases
     """
 
     cases = json.loads(scout_output)
@@ -39,3 +46,145 @@ def get_cases_from_scout(scout_output, days_ago=None):
             recent_cases.append(case)
 
     return recent_cases
+
+
+
+
+
+def get_vcf_from_json(scout_vcf_output):
+
+    """
+        Reconstructs vcf from scout variant objects
+
+        The vcf consists of the lines in HEADER, a column-name line
+        (COLUMN_NAMES followed by one column per sample) and one record per
+        variant. Sample names are taken from the first variant. Every line,
+        including the last one, is terminated with NEWLINE.
+
+        Args:
+            scout_vcf_output (str): string returned by command 'scout export variants --json'
+
+        Returns:
+            vcf_string (str): string with vcf content. If the scout output
+                holds no variants, only the header and the column-name line
+                (without sample columns) are returned.
+    """
+
+    variants = json.loads(scout_vcf_output)
+
+    #Header lines come first
+    lines = list(HEADER)
+
+    #Sample names are read from the first variant. An export without
+    #variants has no sample columns and must not raise an IndexError
+    sample_ids = []
+    if variants:
+        sample_ids = [sample['sample_id'] for sample in variants[0]['samples']]
+
+    #Append sample names to the fixed column names
+    lines.append(TAB.join(COLUMN_NAMES + sample_ids))
+
+    #The FORMAT column is identical for every record, so build it once.
+    #Its order is the iteration order of SCOUT_TO_FORMAT
+    format_column = ':'.join(SCOUT_TO_FORMAT.values())
+
+    #Write variants
+    for variant in variants:
+
+        #Columns CHROM - FILTER, followed by INFO, FORMAT and the genotypes
+        record = get_columns(variant)
+        record.append(get_info(variant))
+        record.append(format_column)
+        record.append(get_genotypes(variant))
+
+        lines.append(TAB.join(record))
+
+    #Joining once avoids repeated string concatenation; the trailing
+    #NEWLINE terminates the last line
+    return NEWLINE.join(lines) + NEWLINE
+
+def get_columns(variant):
+    """
+        Given a variant object from scout, collect the values of the columns
+        CHROM - FILTER in the order given by SCOUT_TO_COLUMNS.
+
+        List values are joined with ',' (with ';' for the filters column,
+        which is the separator vcf uses for FILTER). Values that are None or
+        an empty list are written as '.', the vcf missing value, instead of
+        the strings 'None' or ''.
+
+        Args:
+            variant (dict): dictionary of scout variant object
+        Returns:
+            record (list(str)): values CHROM - FILTER, one string per column
+    """
+    record = []
+
+    for column in SCOUT_TO_COLUMNS:
+
+        value = variant[column]
+
+        if value is None or value == []:
+            #Missing value in vcf
+            column_value = '.'
+
+        elif isinstance(value, list):
+            #FILTER entries are separated by ';' in vcf, other lists by ','
+            separator = ';' if column == 'filters' else ','
+            column_value = separator.join(str(element) for element in value)
+
+        else:
+            column_value = str(value)
+
+        record.append(column_value)
+
+    return record
+
+def get_info(variant):
+    """
+        Build the INFO column of a vcf record from a scout variant object.
+
+        One 'ID=value' entry is written per field in SCOUT_TO_INFO, with the
+        value converted to an integer, followed by a TYPE entry (category
+        snv) or an SVTYPE entry (any other category) holding the variant
+        sub category.
+
+        Args:
+            variant (dict): dictionary of scout variant object
+        Returns:
+            info (str): INFO entries joined with ';'
+    """
+    entries = [
+        f"{vcf_id}={int(variant[scout_field])}"
+        for scout_field, vcf_id in SCOUT_TO_INFO.items()
+    ]
+
+    #The category decides which key carries the sub category
+    is_snv = variant['category'].lower() == 'snv'
+    type_entry = (
+        f"TYPE={variant['sub_category']}"
+        if is_snv
+        else f"SVTYPE={variant['sub_category']}"
+    )
+    entries.append(type_entry)
+
+    return ';'.join(entries)
+
+def get_genotypes(variant):
+    """
+        Given a variant object from scout, write the genotype columns, one
+        column per sample.
+
+        For every sample in variant['samples'] the fields in SCOUT_TO_FORMAT
+        are written in that dictionary's order and joined with ':'. List
+        values are written comma separated. Fields that are None or an empty
+        list are written as '.', the vcf missing value, instead of the
+        strings 'None' or ''.
+
+        Args:
+            variant (dict): dictionary of scout variant object
+        Returns:
+            samples (str): genotypes for each sample, separated by tab
+    """
+    samples = []
+    for sample in variant['samples']:
+
+        gt_calls = []
+        for scout_field in SCOUT_TO_FORMAT:
+
+            value = sample[scout_field]
+
+            if value is None or value == []:
+                #Missing value in vcf
+                gt_call = '.'
+
+            elif isinstance(value, list):
+                gt_call = ','.join(str(element) for element in value)
+
+            else:
+                gt_call = str(value)
+
+            gt_calls.append(gt_call)
+
+        samples.append(':'.join(gt_calls))
+
+    return TAB.join(samples)
diff --git a/mutacc_auto/parse/vcf_constants.py b/mutacc_auto/parse/vcf_constants.py
@@ -0,0 +1,60 @@
+#Scout sample field name: vcf FORMAT ID
+SCOUT_TO_FORMAT = {
+    #parse_scout iterates this dict for the FORMAT column and for the sample values, so entry order is the vcf field order
+    'genotype_call': 'GT',
+    'allele_depths': 'AD',
+    'read_depth': 'DP',
+    'genotype_quality': 'GQ'
+
+}
+#Scout variant field name: vcf INFO ID
+SCOUT_TO_INFO = {
+    #parse_scout.get_info converts each of these values with int()
+    'end': 'END',
+    'rank_score': 'RankScore'
+}
+#Meta-information lines written at the top of every reconstructed vcf
+HEADER = (
+
+    '##fileformat=VCFv4.2',
+    #INFO fields
+    '##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">',
+    '##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">',
+    '##INFO=<ID=TYPE,Number=A,Type=String,Description="The type of allele, either snp, mnp, ins, del, or complex.">',
+    '##INFO=<ID=RankScore,Number=.,Type=String,Description="The rank score for this variant in this family. family_id:rank_score.">',
+    #NOTE(review): RankScore is declared as String 'family_id:rank_score' while parse_scout.get_info writes int(rank_score) - confirm
+    #FORMAT fields
+    '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
+    '##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">',
+    '##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">',
+    '##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">'
+
+)
+#Terminator appended to every line of the vcf
+NEWLINE = '\n'
+#Separator between the columns of a vcf line
+TAB = '\t'
+#Fixed vcf column names; parse_scout appends the sample names to these
+COLUMN_NAMES = [
+    '#CHROM',
+    'POS',
+    'ID',
+    'REF',
+    'ALT',
+    'QUAL',
+    'FILTER',
+    'INFO',
+    'FORMAT'
+]
+#Scout variant field names for the columns CHROM - FILTER, in vcf column order
+SCOUT_TO_COLUMNS = (
+
+    'chromosome', #CHROM
+    'position',   #POS
+    'dbsnp_id',   #ID
+    'reference',  #REF
+    'alternative',#ALT
+    'quality',    #QUAL
+    'filters'     #FILTER
+
+)
diff --git a/mutacc_auto/recipes/input_recipe.py b/mutacc_auto/recipes/input_recipe.py
@@ -5,7 +5,7 @@
 from mutacc_auto.utils.tmp_dir import TemporaryDirectory
 from mutacc_auto.commands.scout_command import ScoutExportCases, ScoutExportCausativeVariants
 from mutacc_auto.commands.housekeeper_command import HousekeeperCommand
-from mutacc_auto.parse.parse_scout import get_cases_from_scout
+from mutacc_auto.parse.parse_scout import get_cases_from_scout, get_vcf_from_json
 from mutacc_auto.parse.parse_housekeeper import get_bams_from_housekeeper
 from mutacc_auto.build_input.input_assemble import get_case
 
@@ -66,7 +66,8 @@ def write_vcf(case_id, directory):
         ) as vcf_handle:
 
         vcf_command = ScoutExportCausativeVariants(case_id)
-        vcf_content = vcf_command.check_output()
+        vcf_scout_output = vcf_command.check_output()
+        vcf_content = get_vcf_from_json(vcf_scout_output)
         vcf_handle.write(vcf_content)
 
         vcf_path = vcf_handle.name

diff --git a/tests/fixtures/scout_variant_output.json b/tests/fixtures/scout_variant_output.json
diff --git a/tests/recipes/test_input_recipe.py b/tests/recipes/test_input_recipe.py
@@ -11,6 +11,7 @@
 HK_OUT_FILE = "tests/fixtures/HK_output_test.txt"
 SCOUT_OUT_FILE = "tests/fixtures/scout_output.json"
 TEST_VCF = "tests/fixtures/test_vcf.vcf"
+TEST_SCOUT_VARIANT = "tests/fixtures/scout_variant_output.json"
 
 def mock_hk_output(case_id):
 
@@ -28,9 +29,9 @@ def mock_scout_output(case_id):
 
     return scout_out
 
-def mock_vcf(case_id):
+def mock_scout_variant(case_id):
 
-    with open(TEST_VCF) as vcf_handle:
+    with open(TEST_SCOUT_VARIANT) as vcf_handle:
 
         vcf_out = vcf_handle.read()
 
@@ -49,7 +50,7 @@ def test_get_bams():
 
     assert len(bams) == 3
 
-@patch.object(Command, 'check_output', mock_vcf)
+@patch.object(Command, 'check_output', mock_scout_variant)
 def test_write_vcf(tmpdir):
     tmp_dir = Path(tmpdir.mkdir('test_write_vcf'))
     vcf_path = write_vcf('case_id', tmp_dir)