Clinical-Genomics · adrosenbaum · Mar 13, 2019 · Mar 12, 2019 · Mar 12, 2019 · Mar 12, 2019
diff --git a/mutacc_auto/commands/scout_command.py b/mutacc_auto/commands/scout_command.py
@@ -29,10 +29,13 @@ def __init__(self, case_id=None):
 
 class ScoutExportCausativeVariants(ScoutCommand):
 
-    def __init__(self, case_id):
+    def __init__(self, case_id, json_output = True):
 
         super(ScoutExportCausativeVariants, self).__init__()
 
         self.add_subcommand('export')
         self.add_subcommand('variants')
+
+        if json_output: self.add_option('json')
+
         self.add_option('case-id', value=case_id)
diff --git a/mutacc_auto/parse/parse_scout.py b/mutacc_auto/parse/parse_scout.py
@@ -2,6 +2,7 @@
 from datetime import datetime, timedelta
 
 from mutacc_auto.commands.scout_command import ScoutExportCases
+from mutacc_auto.parse.vcf_constants import *
 
 #The timestamp in the scout database seems to be given with
 #millisecond precision, it is therefor necessary to divide the
@@ -19,7 +20,7 @@ def get_cases_from_scout(scout_output, days_ago=None):
             scout_output (str): output from scout command
             days_ago (int): number of days since case updated
 
-        Returns (list(dict)): list of dictionaries representing the cases 
+        Returns (list(dict)): list of dictionaries representing the cases
     """
 
     cases = json.loads(scout_output)
@@ -39,3 +40,98 @@ def get_cases_from_scout(scout_output, days_ago=None):
             recent_cases.append(case)
 
     return recent_cases
+
+
+
+
+
+def get_vcf_from_json(scout_vcf_output):
+
+    """
+        Reconstructs vcf from scout variant object
+
+        Args:
+            scout_vcf_output (str): string returned by command 'scout export variants --json'
+
+        Returns:
+            vcf_string (str): string with vcf content
+    """
+
+    scout_vcf_output = json.loads(scout_vcf_output)
+
+    vcf_string = ""
+
+    #Write header of vcf
+    for header_line in HEADER:
+        vcf_string += header_line + '\n'
+
+    #Get samples
+    samples = [sample['sample_id'] for sample in scout_vcf_output[0]['samples']]
+    samples = '\t'.join(samples)
+
+    vcf_string += f"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t{samples}\n"
+
+
+    #Write variants
+    for variant in scout_vcf_output:
+
+        entry= []
+        entry.append(str(variant['chromosome'])) #CHROM
+        entry.append(str(variant['position'])) #POS
+        entry.append(str(variant['dbsnp_id'] or '.')) #ID
+        entry.append(str(variant['reference'])) #REF
+        entry.append(str(variant['alternative'])) #ALT
+        entry.append(str(variant['quality'])) #QUAL
+        entry.append('PASS') #FILTER
+
+        #write INFO
+        info = []
+        for ID in SCOUT_TO_INFO.keys():
+
+            info_string = f"{SCOUT_TO_INFO[ID]}={int(variant[ID])}"
+            info.append(info_string)
+
+        if variant['category'].lower() == 'snv':
+            info_string = f"TYPE={variant['sub_category']}"
+
+        else:
+            info_string = f"SVTYPE={variant['sub_category']}"
+
+        info.append(info_string)
+
+        info = ';'.join(info)
+
+        entry.append(info)
+
+        #Write the format and genotype calls
+        format = ':'.join([SCOUT_TO_FORMAT[ID] for ID in SCOUT_TO_FORMAT.keys()])
+
+        entry.append(format)
+
+        samples = []
+        for sample in variant['samples']:
+
+            gt_calls = []
+            for ID in SCOUT_TO_FORMAT.keys():
+
+                if type(sample[ID]) == list:
+
+                    ID_value = ','.join([str(element) for element in sample[ID]])
+
+                else:
+                    ID_value = str(sample[ID])
+
+                gt_calls.append(ID_value)
+
+            gt_calls = ':'.join(gt_calls)
+            samples.append(gt_calls)
+
+        samples = '\t'.join(samples)
+
+        entry.append(samples)
+
+        entry = '\t'.join(entry) + '\n'
+
+        vcf_string += entry
+
+    return vcf_string
diff --git a/mutacc_auto/parse/vcf_constants.py b/mutacc_auto/parse/vcf_constants.py
@@ -0,0 +1,32 @@
+#Scout fields name: vcf ID
+SCOUT_TO_FORMAT = {
+
+    'genotype_call': 'GT',
+    'allele_depths': 'AD',
+    'read_depth': 'DP',
+    'genotype_quality': 'GQ'
+
+}
+
+SCOUT_TO_INFO = {
+
+    'end': 'END',
+    'rank_score': 'RankScore'
+}
+
+HEADER = (
+
+    '##fileformat=VCFv4.2',
+
+    '##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">',
+    '##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">',
+    '##INFO=<ID=TYPE,Number=A,Type=String,Description="The type of allele, either snp, mnp, ins, del, or complex.">',
+    '##INFO=<ID=RankScore,Number=.,Type=String,Description="The rank score for this variant in this family. family_id:rank_score.">',
+
+
+    '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
+    '##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">',
+    '##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">',
+    '##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">'
+
+)
diff --git a/mutacc_auto/recipes/input_recipe.py b/mutacc_auto/recipes/input_recipe.py
@@ -5,7 +5,7 @@
 from mutacc_auto.utils.tmp_dir import TemporaryDirectory
 from mutacc_auto.commands.scout_command import ScoutExportCases, ScoutExportCausativeVariants
 from mutacc_auto.commands.housekeeper_command import HousekeeperCommand
-from mutacc_auto.parse.parse_scout import get_cases_from_scout
+from mutacc_auto.parse.parse_scout import get_cases_from_scout, get_vcf_from_json
 from mutacc_auto.parse.parse_housekeeper import get_bams_from_housekeeper
 from mutacc_auto.build_input.input_assemble import get_case
 
@@ -66,7 +66,8 @@ def write_vcf(case_id, directory):
         ) as vcf_handle:
 
         vcf_command = ScoutExportCausativeVariants(case_id)
-        vcf_content = vcf_command.check_output()
+        vcf_scout_output = vcf_command.check_output()
+        vcf_content = get_vcf_from_json(vcf_scout_output)
         vcf_handle.write(vcf_content)
 
         vcf_path = vcf_handle.name

diff --git a/tests/fixtures/scout_variant_output.json b/tests/fixtures/scout_variant_output.json
diff --git a/tests/recipes/test_input_recipe.py b/tests/recipes/test_input_recipe.py
@@ -11,6 +11,7 @@
 HK_OUT_FILE = "tests/fixtures/HK_output_test.txt"
 SCOUT_OUT_FILE = "tests/fixtures/scout_output.json"
 TEST_VCF = "tests/fixtures/test_vcf.vcf"
+TEST_SCOUT_VARIANT = "tests/fixtures/scout_variant_output.json"
 
 def mock_hk_output(case_id):
 
@@ -28,9 +29,9 @@ def mock_scout_output(case_id):
 
     return scout_out
 
-def mock_vcf(case_id):
+def mock_scout_variant(case_id):
 
-    with open(TEST_VCF) as vcf_handle:
+    with open(TEST_SCOUT_VARIANT) as vcf_handle:
 
         vcf_out = vcf_handle.read()
 
@@ -49,7 +50,7 @@ def test_get_bams():
 
     assert len(bams) == 3
 
-@patch.object(Command, 'check_output', mock_vcf)
+@patch.object(Command, 'check_output', mock_scout_variant)
 def test_write_vcf(tmpdir):
     tmp_dir = Path(tmpdir.mkdir('test_write_vcf'))
     vcf_path = write_vcf('case_id', tmp_dir)