Fixes based on comments

Clinical-Genomics · Mar 12, 2019 · 9774198 · 9774198
1 parent 86d7066
commit 9774198
Show file tree

Hide file tree

Showing 2 changed files with 122 additions and 41 deletions.
diff --git a/mutacc_auto/parse/parse_scout.py b/mutacc_auto/parse/parse_scout.py
@@ -2,7 +2,13 @@
 from datetime import datetime, timedelta
 
 from mutacc_auto.commands.scout_command import ScoutExportCases
-from mutacc_auto.parse.vcf_constants import *
+from mutacc_auto.parse.vcf_constants import (SCOUT_TO_FORMAT,
+                                             SCOUT_TO_INFO,
+                                             HEADER,
+                                             NEWLINE,
+                                             TAB,
+                                             COLUMN_NAMES,
+                                             SCOUT_TO_COLUMNS)
 
 #The timestamp in the scout database seems to be given with
 #millisecond precision, it is therefor necessary to divide the
@@ -63,75 +69,122 @@ def get_vcf_from_json(scout_vcf_output):
 
     #Write header of vcf
     for header_line in HEADER:
-        vcf_string += header_line + '\n'
+        vcf_string += header_line + NEWLINE
 
     #Get samples
     samples = [sample['sample_id'] for sample in scout_vcf_output[0]['samples']]
-    samples = '\t'.join(samples)
 
-    vcf_string += f"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t{samples}\n"
+    #Append sample names to the COLUMN_NAMES list
+    column_names = COLUMN_NAMES + samples
+    column_names = TAB.join(column_names)
 
+    vcf_string += column_names + NEWLINE
 
     #Write variants
     for variant in scout_vcf_output:
 
-        entry= []
-        entry.append(str(variant['chromosome'])) #CHROM
-        entry.append(str(variant['position'])) #POS
-        entry.append(str(variant['dbsnp_id'] or '.')) #ID
-        entry.append(str(variant['reference'])) #REF
-        entry.append(str(variant['alternative'])) #ALT
-        entry.append(str(variant['quality'])) #QUAL
-        entry.append('PASS') #FILTER
+        #Write column values
+        record = get_columns(variant)
 
         #write INFO
-        info = []
-        for ID in SCOUT_TO_INFO.keys():
+        info = get_info(variant)
+        record.append(info)
 
-            info_string = f"{SCOUT_TO_INFO[ID]}={int(variant[ID])}"
-            info.append(info_string)
+        #Write the FORMAT column
+        format = ':'.join([SCOUT_TO_FORMAT[ID] for ID in SCOUT_TO_FORMAT.keys()])
+        record.append(format)
+
+        #write genotypes for each sample
+        samples = get_genotypes(variant)
+        record.append(samples)
+
+        record = TAB.join(record) + NEWLINE
+
+        #Add variant record to vcf_string
+        vcf_string += record
+
+    return vcf_string
+
+def get_columns(variant):
+    """
+        Given a variant object from scout, collect the values of the columns
+        CHROM - FILTER as a list of strings (joined with tabs by the caller)
 
-        if variant['category'].lower() == 'snv':
-            info_string = f"TYPE={variant['sub_category']}"
+        Args:
+            variant (dict): dictionary of scout variant object
+        Returns:
+            record (list): values of the columns CHROM-FILTER as strings
+    """
+    record = []
+
+    for column in SCOUT_TO_COLUMNS:
+
+        if type(variant[column]) == list:
+            column_value = ','.join([str(element) for element in variant[column]])
 
         else:
-            info_string = f"SVTYPE={variant['sub_category']}"
+            column_value = str(variant[column])
 
-        info.append(info_string)
+        record.append(column_value)
 
-        info = ';'.join(info)
+    return record
 
-        entry.append(info)
+def get_info(variant):
+    """
+        Given a variant object from scout, write the INFO column
+        for a variant.
 
-        #Write the format and genotype calls
-        format = ':'.join([SCOUT_TO_FORMAT[ID] for ID in SCOUT_TO_FORMAT.keys()])
+        Args:
+            variant (dict): dictionary of scout variant object
+        Returns:
+            info (str): INFO string
+    """
+    info = []
+    for ID in SCOUT_TO_INFO.keys():
+
+        info_string = f"{SCOUT_TO_INFO[ID]}={int(variant[ID])}"
+        info.append(info_string)
 
-        entry.append(format)
+    if variant['category'].lower() == 'snv':
+        info_string = f"TYPE={variant['sub_category']}"
 
-        samples = []
-        for sample in variant['samples']:
+    else:
+        info_string = f"SVTYPE={variant['sub_category']}"
 
-            gt_calls = []
-            for ID in SCOUT_TO_FORMAT.keys():
+    info.append(info_string)
 
-                if type(sample[ID]) == list:
+    info = ';'.join(info)
 
-                    ID_value = ','.join([str(element) for element in sample[ID]])
+    return info
 
-                else:
-                    ID_value = str(sample[ID])
+def get_genotypes(variant):
+    """
+        Given a variant object from scout, write the genotypes column for each
+        sample.
 
-                gt_calls.append(ID_value)
+        Args:
+            variant (dict): dictionary of scout variant object
+        Returns:
+            samples (str): genotypes for each sample
+    """
+    samples = []
+    for sample in variant['samples']:
 
-            gt_calls = ':'.join(gt_calls)
-            samples.append(gt_calls)
+        gt_calls = []
+        for ID in SCOUT_TO_FORMAT.keys():
 
-        samples = '\t'.join(samples)
+            if type(sample[ID]) == list:
 
-        entry.append(samples)
+                ID_value = ','.join([str(element) for element in sample[ID]])
 
-        entry = '\t'.join(entry) + '\n'
+            else:
+                ID_value = str(sample[ID])
 
-        vcf_string += entry
+            gt_calls.append(ID_value)
 
-    return vcf_string
+        gt_calls = ':'.join(gt_calls)
+        samples.append(gt_calls)
+
+    samples = TAB.join(samples)
+
+    return samples
diff --git a/mutacc_auto/parse/vcf_constants.py b/mutacc_auto/parse/vcf_constants.py
@@ -30,3 +30,31 @@
     '##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">'
 
 )
+
+NEWLINE = '\n'
+
+TAB = '\t'
+
+COLUMN_NAMES = [
+    '#CHROM',
+    'POS',
+    'ID',
+    'REF',
+    'ALT',
+    'QUAL',
+    'FILTER',
+    'INFO',
+    'FORMAT'
+]
+
+SCOUT_TO_COLUMNS = (
+
+    'chromosome', #CHROM
+    'position',   #POS
+    'dbsnp_id',   #ID
+    'reference',  #REF
+    'alternative',#ALT
+    'quality',    #QUAL
+    'filters'     #FILTER
+
+)